-
Notifications
You must be signed in to change notification settings - Fork 673
Draft: kvbm - bigger page size for offload block pools only #2986
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -161,8 +161,12 @@ impl BlockManager { | |
| }) | ||
| } | ||
|
|
||
| fn block_size(&self) -> usize { | ||
| self.inner.block_size() | ||
| fn engine_block_size(&self) -> usize { | ||
| self.inner.engine_block_size() | ||
| } | ||
|
|
||
| fn offload_block_size(&self) -> usize { | ||
| self.inner.offload_block_size() | ||
| } | ||
|
|
||
| fn init_controller(&mut self, component: Component) -> PyResult<()> { | ||
|
|
@@ -214,14 +218,16 @@ impl BlockManager { | |
| pub struct BlockManagerBuilder { | ||
| worker_id: u64, | ||
| leader: Option<distributed::KvbmLeader>, | ||
| page_size: usize, | ||
| offload_page_size: usize, | ||
| engine_page_size: usize, | ||
| disable_device_pool: bool, | ||
| } | ||
|
|
||
| impl BlockManagerBuilder { | ||
| pub fn new() -> Self { | ||
| Self { | ||
| page_size: 32, // default consistent with BlockManager::new | ||
| engine_page_size: 32, // default consistent with BlockManager::new | ||
| offload_page_size: 1024, // default consistent with BlockManager::new | ||
| ..Default::default() | ||
| } | ||
| } | ||
|
|
@@ -230,8 +236,12 @@ impl BlockManagerBuilder { | |
| self.worker_id = id; | ||
| self | ||
| } | ||
| pub fn page_size(mut self, ps: usize) -> Self { | ||
| self.page_size = ps; | ||
| pub fn engine_page_size(mut self, ps: usize) -> Self { | ||
| self.engine_page_size = ps; | ||
| self | ||
| } | ||
| pub fn offload_page_size(mut self, ps: usize) -> Self { | ||
| self.offload_page_size = ps; | ||
| self | ||
| } | ||
| pub fn leader(mut self, l: distributed::KvbmLeader) -> Self { | ||
|
|
@@ -267,7 +277,7 @@ impl BlockManagerBuilder { | |
| let model_config = dynamo_llm::block_manager::KvManagerModelConfig::builder() | ||
| .num_layers(1) | ||
| .outer_dim(1) | ||
| .page_size(self.page_size) | ||
| .page_size(self.engine_page_size) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chainOnly engine page size is propagated; offload is ignored build() writes model.page_size but never configures offload_block_size_ratio, so offload size falls back to the global default regardless of the builder’s offload_page_size. Follow the ratio derivation diff in my earlier comment to ensure offload_page_size actually takes effect. To confirm the API path exists, run: 🏁 Script executed: #!/bin/bash
# Verify the builder exposes offload_block_size_ratio and find its usage.
rg -n --glob 'lib/**' 'offload_block_size_ratio' -C2
# Confirm current usage of offload_page_size and engine_page_size.
rg -n --glob 'lib/**' 'offload_page_size|engine_page_size' -C2Length of output: 38270 Propagate offload_page_size in Python binding build() .page_size(self.engine_page_size)add .offload_page_size(self.offload_page_size)so the builder applies the configured offload size rather than falling back to the default. 🤖 Prompt for AI Agents |
||
| .inner_dim(1) | ||
| .build()?; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -176,7 +176,8 @@ impl KvbmWorker { | |||||||||||||||||||||||||||||||||||||||||||||||||||||
| let config = KvbmWorkerConfig::builder() | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .drt(drt) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .num_device_blocks(num_device_blocks) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .page_size(page_size) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .offload_page_size(page_size) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .engine_page_size(page_size) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
176
to
+180
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Configuration uses the same value for both page sizes The configuration currently sets both Based on the PR objectives stating that offload blocks should be larger than device blocks, the Python bindings should accept two distinct page size parameters. Consider updating the function signature to accept both - #[pyo3(signature = (num_device_blocks, page_size, tensors, device_id=0, dtype_width_bytes=2, drt=None, layout_blocking=false))]
+ #[pyo3(signature = (num_device_blocks, engine_page_size, offload_page_size, tensors, device_id=0, dtype_width_bytes=2, drt=None, layout_blocking=false))]
fn new(
num_device_blocks: usize,
- page_size: usize,
+ engine_page_size: usize,
+ offload_page_size: usize,
tensors: Vec<Py<PyAny>>,And update the configuration accordingly: let config = KvbmWorkerConfig::builder()
.drt(drt)
.num_device_blocks(num_device_blocks)
- .offload_page_size(page_size)
- .engine_page_size(page_size)
+ .offload_page_size(offload_page_size)
+ .engine_page_size(engine_page_size)📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .tensors(vllm_tensors) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .device_id(device_id) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
| .dtype_width_bytes(dtype_width_bytes) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -83,7 +83,10 @@ impl KvbmCacheManager { | |||||||||||||||||||||
| #[new] | ||||||||||||||||||||||
| #[pyo3(signature = (block_manager))] | ||||||||||||||||||||||
| pub fn new(block_manager: PyBlockManager) -> PyResult<Self> { | ||||||||||||||||||||||
| let slot_manager = Mutex::new(SlotManager::new(block_manager.block_size())); | ||||||||||||||||||||||
| let slot_manager = Mutex::new(SlotManager::new( | ||||||||||||||||||||||
| block_manager.engine_block_size(), | ||||||||||||||||||||||
| block_manager.offload_block_size(), | ||||||||||||||||||||||
| )); | ||||||||||||||||||||||
| Ok(Self { | ||||||||||||||||||||||
| block_manager, | ||||||||||||||||||||||
| slot_manager, | ||||||||||||||||||||||
|
|
@@ -286,7 +289,7 @@ pub struct GenericSlotUpdate<R> { | |||||||||||||||||||||
| pub num_new_tokens: usize, | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| /// The number of new computed tokens in the request. | ||||||||||||||||||||||
| /// The `num_new_tokens / block_size` should be equal to the length of the `new_computed_blocks`, | ||||||||||||||||||||||
| /// The `num_new_tokens / engine_block_size` should be equal to the length of the `new_computed_blocks`, | ||||||||||||||||||||||
| /// it may have a remainder for the partial block state. | ||||||||||||||||||||||
| /// Note: this field is solely tied to the `new_computed_blocks` field and not used when `tokens_to_append` is provided. | ||||||||||||||||||||||
| /// The name might be confusing, but the name matched the vLLM implementation. | ||||||||||||||||||||||
|
|
@@ -401,15 +404,17 @@ impl SlotError { | |||||||||||||||||||||
|
|
||||||||||||||||||||||
| pub struct SlotManager<R: RequestKey> { | ||||||||||||||||||||||
| slots: HashMap<R, Slot<DeviceStorage, Logical<DistributedLeaderWorkerResources>>>, | ||||||||||||||||||||||
| block_size: usize, | ||||||||||||||||||||||
| engine_block_size: usize, | ||||||||||||||||||||||
| offload_block_size: usize, | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| impl<R: RequestKey> SlotManager<R> { | ||||||||||||||||||||||
| /// Creates a new slot manager. | ||||||||||||||||||||||
| pub fn new(block_size: usize) -> Self { | ||||||||||||||||||||||
| pub fn new(engine_block_size: usize, offload_block_size: usize) -> Self { | ||||||||||||||||||||||
| Self { | ||||||||||||||||||||||
| slots: HashMap::new(), | ||||||||||||||||||||||
| block_size, | ||||||||||||||||||||||
| engine_block_size, | ||||||||||||||||||||||
| offload_block_size, | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
|
|
@@ -436,7 +441,7 @@ impl<R: RequestKey> SlotManager<R> { | |||||||||||||||||||||
| if !self.slots.contains_key(request_id) { | ||||||||||||||||||||||
| self.slots.insert( | ||||||||||||||||||||||
| request_id.clone(), | ||||||||||||||||||||||
| Slot::new(tokens.into(), self.block_size, salt_hash), | ||||||||||||||||||||||
| Slot::new(tokens.into(), self.engine_block_size, salt_hash), | ||||||||||||||||||||||
| ); | ||||||||||||||||||||||
|
Comment on lines
+444
to
445
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Slot initialization should carry both sizes to support offload hashing Passing only engine_block_size makes it hard for Slot to produce offload-sized sequence hashes needed for host/disk caches. If Slot::new can be extended, pass both sizes: - Slot::new(tokens.into(), self.engine_block_size, salt_hash),
+ Slot::new(
+ tokens.into(),
+ self.engine_block_size,
+ self.offload_block_size,
+ salt_hash,
+ ),Otherwise, provide a method on Slot to compute offload-sized sequence hashes on demand. 📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||
| tracing::debug!( | ||||||||||||||||||||||
| request_id, | ||||||||||||||||||||||
|
|
@@ -498,7 +503,7 @@ impl<R: RequestKey> SlotManager<R> { | |||||||||||||||||||||
| tracing::debug!( | ||||||||||||||||||||||
| request_id, | ||||||||||||||||||||||
| "applying {} cache-hit tokens", | ||||||||||||||||||||||
| blocks.len() * self.block_size | ||||||||||||||||||||||
| blocks.len() * self.engine_block_size | ||||||||||||||||||||||
| ); | ||||||||||||||||||||||
| slot.initialize_with_device_matches(blocks)?; | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
@@ -566,9 +571,9 @@ impl<R: RequestKey> SlotManager<R> { | |||||||||||||||||||||
| match self.slots.remove(request_id) { | ||||||||||||||||||||||
| Some(slot) => { | ||||||||||||||||||||||
| let isl = slot.num_tokens(SlotPosition::Prefill); | ||||||||||||||||||||||
| let isl_device = slot.num_blocks_cached_from_device() * self.block_size; | ||||||||||||||||||||||
| let isl_host = slot.num_blocks_cached_from_host() * self.block_size; | ||||||||||||||||||||||
| let isl_disk = slot.num_blocks_cached_from_disk() * self.block_size; | ||||||||||||||||||||||
| let isl_device = slot.num_blocks_cached_from_device() * self.engine_block_size; | ||||||||||||||||||||||
| let isl_host = slot.num_blocks_cached_from_host() * self.offload_block_size; | ||||||||||||||||||||||
| let isl_disk = slot.num_blocks_cached_from_disk() * self.offload_block_size; | ||||||||||||||||||||||
| tracing::info!( | ||||||||||||||||||||||
| request_id, | ||||||||||||||||||||||
| "request complete isl: {} - cache hits: device: {}, host: {}, disk: {} - prefilled: {}", | ||||||||||||||||||||||
|
|
@@ -603,14 +608,14 @@ impl<R: RequestKey> SlotManager<R> { | |||||||||||||||||||||
| assert!(num_computed_tokens <= request_num_tokens); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // early exit if we cannot match full block | ||||||||||||||||||||||
| if (request_num_tokens - num_computed_tokens) < self.block_size { | ||||||||||||||||||||||
| if (request_num_tokens - num_computed_tokens) < self.engine_block_size { | ||||||||||||||||||||||
| return Ok((0, false)); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // num_computed_tokens represents the number of tokens already on the device | ||||||||||||||||||||||
| // this much be a multiple of the block size | ||||||||||||||||||||||
| let num_device_blocks = num_computed_tokens / self.block_size; | ||||||||||||||||||||||
| debug_assert_eq!(num_computed_tokens % self.block_size, 0); | ||||||||||||||||||||||
| let num_device_blocks = num_computed_tokens / self.engine_block_size; | ||||||||||||||||||||||
| debug_assert_eq!(num_computed_tokens % self.engine_block_size, 0); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // get the sequence hashes for the device matched tokens | ||||||||||||||||||||||
| let sequence_hashes = slot.sequence_hashes(SlotPosition::All); | ||||||||||||||||||||||
|
|
@@ -661,7 +666,7 @@ impl<R: RequestKey> SlotManager<R> { | |||||||||||||||||||||
| return Ok((0, false)); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| let mut num_new_matched_tokens = num_matched_blocks * self.block_size; | ||||||||||||||||||||||
| let mut num_new_matched_tokens = num_matched_blocks * self.engine_block_size; | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // we are on a block boundary, so we need to throw away the last block | ||||||||||||||||||||||
| if num_computed_tokens + num_new_matched_tokens == request_num_tokens { | ||||||||||||||||||||||
|
|
@@ -681,7 +686,7 @@ impl<R: RequestKey> SlotManager<R> { | |||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // decrement the number of new matched tokens by the block size | ||||||||||||||||||||||
| num_new_matched_tokens -= self.block_size; | ||||||||||||||||||||||
| num_new_matched_tokens -= self.engine_block_size; | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| slot.store_onboard_blocks(host_blocks, disk_blocks); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
offload_page_size is never used; risk of silent misconfiguration
The builder stores offload_page_size but never propagates it into KvBlockManagerConfig. This makes the field effectively dead and prevents callers from controlling the offload size via the builder.
Apply this diff inside build() to derive and set the ratio from the two sizes (with validation):
Additionally, consider validating an upper bound if required by pools (e.g., offload ≤ 1024).