Skip to content

Commit 6f41dd4

Browse files
committed
fix computed tokens
fix fix position fix fix fix fix fix
1 parent 8eeee78 commit 6f41dd4

File tree

3 files changed

+13
-9
lines changed

3 files changed

+13
-9
lines changed

lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -589,15 +589,19 @@ impl Slot for VllmConnectorSlot {
589589
// in onboarding case
590590
if computed_position < self.current_position {
591591
tracing::debug!(
592-
"computed_position={} <= current_position={}, so we are onboarding during prefilling phase",
592+
"computed_position={} < current_position={}, so we are onboarding during prefilling phase",
593593
computed_position, self.current_position
594594
);
595595
return Ok(());
596596
}
597597

598598
// now we decide what we should do for the new computed tokens
599+
tracing::debug!(
600+
"applying scheduler output, computed_position={}, sequence_total_tokens={}",
601+
computed_position, self.sequence.total_tokens()
602+
);
599603

600-
if computed_position < self.sequence.total_tokens() {
604+
if computed_position <= self.sequence.total_tokens() {
601605
// no need to apply new tokens, since they were already applied when the slot was created during prefilling
602606
self.state = SlotState::Prefilling;
603607
} else {
@@ -617,7 +621,7 @@ impl Slot for VllmConnectorSlot {
617621
}
618622

619623
let num_candidate_blocks =
620-
((computed_position + 1) / self.block_size) - self.evaluated_blocks;
624+
(computed_position / self.block_size) - self.evaluated_blocks;
621625

622626
if num_candidate_blocks != 0 {
623627
// do we have a mechanism for skipping gpu cache hit blocks? not sure yet.

lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ impl Leader for KvConnectorLeader {
210210
block_ids: Vec<BlockId>,
211211
context_current_position: usize,
212212
) -> anyhow::Result<()> {
213-
tracing::debug!(request_id, "num_device_blocks: {}", block_ids.len(),);
213+
tracing::debug!(request_id, "num_device_blocks: {}, context_current_position: {}", block_ids.len(), context_current_position);
214214

215215
let shared_slot = self.slot_manager().get_slot(&request_id)?;
216216
let mut slot = shared_slot
@@ -227,7 +227,7 @@ impl Leader for KvConnectorLeader {
227227
.get(&request_id)
228228
{
229229
if num_external_tokens > 0 {
230-
let num_computed_tokens = (context_current_position + 1) - num_external_tokens;
230+
let num_computed_tokens = context_current_position - num_external_tokens;
231231
slot.record_cached_device_tokens(num_computed_tokens);
232232
slot.advance_computed_position(num_computed_tokens)?;
233233

@@ -317,7 +317,7 @@ impl Leader for KvConnectorLeader {
317317
slot.apply_scheduler_output_with_computed_position(
318318
&new_req.prompt_token_ids,
319319
&new_req.block_ids,
320-
new_req.num_computed_tokens - 1,
320+
new_req.num_computed_tokens,
321321
)?;
322322

323323
if let Some(pending_ops) = slot.take_pending_operations() {
@@ -347,7 +347,7 @@ impl Leader for KvConnectorLeader {
347347
slot.apply_scheduler_output_with_computed_position(
348348
&cached_req.new_token_ids,
349349
&cached_req.new_block_ids,
350-
cached_req.num_computed_tokens - 1,
350+
cached_req.num_computed_tokens,
351351
)?;
352352

353353
if let Some(pending_ops) = slot.take_pending_operations() {

lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_leader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:
5454
str(req.request_id),
5555
req.new_tokens,
5656
req.new_block_ids,
57-
req.computed_position + 1,
57+
req.computed_position,
5858
)
5959

6060
resumed_from_preemption = False
@@ -64,7 +64,7 @@ def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:
6464
resumed_from_preemption,
6565
req.new_tokens,
6666
req.new_block_ids,
67-
req.computed_position + 1,
67+
req.computed_position,
6868
)
6969

7070
return self._connector.build_connector_metadata(output)

0 commit comments

Comments
 (0)