Fix off-by-one in computed token position handling

richardhuo-nv · richardhuo-nv · commit 6f41dd45c613 · 2025-08-22T02:53:27.000-07:00
fix

fix position

fix

fix

fix

fix

fix
diff --git a/lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs b/lib/bindings/python/rust/llm/block_manager/vllm/connector/leader/slot.rs
@@ -589,15 +589,19 @@ impl Slot for VllmConnectorSlot {
         // in onborading case
         if computed_position < self.current_position {
             tracing::debug!(
-                "computed_position={} <= current_position={}, so we are onboarding during prefilling phase",
+                "computed_position={} < current_position={}, so we are onboarding during prefilling phase",
                 computed_position, self.current_position
             );
             return Ok(());
         }
 
         // now we decide what we should do for the new computed tokens
+        tracing::debug!(
+                "applying scheduler output, computed_position={}, sequence_total_tokens={}",
+                computed_position, self.sequence.total_tokens()
+            );
 
-        if computed_position < self.sequence.total_tokens() {
+        if computed_position <= self.sequence.total_tokens() {
             // no need to apply new tokens, since it's applied when created the slot during prefilling
             self.state = SlotState::Prefilling;
         } else {
@@ -617,7 +621,7 @@ impl Slot for VllmConnectorSlot {
         }
 
         let num_candidate_blocks =
-            ((computed_position + 1) / self.block_size) - self.evaluated_blocks;
+            (computed_position / self.block_size) - self.evaluated_blocks;
 
         if num_candidate_blocks != 0 {
             // do we have a mechanism for skipping gpu cache hit blocks?  not sure yet.
diff --git a/lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs b/lib/bindings/python/rust/llm/block_manager/vllm/connector/trtllm_leader.rs
@@ -210,7 +210,7 @@ impl Leader for KvConnectorLeader {
         block_ids: Vec<BlockId>,
         context_current_position: usize,
     ) -> anyhow::Result<()> {
-        tracing::debug!(request_id, "num_device_blocks: {}", block_ids.len(),);
+        tracing::debug!(request_id, "num_device_blocks: {}, context_current_position: {}", block_ids.len(), context_current_position);
 
         let shared_slot = self.slot_manager().get_slot(&request_id)?;
         let mut slot = shared_slot
@@ -227,7 +227,7 @@ impl Leader for KvConnectorLeader {
             .get(&request_id)
         {
             if num_external_tokens > 0 {
-                let num_computed_tokens = (context_current_position + 1) - num_external_tokens;
+                let num_computed_tokens = context_current_position - num_external_tokens;
                 slot.record_cached_device_tokens(num_computed_tokens);
                 slot.advance_computed_position(num_computed_tokens)?;
 
@@ -317,7 +317,7 @@ impl Leader for KvConnectorLeader {
             slot.apply_scheduler_output_with_computed_position(
                 &new_req.prompt_token_ids,
                 &new_req.block_ids,
-                new_req.num_computed_tokens - 1,
+                new_req.num_computed_tokens,
             )?;
 
             if let Some(pending_ops) = slot.take_pending_operations() {
@@ -347,7 +347,7 @@ impl Leader for KvConnectorLeader {
             slot.apply_scheduler_output_with_computed_position(
                 &cached_req.new_token_ids,
                 &cached_req.new_block_ids,
-                cached_req.num_computed_tokens - 1,
+                cached_req.num_computed_tokens,
             )?;
 
             if let Some(pending_ops) = slot.take_pending_operations() {
diff --git a/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_leader.py b/lib/bindings/python/src/dynamo/llm/trtllm_integration/connector/kvbm_connector_leader.py
@@ -54,7 +54,7 @@ def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:
                 str(req.request_id),
                 req.new_tokens,
                 req.new_block_ids,
-                req.computed_position + 1,
+                req.computed_position,
             )
 
         resumed_from_preemption = False
@@ -64,7 +64,7 @@ def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:
                 resumed_from_preemption,
                 req.new_tokens,
                 req.new_block_ids,
-                req.computed_position + 1,
+                req.computed_position,
             )
 
         return self._connector.build_connector_metadata(output)