diff --git a/nanovllm/engine/block_manager.py b/nanovllm/engine/block_manager.py
index f835a29c2..2ddc6b181 100644
--- a/nanovllm/engine/block_manager.py
+++ b/nanovllm/engine/block_manager.py
@@ -96,17 +96,16 @@ def can_append(self, seq: Sequence) -> bool:
     def may_append(self, seq: Sequence):
         block_table = seq.block_table
         last_block = self.blocks[block_table[-1]]
-        if len(seq) % self.block_size == 1:
+
+        if len(block_table) < seq.num_blocks:
             assert last_block.hash != -1
             block_id = self.free_block_ids[0]
             self._allocate_block(block_id)
             block_table.append(block_id)
-        elif len(seq) % self.block_size == 0:
-            assert last_block.hash == -1
+
+        elif len(seq) % self.block_size == 0 and last_block.hash == -1:
             token_ids = seq.block(seq.num_blocks-1)
             prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
             h = self.compute_hash(token_ids, prefix)
             last_block.update(h, token_ids)
             self.hash_to_block_id[h] = last_block.block_id
-        else:
-            assert last_block.hash == -1
diff --git a/nanovllm/engine/scheduler.py b/nanovllm/engine/scheduler.py
index 287dd6238..faef6fd85 100644
--- a/nanovllm/engine/scheduler.py
+++ b/nanovllm/engine/scheduler.py
@@ -28,14 +28,23 @@ def schedule(self) -> tuple[list[Sequence], bool]:
         # prefill
         while self.waiting and len(scheduled_seqs) < self.max_num_seqs:
             seq = self.waiting[0]
+
+            # Move allocate logic forward to ensure the state (seq.num_cached_tokens)
+            # from prefix cache hits is correctly updated
+            if not seq.block_table:
+                if not self.block_manager.can_allocate(seq):
+                    break # no budget
+                self.block_manager.allocate(seq)
+
+            # Calculate the actual num_tokens using the updated seq.num_cached_tokens
             num_tokens = max(seq.num_tokens - seq.num_cached_tokens, 1)
             remaining = self.max_num_batched_tokens - num_batched_tokens
-            if remaining == 0 or (not seq.block_table and not self.block_manager.can_allocate(seq)): # no budget
+
+            if remaining == 0: # no budget
                 break
             if remaining < num_tokens and scheduled_seqs: # only allow chunked prefill for the first seq
                 break
-            if not seq.block_table:
-                self.block_manager.allocate(seq)
+
             seq.num_scheduled_tokens = min(num_tokens, remaining)
             if seq.num_scheduled_tokens == num_tokens:
                 seq.status = SequenceStatus.RUNNING