Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions nanovllm/engine/block_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,16 @@ def can_append(self, seq: Sequence) -> bool:
def may_append(self, seq: Sequence):
block_table = seq.block_table
last_block = self.blocks[block_table[-1]]
if len(seq) % self.block_size == 1:

if len(block_table) < seq.num_blocks:
assert last_block.hash != -1
block_id = self.free_block_ids[0]
self._allocate_block(block_id)
block_table.append(block_id)
elif len(seq) % self.block_size == 0:
assert last_block.hash == -1
elif len(seq) % self.block_size == 0 and last_block.hash == -1:
token_ids = seq.block(seq.num_blocks-1)
prefix = self.blocks[block_table[-2]].hash if len(block_table) > 1 else -1
h = self.compute_hash(token_ids, prefix)
last_block.update(h, token_ids)
self.hash_to_block_id[h] = last_block.block_id
else:
assert last_block.hash == -1
15 changes: 12 additions & 3 deletions nanovllm/engine/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,23 @@ def schedule(self) -> tuple[list[Sequence], bool]:
# prefill
while self.waiting and len(scheduled_seqs) < self.max_num_seqs:
seq = self.waiting[0]

# Allocate blocks before computing the token budget, so that
# seq.num_cached_tokens already reflects any prefix-cache hits.
if not seq.block_table:
if not self.block_manager.can_allocate(seq):
break # no budget
self.block_manager.allocate(seq)

# Calculate the actual num_tokens using the updated seq.num_cached_tokens
num_tokens = max(seq.num_tokens - seq.num_cached_tokens, 1)
remaining = self.max_num_batched_tokens - num_batched_tokens
if remaining == 0 or (not seq.block_table and not self.block_manager.can_allocate(seq)): # no budget

if remaining == 0: # no budget
break
if remaining < num_tokens and scheduled_seqs: # only allow chunked prefill for the first seq
break
if not seq.block_table:
self.block_manager.allocate(seq)

seq.num_scheduled_tokens = min(num_tokens, remaining)
if seq.num_scheduled_tokens == num_tokens:
seq.status = SequenceStatus.RUNNING
Expand Down