diff --git a/python/sglang/srt/mem_cache/allocator_ascend.py b/python/sglang/srt/mem_cache/allocator_ascend.py
index 14fc1d1e362..2c606187a95 100644
--- a/python/sglang/srt/mem_cache/allocator_ascend.py
+++ b/python/sglang/srt/mem_cache/allocator_ascend.py
@@ -119,7 +119,7 @@ def alloc_extend(
         assert len(torch.unique(out_indices)) == len(out_indices)
 
         self.free_pages = self.free_pages[num_new_pages_item:]
-        return out_indices
+        return out_indices.int()
 
     def alloc_decode(
         self,
diff --git a/python/sglang/srt/model_executor/npu_graph_runner.py b/python/sglang/srt/model_executor/npu_graph_runner.py
index 67a31c62f92..8c0c64a8d23 100644
--- a/python/sglang/srt/model_executor/npu_graph_runner.py
+++ b/python/sglang/srt/model_executor/npu_graph_runner.py
@@ -75,7 +75,7 @@ def replay(
             self.positions[: self.raw_num_token].copy_(forward_batch.positions)
 
         # Replay
-        if self.model_runner.model_config.index_head_dim is None:
+        if getattr(self.model_runner.model_config, "index_head_dim", None) is None:
             seq_lens = forward_batch.seq_lens.cpu().tolist() + [0] * (
                 self.bs - self.raw_bs
             )
diff --git a/python/sglang/srt/multimodal/processors/base_processor.py b/python/sglang/srt/multimodal/processors/base_processor.py
index ef076ae0931..ef41de7c316 100644
--- a/python/sglang/srt/multimodal/processors/base_processor.py
+++ b/python/sglang/srt/multimodal/processors/base_processor.py
@@ -237,8 +237,10 @@ def process_mm_data(
         if not _is_npu:
             kwargs["device"] = "cuda"
         elif processor.__class__.__name__ not in {
+            "Qwen2VLProcessor",
             "Qwen2_5_VLProcessor",
             "Qwen3VLProcessor",
+            "Glm4vProcessor",
         }:
             # Note: for qwen-vl, processor has some reshape issue because of dims restriction on Ascend.
             kwargs["device"] = "npu"
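
For context on the `npu_graph_runner.py` hunk: `getattr` with a `None` default lets the replay path keep working when the loaded model config does not carry an `index_head_dim` field at all, whereas the previous direct attribute access would raise `AttributeError` in that case. A minimal, self-contained sketch of the pattern (the `SimpleNamespace` configs below are hypothetical stand-ins, not sglang's `ModelConfig`):

```python
from types import SimpleNamespace

# Hypothetical stand-ins for a model config object; sglang's real ModelConfig is not used here.
old_config = SimpleNamespace(hidden_size=4096)                     # field absent (older config)
new_config = SimpleNamespace(hidden_size=4096, index_head_dim=64)  # field present

for cfg in (old_config, new_config):
    # getattr with a None default degrades gracefully when the attribute is missing,
    # so the plain replay path is taken instead of raising AttributeError.
    if getattr(cfg, "index_head_dim", None) is None:
        print("index_head_dim missing or None -> plain seq_lens replay path")
    else:
        print(f"index_head_dim = {cfg.index_head_dim} -> indexer-aware replay path")
```

The `out_indices.int()` change in the allocator hunk is a dtype guard in the same spirit: it returns int32 indices rather than whatever dtype the intermediate computation produced, presumably so downstream consumers on Ascend see a consistent index dtype (the exact motivation is not stated in the diff).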