From cd9c2f8d054e3a4e54fe0d5f8bf3f5e997da4ff9 Mon Sep 17 00:00:00 2001
From: zhaozixin
Date: Fri, 5 Sep 2025 19:44:31 +0800
Subject: [PATCH] fix qwen torchair attention PrefillCacheHit

Signed-off-by: zhaozixin
---
 vllm_ascend/torchair/torchair_attention.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/vllm_ascend/torchair/torchair_attention.py b/vllm_ascend/torchair/torchair_attention.py
index 81f2968a8e..d2443ad442 100644
--- a/vllm_ascend/torchair/torchair_attention.py
+++ b/vllm_ascend/torchair/torchair_attention.py
@@ -374,6 +374,9 @@ def forward(
             indices = torch.cat((block_indices, slots_indices), dim=1)
             torch_npu.npu_scatter_nd_update_(key_cache, indices, key)
             torch_npu.npu_scatter_nd_update_(value_cache, indices, value)
+            if attn_metadata.attn_state == AscendAttentionState.PrefillCacheHit:
+                self.key_cache = key_cache
+                self.value_cache = value_cache
 
         if attn_metadata.attn_state == AscendAttentionState.PrefillNoCache:
             assert attn_metadata is not None
@@ -411,11 +414,13 @@ def forward(
             assert attn_metadata is not None
             assert attn_metadata.attn_mask is not None
             compress_mask = attn_metadata.attn_mask
+            batch_size = attn_metadata.query_lens.shape[0]
+            block_table = attn_metadata.block_tables[:batch_size, :]
             torch_npu._npu_flash_attention_qlens(
                 query=query,
                 key_cache=self.key_cache,
                 value_cache=self.value_cache,
-                block_table=attn_metadata.block_tables,
+                block_table=block_table,
                 mask=compress_mask,
                 seq_len=attn_metadata.query_lens,
                 context_lens=attn_metadata.seq_lens,
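
Note (not part of the patch): the second hunk's change is easiest to see in
isolation. The standalone Python sketch below mirrors the slicing logic with
plain torch tensors so it can run without an NPU. FakeAttnMetadata and all
shapes here are illustrative assumptions, not the real AscendMetadata class
or the vllm-ascend API.

    import torch


    class FakeAttnMetadata:
        """Hypothetical stand-in for the attention metadata in the diff."""

        def __init__(self, query_lens: torch.Tensor,
                     block_tables: torch.Tensor):
            self.query_lens = query_lens      # one entry per active request
            self.block_tables = block_tables  # may be padded past the batch


    # Suppose the runner padded block_tables to 8 rows while only 3 requests
    # are active in this step.
    meta = FakeAttnMetadata(
        query_lens=torch.tensor([5, 2, 7]),
        block_tables=torch.zeros((8, 16), dtype=torch.int32),
    )

    # The fix: derive the true batch size from query_lens and slice the
    # padded block table down before passing it to the attention kernel,
    # so the kernel sees exactly one block-table row per query.
    batch_size = meta.query_lens.shape[0]
    block_table = meta.block_tables[:batch_size, :]
    assert block_table.shape[0] == meta.query_lens.shape[0]
    print(block_table.shape)  # torch.Size([3, 16])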