
Commit 5ffd8db

Shuming19 authored and rjg-lyh committed
add mlp weight prefetch
1 parent 3ae9205 commit 5ffd8db

File tree: 6 files changed (+92, -3 lines changed)


vllm_ascend/ascend_forward_context.py

Lines changed: 8 additions & 1 deletion

@@ -67,7 +67,9 @@ def set_ascend_forward_context(
         moe_comm_method: str = "",
         num_actual_tokens: Optional[int] = None,
         aclgraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
-        batch_descriptor: Optional[BatchDescriptor] = None):
+        batch_descriptor: Optional[BatchDescriptor] = None,
+        prefetch_stream: torch.npu.Stream = None,
+        prefetch_model: torch.nn.Module = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     We add some additional param into forward_context.
@@ -82,6 +84,11 @@ def set_ascend_forward_context(
             batch_descriptor=batch_descriptor,
     ):
         forward_context = get_forward_context()
+
+        forward_context.prefetch_stream = prefetch_stream
+        forward_context.prefetch_model = prefetch_model
+        forward_context.prefetch_mlp_up = False
+
         forward_context.moe_comm_method_name = moe_comm_method + "commimpl"
         forward_context.with_prefill = with_prefill
         ep_size = (get_ep_group().world_size if
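The two new arguments are simply stashed on the forward context; the ops touched below (activation.py, linear.py, layernorm.py) read them back via get_forward_context(). A minimal sketch of that read-back, not part of the commit (the attribute names match the assignments above; the helper name is illustrative):

```python
# Illustrative sketch: how an op can retrieve the prefetch handles stored above.
from vllm.forward_context import get_forward_context


def get_prefetch_handles():
    ctx = get_forward_context()
    # Both default to None when set_ascend_forward_context() was entered
    # without prefetch_stream / prefetch_model.
    stream = getattr(ctx, "prefetch_stream", None)
    model = getattr(ctx, "prefetch_model", None)
    return stream, model
```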

vllm_ascend/ops/activation.py

Lines changed: 26 additions & 0 deletions

@@ -17,6 +17,7 @@

 import torch
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
+from vllm.forward_context import get_forward_context


 class AscendQuickGELU(QuickGELU):
@@ -29,6 +30,26 @@ def forward_oot(self, x: torch.tensor) -> torch.Tensor:


 class AscendSiluAndMul(SiluAndMul):
+    def prefetch_down_proj(self, dependency: torch.Tensor):
+        import torch_npu
+        forward_context = get_forward_context()
+        prefetch_model = forward_context.prefetch_model
+        prefetch_stream = forward_context.prefetch_stream
+        layer_idx = forward_context.layer_idx
+
+        prefetch_stream.wait_stream(torch.npu.current_stream())
+
+        with torch.npu.stream(prefetch_stream):
+            MLP_DOWN_PREFETCH_SIZE = 6 * 1024 * 1024
+            torch_npu.npu_prefetch(prefetch_model.model.layers[layer_idx].mlp.down_proj.weight,
+                                   dependency, MLP_DOWN_PREFETCH_SIZE)
+        forward_context.layer_idx += 1
+
+    def wait_prefetch_done(self):
+        forward_context = get_forward_context()
+        prefetch_stream = forward_context.prefetch_stream
+        torch.npu.current_stream().wait_stream(prefetch_stream)

     def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
         import torch_npu
@@ -38,5 +59,10 @@ def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
         if is_310p():
             out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
         else:
+            dependency = x
+            self.prefetch_down_proj(dependency)
+
             out = torch_npu.npu_swiglu(x)
+
+            self.wait_prefetch_done()
         return out
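The pattern in AscendSiluAndMul is: gate a side stream on the activation input, issue the down_proj weight prefetch there, run npu_swiglu on the main stream so the two overlap, then fence before down_proj consumes the weight. A condensed sketch of that pattern, with the npu_prefetch call signature and the 6 MiB budget taken from the diff above (the function and tensor names here are illustrative, not part of the commit):

```python
# Condensed sketch of the overlap pattern above (illustrative, not part of the commit).
import torch
import torch_npu  # Ascend extension, as imported in the diff above

prefetch_stream = torch.npu.Stream()


def swiglu_with_down_proj_prefetch(x: torch.Tensor,
                                   down_proj_weight: torch.Tensor) -> torch.Tensor:
    # Do not let the prefetch start before `x` (the dependency tensor) is ready.
    prefetch_stream.wait_stream(torch.npu.current_stream())

    with torch.npu.stream(prefetch_stream):
        # Prefetch up to 6 MiB of the down_proj weight while the main stream computes.
        torch_npu.npu_prefetch(down_proj_weight, x, 6 * 1024 * 1024)

    out = torch_npu.npu_swiglu(x)  # main-stream compute overlaps with the prefetch

    # Fence: kernels launched after this point (the down_proj matmul) run after the prefetch.
    torch.npu.current_stream().wait_stream(prefetch_stream)
    return out
```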

vllm_ascend/ops/layernorm.py

Lines changed: 14 additions & 0 deletions

@@ -36,6 +36,12 @@ def __init__(
         super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
         self.layer = layer

+    def wait_prefetch_done(self):
+        forward_context = get_forward_context()
+        prefetch_stream = forward_context.prefetch_stream
+        # wait until the mlp weight prefetch on the side stream is done
+        torch.npu.current_stream().wait_stream(prefetch_stream)
+
     def forward(
         self,
         x: torch.Tensor,
@@ -53,10 +59,18 @@ def forward(
                 self.layer.aclnn_input_scale,
                 self.layer.aclnn_input_offset,
                 epsilon=self.variance_epsilon)
+
+            if forward_context.prefetch_mlp_up:
+                self.wait_prefetch_done()
+
             return x, residual

         x, residual = torch_npu.npu_rms_norm(x, self.weight,
                                              self.variance_epsilon)
+
+        forward_context = get_forward_context()
+        if forward_context.prefetch_mlp_up:
+            self.wait_prefetch_done()
         return x
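Note that wait_prefetch_done is a device-side fence, not a host block: wait_stream only orders kernels launched afterwards on the current stream behind the work already queued on the prefetch stream. Assuming torch.npu mirrors the torch.cuda Stream/Event API (an assumption, not something this diff shows), it is equivalent to the usual record/wait-event pair:

```python
# Illustrative equivalence, assuming a torch.cuda-style Event API exists on torch.npu.
import torch

prefetch_stream = torch.npu.Stream()
main_stream = torch.npu.current_stream()

# ... prefetch kernels are enqueued on prefetch_stream ...

event = torch.npu.Event()
event.record(prefetch_stream)  # mark the end of the prefetch work
main_stream.wait_event(event)  # same ordering effect as main_stream.wait_stream(prefetch_stream)
```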

vllm_ascend/ops/linear.py

Lines changed: 31 additions & 0 deletions

@@ -381,6 +381,33 @@ class AscendDenseRowParallelLinear(RowParallelLinear):
     communication-computation fusion.
     """

+    def prefetch_gate_up_proj(self, dependency: torch.Tensor):
+        # get prefetch model
+        forward_context = get_forward_context()
+        layer_num = int(self.prefix.split('.')[2])
+        prefetch_model = forward_context.prefetch_model
+        prefetch_stream = forward_context.prefetch_stream
+
+        # start point of weight prefetch
+        forward_context.prefetch_mlp_up = self.prefix.split('.')[-2] == 'self_attn'
+        if forward_context.prefetch_mlp_up:
+            prefetch_stream.wait_stream(torch.npu.current_stream())
+
+            with torch.npu.stream(prefetch_stream):
+                # For Qwen3-32B
+                MLP_GATE_UP_PREFETCH_SIZE = 50 * 1024 * 1024
+                torch_npu.npu_prefetch(prefetch_model.model.layers[layer_num].mlp.gate_up_proj.weight,
+                                       dependency, MLP_GATE_UP_PREFETCH_SIZE)
+
+    def wait_prefetch_done(self):
+        forward_context = get_forward_context()
+        if forward_context.prefetch_mlp_up:
+            prefetch_stream = forward_context.prefetch_stream
+            # wait until the gate_up weight prefetch on the side stream is done
+            torch.npu.current_stream().wait_stream(prefetch_stream)
+
     def forward(
         self, input_: torch.Tensor
     ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
@@ -404,6 +431,10 @@ def forward(
         output_parallel = self.quant_method.apply(self,
                                                   input_parallel,
                                                   bias=bias_)
+        dependency = output_parallel
+
+        self.prefetch_gate_up_proj(dependency)
+
         output = torch.ops.vllm.maybe_pad_and_reduce(output_parallel)

         output_bias = self.bias if self.skip_bias_add else None
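prefetch_gate_up_proj keys entirely off self.prefix, so which RowParallelLinear actually fires the prefetch depends on where it sits in the module tree. Assuming the usual vLLM prefixes for a dense decoder (these example strings are illustrative, not taken from the diff):

```python
# Illustrative: how the prefix parsing above behaves for typical module prefixes.
attn_o_proj = "model.layers.12.self_attn.o_proj"
mlp_down_proj = "model.layers.12.mlp.down_proj"

int(attn_o_proj.split('.')[2])   # 12 -> index used as prefetch_model.model.layers[12]
attn_o_proj.split('.')[-2]       # 'self_attn' -> prefetch_mlp_up = True, gate_up prefetch fires
mlp_down_proj.split('.')[-2]     # 'mlp'       -> prefetch_mlp_up = False, nothing is prefetched
```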

vllm_ascend/worker/model_runner_v1.py

Lines changed: 7 additions & 2 deletions

@@ -181,6 +181,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
         self.device = device
+        self.prefetch_stream = torch.npu.Stream(device=device)
         self.dtype = self.model_config.dtype
         if envs_ascend.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION:
             # TODO: drop the env config to use ascend sampler by default
@@ -1497,7 +1498,9 @@ def execute_model(
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
                 batch_descriptor=batch_descriptor,
                 num_actual_tokens=scheduler_output.
-                total_num_scheduled_tokens):
+                total_num_scheduled_tokens,
+                prefetch_stream=self.prefetch_stream,
+                prefetch_model=self.model):
             self.maybe_setup_kv_connector(scheduler_output)

             hidden_states = self._generate_process_reqs_hidden_states(
@@ -1944,7 +1947,9 @@ def dummy_compute_logits(hidden_states):
                 moe_comm_method=moe_comm_method,
                 num_actual_tokens=0,
                 aclgraph_runtime_mode=aclgraph_runtime_mode,
-                batch_descriptor=batch_descriptor):
+                batch_descriptor=batch_descriptor,
+                prefetch_stream=self.prefetch_stream,
+                prefetch_model=self.model):
             hidden_states = self._generate_dummy_run_hidden_states(
                 with_prefill, is_torchair_compile, input_ids, positions,
                 attn_metadata, num_tokens, intermediate_tensors,
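Passing prefetch_model=self.model only works because the helpers above index prefetch_model.model.layers[i].mlp.gate_up_proj / .down_proj, i.e. they assume a dense Qwen/Llama-style decoder layout. A hedged sketch of a guard one could use before wiring the prefetch up (the function is illustrative, not part of the commit):

```python
# Illustrative guard: check that the model exposes the attribute paths the
# prefetch helpers rely on before passing it as prefetch_model.
def supports_mlp_weight_prefetch(model) -> bool:
    layers = getattr(getattr(model, "model", None), "layers", None)
    if not layers:
        return False
    mlp = getattr(layers[0], "mlp", None)
    return hasattr(mlp, "gate_up_proj") and hasattr(mlp, "down_proj")
```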

vllm_ascend/worker/worker_v1.py

Lines changed: 6 additions & 0 deletions

@@ -55,6 +55,12 @@
 else:
     DraftTokenIds = None

+torch._dynamo.trace_rules.clear_lru_cache()
+from torch._dynamo.variables import TorchInGraphFunctionVariable
+torch_non_c_binding_in_graph_functions_npu = dict.fromkeys(["torch.npu.current_stream"], TorchInGraphFunctionVariable)
+torch_non_c_binding_in_graph_functions_npu["torch.npu.stream"] = TorchInGraphFunctionVariable
+torch._dynamo.trace_rules.torch_name_rule_map.append(torch_non_c_binding_in_graph_functions_npu)
+


 class NPUWorker(WorkerBase):
