
Commit 617af62

[main] refactor and support in aclgraph
Signed-off-by: rjg-lyh <[email protected]>
1 parent 5ffd8db commit 617af62

File tree: 7 files changed, +135 -87 lines

vllm_ascend/ascend_forward_context.py

Lines changed: 12 additions & 6 deletions

@@ -85,10 +85,6 @@ def set_ascend_forward_context(
 ):
     forward_context = get_forward_context()

-    forward_context.prefetch_stream = prefetch_stream
-    forward_context.prefetch_model = prefetch_model
-    forward_context.prefetch_mlp_up = False
-
     forward_context.moe_comm_method_name = moe_comm_method + "commimpl"
     forward_context.with_prefill = with_prefill
     ep_size = (get_ep_group().world_size if
@@ -112,8 +108,18 @@ def set_ascend_forward_context(
     # due to multiple warmups before actual capturing
     forward_context.capturing = False

-    # set this for rope forward_oot using
-    forward_context.is_first_layer = True
+    # set this for layer index
+    forward_context.layer_idx = 0
+
+    # set for mlp weight prefetch
+    prefetch_mlp_enabled = envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP and \
+        num_tokens is not None and num_tokens < 500
+    if prefetch_mlp_enabled:
+        forward_context.prefetch_stream = prefetch_stream
+        forward_context.prefetch_model = prefetch_model
+        forward_context.prefetch_mlp_gate_up_proj = False
+        forward_context.prefetch_mlp_down_proj = False
+    forward_context.prefetch_mlp_enabled = prefetch_mlp_enabled

     # set for flashcomm_v1
     flashcomm_v1_enabled = envs_ascend.VLLM_ASCEND_ENABLE_FLASHCOMM and \

vllm_ascend/envs.py

Lines changed: 9 additions & 0 deletions

@@ -142,6 +142,15 @@
     # this feature in eager mode will get better performance.
     "VLLM_ASCEND_ENABLE_MLP_OPTIMIZE":
     lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_MLP_OPTIMIZE", '0'))),
+    # Whether to enable MLP weight prefetch, only used in decode.
+    "VLLM_ASCEND_ENABLE_PREFETCH_MLP":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_PREFETCH_MLP", '0'))),
+    # buffer size for gate up prefetch
+    "MLP_GATE_UP_PREFETCH_SIZE":
+    lambda: int(os.getenv("MLP_GATE_UP_PREFETCH_SIZE", 18 * 1024 * 1024)),
+    # buffer size for down proj prefetch
+    "MLP_DOWN_PREFETCH_SIZE":
+    lambda: int(os.getenv("MLP_DOWN_PREFETCH_SIZE", 18 * 1024 * 1024)),
     # Determine the number of physical devices in a non-full-use scenario
     # caused by the initialization of the Mooncake connector.
     "PHYSICAL_DEVICES":

vllm_ascend/ops/activation.py

Lines changed: 2 additions & 26 deletions

@@ -17,7 +17,6 @@

 import torch
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
-from vllm.forward_context import get_forward_context


 class AscendQuickGELU(QuickGELU):
@@ -30,26 +29,6 @@ def forward_oot(self, x: torch.tensor) -> torch.Tensor:


 class AscendSiluAndMul(SiluAndMul):
-    def prefetch_down_proj(self,
-                           dependency: torch.Tensor):
-        import torch_npu
-        forward_context = get_forward_context()
-        prefetch_model = forward_context.prefetch_model
-        prefetch_stream = forward_context.prefetch_stream
-        layer_idx = forward_context.layer_idx
-
-        prefetch_stream.wait_stream(torch.npu.current_stream())
-
-        with torch.npu.stream(prefetch_stream):
-            MLP_DOWN_PREFETCH_SIZE = 6 * 1024 * 1024
-            torch_npu.npu_prefetch(prefetch_model.model.layers[layer_idx].mlp.down_proj.weight, \
-                                   dependency, MLP_DOWN_PREFETCH_SIZE)
-        forward_context.layer_idx += 1
-
-    def wait_prefetch_done(self):
-        forward_context = get_forward_context()
-        prefetch_stream = forward_context.prefetch_stream
-        torch.npu.current_stream().wait_stream(prefetch_stream)

     def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
         import torch_npu
@@ -59,10 +38,7 @@ def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
         if is_310p():
             out = torch_npu.npu_swiglu(x.to(torch.float32)).to(torch.float16)
         else:
-            dependency = x
-            self.prefetch_down_proj(dependency)
-
+            torch.ops.vllm.maybe_prefetch_mlp_down_proj(x)
             out = torch_npu.npu_swiglu(x)
-
-            self.wait_prefetch_done()
+            torch.ops.vllm.maybe_wait_prefetch_done(out)
         return out

vllm_ascend/ops/flashcomm_gate_ops.py

Lines changed: 99 additions & 0 deletions

@@ -1,12 +1,14 @@
 import torch
 import torch.nn.functional as F
+import torch_npu
 from vllm.utils import direct_register_custom_op
 from vllm.distributed import (tensor_model_parallel_all_gather,
                               tensor_model_parallel_reduce_scatter,
                               tensor_model_parallel_all_reduce,
                               get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.forward_context import get_forward_context
+import vllm_ascend.envs as envs_ascend


 def _maybe_chunk_residual_impl(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
@@ -16,6 +18,9 @@ def _maybe_chunk_residual_impl(x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
             "Currently, this situation only occurs "
            "when flashcomm_v1 is enabled"
         )
+        pad_size = get_forward_context().pad_size
+        if pad_size > 0:
+            residual = F.pad(residual, (0, 0, 0, pad_size))
         tp_size = get_tensor_model_parallel_world_size()
         tp_rank = get_tensor_model_parallel_rank()
         residual = torch.chunk(residual, tp_size, dim=0)[tp_rank]
@@ -44,6 +49,73 @@ def _maybe_pad_and_reduce_impl(x: torch.Tensor) -> torch.Tensor:
     return tensor_model_parallel_all_reduce(x)


+def _maybe_prefetch_mlp_gate_up_proj_impl(x_dependency: torch.Tensor, prefix: str) -> None:
+    forward_context = get_forward_context()
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    prefetch_model = forward_context.prefetch_model
+    prefetch_stream = forward_context.prefetch_stream
+    layer_idx = int(prefix.split('.')[2])
+
+    # start point of gate_up_proj weight prefetch
+    if prefix.split('.')[-2] == "self_attn":
+        forward_context.prefetch_mlp_gate_up_proj = True
+    if forward_context.prefetch_mlp_gate_up_proj:
+        prefetch_stream.wait_stream(torch.npu.current_stream())
+
+        with torch.npu.stream(prefetch_stream):
+            MLP_GATE_UP_PREFETCH_SIZE = envs_ascend.MLP_GATE_UP_PREFETCH_SIZE
+            torch_npu.npu_prefetch(prefetch_model.model.layers[layer_idx].mlp.gate_up_proj.weight, \
+                                   x_dependency, MLP_GATE_UP_PREFETCH_SIZE)
+    return
+
+
+def _maybe_prefetch_mlp_gate_up_proj_impl_fake(x_dependency: torch.Tensor, prefix: str) -> None:
+    return
+
+
+def _maybe_prefetch_mlp_down_proj_impl(x_dependency: torch.Tensor) -> None:
+    forward_context = get_forward_context()
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    forward_context.prefetch_mlp_down_proj = True
+    prefetch_model = forward_context.prefetch_model
+    prefetch_stream = forward_context.prefetch_stream
+    layer_idx = forward_context.layer_idx
+
+    # start point of down_proj weight prefetch
+    prefetch_stream.wait_stream(torch.npu.current_stream())
+
+    with torch.npu.stream(prefetch_stream):
+        MLP_DOWN_PREFETCH_SIZE = envs_ascend.MLP_DOWN_PREFETCH_SIZE
+        torch_npu.npu_prefetch(prefetch_model.model.layers[layer_idx].mlp.down_proj.weight, \
+                               x_dependency, MLP_DOWN_PREFETCH_SIZE)
+    forward_context.layer_idx += 1
+    return
+
+
+def _maybe_prefetch_mlp_down_proj_impl_fake(x_dependency: torch.Tensor) -> None:
+    return
+
+
+def _maybe_wait_prefetch_done_impl(x: torch.Tensor) -> None:
+    forward_context = get_forward_context()
+    if not forward_context.prefetch_mlp_enabled:
+        return
+    if forward_context.prefetch_mlp_gate_up_proj or \
+       forward_context.prefetch_mlp_down_proj:
+        prefetch_stream = get_forward_context().prefetch_stream
+        # wait until prefetch done
+        torch.npu.current_stream().wait_stream(prefetch_stream)
+        forward_context.prefetch_mlp_gate_up_proj = False
+        forward_context.prefetch_mlp_down_proj = False
+    return
+
+
+def _maybe_wait_prefetch_done_impl_fake(x: torch.Tensor) -> None:
+    return
+
+
 direct_register_custom_op(
     op_name="maybe_chunk_residual",
     op_func=_maybe_chunk_residual_impl,
@@ -69,3 +141,30 @@ def _maybe_pad_and_reduce_impl(x: torch.Tensor) -> torch.Tensor:
     mutates_args=[],
     dispatch_key="PrivateUse1"
 )
+
+
+direct_register_custom_op(
+    op_name="maybe_prefetch_mlp_gate_up_proj",
+    op_func=_maybe_prefetch_mlp_gate_up_proj_impl,
+    fake_impl=_maybe_prefetch_mlp_gate_up_proj_impl_fake,
+    mutates_args=[],
+    dispatch_key="PrivateUse1"
+)
+
+
+direct_register_custom_op(
+    op_name="maybe_prefetch_mlp_down_proj",
+    op_func=_maybe_prefetch_mlp_down_proj_impl,
+    fake_impl=_maybe_prefetch_mlp_down_proj_impl_fake,
+    mutates_args=[],
+    dispatch_key="PrivateUse1"
+)
+
+
+direct_register_custom_op(
+    op_name="maybe_wait_prefetch_done",
+    op_func=_maybe_wait_prefetch_done_impl,
+    fake_impl=_maybe_wait_prefetch_done_impl_fake,
+    mutates_args=[],
+    dispatch_key="PrivateUse1"
+)
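
Note: the prefetch hooks are registered as vLLM custom ops so they stay traceable under aclgraph capture, and each one is a no-op unless prefetch_mlp_enabled is set on the forward context. A rough sketch of the intended call pattern (the layer callables below are placeholders for illustration, not the real vLLM modules):

    import torch

    def mlp_block_sketch(attn_out: torch.Tensor, gate_up_proj, act_fn, down_proj, prefix: str):
        # issued from the attention output projection: start fetching gate_up_proj weights
        torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(attn_out, prefix)
        h = gate_up_proj(attn_out)
        # before the activation: start fetching down_proj weights
        torch.ops.vllm.maybe_prefetch_mlp_down_proj(h)
        h = act_fn(h)
        # block the compute stream until the prefetch stream is done
        torch.ops.vllm.maybe_wait_prefetch_done(h)
        return down_proj(h)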

vllm_ascend/ops/layernorm.py

Lines changed: 2 additions & 14 deletions

@@ -36,12 +36,6 @@ def __init__(
         super().__init__(hidden_size, eps, var_hidden_size, has_weight, dtype)
         self.layer = layer

-    def wait_prefetch_done(self):
-        forward_context = get_forward_context()
-        prefetch_stream = forward_context.prefetch_stream
-        # wait until
-        torch.npu.current_stream().wait_stream(prefetch_stream)
-
     def forward(
         self,
         x: torch.Tensor,
@@ -59,18 +53,11 @@ def forward(
                 self.layer.aclnn_input_scale,
                 self.layer.aclnn_input_offset,
                 epsilon=self.variance_epsilon)
-
-            if forward_context.prefetch_mlp_up:
-                self.wait_prefetch_done()
-
+            torch.ops.vllm.maybe_wait_prefetch_done(x)
             return x, residual

         x, residual = torch_npu.npu_rms_norm(x, self.weight,
                                              self.variance_epsilon)
-
-        forward_context = get_forward_context()
-        if forward_context.prefetch_mlp_up:
-            self.wait_prefetch_done()
         return x

@@ -96,6 +83,7 @@ def forward_oot(
         else:
             x, _, residual = torch_npu.npu_add_rms_norm(
                 x, residual, self.weight, self.variance_epsilon)
+            torch.ops.vllm.maybe_wait_prefetch_done(x)
             return x, residual

         x, residual = torch_npu.npu_rms_norm(x, self.weight,

vllm_ascend/ops/linear.py

Lines changed: 2 additions & 37 deletions

@@ -24,22 +24,17 @@
                               split_tensor_along_last_dim,
                               tensor_model_parallel_all_gather,
                               tensor_model_parallel_all_reduce)
-from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.quantization.base_config import \
     QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs

 from vllm_ascend.distributed.parallel_state import (
     get_mlp_tensor_model_parallel_rank,
     get_mlp_tensor_model_parallel_world_size, get_mlp_tp_group)
-from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
-from vllm_ascend.utils import (all_gather_and_maybe_unpad,
-                               maybe_pad_and_reduce_scatter)

 from vllm.model_executor.layers.linear import (  # isort: skip
     WEIGHT_LOADER_V2_SUPPORTED, ColumnParallelLinear, LinearBase,
-    MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear,
-    UnquantizedLinearMethod)
+    MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)


 class AscendMlpColumnParallelLinear(ColumnParallelLinear):
@@ -381,33 +376,6 @@ class AscendDenseRowParallelLinear(RowParallelLinear):
     communication-computation fusion.
     """

-    def prefetch_gate_up_proj(self,
-                              dependency: torch.Tensor):
-        # get prefetch model
-        forward_context = get_forward_context()
-        layer_num = int(self.prefix.split('.')[2])
-        prefetch_model = forward_context.prefetch_model
-        prefetch_stream = forward_context.prefetch_stream
-
-        # start point of weight prefetch
-        forward_context.prefetch_mlp_up = True if self.prefix.split('.')[-2] == 'self_attn' else False
-        if forward_context.prefetch_mlp_up:
-            prefetch_stream.wait_stream(torch.npu.current_stream())
-
-            with torch.npu.stream(prefetch_stream):
-                # For Qwen3-32B
-                MLP_GATE_UP_PREFETCH_SIZE = 50 * 1024 * 1024
-                torch_npu.npu_prefetch(prefetch_model.model.layers[layer_num].mlp.gate_up_proj.weight, \
-                                       dependency, MLP_GATE_UP_PREFETCH_SIZE)
-
-
-    def wait_prefetch_done(self):
-        forward_context = get_forward_context()
-        if forward_context.prefetch_mlp_up:
-            prefetch_stream = forward_context.prefetch_stream
-            # wait until reduce-scatter is done
-            torch.npu.current_stream().wait_stream(prefetch_stream)
-
     def forward(
         self, input_: torch.Tensor
     ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
@@ -431,11 +399,8 @@ def forward(
         output_parallel = self.quant_method.apply(self,
                                                   input_parallel,
                                                   bias=bias_)
-        dependency = output_parallel
-
-        self.prefetch_gate_up_proj(dependency)
-
         output = torch.ops.vllm.maybe_pad_and_reduce(output_parallel)
+        torch.ops.vllm.maybe_prefetch_mlp_gate_up_proj(output, self.prefix)

         output_bias = self.bias if self.skip_bias_add else None

vllm_ascend/worker/model_runner_v1.py

Lines changed: 9 additions & 4 deletions

@@ -37,6 +37,7 @@
 from vllm.compilation.counter import compilation_counter
 from vllm.compilation.monitor import set_cudagraph_capturing_enabled
 from vllm.config import CompilationLevel, CUDAGraphMode, VllmConfig
+from vllm.distributed import tensor_model_parallel_all_gather
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group)
 from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1
@@ -181,7 +182,10 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.dp_size = vllm_config.parallel_config.data_parallel_size
         self.dp_rank = vllm_config.parallel_config.data_parallel_rank
         self.device = device
-        self.prefetch_stream = torch.npu.Stream(device=device)
+        if envs_ascend.VLLM_ASCEND_ENABLE_PREFETCH_MLP:
+            self.prefetch_stream = torch.npu.Stream(device=device)
+        else:
+            self.prefetch_stream = None
         self.dtype = self.model_config.dtype
         if envs_ascend.VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION:
             # TODO: drop the env config to use ascend sampler by default
@@ -1145,9 +1149,10 @@ def _generate_process_reqs_hidden_states(self, attn_metadata, with_prefill,
             inputs_embeds=inputs_embeds,
         )
         if get_forward_context().flashcomm_v1_enabled:
-            from vllm_ascend.utils import all_gather_and_maybe_unpad
-            hidden_states = all_gather_and_maybe_unpad(
-                hidden_states, get_forward_context().pad_size, dim=0)
+            hidden_states = tensor_model_parallel_all_gather(hidden_states, 0)
+            pad_size = get_forward_context().pad_size
+            if pad_size > 0:
+                hidden_states = hidden_states[:-pad_size, :]
         return hidden_states

     def _build_attn_state(self, num_reqs, num_scheduled_tokens,
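
Note: a toy shape check (made-up sizes, standalone) of the flashcomm_v1 pad/unpad round-trip this change relies on: the residual is padded to a multiple of the TP world size before per-rank chunking (see _maybe_chunk_residual_impl above), and the gathered hidden states are trimmed by the same pad_size here:

    import torch
    import torch.nn.functional as F

    tp_size, num_tokens, hidden = 4, 10, 8
    pad_size = (-num_tokens) % tp_size                 # 2 extra rows to reach a multiple of tp_size
    residual = torch.randn(num_tokens, hidden)
    padded = F.pad(residual, (0, 0, 0, pad_size))      # pad rows, as _maybe_chunk_residual_impl does
    per_rank = torch.chunk(padded, tp_size, dim=0)[0]  # each rank keeps an equal chunk

    gathered = torch.cat([per_rank] * tp_size, dim=0)  # stand-in for tensor_model_parallel_all_gather
    unpadded = gathered[:-pad_size, :] if pad_size > 0 else gathered
    assert unpadded.shape == residual.shape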
