diff --git a/.github/workflows/vllm_ascend_multi_nodes_test.yaml b/.github/workflows/vllm_ascend_multi_nodes_test.yaml
new file mode 100644
index 0000000000..7e1dfd4abc
--- /dev/null
+++ b/.github/workflows/vllm_ascend_multi_nodes_test.yaml
@@ -0,0 +1,118 @@
+name: 'e2e test / multi-dp'
+
+on:
+  workflow_dispatch:
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+# only cancel in-progress runs of the same workflow
+# and ignore the lint / 8 cards test type
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    strategy:
+      max-parallel: 2
+      matrix:
+        vllm_version: [main]
+    runs-on: linux-aarch64-a3-0
+    container:
+      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      env:
+        KUBECONFIG: /root/.cache/.kube/kubeconfig.yaml
+        KUBECTLPATH: /root/.cache/.kube/kubectl
+        NAMESPACE: vllm-project
+        MODEL_PATH: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
+    steps:
+      - name: Install system dependencies
+        run: |
+          # configure apt and pip sources
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+          apt-get update -y && apt-get install -y git curl
+          TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
+          git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"
+
+      - name: Install kubectl
+        run: |
+          install -o root -g root -m 0755 $KUBECTLPATH /usr/local/bin/kubectl
+          kubectl version
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Prepare scripts
+        run: |
+          rm -rf /root/.cache/tests
+          mkdir -p /root/.cache/tests
+          cp -r tests/e2e/multi_nodes/* /root/.cache/tests/
+
+      - name: Launch cluster
+        run: |
+          kubectl apply -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml
+
+      - name: Checkout vllm-project/vllm
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: ./vllm-empty
+          ref: ${{ matrix.vllm_version }}
+
+      - name: Install vllm
+        working-directory: vllm-empty
+        run: |
+          pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+          VLLM_TARGET_DEVICE=empty pip install .
+
+      - name: Install benchmark
+        run: |
+          pip install -r benchmarks/requirements-bench.txt
+
+      - name: Wait for all pods ready
+        run: |
+          timeout 7200 bash -c '
+          while true; do
+            # Check if the pod is healthy
+            pod=$(kubectl get pod -l role=leader -n vllm-project -o jsonpath="{.items[0].metadata.name}")
+            status=$(kubectl get pod "$pod" -n vllm-project -o jsonpath="{.status.phase}")
+
+            # If the pod failed, print the error logs and exit
+            if [[ "$status" == "Failed" ]] || [[ "$status" == "CrashLoopBackOff" ]]; then
+              echo "❌ Pod $pod failed with status=$status"
+              echo "---- Pod logs start ----"
+              kubectl logs "$pod" -n vllm-project --previous || kubectl logs "$pod" -n vllm-project
+              echo "---- Pod logs end ----"
+              exit 1
+            fi
+
+            # Check if the service is ready
+            if curl -sf http://vllm-leader:8080/health > /dev/null; then
+              echo "✅ vllm cluster is ready (pod=$pod)"
+              exit 0
+            fi
+
+            echo "⏳ Waiting for vllm server to start... (pod=$pod, status=$status)"
+            sleep 5
+          done
+          '
+
+      - name: Run benchmark
+        run: |
+          python ./vllm-empty/benchmarks/benchmark_serving.py --model $MODEL_PATH \
+            --dataset-name random --random-input-len 128 --random-output-len 128 \
+            --num-prompts 200 --trust-remote-code --base-url "http://vllm-leader:8080" \
+            --request-rate 1
+
+      - name: Post process
+        if: always()
+        run: |
+          kubectl get pods -n $NAMESPACE
+          kubectl delete -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml
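The workflow above runs only on manual dispatch and relies on the in-cluster checks of the "Wait for all pods ready" step. A minimal way to kick it off and follow the leader pod by hand is sketched below; it assumes an authenticated GitHub CLI and that the same kubeconfig the job uses is available locally (branch name is illustrative):

    # trigger the multi-node e2e run
    gh workflow run vllm_ascend_multi_nodes_test.yaml --ref main
    # follow the leader pod created by the "Launch cluster" step
    kubectl logs -f -l role=leader -n vllm-project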
diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt
index 229082311c..3346ad629c 100644
--- a/benchmarks/requirements-bench.txt
+++ b/benchmarks/requirements-bench.txt
@@ -1,4 +1,9 @@
-pandas
+pillow
+numpy>=1.24
+pandas>=2.0.0
+aiohttp>=3.10
+transformers>=4.46
+xlsxwriter>=3.2.1
 datasets
 modelscope
-tabulate
\ No newline at end of file
+tabulate
diff --git a/tests/e2e/multi_nodes/installer.sh b/tests/e2e/multi_nodes/installer.sh
new file mode 100644
index 0000000000..0c87359867
--- /dev/null
+++ b/tests/e2e/multi_nodes/installer.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+set -euo pipefail
+
+export SRC_DIR="$WORKSPACE/source_code"
+
+check_npu_info() {
+    echo "====> Check NPU info"
+    npu-smi info
+    cat "/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/ascend_toolkit_install.info"
+}
+
+check_and_config() {
+    echo "====> Configure mirrors and git proxy"
+    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
+
+    git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
+}
+
+checkout_src() {
+    echo "====> Checkout source code"
+    mkdir -p "$SRC_DIR"
+
+    # vllm-ascend
+    if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
+        git clone --depth 1 https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
+    fi
+
+    # vllm
+    if [ ! -d "$SRC_DIR/vllm" ]; then
+        git clone -b v0.10.1.1 https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
+    fi
+}
+
+install_sys_dependencies() {
+    echo "====> Install system dependencies"
+    apt-get update -y
+
+    DEP_LIST=()
+    while IFS= read -r line; do
+        [[ -n "$line" && ! "$line" =~ ^# ]] && DEP_LIST+=("$line")
+    done < "$SRC_DIR/vllm-ascend/packages.txt"
+
+    apt-get install -y "${DEP_LIST[@]}" gcc g++ cmake libnuma-dev iproute2
+}
+
+install_vllm() {
+    echo "====> Install vllm and vllm-ascend"
+    VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
+    pip install -e "$SRC_DIR/vllm-ascend"
+    pip install modelscope
+}
+
+main() {
+    check_npu_info
+    check_and_config
+    checkout_src
+    install_sys_dependencies
+    install_vllm
+}
+
+main "$@"
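installer.sh runs under set -euo pipefail and derives SRC_DIR from $WORKSPACE, so it aborts immediately if WORKSPACE is not exported. A minimal standalone invocation could look like this (the workspace path is only an example; inside the LWS pods the variable is provided by the pod spec):

    export WORKSPACE=/root/workspace
    bash tests/e2e/multi_nodes/installer.sh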
diff --git a/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh b/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh
new file mode 100644
index 0000000000..4d0c583c22
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# local_ip is the node IP (it can also be obtained through ifconfig);
+# nic_name is the network interface name corresponding to local_ip
+local_ip=$(hostname -I | awk '{print $1}')
+nic_name=eth0
+
+
+export VLLM_USE_MODELSCOPE=True
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export HCCL_BUFFSIZE=1024
+
+SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+bash $SCRIPT_DIR/installer.sh
+
+
+vllm serve $MODEL_PATH \
+--host 0.0.0.0 \
+--port 8080 \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-address $local_ip \
+--data-parallel-rpc-port 13389 \
+--tensor-parallel-size 16 \
+--seed 1024 \
+--enable-expert-parallel \
+--max-num-seqs 16 \
+--max-model-len 32768 \
+--quantization ascend \
+--max-num-batched-tokens 4096 \
+--trust-remote-code \
+--gpu-memory-utilization 0.9 \
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
diff --git a/tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh b/tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh
new file mode 100644
index 0000000000..cf8b7ee24e
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+local_ip=$(hostname -I | awk '{print $1}')
+nic_name=eth0
+
+SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+bash $SCRIPT_DIR/installer.sh
+leader_ip=$(getent hosts $LWS_LEADER_ADDRESS | awk '{print $1}')
+
+export VLLM_USE_MODELSCOPE=True
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export HCCL_BUFFSIZE=1024
+
+vllm serve $MODEL_PATH \
+--headless \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-start-rank 1 \
+--data-parallel-address $leader_ip \
+--data-parallel-rpc-port 13389 \
+--tensor-parallel-size 16 \
+--seed 1024 \
+--quantization ascend \
+--max-num-seqs 16 \
+--max-model-len 32768 \
+--max-num-batched-tokens 4096 \
+--enable-expert-parallel \
+--trust-remote-code \
+--gpu-memory-utilization 0.92 \
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
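Together the two scripts form a single data-parallel group: each node hosts one DP rank (--data-parallel-size 2, --data-parallel-size-local 1) spread across 16 NPUs of tensor parallelism, with the worker joining the leader's RPC port in --headless mode. Once both ranks are up, a quick smoke test against the leader's OpenAI-compatible endpoint could look like the sketch below (the service name matches lws.yaml further down; the prompt is illustrative):

    curl -s http://vllm-leader:8080/v1/models
    curl -s http://vllm-leader:8080/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8", "prompt": "Hello", "max_tokens": 8}'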
diff --git a/tests/e2e/multi_nodes/multi_node_mp/lws.yaml b/tests/e2e/multi_nodes/multi_node_mp/lws.yaml
new file mode 100644
index 0000000000..0922507b84
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_mp/lws.yaml
@@ -0,0 +1,118 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: vllm
+  namespace: vllm-project
+spec:
+  replicas: 1
+  leaderWorkerTemplate:
+    size: 2
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+          - name: vllm-leader
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              - name: WORKSPACE
+                value: /root/workspace
+              - name: MODEL_PATH
+                value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
+            command:
+              - sh
+              - -c
+              - "bash /root/.cache/tests/multi_node_mp/launch_server_leader.sh"
+            resources:
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            ports:
+              - containerPort: 8080
+            # readinessProbe:
+            #   tcpSocket:
+            #     port: 8080
+            #   initialDelaySeconds: 15
+            #   periodSeconds: 10
+            volumeMounts:
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
+          - name: shared-volume
+            persistentVolumeClaim:
+              claimName: nv-action-vllm-benchmarks-v2
+          - name: driver-tools
+            hostPath:
+              path: /usr/local/Ascend/driver/tools
+    workerTemplate:
+      spec:
+        containers:
+          - name: vllm-worker
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              - name: WORKSPACE
+                value: /root/workspace
+              - name: MODEL_PATH
+                value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
+            command:
+              - sh
+              - -c
+              - "bash /root/.cache/tests/multi_node_mp/launch_server_worker.sh"
+            resources:
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            volumeMounts:
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
+          - name: shared-volume
+            persistentVolumeClaim:
+              claimName: nv-action-vllm-benchmarks-v2
+          - name: driver-tools
+            hostPath:
+              path: /usr/local/Ascend/driver/tools
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-leader
+  namespace: vllm-project
+spec:
+  ports:
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    leaderworkerset.sigs.k8s.io/name: vllm
+    role: leader
+  type: ClusterIP
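The manifest creates a two-pod LeaderWorkerSet plus a ClusterIP service in the vllm-project namespace. For debugging outside the CI job, a few commands along these lines are usually enough (assuming the LeaderWorkerSet CRD is installed and kubectl has access to the cluster):

    kubectl get leaderworkerset vllm -n vllm-project
    kubectl get pods -n vllm-project -o wide
    # reach the leader service from a local machine
    kubectl port-forward svc/vllm-leader 8080:8080 -n vllm-project
    curl -s http://127.0.0.1:8080/health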
diff --git a/tests/e2e/multi_nodes/multi_node_ray/launch_server.sh b/tests/e2e/multi_nodes/multi_node_ray/launch_server.sh
new file mode 100644
index 0000000000..e8ad8d3de5
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_ray/launch_server.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+#
+# Helper script to manually start or join a Ray cluster for online serving of vLLM models.
+# This script is first executed on the head node, and then on each worker node with the IP address
+# of the head node.
+#
+# Subcommands:
+#   leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
+#   worker: Starts a worker node that connects to an existing Ray head node.
+#
+# Example usage:
+#   On the head node machine, start the Ray head node process and run a vLLM server.
+#   ./launch_server.sh leader --ray_port=6379 --ray_cluster_size=<cluster_size> [<additional ray start args>] && \
+#   python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
+#
+#   On each worker node, start the Ray worker node process.
+#   ./launch_server.sh worker --ray_address=<head_node_ip> --ray_port=6379 [<additional ray start args>]
+#
+# About Ray:
+#   Ray is an open-source distributed execution framework that simplifies
+#   distributed computing. Learn more:
+#   https://ray.io/
+
+
+subcommand=$1  # Either "leader" or "worker".
+shift          # Remove the subcommand from the argument list.
+
+ray_port=6379            # Port used by the Ray head node.
+ray_init_timeout=300     # Seconds to wait before timing out.
+declare -a start_params  # Parameters forwarded to the underlying 'ray start' command.
+
+# Handle the worker subcommand.
+case "$subcommand" in
+  worker)
+    ray_address=""
+    while [ $# -gt 0 ]; do
+      case "$1" in
+        --ray_address=*)
+          ray_address="${1#*=}"
+          ;;
+        --ray_port=*)
+          ray_port="${1#*=}"
+          ;;
+        --ray_init_timeout=*)
+          ray_init_timeout="${1#*=}"
+          ;;
+        *)
+          start_params+=("$1")
+      esac
+      shift
+    done
+
+    if [ -z "$ray_address" ]; then
+      echo "Error: Missing argument --ray_address"
+      exit 1
+    fi
+
+    # Retry until the worker node connects to the head node or the timeout expires.
+    for (( i=0; i < $ray_init_timeout; i+=5 )); do
+      ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
+      if [ $? -eq 0 ]; then
+        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
+        exit 0
+      fi
+      echo "Waiting until the ray worker is active..."
+      sleep 5s;
+    done
+    echo "Ray worker start timed out, head address: $ray_address:$ray_port"
+    exit 1
+    ;;
+
+  # Handle the leader subcommand.
+  leader)
+    ray_cluster_size=""
+    while [ $# -gt 0 ]; do
+      case "$1" in
+        --ray_port=*)
+          ray_port="${1#*=}"
+          ;;
+        --ray_cluster_size=*)
+          ray_cluster_size="${1#*=}"
+          ;;
+        --ray_init_timeout=*)
+          ray_init_timeout="${1#*=}"
+          ;;
+        *)
+          start_params+=("$1")
+      esac
+      shift
+    done
+
+    if [ -z "$ray_cluster_size" ]; then
+      echo "Error: Missing argument --ray_cluster_size"
+      exit 1
+    fi
+
+    # Start the Ray head node.
+    ray start --head --port=$ray_port "${start_params[@]}"
+
+    # Poll Ray until every worker node is active.
+    for (( i=0; i < $ray_init_timeout; i+=5 )); do
+      active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
+      if [ $active_nodes -eq $ray_cluster_size ]; then
+        echo "All ray workers are active and the ray cluster is initialized successfully."
+        exit 0
+      fi
+      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size are active"
+      sleep 5s;
+    done
+
+    echo "Waiting for all ray workers to be active timed out."
+    exit 1
+    ;;
+
+  *)
+    echo "unknown subcommand: $subcommand"
+    exit 1
+    ;;
+esac
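For the Ray variant, the script is meant to be run once per node before vLLM is started with the Ray executor. A two-node sketch, with placeholder addresses and parallel sizes that are not taken from this patch, might look like:

    # on the head node: start Ray, wait for 2 nodes, then serve across both
    bash tests/e2e/multi_nodes/multi_node_ray/launch_server.sh leader --ray_cluster_size=2
    vllm serve $MODEL_PATH --distributed-executor-backend ray \
        --tensor-parallel-size 16 --pipeline-parallel-size 2 --port 8080

    # on each worker node: join the existing Ray cluster
    bash tests/e2e/multi_nodes/multi_node_ray/launch_server.sh worker --ray_address=<head_node_ip>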