118 changes: 118 additions & 0 deletions .github/workflows/vllm_ascend_multi_nodes_test.yaml
@@ -0,0 +1,118 @@
name: 'e2e test / multi-dp'

on:
workflow_dispatch:

# Bash shells do not source ~/.profile or ~/.bashrc by default, so steps that rely on the
# ascend-toolkit environment variables need to explicitly declare "shell: bash -el {0}"
# to run in a login shell that activates them.
defaults:
run:
shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
e2e:
strategy:
max-parallel: 2
matrix:
vllm_version: [main]
runs-on: linux-aarch64-a3-0

Check failure on line 25 in .github/workflows/vllm_ascend_multi_nodes_test.yaml

GitHub Actions / lint / pre-commit

label "linux-aarch64-a3-0" is unknown. available labels are "windows-latest", "windows-latest-8-cores", "windows-2025", "windows-2022", "windows-2019", "ubuntu-latest", "ubuntu-latest-4-cores", "ubuntu-latest-8-cores", "ubuntu-latest-16-cores", "ubuntu-24.04", "ubuntu-24.04-arm", "ubuntu-22.04", "ubuntu-22.04-arm", "ubuntu-20.04", "macos-latest", "macos-latest-xl", "macos-latest-xlarge", "macos-latest-large", "macos-15-xlarge", "macos-15-large", "macos-15", "macos-14-xl", "macos-14-xlarge", "macos-14-large", "macos-14", "macos-13-xl", "macos-13-xlarge", "macos-13-large", "macos-13", "self-hosted", "x64", "arm", "arm64", "linux", "macos", "windows", "linux-aarch64-a2-0", "linux-aarch64-a2-1", "linux-aarch64-a2-2", "linux-aarch64-a2-4", "linux-aarch64-a2-8", "linux-arm64-npu-static-8", "linux-aarch64-310p-1", "linux-aarch64-310p-2", "linux-aarch64-310p-4", "ubuntu-24.04-arm", "linux-aarch64-a3-1", "linux-aarch64-a3-2", "linux-aarch64-a3-4", "linux-aarch64-a3-8". if it is a custom label for self-hosted runner, set list of labels in actionlint.yaml config file
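
If linux-aarch64-a3-0 is a valid label for the project's self-hosted runners, one way to clear this lint failure is to declare the label in the repository's actionlint config. A minimal sketch, assuming the config lives at the conventional .github/actionlint.yaml path:

self-hosted-runner:
  # labels of self-hosted runners that actionlint should accept in runs-on
  labels:
    - linux-aarch64-a3-0
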
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
KUBECONFIG: /root/.cache/.kube/kubeconfig.yaml
KUBECTLPATH: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
MODEL_PATH: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
steps:
- name: Install system dependencies
run: |
# configure apt and pip source
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

apt-get update -y && apt-get install -y git curl
TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"

- name: Install kubectl
run: |
install -o root -g root -m 0755 $KUBECTLPATH /usr/local/bin/kubectl
kubectl version

- name: Checkout code
uses: actions/checkout@v4

- name: Prepare scripts
run: |
rm -rf /root/.cache/tests
mkdir -p /root/.cache/tests
cp -r tests/e2e/multi_nodes/* /root/.cache/tests/

- name: Launch cluster
run: |
kubectl apply -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml

- name: Checkout vllm-project/vllm
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: ./vllm-empty
ref: ${{ matrix.vllm_version }}

- name: Install vllm
working-directory: vllm-empty
run: |
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
VLLM_TARGET_DEVICE=empty pip install .

- name: Install benchmark
run: |
pip install -r benchmarks/requirements-bench.txt

- name: Wait for all pods ready
run: |

Check failure on line 80 in .github/workflows/vllm_ascend_multi_nodes_test.yaml

GitHub Actions / lint / pre-commit

shellcheck reported issue in this script: SC2016:info:1:22: Expressions don't expand in single quotes, use double quotes for that
timeout 7200 bash -c '
while true; do
# Check if the pod is healthy
pod=$(kubectl get pod -l role=leader -n vllm-project -o jsonpath="{.items[0].metadata.name}")
status=$(kubectl get pod "$pod" -n vllm-project -o jsonpath="{.status.phase}")

# If pod failed, print the error logs and exit
if [[ "$status" == "Failed" ]] || [[ "$status" == "CrashLoopBackOff" ]]; then
echo "❌ Pod $pod failed with status=$status"
echo "---- Pod logs start ----"
kubectl logs "$pod" -n vllm-project --previous || kubectl logs "$pod" -n vllm-project
echo "---- Pod logs end ----"
exit 1
fi

# Check if service is ready
if curl -sf http://vllm-leader:8080/health > /dev/null; then
echo "✅ vllm cluster is ready (pod=$pod)"
exit 0
fi

echo "⏳ Waiting for vllm server to start... (pod=$pod, status=$status)"
sleep 5
done
'
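
The SC2016 finding above appears to be a false positive in this case: the single quotes are what keep $pod and the $(kubectl ...) substitutions from being expanded once by the outer shell of the run step, so they expand inside the inner bash -c loop on every iteration, as intended. A minimal illustration of the difference, using hypothetical commands that are not part of the workflow:

# Single quotes: the inner shell performs the expansion each time it runs.
bash -c 'echo "inner shell sees: $(date)"'
# Double quotes: the outer shell expands $(date) once, before bash -c even starts.
bash -c "echo \"outer shell already expanded: $(date)\""
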

- name: Run benchmark
run: |
python ./vllm-empty/benchmarks/benchmark_serving.py --model $MODEL_PATH \
--dataset-name random --random-input-len 128 --random-output-len 128 \
--num-prompts 200 --trust-remote-code --base-url "http://vllm-leader:8080" \
--request-rate 1

- name: Post process
if: always()
run: |
kubectl get pods -n $NAMESPACE
kubectl delete -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml
9 changes: 7 additions & 2 deletions benchmarks/requirements-bench.txt
@@ -1,4 +1,9 @@
pandas
pillow
numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1
datasets
modelscope
tabulate
tabulate
62 changes: 62 additions & 0 deletions tests/e2e/multi_nodes/installer.sh
@@ -0,0 +1,62 @@
#!/bin/bash
set -euo pipefail

export SRC_DIR="$WORKSPACE/source_code"

check_npu_info() {
echo "====> Check NPU info"
npu-smi info
cat "/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/ascend_toolkit_install.info"
}

check_and_config() {
echo "====> Configure mirrors and git proxy"
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi

git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
}

checkout_src() {
echo "====> Checkout source code"
mkdir -p "$SRC_DIR"

# vllm-ascend
if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
git clone --depth 1 https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
fi

# vllm
if [ ! -d "$SRC_DIR/vllm" ]; then
git clone -b v0.10.1.1 https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
fi
}
Comment on lines +20 to +33
Contributor

critical

The checkout_src function clones vllm-ascend from its main repository. When running in a CI environment for a pull request, this will ignore the changes from the PR and test against the main branch instead. The CI system should be responsible for checking out the correct version of the code, and these scripts should use that version. This defeats the purpose of running CI on pull requests.
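
A minimal sketch of one way to address this, assuming the CI exports a VLLM_ASCEND_LOCAL_SRC variable (a hypothetical name) that points at the PR checkout already present in the workspace:

checkout_src() {
    echo "====> Checkout source code"
    mkdir -p "$SRC_DIR"

    # Prefer a source tree supplied by the CI (e.g. the PR checkout); only fall
    # back to cloning the main branch when nothing was provided.
    if [ -n "${VLLM_ASCEND_LOCAL_SRC:-}" ] && [ -d "$VLLM_ASCEND_LOCAL_SRC" ]; then
        cp -r "$VLLM_ASCEND_LOCAL_SRC" "$SRC_DIR/vllm-ascend"
    elif [ ! -d "$SRC_DIR/vllm-ascend" ]; then
        git clone --depth 1 https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
    fi

    # vllm stays pinned to the release tag either way.
    if [ ! -d "$SRC_DIR/vllm" ]; then
        git clone -b v0.10.1.1 https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
    fi
}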


install_sys_dependencies() {
echo "====> Install system dependencies"
apt-get update -y

DEP_LIST=()
while IFS= read -r line; do
[[ -n "$line" && ! "$line" =~ ^# ]] && DEP_LIST+=("$line")
done < "$SRC_DIR/vllm-ascend/packages.txt"

apt-get install -y "${DEP_LIST[@]}" gcc g++ cmake libnuma-dev iproute2
}

install_vllm() {
echo "====> Install vllm and vllm-ascend"
VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
pip install -e "$SRC_DIR/vllm-ascend"
pip install modelscope
}

main() {
check_npu_info
check_and_config
checkout_src
install_sys_dependencies
install_vllm
}

main "$@"
38 changes: 38 additions & 0 deletions tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh
@@ -0,0 +1,38 @@
#!/bin/bash

# local_ip is the first address reported by hostname -I (the same address ifconfig shows)
# nic_name is the network interface name corresponding to local_ip
local_ip=$(hostname -I | awk '{print $1}')
nic_name=eth0
Contributor

high

The network interface name nic_name is hardcoded to eth0. This is not guaranteed to be correct in all environments, including Kubernetes pods, which can lead to script failures if the interface has a different name. It's better to determine the interface name dynamically.

Suggested change
nic_name=eth0
nic_name=$(ip -o addr show | awk -v ip="$local_ip" '/inet / && $4 ~ ip {print $2}')



export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024

SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
bash $SCRIPT_DIR/installer.sh


vllm serve $MODEL_PATH \
--host 0.0.0.0 \
--port 8080 \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-address $local_ip \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 16 \
--seed 1024 \
--enable-expert-parallel \
--max-num-seqs 16 \
--max-model-len 32768 \
--quantization ascend \
--max-num-batched-tokens 4096 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--additional-config '{"torchair_graph_config":{"enabled":true}}'
36 changes: 36 additions & 0 deletions tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh
@@ -0,0 +1,36 @@
#!/bin/bash

local_ip=$(hostname -I | awk '{print $1}')
nic_name=eth0
Contributor

high

The network interface name nic_name is hardcoded to eth0. This is brittle and will fail in environments where the interface has a different name. The interface name should be determined dynamically based on the local IP address.

Suggested change
nic_name=eth0
nic_name=$(ip -o addr show | awk -v ip="$local_ip" '/inet / && $4 ~ ip {print $2}')


SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
bash $SCRIPT_DIR/installer.sh
leader_ip=$(getent hosts $LWS_LEADER_ADDRESS | awk '{print $1}')

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024

vllm serve $MODEL_PATH \
--headless \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-start-rank 1 \
--data-parallel-address $leader_ip \
--data-parallel-rpc-port 13389 \
--tensor-parallel-size 16 \
--seed 1024 \
--quantization ascend \
--max-num-seqs 16 \
--max-model-len 32768 \
--max-num-batched-tokens 4096 \
--enable-expert-parallel \
--trust-remote-code \
--gpu-memory-utilization 0.92 \
--additional-config '{"torchair_graph_config":{"enabled":true}}'
118 changes: 118 additions & 0 deletions tests/e2e/multi_nodes/multi_node_mp/lws.yaml
@@ -0,0 +1,118 @@
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
namespace: vllm-project
spec:
replicas: 1
leaderWorkerTemplate:
size: 2
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- name: vllm-leader
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
- name: WORKSPACE
value: /root/workspace
- name: MODEL_PATH
value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
command:
- sh
- -c
- "bash /root/.cache/tests/multi_node_mp/launch_server_leader.sh"
Comment on lines +24 to +27
Contributor

critical

The container's command executes a script from /root/.cache, which appears to be a path on a persistent volume. This is a critical flaw in a CI setup, as it means the CI is running a cached version of the script, not the one from the pull request. Changes to the launch scripts in a PR will not be tested. The same issue exists for the worker template (lines 72-75). The command should execute the script from the CI workspace where the PR's source code is checked out.

            command:
              - sh
              - -c
              - "bash $WORKSPACE/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh"

resources:
limits:
huawei.com/ascend-1980: "16"
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: "16"
ephemeral-storage: 100Gi
cpu: 125
ports:
- containerPort: 8080
# readinessProbe:
# tcpSocket:
# port: 8080
# initialDelaySeconds: 15
# periodSeconds: 10
Comment on lines +39 to +43
Contributor

high

The readinessProbe for the leader container is commented out. Without it, Kubernetes cannot determine when the server is ready to accept requests. This can lead to flaky CI jobs if subsequent steps try to connect to the server before it's fully initialized. A readiness probe is essential for a reliable service.

            readinessProbe:
              tcpSocket:
                port: 8080
              initialDelaySeconds: 60
              periodSeconds: 10
              failureThreshold: 30

volumeMounts:
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: nv-action-vllm-benchmarks-v2
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
workerTemplate:
spec:
containers:
- name: vllm-worker
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
- name: WORKSPACE
value: /root/workspace
- name: MODEL_PATH
value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
command:
- sh
- -c
- "bash /root/.cache/tests/multi_node_mp/launch_server_worker.sh"
resources:
limits:
huawei.com/ascend-1980: "16"
memory: 512Gi
ephemeral-storage: 100Gi
requests:
huawei.com/ascend-1980: "16"
ephemeral-storage: 100Gi
cpu: 125
volumeMounts:
- mountPath: /root/.cache
name: shared-volume
- mountPath: /usr/local/Ascend/driver/tools
name: driver-tools
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
- name: shared-volume
persistentVolumeClaim:
claimName: nv-action-vllm-benchmarks-v2
- name: driver-tools
hostPath:
path: /usr/local/Ascend/driver/tools
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
namespace: vllm-project
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP