diff --git a/.github/workflows/vllm_ascend_multi_nodes_test.yaml b/.github/workflows/vllm_ascend_multi_nodes_test.yaml
new file mode 100644
index 0000000000..7e1dfd4abc
--- /dev/null
+++ b/.github/workflows/vllm_ascend_multi_nodes_test.yaml
@@ -0,0 +1,118 @@
+name: 'e2e test / multi-dp'
+
+on:
+  workflow_dispatch:
+
+# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
+# declared as "shell: bash -el {0}" on steps that need to be properly activated.
+# It's used to activate ascend-toolkit environment variables.
+defaults:
+  run:
+    shell: bash -el {0}
+
+# only cancel in-progress runs of the same workflow
+# and ignore the lint / 8 cards test type
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  e2e:
+    strategy:
+      max-parallel: 2
+      matrix:
+        vllm_version: [main]
+    runs-on: linux-aarch64-a3-0
+    container:
+      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+      env:
+        KUBECONFIG: /root/.cache/.kube/kubeconfig.yaml
+        KUBECTLPATH: /root/.cache/.kube/kubectl
+        NAMESPACE: vllm-project
+        MODEL_PATH: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
+    steps:
+      - name: Install system dependencies
+        run: |
+          # configure apt and pip sources
+          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
+          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+
+          apt-get update -y && apt-get install -y git curl
+          TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
+          git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"
+
+      - name: Install kubectl
+        run: |
+          install -o root -g root -m 0755 $KUBECTLPATH /usr/local/bin/kubectl
+          kubectl version
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Prepare scripts
+        run: |
+          rm -rf /root/.cache/tests
+          mkdir -p /root/.cache/tests
+          cp -r tests/e2e/multi_nodes/* /root/.cache/tests/
+
+      - name: Launch cluster
+        run: |
+          kubectl apply -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml
+
+      - name: Checkout vllm-project/vllm
+        uses: actions/checkout@v4
+        with:
+          repository: vllm-project/vllm
+          path: ./vllm-empty
+          ref: ${{ matrix.vllm_version }}
+
+      - name: Install vllm
+        working-directory: vllm-empty
+        run: |
+          pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
+          VLLM_TARGET_DEVICE=empty pip install .
+
+      - name: Install benchmark
+        run: |
+          pip install -r benchmarks/requirements-bench.txt
+
+      - name: Wait for all pods ready
+        run: |
+          timeout 7200 bash -c '
+          while true; do
+            # Check if the pod is healthy
+            pod=$(kubectl get pod -l role=leader -n vllm-project -o jsonpath="{.items[0].metadata.name}")
+            status=$(kubectl get pod "$pod" -n vllm-project -o jsonpath="{.status.phase}")
+
+            # If the pod failed, print the error logs and exit
+            if [[ "$status" == "Failed" ]] || [[ "$status" == "CrashLoopBackOff" ]]; then
+              echo "❌ Pod $pod failed with status=$status"
+              echo "---- Pod logs start ----"
+              kubectl logs "$pod" -n vllm-project --previous || kubectl logs "$pod" -n vllm-project
+              echo "---- Pod logs end ----"
+              exit 1
+            fi
+
+            # Check if the service is ready
+            if curl -sf http://vllm-leader:8080/health > /dev/null; then
+              echo "✅ vllm cluster is ready (pod=$pod)"
+              exit 0
+            fi
+
+            echo "⏳ Waiting for vllm server to start... (pod=$pod, status=$status)"
+            sleep 5
+          done
+          '
+
+      - name: Run benchmark
+        run: |
+          python ./vllm-empty/benchmarks/benchmark_serving.py --model $MODEL_PATH \
+            --dataset-name random --random-input-len 128 --random-output-len 128 \
+            --num-prompts 200 --trust-remote-code --base-url "http://vllm-leader:8080" \
+            --request-rate 1
+
+      - name: Post process
+        if: always()
+        run: |
+          kubectl get pods -n $NAMESPACE
+          kubectl delete -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml
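The workflow above runs only on manual dispatch and relies on the in-cluster checks of the "Wait for all pods ready" step. A minimal way to kick it off and follow the leader pod by hand is sketched below; it assumes an authenticated GitHub CLI and that the same kubeconfig the job uses is available locally (branch name is illustrative):

    # trigger the multi-node e2e run
    gh workflow run vllm_ascend_multi_nodes_test.yaml --ref main
    # follow the leader pod created by the "Launch cluster" step
    kubectl logs -f -l role=leader -n vllm-project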
diff --git a/benchmarks/requirements-bench.txt b/benchmarks/requirements-bench.txt
index 229082311c..3346ad629c 100644
--- a/benchmarks/requirements-bench.txt
+++ b/benchmarks/requirements-bench.txt
@@ -1,4 +1,9 @@
-pandas
+pillow
+numpy>=1.24
+pandas>=2.0.0
+aiohttp>=3.10
+transformers>=4.46
+xlsxwriter>=3.2.1
 datasets
 modelscope
-tabulate
\ No newline at end of file
+tabulate
diff --git a/tests/e2e/multi_nodes/installer.sh b/tests/e2e/multi_nodes/installer.sh
new file mode 100644
index 0000000000..0c87359867
--- /dev/null
+++ b/tests/e2e/multi_nodes/installer.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+set -euo pipefail
+
+export SRC_DIR="$WORKSPACE/source_code"
+
+check_npu_info() {
+    echo "====> Check NPU info"
+    npu-smi info
+    cat "/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/ascend_toolkit_install.info"
+}
+
+check_and_config() {
+    echo "====> Configure mirrors and git proxy"
+    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
+
+    git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
+}
+
+checkout_src() {
+    echo "====> Checkout source code"
+    mkdir -p "$SRC_DIR"
+
+    # vllm-ascend
+    if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
+        git clone --depth 1 https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
+    fi
+
+    # vllm
+    if [ ! -d "$SRC_DIR/vllm" ]; then
+        git clone -b v0.10.1.1 https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
+    fi
+}
+
+install_sys_dependencies() {
+    echo "====> Install system dependencies"
+    apt-get update -y
+
+    DEP_LIST=()
+    while IFS= read -r line; do
+        [[ -n "$line" && ! "$line" =~ ^# ]] && DEP_LIST+=("$line")
+    done < "$SRC_DIR/vllm-ascend/packages.txt"
+
+    apt-get install -y "${DEP_LIST[@]}" gcc g++ cmake libnuma-dev iproute2
+}
+
+install_vllm() {
+    echo "====> Install vllm and vllm-ascend"
+    VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
+    pip install -e "$SRC_DIR/vllm-ascend"
+    pip install modelscope
+}
+
+main() {
+    check_npu_info
+    check_and_config
+    checkout_src
+    install_sys_dependencies
+    install_vllm
+}
+
+main "$@"
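installer.sh runs under set -euo pipefail and derives SRC_DIR from $WORKSPACE, so it aborts immediately if WORKSPACE is not exported. A minimal standalone invocation could look like this (the workspace path is only an example; inside the LWS pods the variable is provided by the pod spec):

    export WORKSPACE=/root/workspace
    bash tests/e2e/multi_nodes/installer.sh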
diff --git a/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh b/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh
new file mode 100644
index 0000000000..4d0c583c22
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# local_ip is the node IP (it can also be obtained through ifconfig);
+# nic_name is the network interface name corresponding to local_ip
+local_ip=$(hostname -I | awk '{print $1}')
+nic_name=eth0
+
+
+export VLLM_USE_MODELSCOPE=True
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export HCCL_BUFFSIZE=1024
+
+SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+bash $SCRIPT_DIR/installer.sh
+
+
+vllm serve $MODEL_PATH \
+--host 0.0.0.0 \
+--port 8080 \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-address $local_ip \
+--data-parallel-rpc-port 13389 \
+--tensor-parallel-size 16 \
+--seed 1024 \
+--enable-expert-parallel \
+--max-num-seqs 16 \
+--max-model-len 32768 \
+--quantization ascend \
+--max-num-batched-tokens 4096 \
+--trust-remote-code \
+--gpu-memory-utilization 0.9 \
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
diff --git a/tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh b/tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh
new file mode 100644
index 0000000000..cf8b7ee24e
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+
+local_ip=$(hostname -I | awk '{print $1}')
+nic_name=eth0
+
+SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+bash $SCRIPT_DIR/installer.sh
+leader_ip=$(getent hosts $LWS_LEADER_ADDRESS | awk '{print $1}')
+
+export VLLM_USE_MODELSCOPE=True
+export HCCL_IF_IP=$local_ip
+export GLOO_SOCKET_IFNAME=$nic_name
+export TP_SOCKET_IFNAME=$nic_name
+export HCCL_SOCKET_IFNAME=$nic_name
+export OMP_PROC_BIND=false
+export OMP_NUM_THREADS=100
+export VLLM_USE_V1=1
+export HCCL_BUFFSIZE=1024
+
+vllm serve $MODEL_PATH \
+--headless \
+--data-parallel-size 2 \
+--data-parallel-size-local 1 \
+--data-parallel-start-rank 1 \
+--data-parallel-address $leader_ip \
+--data-parallel-rpc-port 13389 \
+--tensor-parallel-size 16 \
+--seed 1024 \
+--quantization ascend \
+--max-num-seqs 16 \
+--max-model-len 32768 \
+--max-num-batched-tokens 4096 \
+--enable-expert-parallel \
+--trust-remote-code \
+--gpu-memory-utilization 0.92 \
+--additional-config '{"torchair_graph_config":{"enabled":true}}'
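Together the two scripts form a single data-parallel group: each node hosts one DP rank (--data-parallel-size 2, --data-parallel-size-local 1) spread across 16 NPUs of tensor parallelism, with the worker joining the leader's RPC port in --headless mode. Once both ranks are up, a quick smoke test against the leader's OpenAI-compatible endpoint could look like the sketch below (the service name matches lws.yaml further down; the prompt is illustrative):

    curl -s http://vllm-leader:8080/v1/models
    curl -s http://vllm-leader:8080/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "/root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8", "prompt": "Hello", "max_tokens": 8}'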
diff --git a/tests/e2e/multi_nodes/multi_node_mp/lws.yaml b/tests/e2e/multi_nodes/multi_node_mp/lws.yaml
new file mode 100644
index 0000000000..0922507b84
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_mp/lws.yaml
@@ -0,0 +1,118 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: vllm
+  namespace: vllm-project
+spec:
+  replicas: 1
+  leaderWorkerTemplate:
+    size: 2
+    restartPolicy: RecreateGroupOnPodRestart
+    leaderTemplate:
+      metadata:
+        labels:
+          role: leader
+      spec:
+        containers:
+          - name: vllm-leader
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              - name: WORKSPACE
+                value: /root/workspace
+              - name: MODEL_PATH
+                value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
+            command:
+              - sh
+              - -c
+              - "bash /root/.cache/tests/multi_node_mp/launch_server_leader.sh"
+            resources:
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            ports:
+              - containerPort: 8080
+            # readinessProbe:
+            #   tcpSocket:
+            #     port: 8080
+            #   initialDelaySeconds: 15
+            #   periodSeconds: 10
+            volumeMounts:
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
+          - name: shared-volume
+            persistentVolumeClaim:
+              claimName: nv-action-vllm-benchmarks-v2
+          - name: driver-tools
+            hostPath:
+              path: /usr/local/Ascend/driver/tools
+    workerTemplate:
+      spec:
+        containers:
+          - name: vllm-worker
+            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
+            env:
+              - name: WORKSPACE
+                value: /root/workspace
+              - name: MODEL_PATH
+                value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
+            command:
+              - sh
+              - -c
+              - "bash /root/.cache/tests/multi_node_mp/launch_server_worker.sh"
+            resources:
+              limits:
+                huawei.com/ascend-1980: "16"
+                memory: 512Gi
+                ephemeral-storage: 100Gi
+              requests:
+                huawei.com/ascend-1980: "16"
+                ephemeral-storage: 100Gi
+                cpu: 125
+            volumeMounts:
+              - mountPath: /root/.cache
+                name: shared-volume
+              - mountPath: /usr/local/Ascend/driver/tools
+                name: driver-tools
+              - mountPath: /dev/shm
+                name: dshm
+        volumes:
+          - name: dshm
+            emptyDir:
+              medium: Memory
+              sizeLimit: 15Gi
+          - name: shared-volume
+            persistentVolumeClaim:
+              claimName: nv-action-vllm-benchmarks-v2
+          - name: driver-tools
+            hostPath:
+              path: /usr/local/Ascend/driver/tools
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-leader
+  namespace: vllm-project
+spec:
+  ports:
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    leaderworkerset.sigs.k8s.io/name: vllm
+    role: leader
+  type: ClusterIP
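The manifest creates a two-pod LeaderWorkerSet plus a ClusterIP service in the vllm-project namespace. For debugging outside the CI job, a few commands along these lines are usually enough (assuming the LeaderWorkerSet CRD is installed and kubectl has access to the cluster):

    kubectl get leaderworkerset vllm -n vllm-project
    kubectl get pods -n vllm-project -o wide
    # reach the leader service from a local machine
    kubectl port-forward svc/vllm-leader 8080:8080 -n vllm-project
    curl -s http://127.0.0.1:8080/health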
diff --git a/tests/e2e/multi_nodes/multi_node_ray/launch_server.sh b/tests/e2e/multi_nodes/multi_node_ray/launch_server.sh
new file mode 100644
index 0000000000..e8ad8d3de5
--- /dev/null
+++ b/tests/e2e/multi_nodes/multi_node_ray/launch_server.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+#
+# Helper script to manually start or join a Ray cluster for online serving of vLLM models.
+# This script is first executed on the head node, and then on each worker node with the IP address
+# of the head node.
+#
+# Subcommands:
+#   leader: Launches a Ray head node and blocks until the cluster reaches the expected size (head + workers).
+#   worker: Starts a worker node that connects to an existing Ray head node.
+#
+# Example usage:
+#   On the head node machine, start the Ray head node process and run a vLLM server.
+#   ./launch_server.sh leader --ray_port=6379 --ray_cluster_size=<cluster_size> [<additional ray start args>] && \
+#   python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2
+#
+#   On each worker node, start the Ray worker node process.
+#   ./launch_server.sh worker --ray_address=<head_node_ip> --ray_port=6379 [<additional ray start args>]
+#
+# About Ray:
+#   Ray is an open-source distributed execution framework that simplifies
+#   distributed computing. Learn more:
+#   https://ray.io/
+
+
+subcommand=$1  # Either "leader" or "worker".
+shift          # Remove the subcommand from the argument list.
+
+ray_port=6379            # Port used by the Ray head node.
+ray_init_timeout=300     # Seconds to wait before timing out.
+declare -a start_params  # Parameters forwarded to the underlying 'ray start' command.
+
+# Handle the worker subcommand.
+case "$subcommand" in
+  worker)
+    ray_address=""
+    while [ $# -gt 0 ]; do
+      case "$1" in
+        --ray_address=*)
+          ray_address="${1#*=}"
+          ;;
+        --ray_port=*)
+          ray_port="${1#*=}"
+          ;;
+        --ray_init_timeout=*)
+          ray_init_timeout="${1#*=}"
+          ;;
+        *)
+          start_params+=("$1")
+      esac
+      shift
+    done
+
+    if [ -z "$ray_address" ]; then
+      echo "Error: Missing argument --ray_address"
+      exit 1
+    fi
+
+    # Retry until the worker node connects to the head node or the timeout expires.
+    for (( i=0; i < $ray_init_timeout; i+=5 )); do
+      ray start --address=$ray_address:$ray_port --block "${start_params[@]}"
+      if [ $? -eq 0 ]; then
+        echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
+        exit 0
+      fi
+      echo "Waiting until the ray worker is active..."
+      sleep 5s;
+    done
+    echo "Ray worker start timed out, head address: $ray_address:$ray_port"
+    exit 1
+    ;;
+
+  # Handle the leader subcommand.
+  leader)
+    ray_cluster_size=""
+    while [ $# -gt 0 ]; do
+      case "$1" in
+        --ray_port=*)
+          ray_port="${1#*=}"
+          ;;
+        --ray_cluster_size=*)
+          ray_cluster_size="${1#*=}"
+          ;;
+        --ray_init_timeout=*)
+          ray_init_timeout="${1#*=}"
+          ;;
+        *)
+          start_params+=("$1")
+      esac
+      shift
+    done
+
+    if [ -z "$ray_cluster_size" ]; then
+      echo "Error: Missing argument --ray_cluster_size"
+      exit 1
+    fi
+
+    # Start the Ray head node.
+    ray start --head --port=$ray_port "${start_params[@]}"
+
+    # Poll Ray until every worker node is active.
+    for (( i=0; i < $ray_init_timeout; i+=5 )); do
+      active_nodes=`python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))'`
+      if [ $active_nodes -eq $ray_cluster_size ]; then
+        echo "All ray workers are active and the ray cluster is initialized successfully."
+        exit 0
+      fi
+      echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size are active"
+      sleep 5s;
+    done
+
+    echo "Waiting for all ray workers to be active timed out."
+    exit 1
+    ;;
+
+  *)
+    echo "unknown subcommand: $subcommand"
+    exit 1
+    ;;
+esac
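For the Ray variant, the script is meant to be run once per node before vLLM is started with the Ray executor. A two-node sketch, with placeholder addresses and parallel sizes that are not taken from this patch, might look like:

    # on the head node: start Ray, wait for 2 nodes, then serve across both
    bash tests/e2e/multi_nodes/multi_node_ray/launch_server.sh leader --ray_cluster_size=2
    vllm serve $MODEL_PATH --distributed-executor-backend ray \
        --tensor-parallel-size 16 --pipeline-parallel-size 2 --port 8080

    # on each worker node: join the existing Ray cluster
    bash tests/e2e/multi_nodes/multi_node_ray/launch_server.sh worker --ray_address=<head_node_ip>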