[CI] Add multi_node CI #2749 (base: main)
.github/workflows/vllm_ascend_multi_nodes_test.yaml (new file, 118 lines; a CI annotation flagged a check failure on line 25, the runs-on label):
```yaml
name: 'e2e test / multi-dp'

on:
  workflow_dispatch:

# Bash shells do not read ~/.profile or ~/.bashrc, so steps that need the
# ascend-toolkit environment variables must be explicitly declared with
# "shell: bash -el {0}" to activate them.
defaults:
  run:
    shell: bash -el {0}

# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  e2e:
    strategy:
      max-parallel: 2
      matrix:
        vllm_version: [main]
    runs-on: linux-aarch64-a3-0
    container:
      image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
    env:
      KUBECONFIG: /root/.cache/.kube/kubeconfig.yaml
      KUBECTLPATH: /root/.cache/.kube/kubectl
      NAMESPACE: vllm-project
      MODEL_PATH: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
    steps:
      - name: Install system dependencies
        run: |
          # configure apt and pip source
          sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
          pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple

          apt-get update -y && apt-get install -y git curl
          TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
          git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"

      - name: Install kubectl
        run: |
          install -o root -g root -m 0755 $KUBECTLPATH /usr/local/bin/kubectl
          kubectl version

      - name: Checkout code
        uses: actions/checkout@v4

      - name: Prepare scripts
        run: |
          rm -rf /root/.cache/tests
          mkdir -p /root/.cache/tests
          cp -r tests/e2e/multi_nodes/* /root/.cache/tests/

      - name: Launch cluster
        run: |
          kubectl apply -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml

      - name: Checkout vllm-project/vllm
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: ./vllm-empty
          ref: ${{ matrix.vllm_version }}

      - name: Install vllm
        working-directory: vllm-empty
        run: |
          pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
          VLLM_TARGET_DEVICE=empty pip install .

      - name: Install benchmark
        run: |
          pip install -r benchmarks/requirements-bench.txt

      - name: Wait for all pods ready
        run: |
          timeout 7200 bash -c '
            while true; do
              # Check if the leader pod is healthy
              pod=$(kubectl get pod -l role=leader -n vllm-project -o jsonpath="{.items[0].metadata.name}")
              status=$(kubectl get pod "$pod" -n vllm-project -o jsonpath="{.status.phase}")

              # If the pod failed, print the error logs and exit
              if [[ "$status" == "Failed" ]] || [[ "$status" == "CrashLoopBackOff" ]]; then
                echo "❌ Pod $pod failed with status=$status"
                echo "---- Pod logs start ----"
                kubectl logs "$pod" -n vllm-project --previous || kubectl logs "$pod" -n vllm-project
                echo "---- Pod logs end ----"
                exit 1
              fi

              # Check if the service is ready
              if curl -sf http://vllm-leader:8080/health > /dev/null; then
                echo "✅ vllm cluster is ready (pod=$pod)"
                exit 0
              fi

              echo "⏳ Waiting for vllm server to start... (pod=$pod, status=$status)"
              sleep 5
            done
          '

      - name: Run benchmark
        run: |
          python ./vllm-empty/benchmarks/benchmark_serving.py --model $MODEL_PATH \
            --dataset-name random --random-input-len 128 --random-output-len 128 \
            --num-prompts 200 --trust-remote-code --base-url "http://vllm-leader:8080" \
            --request-rate 1

      - name: Post process
        if: always()
        run: |
          kubectl get pods -n $NAMESPACE
          # Delete with the same manifest that was applied in "Launch cluster".
          kubectl delete -f tests/e2e/multi_nodes/multi_node_mp/lws.yaml
```
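The readiness step above hand-rolls both pod-status and endpoint polling. A minimal alternative sketch, assuming the same role=leader label, namespace, and Service names, that lets kubectl wait handle the pod phase and keeps the HTTP poll for model-load readiness:

```bash
# Fail fast if the leader pod never becomes Ready (same 7200s budget).
kubectl wait pod -l role=leader -n vllm-project \
  --for=condition=Ready --timeout=7200s

# Ready only covers the container here (the readinessProbe in lws.yaml is
# commented out), so keep polling /health until the model has loaded.
until curl -sf http://vllm-leader:8080/health > /dev/null; do
  echo "Waiting for vllm server..."
  sleep 5
done
echo "vllm cluster is ready"
```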
Benchmark requirements (the workflow's Install benchmark step installs benchmarks/requirements-bench.txt; the diff pins versions and adds dependencies). Updated contents:

```text
pillow
numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1
datasets
modelscope
tabulate
```
installer.sh (new file, 62 lines; both launch scripts invoke it as $SCRIPT_DIR/installer.sh, which resolves to tests/e2e/multi_nodes/installer.sh):
```bash
#!/bin/bash
set -euo pipefail

export SRC_DIR="$WORKSPACE/source_code"

check_npu_info() {
    echo "====> Check NPU info"
    npu-smi info
    cat "/usr/local/Ascend/ascend-toolkit/latest/$(uname -i)-linux/ascend_toolkit_install.info"
}

check_and_config() {
    echo "====> Configure mirrors and git proxy"
    pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi

    git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/"
}

checkout_src() {
    echo "====> Checkout source code"
    mkdir -p "$SRC_DIR"

    # vllm-ascend
    if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
        git clone --depth 1 https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
    fi

    # vllm
    if [ ! -d "$SRC_DIR/vllm" ]; then
        git clone -b v0.10.1.1 https://github.com/vllm-project/vllm.git "$SRC_DIR/vllm"
    fi
}

install_sys_dependencies() {
    echo "====> Install system dependencies"
    apt-get update -y

    DEP_LIST=()
    while IFS= read -r line; do
        [[ -n "$line" && ! "$line" =~ ^# ]] && DEP_LIST+=("$line")
    done < "$SRC_DIR/vllm-ascend/packages.txt"

    apt-get install -y "${DEP_LIST[@]}" gcc g++ cmake libnuma-dev iproute2
}

install_vllm() {
    echo "====> Install vllm and vllm-ascend"
    VLLM_TARGET_DEVICE=empty pip install -e "$SRC_DIR/vllm"
    pip install -e "$SRC_DIR/vllm-ascend"
    pip install modelscope
}

main() {
    check_npu_info
    check_and_config
    checkout_src
    install_sys_dependencies
    install_vllm
}

main "$@"
```
tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh (new file, 38 lines):
```bash
#!/bin/bash

# local_ip is the node's first address from hostname -I; nic_name is the
# network interface name corresponding to local_ip (obtained via ifconfig).
local_ip=$(hostname -I | awk '{print $1}')
nic_name=eth0
```
Reviewer comment: the network interface name nic_name is hardcoded to eth0, which is not guaranteed to be the interface that actually carries local_ip on every node; it should be looked up at runtime instead.
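A minimal sketch of that lookup (an assumption of what the elided suggestion contained; it relies on iproute2, which installer.sh installs):

```bash
# Derive the NIC that owns local_ip instead of hardcoding eth0.
nic_name=$(ip -o -4 addr show | awk -v ip="$local_ip" 'index($4, ip"/") == 1 {print $2; exit}')
```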
```bash
export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export HCCL_BUFFSIZE=1024

SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
bash $SCRIPT_DIR/installer.sh

vllm serve $MODEL_PATH \
  --host 0.0.0.0 \
  --port 8080 \
  --data-parallel-size 2 \
  --data-parallel-size-local 1 \
  --data-parallel-address $local_ip \
  --data-parallel-rpc-port 13389 \
  --tensor-parallel-size 16 \
  --seed 1024 \
  --enable-expert-parallel \
  --max-num-seqs 16 \
  --max-model-len 32768 \
  --quantization ascend \
  --max-num-batched-tokens 4096 \
  --trust-remote-code \
  --gpu-memory-utilization 0.9 \
  --additional-config '{"torchair_graph_config":{"enabled":true}}'
```
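With --data-parallel-size 2, --data-parallel-size-local 1, and --tensor-parallel-size 16, the deployment spans 2 × 16 = 32 NPUs, one DP rank of 16 devices per pod, matching the 16 huawei.com/ascend-1980 devices each pod requests in lws.yaml. Once the leader is serving, a quick in-cluster smoke test (/health is the route the workflow polls; /v1/models is vLLM's OpenAI-compatible model listing):

```bash
# Run from any pod in the namespace; vllm-leader is the Service in lws.yaml.
curl -sf http://vllm-leader:8080/health && echo "server up"
curl -s http://vllm-leader:8080/v1/models
```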
tests/e2e/multi_nodes/multi_node_mp/launch_server_worker.sh (new file, 36 lines):
```bash
#!/bin/bash

local_ip=$(hostname -I | awk '{print $1}')
nic_name=eth0
```
Reviewer comment: the network interface name is hardcoded here as well; the same runtime lookup suggested for launch_server_leader.sh applies.
```bash
SCRIPT_DIR=$(cd "$(dirname "$0")/.." && pwd)
bash $SCRIPT_DIR/installer.sh
leader_ip=$(getent hosts $LWS_LEADER_ADDRESS | awk '{print $1}')

export VLLM_USE_MODELSCOPE=True
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024

vllm serve $MODEL_PATH \
  --headless \
  --data-parallel-size 2 \
  --data-parallel-size-local 1 \
  --data-parallel-start-rank 1 \
  --data-parallel-address $leader_ip \
  --data-parallel-rpc-port 13389 \
  --tensor-parallel-size 16 \
  --seed 1024 \
  --quantization ascend \
  --max-num-seqs 16 \
  --max-model-len 32768 \
  --max-num-batched-tokens 4096 \
  --enable-expert-parallel \
  --trust-remote-code \
  --gpu-memory-utilization 0.92 \
  --additional-config '{"torchair_graph_config":{"enabled":true}}'
```
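The --data-parallel-start-rank 1 flag only holds for a group of size 2 with one DP rank per pod. A hedged sketch of generalizing it, assuming the LWS_WORKER_INDEX variable that LeaderWorkerSet injects alongside LWS_LEADER_ADDRESS (0 for the leader, 1..size-1 for workers):

```bash
# Sketch: derive the DP start rank from the pod's position in the group
# rather than hardcoding 1. LWS_WORKER_INDEX is assumed to be injected by
# LeaderWorkerSet, like the LWS_LEADER_ADDRESS variable used above.
start_rank=${LWS_WORKER_INDEX:?expected to run under LeaderWorkerSet}
# ...then pass:  --data-parallel-start-rank "$start_rank"
```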
tests/e2e/multi_nodes/multi_node_mp/lws.yaml (new file, 118 lines):
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: vllm
  namespace: vllm-project
spec:
  replicas: 1
  leaderWorkerTemplate:
    size: 2
    restartPolicy: RecreateGroupOnPodRestart
    leaderTemplate:
      metadata:
        labels:
          role: leader
      spec:
        containers:
          - name: vllm-leader
            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
            env:
              - name: WORKSPACE
                value: /root/workspace
              - name: MODEL_PATH
                value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
            command:
              - sh
              - -c
              - "bash /root/.cache/tests/multi_node_mp/launch_server_leader.sh"
```
Reviewer comment (on the command block above, lines +24 to +27): the container's command executes the script from /root/.cache/tests/, a cache directory populated by the workflow's Prepare scripts step, rather than from the checked-out source tree. Suggested change:

```yaml
command:
  - sh
  - -c
  - "bash $WORKSPACE/tests/e2e/multi_nodes/multi_node_mp/launch_server_leader.sh"
```
```yaml
            resources:
              limits:
                huawei.com/ascend-1980: "16"
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: "16"
                ephemeral-storage: 100Gi
                cpu: 125
            ports:
              - containerPort: 8080
            # readinessProbe:
            #   tcpSocket:
            #     port: 8080
            #   initialDelaySeconds: 15
            #   periodSeconds: 10
```
Reviewer comment (on the commented-out readinessProbe above, lines +39 to +43): with the probe disabled, Kubernetes marks the pod Ready before the server is actually listening. Suggested change:

```yaml
readinessProbe:
  tcpSocket:
    port: 8080
  initialDelaySeconds: 60
  periodSeconds: 10
  failureThreshold: 30
```
```yaml
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: nv-action-vllm-benchmarks-v2
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
    workerTemplate:
      spec:
        containers:
          - name: vllm-worker
            image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
            env:
              - name: WORKSPACE
                value: /root/workspace
              - name: MODEL_PATH
                value: /root/.cache/modelscope/hub/models/vllm-ascend/DeepSeek-V3-W8A8
            command:
              - sh
              - -c
              - "bash /root/.cache/tests/multi_node_mp/launch_server_worker.sh"
            resources:
              limits:
                huawei.com/ascend-1980: "16"
                memory: 512Gi
                ephemeral-storage: 100Gi
              requests:
                huawei.com/ascend-1980: "16"
                ephemeral-storage: 100Gi
                cpu: 125
            volumeMounts:
              - mountPath: /root/.cache
                name: shared-volume
              - mountPath: /usr/local/Ascend/driver/tools
                name: driver-tools
              - mountPath: /dev/shm
                name: dshm
        volumes:
          - name: dshm
            emptyDir:
              medium: Memory
              sizeLimit: 15Gi
          - name: shared-volume
            persistentVolumeClaim:
              claimName: nv-action-vllm-benchmarks-v2
          - name: driver-tools
            hostPath:
              path: /usr/local/Ascend/driver/tools
---
apiVersion: v1
kind: Service
metadata:
  name: vllm-leader
  namespace: vllm-project
spec:
  ports:
    - name: http
      port: 8080
      protocol: TCP
      targetPort: 8080
  selector:
    leaderworkerset.sigs.k8s.io/name: vllm
    role: leader
  type: ClusterIP
```
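For debugging the ClusterIP Service from outside the cluster, a quick port-forward sketch using the names defined above:

```bash
# Forward the vllm-leader Service to localhost and probe the health endpoint.
kubectl port-forward -n vllm-project svc/vllm-leader 8080:8080 &
PF_PID=$!
sleep 2
curl -sf http://127.0.0.1:8080/health && echo "healthy"
kill "$PF_PID"
```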
Reviewer comment (on checkout_src in installer.sh): the function clones vllm-ascend from its main repository. When running in a CI environment for a pull request, this ignores the changes from the PR and tests against the main branch instead. The CI system should be responsible for checking out the correct version of the code, and these scripts should use that version; as written, this defeats the purpose of running CI on pull requests.
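One way to honor the PR's revision, sketched with an illustrative variable (VLLM_ASCEND_REF is not part of the PR; the workflow would export it, for example from the pull request's head SHA):

```bash
# In checkout_src: clone full history and check out a caller-provided ref,
# falling back to main. No --depth 1, since an arbitrary SHA may be absent
# from a shallow clone.
if [ ! -d "$SRC_DIR/vllm-ascend" ]; then
    git clone https://github.com/vllm-project/vllm-ascend.git "$SRC_DIR/vllm-ascend"
    git -C "$SRC_DIR/vllm-ascend" checkout "${VLLM_ASCEND_REF:-main}"
fi
```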