Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
5a1d2f4
upgrade cuda to 12.9, add UCCL_EFA_DEVICES and UCCL_ENA_DEVICES
Oct 16, 2025
36e6582
Merge branch 'uccl-project:main' into main
whn09 Oct 22, 2025
b64c6a6
Merge branch 'uccl-project:main' into main
whn09 Oct 27, 2025
c84c7ca
Merge branch 'uccl-project:main' into main
whn09 Oct 28, 2025
534dff3
fix install_deps.sh, add libnuma-dev
Oct 28, 2025
3664e8c
add deep_ep_wrapper
Oct 28, 2025
68eaecb
add initialize_uccl
Oct 28, 2025
4543a85
change LOCAL_RANK/WORLD_SIZE
Oct 28, 2025
03de382
add IP discovery
Oct 28, 2025
cc40960
change sleep time to 10
Oct 28, 2025
9471ba1
change Buffer.__init__ signature
Oct 28, 2025
88b5178
fix get_local_rdma_ipc_handle bug
Oct 28, 2025
cbd5ee5
move initialize_uccl to buffer.py
Oct 28, 2025
296c85e
try to fix: Posting rdma to a different rank
Oct 28, 2025
ffcb3c0
try to fix: Posting rdma to a different rank
Oct 28, 2025
0bcd158
add debug info
Oct 28, 2025
6afe7f7
add debug info
Oct 28, 2025
facf61f
add debug info
Oct 28, 2025
1997570
add debug info
Oct 28, 2025
7431889
add debug info
Oct 28, 2025
9ea209a
Merge branch 'uccl-project:main' into main
whn09 Oct 28, 2025
75e5620
simplify buffer.py
Oct 29, 2025
744a178
Ref https://github.com/uccl-project/uccl/pull/491/
Oct 29, 2025
a670f7d
add sudo to install_deps.sh
Oct 29, 2025
5c5fc11
remove duplicated _cfg
Oct 29, 2025
e9f1570
change scratch to scratch_ptr in initialize_uccl
Oct 29, 2025
6497afa
remove set_rdma_buffer_raw
Oct 29, 2025
578a226
set rdma_buffer_ptr=scratch.data_ptr()
Oct 29, 2025
3ff012b
Merge branch 'uccl-project:main' into main
whn09 Oct 29, 2025
76de328
add use_normal_mode
Oct 29, 2025
4ffebe5
fix __init__
Oct 29, 2025
11d4237
change to time.sleep(3)
Oct 29, 2025
1a6dd33
update deep_ep_wrapper/deep_ep/test_internode.py
Oct 29, 2025
4844475
Merge branch 'uccl-project:main' into main
whn09 Oct 31, 2025
2dc8b28
Merge branch 'uccl-project:main' into main
whn09 Nov 3, 2025
b1ad10e
Merge branch 'uccl-project:main' into main
whn09 Nov 5, 2025
bbf3be2
Merge branch 'uccl-project:main' into main
whn09 Nov 11, 2025
5297e45
Merge branch 'uccl-project:main' into main
whn09 Nov 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ build_efa() {

# EFA requires a custom NCCL.
cd thirdparty/nccl-sg
make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"
# make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"
make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90"
cd ../..

echo "[container] Copying EFA .so to uccl/lib/"
Expand Down
19 changes: 13 additions & 6 deletions collective/efa/run_p5en.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Set to 1 to disable NVLink; 0 leaves NVLink enabled.
NV_LINK_DISABLE=0
MULTI_GROUP=0
NIC=10.1.0.0/16
# NIC=10.1.0.0/16
NIC=172.31.0.0/16
# Processes/Ranks/GPUs per node.
PROCS_PER_NODE=8

TEST=${1:-srd}
NUM_PROCS=${2:-32}
# NUM_PROCS=${2:-32}
NUM_PROCS=${2:-16}
PROG_NAME=${3:-0}

# all_gather_perf all_reduce_perf alltoall_perf broadcast_perf gather_perf
Expand All @@ -32,7 +34,8 @@ else
exit 1
fi

CHANNELS=32
# CHANNELS=32
CHANNELS=16
CHANNELS_NET_PEER=1

# UCCL optimal parameters. Yang: for allreduce with NVLink, we need to use a larger buffer to keep up with NCCL when it uses larger buffers, and to avoid outliers.
Expand All @@ -55,7 +58,7 @@ if [ "$TEST" = "srd" ]; then
>"nccl_test_outputs/output_rank_$rank.log"
done

LIBNCCL_PATH="${UCCL_HOME}/thirdparty/nccl/build/lib/libnccl.so"
LIBNCCL_PATH="${UCCL_HOME}/thirdparty/nccl-sg/build/lib/libnccl.so"
PLUGIN_PATH="/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-net.so"

mpirun --bind-to none -np ${NUM_PROCS} -N ${PROCS_PER_NODE} --hostfile $NODEFILE --map-by ppr:8:node \
Expand All @@ -75,7 +78,9 @@ if [ "$TEST" = "srd" ]; then
-x NCCL_NCHANNELS_PER_NET_PEER=${CHANNELS_NET_PEER} \
-x NCCL_P2P_NET_CHUNKSIZE=${CHUNK_SIZE} \
-x NCCL_BUFFSIZE=${BUFFSIZE} \
${UCCL_HOME}/thirdparty/nccl-tests/build/${PROG_NAME} \
-x UCCL_EFA_DEVICES=rdmap110s0,rdmap112s0,rdmap135s0,rdmap137s0,rdmap160s0,rdmap162s0,rdmap85s0,rdmap87s0,rdmap111s0,rdmap113s0,rdmap136s0,rdmap138s0,rdmap161s0,rdmap163s0,rdmap86s0,rdmap88s0 \
-x UCCL_ENA_DEVICES=enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0 \
/usr/local/cuda-12.9/efa/test-cuda-12.9/${PROG_NAME} \
-b 1K -e 1G -f 2 -c 1 -w 5 -n 10 -t 1 -g 1 \
2>&1 | while read -r line; do
if [[ "$line" =~ ^\[[0-9]+,([0-9]+)\](.+) ]]; then
Expand Down Expand Up @@ -129,7 +134,9 @@ elif [ "$TEST" = "ud" ]; then
-x NCCL_TOPO_FILE=${UCCL_HOME}/collective/efa/p4d-24xl-topo.xml \
-x NCCL_PXN_DISABLE=1 \
-x UCCL_ENGINE_QUIET=1 \
${UCCL_HOME}/thirdparty/nccl-tests/build/${PROG_NAME} \
-x UCCL_EFA_DEVICES=rdmap110s0,rdmap112s0,rdmap135s0,rdmap137s0,rdmap160s0,rdmap162s0,rdmap85s0,rdmap87s0,rdmap111s0,rdmap113s0,rdmap136s0,rdmap138s0,rdmap161s0,rdmap163s0,rdmap86s0,rdmap88s0 \
-x UCCL_ENA_DEVICES=enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0,enp71s0 \
/usr/local/cuda-12.9/efa/test-cuda-12.9/${PROG_NAME} \
-b 1K -e 1G -f 2 -c 1 -w 5 -n 10 -t 1 -g 1 \
2>&1 | while read -r line; do
if [[ "$line" =~ ^\[[0-9]+,([0-9]+)\](.+) ]]; then
Expand Down
7 changes: 5 additions & 2 deletions docker/Dockerfile.cuda
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
ARG BASE_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu22.04
# ARG BASE_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu22.04
# FROM ${BASE_IMAGE}
# ARG PY_VER=3.13
ARG BASE_IMAGE=nvidia/cuda:12.9.0-devel-ubuntu22.04
FROM ${BASE_IMAGE}
ARG PY_VER=3.13
ARG PY_VER=3.12

# Non-interactive APT
ENV DEBIAN_FRONTEND=noninteractive
Expand Down
10 changes: 7 additions & 3 deletions docker/Dockerfile.efa
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
ARG BASE_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu22.04
# ARG BASE_IMAGE=nvidia/cuda:12.3.2-devel-ubuntu22.04
# FROM ${BASE_IMAGE}
# ARG PY_VER=3.13
ARG BASE_IMAGE=nvidia/cuda:12.9.0-devel-ubuntu22.04
FROM ${BASE_IMAGE}
ARG PY_VER=3.13
ARG PY_VER=3.12

# Non-interactive APT
ENV DEBIAN_FRONTEND=noninteractive
Expand Down Expand Up @@ -44,7 +47,8 @@ RUN ln -s /usr/lib/x86_64-linux-gnu/libevent_core-2.1.so.7 /usr/lib/x86_64-linux
ln -s /usr/lib/x86_64-linux-gnu/libhwloc.so.15 /usr/lib/x86_64-linux-gnu/libhwloc15.so

# Install EFA installer (without kernel driver)
ARG EFA_VER=1.42.0
# ARG EFA_VER=1.42.0
ARG EFA_VER=1.43.2
RUN curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_VER}.tar.gz && \
tar -xf aws-efa-installer-${EFA_VER}.tar.gz && \
cd aws-efa-installer && \
Expand Down
8 changes: 8 additions & 0 deletions ep/deep_ep_wrapper/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
## DeepEP Wrapper of UCCL-EP

```
cp ../bench/buffer.py ./ # Then, in the copied buffer.py, change `utils` to `deep_ep.utils`
cp ../bench/utils.py ./

python setup.py install
```
15 changes: 15 additions & 0 deletions ep/deep_ep_wrapper/deep_ep/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from uccl.ep import Config, EventHandle

from .utils import EventOverlap, check_nvlink_connections, initialize_uccl, destroy_uccl
from .buffer import Buffer
import torch.distributed as dist

# Public API of the deep_ep wrapper package, i.e. the names exported by
# `from deep_ep import *`. Config and EventHandle are re-exported from
# uccl.ep; the remaining names come from this package's own utils and
# buffer modules.
__all__ = [
'Config',
'EventHandle',
'Buffer',
'EventOverlap',
'check_nvlink_connections',
'initialize_uccl',
'destroy_uccl',
]
Loading