diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..fe900bbf2
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,39 @@
+default_stages: [pre-commit, pre-push, manual]
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-symlinks
+      - id: destroyed-symlinks
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+        args: [--allow-multiple-documents]
+      # - id: check-toml
+      - id: check-ast
+      - id: check-added-large-files
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: detect-private-key
+      - id: debug-statements
+      - id: no-commit-to-branch
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        args:
+          - "--profile"
+          - "black"
+          - "--filter-files"
+  - repo: https://github.com/psf/black
+    rev: 24.10.0
+    hooks:
+      - id: black-jupyter
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.8.1
+    hooks:
+      - id: nbstripout
+        args:
+          - '--keep-output'
+          - '--extra-keys=metadata.kernelspec metadata.language_info.version'
diff --git a/build_and_install.sh b/build_and_install.sh
index bba4595d8..d0320c45a 100755
--- a/build_and_install.sh
+++ b/build_and_install.sh
@@ -8,13 +8,34 @@ ROCM_IDX_URL=${4:-https://rocm.prereleases.amd.com/whl/gfx94X-dcgpu}
 # The default for THEROCK_BASE_IMAGE is current, but may change. Make sure to track TheRock's dockerfile.
 THEROCK_BASE_IMAGE=${5:-quay.io/pypa/manylinux_2_28_x86_64@sha256:d632b5e68ab39e59e128dcf0e59e438b26f122d7f2d45f3eea69ffd2877ab017}
 
+echo "TARGET : $TARGET"
+
 if [[ $TARGET != cuda* && $TARGET != rocm* && $TARGET != "therock" ]]; then
     echo "Usage: $0 [cuda|rocm|therock] [all|rdma|p2p|efa|ep] [py_version] [rocm_index_url] [therock_base_image]" >&2
     exit 1
 fi
 
 ARCH_SUFFIX=$(uname -m)
-./build.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE
+
+echo "ARCH_SUFFIX : $ARCH_SUFFIX"
+
+# Detect whether we are already running inside a container
+is_docker_container() {
+    [ -f /.dockerenv ] || grep -q docker /proc/1/cgroup 2>/dev/null
+}
+
+# Check if the docker command is available and working
+has_docker_command() {
+    command -v docker &> /dev/null && docker info &> /dev/null
+}
+
+if is_docker_container || ! has_docker_command; then
+    echo "Already inside a container (or Docker unavailable): ./build_insider_docker.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE"
+    ./build_insider_docker.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE
+else
+    echo "Running the build inside Docker"
+    ./build.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE
+fi
+
 pip install -r requirements.txt
 pip uninstall uccl -y || true
 if [[ $TARGET != "therock" ]]; then
diff --git a/build_insider_docker.sh b/build_insider_docker.sh
new file mode 100755
index 000000000..3147bea12
--- /dev/null
+++ b/build_insider_docker.sh
@@ -0,0 +1,243 @@
+#!/bin/bash
+set -e
+
+# -----------------------
+# Build uccl wheels for CUDA (NVIDIA) and ROCm (AMD) backends/targets.
+# Unlike build.sh, this script does not launch Docker itself: it is meant to
+# run inside an existing container, or on a host that already has the
+# CUDA/ROCm toolchain installed.
+#
+# Usage:
+#   ./build_insider_docker.sh [cuda|rocm|therock] [all|rdma|p2p|efa|ep] [py_version] [rocm_index_url] [therock_base_image]
+#
+# The wheels are written to wheelhouse-[cuda|rocm|therock]
+# -----------------------
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+TARGET=${1:-cuda}
+BUILD_TYPE=${2:-all}
+PY_VER=${3:-$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")}
+ARCH="$(uname -m)"
+# The default for ROCM_IDX_URL depends on the gfx architecture of your GPU, and the index URLs may change.
+ROCM_IDX_URL=${4:-https://rocm.prereleases.amd.com/whl/gfx94X-dcgpu}
+# The default for THEROCK_BASE_IMAGE is current, but may change. Make sure to track TheRock's dockerfile.
+THEROCK_BASE_IMAGE=${5:-quay.io/pypa/manylinux_2_28_x86_64@sha256:d632b5e68ab39e59e128dcf0e59e438b26f122d7f2d45f3eea69ffd2877ab017}
+IS_EFA=$( { [ -d "/sys/class/infiniband/" ] && ls /sys/class/infiniband/ 2>/dev/null | grep -q rdmap && echo "true"; } || echo "false" ); echo "EFA support: $IS_EFA"
+
+if [[ $TARGET != cuda* && $TARGET != rocm* && $TARGET != "therock" ]]; then
+    echo "Usage: $0 [cuda|rocm|therock] [all|rdma|p2p|efa|ep|eccl] [py_version] [rocm_index_url]" >&2
+    exit 1
+fi
+
+if [[ $ARCH == "aarch64" && ( $TARGET == rocm* || $TARGET == "therock" ) ]]; then
+    echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
+    exit 1
+fi
+
+rm -r uccl.egg-info >/dev/null 2>&1 || true
+rm -r dist >/dev/null 2>&1 || true
+rm -r build >/dev/null 2>&1 || true
+WHEEL_DIR="wheelhouse-${TARGET}"
+rm -r "${WHEEL_DIR}" >/dev/null 2>&1 || true
+mkdir -p "${WHEEL_DIR}"
+
+build_rccl_nccl_h() {
+    # Unlike CUDA, ROCm does not ship nccl.h, so we build rccl just to generate it.
+    if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then
+        cd thirdparty/rccl
+        # Just to get nccl.h, not the whole library
+        CXX=/opt/rocm/bin/hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF >/dev/null 2>&1 || true
+        cd ../..
+    fi
+}
+
+build_rdma() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_rdma Target: $TARGET"
+
+    if [[ "$TARGET" == cuda* ]]; then
+        cd collective/rdma && make clean && make -j$(nproc) && cd ../../
+        TARGET_SO=collective/rdma/libnccl-net-uccl.so
+    elif [[ "$TARGET" == rocm* ]]; then
+        if [[ "$ARCH" == "aarch64" ]]; then
+            echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
+            return
+        fi
+        cd collective/rdma && make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm && cd ../../
+        TARGET_SO=collective/rdma/librccl-net-uccl.so
+    elif [[ "$TARGET" == "therock" ]]; then
+        if [[ "$ARCH" == "aarch64" ]]; then
+            echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
+            return
+        fi
+        # Unlike CUDA, ROCm does not ship nccl.h, so we build rccl just to generate it.
+        if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then
+            cd thirdparty/rccl
+            # Just to get nccl.h, not the whole library
+            CXX=hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF -DCMAKE_PREFIX_PATH=$(rocm-sdk path --cmake) -DROCM_PATH=$(rocm-sdk path --root) -DHIP_PLATFORM=amd >/dev/null 2>&1 || true
+            cd ../..
+        fi
+        cd collective/rdma && make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib && cd ../../
+        TARGET_SO=collective/rdma/librccl-net-uccl.so
+    fi
+
+    echo "[container] Copying RDMA .so to uccl/lib/"
+    mkdir -p uccl/lib
+    cp ${TARGET_SO} uccl/lib/
+}
+
+build_efa() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_efa Target: $TARGET"
+
+    if [[ "$ARCH" == "aarch64" || "$TARGET" == rocm* || "$TARGET" == "therock" ]]; then
+        echo "Skipping EFA build on Arm64 (no EFA installer) or ROCm (no CUDA)."
+        return
+    fi
+    cd collective/efa && make clean && make -j$(nproc) && cd ../../
+
+    # EFA requires a custom NCCL.
+    cd thirdparty/nccl-sg
+    make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"
+    cd ../..
+
+    echo "[container] Copying EFA .so to uccl/lib/"
+    mkdir -p uccl/lib
+    cp collective/efa/libnccl-net-efa.so uccl/lib/
+    cp thirdparty/nccl-sg/build/lib/libnccl.so uccl/lib/libnccl-efa.so
+}
+
+build_p2p() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_p2p Target: $TARGET"
+
+    cd p2p
+    if [[ "$TARGET" == cuda* ]]; then
+        make clean && make -j$(nproc)
+    elif [[ "$TARGET" == rocm* ]]; then
+        make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm
+    elif [[ "$TARGET" == "therock" ]]; then
+        make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib
+    fi
+    cd ..
+
+    echo "[container] Copying P2P .so, collective.py and utils.py to uccl/"
+    mkdir -p uccl
+    mkdir -p uccl/lib
+    if [[ -z "${USE_TCPX:-}" || "$USE_TCPX" != "1" ]]; then
+        cp p2p/p2p.*.so uccl/
+        cp p2p/collective.py uccl/
+        cp p2p/transfer.py uccl/
+        cp p2p/utils.py uccl/
+    else
+        echo "[container] USE_TCPX=1, skipping copying p2p runtime files"
+    fi
+}
+
+build_ep() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_ep Target: $TARGET"
+
+    if [[ "$TARGET" == "therock" ]]; then
+        echo "Skipping GPU-driven build on therock (no GPU-driven support yet)."
+    elif [[ "$TARGET" == rocm* ]]; then
+        cd ep
+        python3 setup.py build
+        cd ..
+        echo "[container] Copying GPU-driven .so to uccl/"
+        mkdir -p uccl/lib
+        cp ep/build/**/*.so uccl/
+    elif [[ "$TARGET" == cuda* ]]; then
+        cd ep
+        make clean && make -j$(nproc) all
+        cd ..
+        echo "[container] Copying GPU-driven .so to uccl/"
+        mkdir -p uccl/lib
+        cp ep/*.so uccl/
+    fi
+}
+
+build_eccl() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_eccl Target: $TARGET"
+
+    cd eccl
+    if [[ "$TARGET" == cuda* ]]; then
+        echo "Skipping eccl build on CUDA."
+        cd .. && return
+    elif [[ "$TARGET" == rocm* ]]; then
+        make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm
+    fi
+    cd ..
+
+    echo "[container] Copying eccl .so to uccl/"
+    # mkdir -p uccl/lib
+    # cp eccl/eccl.*.so uccl/
+}
+
+# Build (the environment is expected to already contain the toolchain + CUDA/ROCm)
+echo "[2/3] Building..."
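+
+# NOTE: FUNCTION_DEF captures the build_* function bodies via `declare -f`,
+# presumably so that a child shell can re-create them with `eval "$FUNCTION_DEF"`;
+# the `eval` below does the same in the current shell, where it simply
+# re-defines functions that already exist.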
+
+export USE_TCPX="${USE_TCPX:-0}"
+export MAKE_NORMAL_MODE="${MAKE_NORMAL_MODE:-}"
+export FUNCTION_DEF="$(declare -f build_rccl_nccl_h build_rdma build_efa build_p2p build_ep build_eccl)"
+
+set -euo pipefail
+
+eval "$FUNCTION_DEF"
+
+echo "BUILD_TYPE : ${BUILD_TYPE}"
+
+if [[ $TARGET == "cuda" && "$ARCH" == "x86_64" ]]; then
+
+export CUDA_HOME=/usr/local/cuda
+export PATH=$PATH:$CUDA_HOME/bin
+
+# install dependencies
+apt-get install -y libelf-dev
+
+# default to BUILD_TYPE=all: build every component
+build_rdma "$TARGET" "$ARCH" "$IS_EFA"
+build_efa "$TARGET" "$ARCH" "$IS_EFA"
+build_p2p "$TARGET" "$ARCH" "$IS_EFA"
+build_ep "$TARGET" "$ARCH" "$IS_EFA"
+# NOTE (yiakwy): eccl is skipped on the CUDA platform
+build_eccl "$TARGET" "$ARCH" "$IS_EFA"
+
+else
+
+echo "$TARGET is not supported yet."
+exit 1
+
+fi
+
+cd "$ROOT"
+
+python${PY_VER} -m build
+
+auditwheel repair dist/uccl-*.whl --exclude "libtorch*.so" --exclude "libc10*.so" --exclude "libibverbs.so.1" --exclude "libcudart.so.12" --exclude "libamdhip64.so.*" --exclude "libcuda.so.1" -w $ROOT/${WHEEL_DIR}
+auditwheel show $ROOT/${WHEEL_DIR}/*.whl
+
+# 3. Done
+echo "[3/3] Wheel built successfully (stored in ${WHEEL_DIR}):"
+ls -lh "$ROOT/${WHEEL_DIR}"/uccl-*.whl || true
diff --git a/collective/rdma/nccl_plugin.cc b/collective/rdma/nccl_plugin.cc
index 578485cc3..b52b2b1a6 100644
--- a/collective/rdma/nccl_plugin.cc
+++ b/collective/rdma/nccl_plugin.cc
@@ -1,4 +1,4 @@
-#include "nccl_net.h"
+#include <nccl_net.h>
 #include "transport.h"
 #include "transport_config.h"
 #include "util_rdma.h"
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
index 6ff363942..098f173e1 100644
--- a/docker/Dockerfile.cuda
+++ b/docker/Dockerfile.cuda
@@ -16,7 +16,7 @@ RUN apt-get update && \
     build-essential cmake git ninja-build g++ make patchelf \
     rdma-core libibverbs-dev \
     libgoogle-glog-dev libgflags-dev libgtest-dev libelf-dev \
-    libnuma-dev libdrm-dev libdrm-amdgpu1 \
+    libnuma-dev \
     pkg-config zlib1g-dev curl && \
     \
     # ───── Add Python ${PY_VER} PPA & install Python ${PY_VER} + setuptools ─────
@@ -47,7 +47,7 @@ RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools
 
 # ───── Set Python ${PY_VER} as default python3 and python3-config ─────
 RUN ln -sf /usr/bin/python${PY_VER} /usr/local/bin/python3 && \
     ln -sf /usr/bin/python${PY_VER}-config /usr/local/bin/python3-config
-    
+
 WORKDIR /io
-CMD ["bash"]
\ No newline at end of file
+CMD ["bash"]