diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..fe900bbf2
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,39 @@
+default_stages: [pre-commit, pre-push, manual]
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-symlinks
+      - id: destroyed-symlinks
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+        args: [--allow-multiple-documents]
+      # - id: check-toml
+      - id: check-ast
+      - id: check-added-large-files
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: detect-private-key
+      - id: debug-statements
+      - id: no-commit-to-branch
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        args:
+          - "--profile"
+          - "black"
+          - "--filter-files"
+  - repo: https://github.com/psf/black
+    rev: 24.10.0
+    hooks:
+      - id: black-jupyter
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.8.1
+    hooks:
+      - id: nbstripout
+        args:
+          - '--keep-output'
+          - '--extra-keys=metadata.kernelspec metadata.language_info.version'
diff --git a/build_and_install.sh b/build_and_install.sh
index bba4595d8..d0320c45a 100755
--- a/build_and_install.sh
+++ b/build_and_install.sh
@@ -8,13 +8,34 @@ ROCM_IDX_URL=${4:-https://rocm.prereleases.amd.com/whl/gfx94X-dcgpu}
 # The default for THEROCK_BASE_IMAGE is current, but may change. Make sure to track TheRock's dockerfile.
 THEROCK_BASE_IMAGE=${5:-quay.io/pypa/manylinux_2_28_x86_64@sha256:d632b5e68ab39e59e128dcf0e59e438b26f122d7f2d45f3eea69ffd2877ab017}
 
+echo "TARGET : $TARGET"
+
 if [[ $TARGET != cuda* && $TARGET != rocm* && $TARGET != "therock" ]]; then
     echo "Usage: $0 [cuda|rocm|therock] [all|rdma|p2p|efa|ep] [py_version] [rocm_index_url] [therock_base_image]" >&2
     exit 1
 fi
 
 ARCH_SUFFIX=$(uname -m)
-./build.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE
+
+echo "ARCH_SUFFIX : $ARCH_SUFFIX"
+
+# Detect whether we are already running inside a container
+is_docker_container() {
+    [ -f /.dockerenv ] || grep -q docker /proc/1/cgroup 2>/dev/null
+}
+
+# Check if the docker command is available and working
+has_docker_command() {
+    command -v docker &> /dev/null && docker info &> /dev/null
+}
+
+if is_docker_container || ! has_docker_command; then
+    echo "Already inside a container (or Docker unavailable): ./build_insider_docker.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE"
+    ./build_insider_docker.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE
+else
+    echo "Running the build inside Docker"
+    ./build.sh $TARGET $BUILD_TYPE $PY_VER $ROCM_IDX_URL $THEROCK_BASE_IMAGE
+fi
+
 pip install -r requirements.txt
 pip uninstall uccl -y || true
 if [[ $TARGET != "therock" ]]; then
diff --git a/build_insider_docker.sh b/build_insider_docker.sh
new file mode 100755
index 000000000..3147bea12
--- /dev/null
+++ b/build_insider_docker.sh
@@ -0,0 +1,243 @@
+#!/bin/bash
+set -e
+
+# -----------------------
+# Build uccl wheels for CUDA (NVIDIA) and ROCm (AMD) backends/targets.
+# Unlike build.sh, this script does not launch Docker itself: it is meant to
+# run inside an existing container, or on a host that already has the
+# CUDA/ROCm toolchain installed.
+#
+# Usage:
+#   ./build_insider_docker.sh [cuda|rocm|therock] [all|rdma|p2p|efa|ep] [py_version] [rocm_index_url] [therock_base_image]
+#
+# The wheels are written to wheelhouse-[cuda|rocm|therock]
+# -----------------------
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+TARGET=${1:-cuda}
+BUILD_TYPE=${2:-all}
+PY_VER=${3:-$(python3 -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")}
+ARCH="$(uname -m)"
+# The default for ROCM_IDX_URL depends on the gfx architecture of your GPU, and the index URLs may change.
+ROCM_IDX_URL=${4:-https://rocm.prereleases.amd.com/whl/gfx94X-dcgpu}
+# The default for THEROCK_BASE_IMAGE is current, but may change. Make sure to track TheRock's dockerfile.
+THEROCK_BASE_IMAGE=${5:-quay.io/pypa/manylinux_2_28_x86_64@sha256:d632b5e68ab39e59e128dcf0e59e438b26f122d7f2d45f3eea69ffd2877ab017}
+IS_EFA=$( { [ -d "/sys/class/infiniband/" ] && ls /sys/class/infiniband/ 2>/dev/null | grep -q rdmap && echo "true"; } || echo "false" ); echo "EFA support: $IS_EFA"
+
+if [[ $TARGET != cuda* && $TARGET != rocm* && $TARGET != "therock" ]]; then
+    echo "Usage: $0 [cuda|rocm|therock] [all|rdma|p2p|efa|ep|eccl] [py_version] [rocm_index_url]" >&2
+    exit 1
+fi
+
+if [[ $ARCH == "aarch64" && ( $TARGET == rocm* || $TARGET == "therock" ) ]]; then
+    echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
+    exit 1
+fi
+
+rm -r uccl.egg-info >/dev/null 2>&1 || true
+rm -r dist >/dev/null 2>&1 || true
+rm -r build >/dev/null 2>&1 || true
+WHEEL_DIR="wheelhouse-${TARGET}"
+rm -r "${WHEEL_DIR}" >/dev/null 2>&1 || true
+mkdir -p "${WHEEL_DIR}"
+
+build_rccl_nccl_h() {
+    # Unlike CUDA, ROCm does not ship nccl.h, so we build rccl just to generate it.
+    if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then
+        cd thirdparty/rccl
+        # Just to get nccl.h, not the whole library
+        CXX=/opt/rocm/bin/hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF >/dev/null 2>&1 || true
+        cd ../..
+    fi
+}
+
+build_rdma() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_rdma Target: $TARGET"
+
+    if [[ "$TARGET" == cuda* ]]; then
+        cd collective/rdma && make clean && make -j$(nproc) && cd ../../
+        TARGET_SO=collective/rdma/libnccl-net-uccl.so
+    elif [[ "$TARGET" == rocm* ]]; then
+        if [[ "$ARCH" == "aarch64" ]]; then
+            echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
+            return
+        fi
+        cd collective/rdma && make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm && cd ../../
+        TARGET_SO=collective/rdma/librccl-net-uccl.so
+    elif [[ "$TARGET" == "therock" ]]; then
+        if [[ "$ARCH" == "aarch64" ]]; then
+            echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
+            return
+        fi
+        # Unlike CUDA, ROCm does not ship nccl.h, so we build rccl just to generate it.
+        if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then
+            cd thirdparty/rccl
+            # Just to get nccl.h, not the whole library
+            CXX=hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF -DCMAKE_PREFIX_PATH=$(rocm-sdk path --cmake) -DROCM_PATH=$(rocm-sdk path --root) -DHIP_PLATFORM=amd >/dev/null 2>&1 || true
+            cd ../..
+        fi
+        cd collective/rdma && make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib && cd ../../
+        TARGET_SO=collective/rdma/librccl-net-uccl.so
+    fi
+
+    echo "[container] Copying RDMA .so to uccl/lib/"
+    mkdir -p uccl/lib
+    cp ${TARGET_SO} uccl/lib/
+}
+
+build_efa() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_efa Target: $TARGET"
+
+    if [[ "$ARCH" == "aarch64" || "$TARGET" == rocm* || "$TARGET" == "therock" ]]; then
+        echo "Skipping EFA build on Arm64 (no EFA installer) or ROCm (no CUDA)."
+        return
+    fi
+    cd collective/efa && make clean && make -j$(nproc) && cd ../../
+
+    # EFA requires a custom NCCL.
+    cd thirdparty/nccl-sg
+    make src.build -j$(nproc) NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80"
+    cd ../..
+
+    echo "[container] Copying EFA .so to uccl/lib/"
+    mkdir -p uccl/lib
+    cp collective/efa/libnccl-net-efa.so uccl/lib/
+    cp thirdparty/nccl-sg/build/lib/libnccl.so uccl/lib/libnccl-efa.so
+}
+
+build_p2p() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_p2p Target: $TARGET"
+
+    cd p2p
+    if [[ "$TARGET" == cuda* ]]; then
+        make clean && make -j$(nproc)
+    elif [[ "$TARGET" == rocm* ]]; then
+        make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm
+    elif [[ "$TARGET" == "therock" ]]; then
+        make clean -f Makefile.therock && make -j$(nproc) -f Makefile.therock HIP_HOME=$(rocm-sdk path --root) CONDA_LIB_HOME=$VIRTUAL_ENV/lib
+    fi
+    cd ..
+
+    echo "[container] Copying P2P .so, collective.py and utils.py to uccl/"
+    mkdir -p uccl
+    mkdir -p uccl/lib
+    if [[ -z "${USE_TCPX:-}" || "$USE_TCPX" != "1" ]]; then
+        cp p2p/p2p.*.so uccl/
+        cp p2p/collective.py uccl/
+        cp p2p/transfer.py uccl/
+        cp p2p/utils.py uccl/
+    else
+        echo "[container] USE_TCPX=1, skipping copying p2p runtime files"
+    fi
+}
+
+build_ep() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_ep Target: $TARGET"
+
+    if [[ "$TARGET" == "therock" ]]; then
+        echo "Skipping GPU-driven build on therock (no GPU-driven support yet)."
+    elif [[ "$TARGET" == rocm* ]]; then
+        cd ep
+        python3 setup.py build
+        cd ..
+        echo "[container] Copying GPU-driven .so to uccl/"
+        mkdir -p uccl/lib
+        cp ep/build/**/*.so uccl/
+    elif [[ "$TARGET" == cuda* ]]; then
+        cd ep
+        make clean && make -j$(nproc) all
+        cd ..
+        echo "[container] Copying GPU-driven .so to uccl/"
+        mkdir -p uccl/lib
+        cp ep/*.so uccl/
+    fi
+}
+
+build_eccl() {
+    local TARGET="$1"
+    local ARCH="$2"
+    local IS_EFA="$3"
+
+    set -euo pipefail
+    echo "[container] build_eccl Target: $TARGET"
+
+    cd eccl
+    if [[ "$TARGET" == cuda* ]]; then
+        echo "Skipping eccl build on CUDA."
+        cd .. && return
+    elif [[ "$TARGET" == rocm* ]]; then
+        make clean -f Makefile.rocm && make -j$(nproc) -f Makefile.rocm
+    fi
+    cd ..
+
+    echo "[container] Copying eccl .so to uccl/"
+    # mkdir -p uccl/lib
+    # cp eccl/eccl.*.so uccl/
+}
+
+# Build (the environment is expected to already contain the toolchain + CUDA/ROCm)
+echo "[2/3] Building..."
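+
+# NOTE: FUNCTION_DEF captures the build_* function bodies via `declare -f`,
+# presumably so that a child shell can re-create them with `eval "$FUNCTION_DEF"`;
+# the `eval` below does the same in the current shell, where it simply
+# re-defines functions that already exist.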
+
+export USE_TCPX="${USE_TCPX:-0}"
+export MAKE_NORMAL_MODE="${MAKE_NORMAL_MODE:-}"
+export FUNCTION_DEF="$(declare -f build_rccl_nccl_h build_rdma build_efa build_p2p build_ep build_eccl)"
+
+set -euo pipefail
+
+eval "$FUNCTION_DEF"
+
+echo "BUILD_TYPE : ${BUILD_TYPE}"
+
+if [[ $TARGET == "cuda" && "$ARCH" == "x86_64" ]]; then
+
+export CUDA_HOME=/usr/local/cuda
+export PATH=$PATH:$CUDA_HOME/bin
+
+# install dependencies
+apt-get install -y libelf-dev
+
+# default to BUILD_TYPE=all: build every component
+build_rdma "$TARGET" "$ARCH" "$IS_EFA"
+build_efa "$TARGET" "$ARCH" "$IS_EFA"
+build_p2p "$TARGET" "$ARCH" "$IS_EFA"
+build_ep "$TARGET" "$ARCH" "$IS_EFA"
+# NOTE (yiakwy): eccl is skipped on the CUDA platform
+build_eccl "$TARGET" "$ARCH" "$IS_EFA"
+
+else
+
+echo "$TARGET is not supported yet."
+exit 1
+
+fi
+
+cd "$ROOT"
+
+python${PY_VER} -m build
+
+auditwheel repair dist/uccl-*.whl --exclude "libtorch*.so" --exclude "libc10*.so" --exclude "libibverbs.so.1" --exclude "libcudart.so.12" --exclude "libamdhip64.so.*" --exclude "libcuda.so.1" -w $ROOT/${WHEEL_DIR}
+auditwheel show $ROOT/${WHEEL_DIR}/*.whl
+
+# 3. Done
+echo "[3/3] Wheel built successfully (stored in ${WHEEL_DIR}):"
+ls -lh "$ROOT/${WHEEL_DIR}"/uccl-*.whl || true
diff --git a/collective/rdma/nccl_plugin.cc b/collective/rdma/nccl_plugin.cc
index 578485cc3..b52b2b1a6 100644
--- a/collective/rdma/nccl_plugin.cc
+++ b/collective/rdma/nccl_plugin.cc
@@ -1,4 +1,4 @@
-#include "nccl_net.h"
+#include <nccl_net.h>
 #include "transport.h"
 #include "transport_config.h"
 #include "util_rdma.h"
diff --git a/docker/Dockerfile.cuda b/docker/Dockerfile.cuda
index 6ff363942..098f173e1 100644
--- a/docker/Dockerfile.cuda
+++ b/docker/Dockerfile.cuda
@@ -16,7 +16,7 @@ RUN apt-get update && \
     build-essential cmake git ninja-build g++ make patchelf \
     rdma-core libibverbs-dev \
     libgoogle-glog-dev libgflags-dev libgtest-dev libelf-dev \
-    libnuma-dev libdrm-dev libdrm-amdgpu1 \
+    libnuma-dev \
     pkg-config zlib1g-dev curl && \
     \
     # ───── Add Python ${PY_VER} PPA & install Python ${PY_VER} + setuptools ─────
@@ -47,7 +47,7 @@ RUN python${PY_VER} -m pip install --no-cache-dir --upgrade setuptools
 
 # ───── Set Python ${PY_VER} as default python3 and python3-config ─────
 RUN ln -sf /usr/bin/python${PY_VER} /usr/local/bin/python3 && \
     ln -sf /usr/bin/python${PY_VER}-config /usr/local/bin/python3-config
-    
+
 WORKDIR /io
-CMD ["bash"]
\ No newline at end of file
+CMD ["bash"]