Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
2f53487
Add topology discovery tool
pentschev Oct 18, 2025
da7304d
Improve network topology discovery to account PCIe for proximity
pentschev Nov 3, 2025
ee789d1
Refactor into separate API and CLI tool
pentschev Nov 3, 2025
5a3cd48
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 3, 2025
4dbb65b
Add NVML dependency
pentschev Nov 3, 2025
e080dfc
Fix CMakeLists.txt linting
pentschev Nov 3, 2025
1c5ad2e
Cleanup
pentschev Nov 4, 2025
73e56be
Use `std::optional` instead of additional `bool`
pentschev Nov 4, 2025
9ff4917
Merge branch 'main' into topology-discovery
pentschev Nov 4, 2025
c671ff2
Fix linting
pentschev Nov 4, 2025
3f8debd
Apply std::optional changes to cpp file
pentschev Nov 4, 2025
e5bb157
Code formatting
pentschev Nov 5, 2025
8ec3d31
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 5, 2025
48aa0a6
Update CMakeLists
pentschev Nov 5, 2025
1c45505
Do not link to nvml
pentschev Nov 5, 2025
9af9451
Test topology discovery
pentschev Nov 5, 2025
4e9bee7
Improve docs
pentschev Nov 5, 2025
859acbb
Link to NVML again and allow missing DSO
pentschev Nov 5, 2025
bc8f9fb
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 5, 2025
1148fee
Enable debug output
pentschev Nov 5, 2025
120f32c
More debugging output
pentschev Nov 5, 2025
c15814f
Disable failure on first error
pentschev Nov 5, 2025
b27c087
Fix disable failure on first error
pentschev Nov 5, 2025
dd8a4b2
Print numa_node contents
pentschev Nov 5, 2025
2fd6577
Revert debug output
pentschev Nov 5, 2025
0683a46
Remove memory binding validation
pentschev Nov 5, 2025
3088770
Fix clang-tidy failures
pentschev Nov 5, 2025
320ec5a
Do not link, dlopen
pentschev Nov 5, 2025
89bc8c7
Fix more clang-tidy failures
pentschev Nov 5, 2025
872dcbb
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 5, 2025
cb96bed
One more clang-tidy error...
pentschev Nov 5, 2025
4f85ad9
Attempt to remove linking to librapidsmpf
pentschev Nov 6, 2025
7f44047
Fix typo
pentschev Nov 6, 2025
f306994
Add CMAKE_DL_LIBS
pentschev Nov 6, 2025
4b7fe9d
Add wheels dependency
pentschev Nov 6, 2025
94d3505
Install nvidia-nvml-dev in devcontainers
pentschev Nov 6, 2025
2950cb8
Attempt to find nvml.h from wheels
pentschev Nov 6, 2025
9b2cb9a
Set SITE_PACKAGES
pentschev Nov 6, 2025
28dc7ee
Make SITE_PACKAGES available for singlecomm build also
pentschev Nov 6, 2025
9eaf2c2
Install NVML packages to venv
pentschev Nov 6, 2025
506769d
Set SITE_PACKAGES in devcontainer
pentschev Nov 6, 2025
cf0c51e
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 6, 2025
a7b1723
Attempt to find nvml.h in CUDAToolkit_INCLUDE_DIRS
pentschev Nov 10, 2025
b9d8be8
Remove NVML install from devcontainers
pentschev Nov 10, 2025
b59afb8
Remove setting NVML_INCLUDE_DIR
pentschev Nov 11, 2025
5310fe5
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 11, 2025
0985a0c
Add pyproject to build-nvml
pentschev Nov 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ci/build_wheel_librapidsmpf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ export PIP_NO_BUILD_ISOLATION=0

export SKBUILD_CMAKE_ARGS="-DBUILD_MPI_SUPPORT=OFF;-DBUILD_TESTS=OFF;-DBUILD_BENCHMARKS=OFF;-DBUILD_EXAMPLES=OFF;-DBUILD_NUMA_SUPPORT=OFF"

# Needed to find nvml.h
SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
export SITE_PACKAGES

./ci/build_wheel.sh "${package_name}" "${package_dir}"

python -m auditwheel repair \
Expand Down
7 changes: 4 additions & 3 deletions ci/build_wheel_singlecomm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ export PIP_NO_BUILD_ISOLATION=0

export SKBUILD_CMAKE_ARGS="-DBUILD_MPI_SUPPORT=OFF;-DBUILD_UCXX_SUPPORT=OFF;-DBUILD_TESTS=OFF;-DBUILD_BENCHMARKS=OFF;-DBUILD_EXAMPLES=OFF;-DBUILD_NUMA_SUPPORT=OFF"

# Needed also for librapidsmpf to find nvml.h
SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
export SITE_PACKAGES

./ci/build_wheel.sh "${package_name}" "${package_dir}"

python -m auditwheel repair \
Expand Down Expand Up @@ -70,9 +74,6 @@ rapids-pip-retry install \

export SKBUILD_CMAKE_ARGS=""

SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
export SITE_PACKAGES

./ci/build_wheel.sh "${package_name_py}" "${package_dir_py}"

python -m auditwheel repair \
Expand Down
12 changes: 0 additions & 12 deletions ci/run_cpp_benchmark_smoketests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,3 @@ for i in {0..2}; do
exit 1
fi
done

# Test with rrun

# Confirm no dependencies on OpenMPI variables
unset OMPI_ALLOW_RUN_AS_ROOT
unset OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
unset OMPI_MCA_opal_cuda_support

python "${TIMEOUT_TOOL_PATH}" 30 \
rrun -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx
python "${TIMEOUT_TOOL_PATH}" 30 \
rrun --tag-output -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx
24 changes: 24 additions & 0 deletions ci/run_cpp_tools_smoketests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

# Smoke tests for the installed command-line tools: runs `bench_comm` through `rrun` and checks
# that the output of `topology_discovery` is accepted by validate_topology_json.py.

# -x traces commands, -e aborts on the first failing command, -u rejects unset variables, and
# pipefail makes a pipeline fail when any stage fails (so a failing `topology_discovery` in the
# final pipeline is not hidden by the validator's exit status).
set -xeuo pipefail

# Directory holding this script; the Python helpers below are resolved relative to it.
CI_PATH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
TIMEOUT_TOOL_PATH="${CI_PATH}"/timeout_with_stack.py
VALIDATE_TOPOLOGY_PATH="${CI_PATH}"/validate_topology_json.py

# Support customizing the install location: INSTALL_PREFIX is used when set, otherwise
# CONDA_PREFIX, otherwise /usr.
cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/benchmarks/librapidsmpf/"

# Confirm no dependencies on OpenMPI variables
unset OMPI_ALLOW_RUN_AS_ROOT
unset OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
unset OMPI_MCA_opal_cuda_support

# Launch ./bench_comm via rrun, wrapped by the timeout helper. The leading "30" is presumably the
# timeout in seconds -- confirm against timeout_with_stack.py. The second invocation is identical
# except for the added --tag-output flag.
python "${TIMEOUT_TOOL_PATH}" 30 \
rrun -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx
python "${TIMEOUT_TOOL_PATH}" 30 \
rrun --tag-output -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx

# `topology_discovery` is resolved through PATH (unlike ./bench_comm above). Its stdout is piped
# to the validator, which reads JSON from stdin when given "-".
topology_discovery | python "${VALIDATE_TOPOLOGY_PATH}" -
4 changes: 4 additions & 0 deletions ci/test_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,9 @@ rapids-logger "Run example smoketests"
rapids-logger "Run benchmark smoketests"
./run_cpp_benchmark_smoketests.sh

# Ensure tools are runnable
rapids-logger "Run tools smoketests"
./run_cpp_tools_smoketests.sh

rapids-logger "Test script exiting with exit code: $EXITCODE"
exit ${EXITCODE}
138 changes: 138 additions & 0 deletions ci/validate_topology_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
import sys
from typing import Any, List, Optional


def load_json_from_path_or_stdin(input_path: str) -> Any:
    """Read and decode a JSON document from a file or from standard input.

    Args:
        input_path: Path of the JSON file to read, or ``"-"`` to read the
            document from stdin.

    Returns:
        The decoded JSON value.

    Raises:
        RuntimeError: Reading stdin failed.
        ValueError: Stdin was empty (or whitespace only), or the content is
            not valid JSON.
        FileNotFoundError: ``input_path`` does not exist.
    """
    if input_path == "-":
        try:
            content = sys.stdin.read()
        except Exception as exc:
            raise RuntimeError(f"failed reading stdin: {exc}") from exc
        if not content.strip():
            # The CLI takes a positional path argument (there is no --input
            # option), so the hint must not mention one.
            raise ValueError(
                "no input provided on stdin; pass a JSON file path or pipe JSON"
            )
        try:
            return json.loads(content)
        except json.JSONDecodeError as exc:
            raise ValueError(f"invalid JSON from stdin: {exc}") from exc

    try:
        with open(input_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError as exc:
        # Re-raised with a shorter message; the original error stays attached
        # as the cause.
        raise FileNotFoundError(f"input file not found: {input_path}") from exc
    except json.JSONDecodeError as exc:
        raise ValueError(f"invalid JSON in file {input_path}: {exc}") from exc


def ensure_non_empty_string(errors: List[str], obj: Any, key: str, context: str) -> None:
    """Record an error unless ``obj[key]`` is a string with non-whitespace content.

    Args:
        errors: List that receives the error message, if any.
        obj: Mapping to look ``key`` up in (via ``.get``).
        key: Name of the field to check.
        context: Prefix identifying ``obj`` in the error message.
    """
    candidate = obj.get(key)
    if isinstance(candidate, str) and candidate.strip():
        return
    errors.append(f"{context}.{key} must be a non-empty string")


def ensure_int(
    errors: List[str],
    obj: Any,
    key: str,
    context: str,
    *,
    min_value: Optional[int] = None,
) -> None:
    """Record an error unless ``obj[key]`` is an integer, optionally bounded below.

    Args:
        errors: List that receives the error message, if any.
        obj: Mapping to look ``key`` up in (via ``.get``).
        key: Name of the field to check.
        context: Prefix identifying ``obj`` in the error message.
        min_value: When given, the smallest accepted value (inclusive).
    """
    value = obj.get(key)
    # bool is a subclass of int in Python, so JSON true/false would otherwise
    # be accepted as integers (and True would even satisfy min_value=1).
    if isinstance(value, bool) or not isinstance(value, int):
        errors.append(f"{context}.{key} must be an integer")
        return
    if min_value is not None and value < min_value:
        errors.append(f"{context}.{key} must be >= {min_value} (got {value})")


def ensure_non_empty_list(errors: List[str], obj: Any, key: str, context: str) -> None:
    """Record an error unless ``obj[key]`` is a list holding at least one element.

    Args:
        errors: List that receives the error message, if any.
        obj: Mapping to look ``key`` up in (via ``.get``).
        key: Name of the field to check.
        context: Prefix identifying ``obj`` in the error message.
    """
    items = obj.get(key)
    if isinstance(items, list) and items:
        return
    errors.append(f"{context}.{key} must be a non-empty list")


def validate_topology(data: Any) -> List[str]:
    """Check the structure of a decoded topology JSON document.

    Args:
        data: Decoded JSON value to validate.

    Returns:
        A list of human-readable error messages; empty when no problem was
        found. Messages are ordered: system section, then each GPU entry in
        order, then the duplicate-id report.
    """
    if not isinstance(data, dict):
        return ["top-level JSON must be an object"]

    problems: List[str] = []

    # System section
    system_info = data.get("system")
    if isinstance(system_info, dict):
        ensure_non_empty_string(problems, system_info, "hostname", "system")
        for field, lower_bound in (
            ("num_gpus", 1),
            ("num_numa_nodes", 1),
            ("num_network_devices", 0),
        ):
            ensure_int(problems, system_info, field, "system", min_value=lower_bound)
    else:
        problems.append("system must be an object")

    # GPUs section
    gpu_entries = data.get("gpus")
    if not (isinstance(gpu_entries, list) and gpu_entries):
        problems.append("gpus must be a non-empty array (at least one GPU must be present)")
        return problems

    seen_ids = set()
    repeated_ids = set()
    for position, entry in enumerate(gpu_entries):
        where = f"gpus[{position}]"
        if not isinstance(entry, dict):
            problems.append(f"{where} must be an object")
            continue

        ensure_int(problems, entry, "id", where)
        gpu_id = entry.get("id")
        if isinstance(gpu_id, int):
            # An id lands in repeated_ids from its second occurrence onwards.
            (repeated_ids if gpu_id in seen_ids else seen_ids).add(gpu_id)

        for field in ("name", "pci_bus_id", "uuid"):
            ensure_non_empty_string(problems, entry, field, where)
        ensure_int(problems, entry, "numa_node", where)

        affinity = entry.get("cpu_affinity")
        if isinstance(affinity, dict):
            affinity_ctx = f"{where}.cpu_affinity"
            ensure_non_empty_string(problems, affinity, "cpulist", affinity_ctx)
            ensure_non_empty_list(problems, affinity, "cores", affinity_ctx)
        else:
            problems.append(f"{where}.cpu_affinity must be an object")

    # GPU id uniqueness check
    if repeated_ids:
        listing = ", ".join(str(x) for x in sorted(repeated_ids))
        problems.append(f"gpus ids must be unique; duplicates found: {listing}")

    # We don't validate several fields and their contents because the virtualized CI
    # environment doesn't have complete topology information.

    return problems


def main() -> int:
    """Command-line entry point: validate one topology JSON document.

    Returns:
        0 when the document passes validation, 1 when validation reported
        errors, 2 when the input could not be read or decoded. All error
        messages are written to stderr.
    """
    cli = argparse.ArgumentParser(description="Validate topology_discovery JSON output")
    cli.add_argument(
        "input",
        help="Path to JSON file to validate; pass '-' to read from stdin",
    )
    options = cli.parse_args()

    try:
        topology = load_json_from_path_or_stdin(options.input)
    except Exception as exc:
        print(f"ERROR: {exc}", file=sys.stderr)
        return 2

    problems = validate_topology(topology)
    for problem in problems:
        print(f"ERROR: {problem}", file=sys.stderr)
    return 1 if problems else 0


if __name__ == "__main__":
    sys.exit(main())
1 change: 1 addition & 0 deletions conda/environments/all_cuda-129_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-129_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-130_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-130_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
8 changes: 8 additions & 0 deletions conda/recipes/librapidsmpf/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ cache:
- ${{ compiler("cxx") }}
- ${{ compiler("cuda") }} =${{ cuda_version }}
- cuda-cupti-dev
- cuda-nvml-dev
- cuda-version =${{ cuda_version }}
- cmake ${{ cmake_version }}
- ninja
Expand All @@ -60,6 +61,7 @@ cache:
- cuda-version =${{ cuda_version }}
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvml-dev
- librmm =${{ minor_version }}
- libcudf =${{ minor_version }}
- openmpi >=5.0 # See <https://github.com/rapidsai/rapidsmpf/issues/17>
Expand All @@ -77,6 +79,8 @@ outputs:
cmake --install cpp/build --component=tools
dynamic_linking:
overlinking_behavior: "error"
missing_dso_allowlist:
- "libnvidia-ml.so.1"
prefix_detection:
ignore:
# See https://github.com/rapidsai/build-planning/issues/160
Expand All @@ -86,11 +90,13 @@ outputs:
build:
- cmake ${{ cmake_version }}
- cuda-cupti-dev
- cuda-nvml-dev
- ${{ stdlib("c") }}
host:
- cuda-version =${{ cuda_version }}
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvml-dev
- libcudf =${{ minor_version }}
- openmpi >=5.0
- ucxx ${{ ucxx_version }}
Expand Down Expand Up @@ -131,10 +137,12 @@ outputs:
build:
- cmake ${{ cmake_version }}
- cuda-cupti-dev
- cuda-nvml-dev
- ${{ stdlib("c") }}
host:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvml-dev
- cuda-version =${{ cuda_version }}
- libcudf =${{ minor_version }}
- librmm =${{ minor_version }}
Expand Down
34 changes: 34 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ add_library(
src/shuffler/postbox.cpp
src/shuffler/shuffler.cpp
src/statistics.cpp
src/topology_discovery.cpp
src/utils.cpp
)
if(RAPIDSMPF_HAVE_STREAMING)
Expand Down Expand Up @@ -234,6 +235,33 @@ target_include_directories(
INTERFACE "$<INSTALL_INTERFACE:include>"
)

# Locate the directory that contains nvml.h. The CUDA Toolkit include directories are searched
# first. When building wheels the header may instead live under Python site-packages, so fall back
# to scanning "$ENV{SITE_PACKAGES}/nvidia/*/include" (SITE_PACKAGES is exported by the CI/build
# scripts and is read at configure time only).
find_path(
  NVML_INCLUDE_DIR
  NAMES nvml.h
  HINTS ${CUDAToolkit_INCLUDE_DIRS}
  PATH_SUFFIXES include
)

if(NOT NVML_INCLUDE_DIR AND DEFINED ENV{SITE_PACKAGES})
  file(GLOB _nvml_sites "$ENV{SITE_PACKAGES}/nvidia/*/include")
  foreach(_cand IN LISTS _nvml_sites)
    if(EXISTS "${_cand}/nvml.h")
      # Shadows the NVML_INCLUDE_DIR-NOTFOUND cache entry left behind by find_path() above.
      set(NVML_INCLUDE_DIR "${_cand}")
      break()
    endif()
  endforeach()
endif()

if(NVML_INCLUDE_DIR)
  message(STATUS "Found NVML headers at: ${NVML_INCLUDE_DIR}")
  # Quoted so the path is always passed as a single argument.
  target_include_directories(rapidsmpf PRIVATE "${NVML_INCLUDE_DIR}")
else()
  # Not fatal: no extra include directory is added, and nvml.h must then be reachable through
  # include paths configured elsewhere. Reported so a later "nvml.h: No such file" is traceable.
  message(STATUS "NVML headers (nvml.h) not found in CUDAToolkit_INCLUDE_DIRS or SITE_PACKAGES; "
                 "no NVML include directory added"
  )
endif()

target_link_libraries(
rapidsmpf
PUBLIC rmm::rmm cudf::cudf CCCL::CCCL $<TARGET_NAME_IF_EXISTS:ucxx::ucxx>
Expand All @@ -244,6 +272,7 @@ target_link_libraries(
$<TARGET_NAME_IF_EXISTS:conda_env>
maybe_asan
$<TARGET_NAME_IF_EXISTS:CCCL::cudax>
${CMAKE_DL_LIBS}
)

target_compile_definitions(
Expand Down Expand Up @@ -335,6 +364,11 @@ endif()
# -------------------------------------------------------------------------------------
add_subdirectory(tools)

# Make the NVML include directory available to the topology_discovery tool as well. Skipped when
# no NVML include directory was found or when the tools subdirectory did not define the target.
if(NVML_INCLUDE_DIR AND TARGET topology_discovery)
  # Quoted so the path is always passed as a single argument.
  target_include_directories(topology_discovery PRIVATE "${NVML_INCLUDE_DIR}")
endif()

# ##################################################################################################
# * install targets -------------------------------------------------------------------------------
rapids_cmake_install_lib_dir(lib_dir)
Expand Down
Loading