rapidsai · pentschev · Oct 18, 2025 · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025
@@ -29,6 +29,10 @@ export PIP_NO_BUILD_ISOLATION=0
 
 export SKBUILD_CMAKE_ARGS="-DBUILD_MPI_SUPPORT=OFF;-DBUILD_TESTS=OFF;-DBUILD_BENCHMARKS=OFF;-DBUILD_EXAMPLES=OFF;-DBUILD_NUMA_SUPPORT=OFF"
 
+# Export the Python site-packages directory so the CMake build can locate nvml.h there
+SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
+export SITE_PACKAGES
+
 ./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
 python -m auditwheel repair \

@@ -36,6 +36,10 @@ export PIP_NO_BUILD_ISOLATION=0
 
 export SKBUILD_CMAKE_ARGS="-DBUILD_MPI_SUPPORT=OFF;-DBUILD_UCXX_SUPPORT=OFF;-DBUILD_TESTS=OFF;-DBUILD_BENCHMARKS=OFF;-DBUILD_EXAMPLES=OFF;-DBUILD_NUMA_SUPPORT=OFF"
 
+# Also export the Python site-packages directory here so librapidsmpf can locate nvml.h
+SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
+export SITE_PACKAGES
+
 ./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
 python -m auditwheel repair \
@@ -70,9 +74,6 @@ rapids-pip-retry install \
 
 export SKBUILD_CMAKE_ARGS=""
 
-SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
-export SITE_PACKAGES
-
 ./ci/build_wheel.sh "${package_name_py}" "${package_dir_py}"
 
 python -m auditwheel repair \

@@ -40,15 +40,3 @@ for i in {0..2}; do
     exit 1
   fi
 done
-
-# Test with rrun
-
-# Confirm no dependencies on OpenMPI variables
-unset OMPI_ALLOW_RUN_AS_ROOT
-unset OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
-unset OMPI_MCA_opal_cuda_support
-
-python "${TIMEOUT_TOOL_PATH}" 30 \
-    rrun -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx
-python "${TIMEOUT_TOOL_PATH}" 30 \
-    rrun --tag-output -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx
@@ -0,0 +1,24 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+# Smoke-test the installed command-line tools: run bench_comm through rrun and
+# validate the JSON emitted by topology_discovery.
+#
+# -x traces each command, -e aborts on the first failing command, -u treats
+# unset variables as errors, and pipefail makes a pipeline fail if any stage fails.
+set -xeuo pipefail
+
+# Directory containing this script (symlinks resolved); the Python helpers are
+# looked up next to it.
+CI_PATH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
+TIMEOUT_TOOL_PATH="${CI_PATH}"/timeout_with_stack.py
+VALIDATE_TOPOLOGY_PATH="${CI_PATH}"/validate_topology_json.py
+
+# Support customizing the install location of the benchmark binaries:
+# INSTALL_PREFIX takes precedence, then CONDA_PREFIX, then /usr.
+cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/benchmarks/librapidsmpf/"
+
+# Confirm no dependencies on OpenMPI variables
+unset OMPI_ALLOW_RUN_AS_ROOT
+unset OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
+unset OMPI_MCA_opal_cuda_support
+
+# Launch bench_comm via rrun, once with default output and once with --tag-output.
+# NOTE(review): 30 is presumably the timeout in seconds enforced by
+# timeout_with_stack.py, and "-n 3 -g 0,0,0" presumably starts three ranks that
+# all use GPU 0 -- confirm against the tools' own documentation.
+python "${TIMEOUT_TOOL_PATH}" 30 \
+    rrun -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx
+python "${TIMEOUT_TOOL_PATH}" 30 \
+    rrun --tag-output -n 3 -g 0,0,0 ./bench_comm -m cuda -C ucxx
+
+# topology_discovery is resolved through PATH (no ./ prefix); its output is piped
+# to the validator, which is given "-" as its input argument.
+topology_discovery | python "${VALIDATE_TOPOLOGY_PATH}" -
@@ -56,5 +56,9 @@ rapids-logger "Run example smoketests"
 rapids-logger "Run benchmark smoketests"
 ./run_cpp_benchmark_smoketests.sh
 
+# Ensure tools are runnable
+rapids-logger "Run tools smoketests"
+./run_cpp_tools_smoketests.sh
+
 rapids-logger "Test script exiting with exit code: $EXITCODE"
 exit ${EXITCODE}
@@ -0,0 +1,138 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import argparse
+import json
+import sys
+from typing import Any, List, Optional
+
+
+def load_json_from_path_or_stdin(input_path: str) -> Any:
+    """Parse and return the JSON document designated by ``input_path``.
+
+    Args:
+        input_path: Path of a UTF-8 encoded JSON file, or ``"-"`` to read the
+            document from standard input.
+
+    Returns:
+        The decoded JSON value.
+
+    Raises:
+        RuntimeError: If reading standard input fails.
+        ValueError: If standard input is empty/blank, or the content is not
+            valid JSON.
+        FileNotFoundError: If ``input_path`` does not exist.
+    """
+    if input_path == "-":
+        try:
+            content = sys.stdin.read()
+        except Exception as exc:
+            raise RuntimeError(f"failed reading stdin: {exc}") from exc
+        if not content.strip():
+            # The command line takes the path as a positional argument (there is
+            # no --input option), so the hint must not advertise one.
+            raise ValueError("no input provided on stdin; pass a file path or pipe JSON")
+        try:
+            return json.loads(content)
+        except json.JSONDecodeError as exc:
+            raise ValueError(f"invalid JSON from stdin: {exc}") from exc
+
+    try:
+        with open(input_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except FileNotFoundError as exc:
+        raise FileNotFoundError(f"input file not found: {input_path}") from exc
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"invalid JSON in file {input_path}: {exc}") from exc
+
+
+def ensure_non_empty_string(errors: List[str], obj: Any, key: str, context: str) -> None:
+    """Append an error to ``errors`` unless ``obj[key]`` is a non-blank string.
+
+    Args:
+        errors: List that collects validation messages; mutated in place.
+        obj: Mapping queried with ``obj.get(key)``.
+        key: Name of the field to check.
+        context: Dotted prefix used in the error message.
+    """
+    candidate = obj.get(key)
+    if isinstance(candidate, str) and candidate.strip():
+        return
+    errors.append(f"{context}.{key} must be a non-empty string")
+
+
+def ensure_int(
+    errors: List[str], obj: Any, key: str, context: str, *, min_value: Optional[int] = None
+) -> None:
+    """Append an error to ``errors`` unless ``obj[key]`` is an acceptable integer.
+
+    Args:
+        errors: List that collects validation messages; mutated in place.
+        obj: Mapping queried with ``obj.get(key)``.
+        key: Name of the field to check.
+        context: Dotted prefix used in the error message.
+        min_value: Optional inclusive lower bound; ``None`` disables the check.
+    """
+    value = obj.get(key)
+    # bool is a subclass of int, so JSON true/false would otherwise be accepted
+    # as 1/0; reject booleans explicitly.
+    if isinstance(value, bool) or not isinstance(value, int):
+        errors.append(f"{context}.{key} must be an integer")
+        return
+    if min_value is not None and value < min_value:
+        errors.append(f"{context}.{key} must be >= {min_value} (got {value})")
+
+
+def ensure_non_empty_list(errors: List[str], obj: Any, key: str, context: str) -> None:
+    """Append an error to ``errors`` unless ``obj[key]`` is a list with at least one item.
+
+    Args:
+        errors: List that collects validation messages; mutated in place.
+        obj: Mapping queried with ``obj.get(key)``.
+        key: Name of the field to check.
+        context: Dotted prefix used in the error message.
+    """
+    candidate = obj.get(key)
+    if isinstance(candidate, list) and candidate:
+        return
+    errors.append(f"{context}.{key} must be a non-empty list")
+
+
+def validate_topology(data: Any) -> List[str]:
+    """Check the structure of a decoded ``topology_discovery`` JSON document.
+
+    The ``system`` object and every entry of the ``gpus`` array are checked for
+    the presence and basic type of their fields, and GPU ids must be unique.
+
+    Args:
+        data: Decoded JSON value to validate.
+
+    Returns:
+        A list of human-readable error messages; empty when ``data`` is valid.
+    """
+    if not isinstance(data, dict):
+        return ["top-level JSON must be an object"]
+
+    errors: List[str] = []
+
+    # System section
+    system = data.get("system")
+    if not isinstance(system, dict):
+        errors.append("system must be an object")
+    else:
+        ensure_non_empty_string(errors, system, "hostname", "system")
+        ensure_int(errors, system, "num_gpus", "system", min_value=1)
+        ensure_int(errors, system, "num_numa_nodes", "system", min_value=1)
+        ensure_int(errors, system, "num_network_devices", "system", min_value=0)
+
+    # GPUs section
+    gpus = data.get("gpus")
+    if not isinstance(gpus, list) or len(gpus) == 0:
+        errors.append("gpus must be a non-empty array (at least one GPU must be present)")
+        return errors
+
+    seen_ids = set()
+    duplicate_ids = set()
+    for index, gpu in enumerate(gpus):
+        ctx = f"gpus[{index}]"
+        if not isinstance(gpu, dict):
+            errors.append(f"{ctx} must be an object")
+            continue
+
+        ensure_int(errors, gpu, "id", ctx)
+        gpu_id = gpu.get("id")
+        # Only genuine integers take part in the uniqueness check; bool is a
+        # subclass of int and True == 1, which would fake a duplicate.
+        if isinstance(gpu_id, int) and not isinstance(gpu_id, bool):
+            if gpu_id in seen_ids:
+                duplicate_ids.add(gpu_id)
+            else:
+                seen_ids.add(gpu_id)
+        ensure_non_empty_string(errors, gpu, "name", ctx)
+        ensure_non_empty_string(errors, gpu, "pci_bus_id", ctx)
+        ensure_non_empty_string(errors, gpu, "uuid", ctx)
+        ensure_int(errors, gpu, "numa_node", ctx)
+
+        cpu_affinity = gpu.get("cpu_affinity")
+        if not isinstance(cpu_affinity, dict):
+            errors.append(f"{ctx}.cpu_affinity must be an object")
+        else:
+            ensure_non_empty_string(errors, cpu_affinity, "cpulist", f"{ctx}.cpu_affinity")
+            ensure_non_empty_list(errors, cpu_affinity, "cores", f"{ctx}.cpu_affinity")
+
+    # GPU id uniqueness check (reported once, after the per-GPU errors)
+    if duplicate_ids:
+        dup_list = ", ".join(str(x) for x in sorted(duplicate_ids))
+        errors.append(f"gpus ids must be unique; duplicates found: {dup_list}")
+
+    # We don't validate several fields and their contents because the virtualized CI
+    # environment doesn't have complete topology information.
+
+    return errors
+
+
+def main() -> int:
+    """Command-line entry point: load the JSON input and report validation errors.
+
+    Returns:
+        0 when the document is valid, 1 when validation errors were printed to
+        stderr, and 2 when the input could not be loaded.
+    """
+    parser = argparse.ArgumentParser(description="Validate topology_discovery JSON output")
+    parser.add_argument(
+        "input",
+        help="Path to JSON file to validate; pass '-' to read from stdin",
+    )
+    input_path = parser.parse_args().input
+
+    try:
+        document = load_json_from_path_or_stdin(input_path)
+    except Exception as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+
+    problems = validate_topology(document)
+    for problem in problems:
+        print(f"ERROR: {problem}", file=sys.stderr)
+    return 1 if problems else 0
+
+
+# Propagate main()'s return value as the process exit status when run as a script.
+if __name__ == "__main__":
+    sys.exit(main())
@@ -14,6 +14,7 @@ dependencies:
 - cuda-cudart-dev
 - cuda-cupti-dev
 - cuda-nvcc
+- cuda-nvml-dev
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
 - cuda-sanitizer-api

@@ -14,6 +14,7 @@ dependencies:
 - cuda-cudart-dev
 - cuda-cupti-dev
 - cuda-nvcc
+- cuda-nvml-dev
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
 - cuda-sanitizer-api

@@ -14,6 +14,7 @@ dependencies:
 - cuda-cudart-dev
 - cuda-cupti-dev
 - cuda-nvcc
+- cuda-nvml-dev
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
 - cuda-sanitizer-api

@@ -14,6 +14,7 @@ dependencies:
 - cuda-cudart-dev
 - cuda-cupti-dev
 - cuda-nvcc
+- cuda-nvml-dev
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
 - cuda-sanitizer-api

@@ -52,6 +52,7 @@ cache:
       - ${{ compiler("cxx") }}
       - ${{ compiler("cuda") }} =${{ cuda_version }}
       - cuda-cupti-dev
+      - cuda-nvml-dev
       - cuda-version =${{ cuda_version }}
       - cmake ${{ cmake_version }}
       - ninja
@@ -60,6 +61,7 @@ cache:
       - cuda-version =${{ cuda_version }}
       - cuda-cudart-dev
       - cuda-cupti-dev
+      - cuda-nvml-dev
       - librmm =${{ minor_version }}
       - libcudf =${{ minor_version }}
       - openmpi >=5.0  # See <https://github.com/rapidsai/rapidsmpf/issues/17>
@@ -77,6 +79,8 @@ outputs:
           cmake --install cpp/build --component=tools
       dynamic_linking:
         overlinking_behavior: "error"
+        missing_dso_allowlist:
+          - "libnvidia-ml.so.1"
       prefix_detection:
         ignore:
           # See https://github.com/rapidsai/build-planning/issues/160
@@ -86,11 +90,13 @@ outputs:
       build:
         - cmake ${{ cmake_version }}
         - cuda-cupti-dev
+        - cuda-nvml-dev
         - ${{ stdlib("c") }}
       host:
         - cuda-version =${{ cuda_version }}
         - cuda-cudart-dev
         - cuda-cupti-dev
+        - cuda-nvml-dev
         - libcudf =${{ minor_version }}
         - openmpi >=5.0
         - ucxx ${{ ucxx_version }}
@@ -131,10 +137,12 @@ outputs:
       build:
         - cmake ${{ cmake_version }}
         - cuda-cupti-dev
+        - cuda-nvml-dev
         - ${{ stdlib("c") }}
       host:
         - cuda-cudart-dev
         - cuda-cupti-dev
+        - cuda-nvml-dev
         - cuda-version =${{ cuda_version }}
         - libcudf =${{ minor_version }}
         - librmm =${{ minor_version }}

@@ -177,6 +177,7 @@ add_library(
   src/shuffler/postbox.cpp
   src/shuffler/shuffler.cpp
   src/statistics.cpp
+  src/topology_discovery.cpp
   src/utils.cpp
 )
 if(RAPIDSMPF_HAVE_STREAMING)
@@ -234,6 +235,33 @@ target_include_directories(
   INTERFACE "$<INSTALL_INTERFACE:include>"
 )
 
+# Locate the NVML header (nvml.h). The regular search (CUDA toolkit include directories plus the
+# default search paths) is tried first. When it fails, e.g. for wheel builds where the header is
+# installed inside Python site-packages, fall back to scanning $ENV{SITE_PACKAGES}/nvidia/*/include.
+find_path(
+  NVML_INCLUDE_DIR
+  NAMES nvml.h
+  HINTS ${CUDAToolkit_INCLUDE_DIRS}
+  PATH_SUFFIXES include
+)
+
+# Use environment-provided site-packages if available (set by CI/build scripts). The first
+# candidate directory that actually contains nvml.h wins.
+if(NOT NVML_INCLUDE_DIR AND DEFINED ENV{SITE_PACKAGES})
+  file(GLOB _nvml_sites "$ENV{SITE_PACKAGES}/nvidia/*/include")
+  foreach(_cand IN LISTS _nvml_sites)
+    if(EXISTS "${_cand}/nvml.h")
+      set(NVML_INCLUDE_DIR "${_cand}")
+      break()
+    endif()
+  endforeach()
+endif()
+
+if(NVML_INCLUDE_DIR)
+  message(STATUS "Found NVML headers at: ${NVML_INCLUDE_DIR}")
+  # Quoted so that a path containing spaces or semicolons stays a single include directory.
+  target_include_directories(rapidsmpf PRIVATE "${NVML_INCLUDE_DIR}")
+else()
+  # No include directory is added in this case; make that visible in the configure log instead
+  # of failing silently until nvml.h cannot be found at compile time.
+  message(STATUS "NVML headers not found; relying on the compiler's default include paths")
+endif()
+
 target_link_libraries(
   rapidsmpf
   PUBLIC rmm::rmm cudf::cudf CCCL::CCCL $<TARGET_NAME_IF_EXISTS:ucxx::ucxx>
@@ -244,6 +272,7 @@ target_link_libraries(
           $<TARGET_NAME_IF_EXISTS:conda_env>
           maybe_asan
           $<TARGET_NAME_IF_EXISTS:CCCL::cudax>
+          ${CMAKE_DL_LIBS}
 )
 
 target_compile_definitions(
@@ -335,6 +364,11 @@ endif()
 # -------------------------------------------------------------------------------------
 add_subdirectory(tools)
 
+# Ensure NVML include directory is also available to the topology_discovery tool. The path is
+# quoted so a directory containing spaces or semicolons is passed as a single entry.
+if(NVML_INCLUDE_DIR AND TARGET topology_discovery)
+  target_include_directories(topology_discovery PRIVATE "${NVML_INCLUDE_DIR}")
+endif()
+
 # ##################################################################################################
 # * install targets -------------------------------------------------------------------------------
 rapids_cmake_install_lib_dir(lib_dir)