Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
2f53487
Add topology discovery tool
pentschev Oct 18, 2025
da7304d
Improve network topology discovery to account for PCIe proximity
pentschev Nov 3, 2025
ee789d1
Refactor into separate API and CLI tool
pentschev Nov 3, 2025
5a3cd48
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 3, 2025
4dbb65b
Add NVML dependency
pentschev Nov 3, 2025
e080dfc
Fix CMakeLists.txt linting
pentschev Nov 3, 2025
1c5ad2e
Cleanup
pentschev Nov 4, 2025
73e56be
Use `std::optional` instead of additional `bool`
pentschev Nov 4, 2025
9ff4917
Merge branch 'main' into topology-discovery
pentschev Nov 4, 2025
c671ff2
Fix linting
pentschev Nov 4, 2025
3f8debd
Apply std::optional changes to cpp file
pentschev Nov 4, 2025
e5bb157
Code formatting
pentschev Nov 5, 2025
8ec3d31
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 5, 2025
48aa0a6
Update CMakeLists
pentschev Nov 5, 2025
1c45505
Do not link to nvml
pentschev Nov 5, 2025
9af9451
Test topology discovery
pentschev Nov 5, 2025
4e9bee7
Improve docs
pentschev Nov 5, 2025
859acbb
Link to NVML again and allow missing DSO
pentschev Nov 5, 2025
bc8f9fb
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 5, 2025
1148fee
Enable debug output
pentschev Nov 5, 2025
120f32c
More debugging output
pentschev Nov 5, 2025
c15814f
Disable failure on first error
pentschev Nov 5, 2025
b27c087
Fix disable failure on first error
pentschev Nov 5, 2025
dd8a4b2
Print numa_node contents
pentschev Nov 5, 2025
2fd6577
Revert debug output
pentschev Nov 5, 2025
0683a46
Remove memory binding validation
pentschev Nov 5, 2025
3088770
Fix clang-tidy failures
pentschev Nov 5, 2025
320ec5a
Do not link, dlopen
pentschev Nov 5, 2025
89bc8c7
Fix more clang-tidy failures
pentschev Nov 5, 2025
872dcbb
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 5, 2025
cb96bed
One more clang-tidy error...
pentschev Nov 5, 2025
4f85ad9
Attempt to remove linking to librapidsmpf
pentschev Nov 6, 2025
7f44047
Fix typo
pentschev Nov 6, 2025
f306994
Add CMAKE_DL_LIBS
pentschev Nov 6, 2025
4b7fe9d
Add wheels dependency
pentschev Nov 6, 2025
94d3505
Install nvidia-nvml-dev in devcontainers
pentschev Nov 6, 2025
2950cb8
Attempt to find nvml.h from wheels
pentschev Nov 6, 2025
9b2cb9a
Set SITE_PACKAGES
pentschev Nov 6, 2025
28dc7ee
Make SITE_PACKAGES available for singlecomm build also
pentschev Nov 6, 2025
9eaf2c2
Install NVML packages to venv
pentschev Nov 6, 2025
506769d
Set SITE_PACKAGES in devcontainer
pentschev Nov 6, 2025
cf0c51e
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 6, 2025
a7b1723
Attempt to find nvml.h in CUDAToolkit_INCLUDE_DIRS
pentschev Nov 10, 2025
b9d8be8
Remove NVML install from devcontainers
pentschev Nov 10, 2025
b59afb8
Remove setting NVML_INCLUDE_DIR
pentschev Nov 11, 2025
5310fe5
Merge remote-tracking branch 'upstream/main' into topology-discovery
pentschev Nov 11, 2025
0985a0c
Add pyproject to build-nvml
pentschev Nov 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda/environments/all_cuda-129_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-129_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-130_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
1 change: 1 addition & 0 deletions conda/environments/all_cuda-130_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ dependencies:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvcc
- cuda-nvml-dev
- cuda-nvrtc-dev
- cuda-nvtx-dev
- cuda-sanitizer-api
Expand Down
6 changes: 6 additions & 0 deletions conda/recipes/librapidsmpf/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ cache:
- ${{ compiler("cxx") }}
- ${{ compiler("cuda") }} =${{ cuda_version }}
- cuda-cupti-dev
- cuda-nvml-dev
- cuda-version =${{ cuda_version }}
- cmake ${{ cmake_version }}
- ninja
Expand All @@ -60,6 +61,7 @@ cache:
- cuda-version =${{ cuda_version }}
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvml-dev
- librmm =${{ minor_version }}
- libcudf =${{ minor_version }}
- openmpi >=5.0 # See <https://github.com/rapidsai/rapidsmpf/issues/17>
Expand All @@ -85,11 +87,13 @@ outputs:
build:
- cmake ${{ cmake_version }}
- cuda-cupti-dev
- cuda-nvml-dev
- ${{ stdlib("c") }}
host:
- cuda-version =${{ cuda_version }}
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvml-dev
- libcudf =${{ minor_version }}
- openmpi >=5.0
- ucxx ${{ ucxx_version }}
Expand Down Expand Up @@ -130,10 +134,12 @@ outputs:
build:
- cmake ${{ cmake_version }}
- cuda-cupti-dev
- cuda-nvml-dev
- ${{ stdlib("c") }}
host:
- cuda-cudart-dev
- cuda-cupti-dev
- cuda-nvml-dev
- cuda-version =${{ cuda_version }}
- libcudf =${{ minor_version }}
- librmm =${{ minor_version }}
Expand Down
8 changes: 7 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ add_library(
src/shuffler/postbox.cpp
src/shuffler/shuffler.cpp
src/statistics.cpp
src/topology_discovery.cpp
src/utils.cpp
)
if(RAPIDSMPF_HAVE_STREAMING)
Expand Down Expand Up @@ -230,7 +231,7 @@ target_include_directories(

target_link_libraries(
rapidsmpf
PUBLIC rmm::rmm cudf::cudf CCCL::CCCL $<TARGET_NAME_IF_EXISTS:ucxx::ucxx>
PUBLIC rmm::rmm cudf::cudf CCCL::CCCL CUDA::nvml $<TARGET_NAME_IF_EXISTS:ucxx::ucxx>
$<TARGET_NAME_IF_EXISTS:libcoro>
PRIVATE $<$<BOOL:${RAPIDSMPF_HAVE_NUMA}>:numa>
$<TARGET_NAME_IF_EXISTS:MPI::MPI_C>
Expand Down Expand Up @@ -324,6 +325,11 @@ if(RAPIDSMPF_BUILD_EXAMPLES)
add_subdirectory(examples)
endif()

# ##################################################################################################
# * add tools
# -------------------------------------------------------------------------------------
add_subdirectory(tools)

# ##################################################################################################
# * install targets -------------------------------------------------------------------------------
rapids_cmake_install_lib_dir(lib_dir)
Expand Down
122 changes: 122 additions & 0 deletions cpp/include/rapidsmpf/topology_discovery.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/**
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once

#include <string>
#include <vector>

namespace rapidsmpf {

/**
 * @brief GPU information.
 *
 * Scalar members carry default member initializers so that a default-constructed
 * instance never exposes indeterminate values. `numa_node` defaults to the
 * documented "unknown" sentinel (-1). The type remains an aggregate.
 */
struct GpuTopologyInfo {
    unsigned int id{0};  ///< GPU device ID.
    std::string name;  ///< GPU device name.
    std::string pci_bus_id;  ///< PCI bus ID.
    std::string uuid;  ///< GPU UUID.
    int numa_node{-1};  ///< NUMA node ID (-1 if unknown).
    std::string cpu_affinity_list;  ///< CPU affinity list.
    std::vector<int> cpu_cores;  ///< List of CPU core IDs.
    std::vector<int> memory_binding;  ///< NUMA nodes for memory binding.
    std::vector<std::string>
        network_devices;  ///< Network devices (NICs) optimal for this GPU.
};

/**
 * @brief Network device information.
 *
 * `numa_node` carries a default member initializer so that a default-constructed
 * instance reports the documented "unknown" sentinel (-1) instead of an
 * indeterminate value. The type remains an aggregate.
 */
struct NetworkDeviceInfo {
    std::string name;  ///< Device name (e.g., "mlx5_0").
    int numa_node{-1};  ///< NUMA node ID (-1 if unknown).
    std::string pci_bus_id;  ///< PCI bus ID.
};

/**
 * @brief System topology information.
 *
 * The counters carry default member initializers so that a default-constructed
 * instance describes an empty system (all counts zero, all lists empty) rather
 * than holding indeterminate values. The type remains an aggregate.
 */
struct SystemTopologyInfo {
    std::string hostname;  ///< System hostname.
    unsigned int num_gpus{0};  ///< Total number of GPUs.
    int num_numa_nodes{0};  ///< Total number of NUMA nodes.
    int num_network_devices{0};  ///< Total number of network devices.
    std::vector<GpuTopologyInfo> gpus;  ///< GPU topology information.
    std::vector<NetworkDeviceInfo> network_devices;  ///< Network device information.
};

/**
 * @brief Classification of the PCIe path connecting two devices.
 *
 * The underlying values run from 0 (`PIX`, best) to 4 (`SYS`, worst).
 * Enumerators after `PIX` rely on implicit sequential numbering, so their order
 * in this list defines their values.
 */
enum class PciePathType {
    PIX = 0,  ///< Path crosses at most one PCIe bridge (best).
    PXB,  ///< Path crosses several PCIe bridges.
    PHB,  ///< Path crosses the PCIe Host Bridge.
    NODE,  ///< Path crosses PCIe plus the interconnect inside a NUMA node.
    SYS  ///< Path crosses the NUMA interconnect (worst).
};

/**
 * @brief Discover system topology including GPUs, NUMA nodes, and network devices.
 *
 * This class provides methods to discover system topology information using NVML
 * and /sys filesystem queries. It dynamically identifies GPU-to-NUMA-to-NIC mappings
 * based on PCIe topology.
 *
 * The class follows the Rule of Zero: it declares no destructor or copy/move
 * operations, so the compiler generates all of them. In particular, moving an
 * instance moves the stored topology instead of deep-copying it.
 *
 * Example usage:
 * @code
 * rapidsmpf::TopologyDiscovery discovery;
 * if (discovery.discover()) {
 *     auto const& topology = discovery.get_topology();
 * }
 * @endcode
 */
class TopologyDiscovery {
  public:
    /**
     * @brief Construct a TopologyDiscovery instance.
     *
     * The instance starts out undiscovered with a value-initialized topology.
     */
    TopologyDiscovery() = default;

    /**
     * @brief Discover system topology.
     *
     * This method performs the actual discovery of GPUs, NUMA nodes, CPU affinity,
     * and network devices. It must be called before `get_topology()`.
     *
     * @return true if discovery was successful, false otherwise.
     */
    [[nodiscard]] bool discover();

    /**
     * @brief Get the discovered topology information.
     *
     * @return Reference to the SystemTopologyInfo structure held by this instance;
     * the reference is only valid while this instance is alive.
     * @note `discover()` must be called first. Until then the returned structure is
     * value-initialized (zero scalars, empty strings and lists).
     */
    [[nodiscard]] SystemTopologyInfo const& get_topology() const noexcept {
        return topology_;
    }

    /**
     * @brief Check if topology has been discovered.
     *
     * @return true if `discover()` has been called successfully.
     */
    [[nodiscard]] bool is_discovered() const noexcept {
        return discovered_;
    }

  private:
    // Value-initialized so that reading the topology before `discover()` never
    // observes indeterminate scalar members.
    SystemTopologyInfo topology_{};  ///< Discovered topology information.
    bool discovered_{false};  ///< Flag indicating if topology has been discovered.
};

} // namespace rapidsmpf
Loading
Loading