rapidsai · rapids-bot · Nov 14, 2025 · Oct 29, 2025 · Oct 29, 2025 · Oct 30, 2025
@@ -37,6 +37,13 @@ rapids_cmake_write_version_file(include/rapidsmpf/version_config.hpp)
 # Set a default build type if none was specified
 rapids_cmake_build_type(Release)
 
+# Default RAPIDSMPF_DEBUG to ON for Debug builds; compare case-insensitively, as CMake itself does.
+string(TOUPPER "${CMAKE_BUILD_TYPE}" RAPIDSMPF_BUILD_TYPE_UPPER)
+set(RAPIDSMPF_DEBUG_DEFAULT OFF)
+if("${RAPIDSMPF_BUILD_TYPE_UPPER}" STREQUAL "DEBUG")
+  set(RAPIDSMPF_DEBUG_DEFAULT ON)
+endif()
+
 # ##################################################################################################
 # * build options ---------------------------------------------------------------------------------
 
@@ -53,6 +60,7 @@ option(BUILD_SHARED_LIBS "Build RapidsMPF shared library" ON)
 option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
 option(RAPIDSMPF_CLANG_TIDY "Enable clang-tidy during compilation" OFF)
 option(RAPIDSMPF_ASAN "Enable AddressSanitizer" OFF)
+option(RAPIDSMPF_DEBUG "Enable debug mode" ${RAPIDSMPF_DEBUG_DEFAULT})
 
 message(STATUS "librapidsmpf build options:")
 message(STATUS "  BUILD_MPI_SUPPORT       : ${BUILD_MPI_SUPPORT}")
@@ -67,6 +75,7 @@ message(STATUS "  BUILD_SHARED_LIBS       : ${BUILD_SHARED_LIBS}")
 message(STATUS "  CUDA_STATIC_RUNTIME     : ${CUDA_STATIC_RUNTIME}")
 message(STATUS "  RAPIDSMPF_CLANG_TIDY    : ${RAPIDSMPF_CLANG_TIDY}")
 message(STATUS "  RAPIDSMPF_ASAN          : ${RAPIDSMPF_ASAN}")
+message(STATUS "  RAPIDSMPF_DEBUG         : ${RAPIDSMPF_DEBUG}")
 
 # Copy options to our prefix to prevent upstream projects from modifying them.
 set(RAPIDSMPF_HAVE_MPI ${BUILD_MPI_SUPPORT})
@@ -245,6 +254,7 @@ target_compile_definitions(
          $<$<BOOL:${RAPIDSMPF_HAVE_STREAMING}>:RAPIDSMPF_HAVE_STREAMING>
          $<$<BOOL:${RAPIDSMPF_HAVE_CUPTI}>:RAPIDSMPF_HAVE_CUPTI>
          $<$<BOOL:${RAPIDSMPF_HAVE_NUMA}>:RAPIDSMPF_HAVE_NUMA>
+         $<$<BOOL:${RAPIDSMPF_DEBUG}>:RAPIDSMPF_DEBUG>
 )
 
 rapids_cuda_set_runtime(rapidsmpf USE_STATIC ${CUDA_STATIC_RUNTIME})

@@ -171,7 +171,9 @@ class Shuffler::Progress {
      * @return The progress state of the shuffler.
      */
     ProgressThread::ProgressState operator()() {
+#if RAPIDSMPF_DEBUG
         RAPIDSMPF_NVTX_SCOPED_RANGE("Shuffler.Progress", p_iters++);
+#endif
         auto const t0_event_loop = Clock::now();
 
         // Tags for each stage of the shuffle
@@ -186,7 +188,9 @@ class Shuffler::Progress {
         {
             auto const t0_send_metadata = Clock::now();
             auto ready_chunks = shuffler_.outgoing_postbox_.extract_all_ready();
+#if RAPIDSMPF_DEBUG
             RAPIDSMPF_NVTX_SCOPED_RANGE("meta_send", ready_chunks.size());
+#endif
             for (auto&& chunk : ready_chunks) {
                 // All messages in the chunk maps to the same key (checked by the PostBox)
                 // thus we can use the partition ID of the first message in the chunk to
@@ -226,8 +230,10 @@ class Shuffler::Progress {
         // `incoming_chunks_`.
         {
             auto const t0_metadata_recv = Clock::now();
+#if RAPIDSMPF_DEBUG
             RAPIDSMPF_NVTX_SCOPED_RANGE("meta_recv");
             int i = 0;
+#endif
             while (true) {
                 auto const [msg, src] = shuffler_.comm_->recv_any(metadata_tag);
                 if (msg) {
@@ -245,17 +251,23 @@ class Shuffler::Progress {
                 } else {
                     break;
                 }
+#if RAPIDSMPF_DEBUG
                 i++;
+#endif
             }
             stats.add_duration_stat(
                 "event-loop-metadata-recv", Clock::now() - t0_metadata_recv
             );
+#if RAPIDSMPF_DEBUG
             RAPIDSMPF_NVTX_MARKER("meta_recv_iters", i);
+#endif
         }
 
         // Post receives for incoming chunks
         {
+#if RAPIDSMPF_DEBUG
             RAPIDSMPF_NVTX_SCOPED_RANGE("post_chunk_recv", incoming_chunks_.size());
+#endif
             auto const t0_post_incoming_chunk_recv = Clock::now();
             for (auto it = incoming_chunks_.begin(); it != incoming_chunks_.end();) {
                 auto& [src, chunk] = *it;
@@ -342,6 +354,7 @@ class Shuffler::Progress {
         // requested data.
         {
             auto const t0_init_gpu_data_send = Clock::now();
+#if RAPIDSMPF_DEBUG
             RAPIDSMPF_NVTX_SCOPED_RANGE(
                 "init_gpu_send",
                 std::transform_reduce(
@@ -352,6 +365,7 @@ class Shuffler::Progress {
                     [](auto& kv) { return kv.second.size(); }
                 )
             );
+#endif
             // ready_ack_receives_ are separated by rank so that we
             // can guarantee that we don't match messages out of order
             // when using the UCXX communicator. See comment in
@@ -379,7 +393,9 @@ class Shuffler::Progress {
         // Check if any data in transit is finished.
         {
             auto const t0_check_future_finish = Clock::now();
+#if RAPIDSMPF_DEBUG
             RAPIDSMPF_NVTX_SCOPED_RANGE("check_fut_finish", in_transit_futures_.size());
+#endif
             if (!in_transit_futures_.empty()) {
                 std::vector<ChunkID> finished =
                     shuffler_.comm_->test_some(in_transit_futures_);
@@ -439,7 +455,9 @@ class Shuffler::Progress {
     std::unordered_map<Rank, std::vector<std::unique_ptr<Communicator::Future>>>
         ready_ack_receives_;  ///< Receives matching ready for data messages.
 
+#if RAPIDSMPF_DEBUG
     int64_t p_iters = 0;  ///< Number of progress iterations (for NVTX)
+#endif
 };
 
 std::vector<PartID> Shuffler::local_partitions(