From bf59e3faeada072f8b60c08d52deeb1a5e7082dd Mon Sep 17 00:00:00 2001 From: divya2108 Date: Tue, 6 Aug 2024 15:12:21 +0530 Subject: [PATCH 1/7] Added SVE implementation to improve the performance on ARM architecture --- CMakeLists.txt | 45 ++++++++++++++++++++++++++++++++ src/common/hist_util.cc | 58 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 96 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0abe69821d14..0828a972b7b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -270,6 +270,51 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "OS400") set(CMAKE_CXX_ARCHIVE_CREATE " -X64 qc ") endif() +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + include(CheckCSourceCompiles) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+sve") + check_c_source_compiles(" + #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) + #include + int main() { + svfloat64_t a; + a = svdup_n_f64(0); + return 0; + } + #endif + " COMPILER_HAS_ARM_SVE) + + if(COMPILER_HAS_ARM_SVE) + message(STATUS "ARM SVE compiler support detected") + set(SOURCE_CODE " + #include + int main() { + int ret = prctl(PR_SVE_GET_VL); + return ret >= 0 ? 0 : 1; + } + ") + file(WRITE ${CMAKE_BINARY_DIR}/check_sve_support.c "${SOURCE_CODE}") + try_run(RUN_RESULT COMPILE_RESULT + ${CMAKE_BINARY_DIR}/check_sve_support_output + ${CMAKE_BINARY_DIR}/check_sve_support.c + ) + + if(RUN_RESULT EQUAL 0) + message(STATUS "ARM SVE hardware support detected") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+sve") + string(APPEND CMAKE_CXX_FLAGS " -DSVE_SUPPORT_DETECTED") + else() + message(STATUS "ARM SVE hardware support not detected") + endif() + else() + message(STATUS "ARM SVE compiler support not detected") + endif() + + set(CMAKE_C_FLAGS "${ORIGINAL_CMAKE_C_FLAGS}") +else() + message(STATUS "Not an aarch64 architecture") +endif() + if(USE_NCCL) find_package(Nccl REQUIRED) endif() diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index dfd80cb68c13..a01bd675f4d4 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -1,5 +1,6 @@ /** * Copyright 2017-2023 by XGBoost Contributors + * Copyright 2024 FUJITSU LIMITED * \file hist_util.cc */ #include "hist_util.h" @@ -15,6 +16,10 @@ #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for SparsePage, SortedCSCPage +#if defined(SVE_SUPPORT_DETECTED) +#include // to leverage sve intrinsics +#endif + #if defined(XGBOOST_MM_PREFETCH_PRESENT) #include #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) @@ -252,13 +257,52 @@ void RowsWiseBuildHistKernel(Span gpair, Span(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j])); - auto hist_local = hist_data + idx_bin; - *(hist_local) += pgh_t[0]; - *(hist_local + 1) += pgh_t[1]; - } + #if defined(SVE_SUPPORT_DETECTED) + svfloat64_t pgh_t0_vec = svdup_n_f64(pgh_t[0]); + svfloat64_t pgh_t1_vec = svdup_n_f64(pgh_t[1]); + + for (size_t j = 0; j < row_size; j+=svcntw()) { + svbool_t pg32 = svwhilelt_b32(j, row_size); + svbool_t pg64 = svwhilelt_b64(j, row_size); + svuint32_t gr_index_vec = + svld1ub_u32(pg32, reinterpret_cast (&gr_index_local[j])); + svuint32_t offsets_vec = svld1(pg32, &offsets[j]); + svuint32_t idx_bin_vec; + if (kAnyMissing) { + idx_bin_vec = svmul_n_u32_x(pg32, gr_index_vec, two); + } else { + svuint32_t temp = svadd_u32_m(pg32, gr_index_vec, offsets_vec); + idx_bin_vec = svmul_n_u32_x(pg32, temp, two); + } + svuint64_t idx_bin_vec0_0 = svunpklo_u64(idx_bin_vec); + svuint64_t idx_bin_vec0_1 = svunpkhi_u64(idx_bin_vec); + svuint64_t idx_bin_vec1_0 = svadd_n_u64_m(pg64, idx_bin_vec0_0, 1); + svuint64_t idx_bin_vec1_1 = svadd_n_u64_m(pg64, idx_bin_vec0_1, 1); + + svfloat64_t hist0_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_0); + svfloat64_t hist0_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_1); + svfloat64_t hist1_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_0); + svfloat64_t hist1_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_1); + + hist0_vec0 = svadd_f64_m(pg64, hist0_vec0, pgh_t0_vec); + hist0_vec1 = svadd_f64_m(pg64, hist0_vec1, pgh_t0_vec); + hist1_vec0 = svadd_f64_m(pg64, hist1_vec0, pgh_t1_vec); + hist1_vec1 = svadd_f64_m(pg64, hist1_vec1, pgh_t1_vec); + + svst1_scatter_index(pg64, hist_data, idx_bin_vec0_0, hist0_vec0); + svst1_scatter_index(pg64, hist_data, idx_bin_vec0_1, hist0_vec1); + svst1_scatter_index(pg64, hist_data, idx_bin_vec1_0, hist1_vec0); + svst1_scatter_index(pg64, hist_data, idx_bin_vec1_1, hist1_vec1); + } + #else + for (size_t j = 0; j < row_size; ++j) { + const uint32_t idx_bin = + two * (static_cast(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j])); + auto hist_local = hist_data + idx_bin; + *(hist_local) += pgh_t[0]; + *(hist_local + 1) += pgh_t[1]; + } + #endif } } From e4d2869f4bdd1bd6e1bd460c92fd212d2f290a1d Mon Sep 17 00:00:00 2001 From: divya2108 Date: Mon, 26 Aug 2024 12:59:54 +0530 Subject: [PATCH 2/7] Addressed review comments - Changed cmake design by extracting the code into cmake/CheckSVEsupport.cmake - Prefixed the flags with XGBOOST_ and used targeted flags - Extracted the SVE code into an inlined function - Added detailed code comments - Modified vector names for better readability --- CMakeLists.txt | 52 ++-------- cmake/CheckSVEsupport.cmake | 56 +++++++++++ src/common/hist_util.cc | 185 +++++++++++++++++++----------------- 3 files changed, 162 insertions(+), 131 deletions(-) create mode 100644 cmake/CheckSVEsupport.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 0828a972b7b8..ec7ba2e3a3d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -270,51 +270,6 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "OS400") set(CMAKE_CXX_ARCHIVE_CREATE " -X64 qc ") endif() -if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - include(CheckCSourceCompiles) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+sve") - check_c_source_compiles(" - #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) - #include - int main() { - svfloat64_t a; - a = svdup_n_f64(0); - return 0; - } - #endif - " COMPILER_HAS_ARM_SVE) - - if(COMPILER_HAS_ARM_SVE) - message(STATUS "ARM SVE compiler support detected") - set(SOURCE_CODE " - #include - int main() { - int ret = prctl(PR_SVE_GET_VL); - return ret >= 0 ? 0 : 1; - } - ") - file(WRITE ${CMAKE_BINARY_DIR}/check_sve_support.c "${SOURCE_CODE}") - try_run(RUN_RESULT COMPILE_RESULT - ${CMAKE_BINARY_DIR}/check_sve_support_output - ${CMAKE_BINARY_DIR}/check_sve_support.c - ) - - if(RUN_RESULT EQUAL 0) - message(STATUS "ARM SVE hardware support detected") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a+sve") - string(APPEND CMAKE_CXX_FLAGS " -DSVE_SUPPORT_DETECTED") - else() - message(STATUS "ARM SVE hardware support not detected") - endif() - else() - message(STATUS "ARM SVE compiler support not detected") - endif() - - set(CMAKE_C_FLAGS "${ORIGINAL_CMAKE_C_FLAGS}") -else() - message(STATUS "Not an aarch64 architecture") -endif() - if(USE_NCCL) find_package(Nccl REQUIRED) endif() @@ -399,6 +354,13 @@ target_include_directories(xgboost $) #-- End shared library +include(${xgboost_SOURCE_DIR}/cmake/CheckSVEsupport.cmake) +check_xgboost_sve_support() +if(XGBOOST_ARM_SVE_HARDWARE_SUPPORT) + target_compile_definitions(objxgboost PUBLIC XGBOOST_SVE_SUPPORT_DETECTED) + target_compile_options(objxgboost PRIVATE ${XGBOOST_SVE_FLAGS}) +endif() + #-- CLI for xgboost if(BUILD_DEPRECATED_CLI) add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc) diff --git a/cmake/CheckSVEsupport.cmake b/cmake/CheckSVEsupport.cmake new file mode 100644 index 000000000000..c844c4c01116 --- /dev/null +++ b/cmake/CheckSVEsupport.cmake @@ -0,0 +1,56 @@ +function(check_xgboost_sve_support) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + include(CheckCSourceCompiles) + + # Save the original C_FLAGS to restore later + set(ORIGINAL_C_FLAGS "${CMAKE_C_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+sve") + + # Check if the compiler supports ARM SVE + check_c_source_compiles(" + #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) + #include + int main() { + svfloat64_t a; + a = svdup_n_f64(0); + return 0; + } + #endif + " XGBOOST_COMPILER_HAS_ARM_SVE) + + if(XGBOOST_COMPILER_HAS_ARM_SVE) + message(STATUS "ARM SVE compiler support detected") + + # Check for hardware support + set(SOURCE_CODE " + #include + int main() { + int ret = prctl(PR_SVE_GET_VL); + return ret >= 0 ? 0 : 1; + } + ") + file(WRITE ${CMAKE_BINARY_DIR}/check_sve_support.c "${SOURCE_CODE}") + try_run(RUN_RESULT COMPILE_RESULT + ${CMAKE_BINARY_DIR}/check_sve_support_output + ${CMAKE_BINARY_DIR}/check_sve_support.c + ) + + if(RUN_RESULT EQUAL 0) + message(STATUS "ARM SVE hardware support detected") + # Apply the SVE flags and definitions specifically to the xgboost target + set(XGBOOST_ARM_SVE_HARDWARE_SUPPORT TRUE PARENT_SCOPE) + set(XGBOOST_SVE_FLAGS "-march=armv8-a+sve" PARENT_SCOPE) + set(XGBOOST_SVE_DEFINITIONS "-DXGBOOST_SVE_SUPPORT_DETECTED" PARENT_SCOPE) + else() + message(STATUS "ARM SVE hardware support not detected") + endif() + else() + message(STATUS "ARM SVE compiler support not detected") + endif() + + # Restore the original C_FLAGS + set(CMAKE_C_FLAGS "${ORIGINAL_C_FLAGS}") +else() + message(STATUS "Not an aarch64 architecture") +endif() +endfunction() \ No newline at end of file diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index a01bd675f4d4..9665e2477cd1 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -16,23 +16,23 @@ #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for SparsePage, SortedCSCPage -#if defined(SVE_SUPPORT_DETECTED) +#ifdef XGBOOST_SVE_SUPPORT_DETECTED #include // to leverage sve intrinsics #endif #if defined(XGBOOST_MM_PREFETCH_PRESENT) - #include - #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) +#include +#define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast(addr), _MM_HINT_T0) #elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT) - #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) +#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast(addr), 0, 3) #else // no SW pre-fetching available; PREFETCH_READ_T0 is no-op - #define PREFETCH_READ_T0(addr) do {} while (0) +#define PREFETCH_READ_T0(addr) \ + do { \ + } while (0) #endif // defined(XGBOOST_MM_PREFETCH_PRESENT) namespace xgboost::common { -HistogramCuts::HistogramCuts() { - cut_ptrs_.HostVector().emplace_back(0); -} +HistogramCuts::HistogramCuts() { cut_ptrs_.HostVector().emplace_back(0); } HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted, Span hessian) { @@ -58,10 +58,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins } container.MakeCuts(ctx, m->Info(), &out); } else { - SortedSketchContainer container{ctx, - max_bins, - m->Info().feature_types.ConstHostSpan(), - reduced, + SortedSketchContainer container{ctx, max_bins, m->Info().feature_types.ConstHostSpan(), reduced, HostSketchContainer::UseGroup(info)}; for (auto const &page : m->GetBatches(ctx)) { container.PushColPage(page, info, hessian); @@ -101,9 +98,9 @@ void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end) { */ void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2, size_t begin, size_t end) { - double* pdst = reinterpret_cast(dst.data()); - const double* psrc1 = reinterpret_cast(src1.data()); - const double* psrc2 = reinterpret_cast(src2.data()); + double *pdst = reinterpret_cast(dst.data()); + const double *psrc1 = reinterpret_cast(src1.data()); + const double *psrc2 = reinterpret_cast(src2.data()); for (size_t i = 2 * begin; i < 2 * end; ++i) { pdst[i] = psrc1[i] - psrc2[i]; @@ -117,13 +114,10 @@ struct Prefetch { private: static constexpr size_t kNoPrefetchSize = - kPrefetchOffset + kCacheLineSize / - sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type); + kPrefetchOffset + kCacheLineSize / sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type); public: - static size_t NoPrefetchSize(size_t rows) { - return std::min(rows, kNoPrefetchSize); - } + static size_t NoPrefetchSize(size_t rows) { return std::min(rows, kNoPrefetchSize); } template static constexpr size_t GetPrefetchStep() { @@ -139,9 +133,7 @@ struct RuntimeFlags { const BinTypeSize bin_type_size; }; -template class GHistBuildingManager { public: @@ -175,7 +167,7 @@ class GHistBuildingManager { * and forward the call there. */ template - static void DispatchAndExecute(const RuntimeFlags& flags, Fn&& fn) { + static void DispatchAndExecute(const RuntimeFlags &flags, Fn &&fn) { if (flags.first_page != kFirstPage) { SetFirstPage::Type::DispatchAndExecute(flags, std::forward(fn)); } else if (flags.read_by_column != kReadByColumn) { @@ -191,6 +183,64 @@ class GHistBuildingManager { } }; +#ifdef XGBOOST_SVE_SUPPORT_DETECTED +template +inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_local, + const std::uint32_t *offsets, double *hist_data, + const float *p_gpair, size_t idx_gh, const uint32_t two, + bool kAnyMissing) { + // Load the gradient and hessian values from p_gpair into SVE vector registers + svfloat64_t grad = svdup_n_f64(p_gpair[idx_gh]); + svfloat64_t hess = svdup_n_f64(p_gpair[idx_gh + 1]); + + for (size_t j = 0; j < row_size; j += svcntw()) { + // Create a predicate (mask) for 32-bit & 64-bit elements, active only for valid elements + svbool_t pg32 = svwhilelt_b32(j, row_size); + svbool_t pg64 = svwhilelt_b64(j, row_size); + + // Load the gradient index values and offsets for the current chunk of the row + svuint32_t gr_index_vec = + svld1ub_u32(pg32, reinterpret_cast(&gr_index_local[j])); + svuint32_t offsets_vec = svld1(pg32, &offsets[j]); + + svuint32_t idx_bin_vec; + if (kAnyMissing) { + idx_bin_vec = svmul_n_u32_x(pg32, gr_index_vec, two); + } else { + svuint32_t temp = svadd_u32_m(pg32, gr_index_vec, offsets_vec); + idx_bin_vec = svmul_n_u32_x(pg32, temp, two); + } + + // Unpack the 32-bit index binary vector into 64-bit vectors from lower and upper half + // respectively + svuint64_t idx_bin_vec0_0 = svunpklo_u64(idx_bin_vec); + svuint64_t idx_bin_vec0_1 = svunpkhi_u64(idx_bin_vec); + + // Increment the indices by 1 for hessian. + svuint64_t idx_bin_vec1_0 = svadd_n_u64_m(pg64, idx_bin_vec0_0, 1); + svuint64_t idx_bin_vec1_1 = svadd_n_u64_m(pg64, idx_bin_vec0_1, 1); + + // Gather the histogram data corresponding to the computed indices + svfloat64_t hist0_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_0); + svfloat64_t hist0_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_1); + svfloat64_t hist1_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_0); + svfloat64_t hist1_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_1); + + // Accumulate the gradient and hessian values into the histogram + hist0_vec0 = svadd_f64_m(pg64, hist0_vec0, grad); + hist0_vec1 = svadd_f64_m(pg64, hist0_vec1, grad); + hist1_vec0 = svadd_f64_m(pg64, hist1_vec0, hess); + hist1_vec1 = svadd_f64_m(pg64, hist1_vec1, hess); + + // Store the updated histogram data back into memory + svst1_scatter_index(pg64, hist_data, idx_bin_vec0_0, hist0_vec0); + svst1_scatter_index(pg64, hist_data, idx_bin_vec0_1, hist0_vec1); + svst1_scatter_index(pg64, hist_data, idx_bin_vec1_0, hist1_vec0); + svst1_scatter_index(pg64, hist_data, idx_bin_vec1_1, hist1_vec1); + } +} +#endif + template void RowsWiseBuildHistKernel(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist) { @@ -230,22 +280,19 @@ void RowsWiseBuildHistKernel(Span gpair, Span gpair, Span (&gr_index_local[j])); - svuint32_t offsets_vec = svld1(pg32, &offsets[j]); - svuint32_t idx_bin_vec; - if (kAnyMissing) { - idx_bin_vec = svmul_n_u32_x(pg32, gr_index_vec, two); - } else { - svuint32_t temp = svadd_u32_m(pg32, gr_index_vec, offsets_vec); - idx_bin_vec = svmul_n_u32_x(pg32, temp, two); - } - svuint64_t idx_bin_vec0_0 = svunpklo_u64(idx_bin_vec); - svuint64_t idx_bin_vec0_1 = svunpkhi_u64(idx_bin_vec); - svuint64_t idx_bin_vec1_0 = svadd_n_u64_m(pg64, idx_bin_vec0_0, 1); - svuint64_t idx_bin_vec1_1 = svadd_n_u64_m(pg64, idx_bin_vec0_1, 1); - - svfloat64_t hist0_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_0); - svfloat64_t hist0_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_1); - svfloat64_t hist1_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_0); - svfloat64_t hist1_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_1); - - hist0_vec0 = svadd_f64_m(pg64, hist0_vec0, pgh_t0_vec); - hist0_vec1 = svadd_f64_m(pg64, hist0_vec1, pgh_t0_vec); - hist1_vec0 = svadd_f64_m(pg64, hist1_vec0, pgh_t1_vec); - hist1_vec1 = svadd_f64_m(pg64, hist1_vec1, pgh_t1_vec); - - svst1_scatter_index(pg64, hist_data, idx_bin_vec0_0, hist0_vec0); - svst1_scatter_index(pg64, hist_data, idx_bin_vec0_1, hist0_vec1); - svst1_scatter_index(pg64, hist_data, idx_bin_vec1_0, hist1_vec0); - svst1_scatter_index(pg64, hist_data, idx_bin_vec1_1, hist1_vec1); - } - #else - for (size_t j = 0; j < row_size; ++j) { - const uint32_t idx_bin = - two * (static_cast(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j])); - auto hist_local = hist_data + idx_bin; - *(hist_local) += pgh_t[0]; - *(hist_local + 1) += pgh_t[1]; - } - #endif + for (size_t j = 0; j < row_size; ++j) { + const uint32_t idx_bin = + two * (static_cast(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j])); + auto hist_local = hist_data + idx_bin; + *(hist_local) += pgh_t[0]; + *(hist_local + 1) += pgh_t[1]; + } +#endif } } @@ -323,7 +336,9 @@ void ColsWiseBuildHistKernel(Span gpair, Span gpair, Span gpair, Span Date: Fri, 6 Sep 2024 14:57:18 +0530 Subject: [PATCH 3/7] Modified the cmake logic Signed-off-by: divya2108 --- CMakeLists.txt | 13 ++++++----- cmake/CheckSVEsupport.cmake | 24 --------------------- src/common/hist_util.cc | 43 ++++++++++++++++++++++++------------- 3 files changed, 34 insertions(+), 46 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ec7ba2e3a3d1..841c2ec34418 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,12 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") endif() endif() +include(${xgboost_SOURCE_DIR}/cmake/CheckSVEsupport.cmake) +check_xgboost_sve_support() +if(XGBOOST_COMPILER_HAS_ARM_SVE) + add_compile_definitions(XGBOOST_SVE_COMPILER_SUPPORT) +endif() + include(${xgboost_SOURCE_DIR}/cmake/PrefetchIntrinsics.cmake) find_prefetch_intrinsics() include(${xgboost_SOURCE_DIR}/cmake/Version.cmake) @@ -354,13 +360,6 @@ target_include_directories(xgboost $) #-- End shared library -include(${xgboost_SOURCE_DIR}/cmake/CheckSVEsupport.cmake) -check_xgboost_sve_support() -if(XGBOOST_ARM_SVE_HARDWARE_SUPPORT) - target_compile_definitions(objxgboost PUBLIC XGBOOST_SVE_SUPPORT_DETECTED) - target_compile_options(objxgboost PRIVATE ${XGBOOST_SVE_FLAGS}) -endif() - #-- CLI for xgboost if(BUILD_DEPRECATED_CLI) add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc) diff --git a/cmake/CheckSVEsupport.cmake b/cmake/CheckSVEsupport.cmake index c844c4c01116..f7d92ad5f679 100644 --- a/cmake/CheckSVEsupport.cmake +++ b/cmake/CheckSVEsupport.cmake @@ -20,30 +20,6 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") if(XGBOOST_COMPILER_HAS_ARM_SVE) message(STATUS "ARM SVE compiler support detected") - - # Check for hardware support - set(SOURCE_CODE " - #include - int main() { - int ret = prctl(PR_SVE_GET_VL); - return ret >= 0 ? 0 : 1; - } - ") - file(WRITE ${CMAKE_BINARY_DIR}/check_sve_support.c "${SOURCE_CODE}") - try_run(RUN_RESULT COMPILE_RESULT - ${CMAKE_BINARY_DIR}/check_sve_support_output - ${CMAKE_BINARY_DIR}/check_sve_support.c - ) - - if(RUN_RESULT EQUAL 0) - message(STATUS "ARM SVE hardware support detected") - # Apply the SVE flags and definitions specifically to the xgboost target - set(XGBOOST_ARM_SVE_HARDWARE_SUPPORT TRUE PARENT_SCOPE) - set(XGBOOST_SVE_FLAGS "-march=armv8-a+sve" PARENT_SCOPE) - set(XGBOOST_SVE_DEFINITIONS "-DXGBOOST_SVE_SUPPORT_DETECTED" PARENT_SCOPE) - else() - message(STATUS "ARM SVE hardware support not detected") - endif() else() message(STATUS "ARM SVE compiler support not detected") endif() diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 9665e2477cd1..6ee073523f9d 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -16,7 +16,11 @@ #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for SparsePage, SortedCSCPage -#ifdef XGBOOST_SVE_SUPPORT_DETECTED +#ifdef __linux__ +#include +#endif + +#ifdef XGBOOST_SVE_COMPILER_SUPPORT #include // to leverage sve intrinsics #endif @@ -183,8 +187,9 @@ class GHistBuildingManager { } }; -#ifdef XGBOOST_SVE_SUPPORT_DETECTED +#ifdef XGBOOST_SVE_COMPILER_SUPPORT template +__attribute__((target("arch=armv8-a+sve"))) inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_local, const std::uint32_t *offsets, double *hist_data, const float *p_gpair, size_t idx_gh, const uint32_t two, @@ -241,6 +246,12 @@ inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_l } #endif +// Returns true if SVE ISA is available on the current CPU +bool check_sve_hw_support() { + int ret = prctl(PR_SVE_GET_VL); + return ret >= 0 ? 1 : 0; +} + template void RowsWiseBuildHistKernel(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist) { @@ -302,20 +313,22 @@ void RowsWiseBuildHistKernel(Span gpair, Span(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j])); - auto hist_local = hist_data + idx_bin; - *(hist_local) += pgh_t[0]; - *(hist_local + 1) += pgh_t[1]; + if (check_sve_hw_support()) { + #ifdef XGBOOST_SVE_COMPILER_SUPPORT + UpdateHistogramWithSVE(row_size, gr_index_local, offsets, hist_data, p_gpair, idx_gh, two, + kAnyMissing); + #endif + } else { + // The trick with pgh_t buffer helps the compiler to generate faster binary. + const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]}; + for (size_t j = 0; j < row_size; ++j) { + const uint32_t idx_bin = + two * (static_cast(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j])); + auto hist_local = hist_data + idx_bin; + *(hist_local) += pgh_t[0]; + *(hist_local + 1) += pgh_t[1]; + } } -#endif } } From 7af1fd64b143d6c9b99f7b5261c342689912ea05 Mon Sep 17 00:00:00 2001 From: divya2108 Date: Thu, 3 Oct 2024 14:24:00 +0530 Subject: [PATCH 4/7] Optimised code design and handled ci test failures --- cmake/CheckSVEsupport.cmake | 2 +- src/common/hist_util.cc | 70 ++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/cmake/CheckSVEsupport.cmake b/cmake/CheckSVEsupport.cmake index f7d92ad5f679..3abc19e6b1b2 100644 --- a/cmake/CheckSVEsupport.cmake +++ b/cmake/CheckSVEsupport.cmake @@ -1,7 +1,7 @@ function(check_xgboost_sve_support) if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") include(CheckCSourceCompiles) - + # Save the original C_FLAGS to restore later set(ORIGINAL_C_FLAGS "${CMAKE_C_FLAGS}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+sve") diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 6ee073523f9d..1986a7e4277f 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -18,6 +18,7 @@ #ifdef __linux__ #include +#define PR_SVE_GET_VL 51 #endif #ifdef XGBOOST_SVE_COMPILER_SUPPORT @@ -201,7 +202,8 @@ inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_l for (size_t j = 0; j < row_size; j += svcntw()) { // Create a predicate (mask) for 32-bit & 64-bit elements, active only for valid elements svbool_t pg32 = svwhilelt_b32(j, row_size); - svbool_t pg64 = svwhilelt_b64(j, row_size); + svbool_t pg64_lower = svwhilelt_b64(j, row_size); + svbool_t pg64_upper = svwhilelt_b64(j+svcntd(), row_size); // Load the gradient index values and offsets for the current chunk of the row svuint32_t gr_index_vec = @@ -216,42 +218,53 @@ inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_l idx_bin_vec = svmul_n_u32_x(pg32, temp, two); } - // Unpack the 32-bit index binary vector into 64-bit vectors from lower and upper half - // respectively + // Unpack 32-bit index binary vector into 64-bit vectors from lower & upper half respectively svuint64_t idx_bin_vec0_0 = svunpklo_u64(idx_bin_vec); svuint64_t idx_bin_vec0_1 = svunpkhi_u64(idx_bin_vec); // Increment the indices by 1 for hessian. - svuint64_t idx_bin_vec1_0 = svadd_n_u64_m(pg64, idx_bin_vec0_0, 1); - svuint64_t idx_bin_vec1_1 = svadd_n_u64_m(pg64, idx_bin_vec0_1, 1); + svuint64_t idx_bin_vec1_0 = svadd_n_u64_m(pg64_lower, idx_bin_vec0_0, 1); + svuint64_t idx_bin_vec1_1 = svadd_n_u64_m(pg64_upper, idx_bin_vec0_1, 1); // Gather the histogram data corresponding to the computed indices - svfloat64_t hist0_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_0); - svfloat64_t hist0_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec0_1); - svfloat64_t hist1_vec0 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_0); - svfloat64_t hist1_vec1 = svld1_gather_index(pg64, hist_data, idx_bin_vec1_1); + svfloat64_t hist0_vec0 = svld1_gather_index(pg64_lower, hist_data, idx_bin_vec0_0); + svfloat64_t hist0_vec1 = svld1_gather_index(pg64_upper, hist_data, idx_bin_vec0_1); + svfloat64_t hist1_vec0 = svld1_gather_index(pg64_lower, hist_data, idx_bin_vec1_0); + svfloat64_t hist1_vec1 = svld1_gather_index(pg64_upper, hist_data, idx_bin_vec1_1); // Accumulate the gradient and hessian values into the histogram - hist0_vec0 = svadd_f64_m(pg64, hist0_vec0, grad); - hist0_vec1 = svadd_f64_m(pg64, hist0_vec1, grad); - hist1_vec0 = svadd_f64_m(pg64, hist1_vec0, hess); - hist1_vec1 = svadd_f64_m(pg64, hist1_vec1, hess); + hist0_vec0 = svadd_f64_m(pg64_lower, hist0_vec0, grad); + hist0_vec1 = svadd_f64_m(pg64_upper, hist0_vec1, grad); + hist1_vec0 = svadd_f64_m(pg64_lower, hist1_vec0, hess); + hist1_vec1 = svadd_f64_m(pg64_upper, hist1_vec1, hess); // Store the updated histogram data back into memory - svst1_scatter_index(pg64, hist_data, idx_bin_vec0_0, hist0_vec0); - svst1_scatter_index(pg64, hist_data, idx_bin_vec0_1, hist0_vec1); - svst1_scatter_index(pg64, hist_data, idx_bin_vec1_0, hist1_vec0); - svst1_scatter_index(pg64, hist_data, idx_bin_vec1_1, hist1_vec1); + svst1_scatter_index(pg64_lower, hist_data, idx_bin_vec0_0, hist0_vec0); + svst1_scatter_index(pg64_upper, hist_data, idx_bin_vec0_1, hist0_vec1); + svst1_scatter_index(pg64_lower, hist_data, idx_bin_vec1_0, hist1_vec0); + svst1_scatter_index(pg64_upper, hist_data, idx_bin_vec1_1, hist1_vec1); } } #endif -// Returns true if SVE ISA is available on the current CPU -bool check_sve_hw_support() { - int ret = prctl(PR_SVE_GET_VL); - return ret >= 0 ? 1 : 0; +// Returns true if SVE ISA is available on the current CPU (with caching) +#ifdef __linux__ +int check_sve_hw_support() { + static int cached_sve_support = -1; + if (cached_sve_support == -1) { + int ret = prctl(PR_SVE_GET_VL); + if (ret == -1) { + cached_sve_support = 0; + } else { + cached_sve_support = 1; + } + } + return cached_sve_support; } +static int sve_enabled = check_sve_hw_support(); +#endif + template void RowsWiseBuildHistKernel(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist) { @@ -289,7 +302,6 @@ void RowsWiseBuildHistKernel(Span gpair, Span gpair, Span gpair, Span Date: Thu, 17 Oct 2024 14:25:27 +0530 Subject: [PATCH 5/7] Resolved unit test failures --- src/common/hist_util.cc | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 1986a7e4277f..a3b46c6eac68 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -189,6 +189,32 @@ class GHistBuildingManager { }; #ifdef XGBOOST_SVE_COMPILER_SUPPORT +template +__attribute__((target("arch=armv8-a+sve"))) +inline svuint32_t load_index_vec(svbool_t pg, BinIdxType *d) { + std::cout << "Missing template for type " << typeid(BinIdxType).name() << std::endl; + assert(0); + return svindex_u32(0, 2); // dummy +} + +template <> +__attribute__((target("arch=armv8-a+sve"))) +inline svuint32_t load_index_vec(svbool_t pg, const uint32_t *d) { + return svld1(pg, d); +} + +template <> +__attribute__((target("arch=armv8-a+sve"))) +inline svuint32_t load_index_vec(svbool_t pg, const uint16_t *d) { + return svld1uh_u32(pg, d); +} + +template <> +__attribute__((target("arch=armv8-a+sve"))) +inline svuint32_t load_index_vec(svbool_t pg, const uint8_t *d) { + return svld1ub_u32(pg, d); +} + template __attribute__((target("arch=armv8-a+sve"))) inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_local, @@ -206,14 +232,12 @@ inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_l svbool_t pg64_upper = svwhilelt_b64(j+svcntd(), row_size); // Load the gradient index values and offsets for the current chunk of the row - svuint32_t gr_index_vec = - svld1ub_u32(pg32, reinterpret_cast(&gr_index_local[j])); - svuint32_t offsets_vec = svld1(pg32, &offsets[j]); - + svuint32_t gr_index_vec = load_index_vec(pg32, &gr_index_local[j]); svuint32_t idx_bin_vec; if (kAnyMissing) { idx_bin_vec = svmul_n_u32_x(pg32, gr_index_vec, two); } else { + svuint32_t offsets_vec = svld1(pg32, &offsets[j]); svuint32_t temp = svadd_u32_m(pg32, gr_index_vec, offsets_vec); idx_bin_vec = svmul_n_u32_x(pg32, temp, two); } @@ -341,7 +365,7 @@ void RowsWiseBuildHistKernel(Span gpair, Span Date: Wed, 11 Dec 2024 15:13:20 +0530 Subject: [PATCH 6/7] Enables SVE at runtime for ARM CPU's having VL >=256 Disables SVE for SVE128 supported hardware and runs the default Neon flow --- src/common/hist_util.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index a3b46c6eac68..d26db5ed8d5a 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -21,6 +21,10 @@ #define PR_SVE_GET_VL 51 #endif +#ifndef PR_SVE_VL_LEN_MASK +#define PR_SVE_VL_LEN_MASK 0xffff +#endif + #ifdef XGBOOST_SVE_COMPILER_SUPPORT #include // to leverage sve intrinsics #endif @@ -286,7 +290,18 @@ int check_sve_hw_support() { return cached_sve_support; } +int check_vector_length() { + int ret = prctl(PR_SVE_GET_VL); + if (ret < 0) { + return 0; + } else { + // Mask out the SVE vector length bits + return (ret & PR_SVE_VL_LEN_MASK) * 8; // bytes * 8 = bit length(vector length) + } +} + static int sve_enabled = check_sve_hw_support(); +static int vector_length = check_vector_length(); #endif template @@ -350,7 +365,7 @@ void RowsWiseBuildHistKernel(Span gpair, Span 128) { UpdateHistogramWithSVE(row_size, gr_index_local, offsets, hist_data, p_gpair, idx_gh, two, kAnyMissing); } else { From a936141f6f8fe4b990a8649cab2d7ee4788919ed Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 17 Dec 2024 22:48:19 +0800 Subject: [PATCH 7/7] CMake target. --- CMakeLists.txt | 3 --- cmake/Utils.cmake | 11 +++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 841c2ec34418..f8c04f25abe0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,9 +40,6 @@ endif() include(${xgboost_SOURCE_DIR}/cmake/CheckSVEsupport.cmake) check_xgboost_sve_support() -if(XGBOOST_COMPILER_HAS_ARM_SVE) - add_compile_definitions(XGBOOST_SVE_COMPILER_SUPPORT) -endif() include(${xgboost_SOURCE_DIR}/cmake/PrefetchIntrinsics.cmake) find_prefetch_intrinsics() diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index ec47bf6eb62a..62c4463e598d 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -214,14 +214,13 @@ macro(xgboost_target_defs target) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_DEBUG_OUTPUT=1) endif() if(XGBOOST_MM_PREFETCH_PRESENT) - target_compile_definitions(${target} - PRIVATE - -DXGBOOST_MM_PREFETCH_PRESENT=1) + target_compile_definitions(${target} PRIVATE -DXGBOOST_MM_PREFETCH_PRESENT=1) endif() if(XGBOOST_BUILTIN_PREFETCH_PRESENT) - target_compile_definitions(${target} - PRIVATE - -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1) + target_compile_definitions(${target} PRIVATE -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1) + endif() + if(XGBOOST_COMPILER_HAS_ARM_SVE) + target_compile_definitions(${target} PRIVATE -DXGBOOST_SVE_COMPILER_SUPPORT=1) endif() if(PLUGIN_RMM)