-
Notifications
You must be signed in to change notification settings - Fork 13.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc++] Optimize ranges::copy for forward_iterator and segmented_iterator #120134
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-libcxx
Author: Peng Liu (winner245)

Changes

General description: This PR is part of a series aimed at significantly improving the performance of `vector<bool>`.

Current PR: This PR enhances the performance of all range-based operations in `vector<bool>`.
Before:
After:
Full diff: https://github.com/llvm/llvm-project/pull/120134.diff 4 Files Affected:
diff --git a/libcxx/include/__algorithm/copy.h b/libcxx/include/__algorithm/copy.h
index 4f30b2050abbaf..f737bc4e98e6d6 100644
--- a/libcxx/include/__algorithm/copy.h
+++ b/libcxx/include/__algorithm/copy.h
@@ -13,6 +13,8 @@
#include <__algorithm/for_each_segment.h>
#include <__algorithm/min.h>
#include <__config>
+#include <__fwd/bit_reference.h>
+#include <__iterator/distance.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__type_traits/common_type.h>
@@ -95,6 +97,54 @@ struct __copy_impl {
}
}
+ template <class _InIter, class _Cp, __enable_if_t<__has_forward_iterator_category<_InIter>::value, int> = 0>
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<_InIter, __bit_iterator<_Cp, false>>
+ operator()(_InIter __first, _InIter __last, __bit_iterator<_Cp, false> __result) {
+ using _It = __bit_iterator<_Cp, false>;
+ using __storage_type = typename _It::__storage_type;
+ __storage_type __n = static_cast<__storage_type>(std::distance(__first, __last));
+ const unsigned __bits_per_word = _It::__bits_per_word;
+
+ if (__n) {
+ // do first partial word, if present
+ if (__result.__ctz_ != 0) {
+ __storage_type __clz = static_cast<__storage_type>(__bits_per_word - __result.__ctz_);
+ __storage_type __dn = std::min(__clz, __n);
+ __storage_type __w = *__result.__seg_;
+ __storage_type __m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
+ __w &= ~__m;
+ for (__storage_type __i = 0; __i < __dn; ++__i, ++__first)
+ __w |= static_cast<__storage_type>(*__first) << __result.__ctz_++;
+ *__result.__seg_ = __w;
+ if (__result.__ctz_ == __bits_per_word) {
+ __result.__ctz_ = 0;
+ ++__result.__seg_;
+ }
+ __n -= __dn;
+ }
+ }
+ // do middle whole words, if present
+ __storage_type __nw = __n / __bits_per_word;
+ __n -= __nw * __bits_per_word;
+ for (; __nw; --__nw) {
+ __storage_type __w = 0;
+ for (__storage_type __i = 0; __i < __bits_per_word; ++__i, ++__first)
+ __w |= static_cast<__storage_type>(*__first) << __i;
+ *__result.__seg_++ = __w;
+ }
+ // do last partial word, if present
+ if (__n) {
+ __storage_type __w = 0;
+ for (__storage_type __i = 0; __i < __n; ++__i, ++__first)
+ __w |= static_cast<__storage_type>(*__first) << __i;
+ __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+ *__result.__seg_ &= ~__m;
+ *__result.__seg_ |= __w;
+ __result.__ctz_ = __n;
+ }
+ return std::make_pair(std::move(__first), std::move(__result));
+ }
+
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 22637d43974123..e8cbb63988ba54 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -10,6 +10,7 @@
#ifndef _LIBCPP___BIT_REFERENCE
#define _LIBCPP___BIT_REFERENCE
+#include <__algorithm/copy.h>
#include <__algorithm/copy_n.h>
#include <__algorithm/fill_n.h>
#include <__algorithm/min.h>
@@ -970,6 +971,8 @@ private:
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend void
__fill_n_bool(__bit_iterator<_Dp, false> __first, typename _Dp::size_type __n);
+ friend struct __copy_impl;
+
template <class _Dp, bool _IC>
_LIBCPP_CONSTEXPR_SINCE_CXX20 friend __bit_iterator<_Dp, false> __copy_aligned(
__bit_iterator<_Dp, _IC> __first, __bit_iterator<_Dp, _IC> __last, __bit_iterator<_Dp, false> __result);
diff --git a/libcxx/test/benchmarks/containers/ContainerBenchmarks.h b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
index 6d21e12896ec9e..123f7bc95d4745 100644
--- a/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
+++ b/libcxx/test/benchmarks/containers/ContainerBenchmarks.h
@@ -51,6 +51,30 @@ void BM_Assignment(benchmark::State& st, Container) {
}
}
+template <class Container, class GenInputs>
+void BM_Assign_IterIter(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ auto beg = in.begin();
+ auto end = in.end();
+ for (auto _ : st) {
+ c.assign(beg, end);
+ DoNotOptimizeData(c);
+ DoNotOptimizeData(in);
+ benchmark::ClobberMemory();
+ }
+}
+
+template <std::size_t... sz, typename Container, typename GenInputs>
+void BM_Assign_Range(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ for (auto _ : st) {
+ c.assign_range(in);
+ DoNotOptimizeData(c);
+ DoNotOptimizeData(in);
+ benchmark::ClobberMemory();
+ }
+}
+
template <std::size_t... sz, typename Container, typename GenInputs>
void BM_AssignInputIterIter(benchmark::State& st, Container c, GenInputs gen) {
auto v = gen(1, sz...);
@@ -108,6 +132,40 @@ void BM_Pushback_no_grow(benchmark::State& state, Container c) {
}
}
+template <class Container, class GenInputs>
+void BM_Insert_Iter_IterIter(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ const auto beg = in.begin();
+ const auto end = in.end();
+ for (auto _ : st) {
+ c.resize(100);
+ c.insert(c.begin() + 50, beg, end);
+ DoNotOptimizeData(c);
+ benchmark::ClobberMemory();
+ }
+}
+
+template <class Container, class GenInputs>
+void BM_Insert_Range(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ for (auto _ : st) {
+ c.resize(100);
+ c.insert_range(c.begin() + 50, in);
+ DoNotOptimizeData(c);
+ benchmark::ClobberMemory();
+ }
+}
+
+template <class Container, class GenInputs>
+void BM_Append_Range(benchmark::State& st, Container c, GenInputs gen) {
+ auto in = gen(st.range(0));
+ for (auto _ : st) {
+ c.append_range(in);
+ DoNotOptimizeData(c);
+ benchmark::ClobberMemory();
+ }
+}
+
template <class Container, class GenInputs>
void BM_InsertValue(benchmark::State& st, Container c, GenInputs gen) {
auto in = gen(st.range(0));
diff --git a/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp b/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp
new file mode 100644
index 00000000000000..2ce10cb6d3d1b6
--- /dev/null
+++ b/libcxx/test/benchmarks/containers/vector_bool_operations.bench.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+#include "ContainerBenchmarks.h"
+#include "../GenerateInput.h"
+
+using namespace ContainerBenchmarks;
+
+BENCHMARK_CAPTURE(BM_ConstructIterIter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_ConstructFromRange, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+
+BENCHMARK_CAPTURE(BM_Assign_IterIter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_Assign_Range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+
+BENCHMARK_CAPTURE(BM_Insert_Iter_IterIter, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)
+ ->Arg(5140480);
+BENCHMARK_CAPTURE(BM_Insert_Range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+BENCHMARK_CAPTURE(BM_Append_Range, vector_bool, std::vector<bool>{}, getRandomIntegerInputs<bool>)->Arg(5140480);
+
+BENCHMARK_MAIN();
\ No newline at end of file
|
b98a2ef
to
4239066
Compare
✅ With the latest revision this PR passed the C/C++ code formatter.
996d1fe
to
81c929a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since you're optimizing an algorithm (and nothing specific to `vector<bool>` itself) we should just benchmark that instead. That's significantly less convoluted. I'd also like to see some additional tests, especially with iterators that don't return a bool. I'm pretty sure your current implementation is completely broken with that. Lastly, I think we should move this into `__copy_impl`, since we might be able to unwrap iterators to `__bit_iterator`s. I don't think we do that currently, but I see no reason we couldn't in the future. It would also be nice to improve `std::move` in the same way (and hopefully share the code).
Thank you for your suggestion.
My original motivation for this series of work was to improve the performance of `vector<bool>`.
Since we are dealing with `vector<bool>` specifically, I agree with you, and this was also what I planned to do next.
05c18a3
to
7bef800
Compare
2f32561
to
948da90
Compare
a4f4910
to
ad33a01
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this patch is promising, but it should be rebased onto the latest `main`.
I would also like to encourage you to think about whether making `__bit_iterator` a segmented iterator would make sense and whether it might unlock various optimizations without needing such special cases just for `__bit_iterator`. This may end up being a misleading suggestion, but I'd like you to investigate it since it could pay off if that works.
libcxx/include/__algorithm/copy.h
Outdated
is_convertible<typename iterator_traits<_InIter>::value_type, bool>::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, __bit_iterator<_Cp, false> >
operator()(_InIter __first, _Sent __last, __bit_iterator<_Cp, false> __result) const {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Suggested change:
- operator()(_InIter __first, _Sent __last, __bit_iterator<_Cp, false> __result) const {
+ operator()(_InIter __first, _Sent __last, __bit_iterator<_Cp, /* IsConst */ false> __result) const {
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for the thought-provoking comment. As suggested, I've implemented segmented iterator inputs for `std::copy`. With `segmented_iterator` input, each input segment reduces to a `forward_iterator`-pair, which is the case this patch optimizes for. As a result, the performance improvements for `forward_iterator`-pair inputs also extend to `segmented_iterator` inputs, yielding a 9x speed-up in both cases. For a more detailed explanation, please refer to my updated PR description.
3dbf418
to
37d52d7
Compare
37d52d7
to
e5c04a1
Compare
e5c04a1
to
50eb099
Compare
50eb099
to
d6119cd
Compare
2876f93
to
f0eb051
Compare
This patch optimizes the performance of `{std, ranges}::copy` when copying from `forward_iterator` or `segmented_iterator` inputs to a `vector<bool>::iterator` output, yielding a performance improvement of up to 9x. The key optimizations include:

- Instead of writing to `vector<bool>` bit by bit using bitwise masks, the optimization first assembles the input data into whole words and then directly copies each whole word to the underlying storage of `vector<bool>`. This word-wise copying approach leads to a 9x performance improvement for `{std, ranges}::copy`.
- For `segmented_iterator` inputs, the optimization subdivides the input data into segments, where each segment reduces to a range representable by a `forward_iterator`-pair. This transforms the problem to the case of `forward_iterator`-pair inputs, resulting in a similar 9.0x speed-up for segmented iterators such as `deque` iterators.

As a byproduct of this work, all the iterator-pair and range-based operations that internally call `std::copy` have also achieved a similar speed-up of up to 9x. The improved `vector<bool>` operations include:

- `vector(std::from_range_t, R&& rg, const Allocator& alloc)`
- `assign_range(R&& rg)`
- `insert_range(const_iterator pos, R&& rg)`
- `append_range(R&& rg)`
- `vector(InputIt first, InputIt last, const Allocator& alloc)`
- `assign(InputIt first, InputIt last)`
- `insert(const_iterator pos, InputIt first, InputIt last)`
Benchmarks

Comprehensive benchmarks have been provided to seamlessly integrate into the recently enhanced benchmark framework developed by @ldionne. These results demonstrate the substantial performance improvements for both of the following:

- `{std, ranges}::copy`
- `vector<bool>` operations