Skip to content

Commit 9415686

Browse files
committed
Avoid heap allocation for function calls with a small number of arguments
We don't have access to llvm::SmallVector or similar, but given the limited subset of the `std::vector` API that `function_call::args{,_convert}` need and the "reserve-then-fill" usage pattern, it is relatively straightforward to implement custom containers that get the job done. This seems to improve the time to call the collatz function in pybind/pybind11_benchmark significantly; the numbers are a little noisy, but there is a clear improvement from about 60 ns per call to about 45 ns per call on my machine (M4 Max Mac), as measured with `timeit.repeat('collatz(4)', 'from pybind11_benchmark import collatz')`.
1 parent bf2d56e commit 9415686

File tree

8 files changed

+505
-5
lines changed

8 files changed

+505
-5
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
180180
endif()
181181

182182
set(PYBIND11_HEADERS
183+
include/pybind11/detail/argument_vector.h
183184
include/pybind11/detail/class.h
184185
include/pybind11/detail/common.h
185186
include/pybind11/detail/cpp_conduit.h

include/pybind11/cast.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#pragma once
1212

13+
#include "detail/argument_vector.h"
1314
#include "detail/common.h"
1415
#include "detail/descr.h"
1516
#include "detail/native_enum_data.h"
@@ -2045,10 +2046,12 @@ struct function_call {
20452046
const function_record &func;
20462047

20472048
/// Arguments passed to the function:
2048-
std::vector<handle> args;
2049+
/// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines
2050+
/// (16 pointers) in size.)
2051+
argument_vector<5> args;
20492052

20502053
/// The `convert` value the arguments should be loaded with
2051-
std::vector<bool> args_convert;
2054+
args_convert_vector<5> args_convert;
20522055

20532056
/// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
20542057
/// present, are also in `args` but without a reference).
Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
/*
2+
pybind11/detail/argument_vector.h: small_vector-like containers to
3+
avoid heap allocation of arguments during function call dispatch.
4+
5+
Copyright (c) Meta Platforms, Inc. and affiliates.
6+
7+
All rights reserved. Use of this source code is governed by a
8+
BSD-style license that can be found in the LICENSE file.
9+
*/
10+
11+
#pragma once
12+
13+
#include <pybind11/pytypes.h>
14+
15+
#include "common.h"
16+
17+
#include <algorithm>
18+
#include <array>
19+
#include <cstdint>
20+
#include <cstring>
21+
#include <iterator>
22+
#include <utility>
23+
#include <vector>
24+
25+
PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
26+
27+
PYBIND11_WARNING_DISABLE_MSVC(4127)
28+
29+
PYBIND11_NAMESPACE_BEGIN(detail)
30+
31+
// Shared implementation utility for our small_vector-like containers.
32+
// We support C++11 and C++14, so we cannot use
33+
// std::variant. Union with the tag packed next to the inline
34+
// array's size is smaller anyway, allowing 1 extra handle of
35+
// inline storage for free. Compare the layouts (1 line per
36+
// size_t/void*):
37+
// With variant, total is N + 2 for N >= 2:
38+
// - variant tag (cannot be packed with the array size)
39+
// - array size (or first pointer of 3 in std::vector)
40+
// - N pointers of inline storage (or 2 remaining pointers of std::vector)
41+
// Custom union, total is N + 1 for N >= 3:
42+
// - variant tag & array size if applicable
43+
// - N pointers of inline storage (or 3 pointers of std::vector)
44+
//
45+
// NOTE: this is a low-level representational convenience; the two
46+
// use cases of this union are materially different and in particular
47+
// have different semantics for inline_array::size. All that is being
48+
// shared is the memory management behavior.
49+
template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
union inline_array_or_vector {
    // Inline representation: a fixed block of InlineSize elements plus a
    // count. What `size` counts is up to the owning container.
    struct inline_array {
        bool is_inline = true; // discriminator; must remain the first member
        std::uint32_t size = 0;
        std::array<ArrayT, InlineSize> arr;
    };

    // Heap representation: an ordinary std::vector behind the same leading
    // discriminator.
    struct heap_vector {
        bool is_inline = false; // discriminator; must remain the first member
        std::vector<VectorT> vec;

        heap_vector() = default;
        heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
    };

    inline_array array;
    heap_vector vector;

    // The inline representation is moved with memcpy and is never
    // destroyed explicitly, so its element type has to tolerate both.
    static_assert(std::is_trivially_move_constructible<ArrayT>::value,
                  "ArrayT must be trivially move constructible");
    static_assert(std::is_trivially_destructible<ArrayT>::value,
                  "ArrayT must be trivially destructible");

    // A fresh object always starts out in the (empty) inline representation.
    inline_array_or_vector() : array() {}

    // Move-only.
    inline_array_or_vector(const inline_array_or_vector &) = delete;
    inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;

    // Adopt whichever representation rhs currently holds. An inline rhs is
    // copied bytewise and left untouched; a heap rhs keeps its (moved-from)
    // vector alive so that its own destructor still runs correctly.
    inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
        if (!rhs.is_inline()) {
            new (&vector) heap_vector(std::move(rhs.vector));
        } else {
            std::memcpy(&array, &rhs.array, sizeof(array));
        }
        assert(is_inline() == rhs.is_inline());
    }

    inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
        if (this != &rhs) {
            const bool this_is_inline = is_inline();
            if (!rhs.is_inline()) {
                if (this_is_inline) {
                    // No live vector on this side yet: construct one in place.
                    new (&vector) heap_vector(std::move(rhs.vector));
                } else {
                    // Both sides already hold a vector: plain move assignment.
                    vector = std::move(rhs.vector);
                }
            } else {
                if (!this_is_inline) {
                    // Release our heap storage before the bytes are overwritten.
                    vector.~heap_vector();
                }
                std::memcpy(&array, &rhs.array, sizeof(array));
            }
        }
        return *this;
    }

    // Only the heap representation owns anything that needs tearing down.
    ~inline_array_or_vector() {
        if (is_inline()) {
            return;
        }
        vector.~heap_vector();
    }

    // Reports which representation is currently live.
    bool is_inline() const {
        // It is undefined behavior to access the inactive member of a
        // union directly. However, it is well-defined to examine the
        // object's storage as an array of bytes, and both representations
        // start with their `bool is_inline` discriminator. See
        // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
        bool tag = false;
        std::memcpy(&tag, static_cast<const void *>(this), sizeof(tag));
        return tag;
    }
};
121+
122+
// small_vector-like container to avoid heap allocation for N or fewer
123+
// arguments.
124+
template <std::size_t N>
125+
struct argument_vector {
126+
public:
127+
argument_vector() = default;
128+
129+
argument_vector(const argument_vector &) = delete;
130+
argument_vector &operator=(const argument_vector &) = delete;
131+
argument_vector(argument_vector &&) noexcept = default;
132+
argument_vector &operator=(argument_vector &&) noexcept = default;
133+
134+
std::size_t size() const {
135+
if (is_inline()) {
136+
return m_repr.array.size;
137+
} else {
138+
return m_repr.vector.vec.size();
139+
}
140+
}
141+
142+
handle &operator[](std::size_t idx) {
143+
assert(idx < size());
144+
if (is_inline()) {
145+
return m_repr.array.arr[idx];
146+
} else {
147+
return m_repr.vector.vec[idx];
148+
}
149+
}
150+
151+
handle operator[](std::size_t idx) const {
152+
assert(idx < size());
153+
if (is_inline()) {
154+
return m_repr.array.arr[idx];
155+
} else {
156+
return m_repr.vector.vec[idx];
157+
}
158+
}
159+
160+
void push_back(handle x) {
161+
if (is_inline()) {
162+
auto &ha = m_repr.array;
163+
if (ha.size == N) {
164+
move_to_vector_with_reserved_size(N + 1);
165+
m_repr.vector.vec.push_back(x);
166+
} else {
167+
ha.arr[ha.size++] = x;
168+
}
169+
} else {
170+
m_repr.vector.vec.push_back(x);
171+
}
172+
}
173+
174+
template <typename Arg>
175+
void emplace_back(Arg &&x) {
176+
push_back(handle(x));
177+
}
178+
179+
void reserve(std::size_t sz) {
180+
if (is_inline()) {
181+
if (sz > N) {
182+
move_to_vector_with_reserved_size(sz);
183+
}
184+
} else {
185+
m_repr.vector.vec.reserve(sz);
186+
}
187+
}
188+
189+
private:
190+
using repr_type = inline_array_or_vector<handle, N>;
191+
repr_type m_repr;
192+
193+
void move_to_vector_with_reserved_size(std::size_t reserved_size) {
194+
assert(is_inline());
195+
auto &ha = m_repr.array;
196+
using heap_vector = typename repr_type::heap_vector;
197+
heap_vector hv;
198+
hv.vec.reserve(reserved_size);
199+
std::copy(ha.arr.begin(), ha.arr.begin() + ha.size, std::back_inserter(hv.vec));
200+
new (&m_repr.vector) heap_vector(std::move(hv));
201+
}
202+
203+
bool is_inline() const { return m_repr.is_inline(); }
204+
};
205+
206+
// small_vector-like container to avoid heap allocation for N or fewer
207+
// arguments.
208+
template <std::size_t kRequestedInlineSize>
209+
struct args_convert_vector {
210+
private:
211+
public:
212+
args_convert_vector() = default;
213+
214+
args_convert_vector(const args_convert_vector &) = delete;
215+
args_convert_vector &operator=(const args_convert_vector &) = delete;
216+
args_convert_vector(args_convert_vector &&) noexcept = default;
217+
args_convert_vector &operator=(args_convert_vector &&) noexcept = default;
218+
219+
args_convert_vector(std::size_t count, bool value) {
220+
if (count > kInlineSize) {
221+
new (&m_repr.vector) typename repr_type::heap_vector(count, value);
222+
} else {
223+
auto &inline_arr = m_repr.array;
224+
inline_arr.arr.fill(value ? std::size_t(-1) : 0);
225+
inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
226+
}
227+
}
228+
229+
std::size_t size() const {
230+
if (is_inline()) {
231+
return m_repr.array.size;
232+
} else {
233+
return m_repr.vector.vec.size();
234+
}
235+
}
236+
237+
void reserve(std::size_t sz) {
238+
if (is_inline()) {
239+
if (sz > kInlineSize) {
240+
move_to_vector_with_reserved_size(sz);
241+
}
242+
} else {
243+
m_repr.vector.vec.reserve(sz);
244+
}
245+
}
246+
247+
bool operator[](std::size_t idx) const {
248+
if (is_inline()) {
249+
return inline_index(idx);
250+
} else {
251+
assert(idx < m_repr.vector.vec.size());
252+
return m_repr.vector.vec[idx];
253+
}
254+
}
255+
256+
void push_back(bool b) {
257+
if (is_inline()) {
258+
auto &ha = m_repr.array;
259+
if (ha.size == kInlineSize) {
260+
move_to_vector_with_reserved_size(kInlineSize + 1);
261+
m_repr.vector.vec.push_back(b);
262+
} else {
263+
assert(ha.size < kInlineSize);
264+
const auto wbi = word_and_bit_index(ha.size++);
265+
assert(wbi.word < kWords);
266+
assert(wbi.bit < kBitsPerWord);
267+
if (b) {
268+
ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
269+
} else {
270+
ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
271+
}
272+
assert(operator[](ha.size - 1) == b);
273+
}
274+
} else {
275+
m_repr.vector.vec.push_back(b);
276+
}
277+
}
278+
279+
void swap(args_convert_vector &rhs) { std::swap(m_repr, rhs.m_repr); }
280+
281+
private:
282+
struct WordAndBitIndex {
283+
std::size_t word;
284+
std::size_t bit;
285+
};
286+
287+
static WordAndBitIndex word_and_bit_index(std::size_t idx) {
288+
return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
289+
}
290+
291+
bool inline_index(std::size_t idx) const {
292+
const auto wbi = word_and_bit_index(idx);
293+
assert(wbi.word < kWords);
294+
assert(wbi.bit < kBitsPerWord);
295+
return m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit);
296+
}
297+
298+
void move_to_vector_with_reserved_size(std::size_t reserved_size) {
299+
auto &inline_arr = m_repr.array;
300+
using heap_vector = typename repr_type::heap_vector;
301+
heap_vector hv;
302+
hv.vec.reserve(reserved_size);
303+
for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
304+
hv.vec.push_back(inline_index(ii));
305+
}
306+
new (&m_repr.vector) heap_vector(std::move(hv));
307+
}
308+
309+
static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
310+
static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
311+
static constexpr auto kInlineSize = kWords * kBitsPerWord;
312+
313+
using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
314+
repr_type m_repr;
315+
316+
bool is_inline() const { return m_repr.is_inline(); }
317+
};
318+
319+
PYBIND11_NAMESPACE_END(detail)
320+
PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)

include/pybind11/pybind11.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1048,12 +1048,12 @@ class cpp_function : public function {
10481048
}
10491049
#endif
10501050

1051-
std::vector<bool> second_pass_convert;
1051+
args_convert_vector<5> second_pass_convert;
10521052
if (overloaded) {
10531053
// We're in the first no-convert pass, so swap out the conversion flags for a
10541054
// set of all-false flags. If the call fails, we'll swap the flags back in for
10551055
// the conversion-allowed call below.
1056-
second_pass_convert.resize(func.nargs, false);
1056+
second_pass_convert = args_convert_vector<5>(func.nargs, false);
10571057
call.args_convert.swap(second_pass_convert);
10581058
}
10591059

tests/extra_python_package/test_files.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@
7676
}
7777

7878
detail_headers = {
79+
"include/pybind11/detail/argument_vector.h",
7980
"include/pybind11/detail/class.h",
8081
"include/pybind11/detail/common.h",
8182
"include/pybind11/detail/cpp_conduit.h",

tests/test_embed/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ if(PYBIND11_TEST_SMART_HOLDER)
3333
-DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
3434
endif()
3535

36-
add_executable(test_embed catch.cpp test_interpreter.cpp test_subinterpreter.cpp)
36+
add_executable(test_embed catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
37+
test_interpreter.cpp test_subinterpreter.cpp)
3738
pybind11_enable_warnings(test_embed)
3839

3940
target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)

0 commit comments

Comments
 (0)