Avoid heap allocation for function calls with a small number of arguments

swolchok · swolchok · commit 9415686fede5 · 2025-09-04T12:48:39.000-07:00
We don't have access to llvm::SmallVector or similar, but given the
limited subset of the `std::vector` API that
`function_call::args{,_convert}` need and the "reserve-then-fill"
usage pattern, it is relatively straightforward to implement custom
containers that get the job done.

Seems to improve the time to call the collatz function in
pybind/pybind11_benchmark significantly; numbers are a little noisy
but there's a clear improvement from "about 60 ns per call" to "about
45 ns per call" on my machine (M4 Max Mac), as measured with
`timeit.repeat('collatz(4)', 'from pybind11_benchmark import
collatz')`.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -180,6 +180,7 @@ if(PYBIND11_MASTER_PROJECT)
 endif()
 
 set(PYBIND11_HEADERS
+    include/pybind11/detail/argument_vector.h
     include/pybind11/detail/class.h
     include/pybind11/detail/common.h
     include/pybind11/detail/cpp_conduit.h
diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include "detail/argument_vector.h"
 #include "detail/common.h"
 #include "detail/descr.h"
 #include "detail/native_enum_data.h"
@@ -2045,10 +2046,12 @@ struct function_call {
     const function_record &func;
 
     /// Arguments passed to the function:
-    std::vector<handle> args;
+    /// (Inline size chosen mostly arbitrarily; 5 should pad function_call out to two cache lines
+    /// (16 pointers) in size.)
+    argument_vector<5> args;
 
     /// The `convert` value the arguments should be loaded with
-    std::vector<bool> args_convert;
+    args_convert_vector<5> args_convert;
 
     /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if
     /// present, are also in `args` but without a reference).
diff --git a/include/pybind11/detail/argument_vector.h b/include/pybind11/detail/argument_vector.h
@@ -0,0 +1,320 @@
+/*
+    pybind11/detail/argument_vector.h: small_vector-like containers to
+    avoid heap allocation of arguments during function call dispatch.
+
+    Copyright (c) Meta Platforms, Inc. and affiliates.
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Shared implementation utility for our small_vector-like containers.
+// We support C++11 and C++14, so we cannot use
+// std::variant. Union with the tag packed next to the inline
+// array's size is smaller anyway, allowing 1 extra handle of
+// inline storage for free. Compare the layouts (1 line per
+// size_t/void*):
+// With variant, total is N + 2 for N >= 2:
+// - variant tag (cannot be packed with the array size)
+// - array size (or first pointer of 3 in std::vector)
+// - N pointers of inline storage (or 2 remaining pointers of std::vector)
+// Custom union, total is N + 1 for N >= 3:
+// - variant tag & array size if applicable
+// - N pointers of inline storage (or 3 pointers of std::vector)
+//
+// NOTE: this is a low-level representational convenience; the two
+// use cases of this union are materially different and in particular
+// have different semantics for inline_array::size. All that is being
+// shared is the memory management behavior.
+template <typename ArrayT, std::size_t InlineSize, typename VectorT = ArrayT>
+union inline_array_or_vector {
+    // Inline representation: a fixed std::array of InlineSize ArrayT slots
+    // plus a `size` field whose exact meaning is defined by the owning
+    // container.
+    // NOTE: is_inline() reads the leading sizeof(bool) bytes of the union, so
+    // the `is_inline` flag has to stay the first member of both structs.
+    struct inline_array {
+        bool is_inline = true;
+        std::uint32_t size = 0;
+        std::array<ArrayT, InlineSize> arr;
+    };
+    // Heap representation: a std::vector tagged with is_inline == false.
+    struct heap_vector {
+        bool is_inline = false;
+        std::vector<VectorT> vec;
+
+        heap_vector() = default;
+        // Builds a vector holding `count` copies of `value`.
+        heap_vector(std::size_t count, VectorT value) : vec(count, value) {}
+    };
+
+    inline_array array;
+    heap_vector vector;
+
+    // These two properties are what allow the inline representation to be
+    // relocated with memcpy and abandoned without running any destructor
+    // (see the destructor and the move operations below).
+    static_assert(std::is_trivially_move_constructible<ArrayT>::value,
+                  "ArrayT must be trivially move constructible");
+    static_assert(std::is_trivially_destructible<ArrayT>::value,
+                  "ArrayT must be trivially destructible");
+
+    // Starts out in the inline representation (is_inline == true, size == 0).
+    inline_array_or_vector() : array() {}
+    // Only the heap representation is explicitly destroyed; the inline
+    // representation is simply abandoned.
+    ~inline_array_or_vector() {
+        if (!is_inline()) {
+            vector.~heap_vector();
+        }
+    }
+    // Move-only: copy construction and copy assignment are disabled.
+    inline_array_or_vector(const inline_array_or_vector &) = delete;
+    inline_array_or_vector &operator=(const inline_array_or_vector &) = delete;
+
+    // Move construction. An inline source is copied bytewise (tag and size
+    // included); a heap source has its heap_vector move-constructed into
+    // place. Nothing resets `rhs`, so a heap source remains in the heap
+    // representation, holding a moved-from vector.
+    inline_array_or_vector(inline_array_or_vector &&rhs) noexcept {
+        if (rhs.is_inline()) {
+            std::memcpy(&array, &rhs.array, sizeof(array));
+        } else {
+            new (&vector) heap_vector(std::move(rhs.vector));
+        }
+        assert(is_inline() == rhs.is_inline());
+    }
+
+    // Move assignment covering all four inline/heap combinations. Returns
+    // *this; self-assignment is a no-op.
+    inline_array_or_vector &operator=(inline_array_or_vector &&rhs) noexcept {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if (rhs.is_inline()) {
+            // Destroy our heap vector (if any) before its storage is
+            // overwritten with the inline bytes of `rhs`.
+            if (!is_inline()) {
+                vector.~heap_vector();
+            }
+            std::memcpy(&array, &rhs.array, sizeof(array));
+        } else {
+            if (is_inline()) {
+                // No live heap_vector here yet: construct one in place.
+                new (&vector) heap_vector(std::move(rhs.vector));
+            } else {
+                // Both sides already hold a heap_vector: plain move-assign.
+                vector = std::move(rhs.vector);
+            }
+        }
+        return *this;
+    }
+
+    // Returns the tag stored in the first sizeof(bool) bytes of the union:
+    // true for the inline representation, false for the heap representation.
+    bool is_inline() const {
+        // It is undefined behavior to access the inactive member of a
+        // union directly. However, it is well-defined to reinterpret_cast any
+        // pointer into a pointer to char and examine it as an array
+        // of bytes. See
+        // https://dev-discuss.pytorch.org/t/unionizing-for-profit-how-to-exploit-the-power-of-unions-in-c/444#the-memcpy-loophole-4
+        bool result = false;
+        std::memcpy(&result, reinterpret_cast<const char *>(this), sizeof(bool));
+        return result;
+    }
+};
+
+// Move-only sequence of `handle`s that keeps up to N elements in inline
+// storage and switches to a heap-allocated std::vector once more are needed.
+template <std::size_t N>
+struct argument_vector {
+public:
+    argument_vector() = default;
+
+    argument_vector(argument_vector &&) noexcept = default;
+    argument_vector &operator=(argument_vector &&) noexcept = default;
+    argument_vector(const argument_vector &) = delete;
+    argument_vector &operator=(const argument_vector &) = delete;
+
+    /// Number of handles currently stored.
+    std::size_t size() const {
+        return is_inline() ? static_cast<std::size_t>(m_repr.array.size)
+                           : m_repr.vector.vec.size();
+    }
+
+    /// Mutable access to element `idx`; `idx` must be less than size().
+    handle &operator[](std::size_t idx) {
+        assert(idx < size());
+        if (!is_inline()) {
+            return m_repr.vector.vec[idx];
+        }
+        return m_repr.array.arr[idx];
+    }
+
+    /// Element `idx` by value; `idx` must be less than size().
+    handle operator[](std::size_t idx) const {
+        assert(idx < size());
+        if (!is_inline()) {
+            return m_repr.vector.vec[idx];
+        }
+        return m_repr.array.arr[idx];
+    }
+
+    /// Appends `x`, spilling to the heap when the inline storage is full.
+    void push_back(handle x) {
+        if (!is_inline()) {
+            m_repr.vector.vec.push_back(x);
+            return;
+        }
+
+        auto &inline_repr = m_repr.array;
+        if (inline_repr.size != N) {
+            inline_repr.arr[inline_repr.size++] = x;
+            return;
+        }
+
+        // Inline storage exhausted: relocate to the heap with room for one
+        // more element, then append there.
+        move_to_vector_with_reserved_size(N + 1);
+        m_repr.vector.vec.push_back(x);
+    }
+
+    /// Converts `x` to a `handle` and appends it.
+    template <typename Arg>
+    void emplace_back(Arg &&x) {
+        push_back(handle(x));
+    }
+
+    /// Ensures room for `sz` elements; requests of at most N leave an inline
+    /// container untouched.
+    void reserve(std::size_t sz) {
+        if (!is_inline()) {
+            m_repr.vector.vec.reserve(sz);
+            return;
+        }
+        if (sz > N) {
+            move_to_vector_with_reserved_size(sz);
+        }
+    }
+
+private:
+    using repr_type = inline_array_or_vector<handle, N>;
+    repr_type m_repr;
+
+    bool is_inline() const { return m_repr.is_inline(); }
+
+    // Switches from the inline to the heap representation, carrying the
+    // current elements over into a vector with `reserved_size` reserved.
+    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+        assert(is_inline());
+        using heap_vector = typename repr_type::heap_vector;
+        heap_vector spilled;
+        spilled.vec.reserve(reserved_size);
+        const auto &inline_repr = m_repr.array;
+        for (std::size_t i = 0; i < inline_repr.size; ++i) {
+            spilled.vec.push_back(inline_repr.arr[i]);
+        }
+        // The inline elements were copied out above, so their storage can now
+        // be reused for the heap representation.
+        new (&m_repr.vector) heap_vector(std::move(spilled));
+    }
+};
+
+// small_vector-like container of `bool` conversion flags that avoids heap
+// allocation for small argument counts. While inline, the flags are
+// bit-packed into std::size_t words, so the actual inline capacity
+// (kInlineSize) is kRequestedInlineSize rounded up to a whole number of
+// words; once that is exceeded the flags move to a heap-allocated
+// std::vector<bool>.
+template <std::size_t kRequestedInlineSize>
+struct args_convert_vector {
+public:
+    args_convert_vector() = default;
+
+    args_convert_vector(const args_convert_vector &) = delete;
+    args_convert_vector &operator=(const args_convert_vector &) = delete;
+    args_convert_vector(args_convert_vector &&) noexcept = default;
+    args_convert_vector &operator=(args_convert_vector &&) noexcept = default;
+
+    /// Creates a container holding `count` flags, all equal to `value`.
+    args_convert_vector(std::size_t count, bool value) {
+        if (count > kInlineSize) {
+            // m_repr was default-constructed in the (trivially destructible)
+            // inline representation, so it can be overwritten in place.
+            new (&m_repr.vector) typename repr_type::heap_vector(count, value);
+        } else {
+            auto &inline_arr = m_repr.array;
+            // Set every bit of every word; bits at positions >= count are
+            // overwritten by push_back before they become visible.
+            inline_arr.arr.fill(value ? std::size_t(-1) : 0);
+            inline_arr.size = static_cast<decltype(inline_arr.size)>(count);
+        }
+    }
+
+    /// Number of flags currently stored.
+    std::size_t size() const {
+        if (is_inline()) {
+            return m_repr.array.size;
+        }
+        return m_repr.vector.vec.size();
+    }
+
+    /// Ensures room for `sz` flags; requests of at most kInlineSize leave an
+    /// inline container untouched.
+    void reserve(std::size_t sz) {
+        if (is_inline()) {
+            if (sz > kInlineSize) {
+                move_to_vector_with_reserved_size(sz);
+            }
+        } else {
+            m_repr.vector.vec.reserve(sz);
+        }
+    }
+
+    /// Flag at position `idx`; `idx` must be less than size().
+    bool operator[](std::size_t idx) const {
+        if (is_inline()) {
+            // Mirror the bounds check of the heap branch below.
+            assert(idx < m_repr.array.size);
+            return inline_index(idx);
+        }
+        assert(idx < m_repr.vector.vec.size());
+        return m_repr.vector.vec[idx];
+    }
+
+    /// Appends `b`, spilling to the heap when the inline bits are exhausted.
+    void push_back(bool b) {
+        if (is_inline()) {
+            auto &ha = m_repr.array;
+            if (ha.size == kInlineSize) {
+                move_to_vector_with_reserved_size(kInlineSize + 1);
+                m_repr.vector.vec.push_back(b);
+            } else {
+                assert(ha.size < kInlineSize);
+                const auto wbi = word_and_bit_index(ha.size++);
+                assert(wbi.word < kWords);
+                assert(wbi.bit < kBitsPerWord);
+                // Write the bit explicitly either way: the word may still
+                // hold stale bits (e.g. from the fill constructor).
+                if (b) {
+                    ha.arr[wbi.word] |= (std::size_t(1) << wbi.bit);
+                } else {
+                    ha.arr[wbi.word] &= ~(std::size_t(1) << wbi.bit);
+                }
+                assert(operator[](ha.size - 1) == b);
+            }
+        } else {
+            m_repr.vector.vec.push_back(b);
+        }
+    }
+
+    /// Exchanges contents with `rhs`. Cannot throw because the underlying
+    /// representation's move operations are noexcept.
+    void swap(args_convert_vector &rhs) noexcept { std::swap(m_repr, rhs.m_repr); }
+
+private:
+    // Position of a flag inside the packed inline words.
+    struct WordAndBitIndex {
+        std::size_t word;
+        std::size_t bit;
+    };
+
+    static WordAndBitIndex word_and_bit_index(std::size_t idx) {
+        return WordAndBitIndex{idx / kBitsPerWord, idx % kBitsPerWord};
+    }
+
+    // Reads bit `idx` of the inline words; does not consult the stored size.
+    bool inline_index(std::size_t idx) const {
+        const auto wbi = word_and_bit_index(idx);
+        assert(wbi.word < kWords);
+        assert(wbi.bit < kBitsPerWord);
+        // Explicit comparison instead of an implicit size_t -> bool conversion.
+        return (m_repr.array.arr[wbi.word] & (std::size_t(1) << wbi.bit)) != 0;
+    }
+
+    // Switches from the inline to the heap representation, unpacking the
+    // current flags into a vector with `reserved_size` reserved.
+    void move_to_vector_with_reserved_size(std::size_t reserved_size) {
+        assert(is_inline());
+        auto &inline_arr = m_repr.array;
+        using heap_vector = typename repr_type::heap_vector;
+        heap_vector hv;
+        hv.vec.reserve(reserved_size);
+        for (std::size_t ii = 0; ii < inline_arr.size; ++ii) {
+            hv.vec.push_back(inline_index(ii));
+        }
+        // All inline bits were unpacked above, so their storage can now be
+        // reused for the heap representation.
+        new (&m_repr.vector) heap_vector(std::move(hv));
+    }
+
+    static constexpr auto kBitsPerWord = 8 * sizeof(std::size_t);
+    // Number of words needed to hold kRequestedInlineSize bits (rounded up).
+    static constexpr auto kWords = (kRequestedInlineSize + kBitsPerWord - 1) / kBitsPerWord;
+    // Actual inline capacity in flags: every bit of every word is usable.
+    static constexpr auto kInlineSize = kWords * kBitsPerWord;
+
+    using repr_type = inline_array_or_vector<std::size_t, kWords, bool>;
+    repr_type m_repr;
+
+    bool is_inline() const { return m_repr.is_inline(); }
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
@@ -1048,12 +1048,12 @@ class cpp_function : public function {
                 }
 #endif
 
-                std::vector<bool> second_pass_convert;
+                args_convert_vector<5> second_pass_convert;
                 if (overloaded) {
                     // We're in the first no-convert pass, so swap out the conversion flags for a
                     // set of all-false flags.  If the call fails, we'll swap the flags back in for
                     // the conversion-allowed call below.
-                    second_pass_convert.resize(func.nargs, false);
+                    second_pass_convert = args_convert_vector<5>(func.nargs, false);
                     call.args_convert.swap(second_pass_convert);
                 }
 
diff --git a/tests/extra_python_package/test_files.py b/tests/extra_python_package/test_files.py
@@ -76,6 +76,7 @@
 }
 
 detail_headers = {
+    "include/pybind11/detail/argument_vector.h",
     "include/pybind11/detail/class.h",
     "include/pybind11/detail/common.h",
     "include/pybind11/detail/cpp_conduit.h",
diff --git a/tests/test_embed/CMakeLists.txt b/tests/test_embed/CMakeLists.txt
@@ -33,7 +33,8 @@ if(PYBIND11_TEST_SMART_HOLDER)
     -DPYBIND11_RUN_TESTING_WITH_SMART_HOLDER_AS_DEFAULT_BUT_NEVER_USE_IN_PRODUCTION_PLEASE)
 endif()
 
-add_executable(test_embed catch.cpp test_interpreter.cpp test_subinterpreter.cpp)
+add_executable(test_embed catch.cpp test_args_convert_vector.cpp test_argument_vector.cpp
+                          test_interpreter.cpp test_subinterpreter.cpp)
 pybind11_enable_warnings(test_embed)
 
 target_link_libraries(test_embed PRIVATE pybind11::embed Catch2::Catch2 Threads::Threads)
diff --git a/tests/test_embed/test_args_convert_vector.cpp b/tests/test_embed/test_args_convert_vector.cpp
diff --git a/tests/test_embed/test_argument_vector.cpp b/tests/test_embed/test_argument_vector.cpp

Original file line number	Diff line number	Diff line change
`@@ -1048,12 +1048,12 @@ class cpp_function : public function {`
`1048`	`1048`	`}`
`1049`	`1049`	`#endif`
`1050`	`1050`
`1051`		`- std::vector<bool> second_pass_convert;`
	`1051`	`+ args_convert_vector<5> second_pass_convert;`
`1052`	`1052`	`if (overloaded) {`
`1053`	`1053`	`// We're in the first no-convert pass, so swap out the conversion flags for a`
`1054`	`1054`	`// set of all-false flags. If the call fails, we'll swap the flags back in for`
`1055`	`1055`	`// the conversion-allowed call below.`
`1056`		`- second_pass_convert.resize(func.nargs, false);`
	`1056`	`+ second_pass_convert = args_convert_vector<5>(func.nargs, false);`
`1057`	`1057`	`call.args_convert.swap(second_pass_convert);`
`1058`	`1058`	`}`
`1059`	`1059`
Original file line number	Diff line number	Diff line change
`@@ -76,6 +76,7 @@`
`76`	`76`	`}`
`77`	`77`
`78`	`78`	`detail_headers = {`
	`79`	`+ "include/pybind11/detail/argument_vector.h",`
`79`	`80`	`"include/pybind11/detail/class.h",`
`80`	`81`	`"include/pybind11/detail/common.h",`
`81`	`82`	`"include/pybind11/detail/cpp_conduit.h",`