From edd2b9be9330fa78c31969ab4fa76de76ab04d06 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer <nas@arctrix.com>
Date: Tue, 28 Jan 2025 18:14:22 -0800
Subject: [PATCH 1/3] Add the 'bm_btree' benchmark.

---
 doc/benchmarks.rst                            |  16 +
 pyperformance/data-files/benchmarks/MANIFEST  |   2 +
 .../benchmarks/bm_btree/bm_btree.toml         |   3 +
 .../benchmarks/bm_btree/bm_btree_gc_only.toml |   3 +
 .../benchmarks/bm_btree/pyproject.toml        |   9 +
 .../benchmarks/bm_btree/run_benchmark.py      | 474 ++++++++++++++++++
 6 files changed, 507 insertions(+)
 create mode 100644 pyperformance/data-files/benchmarks/bm_btree/bm_btree.toml
 create mode 100644 pyperformance/data-files/benchmarks/bm_btree/bm_btree_gc_only.toml
 create mode 100644 pyperformance/data-files/benchmarks/bm_btree/pyproject.toml
 create mode 100644 pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py

diff --git a/doc/benchmarks.rst b/doc/benchmarks.rst
index 54c5e69c..0edae21a 100644
--- a/doc/benchmarks.rst
+++ b/doc/benchmarks.rst
@@ -76,6 +76,22 @@ These benchmarks also have an "eager" flavor that uses asyncio eager task factor
 if available.
 
 
+btree
+-----
+
+Benchmark a pure-Python implementation of a B-tree data structure.  The tree
+is created with a relatively large number of nodes (200,000 by default).  This
+attempts to simulate an application that operates on a large number of objects
+in memory (large, at least, compared to the other benchmarks currently in this
+suite).  There are two variants of this benchmark: ``btree`` records the time
+to create the B-tree, run ``gc.collect()`` and then perform some operations on
+it; ``btree_gc_only`` records only the time to run ``gc.collect()``, skipping
+the operations after creation.
+
+Note that this benchmark does not create any reference cycles that the garbage
+collector will need to break to free memory.
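+
+For example, the benchmark script can also be run on its own (a minimal
+sketch of a local invocation; pyperformance normally drives it through the
+manifest)::
+
+    python run_benchmark.py            # times the "btree" variant
+    python run_benchmark.py --gc-only  # times the "btree_gc_only" variant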
+
+
 chameleon
 ---------
 
diff --git a/pyperformance/data-files/benchmarks/MANIFEST b/pyperformance/data-files/benchmarks/MANIFEST
index 301245a9..cffe891a 100644
--- a/pyperformance/data-files/benchmarks/MANIFEST
+++ b/pyperformance/data-files/benchmarks/MANIFEST
@@ -25,6 +25,8 @@ asyncio_tcp	<local>
 asyncio_tcp_ssl	<local:asyncio_tcp>
 asyncio_websockets	<local>
 bpe_tokeniser	<local>
+btree	<local>
+btree_gc_only	<local:btree>
 concurrent_imap	<local>
 coroutines	<local>
 coverage	<local>
diff --git a/pyperformance/data-files/benchmarks/bm_btree/bm_btree.toml b/pyperformance/data-files/benchmarks/bm_btree/bm_btree.toml
new file mode 100644
index 00000000..044ce719
--- /dev/null
+++ b/pyperformance/data-files/benchmarks/bm_btree/bm_btree.toml
@@ -0,0 +1,3 @@
+[tool.pyperformance]
+name = "btree_gc"
+extra_opts = ["all"]
diff --git a/pyperformance/data-files/benchmarks/bm_btree/bm_btree_gc_only.toml b/pyperformance/data-files/benchmarks/bm_btree/bm_btree_gc_only.toml
new file mode 100644
index 00000000..9b2a2b3a
--- /dev/null
+++ b/pyperformance/data-files/benchmarks/bm_btree/bm_btree_gc_only.toml
@@ -0,0 +1,3 @@
+[tool.pyperformance]
+name = "btree_gc"
+extra_opts = ["--gc-only"]
diff --git a/pyperformance/data-files/benchmarks/bm_btree/pyproject.toml b/pyperformance/data-files/benchmarks/bm_btree/pyproject.toml
new file mode 100644
index 00000000..77d630c9
--- /dev/null
+++ b/pyperformance/data-files/benchmarks/bm_btree/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "pyperformance_bm_btree"
+requires-python = ">=3.9"
+dependencies = ["pyperf"]
+urls = {repository = "https://github.com/python/pyperformance"}
+dynamic = ["version"]
+
+[tool.pyperformance]
+name = "btree"
diff --git a/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py b/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
new file mode 100644
index 00000000..04d20247
--- /dev/null
+++ b/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
@@ -0,0 +1,474 @@
+"""
+Benchmark for a b-tree workload.  This is intended to exercise the cyclic
+garbage collector by presenting it with a large and interconnected
+object graph.
+"""
+
+import collections.abc
+import gc
+import random
+import sys
+
+import pyperf
+
+# Total number of b-tree nodes to create.  We would like this to be
+# large enough so that the working set of data doesn't fit into the CPU
+# cache.  This benchmark is supposed to be similar to a real application
+# that holds a large number of Python objects in RAM and does some
+# processing on them.
+NUM_NODES = 200_000
+
+# Fraction of the tree to re-create after initial creation.  Set to zero to
+# disable re-creation.
+RECREATE_FRACTION = 0.2
+
+# Seed value for random generator
+RANDOM_SEED = 0
+
+
+class BNode:
+    """
+    Instance attributes:
+      items: list
+      nodes: [BNode]
+    """
+
+    __slots__ = ['items', 'nodes']
+
+    minimum_degree = 16  # a.k.a. t
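+    # Each node (except possibly the root) holds between t - 1 and
+    # 2*t - 1 items; is_full() checks the upper bound.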
+
+    def __init__(self):
+        self.items = []
+        self.nodes = None
+
+    def is_leaf(self):
+        return self.nodes is None
+
+    def __iter__(self):
+        if self.is_leaf():
+            for item in self.items:
+                yield item
+        else:
+            for position, item in enumerate(self.items):
+                for it in self.nodes[position]:
+                    yield it
+                yield item
+            for it in self.nodes[-1]:
+                yield it
+
+    def is_full(self):
+        return len(self.items) == 2 * self.minimum_degree - 1
+
+    def get_position(self, key):
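+        """(key:anything) -> int
+        Return the index of the first item whose key is >= the given key,
+        or len(self.items) if every key is smaller.
+        """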
+        for position, item in enumerate(self.items):
+            if item[0] >= key:
+                return position
+        return len(self.items)
+
+    def search(self, key):
+        """(key:anything) -> None | (key:anything, value:anything)
+        Return the matching pair, or None.
+        """
+        position = self.get_position(key)
+        if position < len(self.items) and self.items[position][0] == key:
+            return self.items[position]
+        elif self.is_leaf():
+            return None
+        else:
+            return self.nodes[position].search(key)
+
+    def insert_item(self, item):
+        """(item:(key:anything, value:anything))"""
+        assert not self.is_full()
+        key = item[0]
+        position = self.get_position(key)
+        if position < len(self.items) and self.items[position][0] == key:
+            self.items[position] = item
+        elif self.is_leaf():
+            self.items.insert(position, item)
+        else:
+            child = self.nodes[position]
+            if child.is_full():
+                self.split_child(position, child)
+                if key == self.items[position][0]:
+                    self.items[position] = item
+                else:
+                    if key > self.items[position][0]:
+                        position += 1
+                    self.nodes[position].insert_item(item)
+            else:
+                self.nodes[position].insert_item(item)
+
+    def split_child(self, position, child):
+        """(position:int, child:BNode)"""
+        assert not self.is_full()
+        assert not self.is_leaf()
+        assert self.nodes[position] is child
+        assert child.is_full()
+        bigger = self.__class__()
+        middle = self.minimum_degree - 1
+        splitting_key = child.items[middle]
+        bigger.items = child.items[middle + 1 :]
+        child.items = child.items[:middle]
+        assert len(bigger.items) == len(child.items)
+        if not child.is_leaf():
+            bigger.nodes = child.nodes[middle + 1 :]
+            child.nodes = child.nodes[: middle + 1]
+            assert len(bigger.nodes) == len(child.nodes)
+        self.items.insert(position, splitting_key)
+        self.nodes.insert(position + 1, bigger)
+
+    def get_count(self):
+        """() -> int
+        How many items are stored in this node and descendants?
+        """
+        result = len(self.items)
+        for node in self.nodes or []:
+            result += node.get_count()
+        return result
+
+    def get_node_count(self):
+        """() -> int
+        How many nodes are here, including descendants?
+        """
+        result = 1
+        for node in self.nodes or []:
+            result += node.get_node_count()
+        return result
+
+    def get_level(self):
+        """() -> int
+        How many levels of nodes are there between this node
+        and descendant leaf nodes?
+        """
+        if self.is_leaf():
+            return 0
+        else:
+            return 1 + self.nodes[0].get_level()
+
+    def get_min_item(self):
+        """() -> (key:anything, value:anything)
+        Return the item with the minimal key.
+        """
+        if self.is_leaf():
+            return self.items[0]
+        else:
+            return self.nodes[0].get_min_item()
+
+    def get_max_item(self):
+        """() -> (key:anything, value:anything)
+        Return the item with the maximal key.
+        """
+        if self.is_leaf():
+            return self.items[-1]
+        else:
+            return self.nodes[-1].get_max_item()
+
+    def delete(self, key):
+        """(key:anything)
+        Delete the item with this key.
+        This is intended to follow the description in 19.3 of
+        'Introduction to Algorithms' by Cormen, Leiserson, and Rivest.
+        """
+
+        def is_big(node):
+            # Precondition for recursively calling node.delete(key).
+            return node and len(node.items) >= node.minimum_degree
+
+        p = self.get_position(key)
+        matches = p < len(self.items) and self.items[p][0] == key
+        if self.is_leaf():
+            if matches:
+                # Case 1.
+                del self.items[p]
+            else:
+                raise KeyError(key)
+        else:
+            node = self.nodes[p]
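+            # These are False (not a node) when the corresponding sibling
+            # does not exist; is_big() treats such a value as "not big".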
+            lower_sibling = p > 0 and self.nodes[p - 1]
+            upper_sibling = p < len(self.nodes) - 1 and self.nodes[p + 1]
+            if matches:
+                # Case 2.
+                if is_big(node):
+                    # Case 2a.
+                    extreme = node.get_max_item()
+                    node.delete(extreme[0])
+                    self.items[p] = extreme
+                elif is_big(upper_sibling):
+                    # Case 2b.
+                    extreme = upper_sibling.get_min_item()
+                    upper_sibling.delete(extreme[0])
+                    self.items[p] = extreme
+                else:
+                    # Case 2c.
+                    extreme = upper_sibling.get_min_item()
+                    upper_sibling.delete(extreme[0])
+                    node.items = node.items + [extreme] + upper_sibling.items
+                    if not node.is_leaf():
+                        node.nodes = node.nodes + upper_sibling.nodes
+                    del self.items[p]
+                    del self.nodes[p + 1]
+            else:
+                if not is_big(node):
+                    if is_big(lower_sibling):
+                        # Case 3a1: Shift an item from lower_sibling.
+                        node.items.insert(0, self.items[p - 1])
+                        self.items[p - 1] = lower_sibling.items[-1]
+                        del lower_sibling.items[-1]
+                        if not node.is_leaf():
+                            node.nodes.insert(0, lower_sibling.nodes[-1])
+                            del lower_sibling.nodes[-1]
+                    elif is_big(upper_sibling):
+                        # Case 3a2: Shift an item from upper_sibling.
+                        node.items.append(self.items[p])
+                        self.items[p] = upper_sibling.items[0]
+                        del upper_sibling.items[0]
+                        if not node.is_leaf():
+                            node.nodes.append(upper_sibling.nodes[0])
+                            del upper_sibling.nodes[0]
+                    elif lower_sibling:
+                        # Case 3b1: Merge with lower_sibling
+                        node.items = (
+                            lower_sibling.items
+                            + [self.items[p - 1]]
+                            + node.items
+                        )
+                        if not node.is_leaf():
+                            node.nodes = lower_sibling.nodes + node.nodes
+                        del self.items[p - 1]
+                        del self.nodes[p - 1]
+                    else:
+                        # Case 3b2: Merge with upper_sibling
+                        node.items = (
+                            node.items + [self.items[p]] + upper_sibling.items
+                        )
+                        if not node.is_leaf():
+                            node.nodes = node.nodes + upper_sibling.nodes
+                        del self.items[p]
+                        del self.nodes[p + 1]
+                assert is_big(node)
+                node.delete(key)
+            if not self.items:
+                # This can happen when self is the root node.
+                self.items = self.nodes[0].items
+                self.nodes = self.nodes[0].nodes
+
+
+class BTree(collections.abc.MutableMapping):
+    """
+    Instance attributes:
+      root: BNode
+    """
+
+    __slots__ = ['root']
+
+    def __init__(self, node_constructor=BNode):
+        assert issubclass(node_constructor, BNode)
+        self.root = node_constructor()
+
+    def __nonzero__(self):
+        return bool(self.root.items)
+
+    __bool__ = __nonzero__
+
+    def iteritems(self):
+        for item in self.root:
+            yield item
+
+    def iterkeys(self):
+        for item in self.root:
+            yield item[0]
+
+    def itervalues(self):
+        for item in self.root:
+            yield item[1]
+
+    def items(self):
+        return list(self.iteritems())
+
+    def keys(self):
+        return list(self.iterkeys())
+
+    def values(self):
+        return list(self.itervalues())
+
+    def __iter__(self):
+        for key in self.iterkeys():
+            yield key
+
+    def __contains__(self, key):
+        return self.root.search(key) is not None
+
+    def has_key(self, key):
+        return self.root.search(key) is not None
+
+    def __setitem__(self, key, value):
+        self.add(key, value)
+
+    def setdefault(self, key, value):
+        item = self.root.search(key)
+        if item is None:
+            self.add(key, value)
+            return value
+        return item[1]
+
+    def __getitem__(self, key):
+        item = self.root.search(key)
+        if item is None:
+            raise KeyError(key)
+        return item[1]
+
+    def __delitem__(self, key):
+        self.root.delete(key)
+
+    def clear(self):
+        self.root = self.root.__class__()
+
+    def get(self, key, default=None):
+        """(key:anything, default:anything=None) -> anything"""
+        try:
+            return self[key]
+        except KeyError:
+            return default
+
+    def add(self, key, value=True):
+        """(key:anything, value:anything=True)
+        Make self[key] == value.
+        """
+        if self.root.is_full():
+            # replace and split.
+            node = self.root.__class__()
+            node.nodes = [self.root]
+            node.split_child(0, node.nodes[0])
+            self.root = node
+        self.root.insert_item((key, value))
+
+    def __len__(self):
+        """() -> int
+        Compute and return the total number of items."""
+        return self.root.get_count()
+
+    def get_depth(self):
+        """() -> int
+        How many levels of nodes are used for this BTree?
+        """
+        return self.root.get_level() + 1
+
+    def get_node_count(self):
+        """() -> int
+        How many nodes are used for this BTree?
+        """
+        return self.root.get_node_count()
+
+
+class Record:
+    def __init__(self, a, b, c, d, e, f):
+        self.a = a
+        self.b = b
+        self.c = c
+        self.d = d
+        self.e = e
+        self.f = f
+
+
+def make_records(num_nodes):
+    rnd = random.Random(RANDOM_SEED)
+    for node_id in range(num_nodes):
+        a = node_id
+        b = f'node {node_id}'
+        c = rnd.randbytes(node_id % 100)
+        d = rnd.random()
+        e = sys.intern(str(rnd.randint(0, 30)))
+        f = rnd.choice([None, True, False])
+        yield Record(a, b, c, d, e, f)
+
+
+def make_tree(num_nodes, records):
+    ids = list(range(num_nodes))
+    # Create the tree with randomized key order.
+    random.shuffle(ids)
+
+    tree = BTree()
+    for node_id in ids:
+        tree[node_id] = records[node_id]
+
+    if RECREATE_FRACTION > 0:
+        # Re-create part of the tree.  This can cause objects in memory
+        # to become more fragmented or shuffled since they are not allocated
+        # in sequence.  Since we created nodes with keys in random order, we
+        # can delete the lowest numbered ones and re-make those.
+        remake_ids = range(int(num_nodes * RECREATE_FRACTION))
+        for node_id in remake_ids:
+            del tree[node_id]
+        for node_id in remake_ids:
+            tree[node_id] = records[node_id]
+
+    return tree
+
+
+def run_once(gc_only, records):
+    start = pyperf.perf_counter()
+    obj = make_tree(NUM_NODES, records)
+
+    gc_total_time = 0
+    gc_start = pyperf.perf_counter()
+    gc.collect()
+    gc_total_time += pyperf.perf_counter() - gc_start
+
+    if not gc_only:
+        # Iterate over all nodes and add up the value of the 'd' attribute.
+        d_total = 0.0
+        for key in obj:
+            node = obj[key]
+            d_total += node.d
+
+        # Look up a random subset of nodes and add up the value of 'd'.
+        num_lookup = max(200, NUM_NODES // 20)
+        d_total = 0
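+        # (The computed sums are discarded; the traversal above and the
+        # lookups below exist only as workload.)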
+        rnd = random.Random(RANDOM_SEED)
+        for i in range(num_lookup):
+            node_id = rnd.randint(0, NUM_NODES)
+            node = obj.get(node_id)
+            if node is not None:
+                d_total += node.d
+
+        # Return the time to do everything, except creating the records
+        return pyperf.perf_counter() - start
+
+    else:
+        # Return time only for gc.collect()
+        return gc_total_time
+
+
+def run_bench(loops, gc_only):
+    # Create the set of records outside the timed section.  In a real
+    # application, the data would likely come from a file, a database, or
+    # some other network service.  We don't want to benchmark the
+    # 'random' module.
+    records = list(make_records(NUM_NODES))
+    total_time = 0
+    for i in range(loops):
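+        # Reseed the module-level generator so that random.shuffle() in
+        # make_tree() produces the same key order on every loop.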
+        random.seed(RANDOM_SEED)
+        total_time += run_once(gc_only, records)
+    return total_time
+
+
+def add_metadata(runner):
+    runner.metadata["description"] = "BTree data structure operations."
+    runner.metadata["btree_num_nodes"] = NUM_NODES
+    runner.metadata["btree_recreate_fraction"] = RECREATE_FRACTION
+    runner.metadata["btree_random_seed"] = RANDOM_SEED
+
+
+def add_cmdline_args(cmd, args):
+    if args.gc_only:
+        cmd.append("--gc-only")
+
+
+if __name__ == "__main__":
+    # This benchmark takes a long time to run one loop, compared to most
+    # other pyperformance benchmarks.  We override the defaults for
+    # 'processes', 'loops', etc. in order to run in a reasonable amount of
+    # time while still (hopefully) keeping the timings stable.
+    runner = pyperf.Runner(
+        add_cmdline_args=add_cmdline_args,
+        processes=1,
+        loops=1,
+        values=3,
+        warmups=1,
+        min_time=0.4,
+    )
+    parser = runner.argparser
+    add_metadata(runner)
+    parser.add_argument(
+        '--gc-only',
+        action='store_true',
+        default=False,
+        help='Record time only for the GC collection.',
+    )
+    args = runner.parse_args()
+    bench_name = "btree"
+    if args.gc_only:
+        bench_name += '_gc_only'
+    runner.bench_time_func(bench_name, run_bench, args.gc_only)

From bc21dbcce892cf1323d9adc31b1f013de7572907 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer <nas@arctrix.com>
Date: Thu, 30 Jan 2025 12:51:58 -0800
Subject: [PATCH 2/3] Remove unused methods in BTree classes.

---
 .../benchmarks/bm_btree/run_benchmark.py      | 59 ++-----------------
 1 file changed, 4 insertions(+), 55 deletions(-)

diff --git a/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py b/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
index 04d20247..ecc7fa3b 100644
--- a/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
+++ b/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
@@ -4,7 +4,6 @@
 object graph.
 """
 
-import collections.abc
 import gc
 import random
 import sys
@@ -127,15 +126,6 @@ def get_count(self):
             result += node.get_count()
         return result
 
-    def get_node_count(self):
-        """() -> int
-        How many nodes are here, including descendants?
-        """
-        result = 1
-        for node in self.nodes or []:
-            result += node.get_node_count()
-        return result
-
     def get_level(self):
         """() -> int
         How many levels of nodes are there between this node
@@ -236,7 +226,7 @@ def is_big(node):
                 self.nodes = self.nodes[0].nodes
 
 
-class BTree(collections.abc.MutableMapping):
+class BTree:
     """
     Instance attributes:
       root: BNode
@@ -244,15 +234,12 @@ class BTree(collections.abc.MutableMapping):
 
     __slots__ = ['root']
 
-    def __init__(self, node_constructor=BNode):
-        assert issubclass(node_constructor, BNode)
-        self.root = node_constructor()
+    def __init__(self):
+        self.root = BNode()
 
-    def __nonzero__(self):
+    def __bool__(self):
         return bool(self.root.items)
 
-    __bool__ = __nonzero__
-
     def iteritems(self):
         for item in self.root:
             yield item
@@ -261,19 +248,6 @@ def iterkeys(self):
         for item in self.root:
             yield item[0]
 
-    def itervalues(self):
-        for item in self.root:
-            yield item[1]
-
-    def items(self):
-        return list(self.iteritems())
-
-    def keys(self):
-        return list(self.iterkeys())
-
-    def values(self):
-        return list(self.itervalues())
-
     def __iter__(self):
         for key in self.iterkeys():
             yield key
@@ -281,19 +255,9 @@ def __iter__(self):
     def __contains__(self, key):
         return self.root.search(key) is not None
 
-    def has_key(self, key):
-        return self.root.search(key) is not None
-
     def __setitem__(self, key, value):
         self.add(key, value)
 
-    def setdefault(self, key, value):
-        item = self.root.search(key)
-        if item is None:
-            self.add(key, value)
-            return value
-        return item[1]
-
     def __getitem__(self, key):
         item = self.root.search(key)
         if item is None:
@@ -303,9 +267,6 @@ def __getitem__(self, key):
     def __delitem__(self, key):
         self.root.delete(key)
 
-    def clear(self):
-        self.root = self.root.__class__()
-
     def get(self, key, default=None):
         """(key:anything, default:anything=None) -> anything"""
         try:
@@ -330,18 +291,6 @@ def __len__(self):
         Compute and return the total number of items."""
         return self.root.get_count()
 
-    def get_depth(self):
-        """() -> int
-        How many levels of nodes are used for this BTree?
-        """
-        return self.root.get_level() + 1
-
-    def get_node_count(self):
-        """() -> int
-        How many nodes are used for this BTree?
-        """
-        return self.root.get_node_count()
-
 
 class Record:
     def __init__(self, a, b, c, d, e, f):

From 52ac69d0a23894240ea2a7da8620df27c96c4317 Mon Sep 17 00:00:00 2001
From: Neil Schemenauer <nas@arctrix.com>
Date: Thu, 30 Jan 2025 13:02:57 -0800
Subject: [PATCH 3/3] Use 'yield from' in some places.

---
 .../benchmarks/bm_btree/run_benchmark.py          | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py b/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
index ecc7fa3b..9f7635e3 100644
--- a/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
+++ b/pyperformance/data-files/benchmarks/bm_btree/run_benchmark.py
@@ -45,15 +45,12 @@ def is_leaf(self):
 
     def __iter__(self):
         if self.is_leaf():
-            for item in self.items:
-                yield item
+            yield from self.items
         else:
             for position, item in enumerate(self.items):
-                for it in self.nodes[position]:
-                    yield it
+                yield from self.nodes[position]
                 yield item
-            for it in self.nodes[-1]:
-                yield it
+            yield from self.nodes[-1]
 
     def is_full(self):
         return len(self.items) == 2 * self.minimum_degree - 1
@@ -241,16 +238,14 @@ def __bool__(self):
         return bool(self.root.items)
 
     def iteritems(self):
-        for item in self.root:
-            yield item
+        yield from self.root
 
     def iterkeys(self):
         for item in self.root:
             yield item[0]
 
     def __iter__(self):
-        for key in self.iterkeys():
-            yield key
+        yield from self.iterkeys()
 
     def __contains__(self, key):
         return self.root.search(key) is not None