Commit 87d003b

Add Decay Range.
1 parent 0f38f82 commit 87d003b

2 files changed: +359 -4 lines changed


src/backend/backend.h

Lines changed: 8 additions & 4 deletions
@@ -3,6 +3,7 @@
 #include "../pal/pal.h"
 #include "commitrange.h"
 #include "commonconfig.h"
+#include "decayrange.h"
 #include "empty_range.h"
 #include "globalrange.h"
 #include "largebuddyrange.h"
@@ -148,9 +149,10 @@ namespace snmalloc
     using GlobalR = GlobalRange<StatsR>;
 
 # ifdef SNMALLOC_META_PROTECTED
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
     // Source for object allocations
-    using ObjectRange =
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>;
+    using ObjectRange = LargeBuddyRange<CommittedRange, 21, 21, Pagemap>;
     // Set up protected range for metadata
     using SubR = CommitRange<SubRange<GlobalR, DefaultPal, 6>, DefaultPal>;
     using MetaRange =
@@ -159,8 +161,10 @@ namespace snmalloc
 # else
     // Source for object allocations and metadata
     // No separation between the two
-    using ObjectRange = SmallBuddyRange<
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>>;
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
+    using ObjectRange =
+      SmallBuddyRange<LargeBuddyRange<CommittedRange, 21, 21, Pagemap>>;
     using GlobalMetaRange = GlobalRange<ObjectRange>;
 # endif
 #endif
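For context on the backend.h change: snmalloc's backend assembles its address-space management from nested range templates. Each layer holds its parent's state and forwards alloc_range/dealloc_range through it, so a caching policy such as DecayRange can be spliced into the stack purely by changing a type alias, as the new CommittedRange alias does above. The standalone sketch below illustrates only that composition pattern; BaseRange, CachingRange, and the malloc-backed storage are invented for the example and are not snmalloc APIs.

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <utility>
#include <vector>

// Bottom of the stack: hands out memory directly (here via malloc).
class BaseRange
{
public:
  void* alloc_range(std::size_t size)
  {
    return std::malloc(size);
  }

  void dealloc_range(void* base, std::size_t)
  {
    std::free(base);
  }
};

// A layer spliced on top of a parent range: allocations are forwarded,
// but deallocated blocks are kept in a local cache for reuse, which is
// roughly the role DecayRange plays (minus the timed decay).
template<typename ParentRange>
class CachingRange
{
  ParentRange parent{};
  std::vector<std::pair<void*, std::size_t>> cache;

public:
  void* alloc_range(std::size_t size)
  {
    // Prefer a cached block of the right size.
    for (auto it = cache.begin(); it != cache.end(); ++it)
    {
      if (it->second == size)
      {
        void* p = it->first;
        cache.erase(it);
        return p;
      }
    }
    return parent.alloc_range(size);
  }

  void dealloc_range(void* base, std::size_t size)
  {
    // Hold on to the block instead of returning it to the parent.
    cache.push_back({base, size});
  }
};

int main()
{
  // Compose layers by nesting templates, mirroring the type aliases above.
  CachingRange<BaseRange> range;
  void* p = range.alloc_range(4096);
  range.dealloc_range(p, 4096);
  void* q = range.alloc_range(4096); // served from the local cache
  std::printf("reused: %s\n", p == q ? "yes" : "no");
  std::free(q); // hand the block back directly, just to end the example cleanly
  return 0;
}

Composing layers through template parameters like this keeps each policy (commit, decay, buddy allocation) testable in isolation while the concrete stack is still fixed at compile time.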

src/backend/decayrange.h

Lines changed: 351 additions & 0 deletions
#pragma once

#include "../ds/ptrwrap.h"
#include "../pal/pal_ds.h"
#include "largebuddyrange.h"

namespace snmalloc
{
  template<SNMALLOC_CONCEPT(RBRep) Rep>
  class RepList
  {
    uintptr_t head = 0;

    RepList(uintptr_t head) : head(head) {}

  public:
    constexpr RepList() = default;

    [[nodiscard]] bool is_empty() const
    {
      return head == 0;
    }

    RepList get_next()
    {
      SNMALLOC_ASSERT(!is_empty());
      auto next_field = &(Rep::ref(false, head));
      auto next = Rep::get(next_field);
      return {next};
    }

    capptr::Chunk<void> get_capability()
    {
      return capptr::Chunk<void>(reinterpret_cast<void*>(head));
    }

    RepList cons(capptr::Chunk<void> new_head_cap)
    {
      auto new_head = new_head_cap.unsafe_uintptr();
      auto field = &(Rep::ref(false, new_head));
      Rep::set(field, head);
      return {new_head};
    }

    template<typename F>
    void forall(F f)
    {
      auto curr = *this;
      while (!curr.is_empty())
      {
        auto next = curr.get_next();

        f(curr.get_capability());

        curr = next;
      }
    }
  };

  /**
   * Concurrent Stack
   *
   * This stack supports the following clients:
   *   (push|pop)* || pop_all* || ... || pop_all*
   *
   * That is, a single thread that can push and pop, and any number of other
   * threads that can pop_all.  If pop_all returns a value, it returns the
   * whole stack; however, it may return an empty list if it races with
   * either a push or a pop.
   *
   * The primary use case is single-threaded access, where other threads
   * can attempt to steal all the values.
   */
  template<SNMALLOC_CONCEPT(RBRep) Rep>
  class RepStack
  {
    static constexpr auto empty = RepList<Rep>{};

  private:
    alignas(CACHELINE_SIZE) std::atomic<RepList<Rep>> stack{};

    RepList<Rep> take()
    {
      if (stack.load(std::memory_order_relaxed).is_empty())
        return empty;
      return stack.exchange(empty, std::memory_order_acquire);
    }

    void replace(RepList<Rep> new_head)
    {
      SNMALLOC_ASSERT(stack.load().is_empty());
      stack.store(new_head, std::memory_order_release);
    }

  public:
    constexpr RepStack() = default;

    void push(capptr::Chunk<void> new_head_cap)
    {
      auto old_head = take();
      auto new_head = old_head.cons(new_head_cap);
      replace(new_head);
    }

    capptr::Chunk<void> pop()
    {
      auto old_head = take();
      if (old_head.is_empty())
        return nullptr;

      auto next = old_head.get_next();
      replace(next);

      return old_head.get_capability();
    }

    RepList<Rep> pop_all()
    {
      return take();
    }
  };

  /**
   * This range slowly filters memory back to the parent range.
   * It caches memory locally, and memory that has not been used for some
   * time is returned to the parent range.
   */
  template<typename ParentRange, typename PAL, typename Pagemap>
  class DecayRange
  {
    /**
     * The number of slab sizes that can be provided.
     */
    static constexpr size_t NUM_SLAB_SIZES = Pal::address_bits - MIN_CHUNK_BITS;

    /**
     * Number of free stacks per chunk size that each allocator will use.
     * For performance, ideally a power of 2.  We will return to the central
     * pool anything that has not been used in the last NUM_EPOCHS - 1 epochs,
     * where each epoch is separated by DecayMemoryTimerObject::PERIOD.
     * I.e. if the period is 500ms and the number of epochs is 4, then we
     * return to the central pool anything not used for the last 1500-2000ms.
     */
    static constexpr size_t NUM_EPOCHS = 4;
    static_assert(bits::is_pow2(NUM_EPOCHS), "Code assumes power of two.");

    /**
     * Stack of ranges that have been returned for reuse.
     */
    ModArray<
      NUM_SLAB_SIZES,
      ModArray<NUM_EPOCHS, RepStack<BuddyChunkRep<Pagemap>>>>
      chunk_stack;

    typename ParentRange::State parent{};

    /**
     * The current epoch in which deallocated chunks are placed, and the
     * first place we look when allocating chunks.
     */
    static inline // alignas(CACHELINE_SIZE)
      std::atomic<size_t>
        epoch{0};

    /**
     * Flag to ensure one-shot registration with the PAL.
     */
    static inline std::atomic_bool registered_timer{false};

    std::atomic_bool registered_local{false};

    /**
     * All activated DecayRanges.
     */
    static inline std::atomic<DecayRange*> all_local{nullptr};

    DecayRange* all_local_next{nullptr};

    static void handle_decay_tick()
    {
      static_assert(
        ParentRange::ConcurrencySafe,
        "Parent must be concurrency safe, as dealloc_range is called here on "
        "potentially another thread's state.");
      auto new_epoch = (epoch + 1) % NUM_EPOCHS;
      // Flush old index for all threads.
      auto curr = all_local.load(std::memory_order_acquire);
      while (curr != nullptr)
      {
        for (size_t sc = 0; sc < NUM_SLAB_SIZES; sc++)
        {
          auto old_stack = curr->chunk_stack[sc][new_epoch].pop_all();

          old_stack.forall([curr, sc](auto cap) {
            curr->parent->dealloc_range(cap, MIN_CHUNK_SIZE << sc);
          });
        }
        curr = curr->all_local_next;
      }

      // Advance current index
      epoch = new_epoch;
    }

    class DecayMemoryTimerObject : public PalTimerObject
    {
      /**
       * Callback for the timer object; performs the periodic decay tick.
       */
      static void process(PalTimerObject*)
      {
#ifdef SNMALLOC_TRACING
        message<1024>("DecayRange::handle_decay_tick timer");
#endif
        handle_decay_tick();
      }

      // Run the decay callback every 500ms.
      static constexpr size_t PERIOD = 500;

    public:
      constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {}
    };

    static inline DecayMemoryTimerObject timer_object;

  public:
    class State
    {
      DecayRange commit_range{};

    public:
      constexpr State() = default;

      DecayRange* operator->()
      {
        return &commit_range;
      }
    };

    static constexpr bool Aligned = ParentRange::Aligned;

    static constexpr bool ConcurrencySafe = false;

    constexpr DecayRange() = default;

    capptr::Chunk<void> alloc_range(size_t size)
    {
      // Check local cache
      if constexpr (pal_supports<Time, PAL>)
      {
        auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
        // Try local cache of chunks first
        for (size_t e = 0; e < NUM_EPOCHS; e++)
        {
          auto p = chunk_stack[slab_sizeclass][(epoch - e) % NUM_EPOCHS].pop();

          if (p != nullptr)
          {
#ifdef SNMALLOC_TRACING
            message<1024>(
              "DecayRange::alloc_range: returning from local cache: {} on {}",
              address_cast(p),
              this);
#endif
            return p;
          }
        }
      }

      // Loop to possibly flush all the other threads' local caches.
      // Note that flushing passes memory to the parent range, which may
      // consolidate blocks and thus be able to service this request.
      // Alternatively, we could implement stealing, but that wouldn't
      // be able to consolidate.
      capptr::Chunk<void> result;
      for (auto i = NUM_EPOCHS; i > 0; i--)
      {
        // Nothing in the local cache, so allocate from the parent.
        result = parent->alloc_range(size);
        if (result != nullptr)
        {
#ifdef SNMALLOC_TRACING
          message<1024>(
            "DecayRange::alloc_range: returning from parent: {} on {}",
            address_cast(result),
            this);
#endif
          return result;
        }

        // We have run out of memory.
        // Try to return some memory to the parent.
#ifdef SNMALLOC_TRACING
        message<1024>("DecayRange::handle_decay_tick OOM");
#endif
        handle_decay_tick();
      }

      // Last try.
      result = parent->alloc_range(size);

#ifdef SNMALLOC_TRACING
      message<1024>(
        "DecayRange::alloc_range: returning from parent last try: {} on {}",
        address_cast(result),
        this);
#endif

      return result;
    }

    void dealloc_range(capptr::Chunk<void> base, size_t size)
    {
      if constexpr (!pal_supports<Time, PAL>)
      {
        parent->dealloc_range(base, size);
        return;
      }

      if (!registered_timer.exchange(true))
      {
        // Register with the PAL.
        PAL::register_timer(&timer_object);
      }

      // Check that this state is registered in the list of local states.
      if (!registered_local.exchange(true))
      {
        // Add to the list of local states.
        auto* head = all_local.load();
        do
        {
          all_local_next = head;
        } while (!all_local.compare_exchange_strong(head, this));
      }

      auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
      // Add to local cache.
#ifdef SNMALLOC_TRACING
      message<1024>(
        "DecayRange::dealloc_range: returning to local cache: {} on {}",
        address_cast(base),
        this);
#endif
      chunk_stack[slab_sizeclass][epoch].push(base);
    }
  };
} // namespace snmalloc
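To summarise the mechanism in decayrange.h: deallocated chunks are pushed onto the stack for the current epoch of their size class; a PAL timer periodically advances the epoch and flushes whatever is still sitting in the slot it is about to reuse back to the parent range; allocations search the epochs from newest to oldest before falling back to the parent. The sketch below models just that rotation with plain std::vector and an int block handle in place of snmalloc's intrusive RepList/RepStack and real chunk capabilities; EpochCache, decay_tick, and the printed messages are invented for the illustration.

#include <array>
#include <cstddef>
#include <cstdio>
#include <vector>

// Simplified model of DecayRange's epoch rotation.
constexpr std::size_t NUM_EPOCHS = 4;

struct EpochCache
{
  std::array<std::vector<int>, NUM_EPOCHS> stacks; // per-epoch free lists
  std::size_t epoch = 0;

  // Deallocation: remember the block in the current epoch's stack.
  void dealloc(int block)
  {
    stacks[epoch].push_back(block);
  }

  // Allocation: search the epochs from newest to oldest for a cached block.
  bool alloc(int& block)
  {
    for (std::size_t e = 0; e < NUM_EPOCHS; e++)
    {
      auto& s = stacks[(epoch + NUM_EPOCHS - e) % NUM_EPOCHS];
      if (!s.empty())
      {
        block = s.back();
        s.pop_back();
        return true;
      }
    }
    return false;
  }

  // Timer tick: anything still sitting in the slot about to be reused has
  // been unused for between NUM_EPOCHS - 1 and NUM_EPOCHS ticks, so it is
  // returned to the parent range.
  void decay_tick()
  {
    std::size_t next = (epoch + 1) % NUM_EPOCHS;
    for (int block : stacks[next])
      std::printf("returning block %d to parent\n", block);
    stacks[next].clear();
    epoch = next;
  }
};

int main()
{
  EpochCache cache;
  cache.dealloc(1);
  for (int i = 0; i < 4; i++)
    cache.decay_tick(); // after enough ticks, block 1 decays to the parent
  int b;
  std::printf("cache hit: %s\n", cache.alloc(b) ? "yes" : "no");
  return 0;
}

With NUM_EPOCHS = 4 and a 500ms period, a cached block therefore survives locally for roughly 1500-2000ms of disuse before being handed back, matching the comment on NUM_EPOCHS.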
