
Commit 795f505

wdebruij authored and Kernel Patches Daemon committed
bpf: lru: adjust free target to avoid global table starvation
BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it will steal one element from a neighbor once, and from then on each time flush that one element to the global list and immediately recycle it.

Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%).

CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely.

Avoid global list exhaustion by limiting aggregate percpu caches to half of map size, by adjusting LOCAL_FREE_TARGET based on CPU count. This change has no effect on sufficiently large tables.

Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->target_free. The extra field fits in a hole in struct bpf_lru. The cacheline is already warm where read in the hot path. The field is only accessed with the lru lock held.

The tests are updated to pass. Test comments are extensive: updating those is left for a v2 if the approach is considered ok.

Signed-off-by: Willem de Bruijn <[email protected]>
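To make the arithmetic above concrete, here is a minimal standalone userspace sketch. It is not kernel code: clamp_uint() is a local stand-in for the kernel's clamp() macro, and the 10K-element / 79-CPU scenario is the one from the commit message.

#include <stdio.h>

#define LOCAL_FREE_TARGET 128	/* percpu free batch size */

/* userspace stand-in for the kernel's clamp() macro */
static unsigned int clamp_uint(unsigned int val, unsigned int lo,
			       unsigned int hi)
{
	return val < lo ? lo : (val > hi ? hi : val);
}

int main(void)
{
	unsigned int nr_elems = 10000, ncpus = 79;
	unsigned int target_free;

	/* before: each CPU may reserve a full LOCAL_FREE_TARGET batch,
	 * which alone exceeds the map size: 79 * 128 == 10112 > 10000
	 */
	printf("worst-case percpu reservations (before): %u\n",
	       ncpus * LOCAL_FREE_TARGET);

	/* after: aggregate percpu caches are capped at ~half the map */
	target_free = clamp_uint((nr_elems / ncpus) / 2, 1,
				 LOCAL_FREE_TARGET);
	printf("target_free: %u, worst-case reservations (after): %u\n",
	       target_free, ncpus * target_free);	/* 63 and 4977 */

	return 0;
}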
Parent: 13e07b5

3 files changed (+25, -13 lines)

kernel/bpf/bpf_lru_list.c (6 additions & 3 deletions)

@@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
 			    list) {
 		__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
 					    BPF_LRU_LOCAL_LIST_T_FREE);
-		if (++nfree == LOCAL_FREE_TARGET)
+		if (++nfree == lru->target_free)
 			break;
 	}
 
-	if (nfree < LOCAL_FREE_TARGET)
-		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
+	if (nfree < lru->target_free)
+		__bpf_lru_list_shrink(lru, l, lru->target_free - nfree,
 				      local_free_list(loc_l),
 				      BPF_LRU_LOCAL_LIST_T_FREE);
 
@@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
 		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
 		buf += elem_size;
 	}
+
+	lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2,
+				 1, LOCAL_FREE_TARGET);
 }
 
 static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
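The new computation runs once in bpf_common_lru_populate(), so the hot path in bpf_lru_list_pop_free_to_local() only reads lru->target_free, on an already-warm cacheline and with the lru lock held. Some illustrative values, assuming 79 possible CPUs as in the commit message scenario:

	nr_elems    (nr_elems / 79) / 2    target_free
	 100,000                    632            128  (clamped down: large tables see no change)
	  10,000                     63             63
	     100                      0              1  (clamped up so each refill frees at least one element)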

kernel/bpf/bpf_lru_list.h (1 addition & 0 deletions)

@@ -58,6 +58,7 @@ struct bpf_lru {
 	del_from_htab_func del_from_htab;
 	void *del_arg;
 	unsigned int hash_offset;
+	unsigned int target_free;
 	unsigned int nr_scans;
 	bool percpu;
 };

tools/testing/selftests/bpf/test_lru_map.c (18 additions & 10 deletions)

@@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try)
 	return ret;
 }
 
+/* inverse of how bpf_common_lru_populate derives target_free from map_size. */
+static unsigned int __map_size(unsigned int tgt_free)
+{
+	return tgt_free * nr_cpus * 2;
+}
+
 /* Size of the LRU map is 2
  * Add key=1 (+1 key)
  * Add key=2 (+1 key)
@@ -257,7 +263,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	batch_size = tgt_free / 2;
 	assert(batch_size * 2 == tgt_free);
 
-	map_size = tgt_free + batch_size;
+	map_size = __map_size(tgt_free) + batch_size;
 	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
@@ -267,7 +273,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	value[0] = 1234;
 
 	/* Insert 1 to tgt_free (+tgt_free keys) */
-	end_key = 1 + tgt_free;
+	end_key = 1 + __map_size(tgt_free);
 	for (key = 1; key < end_key; key++)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -284,8 +290,8 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
 	 * => 1+tgt_free/2 to LOCALFREE_TARGET will be
 	 * removed by LRU
 	 */
-	key = 1 + tgt_free;
-	end_key = key + tgt_free;
+	key = 1 + __map_size(tgt_free);
+	end_key = key + __map_size(tgt_free);
 	for (; key < end_key; key++) {
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -334,7 +340,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	batch_size = tgt_free / 2;
 	assert(batch_size * 2 == tgt_free);
 
-	map_size = tgt_free + batch_size;
+	map_size = __map_size(tgt_free) + batch_size;
 	lru_map_fd = create_map(map_type, map_flags, map_size);
 	assert(lru_map_fd != -1);
 
@@ -344,7 +350,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	value[0] = 1234;
 
 	/* Insert 1 to tgt_free (+tgt_free keys) */
-	end_key = 1 + tgt_free;
+	end_key = 1 + __map_size(tgt_free);
 	for (key = 1; key < end_key; key++)
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -388,16 +394,17 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
 	value[0] = 1234;
 
 	/* Insert 1+tgt_free to tgt_free*3/2 */
-	end_key = 1 + tgt_free + batch_size;
-	for (key = 1 + tgt_free; key < end_key; key++)
+	key = 1 + __map_size(tgt_free);
+	end_key = key + batch_size;
+	for (; key < end_key; key++)
 		/* These newly added but not referenced keys will be
 		 * gone during the next LRU shrink.
 		 */
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
 
 	/* Insert 1+tgt_free*3/2 to tgt_free*5/2 */
-	end_key = key + tgt_free;
+	end_key += __map_size(tgt_free);
 	for (; key < end_key; key++) {
 		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
 					    BPF_NOEXIST));
@@ -500,7 +507,8 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
 		lru_map_fd = create_map(map_type, map_flags,
 					3 * tgt_free * nr_cpus);
 	else
-		lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free);
+		lru_map_fd = create_map(map_type, map_flags,
+					3 * __map_size(tgt_free));
 	assert(lru_map_fd != -1);
 
 	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0,
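As a sanity check on the new helper, a small standalone sketch. It hardcodes nr_cpus to 79 for illustration (the real selftests detect it at runtime), and derive_target_free() is a hypothetical mirror of the derivation in bpf_common_lru_populate(). It shows that __map_size() round-trips exactly for any tgt_free up to LOCAL_FREE_TARGET:

#include <assert.h>

#define LOCAL_FREE_TARGET 128

static unsigned int nr_cpus = 79;	/* illustrative; tests query this */

/* mirrors the selftest helper */
static unsigned int __map_size(unsigned int tgt_free)
{
	return tgt_free * nr_cpus * 2;
}

/* mirrors bpf_common_lru_populate()'s derivation */
static unsigned int derive_target_free(unsigned int nr_elems)
{
	unsigned int val = (nr_elems / nr_cpus) / 2;

	return val < 1 ? 1 : (val > LOCAL_FREE_TARGET ? LOCAL_FREE_TARGET : val);
}

int main(void)
{
	unsigned int tgt;

	/* round-trip is exact: * nr_cpus * 2 cancels / nr_cpus / 2,
	 * and [1, LOCAL_FREE_TARGET] keeps the clamp a no-op
	 */
	for (tgt = 1; tgt <= LOCAL_FREE_TARGET; tgt++)
		assert(derive_target_free(__map_size(tgt)) == tgt);

	return 0;
}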
