Skip to content

Commit e6135df

Browse files
author
Alexei Starovoitov
committed
Merge branch 'hashmap_iter_bucket_lock_fix'
Yonghong Song says: ==================== Currently, the bpf hashmap iterator takes a bucket_lock, a spin_lock, before visiting each element in the bucket. This will cause a deadlock if a map update/delete operates on an element with the same bucket id as the visited map. To avoid the deadlock, let us just use rcu_read_lock instead of bucket_lock. This may result in visiting stale elements, missing some elements, or repeating some elements, if a concurrent map delete/update happens for the same map. I think using rcu_read_lock is a reasonable compromise. For users who care about stale/missing/repeating element issues, the bpf map batch access syscall interface can be used. Note that another approach is, during the bpf_iter link stage, to check whether the iter program might be able to do an update/delete to the visited map. If it is, reject the link_create. The verifier needs to record whether an update/delete operation happens for each map for this approach. I just feel this checking is too specialized, hence I still prefer the rcu_read_lock approach. Patch #1 has the kernel implementation and Patch #2 adds a selftest which can trigger the deadlock without Patch #1. ==================== Signed-off-by: Alexei Starovoitov <[email protected]>
2 parents 21e9ba5 + 4daab71 commit e6135df

File tree

2 files changed

+19
-11
lines changed

2 files changed

+19
-11
lines changed

kernel/bpf/hashtab.c

+4-11
Original file line numberDiff line numberDiff line change
@@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info {
16221622
struct bpf_map *map;
16231623
struct bpf_htab *htab;
16241624
void *percpu_value_buf; // non-zero means percpu hash
1625-
unsigned long flags;
16261625
u32 bucket_id;
16271626
u32 skip_elems;
16281627
};
@@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
16321631
struct htab_elem *prev_elem)
16331632
{
16341633
const struct bpf_htab *htab = info->htab;
1635-
unsigned long flags = info->flags;
16361634
u32 skip_elems = info->skip_elems;
16371635
u32 bucket_id = info->bucket_id;
16381636
struct hlist_nulls_head *head;
@@ -1656,27 +1654,26 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
16561654

16571655
/* not found, unlock and go to the next bucket */
16581656
b = &htab->buckets[bucket_id++];
1659-
htab_unlock_bucket(htab, b, flags);
1657+
rcu_read_unlock();
16601658
skip_elems = 0;
16611659
}
16621660

16631661
for (i = bucket_id; i < htab->n_buckets; i++) {
16641662
b = &htab->buckets[i];
1665-
flags = htab_lock_bucket(htab, b);
1663+
rcu_read_lock();
16661664

16671665
count = 0;
16681666
head = &b->head;
16691667
hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
16701668
if (count >= skip_elems) {
1671-
info->flags = flags;
16721669
info->bucket_id = i;
16731670
info->skip_elems = count;
16741671
return elem;
16751672
}
16761673
count++;
16771674
}
16781675

1679-
htab_unlock_bucket(htab, b, flags);
1676+
rcu_read_unlock();
16801677
skip_elems = 0;
16811678
}
16821679

@@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
17541751

17551752
static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
17561753
{
1757-
struct bpf_iter_seq_hash_map_info *info = seq->private;
1758-
17591754
if (!v)
17601755
(void)__bpf_hash_map_seq_show(seq, NULL);
17611756
else
1762-
htab_unlock_bucket(info->htab,
1763-
&info->htab->buckets[info->bucket_id],
1764-
info->flags);
1757+
rcu_read_unlock();
17651758
}
17661759

17671760
static int bpf_iter_init_hash_map(void *priv_data,

tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c

+15
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,10 @@ int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx)
4747
__u32 seq_num = ctx->meta->seq_num;
4848
struct bpf_map *map = ctx->map;
4949
struct key_t *key = ctx->key;
50+
struct key_t tmp_key;
5051
__u64 *val = ctx->value;
52+
__u64 tmp_val = 0;
53+
int ret;
5154

5255
if (in_test_mode) {
5356
/* test mode is used by selftests to
@@ -61,6 +64,18 @@ int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx)
6164
if (key == (void *)0 || val == (void *)0)
6265
return 0;
6366

67+
/* update the value and then delete the <key, value> pair.
68+
* it should not impact the existing 'val' which is still
69+
* accessible under rcu.
70+
*/
71+
__builtin_memcpy(&tmp_key, key, sizeof(struct key_t));
72+
ret = bpf_map_update_elem(&hashmap1, &tmp_key, &tmp_val, 0);
73+
if (ret)
74+
return 0;
75+
ret = bpf_map_delete_elem(&hashmap1, &tmp_key);
76+
if (ret)
77+
return 0;
78+
6479
key_sum_a += key->a;
6580
key_sum_b += key->b;
6681
key_sum_c += key->c;

0 commit comments

Comments
 (0)