
Commit dc0988b

yonghong-song authored and Alexei Starovoitov committed
bpf: Do not use bucket_lock for hashmap iterator
Currently, for hashmap, the bpf iterator grabs a bucket lock, a spinlock, before traversing the elements in the bucket. This ensures all bpf-visited elements are valid. But this mechanism may cause deadlock if an update/deletion happens to the same bucket of the visited map in the program. For example, if we add a bpf_map_update_elem() call on the visited element in selftests bpf_iter_bpf_hash_map.c, we get the following deadlock:

  ============================================
  WARNING: possible recursive locking detected
  5.9.0-rc1+ #841 Not tainted
  --------------------------------------------
  test_progs/1750 is trying to acquire lock:
  ffff9a5bb73c5e70 (&htab->buckets[i].raw_lock){....}-{2:2}, at: htab_map_update_elem+0x1cf/0x410

  but task is already holding lock:
  ffff9a5bb73c5e20 (&htab->buckets[i].raw_lock){....}-{2:2}, at: bpf_hash_map_seq_find_next+0x94/0x120

  other info that might help us debug this:
   Possible unsafe locking scenario:

         CPU0
         ----
    lock(&htab->buckets[i].raw_lock);
    lock(&htab->buckets[i].raw_lock);

   *** DEADLOCK ***
  ...
  Call Trace:
   dump_stack+0x78/0xa0
   __lock_acquire.cold.74+0x209/0x2e3
   lock_acquire+0xba/0x380
   ? htab_map_update_elem+0x1cf/0x410
   ? __lock_acquire+0x639/0x20c0
   _raw_spin_lock_irqsave+0x3b/0x80
   ? htab_map_update_elem+0x1cf/0x410
   htab_map_update_elem+0x1cf/0x410
   ? lock_acquire+0xba/0x380
   bpf_prog_ad6dab10433b135d_dump_bpf_hash_map+0x88/0xa9c
   ? find_held_lock+0x34/0xa0
   bpf_iter_run_prog+0x81/0x16e
   __bpf_hash_map_seq_show+0x145/0x180
   bpf_seq_read+0xff/0x3d0
   vfs_read+0xad/0x1c0
   ksys_read+0x5f/0xe0
   do_syscall_64+0x33/0x40
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  ...

The bucket_lock is first grabbed in seq_ops->next(), called by bpf_seq_read(), and then grabbed again in htab_map_update_elem() in the bpf program, causing the deadlock.

We actually do not need the bucket_lock here; we can just use rcu_read_lock(), similar to the netlink iterator, with rcu_read_{lock,unlock} placed like below:

  seq_ops->start():
      rcu_read_lock();
  seq_ops->next():
      rcu_read_unlock();
      /* next element */
      rcu_read_lock();
  seq_ops->stop():
      rcu_read_unlock();

Compared to the old bucket_lock mechanism, if a concurrent update/delete happens, we may visit stale elements, miss some elements, or repeat some elements. I think this is a reasonable compromise. For users wanting to avoid stale, missing/repeated accesses, the bpf_map batch access syscall interface can be used.

Signed-off-by: Yonghong Song <[email protected]>
Signed-off-by: Alexei Starovoitov <[email protected]>
Link: https://lore.kernel.org/bpf/[email protected]
1 parent 21e9ba5 commit dc0988b
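
For context, the splat above can be reproduced by an iterator program that writes back to the map it is visiting. Below is a minimal sketch, loosely modeled on the selftest bpf_iter_bpf_hash_map.c mentioned in the commit message; the map name, layout, and update logic are illustrative, not the actual selftest:

    /* Sketch only: an iter/bpf_map_elem program that calls bpf_map_update_elem()
     * on the element currently being visited.  With the old bucket_lock scheme
     * the update re-enters the bucket spinlock already held by seq_ops->next();
     * with the rcu_read_lock() scheme from this commit it no longer deadlocks.
     */
    #include "bpf_iter.h"            /* selftests-local helper header */
    #include <bpf/bpf_helpers.h>

    char _license[] SEC("license") = "GPL";

    struct {
            __uint(type, BPF_MAP_TYPE_HASH);
            __uint(max_entries, 3);
            __type(key, __u64);
            __type(value, __u64);
    } hashmap1 SEC(".maps");

    SEC("iter/bpf_map_elem")
    int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx)
    {
            __u64 *key = ctx->key;
            __u64 *val = ctx->value;
            __u64 new_val;

            if (!key || !val)
                    return 0;

            /* update the element we are currently standing on */
            new_val = *val + 1;
            bpf_map_update_elem(&hashmap1, key, &new_val, BPF_ANY);

            return 0;
    }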

File tree: 1 file changed, +4 −11 lines changed

kernel/bpf/hashtab.c

Lines changed: 4 additions & 11 deletions
@@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info {
 	struct bpf_map *map;
 	struct bpf_htab *htab;
 	void *percpu_value_buf; // non-zero means percpu hash
-	unsigned long flags;
 	u32 bucket_id;
 	u32 skip_elems;
 };
@@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
 			   struct htab_elem *prev_elem)
 {
 	const struct bpf_htab *htab = info->htab;
-	unsigned long flags = info->flags;
 	u32 skip_elems = info->skip_elems;
 	u32 bucket_id = info->bucket_id;
 	struct hlist_nulls_head *head;
@@ -1656,27 +1654,26 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
 
 		/* not found, unlock and go to the next bucket */
 		b = &htab->buckets[bucket_id++];
-		htab_unlock_bucket(htab, b, flags);
+		rcu_read_unlock();
 		skip_elems = 0;
 	}
 
 	for (i = bucket_id; i < htab->n_buckets; i++) {
 		b = &htab->buckets[i];
-		flags = htab_lock_bucket(htab, b);
+		rcu_read_lock();
 
 		count = 0;
 		head = &b->head;
 		hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
 			if (count >= skip_elems) {
-				info->flags = flags;
 				info->bucket_id = i;
 				info->skip_elems = count;
 				return elem;
 			}
 			count++;
 		}
 
-		htab_unlock_bucket(htab, b, flags);
+		rcu_read_unlock();
 		skip_elems = 0;
 	}
 
@@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
 
 static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
 {
-	struct bpf_iter_seq_hash_map_info *info = seq->private;
-
 	if (!v)
 		(void)__bpf_hash_map_seq_show(seq, NULL);
 	else
-		htab_unlock_bucket(info->htab,
-				   &info->htab->buckets[info->bucket_id],
-				   info->flags);
+		rcu_read_unlock();
 }
 
 static int bpf_iter_init_hash_map(void *priv_data,

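The commit message points users who need consistent element access (no stale, missed, or repeated elements) at the bpf_map batch syscall interface rather than the iterator. A minimal userspace sketch, assuming libbpf's bpf_map_lookup_batch() wrapper and a hash map with __u64 keys and values (the function name and batch size are illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <bpf/bpf.h>

    /* Sketch only: dump a BPF_MAP_TYPE_HASH in batches.  The batch lookup
     * copies each bucket's elements under the bucket lock, so it avoids the
     * stale/missed/repeated elements the lockless iterator may produce.
     */
    static int dump_map_batched(int map_fd)
    {
            __u64 keys[64], vals[64];
            __u32 next_batch, count;
            void *in_batch = NULL;
            int err;

            for (;;) {
                    count = 64;
                    err = bpf_map_lookup_batch(map_fd, in_batch, &next_batch,
                                               keys, vals, &count, NULL);
                    if (err && err != -ENOENT && errno != ENOENT)
                            return err;      /* real failure */

                    for (__u32 i = 0; i < count; i++)
                            printf("key %llu -> value %llu\n",
                                   (unsigned long long)keys[i],
                                   (unsigned long long)vals[i]);

                    if (err)                 /* ENOENT: no more elements */
                            return 0;
                    in_batch = &next_batch;  /* resume from previous position */
            }
    }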