Skip to content

Commit 7f36642

Browse files
committed
netfilter: nf_tables: GC transaction API to avoid race with control plane
jira VULN-429 cve CVE-2023-4244 commit-author Pablo Neira Ayuso <[email protected]>' commit 5f68718 upstream-diff - Upstream fuzz and conflicts. Resolved by pointing to the rocky8_10 branch as the source of truth. The set types rhashtable and rbtree use a GC worker to reclaim memory. From system work queue, in periodic intervals, a scan of the table is done. The major caveat here is that the nft transaction mutex is not held. This causes a race between control plane and GC when they attempt to delete the same element. We cannot grab the netlink mutex from the work queue, because the control plane has to wait for the GC work queue in case the set is to be removed, so we get following deadlock: cpu 1 cpu2 GC work transaction comes in , lock nft mutex `acquire nft mutex // BLOCKS transaction asks to remove the set set destruction calls cancel_work_sync() cancel_work_sync will now block forever, because it is waiting for the mutex the caller already owns. This patch adds a new API that deals with garbage collection in two steps: 1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT so they are not visible via lookup. Annotate current GC sequence in the GC transaction. Enqueue GC transaction work as soon as it is full. If ruleset is updated, then GC transaction is aborted and retried later. 2) GC work grabs the mutex. If GC sequence has changed then this GC transaction lost race with control plane, abort it as it contains stale references to objects and let GC try again later. If the ruleset is intact, then this GC transaction deactivates and removes the elements and it uses call_rcu() to destroy elements. Note that no elements are removed from GC lockless path, the _DEAD bit is set and pointers are collected. GC catchall does not remove the elements anymore too. There is a new set->dead flag that is set on to abort the GC transaction to deal with set->ops->destroy() path which removes the remaining elements in the set from commit_release, where no mutex is held. To deal with GC when mutex is held, which allows safe deactivate and removal, add sync GC API which releases the set element object via call_rcu(). This is used by rbtree and pipapo backends which also perform garbage collection from control plane path. Since element removal from sets can happen from control plane and element garbage collection/timeout, it is necessary to keep the set structure alive until all elements have been deactivated and destroyed. We cannot do a cancel_work_sync or flush_work in nft_set_destroy because its called with the transaction mutex held, but the aforementioned async work queue might be blocked on the very mutex that nft_set_destroy() callchain is sitting on. This gives us the choice of ABBA deadlock or UaF. To avoid both, add set->refs refcount_t member. The GC API can then increment the set refcount and release it once the elements have been free'd. Set backends are adapted to use the GC transaction API in a follow up patch entitled: ("netfilter: nf_tables: use gc transaction API in set backends") This is joint work with Florian Westphal. Fixes: cfed7e1 ("netfilter: nf_tables: add set garbage collection helpers") Signed-off-by: Pablo Neira Ayuso <[email protected]>
1 parent 332c446 commit 7f36642

File tree

2 files changed

+275
-8
lines changed

2 files changed

+275
-8
lines changed

include/net/netfilter/nf_tables.h

+66-3
Original file line numberDiff line numberDiff line change
@@ -445,6 +445,7 @@ struct nft_set_elem_expr {
445445
*
446446
* @list: table set list node
447447
* @bindings: list of set bindings
448+
* @refs: internal refcounting for async set destruction
448449
* @table: table this set belongs to
449450
* @net: netnamespace this set belongs to
450451
* @name: name of the set
@@ -474,6 +475,7 @@ struct nft_set_elem_expr {
474475
struct nft_set {
475476
struct list_head list;
476477
struct list_head bindings;
478+
refcount_t refs;
477479
struct nft_table *table;
478480
possible_net_t net;
479481
char *name;
@@ -495,7 +497,8 @@ struct nft_set {
495497
struct list_head pending_update;
496498
/* runtime data below here */
497499
const struct nft_set_ops *ops ____cacheline_aligned;
498-
u16 flags:14,
500+
u16 flags:13,
501+
dead:1,
499502
genmask:2;
500503
u8 klen;
501504
u8 dlen;
@@ -1444,6 +1447,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext)
14441447
clear_bit(NFT_SET_ELEM_BUSY_BIT, word);
14451448
}
14461449

1450+
#define NFT_SET_ELEM_DEAD_MASK (1 << 3)
1451+
1452+
#if defined(__LITTLE_ENDIAN_BITFIELD)
1453+
#define NFT_SET_ELEM_DEAD_BIT 3
1454+
#elif defined(__BIG_ENDIAN_BITFIELD)
1455+
#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3)
1456+
#else
1457+
#error
1458+
#endif
1459+
1460+
static inline void nft_set_elem_dead(struct nft_set_ext *ext)
1461+
{
1462+
unsigned long *word = (unsigned long *)ext;
1463+
1464+
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
1465+
set_bit(NFT_SET_ELEM_DEAD_BIT, word);
1466+
}
1467+
1468+
static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
1469+
{
1470+
unsigned long *word = (unsigned long *)ext;
1471+
1472+
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
1473+
return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
1474+
}
1475+
14471476
/**
14481477
* struct nft_trans - nf_tables object update in transaction
14491478
*
@@ -1546,6 +1575,35 @@ struct nft_trans_flowtable {
15461575
#define nft_trans_flowtable(trans) \
15471576
(((struct nft_trans_flowtable *)trans->data)->flowtable)
15481577

1578+
#define NFT_TRANS_GC_BATCHCOUNT 256
1579+
1580+
struct nft_trans_gc {
1581+
struct list_head list;
1582+
struct net *net;
1583+
struct nft_set *set;
1584+
u32 seq;
1585+
u8 count;
1586+
void *priv[NFT_TRANS_GC_BATCHCOUNT];
1587+
struct rcu_head rcu;
1588+
};
1589+
1590+
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
1591+
unsigned int gc_seq, gfp_t gfp);
1592+
void nft_trans_gc_destroy(struct nft_trans_gc *trans);
1593+
1594+
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
1595+
unsigned int gc_seq, gfp_t gfp);
1596+
void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);
1597+
1598+
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
1599+
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);
1600+
1601+
void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);
1602+
1603+
void nft_setelem_data_deactivate(const struct net *net,
1604+
const struct nft_set *set,
1605+
struct nft_set_elem *elem);
1606+
15491607
int __init nft_chain_filter_init(void);
15501608
void nft_chain_filter_fini(void);
15511609

@@ -1564,8 +1622,8 @@ static inline int nft_request_module(struct net *net, const char *fmt, ...) { re
15641622
#endif
15651623

15661624
struct nftables_pernet {
1567-
u64 tstamp;
1568-
unsigned int gc_seq;
1625+
u64 tstamp;
1626+
unsigned int gc_seq;
15691627
};
15701628

15711629
extern unsigned int nf_tables_net_id;
@@ -1575,4 +1633,9 @@ static inline struct nftables_pernet *nft_pernet(const struct net *net)
15751633
return net_generic(net, nf_tables_net_id);
15761634
}
15771635

1636+
static inline u64 nft_net_tstamp(const struct net *net)
1637+
{
1638+
return nft_pernet(net)->tstamp;
1639+
}
1640+
15781641
#endif /* _NET_NF_TABLES_H */

0 commit comments

Comments
 (0)