Skip to content

Commit 95ec819

Browse files
author
Paolo Abeni
committed
net: introduce per netns packet chains
JIRA: https://issues.redhat.com/browse/RHEL-88921 Tested: vs issue reproducer Conflicts: rhel-9 lacks the hotdata optimization and ptype_all lives in the global scope; update all the affected chunks accordingly. Different context in preinit_net as rhel-9 lacks, among others, the upstream commit 76aed95 ("rtnetlink: Add per-netns RTNL."). Upstream commit: commit c353e89 Author: Paolo Abeni <pabeni@redhat.com> Date: Thu Mar 20 19:22:38 2025 +0100 net: introduce per netns packet chains Currently network taps unbound to any interface are linked in the global ptype_all list, affecting the performance in all the network namespaces. Add per netns ptypes chains, so that in the mentioned case only the netns owning the packet socket(s) is affected. While at that, drop the global ptype_all list: no in-kernel user registers a tap on "any" type without specifying either the target device or the target namespace (and IMHO doing that would not make any sense). Note that this adds a conditional in the fast path (to check for per netns ptype_specific list) and increases the dataset size by a cacheline (owing to the per netns lists). Reviewed-by: Sabrina Dubroca <[email protected]> Signed-off-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Eric Dumazet <[email protected]> Link: https://patch.msgid.link/ae405f98875ee87f8150c460ad162de7e466f8a7.1742494826.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski <[email protected]> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
1 parent 2a96a5d commit 95ec819

File tree

5 files changed

+80
-21
lines changed

5 files changed

+80
-21
lines changed

include/linux/netdevice.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4157,7 +4157,17 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev,
41574157
return 0;
41584158
}
41594159

4160-
bool dev_nit_active(struct net_device *dev);
4160+
bool dev_nit_active_rcu(const struct net_device *dev);
4161+
static inline bool dev_nit_active(const struct net_device *dev)
4162+
{
4163+
bool ret;
4164+
4165+
rcu_read_lock();
4166+
ret = dev_nit_active_rcu(dev);
4167+
rcu_read_unlock();
4168+
return ret;
4169+
}
4170+
41614171
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
41624172

41634173
static inline void __dev_put(struct net_device *dev)

include/net/net_namespace.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,9 @@ struct net {
8383
struct llist_node defer_free_list;
8484
struct llist_node cleanup_list; /* namespaces on death row */
8585

86+
struct list_head ptype_all;
87+
struct list_head ptype_specific;
88+
8689
#ifdef CONFIG_KEYS
8790
struct key_tag *key_domain; /* Key domain of operation tag */
8891
#endif

net/core/dev.c

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,6 @@
164164

165165
static DEFINE_SPINLOCK(ptype_lock);
166166
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
167-
struct list_head ptype_all __read_mostly; /* Taps */
168167

169168
static int netif_rx_internal(struct sk_buff *skb);
170169
static int call_netdevice_notifiers_extack(unsigned long val,
@@ -569,10 +568,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
569568

570569
static inline struct list_head *ptype_head(const struct packet_type *pt)
571570
{
572-
if (pt->type == htons(ETH_P_ALL))
573-
return pt->dev ? &pt->dev->ptype_all : &ptype_all;
574-
else
575-
return pt->dev ? &pt->dev->ptype_specific :
571+
if (pt->type == htons(ETH_P_ALL)) {
572+
if (!pt->af_packet_net && !pt->dev)
573+
return NULL;
574+
575+
return pt->dev ? &pt->dev->ptype_all :
576+
&pt->af_packet_net->ptype_all;
577+
}
578+
579+
if (pt->dev)
580+
return &pt->dev->ptype_specific;
581+
582+
return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
576583
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
577584
}
578585

@@ -593,6 +600,9 @@ void dev_add_pack(struct packet_type *pt)
593600
{
594601
struct list_head *head = ptype_head(pt);
595602

603+
if (WARN_ON_ONCE(!head))
604+
return;
605+
596606
spin_lock(&ptype_lock);
597607
list_add_rcu(&pt->list, head);
598608
spin_unlock(&ptype_lock);
@@ -617,6 +627,9 @@ void __dev_remove_pack(struct packet_type *pt)
617627
struct list_head *head = ptype_head(pt);
618628
struct packet_type *pt1;
619629

630+
if (!head)
631+
return;
632+
620633
spin_lock(&ptype_lock);
621634

622635
list_for_each_entry(pt1, head, list) {
@@ -2301,15 +2314,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
23012314
}
23022315

23032316
/**
2304-
* dev_nit_active - return true if any network interface taps are in use
2317+
* dev_nit_active_rcu - return true if any network interface taps are in use
2318+
*
2319+
* The caller must hold the RCU lock
23052320
*
23062321
* @dev: network device to check for the presence of taps
23072322
*/
2308-
bool dev_nit_active(struct net_device *dev)
2323+
bool dev_nit_active_rcu(const struct net_device *dev)
23092324
{
2310-
return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2325+
/* Callers may hold either RCU or RCU BH lock */
2326+
WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
2327+
2328+
return !list_empty(&dev_net(dev)->ptype_all) ||
2329+
!list_empty(&dev->ptype_all);
23112330
}
2312-
EXPORT_SYMBOL_GPL(dev_nit_active);
2331+
EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
23132332

23142333
/*
23152334
* Support routine. Sends outgoing frames to any network
@@ -2321,9 +2340,10 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
23212340
struct packet_type *ptype;
23222341
struct sk_buff *skb2 = NULL;
23232342
struct packet_type *pt_prev = NULL;
2324-
struct list_head *ptype_list = &ptype_all;
2343+
struct list_head *ptype_list;
23252344

23262345
rcu_read_lock();
2346+
ptype_list = &dev_net_rcu(dev)->ptype_all;
23272347
again:
23282348
list_for_each_entry_rcu(ptype, ptype_list, list) {
23292349
if (READ_ONCE(ptype->ignore_outgoing))
@@ -2367,7 +2387,7 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
23672387
pt_prev = ptype;
23682388
}
23692389

2370-
if (ptype_list == &ptype_all) {
2390+
if (ptype_list != &dev->ptype_all) {
23712391
ptype_list = &dev->ptype_all;
23722392
goto again;
23732393
}
@@ -3581,7 +3601,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
35813601
unsigned int len;
35823602
int rc;
35833603

3584-
if (dev_nit_active(dev))
3604+
if (dev_nit_active_rcu(dev))
35853605
dev_queue_xmit_nit(skb, dev);
35863606

35873607
len = skb->len;
@@ -5445,7 +5465,8 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
54455465
if (pfmemalloc)
54465466
goto skip_taps;
54475467

5448-
list_for_each_entry_rcu(ptype, &ptype_all, list) {
5468+
list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
5469+
list) {
54495470
if (pt_prev)
54505471
ret = deliver_skb(skb, pt_prev, orig_dev);
54515472
pt_prev = ptype;
@@ -5557,6 +5578,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
55575578
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
55585579
&ptype_base[ntohs(type) &
55595580
PTYPE_HASH_MASK]);
5581+
5582+
/* orig_dev and skb->dev could belong to different netns;
5583+
* Even in such case we need to traverse only the list
5584+
* coming from skb->dev, as the ptype owner (packet socket)
5585+
* will use dev_net(skb->dev) to do namespace filtering.
5586+
*/
5587+
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5588+
&dev_net_rcu(skb->dev)->ptype_specific);
55605589
}
55615590

55625591
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
@@ -11921,7 +11950,6 @@ static int __init net_dev_init(void)
1192111950
if (netdev_kobject_init())
1192211951
goto out;
1192311952

11924-
INIT_LIST_HEAD(&ptype_all);
1192511953
for (i = 0; i < PTYPE_HASH_SIZE; i++)
1192611954
INIT_LIST_HEAD(&ptype_base[i]);
1192711955

net/core/net-procfs.c

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -175,12 +175,18 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
175175
}
176176
}
177177

178-
list_for_each_entry_rcu(pt, &ptype_all, list) {
178+
list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
179179
if (i == pos)
180180
return pt;
181181
++i;
182182
}
183183

184+
list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) {
185+
if (i == pos)
186+
return pt;
187+
++i;
188+
}
189+
184190
for (t = 0; t < PTYPE_HASH_SIZE; t++) {
185191
list_for_each_entry_rcu(pt, &ptype_base[t], list) {
186192
if (i == pos)
@@ -200,6 +206,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
200206

201207
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
202208
{
209+
struct net *net = seq_file_net(seq);
203210
struct net_device *dev;
204211
struct packet_type *pt;
205212
struct list_head *nxt;
@@ -223,14 +230,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
223230
}
224231
}
225232

226-
nxt = ptype_all.next;
227-
goto ptype_all;
233+
nxt = net->ptype_all.next;
234+
goto net_ptype_all;
228235
}
229236

230-
if (pt->type == htons(ETH_P_ALL)) {
231-
ptype_all:
232-
if (nxt != &ptype_all)
237+
if (pt->af_packet_net) {
238+
net_ptype_all:
239+
if (nxt != &net->ptype_all && nxt != &net->ptype_specific)
233240
goto found;
241+
242+
if (nxt == &net->ptype_all) {
243+
/* continue with ->ptype_specific if it's not empty */
244+
nxt = net->ptype_specific.next;
245+
if (nxt != &net->ptype_specific)
246+
goto found;
247+
}
248+
234249
hash = 0;
235250
nxt = ptype_base[0].next;
236251
} else

net/core/net_namespace.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,9 @@ EXPORT_SYMBOL_GPL(get_net_ns_by_id);
311311
static __net_init void preinit_net(struct net *net)
312312
{
313313
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");
314+
315+
INIT_LIST_HEAD(&net->ptype_all);
316+
INIT_LIST_HEAD(&net->ptype_specific);
314317
}
315318

316319
/*

0 commit comments

Comments
 (0)