@@ -68,17 +68,26 @@ u64 unprotected_seq = 0;
 
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
-// XXXLIKEWHATEVS -- this should be a map of kptrs.
-// for now use one cpumask consisting of all cpuset cpumasks
-// anded.
-private(cpuset_cpumask) struct bpf_cpumask __kptr *cpuset_cpumask;
 struct layer layers[MAX_LAYERS];
 u32 fallback_cpu;
 u32 layered_root_tgid = 0;
 
 u32 empty_layer_ids[MAX_LAYERS];
 u32 nr_empty_layer_ids;
 
+struct cpumask_box {
+	struct bpf_cpumask __kptr *mask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CONTAINERS);
+	__type(key, u32);
+	__type(value, struct cpumask_box);
+} cpuset_cpumask SEC(".maps");
+
+
+
 UEI_DEFINE(uei);
 
 static inline s32 prio_to_nice(s32 static_prio)
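
Note: the hunk above replaces the single cpuset_cpumask kptr with a per-CPU array keyed by container id, each slot holding one struct bpf_cpumask __kptr. A minimal standalone sketch of that pattern is shown below for reference; the demo_* names are hypothetical, and the scx common BPF headers are assumed for cast_mask() and the cpumask kfuncs, so treat this as an illustration rather than code from the PR.

/* Illustrative sketch: __kptr fields must live inside map values, so the
 * cpumask is wrapped in a small "box" struct and keyed by an index, exactly
 * as the cpuset_cpumask map above does. */
#include <scx/common.bpf.h>

struct demo_box {
	struct bpf_cpumask __kptr *mask;
};

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 16);
	__type(key, u32);
	__type(value, struct demo_box);
} demo_masks SEC(".maps");

/* Test whether @cpu is set in slot @idx of the current CPU's copy. */
static __always_inline bool demo_cpu_in_container(u32 idx, u32 cpu)
{
	struct demo_box *box = bpf_map_lookup_elem(&demo_masks, &idx);

	if (!box || !box->mask)
		return false;
	return bpf_cpumask_test_cpu(cpu, cast_mask(box->mask));
}
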
@@ -489,6 +498,7 @@ struct task_ctx {
 	struct bpf_cpumask __kptr *layered_unprotected_mask;
 	bool all_cpus_allowed;
 	bool cpus_node_aligned;
+	bool cpus_cpuset_aligned;
 	u64 runnable_at;
 	u64 running_at;
 	u64 runtime_avg;
@@ -1340,8 +1350,10 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	 * with open layers on non-saturated machines to avoid possible stalls.
 	 */
 	if ((!taskc->all_cpus_allowed &&
-	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
-	    !layer->nr_cpus) {
+	     !(layer->allow_node_aligned && taskc->cpus_node_aligned) &&
+	     !(enable_container && taskc->cpus_cpuset_aligned)) ||
+	    !layer->nr_cpus) {
+
 		taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
 		/*
 		 * Start a new lo fallback queued region if the DSQ is empty.
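
Note: with the grouping above, a cpumask-restricted task only avoids the low-priority fallback DSQ when it is either node aligned (and the layer allows that) or, with containers enabled, exactly aligned to one of the cpuset masks. Factored into a helper purely for illustration; the helper name is hypothetical, while the fields and the enable_container global are the ones used in the hunk.

/* Illustrative only: the lo-fallback predicate from the hunk above. */
static __always_inline bool wants_lo_fallback(const struct task_ctx *taskc,
					      const struct layer *layer)
{
	bool aligned = (layer->allow_node_aligned && taskc->cpus_node_aligned) ||
		       (enable_container && taskc->cpus_cpuset_aligned);

	return (!taskc->all_cpus_allowed && !aligned) || !layer->nr_cpus;
}
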
@@ -2579,7 +2591,7 @@ void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
 static void refresh_cpus_flags(struct task_ctx *taskc,
 			       const struct cpumask *cpumask)
 {
-	u32 node_id;
+	u32 node_id, container_id;
 
 	if (!all_cpumask) {
 		scx_bpf_error("NULL all_cpumask");
@@ -2596,7 +2608,7 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 
 		if (!(nodec = lookup_node_ctx(node_id)) ||
 		    !(node_cpumask = cast_mask(nodec->cpumask)))
-			return;
+			break;
 
 		/* not llc aligned if partially overlaps */
 		if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
@@ -2605,6 +2617,21 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 			break;
 		}
 	}
+	if (enable_container) {
+		bpf_for(container_id, 0, nr_containers) {
+			struct cpumask_box *box;
+			box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
+			if (!box || !box->mask) {
+				scx_bpf_error("error marking tasks as cpuset aligned");
+				return;
+			}
+			if (bpf_cpumask_equal(cast_mask(box->mask), cpumask)) {
+				taskc->cpus_cpuset_aligned = true;
+				return;
+			}
+		}
+		taskc->cpus_cpuset_aligned = false;
+	}
 }
 
 static int init_cached_cpus(struct cached_cpus *ccpus)
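
Note: refresh_cpus_flags() above reads the map with bpf_map_lookup_elem(), which on a BPF_MAP_TYPE_PERCPU_ARRAY returns only the current CPU's copy; that is why the init path further down populates every CPU's copy via bpf_map_lookup_percpu_elem(). A hypothetical accessor relying on that property, shown only as a sketch:

/* Illustrative only: fetch container @container_id's cpuset mask from the
 * current CPU's copy of the per-CPU map, or NULL if it is not populated. */
static __always_inline const struct cpumask *container_mask(u32 container_id)
{
	struct cpumask_box *box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);

	return (box && box->mask) ? cast_mask(box->mask) : NULL;
}
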
@@ -3264,8 +3291,10 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask, tmptmp;
-	int i, nr_online_cpus, ret, x;
+	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask,
+		*tmp_cpuset_cpumask, *tmp_swap_dst_cpumask;
+	int i, j, cpu, nr_online_cpus, ret;
+	struct cpumask_box *cpumask_box;
 
 	cpumask = bpf_cpumask_create();
 	if (!cpumask)
@@ -3316,28 +3345,41 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 		if (!cpumask)
 			return -ENOMEM;
 
-		bpf_for(x, 0, MAX_CPUS/64) {
+
+		bpf_for(j, 0, MAX_CPUS/64) {
+			// verifier
+			if (i < 0 || i >= MAX_CONTAINERS || j < 0 || j >= (MAX_CPUS / 64)) {
+				bpf_cpumask_release(cpumask);
+				return -1;
+			}
 			// container then cpu bit
-			if (cpuset_fakemasks[i][x] == 1) {
-				bpf_cpumask_set_cpu(x, cpumask);
+			if (cpuset_fakemasks[i][j] == 1) {
+				bpf_cpumask_set_cpu(j, cpumask);
 			}
 		}
 
-		if (cpuset_cpumask) {
-			struct bpf_cpumask *tmp_cpuset_cpumask = bpf_kptr_xchg(&cpuset_cpumask, NULL);
-			if (!tmp_cpuset_cpumask) {
-				bpf_cpumask_release(cpumask);
+
+		// pay init cost once for faster lookups later.
+		bpf_for(cpu, 0, nr_possible_cpus) {
+			cpumask_box = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &i, cpu);
+			tmp_cpuset_cpumask = bpf_cpumask_create();
+
+			if (!cpumask || !tmp_cpuset_cpumask || !cpumask_box) {
+				if (cpumask)
+					bpf_cpumask_release(cpumask);
+				if (tmp_cpuset_cpumask)
+					bpf_cpumask_release(tmp_cpuset_cpumask);
+				scx_bpf_error("cpumask is null");
 				return -1;
 			}
-			bpf_cpumask_and(cpumask, cast_mask(tmp_cpuset_cpumask), cast_mask(cpumask));
-			bpf_cpumask_release(tmp_cpuset_cpumask);
-		}
-
-		struct bpf_cpumask *old_cpumask = bpf_kptr_xchg(&cpuset_cpumask, cpumask);
+			bpf_cpumask_copy(tmp_cpuset_cpumask, cast_mask(cpumask));
 
-		if (old_cpumask) {
-			bpf_cpumask_release(old_cpumask);
+			tmp_swap_dst_cpumask = bpf_kptr_xchg(&cpumask_box->mask, tmp_cpuset_cpumask);
+			if (tmp_swap_dst_cpumask)
+				bpf_cpumask_release(tmp_swap_dst_cpumask);
 		}
+		if (cpumask)
+			bpf_cpumask_release(cpumask);
 
 	}
 }
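
Note: the init loop above follows the usual bpf_cpumask ownership rules: every mask obtained from bpf_cpumask_create() must either be moved into a map value with bpf_kptr_xchg() or released, and any previous occupant returned by the exchange must be released as well. Condensed into a standalone sketch below; the helper is illustrative and not part of the PR, but the calls mirror the ones used in the hunk.

/* Illustrative only: copy @src into a freshly created mask and move it into
 * @box->mask, releasing whatever the exchange hands back so nothing leaks. */
static __always_inline int store_container_mask(struct cpumask_box *box,
						const struct cpumask *src)
{
	struct bpf_cpumask *fresh, *old;

	fresh = bpf_cpumask_create();
	if (!fresh)
		return -ENOMEM;

	bpf_cpumask_copy(fresh, src);
	old = bpf_kptr_xchg(&box->mask, fresh);
	if (old)
		bpf_cpumask_release(old);
	return 0;
}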