@@ -69,17 +69,26 @@ u64 unprotected_seq = 0;
 
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
-// XXXLIKEWHATEVS -- this should be a map of kptrs.
-// for now use one cpumask consisting of all cpuset cpumasks
-// anded.
-private(cpuset_cpumask) struct bpf_cpumask __kptr *cpuset_cpumask;
 struct layer layers[MAX_LAYERS];
 u32 fallback_cpu;
 u32 layered_root_tgid = 0;
 
 u32 empty_layer_ids[MAX_LAYERS];
 u32 nr_empty_layer_ids;
 
+struct cpumask_box {
+	struct bpf_cpumask __kptr *mask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CONTAINERS);
+	__type(key, u32);
+	__type(value, struct cpumask_box);
+} cpuset_cpumask SEC(".maps");
+
+
+
 UEI_DEFINE(uei);
 
 struct task_hint {
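The removed XXXLIKEWHATEVS comment asked for exactly this: instead of one globally ANDed cpuset mask, cpuset_cpumask becomes a BPF_MAP_TYPE_PERCPU_ARRAY keyed by container ID whose value embeds a struct bpf_cpumask kptr. A minimal sketch of the kptr-in-map-value pattern the declaration relies on; the helper name and error handling are illustrative only, not part of the patch:

static int store_container_mask(u32 container_id)
{
	struct cpumask_box *box;
	struct bpf_cpumask *fresh, *old;

	/* On a per-CPU array this returns the current CPU's slot. */
	box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
	if (!box)
		return -ENOENT;

	fresh = bpf_cpumask_create();
	if (!fresh)
		return -ENOMEM;

	/* ... populate @fresh, e.g. with bpf_cpumask_set_cpu() ... */

	/* Publish @fresh into the slot and drop whatever was there before. */
	old = bpf_kptr_xchg(&box->mask, fresh);
	if (old)
		bpf_cpumask_release(old);
	return 0;
}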
@@ -502,6 +511,7 @@ struct task_ctx {
 	struct bpf_cpumask __kptr *layered_unprotected_mask;
 	bool all_cpus_allowed;
 	bool cpus_node_aligned;
+	bool cpus_cpuset_aligned;
 	u64 runnable_at;
 	u64 running_at;
 	u64 runtime_avg;
@@ -1366,8 +1376,10 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	 * with open layers on non-saturated machines to avoid possible stalls.
 	 */
 	if ((!taskc->all_cpus_allowed &&
-	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
-	    !layer->nr_cpus) {
+	     !((layer->allow_node_aligned && taskc->cpus_node_aligned) ||
+	       (enable_container && taskc->cpus_cpuset_aligned))) ||
+	    !layer->nr_cpus) {
+
 		taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
 		/*
 		 * Start a new lo fallback queued region if the DSQ is empty.
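With this change a CPU-restricted task skips the lo fallback DSQ when either exemption applies: the layer allows node-aligned tasks and the task is node aligned, or container mode is on and the task's mask matches one of the cpuset masks exactly. A hypothetical helper, not in the patch, spelling out the predicate used above (enable_container and the struct fields are the ones referenced in this diff):

static bool wants_lo_fallback(const struct task_ctx *taskc,
			      const struct layer *layer)
{
	bool aligned_exempt =
		(layer->allow_node_aligned && taskc->cpus_node_aligned) ||
		(enable_container && taskc->cpus_cpuset_aligned);

	/*
	 * Restricted tasks without an exemption, or layers that currently
	 * own no CPUs, go to the per-CPU lo fallback DSQ.
	 */
	return (!taskc->all_cpus_allowed && !aligned_exempt) || !layer->nr_cpus;
}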
@@ -2627,7 +2639,7 @@ void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
 static void refresh_cpus_flags(struct task_ctx *taskc,
 			       const struct cpumask *cpumask)
 {
-	u32 node_id;
+	u32 node_id, container_id;
 
 	if (!all_cpumask) {
 		scx_bpf_error("NULL all_cpumask");
@@ -2644,7 +2656,7 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 
 		if (!(nodec = lookup_node_ctx(node_id)) ||
 		    !(node_cpumask = cast_mask(nodec->cpumask)))
-			return;
+			break;
 
 		/* not llc aligned if partially overlaps */
 		if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
@@ -2653,6 +2665,21 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 			break;
 		}
 	}
+	if (enable_container) {
+		bpf_for(container_id, 0, nr_containers) {
+			struct cpumask_box *box;
+			box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
+			if (!box || !box->mask) {
+				scx_bpf_error("error marking tasks as cpuset aligned");
+				return;
+			}
+			if (bpf_cpumask_equal(cast_mask(box->mask), cpumask)) {
+				taskc->cpus_cpuset_aligned = true;
+				return;
+			}
+		}
+		taskc->cpus_cpuset_aligned = false;
+	}
 }
 
 static int init_cached_cpus(struct cached_cpus *ccpus)
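A note on the lookup above: cpuset_cpumask is a per-CPU array, and from BPF program context bpf_map_lookup_elem() returns the element belonging to the CPU the program is currently executing on. That is why layered_init() below replicates every container mask onto every possible CPU ("pay init cost once"), keeping this hot-path check a local lookup plus bpf_cpumask_equal(). Condensed into a standalone helper for illustration (hypothetical name, error path omitted):

static bool matches_some_container(const struct cpumask *cpumask)
{
	struct cpumask_box *box;
	u32 id;

	bpf_for(id, 0, nr_containers) {
		/* Per-CPU array lookup: returns this CPU's copy of slot @id. */
		box = bpf_map_lookup_elem(&cpuset_cpumask, &id);
		if (!box || !box->mask)
			return false;
		if (bpf_cpumask_equal(cast_mask(box->mask), cpumask))
			return true;
	}
	return false;
}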
@@ -3320,8 +3347,10 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask, tmptmp;
-	int i, nr_online_cpus, ret, x;
+	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask,
+			   *tmp_cpuset_cpumask, *tmp_swap_dst_cpumask;
+	int i, j, cpu, nr_online_cpus, ret;
+	struct cpumask_box *cpumask_box;
 
 	cpumask = bpf_cpumask_create();
 	if (!cpumask)
@@ -3372,28 +3401,41 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 			if (!cpumask)
 				return -ENOMEM;
 
-			bpf_for(x, 0, MAX_CPUS / 64) {
+
+			bpf_for(j, 0, MAX_CPUS / 64) {
+				// verifier
+				if (i < 0 || i >= MAX_CONTAINERS || j < 0 || j >= (MAX_CPUS / 64)) {
+					bpf_cpumask_release(cpumask);
+					return -1;
+				}
 				// container then cpu bit
-				if (cpuset_fakemasks[i][x] == 1) {
-					bpf_cpumask_set_cpu(x, cpumask);
+				if (cpuset_fakemasks[i][j] == 1) {
+					bpf_cpumask_set_cpu(j, cpumask);
 				}
 			}
 
-			if (cpuset_cpumask) {
-				struct bpf_cpumask *tmp_cpuset_cpumask = bpf_kptr_xchg(&cpuset_cpumask, NULL);
-				if (!tmp_cpuset_cpumask) {
-					bpf_cpumask_release(cpumask);
+
+			// pay init cost once for faster lookups later.
+			bpf_for(cpu, 0, nr_possible_cpus) {
+				cpumask_box = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &i, cpu);
+				tmp_cpuset_cpumask = bpf_cpumask_create();
+
+				if (!cpumask || !tmp_cpuset_cpumask || !cpumask_box) {
+					if (cpumask)
+						bpf_cpumask_release(cpumask);
+					if (tmp_cpuset_cpumask)
+						bpf_cpumask_release(tmp_cpuset_cpumask);
+					scx_bpf_error("cpumask is null");
 					return -1;
 				}
-				bpf_cpumask_and(cpumask, cast_mask(tmp_cpuset_cpumask), cast_mask(cpumask));
-				bpf_cpumask_release(tmp_cpuset_cpumask);
-			}
-
-			struct bpf_cpumask *old_cpumask = bpf_kptr_xchg(&cpuset_cpumask, cpumask);
+				bpf_cpumask_copy(tmp_cpuset_cpumask, cast_mask(cpumask));
 
-			if (old_cpumask) {
-				bpf_cpumask_release(old_cpumask);
+				tmp_swap_dst_cpumask = bpf_kptr_xchg(&cpumask_box->mask, tmp_cpuset_cpumask);
+				if (tmp_swap_dst_cpumask)
+					bpf_cpumask_release(tmp_swap_dst_cpumask);
 			}
+			if (cpumask)
+				bpf_cpumask_release(cpumask);
 
 		}
 	}
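The per-CPU publication step in the last hunk follows a small ownership protocol: build one scratch mask per container from cpuset_fakemasks, then for each possible CPU create a copy, hand it to that CPU's map slot via bpf_kptr_xchg(), release whatever the slot held before, and finally release the scratch mask. A hypothetical helper condensing that step, assuming the cpuset_cpumask map and the nr_possible_cpus global used above:

static int publish_container_mask(u32 container_id, struct bpf_cpumask *src)
{
	struct cpumask_box *box;
	struct bpf_cpumask *copy, *old;
	int cpu;

	bpf_for(cpu, 0, nr_possible_cpus) {
		/* Each CPU gets its own copy so hot-path lookups stay local. */
		box = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &container_id, cpu);
		copy = bpf_cpumask_create();
		if (!box || !copy) {
			if (copy)
				bpf_cpumask_release(copy);
			return -ENOMEM;
		}
		bpf_cpumask_copy(copy, cast_mask(src));
		/* Transfer ownership of @copy to the map; free the old mask. */
		old = bpf_kptr_xchg(&box->mask, copy);
		if (old)
			bpf_cpumask_release(old);
	}
	return 0;
}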