
Commit 8dd4e4d

layered: write cpuset masks to map and check
1 parent 52ae0be commit 8dd4e4d

File tree: 2 files changed (+71, -29 lines)


scheds/rust/scx_layered/src/bpf/main.bpf.c

Lines changed: 66 additions & 24 deletions
@@ -68,17 +68,26 @@ u64 unprotected_seq = 0;
 
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
-// XXXLIKEWHATEVS -- this should be a map of kptrs.
-// for now use one cpumask consisting of all cpuset cpumasks
-// anded.
-private(cpuset_cpumask) struct bpf_cpumask __kptr *cpuset_cpumask;
 struct layer layers[MAX_LAYERS];
 u32 fallback_cpu;
 u32 layered_root_tgid = 0;
 
 u32 empty_layer_ids[MAX_LAYERS];
 u32 nr_empty_layer_ids;
 
+struct cpumask_box {
+	struct bpf_cpumask __kptr *mask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CONTAINERS);
+	__type(key, u32);
+	__type(value, struct cpumask_box);
+} cpuset_cpumask SEC(".maps");
+
+
+
 UEI_DEFINE(uei);
 
 static inline s32 prio_to_nice(s32 static_prio)
@@ -489,6 +498,7 @@ struct task_ctx {
 	struct bpf_cpumask __kptr *layered_unprotected_mask;
 	bool all_cpus_allowed;
 	bool cpus_node_aligned;
+	bool cpus_cpuset_aligned;
 	u64 runnable_at;
 	u64 running_at;
 	u64 runtime_avg;
@@ -1340,8 +1350,10 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	 * with open layers on non-saturated machines to avoid possible stalls.
 	 */
 	if ((!taskc->all_cpus_allowed &&
-	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
-	    !layer->nr_cpus) {
+	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
+	     !(enable_container && taskc->cpus_cpuset_aligned) ||
+	    !layer->nr_cpus) {
+
 		taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
 		/*
 		 * Start a new lo fallback queued region if the DSQ is empty.
@@ -2579,7 +2591,7 @@ void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
 static void refresh_cpus_flags(struct task_ctx *taskc,
 			       const struct cpumask *cpumask)
 {
-	u32 node_id;
+	u32 node_id, container_id;
 
 	if (!all_cpumask) {
 		scx_bpf_error("NULL all_cpumask");
@@ -2596,7 +2608,7 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 
 		if (!(nodec = lookup_node_ctx(node_id)) ||
 		    !(node_cpumask = cast_mask(nodec->cpumask)))
-			return;
+			break;
 
 		/* not llc aligned if partially overlaps */
 		if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
@@ -2605,6 +2617,21 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 			break;
 		}
 	}
+	if (enable_container) {
+		bpf_for(container_id, 0, nr_containers) {
+			struct cpumask_box* box;
+			box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
+			if (!box || !box->mask) {
+				scx_bpf_error("error marking tasks as cpuset aligned");
+				return;
+			}
+			if (bpf_cpumask_equal(cast_mask(box->mask), cpumask)) {
+				taskc->cpus_cpuset_aligned = true;
+				return;
+			}
+		}
+		taskc->cpus_cpuset_aligned = false;
+	}
 }
 
 static int init_cached_cpus(struct cached_cpus *ccpus)
@@ -3264,8 +3291,10 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask, tmptmp;
-	int i, nr_online_cpus, ret, x;
+	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask,
+			   *tmp_cpuset_cpumask, *tmp_swap_dst_cpumask;
+	int i, j, cpu, nr_online_cpus, ret;
+	struct cpumask_box* cpumask_box;
 
 	cpumask = bpf_cpumask_create();
 	if (!cpumask)
@@ -3316,28 +3345,41 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 		if (!cpumask)
 			return -ENOMEM;
 
-		bpf_for(x, 0, MAX_CPUS/64) {
+
+		bpf_for(j, 0, MAX_CPUS/64) {
+			// verifier
+			if (i < 0 || i >= MAX_CONTAINERS || j < 0 || j >= (MAX_CPUS / 64)) {
+				bpf_cpumask_release(cpumask);
+				return -1;
+			}
 			// container then cpu bit
-			if (cpuset_fakemasks[i][x] == 1) {
-				bpf_cpumask_set_cpu(x, cpumask);
+			if (cpuset_fakemasks[i][j] == 1) {
+				bpf_cpumask_set_cpu(j, cpumask);
 			}
 		}
 
-		if (cpuset_cpumask) {
-			struct bpf_cpumask *tmp_cpuset_cpumask = bpf_kptr_xchg(&cpuset_cpumask, NULL);
-			if (!tmp_cpuset_cpumask) {
-				bpf_cpumask_release(cpumask);
+
+		// pay init cost once for faster lookups later.
+		bpf_for(cpu, 0, nr_possible_cpus) {
+			cpumask_box = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &i, cpu);
+			tmp_cpuset_cpumask = bpf_cpumask_create();
+
+			if (!cpumask || !tmp_cpuset_cpumask || !cpumask_box) {
+				if (cpumask)
+					bpf_cpumask_release(cpumask);
+				if (tmp_cpuset_cpumask)
+					bpf_cpumask_release(tmp_cpuset_cpumask);
+				scx_bpf_error("cpumask is null");
 				return -1;
 			}
-			bpf_cpumask_and(cpumask, cast_mask(tmp_cpuset_cpumask), cast_mask(cpumask));
-			bpf_cpumask_release(tmp_cpuset_cpumask);
-		}
-
-		struct bpf_cpumask *old_cpumask = bpf_kptr_xchg(&cpuset_cpumask, cpumask);
+			bpf_cpumask_copy(tmp_cpuset_cpumask, cast_mask(cpumask));
 
-		if (old_cpumask) {
-			bpf_cpumask_release(old_cpumask);
+			tmp_swap_dst_cpumask = bpf_kptr_xchg(&cpumask_box->mask, tmp_cpuset_cpumask);
+			if (tmp_swap_dst_cpumask)
+				bpf_cpumask_release(tmp_swap_dst_cpumask);
 		}
+		if (cpumask)
+			bpf_cpumask_release(cpumask);
 
 	}
 }
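
Note: the core pattern in this change is storing bpf_cpumask kptrs inside map values and swapping them in with bpf_kptr_xchg(). Below is a minimal standalone sketch of that pattern, assuming the cpumask kfunc declarations pulled in by the scx common BPF headers; the map and function names (demo_cpuset_masks, demo_store_mask) are illustrative and not part of this commit.

	#include <scx/common.bpf.h>

	struct mask_box {
		struct bpf_cpumask __kptr *mask;	/* owns one referenced cpumask */
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 4);
		__type(key, u32);
		__type(value, struct mask_box);
	} demo_cpuset_masks SEC(".maps");

	/* Build a one-CPU mask and move ownership of it into map slot idx. */
	static int demo_store_mask(u32 idx, u32 cpu)
	{
		struct bpf_cpumask *new_mask, *old_mask;
		struct mask_box *box;

		box = bpf_map_lookup_elem(&demo_cpuset_masks, &idx);
		if (!box)
			return -ENOENT;

		new_mask = bpf_cpumask_create();
		if (!new_mask)
			return -ENOMEM;
		bpf_cpumask_set_cpu(cpu, new_mask);

		/*
		 * bpf_kptr_xchg() transfers the reference into the map value;
		 * any previously stored mask comes back to us and must be released.
		 */
		old_mask = bpf_kptr_xchg(&box->mask, new_mask);
		if (old_mask)
			bpf_cpumask_release(old_mask);
		return 0;
	}

The commit applies the same idea per container: layered_init() populates every per-CPU copy of the cpuset_cpumask PERCPU_ARRAY up front, so the later bpf_map_lookup_elem() in refresh_cpus_flags() finds a populated mask on whichever CPU it runs.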

scheds/rust/scx_layered/src/main.rs

Lines changed: 5 additions & 5 deletions
@@ -27,6 +27,7 @@ use anyhow::Result;
 pub use bpf_skel::*;
 use clap::Parser;
 use crossbeam::channel::RecvTimeoutError;
+use layer_core_growth::get_cpusets;
 use lazy_static::lazy_static;
 use libbpf_rs::MapCore as _;
 use libbpf_rs::OpenObject;
@@ -59,7 +60,6 @@ use stats::LayerStats;
 use stats::StatsReq;
 use stats::StatsRes;
 use stats::SysStats;
-use layer_core_growth::get_cpusets;
 
 const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
 const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
@@ -587,7 +587,7 @@ struct Opts {
     /// Enable container support
     #[clap(long, default_value = "false")]
     enable_container: bool,
-
+
     /// Maximum task runnable_at delay (in seconds) before antistall turns on
     #[clap(long, default_value = "3")]
     antistall_sec: u64,
@@ -1398,8 +1398,8 @@ impl<'a> Scheduler<'a> {
     fn init_cpusets(skel: &mut OpenBpfSkel, topo: &Topology) -> Result<()> {
         let cpusets = get_cpusets(topo)?;
         for (i, cpuset) in cpusets.iter().enumerate() {
-            let mut cpumask_bitvec: [u64; MAX_CPUS/64] = [0; MAX_CPUS/64];
-            for j in 0..MAX_CPUS/64 {
+            let mut cpumask_bitvec: [u64; MAX_CPUS / 64] = [0; MAX_CPUS / 64];
+            for j in 0..MAX_CPUS / 64 {
                 if cpuset.cpus.contains(&j) {
                     cpumask_bitvec[j] = 1;
                 }
@@ -1919,7 +1919,7 @@ impl<'a> Scheduler<'a> {
 
         Self::init_layers(&mut skel, &layer_specs, &topo)?;
         Self::init_nodes(&mut skel, opts, &topo);
-
+
         if opts.enable_container {
             Self::init_cpusets(&mut skel, &topo)?;
         }
