
Commit a3e086a

layered: write cpuset masks to map and check
1 parent 88c0702 commit a3e086a

File tree

2 files changed, +71 -29 lines changed
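
At a glance, this commit replaces the single ANDed cpuset cpumask kptr with one boxed cpumask per container, stored in a BPF_MAP_TYPE_PERCPU_ARRAY keyed by container id. layered_init() populates every per-CPU slot once, and refresh_cpus_flags() then marks a task cpus_cpuset_aligned only when its cpumask exactly equals one of the stored masks. Below is a condensed, illustrative sketch of that read path, drawn from the hunks that follow; the helper name is made up for illustration and is not part of the patch:

	/* Illustrative only: true iff @cpumask exactly matches some container's cpuset mask. */
	static bool task_matches_some_cpuset(const struct cpumask *cpumask)
	{
		struct cpumask_box *box;
		u32 container_id;

		bpf_for(container_id, 0, nr_containers) {
			/* PERCPU_ARRAY lookup returns this CPU's pre-populated slot. */
			box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
			if (!box || !box->mask)
				return false;
			if (bpf_cpumask_equal(cast_mask(box->mask), cpumask))
				return true;
		}
		return false;
	}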

scheds/rust/scx_layered/src/bpf/main.bpf.c

Lines changed: 66 additions & 24 deletions
@@ -69,17 +69,26 @@ u64 unprotected_seq = 0;
 
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
-// XXXLIKEWHATEVS -- this should be a map of kptrs.
-// for now use one cpumask consisting of all cpuset cpumasks
-// anded.
-private(cpuset_cpumask) struct bpf_cpumask __kptr *cpuset_cpumask;
 struct layer layers[MAX_LAYERS];
 u32 fallback_cpu;
 u32 layered_root_tgid = 0;
 
 u32 empty_layer_ids[MAX_LAYERS];
 u32 nr_empty_layer_ids;
 
+struct cpumask_box {
+	struct bpf_cpumask __kptr *mask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CONTAINERS);
+	__type(key, u32);
+	__type(value, struct cpumask_box);
+} cpuset_cpumask SEC(".maps");
+
+
+
 UEI_DEFINE(uei);
 
 struct task_hint {
@@ -502,6 +511,7 @@ struct task_ctx {
 	struct bpf_cpumask __kptr *layered_unprotected_mask;
 	bool all_cpus_allowed;
 	bool cpus_node_aligned;
+	bool cpus_cpuset_aligned;
 	u64 runnable_at;
 	u64 running_at;
 	u64 runtime_avg;
@@ -1366,8 +1376,10 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	 * with open layers on non-saturated machines to avoid possible stalls.
 	 */
 	if ((!taskc->all_cpus_allowed &&
-	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
-	    !layer->nr_cpus) {
+	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
+	    !(enable_container && taskc->cpus_cpuset_aligned) ||
+	    !layer->nr_cpus) {
+
 		taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
 		/*
 		 * Start a new lo fallback queued region if the DSQ is empty.
@@ -2627,7 +2639,7 @@ void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
 static void refresh_cpus_flags(struct task_ctx *taskc,
 			       const struct cpumask *cpumask)
 {
-	u32 node_id;
+	u32 node_id, container_id;
 
 	if (!all_cpumask) {
 		scx_bpf_error("NULL all_cpumask");
@@ -2644,7 +2656,7 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 
 		if (!(nodec = lookup_node_ctx(node_id)) ||
 		    !(node_cpumask = cast_mask(nodec->cpumask)))
-			return;
+			break;
 
 		/* not llc aligned if partially overlaps */
 		if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
@@ -2653,6 +2665,21 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 			break;
 		}
 	}
+	if (enable_container) {
+		bpf_for(container_id, 0, nr_containers) {
+			struct cpumask_box* box;
+			box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
+			if (!box || !box->mask) {
+				scx_bpf_error("error marking tasks as cpuset aligned");
+				return;
+			}
+			if (bpf_cpumask_equal(cast_mask(box->mask), cpumask)) {
+				taskc->cpus_cpuset_aligned = true;
+				return;
+			}
+		}
+		taskc->cpus_cpuset_aligned = false;
+	}
 }
 
 static int init_cached_cpus(struct cached_cpus *ccpus)
@@ -3320,8 +3347,10 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask, tmptmp;
-	int i, nr_online_cpus, ret, x;
+	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask,
+			   *tmp_cpuset_cpumask, *tmp_swap_dst_cpumask;
+	int i, j, cpu, nr_online_cpus, ret;
+	struct cpumask_box* cpumask_box;
 
 	cpumask = bpf_cpumask_create();
 	if (!cpumask)
@@ -3372,28 +3401,41 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 		if (!cpumask)
 			return -ENOMEM;
 
-		bpf_for(x, 0, MAX_CPUS/64) {
+
+		bpf_for(j, 0, MAX_CPUS/64) {
+			// verifier
+			if (i < 0 || i >= MAX_CONTAINERS || j < 0 || j >= (MAX_CPUS / 64)) {
+				bpf_cpumask_release(cpumask);
+				return -1;
+			}
 			// container then cpu bit
-			if (cpuset_fakemasks[i][x] == 1) {
-				bpf_cpumask_set_cpu(x, cpumask);
+			if (cpuset_fakemasks[i][j] == 1) {
+				bpf_cpumask_set_cpu(j, cpumask);
 			}
 		}
 
-		if (cpuset_cpumask) {
-			struct bpf_cpumask *tmp_cpuset_cpumask = bpf_kptr_xchg(&cpuset_cpumask, NULL);
-			if (!tmp_cpuset_cpumask) {
-				bpf_cpumask_release(cpumask);
+
+		// pay init cost once for faster lookups later.
+		bpf_for(cpu, 0, nr_possible_cpus) {
+			cpumask_box = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &i, cpu);
+			tmp_cpuset_cpumask = bpf_cpumask_create();
+
+			if (!cpumask || !tmp_cpuset_cpumask || !cpumask_box) {
+				if (cpumask)
+					bpf_cpumask_release(cpumask);
+				if (tmp_cpuset_cpumask)
+					bpf_cpumask_release(tmp_cpuset_cpumask);
+				scx_bpf_error("cpumask is null");
 				return -1;
 			}
-			bpf_cpumask_and(cpumask, cast_mask(tmp_cpuset_cpumask), cast_mask(cpumask));
-			bpf_cpumask_release(tmp_cpuset_cpumask);
-		}
-
-		struct bpf_cpumask *old_cpumask = bpf_kptr_xchg(&cpuset_cpumask, cpumask);
+			bpf_cpumask_copy(tmp_cpuset_cpumask, cast_mask(cpumask));
 
-		if (old_cpumask) {
-			bpf_cpumask_release(old_cpumask);
+			tmp_swap_dst_cpumask = bpf_kptr_xchg(&cpumask_box->mask, tmp_cpuset_cpumask);
+			if (tmp_swap_dst_cpumask)
+				bpf_cpumask_release(tmp_swap_dst_cpumask);
 		}
+		if (cpumask)
+			bpf_cpumask_release(cpumask);
 
 	}
 }

scheds/rust/scx_layered/src/main.rs

Lines changed: 5 additions & 5 deletions
@@ -27,6 +27,7 @@ use anyhow::Result;
 pub use bpf_skel::*;
 use clap::Parser;
 use crossbeam::channel::RecvTimeoutError;
+use layer_core_growth::get_cpusets;
 use lazy_static::lazy_static;
 use libbpf_rs::MapCore as _;
 use libbpf_rs::OpenObject;
@@ -59,7 +60,6 @@ use stats::LayerStats;
 use stats::StatsReq;
 use stats::StatsRes;
 use stats::SysStats;
-use layer_core_growth::get_cpusets;
 
 const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
 const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
@@ -594,7 +594,7 @@ struct Opts {
     /// Enable container support
     #[clap(long, default_value = "false")]
     enable_container: bool,
-
+
     /// Maximum task runnable_at delay (in seconds) before antistall turns on
     #[clap(long, default_value = "3")]
    antistall_sec: u64,
@@ -1424,8 +1424,8 @@
     fn init_cpusets(skel: &mut OpenBpfSkel, topo: &Topology) -> Result<()> {
         let cpusets = get_cpusets(topo)?;
         for (i, cpuset) in cpusets.iter().enumerate() {
-            let mut cpumask_bitvec: [u64; MAX_CPUS/64] = [0; MAX_CPUS/64];
-            for j in 0..MAX_CPUS/64 {
+            let mut cpumask_bitvec: [u64; MAX_CPUS / 64] = [0; MAX_CPUS / 64];
+            for j in 0..MAX_CPUS / 64 {
                 if cpuset.cpus.contains(&j) {
                     cpumask_bitvec[j] = 1;
                 }
@@ -1945,7 +1945,7 @@
 
         Self::init_layers(&mut skel, &layer_specs, &topo)?;
         Self::init_nodes(&mut skel, opts, &topo);
-
+
         if opts.enable_container {
             Self::init_cpusets(&mut skel, &topo)?;
         }
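
Worth noting: the userspace side of the new path is opt-in. init_cpusets() only runs when the scheduler is launched with container support enabled, which, assuming the usual clap derive mapping of the enable_container field shown above (the exact flag spelling is inferred, not shown verbatim in this diff), would look roughly like scx_layered --enable-container. Without the flag, init_cpusets() is skipped entirely.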
