
Commit 88c0702
layered: get cpuset support working with masks anded
1 parent: 3e3bf57

5 files changed: 77 additions & 7 deletions


scheds/rust/scx_layered/src/bpf/intf.h

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@ enum consts {
         MAX_TASKS = 131072,
         MAX_PATH = 4096,
         MAX_NUMA_NODES = 64,
+        MAX_CONTAINERS = 64,
         MAX_LLCS = 64,
         MAX_COMM = 16,
         MAX_LAYER_MATCH_ORS = 32,

scheds/rust/scx_layered/src/bpf/main.bpf.c

Lines changed: 44 additions & 2 deletions

@@ -34,9 +34,11 @@ const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
 const volatile u32 llc_numa_id_map[MAX_LLCS];
 const volatile u32 cpu_llc_id_map[MAX_CPUS];
 const volatile u32 nr_layers = 1;
+const volatile u32 nr_containers = 1;
 const volatile u32 nr_nodes = 32; /* !0 for veristat, set during init */
 const volatile u32 nr_llcs = 32; /* !0 for veristat, set during init */
 const volatile bool smt_enabled = true;
+const volatile bool enable_container = true;
 const volatile bool has_little_cores = true;
 const volatile bool xnuma_preemption = false;
 const volatile s32 __sibling_cpu[MAX_CPUS];
@@ -53,6 +55,7 @@ const volatile u64 lo_fb_wait_ns = 5000000; /* !0 for veristat */
 const volatile u32 lo_fb_share_ppk = 128; /* !0 for veristat */
 const volatile bool percpu_kthread_preempt = true;
 volatile u64 layer_refresh_seq_avgruntime;
+const volatile u64 cpuset_fakemasks[MAX_CONTAINERS][MAX_CPUS / 64];

 /* Flag to enable or disable antistall feature */
 const volatile bool enable_antistall = true;
@@ -66,6 +69,10 @@ u64 unprotected_seq = 0;

 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
+// XXXLIKEWHATEVS -- this should be a map of kptrs.
+// for now use one cpumask consisting of all cpuset cpumasks
+// anded.
+private(cpuset_cpumask) struct bpf_cpumask __kptr *cpuset_cpumask;
 struct layer layers[MAX_LAYERS];
 u32 fallback_cpu;
 u32 layered_root_tgid = 0;
@@ -3313,8 +3320,8 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,

 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-        struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask;
-        int i, nr_online_cpus, ret;
+        struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask, tmptmp;
+        int i, nr_online_cpus, ret, x;

         cpumask = bpf_cpumask_create();
         if (!cpumask)
@@ -3356,6 +3363,41 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
         if (tmp_unprotected_cpumask)
                 bpf_cpumask_release(tmp_unprotected_cpumask);

+
+
+        if (enable_container) {
+                bpf_for(i, 0, nr_containers) {
+                        cpumask = bpf_cpumask_create();
+
+                        if (!cpumask)
+                                return -ENOMEM;
+
+                        bpf_for(x, 0, MAX_CPUS/64) {
+                                // container then cpu bit
+                                if (cpuset_fakemasks[i][x] == 1) {
+                                        bpf_cpumask_set_cpu(x, cpumask);
+                                }
+                        }
+
+                        if (cpuset_cpumask) {
+                                struct bpf_cpumask *tmp_cpuset_cpumask = bpf_kptr_xchg(&cpuset_cpumask, NULL);
+                                if (!tmp_cpuset_cpumask) {
+                                        bpf_cpumask_release(cpumask);
+                                        return -1;
+                                }
+                                bpf_cpumask_and(cpumask, cast_mask(tmp_cpuset_cpumask), cast_mask(cpumask));
+                                bpf_cpumask_release(tmp_cpuset_cpumask);
+                        }
+
+                        struct bpf_cpumask *old_cpumask = bpf_kptr_xchg(&cpuset_cpumask, cpumask);
+
+                        if (old_cpumask) {
+                                bpf_cpumask_release(old_cpumask);
+                        }
+
+                }
+        }
+
 bpf_for(i, 0, nr_nodes) {
         ret = create_node(i);
         if (ret)
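Because the BPF side cannot yet keep one kptr per container (the XXXLIKEWHATEVS note above), layered_init folds everything into the single cpuset_cpumask: each iteration decodes one row of cpuset_fakemasks into a fresh bpf_cpumask, swaps the accumulator out with bpf_kptr_xchg, ANDs the two, and swaps the result back in; the first iteration finds the accumulator NULL and simply installs its mask. A minimal userspace sketch of that fold, using plain u64 bitset words in place of bpf_cpumask (the function and test values are illustrative, not from the commit):

```rust
// Simulates layered_init's accumulation: each container contributes a
// cpumask, and the surviving cpuset_cpumask is the AND (intersection)
// of all of them. Plain u64 words stand in for struct bpf_cpumask.
fn and_container_masks(container_masks: &[Vec<u64>]) -> Option<Vec<u64>> {
    let mut acc: Option<Vec<u64>> = None; // mirrors the initially-NULL cpuset_cpumask kptr
    for mask in container_masks {
        acc = Some(match acc {
            // bpf_cpumask_and(cpumask, tmp_cpuset_cpumask, cpumask)
            Some(prev) => prev.iter().zip(mask).map(|(a, b)| a & b).collect(),
            // first container: bpf_kptr_xchg installs the mask as-is
            None => mask.clone(),
        });
    }
    acc
}

fn main() {
    // Two containers: one allows cpus {0,1,2}, the other {1,2,3}.
    let masks = vec![vec![0b0111u64], vec![0b1110u64]];
    // The anded mask keeps only cpus every container may use: {1,2}.
    assert_eq!(and_container_masks(&masks), Some(vec![0b0110u64]));
}
```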

scheds/rust/scx_layered/src/layer_core_growth.rs

Lines changed: 3 additions & 3 deletions

@@ -89,8 +89,8 @@ use std::collections::BTreeSet;

 #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub struct CpuSet {
-    cpus: BTreeSet<usize>,
-    cores: BTreeSet<usize>,
+    pub cpus: BTreeSet<usize>,
+    pub cores: BTreeSet<usize>,
 }

 fn parse_cpu_ranges(s: &str) -> Result<BTreeSet<usize>> {
@@ -126,7 +126,7 @@ fn collect_cpuset_effective() -> Result<BTreeSet<BTreeSet<usize>>> {
 }

 // return cpuset layout.
-fn get_cpusets(topo: &Topology) -> Result<BTreeSet<CpuSet>> {
+pub fn get_cpusets(topo: &Topology) -> Result<BTreeSet<CpuSet>> {
     let mut cpusets: BTreeSet<CpuSet> = BTreeSet::new();
     let cpuset_cpus = collect_cpuset_effective()?;
     for x in cpuset_cpus {
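With CpuSet's fields and get_cpusets() made pub here (and the module exported from lib.rs below), callers outside this file can enumerate the cpuset layout, which is what the new init_cpusets in main.rs does. A rough usage sketch, assuming the scx_layered crate as a dependency and scx_utils::Topology as the topology source:

```rust
use anyhow::Result;
use scx_layered::layer_core_growth::get_cpusets;
use scx_utils::Topology;

fn print_cpuset_layout() -> Result<()> {
    let topo = Topology::new()?;
    // BTreeSet iteration gives a stable order, so indices are reproducible.
    for (i, cpuset) in get_cpusets(&topo)?.iter().enumerate() {
        // The now-pub fields expose each cpuset's CPU and core ids.
        println!("cpuset {}: cpus={:?} cores={:?}", i, cpuset.cpus, cpuset.cores);
    }
    Ok(())
}
```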

scheds/rust/scx_layered/src/lib.rs

Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@
 // This software may be used and distributed according to the terms of the
 // GNU General Public License version 2.
 mod config;
-mod layer_core_growth;
+pub mod layer_core_growth;

 pub mod bpf_intf;

@@ -189,7 +189,7 @@ impl CpuPool {
         cpus
     }

-    fn get_core_topological_id(&self, core: &Core) -> usize {
+    pub fn get_core_topological_id(&self, core: &Core) -> usize {
         *self
             .core_topology_to_id
             .get(&(core.node_id, core.llc_id, core.id))

scheds/rust/scx_layered/src/main.rs

Lines changed: 27 additions & 0 deletions

@@ -59,6 +59,7 @@ use stats::LayerStats;
 use stats::StatsReq;
 use stats::StatsRes;
 use stats::SysStats;
+use layer_core_growth::get_cpusets;

 const MAX_PATH: usize = bpf_intf::consts_MAX_PATH as usize;
 const MAX_COMM: usize = bpf_intf::consts_MAX_COMM as usize;
@@ -67,6 +68,7 @@ const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
 const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
 const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
 const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
+const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
 const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
 const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
@@ -589,6 +591,10 @@ struct Opts {
     #[clap(long, default_value = "false")]
     disable_antistall: bool,

+    /// Enable container support
+    #[clap(long, default_value = "false")]
+    enable_container: bool,
+
     /// Maximum task runnable_at delay (in seconds) before antistall turns on
     #[clap(long, default_value = "3")]
     antistall_sec: u64,
@@ -1415,6 +1421,22 @@ impl<'a> Scheduler<'a> {
         Ok(())
     }

+    fn init_cpusets(skel: &mut OpenBpfSkel, topo: &Topology) -> Result<()> {
+        let cpusets = get_cpusets(topo)?;
+        for (i, cpuset) in cpusets.iter().enumerate() {
+            let mut cpumask_bitvec: [u64; MAX_CPUS/64] = [0; MAX_CPUS/64];
+            for j in 0..MAX_CPUS/64 {
+                if cpuset.cpus.contains(&j) {
+                    cpumask_bitvec[j] = 1;
+                }
+            }
+            let cpuset_cpumask_slice = &mut skel.maps.rodata_data.cpuset_fakemasks[i];
+            cpuset_cpumask_slice.copy_from_slice(&cpumask_bitvec);
+        }
+        skel.maps.rodata_data.nr_containers = cpusets.len() as u32;
+        Ok(())
+    }
+
     fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
         skel.maps.rodata_data.nr_nodes = topo.nodes.len() as u32;
         skel.maps.rodata_data.nr_llcs = 0;
@@ -1855,6 +1877,7 @@ impl<'a> Scheduler<'a> {
         skel.maps.rodata_data.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
         skel.maps.rodata_data.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
         skel.maps.rodata_data.enable_antistall = !opts.disable_antistall;
+        skel.maps.rodata_data.enable_container = opts.enable_container;
         skel.maps.rodata_data.enable_gpu_support = opts.enable_gpu_support;

         for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
@@ -1922,6 +1945,10 @@ impl<'a> Scheduler<'a> {

         Self::init_layers(&mut skel, &layer_specs, &topo)?;
         Self::init_nodes(&mut skel, opts, &topo);
+
+        if opts.enable_container {
+            Self::init_cpusets(&mut skel, &topo)?;
+        }

         // We set the pin path before loading the skeleton. This will ensure
         // libbpf creates and pins the map, or reuses the pinned map fd for us,
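init_cpusets writes the convention the BPF decoder expects: one u64 word per CPU, with word j set to 1 when CPU j belongs to the cpuset (the `cpuset_fakemasks[i][x] == 1` test above). As written, only CPU indices below MAX_CPUS/64 are representable, which fits the "fakemask" naming and the map-of-kptrs TODO. A standalone sketch of the encoding, with MAX_CPUS assumed to be 512 purely for illustration (the real value comes from bpf_intf::consts_MAX_CPUS):

```rust
use std::collections::BTreeSet;

// Illustrative only: the real constant is bpf_intf::consts_MAX_CPUS.
const MAX_CPUS: usize = 512;

// Mirror of init_cpusets' inner loop: one u64 word flags one CPU.
fn encode_fakemask(cpus: &BTreeSet<usize>) -> [u64; MAX_CPUS / 64] {
    let mut words = [0u64; MAX_CPUS / 64];
    for j in 0..MAX_CPUS / 64 {
        if cpus.contains(&j) {
            words[j] = 1; // whole word marks CPU j as present
        }
    }
    words
}

fn main() {
    let cpus: BTreeSet<usize> = [0, 2, 5].into_iter().collect();
    let words = encode_fakemask(&cpus);
    assert_eq!((words[0], words[1], words[2], words[5]), (1, 0, 1, 1));
}
```

None of this runs unless the scheduler is started with --enable-container, which flips enable_container in BPF rodata and calls init_cpusets before the skeleton is loaded.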
