@@ -69,17 +69,26 @@ u64 unprotected_seq = 0;
 
 private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask;
 private(big_cpumask) struct bpf_cpumask __kptr *big_cpumask;
-// XXXLIKEWHATEVS -- this should be a map of kptrs.
-// for now use one cpumask consisting of all cpuset cpumasks
-// anded.
-private(cpuset_cpumask) struct bpf_cpumask __kptr *cpuset_cpumask;
 struct layer layers[MAX_LAYERS];
 u32 fallback_cpu;
 u32 layered_root_tgid = 0;
 
 u32 empty_layer_ids[MAX_LAYERS];
 u32 nr_empty_layer_ids;
 
+struct cpumask_box {
+	struct bpf_cpumask __kptr *mask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CONTAINERS);
+	__type(key, u32);
+	__type(value, struct cpumask_box);
+} cpuset_cpumask SEC(".maps");
+
+
+
 UEI_DEFINE(uei);
 
 struct task_hint {
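The removed XXXLIKEWHATEVS comment asked for exactly this: instead of one globally ANDed cpuset mask, cpuset_cpumask becomes a BPF_MAP_TYPE_PERCPU_ARRAY keyed by container ID whose value embeds a struct bpf_cpumask kptr. A minimal sketch of the kptr-in-map-value pattern the declaration relies on; the helper name and error handling are illustrative only, not part of the patch:

static int store_container_mask(u32 container_id)
{
	struct cpumask_box *box;
	struct bpf_cpumask *fresh, *old;

	/* On a per-CPU array this returns the current CPU's slot. */
	box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
	if (!box)
		return -ENOENT;

	fresh = bpf_cpumask_create();
	if (!fresh)
		return -ENOMEM;

	/* ... populate @fresh, e.g. with bpf_cpumask_set_cpu() ... */

	/* Publish @fresh into the slot and drop whatever was there before. */
	old = bpf_kptr_xchg(&box->mask, fresh);
	if (old)
		bpf_cpumask_release(old);
	return 0;
}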
@@ -502,6 +511,7 @@ struct task_ctx {
 	struct bpf_cpumask __kptr *layered_unprotected_mask;
 	bool all_cpus_allowed;
 	bool cpus_node_aligned;
+	bool cpus_cpuset_aligned;
 	u64 runnable_at;
 	u64 running_at;
 	u64 runtime_avg;
@@ -1366,8 +1376,10 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	 * with open layers on non-saturated machines to avoid possible stalls.
 	 */
 	if ((!taskc->all_cpus_allowed &&
-	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
-	    !layer->nr_cpus) {
+	     !((layer->allow_node_aligned && taskc->cpus_node_aligned) ||
+	       (enable_container && taskc->cpus_cpuset_aligned))) ||
+	    !layer->nr_cpus) {
+
 		taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
 		/*
 		 * Start a new lo fallback queued region if the DSQ is empty.
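With this change a CPU-restricted task skips the lo fallback DSQ when either exemption applies: the layer allows node-aligned tasks and the task is node aligned, or container mode is on and the task's mask matches one of the cpuset masks exactly. A hypothetical helper, not in the patch, spelling out the predicate used above (enable_container and the struct fields are the ones referenced in this diff):

static bool wants_lo_fallback(const struct task_ctx *taskc,
			      const struct layer *layer)
{
	bool aligned_exempt =
		(layer->allow_node_aligned && taskc->cpus_node_aligned) ||
		(enable_container && taskc->cpus_cpuset_aligned);

	/*
	 * Restricted tasks without an exemption, or layers that currently
	 * own no CPUs, go to the per-CPU lo fallback DSQ.
	 */
	return (!taskc->all_cpus_allowed && !aligned_exempt) || !layer->nr_cpus;
}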
@@ -2627,7 +2639,7 @@ void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
 static void refresh_cpus_flags(struct task_ctx *taskc,
 			       const struct cpumask *cpumask)
 {
-	u32 node_id;
+	u32 node_id, container_id;
 
 	if (!all_cpumask) {
 		scx_bpf_error("NULL all_cpumask");
@@ -2644,7 +2656,7 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 
 		if (!(nodec = lookup_node_ctx(node_id)) ||
 		    !(node_cpumask = cast_mask(nodec->cpumask)))
-			return;
+			break;
 
 		/* not llc aligned if partially overlaps */
 		if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
@@ -2653,6 +2665,21 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 			break;
 		}
 	}
+	if (enable_container) {
+		bpf_for(container_id, 0, nr_containers) {
+			struct cpumask_box *box;
+			box = bpf_map_lookup_elem(&cpuset_cpumask, &container_id);
+			if (!box || !box->mask) {
+				scx_bpf_error("error marking tasks as cpuset aligned");
+				return;
+			}
+			if (bpf_cpumask_equal(cast_mask(box->mask), cpumask)) {
+				taskc->cpus_cpuset_aligned = true;
+				return;
+			}
+		}
+		taskc->cpus_cpuset_aligned = false;
+	}
 }
 
 static int init_cached_cpus(struct cached_cpus *ccpus)
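A note on the lookup above: cpuset_cpumask is a per-CPU array, and from BPF program context bpf_map_lookup_elem() returns the element belonging to the CPU the program is currently executing on. That is why layered_init() below replicates every container mask onto every possible CPU ("pay init cost once"), keeping this hot-path check a local lookup plus bpf_cpumask_equal(). Condensed into a standalone helper for illustration (hypothetical name, error path omitted):

static bool matches_some_container(const struct cpumask *cpumask)
{
	struct cpumask_box *box;
	u32 id;

	bpf_for(id, 0, nr_containers) {
		/* Per-CPU array lookup: returns this CPU's copy of slot @id. */
		box = bpf_map_lookup_elem(&cpuset_cpumask, &id);
		if (!box || !box->mask)
			return false;
		if (bpf_cpumask_equal(cast_mask(box->mask), cpumask))
			return true;
	}
	return false;
}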
@@ -3320,8 +3347,10 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask, tmptmp;
-	int i, nr_online_cpus, ret, x;
+	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask,
+			   *tmp_cpuset_cpumask, *tmp_swap_dst_cpumask;
+	int i, j, cpu, nr_online_cpus, ret;
+	struct cpumask_box *cpumask_box;
 
 	cpumask = bpf_cpumask_create();
 	if (!cpumask)
@@ -3372,28 +3401,41 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 			if (!cpumask)
 				return -ENOMEM;
 
-			bpf_for(x, 0, MAX_CPUS / 64) {
+
+			bpf_for(j, 0, MAX_CPUS / 64) {
+				// verifier
+				if (i < 0 || i >= MAX_CONTAINERS || j < 0 || j >= (MAX_CPUS / 64)) {
+					bpf_cpumask_release(cpumask);
+					return -1;
+				}
 				// container then cpu bit
-				if (cpuset_fakemasks[i][x] == 1) {
-					bpf_cpumask_set_cpu(x, cpumask);
+				if (cpuset_fakemasks[i][j] == 1) {
+					bpf_cpumask_set_cpu(j, cpumask);
 				}
 			}
 
-			if (cpuset_cpumask) {
-				struct bpf_cpumask *tmp_cpuset_cpumask = bpf_kptr_xchg(&cpuset_cpumask, NULL);
-				if (!tmp_cpuset_cpumask) {
-					bpf_cpumask_release(cpumask);
+
+			// pay init cost once for faster lookups later.
+			bpf_for(cpu, 0, nr_possible_cpus) {
+				cpumask_box = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &i, cpu);
+				tmp_cpuset_cpumask = bpf_cpumask_create();
+
+				if (!cpumask || !tmp_cpuset_cpumask || !cpumask_box) {
+					if (cpumask)
+						bpf_cpumask_release(cpumask);
+					if (tmp_cpuset_cpumask)
+						bpf_cpumask_release(tmp_cpuset_cpumask);
+					scx_bpf_error("cpumask is null");
 					return -1;
 				}
-				bpf_cpumask_and(cpumask, cast_mask(tmp_cpuset_cpumask), cast_mask(cpumask));
-				bpf_cpumask_release(tmp_cpuset_cpumask);
-			}
-
-			struct bpf_cpumask *old_cpumask = bpf_kptr_xchg(&cpuset_cpumask, cpumask);
+				bpf_cpumask_copy(tmp_cpuset_cpumask, cast_mask(cpumask));
 
-			if (old_cpumask) {
-				bpf_cpumask_release(old_cpumask);
+				tmp_swap_dst_cpumask = bpf_kptr_xchg(&cpumask_box->mask, tmp_cpuset_cpumask);
+				if (tmp_swap_dst_cpumask)
+					bpf_cpumask_release(tmp_swap_dst_cpumask);
 			}
+			if (cpumask)
+				bpf_cpumask_release(cpumask);
 
 		}
 	}
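The per-CPU publication step in the last hunk follows a small ownership protocol: build one scratch mask per container from cpuset_fakemasks, then for each possible CPU create a copy, hand it to that CPU's map slot via bpf_kptr_xchg(), release whatever the slot held before, and finally release the scratch mask. A hypothetical helper condensing that step, assuming the cpuset_cpumask map and the nr_possible_cpus global used above:

static int publish_container_mask(u32 container_id, struct bpf_cpumask *src)
{
	struct cpumask_box *box;
	struct bpf_cpumask *copy, *old;
	int cpu;

	bpf_for(cpu, 0, nr_possible_cpus) {
		/* Each CPU gets its own copy so hot-path lookups stay local. */
		box = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &container_id, cpu);
		copy = bpf_cpumask_create();
		if (!box || !copy) {
			if (copy)
				bpf_cpumask_release(copy);
			return -ENOMEM;
		}
		bpf_cpumask_copy(copy, cast_mask(src));
		/* Transfer ownership of @copy to the map; free the old mask. */
		old = bpf_kptr_xchg(&box->mask, copy);
		if (old)
			bpf_cpumask_release(old);
	}
	return 0;
}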