
Commit 49036b7

Make reported core count of distributed builder configurable
Also move the slight inflation of the CPU core count ("overcommit", to make up for various latencies) from the scheduler to the builder, so that an exact maximum number of cores can be set that will never be exceeded. This introduces a small problem in the scheduling protocol (excess overcommit if the builder is new and the scheduler is old), which seems acceptable and, in any case, does not occur when builder and scheduler are of the same version. As another side effect, the scheduler should no longer report more running jobs than available slots.
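A minimal Rust sketch of the builder-side behaviour described above; `detected_cpus` and `reported_core_count` are illustrative stand-ins for this sketch (the commit itself uses `num_cpus()` and `default_core_count_this_machine`, shown in the diff of `src/bin/sccache-dist/main.rs` further down):

```rust
// Sketch of how the reported core count is derived after this commit.
// `detected_cpus` stands in for sccache's `num_cpus()` helper.
fn default_core_count(detected_cpus: usize) -> usize {
    // Oversubscribe a little to cover network and I/O latency,
    // e.g. 8 cores -> 10, 16 -> 19, 64 -> 73.
    detected_cpus + 1 + detected_cpus / 8
}

fn reported_core_count(configured: Option<usize>, detected_cpus: usize) -> usize {
    // An explicitly configured value is used verbatim and never exceeded;
    // only the fallback applies the overcommit formula.
    configured.unwrap_or_else(|| default_core_count(detected_cpus))
}

fn main() {
    assert_eq!(reported_core_count(None, 16), 19);
    assert_eq!(reported_core_count(Some(16), 16), 16);
}
```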
1 parent cb98a5c commit 49036b7

File tree

7 files changed: +34 -9 lines changed

docs/Distributed.md
docs/DistributedFreeBSD.md
docs/DistributedQuickstart.md
src/bin/sccache-dist/main.rs
src/config.rs
src/dist/http.rs
tests/harness/mod.rs

docs/Distributed.md

Lines changed: 3 additions & 0 deletions
@@ -355,6 +355,9 @@ type = "DANGEROUSLY_INSECURE"
 
 
 ```toml
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+#core_count = 16
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.

docs/DistributedFreeBSD.md

Lines changed: 3 additions & 0 deletions
@@ -35,6 +35,9 @@ Then, a server.conf like the one below is created, making use of the `pot`
 builder type (commented out options show defaults):
 
 ```toml
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+#core_count = 16
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.

docs/DistributedQuickstart.md

Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,9 @@ The build server requires [bubblewrap](https://github.com/projectatomic/bubblewr
 
 Create a server.conf file to configure authentication, storage locations, network addresses and the path to bubblewrap. A minimal example looks like:
 ```toml
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+#core_count = 16
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.

src/bin/sccache-dist/main.rs

Lines changed: 14 additions & 8 deletions
@@ -14,8 +14,7 @@ use sccache::dist::{
     ServerNonce, ServerOutgoing, SubmitToolchainResult, TcCache, Toolchain, ToolchainReader,
     UpdateJobStateResult,
 };
-use sccache::util::daemonize;
-use sccache::util::BASE64_URL_SAFE_ENGINE;
+use sccache::util::{daemonize, BASE64_URL_SAFE_ENGINE, num_cpus};
 use serde::{Deserialize, Serialize};
 use std::collections::{btree_map, BTreeMap, HashMap, HashSet};
 use std::env;
@@ -134,6 +133,16 @@ fn check_jwt_server_token(
     .ok()
 }
 
+fn default_core_count_this_machine() -> usize
+{
+    let core_count = num_cpus();
+    // Oversubscribe cores just a little to make up for network and I/O latency. This formula is
+    // not based on hard data but an extrapolation to high core counts of the conventional wisdom
+    // that slightly more jobs than cores achieve the shortest compile time. Which is originally
+    // about local compiles and this is over the network, so be slightly less conservative.
+    core_count + 1 + core_count / 8
+}
+
 fn run(command: Command) -> Result<i32> {
     match command {
         Command::Auth(AuthSubcommand::Base64 { num_bytes }) => {
@@ -229,6 +238,7 @@
             scheduler_url,
             scheduler_auth,
             toolchain_cache_size,
+            core_count
         }) => {
             let builder: Box<dyn dist::BuilderIncoming> = match builder {
                 #[cfg(not(target_os = "freebsd"))]
@@ -293,6 +303,7 @@
                 bind_address,
                 scheduler_url.to_url(),
                 scheduler_auth,
+                core_count.unwrap_or(default_core_count_this_machine()),
                 server,
             )
             .context("Failed to create sccache HTTP server instance")?;
@@ -403,13 +414,8 @@ impl Default for Scheduler {
 }
 
 fn load_weight(job_count: usize, core_count: usize) -> f64 {
-    // Oversubscribe cores just a little to make up for network and I/O latency. This formula is
-    // not based on hard data but an extrapolation to high core counts of the conventional wisdom
-    // that slightly more jobs than cores achieve the shortest compile time. Which is originally
-    // about local compiles and this is over the network, so be slightly less conservative.
-    let cores_plus_slack = core_count + 1 + core_count / 8;
     // Note >=, not >, because the question is "can we add another job"?
-    if job_count >= cores_plus_slack {
+    if job_count >= core_count {
         MAX_PER_CORE_LOAD + 1f64 // no new jobs for now
     } else {
         job_count as f64 / core_count as f64
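
For illustration, a self-contained sketch of the scheduler's admission check after this change; `MAX_PER_CORE_LOAD` is given a placeholder value here, so treat this as a sketch of the shape of the logic rather than of the exact constants:

```rust
// Sketch of the scheduler-side check after this commit; the reported
// core count is now a hard cap with no extra slack added on this side.
const MAX_PER_CORE_LOAD: f64 = 10.0; // placeholder value for the sketch

fn load_weight(job_count: usize, core_count: usize) -> f64 {
    // Note >=, not >, because the question is "can we add another job?"
    if job_count >= core_count {
        MAX_PER_CORE_LOAD + 1f64 // no new jobs for now
    } else {
        job_count as f64 / core_count as f64
    }
}

fn main() {
    // With core_count reported as 19 (the default for a 16-core machine),
    // the 19th job fills the builder and further jobs are refused.
    assert!(load_weight(18, 19) <= MAX_PER_CORE_LOAD);
    assert!(load_weight(19, 19) > MAX_PER_CORE_LOAD);
}
```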

src/config.rs

Lines changed: 5 additions & 0 deletions
@@ -1196,6 +1196,7 @@ pub mod server {
     pub bind_address: Option<SocketAddr>,
     pub scheduler_url: HTTPUrl,
     pub scheduler_auth: SchedulerAuth,
+    pub core_count: Option<usize>,
     #[serde(default = "default_toolchain_cache_size")]
     pub toolchain_cache_size: u64,
 }
@@ -1589,6 +1590,9 @@ fn server_toml_parse() {
     use server::BuilderType;
     use server::SchedulerAuth;
     const CONFIG_STR: &str = r#"
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+core_count = 2097
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.
@@ -1641,6 +1645,7 @@ fn server_toml_parse() {
             token: "my server's token".to_owned()
         },
         toolchain_cache_size: 10737418240,
+        core_count: Some(2097),
         }
     )
 }
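
A small sketch of how the optional `core_count` key behaves when the server config is deserialized; `ServerCfgSketch` is a simplified stand-in for the real config struct and keeps only two fields:

```rust
// Stand-in illustrating the Option<usize> config field; requires the
// `serde` (with derive) and `toml` crates.
use serde::Deserialize;

#[derive(Deserialize, Debug, PartialEq)]
struct ServerCfgSketch {
    core_count: Option<usize>,
    cache_dir: String,
}

fn main() {
    // An explicit value parses to Some(..), matching the test in src/config.rs.
    let with_count: ServerCfgSketch =
        toml::from_str("core_count = 2097\ncache_dir = \"/tmp/toolchains\"").unwrap();
    assert_eq!(with_count.core_count, Some(2097));

    // Leaving the key out yields None, which later selects the inflated default.
    let without: ServerCfgSketch =
        toml::from_str("cache_dir = \"/tmp/toolchains\"").unwrap();
    assert_eq!(without.core_count, None);
}
```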

src/dist/http.rs

Lines changed: 4 additions & 1 deletion
@@ -873,6 +873,7 @@ mod server {
         bind_address: SocketAddr,
         scheduler_url: reqwest::Url,
         scheduler_auth: String,
+        core_count: usize,
         // HTTPS pieces all the builders will use for connection encryption
         cert_digest: Vec<u8>,
         cert_pem: Vec<u8>,
@@ -890,6 +891,7 @@
             bind_address: Option<SocketAddr>,
             scheduler_url: reqwest::Url,
             scheduler_auth: String,
+            core_count: usize,
             handler: S,
         ) -> Result<Self> {
             let (cert_digest, cert_pem, privkey_pem) =
@@ -903,6 +905,7 @@
                 bind_address: bind_address.unwrap_or(public_addr),
                 scheduler_url,
                 scheduler_auth,
+                core_count,
                 cert_digest,
                 cert_pem,
                 privkey_pem,
@@ -914,7 +917,7 @@
 
         pub fn start(self) -> Result<Infallible> {
             let heartbeat_req = HeartbeatServerHttpRequest {
-                num_cpus: num_cpus(),
+                num_cpus: self.core_count,
                 jwt_key: self.jwt_key.clone(),
                 server_nonce: self.server_nonce,
                 cert_digest: self.cert_digest,

tests/harness/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -216,6 +216,7 @@ fn sccache_server_cfg(
             token: DIST_SERVER_TOKEN.to_owned(),
         },
         toolchain_cache_size: TC_CACHE_SIZE,
+        core_count: None,
     }
 }
 
@@ -430,6 +431,7 @@ impl DistSystem {
             Some(SocketAddr::from(([0, 0, 0, 0], server_addr.port()))),
             self.scheduler_url().to_url(),
             token,
+            4,
             handler,
         )
         .unwrap();
