
Commit 49036b7

Make reported core count of distributed builder configurable
Also move the slight inflation of the CPU core count ("overcommit", to make up for various latencies) from the scheduler to the builder, so that an exact maximum number of cores can be set that will never be exceeded. This introduces a small problem in the scheduling protocol (excess overcommit if the builder is new and the scheduler is old), which seems acceptable and, in any case, does not occur when builder and scheduler are of the same version. As another side effect, the scheduler should no longer report more running jobs than available slots.
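A minimal Rust sketch of the builder-side behaviour described above; `detected_cpus` and `reported_core_count` are illustrative stand-ins for this sketch (the commit itself uses `num_cpus()` and `default_core_count_this_machine`, shown in the diff of `src/bin/sccache-dist/main.rs` further down):

```rust
// Sketch of how the reported core count is derived after this commit.
// `detected_cpus` stands in for sccache's `num_cpus()` helper.
fn default_core_count(detected_cpus: usize) -> usize {
    // Oversubscribe a little to cover network and I/O latency,
    // e.g. 8 cores -> 10, 16 -> 19, 64 -> 73.
    detected_cpus + 1 + detected_cpus / 8
}

fn reported_core_count(configured: Option<usize>, detected_cpus: usize) -> usize {
    // An explicitly configured value is used verbatim and never exceeded;
    // only the fallback applies the overcommit formula.
    configured.unwrap_or_else(|| default_core_count(detected_cpus))
}

fn main() {
    assert_eq!(reported_core_count(None, 16), 19);
    assert_eq!(reported_core_count(Some(16), 16), 16);
}
```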
1 parent cb98a5c commit 49036b7

File tree

7 files changed: +34 -9 lines changed

docs/Distributed.md
docs/DistributedFreeBSD.md
docs/DistributedQuickstart.md
src/bin/sccache-dist/main.rs
src/config.rs
src/dist/http.rs
tests/harness/mod.rs

docs/Distributed.md

Lines changed: 3 additions & 0 deletions
@@ -355,6 +355,9 @@ type = "DANGEROUSLY_INSECURE"
 
 
 ```toml
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+#core_count = 16
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.

docs/DistributedFreeBSD.md

Lines changed: 3 additions & 0 deletions
@@ -35,6 +35,9 @@ Then, a server.conf like the one below is created, making use of the `pot`
 builder type (commented out options show defaults):
 
 ```toml
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+#core_count = 16
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.

docs/DistributedQuickstart.md

Lines changed: 3 additions & 0 deletions
@@ -68,6 +68,9 @@ The build server requires [bubblewrap](https://github.com/projectatomic/bubblewr
 
 Create a server.conf file to configure authentication, storage locations, network addresses and the path to bubblewrap. A minimal example looks like:
 ```toml
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+#core_count = 16
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.

src/bin/sccache-dist/main.rs

Lines changed: 14 additions & 8 deletions
@@ -14,8 +14,7 @@ use sccache::dist::{
     ServerNonce, ServerOutgoing, SubmitToolchainResult, TcCache, Toolchain, ToolchainReader,
     UpdateJobStateResult,
 };
-use sccache::util::daemonize;
-use sccache::util::BASE64_URL_SAFE_ENGINE;
+use sccache::util::{daemonize, BASE64_URL_SAFE_ENGINE, num_cpus};
 use serde::{Deserialize, Serialize};
 use std::collections::{btree_map, BTreeMap, HashMap, HashSet};
 use std::env;
@@ -134,6 +133,16 @@ fn check_jwt_server_token(
     .ok()
 }
 
+fn default_core_count_this_machine() -> usize
+{
+    let core_count = num_cpus();
+    // Oversubscribe cores just a little to make up for network and I/O latency. This formula is
+    // not based on hard data but an extrapolation to high core counts of the conventional wisdom
+    // that slightly more jobs than cores achieve the shortest compile time. Which is originally
+    // about local compiles and this is over the network, so be slightly less conservative.
+    core_count + 1 + core_count / 8
+}
+
 fn run(command: Command) -> Result<i32> {
     match command {
         Command::Auth(AuthSubcommand::Base64 { num_bytes }) => {
@@ -229,6 +238,7 @@
             scheduler_url,
             scheduler_auth,
             toolchain_cache_size,
+            core_count
         }) => {
             let builder: Box<dyn dist::BuilderIncoming> = match builder {
                 #[cfg(not(target_os = "freebsd"))]
@@ -293,6 +303,7 @@
                 bind_address,
                 scheduler_url.to_url(),
                 scheduler_auth,
+                core_count.unwrap_or(default_core_count_this_machine()),
                 server,
             )
             .context("Failed to create sccache HTTP server instance")?;
@@ -403,13 +414,8 @@ impl Default for Scheduler {
 }
 
 fn load_weight(job_count: usize, core_count: usize) -> f64 {
-    // Oversubscribe cores just a little to make up for network and I/O latency. This formula is
-    // not based on hard data but an extrapolation to high core counts of the conventional wisdom
-    // that slightly more jobs than cores achieve the shortest compile time. Which is originally
-    // about local compiles and this is over the network, so be slightly less conservative.
-    let cores_plus_slack = core_count + 1 + core_count / 8;
     // Note >=, not >, because the question is "can we add another job"?
-    if job_count >= cores_plus_slack {
+    if job_count >= core_count {
         MAX_PER_CORE_LOAD + 1f64 // no new jobs for now
     } else {
         job_count as f64 / core_count as f64
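
For illustration, a self-contained sketch of the scheduler's admission check after this change; `MAX_PER_CORE_LOAD` is given a placeholder value here, so treat this as a sketch of the shape of the logic rather than of the exact constants:

```rust
// Sketch of the scheduler-side check after this commit; the reported
// core count is now a hard cap with no extra slack added on this side.
const MAX_PER_CORE_LOAD: f64 = 10.0; // placeholder value for the sketch

fn load_weight(job_count: usize, core_count: usize) -> f64 {
    // Note >=, not >, because the question is "can we add another job?"
    if job_count >= core_count {
        MAX_PER_CORE_LOAD + 1f64 // no new jobs for now
    } else {
        job_count as f64 / core_count as f64
    }
}

fn main() {
    // With core_count reported as 19 (the default for a 16-core machine),
    // the 19th job fills the builder and further jobs are refused.
    assert!(load_weight(18, 19) <= MAX_PER_CORE_LOAD);
    assert!(load_weight(19, 19) > MAX_PER_CORE_LOAD);
}
```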

src/config.rs

Lines changed: 5 additions & 0 deletions
@@ -1196,6 +1196,7 @@ pub mod server {
     pub bind_address: Option<SocketAddr>,
     pub scheduler_url: HTTPUrl,
     pub scheduler_auth: SchedulerAuth,
+    pub core_count: Option<usize>,
     #[serde(default = "default_toolchain_cache_size")]
     pub toolchain_cache_size: u64,
 }
@@ -1589,6 +1590,9 @@ fn server_toml_parse() {
     use server::BuilderType;
     use server::SchedulerAuth;
     const CONFIG_STR: &str = r#"
+# The maximum number of cores to be used for build jobs.
+# If unspecified, slightly higher than the number of CPU cores (including SMT "cores").
+core_count = 2097
 # This is where client toolchains will be stored.
 cache_dir = "/tmp/toolchains"
 # The maximum size of the toolchain cache, in bytes.
@@ -1641,6 +1645,7 @@ fn server_toml_parse() {
             token: "my server's token".to_owned()
         },
         toolchain_cache_size: 10737418240,
+        core_count: Some(2097),
         }
     )
 }
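
A small sketch of how the optional `core_count` key behaves when the server config is deserialized; `ServerCfgSketch` is a simplified stand-in for the real config struct and keeps only two fields:

```rust
// Stand-in illustrating the Option<usize> config field; requires the
// `serde` (with derive) and `toml` crates.
use serde::Deserialize;

#[derive(Deserialize, Debug, PartialEq)]
struct ServerCfgSketch {
    core_count: Option<usize>,
    cache_dir: String,
}

fn main() {
    // An explicit value parses to Some(..), matching the test in src/config.rs.
    let with_count: ServerCfgSketch =
        toml::from_str("core_count = 2097\ncache_dir = \"/tmp/toolchains\"").unwrap();
    assert_eq!(with_count.core_count, Some(2097));

    // Leaving the key out yields None, which later selects the inflated default.
    let without: ServerCfgSketch =
        toml::from_str("cache_dir = \"/tmp/toolchains\"").unwrap();
    assert_eq!(without.core_count, None);
}
```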

src/dist/http.rs

Lines changed: 4 additions & 1 deletion
@@ -873,6 +873,7 @@ mod server {
         bind_address: SocketAddr,
         scheduler_url: reqwest::Url,
         scheduler_auth: String,
+        core_count: usize,
         // HTTPS pieces all the builders will use for connection encryption
         cert_digest: Vec<u8>,
         cert_pem: Vec<u8>,
@@ -890,6 +891,7 @@
             bind_address: Option<SocketAddr>,
             scheduler_url: reqwest::Url,
             scheduler_auth: String,
+            core_count: usize,
             handler: S,
         ) -> Result<Self> {
             let (cert_digest, cert_pem, privkey_pem) =
@@ -903,6 +905,7 @@
                 bind_address: bind_address.unwrap_or(public_addr),
                 scheduler_url,
                 scheduler_auth,
+                core_count,
                 cert_digest,
                 cert_pem,
                 privkey_pem,
@@ -914,7 +917,7 @@
 
         pub fn start(self) -> Result<Infallible> {
             let heartbeat_req = HeartbeatServerHttpRequest {
-                num_cpus: num_cpus(),
+                num_cpus: self.core_count,
                 jwt_key: self.jwt_key.clone(),
                 server_nonce: self.server_nonce,
                 cert_digest: self.cert_digest,

tests/harness/mod.rs

Lines changed: 2 additions & 0 deletions
@@ -216,6 +216,7 @@ fn sccache_server_cfg(
             token: DIST_SERVER_TOKEN.to_owned(),
         },
         toolchain_cache_size: TC_CACHE_SIZE,
+        core_count: None,
     }
 }
 
@@ -430,6 +431,7 @@ impl DistSystem {
             Some(SocketAddr::from(([0, 0, 0, 0], server_addr.port()))),
             self.scheduler_url().to_url(),
             token,
+            4,
             handler,
         )
         .unwrap();
