Skip to content

Commit e07e968

Browse files
authored
Merge pull request #4308 from ProvableHQ/feat/stricter_fd_limit_checks
[Feat] Stricter fd limit checks
2 parents 2e5adcb + 59319e7 commit e07e968

5 files changed

Lines changed: 224 additions & 10 deletions

File tree

Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cli/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,6 @@ features = [ "test-helpers" ]
182182
version = "0.30"
183183
default-features = false
184184
features = [ "resource" ]
185+
186+
[target.'cfg(unix)'.dependencies]
187+
rlimit = "0.11"

cli/src/commands/start.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ impl Start {
339339
// Error messages.
340340
let node_parse_error = || "Failed to start node";
341341

342+
// Periodically check if the number of file descriptors isn't becoming insufficient.
343+
#[cfg(unix)]
344+
crate::helpers::spawn_fd_monitor();
345+
342346
// Clone the configurations.
343347
let mut self_ = self.clone();
344348

cli/src/helpers/fd_check.rs

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
// Copyright (c) 2019-2026 Provable Inc.
2+
// This file is part of the snarkOS library.
3+
4+
// Licensed under the Apache License, Version 2.0 (the "License");
5+
// you may not use this file except in compliance with the License.
6+
// You may obtain a copy of the License at:
7+
8+
// http://www.apache.org/licenses/LICENSE-2.0
9+
10+
// Unless required by applicable law or agreed to in writing, software
11+
// distributed under the License is distributed on an "AS IS" BASIS,
12+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
// See the License for the specific language governing permissions and
14+
// limitations under the License.
15+
16+
use std::io;
17+
18+
use tokio::time::{Duration, MissedTickBehavior, interval};
19+
use tracing::*;
20+
21+
/// Snapshot of this process's file-descriptor consumption.
#[derive(Debug, Clone, Copy)]
pub struct FdUsage {
    /// Number of descriptors the process currently has open.
    pub open: u64,
    /// Soft `RLIMIT_NOFILE` in effect; `None` stands for "unlimited".
    pub soft_limit: Option<u64>,
}

impl FdUsage {
    /// Share of the soft limit that is in use, computed as `open / soft_limit`.
    ///
    /// Yields `0.0` when there is no limit or the limit is zero. The value is
    /// not clamped, so it is greater than `1.0` whenever `open` exceeds the limit.
    pub fn ratio(&self) -> f64 {
        match self.soft_limit {
            None | Some(0) => 0.0,
            Some(limit) => self.open as f64 / limit as f64,
        }
    }

    /// Whether usage has reached `threshold` (a fraction, e.g. `0.8` for 80%)
    /// of the soft limit. Always `false` when the limit is unlimited.
    pub fn approaching_limit(&self, threshold: f64) -> bool {
        match self.soft_limit {
            Some(_) => self.ratio() >= threshold,
            None => false,
        }
    }
}
44+
45+
/// Probe the live system: current soft limit + count of open descriptors.
46+
pub fn fd_usage() -> io::Result<FdUsage> {
47+
let soft_limit = soft_nofile_limit()?;
48+
let open = count_open_fds(soft_limit)?;
49+
Ok(FdUsage { open, soft_limit })
50+
}
51+
52+
/// Reads the soft `RLIMIT_NOFILE` of the current process.
///
/// A value equal to `rlimit::INFINITY` is reported as `None` (no limit).
///
/// # Errors
///
/// Propagates the error returned by the `rlimit` lookup.
fn soft_nofile_limit() -> io::Result<Option<u64>> {
    let (soft, _) = rlimit::Resource::NOFILE.get()?;
    Ok(Some(soft).filter(|&value| value != rlimit::INFINITY))
}
56+
57+
/// Counts the descriptors this process has open by listing `/proc/self/fd`.
///
/// The `_limit` argument is not used by this implementation.
///
/// # Errors
///
/// Fails if the directory cannot be opened or any entry cannot be read.
#[cfg(target_os = "linux")]
fn count_open_fds(_limit: Option<u64>) -> io::Result<u64> {
    // Every entry in the directory corresponds to one open descriptor.
    let listed = std::fs::read_dir("/proc/self/fd")?
        .try_fold(0u64, |seen, entry| entry.map(|_| seen + 1))?;
    // The directory handle used for the listing is itself an open descriptor
    // and therefore shows up in the listing; leave it out of the total.
    Ok(listed.saturating_sub(1))
}
68+
69+
/// Counts the descriptors this process has open by listing `/dev/fd`.
///
/// This is the non-Linux Unix counterpart of the `/proc/self/fd` walk. The
/// `_limit` argument is not used by this implementation.
///
/// NOTE(review): on FreeBSD `/dev/fd` reportedly exposes only descriptors 0-2
/// unless `fdescfs` is mounted, which would make this count too low there —
/// confirm on the target platforms.
///
/// # Errors
///
/// Fails if the directory cannot be opened or any entry cannot be read.
#[cfg(all(unix, not(target_os = "linux")))]
fn count_open_fds(_limit: Option<u64>) -> io::Result<u64> {
    // Every entry in the directory corresponds to one open descriptor.
    let listed = std::fs::read_dir("/dev/fd")?
        .try_fold(0u64, |seen, entry| entry.map(|_| seen + 1))?;
    // The directory handle used for the listing is itself an open descriptor
    // and therefore shows up in the listing; leave it out of the total.
    Ok(listed.saturating_sub(1))
}
80+
81+
/// Machine-wide file-descriptor accounting, as opposed to a single process.
#[derive(Debug, Clone, Copy)]
pub struct SystemFd {
    /// Number of file handles currently allocated on the machine.
    pub allocated: u64,
    /// Maximum number of file handles the machine allows.
    pub max: u64,
}

impl SystemFd {
    /// Share of the machine-wide maximum in use, computed as `allocated / max`.
    ///
    /// Yields `0.0` when `max` is zero. The value is not clamped.
    pub fn ratio(&self) -> f64 {
        match self.max {
            0 => 0.0,
            max => self.allocated as f64 / max as f64,
        }
    }
}
93+
94+
#[cfg(target_os = "linux")]
95+
pub fn system_fd_usage() -> std::io::Result<SystemFd> {
96+
// /proc/sys/fs/file-nr => "<allocated>\t<free, always 0>\t<max>"
97+
let s = std::fs::read_to_string("/proc/sys/fs/file-nr")?;
98+
let mut f = s.split_whitespace();
99+
let bad = || std::io::Error::new(std::io::ErrorKind::InvalidData, "unexpected file-nr format");
100+
let allocated = f.next().and_then(|v| v.parse().ok()).ok_or_else(bad)?;
101+
let _free = f.next(); // always 0 on modern kernels
102+
let max = f.next().and_then(|v| v.parse().ok()).ok_or_else(bad)?;
103+
Ok(SystemFd { allocated, max })
104+
}
105+
106+
/// Reads machine-wide file-handle usage by invoking `sysctl -n <oid>`.
///
/// Supported flavors are FreeBSD, macOS, OpenBSD and NetBSD; each uses its own
/// OID for the current count and `kern.maxfiles` for the maximum.
///
/// # Errors
///
/// * `Unsupported` on any other non-Linux Unix target.
/// * The spawn error if `sysctl` cannot be executed.
/// * `NotFound` if `sysctl` exits unsuccessfully for an OID.
/// * `InvalidData` if the output is not an unsigned integer.
#[cfg(all(unix, not(target_os = "linux")))]
pub fn system_fd_usage() -> std::io::Result<SystemFd> {
    /// Runs `sysctl -n <oid>` and parses its output as a plain integer.
    fn read(oid: &str) -> std::io::Result<u64> {
        let out = std::process::Command::new("sysctl").arg("-n").arg(oid).output()?;
        if !out.status.success() {
            return Err(std::io::Error::new(std::io::ErrorKind::NotFound, format!("sysctl {oid} unavailable")));
        }
        String::from_utf8_lossy(&out.stdout)
            .trim()
            .parse()
            .map_err(|_| std::io::Error::new(std::io::ErrorKind::InvalidData, format!("bad value for {oid}")))
    }

    // OID names differ by flavor; values are plain integers.
    //
    // The selection uses `cfg!` rather than `#[cfg]`-gated `let` bindings so
    // that the names below are declared on every target. With gated bindings,
    // an unlisted OS left `cur_oid`/`max_oid` undeclared and the function
    // failed to compile instead of reporting `Unsupported`.
    let oids: Option<(&str, &str)> = if cfg!(target_os = "freebsd") {
        Some(("kern.openfiles", "kern.maxfiles"))
    } else if cfg!(target_os = "macos") {
        Some(("kern.num_files", "kern.maxfiles"))
    } else if cfg!(any(target_os = "openbsd", target_os = "netbsd")) {
        Some(("kern.nfiles", "kern.maxfiles"))
    } else {
        None
    };

    let (cur_oid, max_oid) = match oids {
        Some(pair) => pair,
        None => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::Unsupported,
                "system fd probe unsupported on this OS",
            ));
        }
    };

    Ok(SystemFd { allocated: read(cur_oid)?, max: read(max_oid)? })
}
131+
132+
pub fn spawn_fd_monitor() {
133+
tokio::spawn(async move {
134+
let mut tick = interval(Duration::from_secs(30));
135+
tick.set_missed_tick_behavior(MissedTickBehavior::Skip);
136+
137+
loop {
138+
tick.tick().await;
139+
140+
// (1) the node's own fds
141+
match fd_usage() {
142+
Ok(u) => {
143+
if let Some(limit) = u.soft_limit {
144+
let (pct, left) = (u.ratio() * 100.0, limit.saturating_sub(u.open));
145+
if u.ratio() >= 0.95 {
146+
error!(
147+
scope = "process",
148+
open = u.open,
149+
limit,
150+
left,
151+
pct = format!("{pct:.1}%"),
152+
"node fd usage critical"
153+
);
154+
} else if u.ratio() >= 0.80 {
155+
warn!(
156+
scope = "process",
157+
open = u.open,
158+
limit,
159+
left,
160+
pct = format!("{pct:.1}%"),
161+
"node fd usage elevated"
162+
);
163+
}
164+
}
165+
}
166+
Err(e) => error!(error = %e, "process fd probe failed"),
167+
}
168+
169+
// (2) whole-machine fds are allowed 5 percentage points more leeway.
170+
match system_fd_usage() {
171+
Ok(s) => {
172+
let (pct, left) = (s.ratio() * 100.0, s.max.saturating_sub(s.allocated));
173+
if s.ratio() >= 0.90 {
174+
error!(
175+
scope = "system",
176+
allocated = s.allocated,
177+
max = s.max,
178+
left,
179+
pct = format!("{pct:.1}%"),
180+
"system-wide fd usage critical"
181+
);
182+
} else if s.ratio() >= 0.75 {
183+
warn!(
184+
scope = "system",
185+
allocated = s.allocated,
186+
max = s.max,
187+
left,
188+
pct = format!("{pct:.1}%"),
189+
"system-wide fd usage elevated"
190+
);
191+
}
192+
}
193+
Err(e) => error!(error = %e, "system fd probe failed"),
194+
}
195+
}
196+
});
197+
}

cli/src/helpers/mod.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ use log_writer::*;
2222
mod dynamic_format;
2323
use dynamic_format::*;
2424

25+
#[cfg(target_family = "unix")]
26+
mod fd_check;
27+
#[cfg(target_family = "unix")]
28+
pub use fd_check::*;
29+
2530
pub(crate) mod args;
2631

2732
pub mod logger;
@@ -47,16 +52,11 @@ pub fn check_open_files_limit(minimum: u64) {
4752
Ok((soft_limit, _)) => {
4853
// Check if requirements are met.
4954
if soft_limit < minimum {
50-
// Warn about too low limit.
51-
let warning = [
52-
format!("⚠️ The open files limit ({soft_limit}) for this process is lower than recommended."),
53-
format!(" • To ensure correct behavior of the node, please raise it to at least {minimum}."),
54-
" • See the `ulimit` command and `/etc/security/limits.conf` for more details.".to_owned(),
55-
]
56-
.join("\n")
57-
.yellow()
58-
.bold();
59-
eprintln!("{warning}\n");
55+
panic!(
56+
"The open files limit ({soft_limit}) for this process is too low. \
57+
Please raise it to at least {minimum} \
58+
See the `ulimit` command and `/etc/security/limits.conf` for more details.",
59+
);
6060
}
6161
}
6262
Err(err) => {

0 commit comments

Comments
 (0)