Skip to content

Commit 7b039d9

Browse files
network monitor: harden against external problems (#2239)
Co-authored-by: Mirko <48352201+Mirko-von-Leipzig@users.noreply.github.com>
1 parent cc5ecde commit 7b039d9

11 files changed

Lines changed: 566 additions & 169 deletions

File tree

bin/network-monitor/assets/index.css

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,7 @@ body {
297297
justify-content: center;
298298
transition: all 0.2s;
299299
color: var(--color-text-faint);
300+
flex-shrink: 0;
300301
}
301302

302303
.copy-button:hover {
@@ -429,12 +430,16 @@ body {
429430

430431
/* When the value carries an inline action (currently only the copy button), keep them on the
431432
same line. Without this `.metric-value` is inline, and the whitespace between the value text
432-
and the button is a wrap opportunity — so long values (URLs in particular) push the button
433-
onto its own row below. */
433+
and the button is a wrap opportunity, so long values (URLs in particular) push the button
434+
onto its own row below. `min-width: 0` plus `overflow-wrap: anywhere` let values with no
435+
natural break points (e.g. URLs without hyphens) wrap inside the card instead of overflowing
436+
it and dragging the button outside the box. */
434437
.metric-value:has(.copy-button) {
435438
display: inline-flex;
436439
align-items: center;
437440
gap: 4px;
441+
min-width: 0;
442+
overflow-wrap: anywhere;
438443
}
439444

440445
.metric-value.warning-delta,

bin/network-monitor/src/commands/start.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,7 @@ pub async fn start_monitor(config: MonitorConfig) -> Result<()> {
3535

3636
let rpc_rx = tasks.spawn_rpc_checker(&config);
3737

38-
let prover_rxs = if config.remote_prover_urls.is_empty() {
39-
Vec::new()
40-
} else {
41-
tasks.spawn_prover_tasks(&config).await
42-
};
38+
let prover_rxs = tasks.spawn_prover_tasks(&config);
4339

4440
let faucet_rx = config.faucet_url.is_some().then(|| tasks.spawn_faucet(&config));
4541

@@ -48,7 +44,7 @@ pub async fn start_monitor(config: MonitorConfig) -> Result<()> {
4844
let (ntx_increment_rx, ntx_tracking_rx) = if config.disable_ntx_service {
4945
(None, None)
5046
} else {
51-
let (increment_rx, tracking_rx) = tasks.spawn_ntx_service(&config).await?;
47+
let (increment_rx, tracking_rx) = tasks.spawn_ntx_service(&config);
5248
(Some(increment_rx), Some(tracking_rx))
5349
};
5450

bin/network-monitor/src/counter.rs

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,10 @@ pub struct IncrementService {
149149
}
150150

151151
impl IncrementService {
152+
/// Display name of the service, shared with the bootstrap seeding code in
153+
/// [`crate::monitor::tasks`].
154+
pub const NAME: &'static str = "Local Transactions";
155+
152156
pub async fn new(
153157
config: MonitorConfig,
154158
wallet_account: Account,
@@ -330,7 +334,7 @@ impl IncrementService {
330334

331335
impl Service for IncrementService {
332336
fn name(&self) -> &'static str {
333-
"Local Transactions"
337+
Self::NAME
334338
}
335339

336340
fn interval(&self) -> Duration {
@@ -413,6 +417,10 @@ pub struct CounterTrackingService {
413417
}
414418

415419
impl CounterTrackingService {
420+
/// Display name of the service, shared with the bootstrap seeding code in
421+
/// [`crate::monitor::tasks`].
422+
pub const NAME: &'static str = "Network Transactions";
423+
416424
pub async fn new(
417425
config: MonitorConfig,
418426
counter_receiver: watch::Receiver<Account>,
@@ -552,7 +560,7 @@ impl CounterTrackingService {
552560

553561
impl Service for CounterTrackingService {
554562
fn name(&self) -> &'static str {
555-
"Network Transactions"
563+
Self::NAME
556564
}
557565

558566
fn interval(&self) -> Duration {
@@ -664,15 +672,15 @@ fn build_increment_status(details: &IncrementDetails, last_error: Option<String>
664672
let service_details = ServiceDetails::NtxIncrement(details.clone());
665673

666674
if let Some(err) = last_error {
667-
ServiceStatus::unhealthy("Local Transactions", err, service_details)
675+
ServiceStatus::unhealthy(IncrementService::NAME, err, service_details)
668676
} else if details.success_count == 0 && details.failure_count > 0 {
669677
ServiceStatus::unhealthy(
670-
"Local Transactions",
678+
IncrementService::NAME,
671679
format!("no successful increments ({} failures)", details.failure_count),
672680
service_details,
673681
)
674682
} else {
675-
ServiceStatus::healthy("Local Transactions", service_details)
683+
ServiceStatus::healthy(IncrementService::NAME, service_details)
676684
}
677685
}
678686

@@ -694,7 +702,7 @@ fn build_tracking_status(
694702
let service_details = ServiceDetails::NtxTracking(details.clone());
695703

696704
if let Some(err) = last_error {
697-
return ServiceStatus::unhealthy("Network Transactions", err, service_details);
705+
return ServiceStatus::unhealthy(CounterTrackingService::NAME, err, service_details);
698706
}
699707

700708
if over_threshold_streak >= PENDING_UNHEALTHY_CONFIRMATION_POLLS {
@@ -703,13 +711,13 @@ fn build_tracking_status(
703711
"counter trailing expected by {pending} (> {threshold}) for {over_threshold_streak} \
704712
consecutive polls",
705713
);
706-
return ServiceStatus::unhealthy("Network Transactions", err, service_details);
714+
return ServiceStatus::unhealthy(CounterTrackingService::NAME, err, service_details);
707715
}
708716

709717
if details.current_value.is_some() {
710-
ServiceStatus::healthy("Network Transactions", service_details)
718+
ServiceStatus::healthy(CounterTrackingService::NAME, service_details)
711719
} else {
712-
ServiceStatus::unknown("Network Transactions", service_details)
720+
ServiceStatus::unknown(CounterTrackingService::NAME, service_details)
713721
}
714722
}
715723

bin/network-monitor/src/deploy/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,10 @@ pub mod wallet;
5050
///
5151
/// At startup the monitor may come up before the node's RPC endpoint is accepting connections, so
5252
/// the eager `connect()` (and the follow-up `get_block_header_by_number` request) is retried with
53-
/// exponential backoff instead of aborting the binary on the first refused connection. The schedule
54-
/// is bounded so a genuinely unreachable or misconfigured endpoint still surfaces as a fatal error
55-
/// rather than hanging forever.
53+
/// exponential backoff instead of failing on the first refused connection. The schedule is bounded
54+
/// so a single handshake attempt returns within a few minutes; callers that must survive a
55+
/// genuinely unreachable endpoint (e.g. the NTX bootstrap in `monitor::tasks`) wrap it in their
56+
/// own unbounded retry loop.
5657
const GENESIS_DISCOVERY_BACKOFF_INITIAL: Duration = Duration::from_secs(1);
5758
const GENESIS_DISCOVERY_BACKOFF_MAX: Duration = Duration::from_secs(30);
5859
const GENESIS_DISCOVERY_MAX_RETRIES: usize = 10;

bin/network-monitor/src/faucet.rs

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
//! This module contains the logic for periodically testing faucet functionality
44
//! by requesting proof-of-work challenges, solving them, and submitting token requests.
55
6-
use std::time::Duration;
6+
use std::time::{Duration, Instant};
77

88
use anyhow::Context;
99
use hex;
10+
use miden_node_utils::spawn::spawn_blocking_in_current_span;
1011
use reqwest::Client;
1112
use serde::{Deserialize, Serialize};
1213
use sha2::{Digest, Sha256};
@@ -88,6 +89,8 @@ pub struct FaucetService {
8889
url: Url,
8990
client: Client,
9091
interval: Duration,
92+
/// Wall-clock cap on solving a single `PoW` challenge.
93+
solve_timeout: Duration,
9194
/// A valid public account ID used as the recipient for faucet token requests. Generated once at
9295
/// construction from a throwaway wallet account; the minted tokens are never spent.
9396
account_id: String,
@@ -109,6 +112,7 @@ impl FaucetService {
109112
url,
110113
client,
111114
interval,
115+
solve_timeout: request_timeout,
112116
account_id: wallet_account.id().to_string(),
113117
success_count: 0,
114118
failure_count: 0,
@@ -145,7 +149,9 @@ impl Service for FaucetService {
145149
let start_time = std::time::Instant::now();
146150
let mut last_error: Option<String> = None;
147151

148-
match perform_faucet_test(&self.client, &self.url, &self.account_id).await {
152+
match perform_faucet_test(&self.client, &self.url, &self.account_id, self.solve_timeout)
153+
.await
154+
{
149155
Ok((minted_tokens, metadata)) => {
150156
self.success_count += 1;
151157
self.last_tx_id = Some(minted_tokens.tx_id.clone());
@@ -198,6 +204,7 @@ async fn perform_faucet_test(
198204
client: &Client,
199205
faucet_url: &Url,
200206
account_id: &str,
207+
solve_timeout: Duration,
201208
) -> anyhow::Result<(GetTokensResponse, GetMetadataResponse)> {
202209
debug!("Using recipient account ID: {} (length: {})", account_id, account_id.len());
203210

@@ -210,7 +217,7 @@ async fn perform_faucet_test(
210217

211218
let response = client.get(pow_url).send().await?;
212219

213-
let response_text: String = response.text().await?;
220+
let response_text = read_success_body(response).await.context("/pow request failed")?;
214221
debug!("Faucet PoW response: {}", response_text);
215222

216223
let challenge_response: PowChallengeResponse =
@@ -222,9 +229,16 @@ async fn perform_faucet_test(
222229
&challenge_response.challenge[..16.min(challenge_response.challenge.len())]
223230
);
224231

225-
// Step 2: Solve the PoW challenge
226-
let nonce = solve_pow_challenge(&challenge_response.challenge, challenge_response.target)
227-
.context("Failed to solve PoW challenge")?;
232+
// Step 2: Solve the PoW challenge off the async runtime; hashing is CPU-bound and would
233+
// otherwise stall every other checker task scheduled on this worker thread.
234+
let challenge = challenge_response.challenge.clone();
235+
let target = challenge_response.target;
236+
let nonce = spawn_blocking_in_current_span(move || {
237+
solve_pow_challenge(&challenge, target, solve_timeout)
238+
})
239+
.await
240+
.context("PoW solver task panicked")?
241+
.context("Failed to solve PoW challenge")?;
228242

229243
debug!("Solved PoW challenge with nonce: {}", nonce);
230244

@@ -240,7 +254,7 @@ async fn perform_faucet_test(
240254

241255
let response = client.get(tokens_url).send().await?;
242256

243-
let response_text: String = response.text().await?;
257+
let response_text = read_success_body(response).await.context("/get_tokens request failed")?;
244258
debug!("Faucet /get_tokens response: {}", response_text);
245259

246260
let tokens_response: GetTokensResponse =
@@ -251,7 +265,8 @@ async fn perform_faucet_test(
251265

252266
let response = client.get(metadata_url).send().await?;
253267

254-
let response_text = response.text().await?;
268+
let response_text =
269+
read_success_body(response).await.context("/get_metadata request failed")?;
255270
debug!("Faucet /get_metadata response: {}", response_text);
256271

257272
let metadata: GetMetadataResponse =
@@ -260,6 +275,16 @@ async fn perform_faucet_test(
260275
Ok((tokens_response, metadata))
261276
}
262277

278+
/// Reads the response body, failing with the HTTP status code and body when the request was not
279+
/// successful, so server-side errors (e.g. 429 or 500) surface directly on the card instead of as a
280+
/// deserialization failure.
281+
async fn read_success_body(response: reqwest::Response) -> anyhow::Result<String> {
282+
let status = response.status();
283+
let body = response.text().await?;
284+
anyhow::ensure!(status.is_success(), "HTTP {status}: {body}");
285+
Ok(body)
286+
}
287+
263288
/// Deserialize a faucet response using [`serde_path_to_error`] so that the failing JSON path (e.g.
264289
/// `max_supply`, `explorer_url`) is included in the error message. Combined with
265290
/// `#[serde(deny_unknown_fields)]` on each response type, this means renamed, removed, or newly
@@ -274,15 +299,19 @@ where
274299

275300
/// Solves a proof-of-work challenge using SHA-256 hashing.
276301
///
302+
/// This is CPU-bound and must run on a blocking thread (see the `spawn_blocking` call site).
303+
///
277304
/// # Arguments
278305
///
279306
/// * `challenge` - The challenge string in hexadecimal format.
280307
/// * `target` - The target value. A solution is valid if H(challenge, nonce) < target.
308+
/// * `timeout` - Wall-clock cap; checked every 100k attempts so a pathological difficulty cannot
309+
/// pin the blocking thread indefinitely.
281310
///
282311
/// # Returns
283312
///
284-
/// The nonce that solves the challenge, or an error if no solution is found within reasonable
285-
/// bounds.
313+
/// The nonce that solves the challenge, or an error if no solution is found within the attempt
314+
/// and time bounds.
286315
#[instrument(
287316
parent = None,
288317
target = COMPONENT,
@@ -292,8 +321,9 @@ where
292321
ret(level = "debug"),
293322
err
294323
)]
295-
fn solve_pow_challenge(challenge: &str, target: u64) -> anyhow::Result<u64> {
324+
fn solve_pow_challenge(challenge: &str, target: u64, timeout: Duration) -> anyhow::Result<u64> {
296325
let challenge_bytes = hex::decode(challenge).context("Failed to decode challenge from hex")?;
326+
let started = Instant::now();
297327

298328
// Try up to 100 million nonces.
299329
for nonce in 0..MAX_CHALLENGE_ATTEMPTS {
@@ -316,8 +346,15 @@ fn solve_pow_challenge(challenge: &str, target: u64) -> anyhow::Result<u64> {
316346
return Ok(nonce);
317347
}
318348

319-
// Log progress every 100k attempts
349+
// Check the deadline and log progress every 100k attempts
320350
if nonce % 100_000 == 0 && nonce > 0 {
351+
let elapsed = started.elapsed();
352+
if elapsed >= timeout {
353+
anyhow::bail!(
354+
"Failed to solve PoW challenge within {timeout:?} ({nonce} attempts, target \
355+
{target})"
356+
);
357+
}
321358
debug!(
322359
"PoW attempt {}: current_hash={}, target={} (~{} bits)",
323360
nonce,

0 commit comments

Comments
 (0)