
Commit 7a7d7ca

TQ: Support persisting state to ledger (#9310)

Builds on #9296. This commit persists state to a ledger, following the pattern used in the bootstore. It's done this way because the `PersistentState` itself is contained in the sans-io layer, but we must save it in the async task layer. The sans-io layer shouldn't know how the state is persisted, just that it is, so we recreate the ledger every time we write it. A follow-up PR will deal with the early networking information saved by the bootstore, and will be very similar.

1 parent a6e111c commit 7a7d7ca
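
The "recreate the ledger on every write" pattern mentioned above is small enough to sketch up front. This is a rough illustration only, assuming the `Ledger`/`Ledgerable` API from `omicron_common` exactly as it appears later in this diff; `persist` is a made-up name:

    // Sketch: a fresh Ledger is constructed for each write, so only the
    // async task layer ever touches storage; the sans-io layer just hands
    // over the data to persist.
    use camino::Utf8PathBuf;
    use omicron_common::ledger::{Ledger, Ledgerable};
    use slog::Logger;

    async fn persist<T: Ledgerable>(
        log: &Logger,
        paths: Vec<Utf8PathBuf>,
        data: T,
    ) {
        // `Ledger::new_with` takes the payload by value; nothing is
        // cached between writes.
        let mut ledger = Ledger::new_with(log, paths, data);
        // `commit` bumps the generation (via `Ledgerable::generation_bump`)
        // and writes the serialized payload out to the given paths.
        ledger.commit().await.expect("failed to persist state");
    }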

File tree

6 files changed (+374, -31)

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

trust-quorum/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ gfss.workspace = true
 hex.workspace = true
 hkdf.workspace = true
 iddqd.workspace = true
+omicron-common.workspace = true
 omicron-uuid-kinds.workspace = true
 rand = { workspace = true, features = ["os_rng"] }
 secrecy.workspace = true

trust-quorum/src/connection_manager.rs

Lines changed: 73 additions & 9 deletions
@@ -404,7 +404,7 @@ impl ConnMgr {
                 self.on_task_exit(task_id).await;
             }
             Err(err) => {
-                error!(self.log, "Connection task panic: {err}");
+                warn!(self.log, "Connection task panic: {err}");
                 self.on_task_exit(err.id()).await;
             }

@@ -482,7 +482,19 @@ impl ConnMgr {
             tx,
             conn_type: ConnectionType::Accepted(addr),
         };
-        assert!(self.accepting.insert_unique(task_handle).is_ok());
+        let replaced = self.accepting.insert_overwrite(task_handle);
+        for h in replaced {
+            // We accepted a connection from the same `SocketAddrV6` before the
+            // old one was torn down. This should be rare, if not impossible.
+            warn!(
+                self.log,
+                "Accepted connection replaced. Aborting old task.";
+                "task_id" => ?h.task_id(),
+                "peer_addr" => %h.addr(),
+            );
+            h.abort();
+        }
+
         Ok(())
     }

@@ -501,12 +513,38 @@ impl ConnMgr {
                 "peer_id" => %peer_id
             );

-            let already_established = self.established.insert_unique(
+            let replaced = self.established.insert_overwrite(
                 EstablishedTaskHandle::new(peer_id, task_handle),
             );
-            assert!(already_established.is_ok());
+
+            // The only reason for established connections to be replaced
+            // like this is when the IP address for a peer changes, but the
+            // previous connection has not yet been torn down.
+            //
+            // Tear down usually happens quickly due to TCP reset or missed
+            // pings. However, if the new IP address is fed into the task via
+            // `load_peer_addresses` and the peer at that address connects
+            // before the old connection is torn down, you end up in this
+            // situation.
+            //
+            // This isn't really possible, except in tests where we change port
+            // numbers when simulating crash and restart of nodes, and do this
+            // very quickly. We change port numbers because `NodeTask`s listen
+            // on port 0 and use ephemeral ports to prevent collisions in tests
+            // where the IP address is localhost.
+            for h in replaced {
+                warn!(
+                    self.log,
+                    "Established connection replaced. Aborting old task.";
+                    "task_id" => ?h.task_id(),
+                    "peer_addr" => %h.addr(),
+                    "peer_id" => %h.baseboard_id
+                );
+                h.abort();
+            }
         } else {
-            error!(self.log, "Server handshake completed, but no server addr in map";
+            warn!(self.log,
+                "Server handshake completed, but no server addr in map";
                 "task_id" => ?task_id,
                 "peer_addr" => %addr,
                 "peer_id" => %peer_id
@@ -528,12 +566,38 @@ impl ConnMgr {
                 "peer_addr" => %addr,
                 "peer_id" => %peer_id
             );
-            let already_established = self.established.insert_unique(
+            let replaced = self.established.insert_overwrite(
                 EstablishedTaskHandle::new(peer_id, task_handle),
            );
-            assert!(already_established.is_ok());
+
+            // The only reason for established connections to be replaced
+            // like this is when the IP address for a peer changes, but the
+            // previous connection has not yet been torn down.
+            //
+            // Tear down usually happens quickly due to TCP reset or missed
+            // pings. However, if the new IP address is fed into the task via
+            // `load_peer_addresses` and the peer at that address connects
+            // before the old connection is torn down, you end up in this
+            // situation.
+            //
+            // This isn't really possible, except in tests where we change port
+            // numbers when simulating crash and restart of nodes, and do this
+            // very quickly. We change port numbers because `NodeTask`s listen
+            // on port 0 and use ephemeral ports to prevent collisions in tests
+            // where the IP address is localhost.
+            for h in replaced {
+                warn!(
+                    self.log,
+                    "Established connection replaced. Aborting old task.";
+                    "task_id" => ?h.task_id(),
+                    "peer_addr" => %h.addr(),
+                    "peer_id" => %h.baseboard_id
+                );
+                h.abort();
+            }
         } else {
-            error!(self.log, "Client handshake completed, but no client addr in map";
+            warn!(self.log,
+                "Client handshake completed, but no client addr in map";
                 "task_id" => ?task_id,
                 "peer_addr" => %addr,
                 "peer_id" => %peer_id
@@ -634,7 +698,7 @@ impl ConnMgr {
         disconnected_peers
     }

-    /// Spawn a task to estalbish a sprockets connection for the given address
+    /// Spawn a task to establish a sprockets connection for the given address
     async fn connect_client(
         &mut self,
         corpus: Vec<Utf8PathBuf>,
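
The replacement logic in both handshake paths above depends on `insert_overwrite` returning the displaced handles so they can be aborted. The invariant (at most one live connection task per peer) can be illustrated with a plain `HashMap` and tokio; this is a simplified sketch, not the iddqd-backed maps the commit actually uses:

    use std::collections::HashMap;
    use tokio::task::JoinHandle;

    /// At most one live connection task per peer: inserting a new handle
    /// aborts whatever task it displaces, mirroring the
    /// `insert_overwrite` + `abort` pattern in the diff.
    struct Established {
        // Keyed by a peer identifier (the real code keys on a baseboard id).
        tasks: HashMap<String, JoinHandle<()>>,
    }

    impl Established {
        fn insert(&mut self, peer_id: String, handle: JoinHandle<()>) {
            if let Some(old) = self.tasks.insert(peer_id, handle) {
                // The peer reconnected (e.g. from a new address) before its
                // old connection was torn down; abort the stale task.
                old.abort();
            }
        }
    }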

trust-quorum/src/ledgers.rs

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+//! Persistent storage for the trust quorum task
+//!
+//! We write two pieces of data to M.2 devices in production via
+//! [`omicron_common::ledger::Ledger`]:
+//!
+//! 1. [`trust_quorum_protocol::PersistentState`] for trust quorum state
+//! 2. A network config blob required for pre-rack-unlock configuration
+
+use camino::Utf8PathBuf;
+use omicron_common::ledger::{Ledger, Ledgerable};
+use serde::{Deserialize, Serialize};
+use slog::{Logger, info};
+use trust_quorum_protocol::PersistentState;
+
+/// A wrapper type around [`PersistentState`] for use as a [`Ledger`]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct PersistentStateLedger {
+    pub generation: u64,
+    pub state: PersistentState,
+}
+
+impl Ledgerable for PersistentStateLedger {
+    fn is_newer_than(&self, other: &Self) -> bool {
+        self.generation > other.generation
+    }
+
+    fn generation_bump(&mut self) {
+        self.generation += 1;
+    }
+}
+
+impl PersistentStateLedger {
+    /// Save the persistent state to a ledger and return the new generation
+    /// number.
+    ///
+    /// Panics if the ledger cannot be saved.
+    ///
+    /// The trust quorum protocol relies on persisting state to disk, such
+    /// as whether a node has prepared or committed a configuration, before
+    /// responding to a coordinator node or Nexus. This is necessary to
+    /// ensure that enough nodes have actually performed an operation, and
+    /// that the overall state of the protocol does not go backward if a
+    /// node crashes and restarts. In this manner, trust quorum is similar
+    /// to consensus protocols like Raft and Paxos.
+    ///
+    /// If for any reason we cannot persist trust quorum state to the ledger,
+    /// we must panic to ensure that the node does not take any further
+    /// action incorrectly, like acknowledging a `Prepare` to a coordinator.
+    /// Panicking is the simplest mechanism to ensure that a given node will
+    /// not violate the invariants of the trust quorum protocol in the case
+    /// of internal disk failures. It also ensures a very obvious failure
+    /// that will allow support to get involved and replace internal disks.
+    pub async fn save(
+        log: &Logger,
+        paths: Vec<Utf8PathBuf>,
+        generation: u64,
+        state: PersistentState,
+    ) -> u64 {
+        let persistent_state = PersistentStateLedger { generation, state };
+        let mut ledger = Ledger::new_with(log, paths, persistent_state);
+        ledger
+            .commit()
+            .await
+            .expect("Critical: Failed to save ledger for persistent state");
+        ledger.data().generation
+    }
+
+    /// Return `Some(PersistentStateLedger)` if it exists on disk; otherwise
+    /// return `None`.
+    pub async fn load(
+        log: &Logger,
+        paths: Vec<Utf8PathBuf>,
+    ) -> Option<PersistentStateLedger> {
+        let Some(ledger) =
+            Ledger::<PersistentStateLedger>::new(log, paths).await
+        else {
+            return None;
+        };
+        let persistent_state = ledger.into_inner();
+        info!(
+            log,
+            "Loaded persistent state from ledger with generation {}",
+            persistent_state.generation
+        );
+        Some(persistent_state)
+    }
+}
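
A plausible call pattern for `load` and `save` from the async task layer, assuming the imports from ledgers.rs above; only `load` and `save` come from this commit, everything else here is illustrative:

    // Hypothetical wiring in the task layer.
    async fn init_and_persist(
        log: &Logger,
        paths: Vec<Utf8PathBuf>,
        new_state: PersistentState,
    ) {
        // On startup, recover the last committed state (if any) and resume
        // from its generation so `is_newer_than` keeps writes ordered.
        let mut generation = 0;
        if let Some(prev) =
            PersistentStateLedger::load(log, paths.clone()).await
        {
            generation = prev.generation;
            // ... hand prev.state back to the sans-io layer here ...
        }

        // Later: before acknowledging a `Prepare` (or anything else) to a
        // coordinator or Nexus, make the state durable. `save` panics on
        // failure, so execution past this point implies durability.
        generation =
            PersistentStateLedger::save(log, paths, generation, new_state)
                .await;
        let _ = generation; // carried forward into the next save
    }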

trust-quorum/src/lib.rs

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@

 mod connection_manager;
 pub(crate) mod established_conn;
+mod ledgers;
 mod task;

 pub(crate) use connection_manager::{
