Skip to content

Commit a1dee67

Browse files
committed
TQ: Integrate rack secret load/clear into Node API
Also add cluster test and minor diff support for tqdb. I actually found 1 real bug and 1 test bug with this PR and were able to diagnose via tqdb, which was awesome. The failing proptests seeds are included.
1 parent 8cde6ef commit a1dee67

File tree

7 files changed

+249
-23
lines changed

7 files changed

+249
-23
lines changed

trust-quorum/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ pub use configuration::Configuration;
3131
pub use coordinator_state::{
3232
CoordinatorOperation, CoordinatorState, CoordinatorStateDiff,
3333
};
34+
pub use rack_secret_loader::{LoadRackSecretError, RackSecretLoaderDiff};
3435
pub use validators::ValidatedReconfigureMsgDiff;
3536
mod alarm;
3637

trust-quorum/src/node.rs

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
//! Node, and so this should not be problematic.
1717
1818
use crate::compute_key_share::KeyShareComputer;
19+
use crate::crypto::ReconstructedRackSecret;
20+
use crate::rack_secret_loader::{
21+
LoadRackSecretError, RackSecretLoader, RackSecretLoaderDiff,
22+
};
1923
use crate::validators::{
2024
MismatchedRackIdError, ReconfigurationError, ValidatedReconfigureMsg,
2125
};
@@ -44,18 +48,26 @@ pub struct Node {
4448
/// In memory state for when this node is trying to compute its own key
4549
/// share for a committed epoch.
4650
key_share_computer: Option<KeyShareComputer>,
51+
52+
/// A mechanism for loading rack secrets by collecting key shares
53+
/// for the latest committed epoch.
54+
rack_secret_loader: RackSecretLoader,
4755
}
4856

4957
// For diffs we want to allow access to all fields, but not make them public in
5058
// the `Node` type itself.
51-
impl NodeDiff<'_> {
59+
impl<'a> NodeDiff<'a> {
5260
pub fn coordinator_state(&self) -> Leaf<Option<&CoordinatorState>> {
5361
self.coordinator_state
5462
}
5563

5664
pub fn key_share_computer(&self) -> Leaf<Option<&KeyShareComputer>> {
5765
self.key_share_computer
5866
}
67+
68+
pub fn rack_secret_loader(&self) -> &RackSecretLoaderDiff<'a> {
69+
&self.rack_secret_loader
70+
}
5971
}
6072

6173
#[cfg(feature = "danger_partial_eq_ct_wrapper")]
@@ -74,7 +86,32 @@ impl Node {
7486
let id_str = format!("{:?}", ctx.platform_id());
7587
let log =
7688
log.new(o!("component" => "trust-quorum", "platform_id" => id_str));
77-
Node { log, coordinator_state: None, key_share_computer: None }
89+
let rack_secret_loader = RackSecretLoader::new(&log);
90+
Node {
91+
log,
92+
coordinator_state: None,
93+
key_share_computer: None,
94+
rack_secret_loader,
95+
}
96+
}
97+
98+
/// Attempt to load a rack secret at the given epoch.
99+
///
100+
/// If no secrets are loaded the node will start collecting shares for the
101+
/// latest committed epoch and return `Ok(None)`. `Ok(None)` will continue
102+
/// to be returned while share collection is in progress. The secret will
103+
/// be returned on the next call after it becomes available.
104+
pub fn load_rack_secret(
105+
&mut self,
106+
ctx: &mut impl NodeHandlerCtx,
107+
epoch: Epoch,
108+
) -> Result<Option<ReconstructedRackSecret>, LoadRackSecretError> {
109+
self.rack_secret_loader.load(ctx, epoch)
110+
}
111+
112+
/// Clear all loaded rack secrets cached in memory
113+
pub fn clear_secrets(&mut self) {
114+
self.rack_secret_loader.clear_secrets();
78115
}
79116

80117
/// Start coordinating a reconfiguration
@@ -126,6 +163,10 @@ impl Node {
126163
self.key_share_computer.is_some()
127164
}
128165

166+
pub fn is_collecting_shares_for_rack_secret(&self, epoch: Epoch) -> bool {
167+
self.rack_secret_loader.is_collecting_shares_for_rack_secret(epoch)
168+
}
169+
129170
/// Commit a configuration
130171
///
131172
/// This is triggered by a message from Nexus for each node in the
@@ -659,22 +700,17 @@ impl Node {
659700
share: Share,
660701
) {
661702
if let Some(cs) = &mut self.coordinator_state {
662-
cs.handle_share(ctx, from, epoch, share);
703+
cs.handle_share(ctx, from.clone(), epoch, share.clone());
663704
} else if let Some(ksc) = &mut self.key_share_computer {
664-
if ksc.handle_share(ctx, from, epoch, share) {
705+
if ksc.handle_share(ctx, from.clone(), epoch, share.clone()) {
665706
// We're have completed computing our share and saved it to
666707
// our persistent state. We have also marked the configuration
667708
// committed.
668709
self.key_share_computer = None;
669710
}
670-
} else {
671-
warn!(
672-
self.log,
673-
"Received share when not coordinating or computing share";
674-
"from" => %from,
675-
"epoch" => %epoch
676-
);
677711
}
712+
713+
self.rack_secret_loader.handle_share(ctx, from, epoch, share);
678714
}
679715

680716
fn handle_prepare(

trust-quorum/src/rack_secret_loader.rs

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use crate::{
1212
Alarm, Configuration, Epoch, NodeHandlerCtx, PeerMsgKind, PlatformId,
1313
RackSecret, Share,
1414
};
15+
use daft::{BTreeMapDiff, Diffable, Leaf};
1516
use slog::{Logger, error, info, o};
1617

1718
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
@@ -27,22 +28,45 @@ pub enum LoadRackSecretError {
2728
}
2829

2930
/// Manage retrieval of key shares to load various rack secrets
30-
#[derive(Debug, Clone)]
31+
#[derive(Debug, Clone, Diffable)]
3132
pub struct RackSecretLoader {
33+
#[daft(ignore)]
3234
log: Logger,
3335
loaded: BTreeMap<Epoch, ReconstructedRackSecret>,
3436
// We can only collect shares for the latest committed epoch. We then derive
3537
// a key from the computed rack secret to decrypt rack secrets for prior
3638
// configurations.
39+
#[daft(leaf)]
3740
collector: Option<ShareCollector>,
3841
}
3942

43+
impl<'daft> RackSecretLoaderDiff<'daft> {
44+
pub fn loaded(
45+
&self,
46+
) -> &BTreeMapDiff<'daft, Epoch, ReconstructedRackSecret> {
47+
&self.loaded
48+
}
49+
50+
pub fn collector(&self) -> Leaf<&'daft Option<ShareCollector>> {
51+
self.collector
52+
}
53+
}
54+
4055
impl RackSecretLoader {
4156
pub fn new(log: &Logger) -> RackSecretLoader {
4257
let log = log.new(o!("component" => "tq-rack-secret-loader"));
4358
RackSecretLoader { log, loaded: BTreeMap::new(), collector: None }
4459
}
4560

61+
pub fn is_collecting_shares_for_rack_secret(&self, epoch: Epoch) -> bool {
62+
let Some(c) = &self.collector else {
63+
return false;
64+
};
65+
// We collect for the latest committed epoch which must be greater than
66+
// or equal to the epoch for the rack secret we are interested in.
67+
c.config.epoch >= epoch
68+
}
69+
4670
pub fn load(
4771
&mut self,
4872
ctx: &mut impl NodeHandlerCtx,
@@ -60,17 +84,20 @@ impl RackSecretLoader {
6084
return Err(LoadRackSecretError::NoCommittedConfigurations);
6185
};
6286

87+
if epoch > latest_committed_epoch {
88+
return Err(LoadRackSecretError::NotCommitted(epoch));
89+
}
90+
6391
// If we have loaded the latest committed epoch, then we have loaded all
6492
// possible rack secrets. Secrets for prior epochs are unavailable.
6593
if self.loaded.contains_key(&latest_committed_epoch) {
6694
if epoch < latest_committed_epoch {
6795
return Err(LoadRackSecretError::NotAvailable(epoch));
68-
} else if epoch > latest_committed_epoch {
69-
return Err(LoadRackSecretError::NotCommitted(epoch));
7096
} else {
7197
unreachable!(
72-
"already would have returned rack secret for latest \
73-
committed epoch ({epoch}) if requested"
98+
"epoch comparisons for epoch({epoch}) \
99+
<= latest_committed_epoch({latest_committed_epoch}) \
100+
already handled"
74101
);
75102
}
76103
}
@@ -107,7 +134,7 @@ impl RackSecretLoader {
107134
latest_committed_epoch,
108135
collecting_epoch: collector.config.epoch,
109136
});
110-
return Err(LoadRackSecretError::Alarm);
137+
Err(LoadRackSecretError::Alarm)
111138
}
112139
}
113140
}
@@ -154,14 +181,35 @@ impl RackSecretLoader {
154181
}
155182
}
156183

157-
#[derive(Debug, Clone)]
158-
struct ShareCollector {
184+
// Pub only for use in daft
185+
#[derive(Debug, Clone, Diffable)]
186+
pub struct ShareCollector {
187+
#[daft(ignore)]
159188
log: Logger,
160189
// A copy of the configuration stored in persistent state
190+
#[daft(leaf)]
161191
config: Configuration,
162192
shares: BTreeMap<PlatformId, Share>,
163193
}
164194

195+
impl PartialEq for ShareCollector {
196+
fn eq(&self, other: &Self) -> bool {
197+
self.config == other.config && self.shares == other.shares
198+
}
199+
}
200+
201+
impl Eq for ShareCollector {}
202+
203+
impl<'daft> ShareCollectorDiff<'daft> {
204+
pub fn config(&self) -> Leaf<&'daft Configuration> {
205+
self.config
206+
}
207+
208+
pub fn shares(&self) -> &BTreeMapDiff<'daft, PlatformId, Share> {
209+
&self.shares
210+
}
211+
}
212+
165213
impl ShareCollector {
166214
pub fn new(
167215
log: &Logger,

trust-quorum/test-utils/src/event.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@ pub enum Event {
2727
/// Since replay is deterministic, we actually know what this value is,
2828
/// even though a prior event may not have yet sent the message.
2929
DeliverEnvelope(Envelope),
30+
LoadRackSecret(PlatformId, Epoch),
31+
ClearSecrets(PlatformId),
3032
/// Pull a `NexusReply` off the underlay network and update the `NexusState`
3133
DeliverNexusReply(NexusReply),
3234
CommitConfiguration(PlatformId),
@@ -44,6 +46,8 @@ impl Event {
4446
Self::SendNexusReplyOnUnderlay(_) => vec![],
4547
Self::DeliverEnvelope(envelope) => vec![envelope.to.clone()],
4648
Self::DeliverNexusReply(_) => vec![],
49+
Self::LoadRackSecret(id, _) => vec![id.clone()],
50+
Self::ClearSecrets(id) => vec![id.clone()],
4751
Self::CommitConfiguration(id) => vec![id.clone()],
4852
Self::Reconfigure(_) => vec![],
4953
}

trust-quorum/test-utils/src/state.rs

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ use std::collections::{BTreeMap, BTreeSet};
1515
use std::fmt::Display;
1616
use trust_quorum::{
1717
Configuration, CoordinatorOperation, CoordinatorStateDiff, Envelope, Epoch,
18-
Node, NodeCallerCtx, NodeCommonCtx, NodeCtx, NodeCtxDiff, NodeDiff,
19-
PeerMsgKind, PlatformId, ValidatedReconfigureMsgDiff,
18+
LoadRackSecretError, Node, NodeCallerCtx, NodeCommonCtx, NodeCtx,
19+
NodeCtxDiff, NodeDiff, PeerMsgKind, PlatformId,
20+
ValidatedReconfigureMsgDiff,
2021
};
2122

2223
// The state of our entire system including the system under test and
@@ -204,6 +205,12 @@ impl TqState {
204205
Event::DeliverEnvelope(envelope) => {
205206
self.apply_event_deliver_envelope(envelope);
206207
}
208+
Event::LoadRackSecret(id, epoch) => {
209+
self.apply_event_load_rack_secret(id, epoch);
210+
}
211+
Event::ClearSecrets(id) => {
212+
self.apply_event_clear_secrets(id);
213+
}
207214
Event::DeliverNexusReply(reply) => {
208215
self.apply_event_deliver_nexus_reply(reply);
209216
}
@@ -258,8 +265,7 @@ impl TqState {
258265
fn apply_event_commit(&mut self, id: PlatformId) {
259266
let rack_id = self.nexus.rack_id;
260267
let latest_config = self.nexus.latest_config();
261-
let (node, ctx) =
262-
self.sut.nodes.get_mut(&id).expect("destination exists");
268+
let (node, ctx) = self.sut.nodes.get_mut(&id).expect("node exists");
263269
node.commit_configuration(ctx, rack_id, latest_config.epoch)
264270
.expect("commit succeeded");
265271

@@ -269,6 +275,54 @@ impl TqState {
269275
});
270276
}
271277

278+
fn apply_event_load_rack_secret(&mut self, id: PlatformId, epoch: Epoch) {
279+
let (node, ctx) = self.sut.nodes.get_mut(&id).expect("node exists");
280+
281+
// Postcondition checks
282+
match node.load_rack_secret(ctx, epoch) {
283+
Ok(None) => {
284+
assert!(node.is_collecting_shares_for_rack_secret(epoch));
285+
}
286+
Ok(Some(_)) => {
287+
// We may be collecting for a later epoch, but haven't thrown
288+
// out the old secret, so we don't check if we are collecting as
289+
// in the `Ok(None)` clause above.
290+
291+
// If we can load a rack secret then we have either committed
292+
// for this epoch or a later epoch.
293+
assert!(
294+
ctx.persistent_state()
295+
.latest_committed_epoch()
296+
.expect("at least one committed epoch")
297+
>= epoch
298+
);
299+
}
300+
Err(LoadRackSecretError::NoCommittedConfigurations) => {
301+
assert!(ctx.persistent_state().is_uninitialized());
302+
}
303+
Err(LoadRackSecretError::NotCommitted(epoch)) => {
304+
assert!(!ctx.persistent_state().commits.contains(&epoch));
305+
}
306+
Err(LoadRackSecretError::Alarm) => {
307+
// We should not see any alarms in this test
308+
panic!("alarm seen");
309+
}
310+
Err(LoadRackSecretError::NotAvailable(_)) => {
311+
assert!(
312+
ctx.persistent_state()
313+
.latest_committed_epoch()
314+
.expect("latest committed epoch exists")
315+
> epoch
316+
);
317+
}
318+
}
319+
}
320+
321+
fn apply_event_clear_secrets(&mut self, id: PlatformId) {
322+
let (node, _) = self.sut.nodes.get_mut(&id).expect("node exists");
323+
node.clear_secrets();
324+
}
325+
272326
fn apply_event_send_nexus_reply_on_underlay(&mut self, reply: NexusReply) {
273327
self.underlay_network.push(reply);
274328
}
@@ -802,6 +856,19 @@ fn display_node_diff(
802856
}
803857
}
804858

859+
if node_diff.rack_secret_loader().collector().is_modified() {
860+
// It's too tedious to do the diff work here right now.
861+
writeln!(f, " Rack secret collector changed")?;
862+
}
863+
if !node_diff.rack_secret_loader().loaded().added.is_empty() {
864+
// It's too tedious to do the diff work here right now.
865+
writeln!(f, " Rack secrets loaded")?;
866+
}
867+
if !node_diff.rack_secret_loader().loaded().removed.is_empty() {
868+
// It's too tedious to do the diff work here right now.
869+
writeln!(f, " Rack secrets cleared")?;
870+
}
871+
805872
Ok(())
806873
}
807874

trust-quorum/tests/cluster.proptest-regressions

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)