From 0a63230dd9ab3416c1e479fec78bc4d35a9e6e7b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 9 May 2025 12:38:36 +0100 Subject: [PATCH 001/197] Add tla spec --- tla/disaster-recovery/.gitignore | 3 + tla/disaster-recovery/autoopen.cfg | 10 +++ tla/disaster-recovery/autoopen.tla | 117 +++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+) create mode 100644 tla/disaster-recovery/.gitignore create mode 100644 tla/disaster-recovery/autoopen.cfg create mode 100644 tla/disaster-recovery/autoopen.tla diff --git a/tla/disaster-recovery/.gitignore b/tla/disaster-recovery/.gitignore new file mode 100644 index 000000000000..f410e7eb4643 --- /dev/null +++ b/tla/disaster-recovery/.gitignore @@ -0,0 +1,3 @@ +.envrc +states + diff --git a/tla/disaster-recovery/autoopen.cfg b/tla/disaster-recovery/autoopen.cfg new file mode 100644 index 000000000000..93ff374f0eff --- /dev/null +++ b/tla/disaster-recovery/autoopen.cfg @@ -0,0 +1,10 @@ +SPECIFICATION + Spec + +INVARIANTS + TypeOk + InvNoTimeoutNoFork + InvNoTimeoutNoDeadlock + +CONSTANTS + NID = {0, 1, 2, 3, 4} \ No newline at end of file diff --git a/tla/disaster-recovery/autoopen.tla b/tla/disaster-recovery/autoopen.tla new file mode 100644 index 000000000000..fcd4a6a2be44 --- /dev/null +++ b/tla/disaster-recovery/autoopen.tla @@ -0,0 +1,117 @@ +---- MODULE autoopen ---- + +EXTENDS Integers, Sequences, FiniteSets + +CONSTANTS + NID + +MAJ_QUORUM_LIMIT == (Cardinality(NID) + 1) \div 2 + +VARIABLES + next_step, + txids, + gossip_msgs, + vote_msgs, + open_msgs + +vars == <> + +TypeOk == + /\ next_step \in [NID -> {"gossip", "vote", "open/join", "open", "join"}] + /\ txids \in [NID -> Nat] + /\ gossip_msgs \subseteq [ + src : NID, + txid : Nat + ] + /\ vote_msgs \subseteq [ + src : NID, + vote : NID, + kind : {"quorum", "timeout"} + ] + /\ open_msgs \subseteq [ + src : NID + ] + +Init == + /\ next_step = [n \in NID |-> "gossip"] + /\ txids = [n \in NID |-> n] + /\ gossip_msgs = {} + /\ vote_msgs = {} + /\ open_msgs = 
{} + +ActionSendGossip(n) == + /\ next_step[n] = "gossip" + /\ next_step' = [next_step EXCEPT ![n] = "vote"] + /\ gossip_msgs' = gossip_msgs \cup {[src |-> n, txid |-> txids[n]]} + /\ UNCHANGED << txids, vote_msgs, open_msgs >> + +Vote(n, gossips, kind) == + LET max_txid_gossip == + CHOOSE g \in gossips: + \A g1 \in gossips: g.txid >= g1.txid + vote == [src |-> n, vote |-> max_txid_gossip.src, kind |-> kind] + IN + /\ next_step[n] = "vote" + /\ next_step' = [next_step EXCEPT ![n] = "open/join"] + /\ vote_msgs' = vote_msgs \cup {vote} + /\ UNCHANGED << txids, gossip_msgs, open_msgs >> + +ActionVoteQuorum(n) == + \E gossips \in SUBSET gossip_msgs: + \* Non-Unanimous gossips can cause deadlocks + /\ {g.src : g \in gossips} = NID + /\ Vote(n, gossips \cup {[src |-> n, txid |-> txids[n]]}, "quorum") + +ActionVoteTimeout(n) == + \E gossips \in SUBSET gossip_msgs: + /\ Cardinality({g.src : g \in gossips}) >= MAJ_QUORUM_LIMIT + /\ Vote(n, gossips \cup {[src |-> n, txid |-> txids[n]]}, "timeout") + +ActionOpen(n) == + \E Vs \in SUBSET {v \in vote_msgs: v.vote = n}: + /\ Cardinality(Vs) >= MAJ_QUORUM_LIMIT + /\ next_step[n] = "open/join" + /\ next_step' = [next_step EXCEPT ![n] = "open"] + /\ open_msgs' = open_msgs \cup {[src |-> n]} + /\ UNCHANGED << txids, gossip_msgs, vote_msgs >> + +ActionJoin(n) == + \E o \in open_msgs: + /\ next_step[n] = "open/join" + /\ next_step' = [next_step EXCEPT ![n] = "join"] + /\ UNCHANGED << txids, gossip_msgs, vote_msgs, open_msgs >> + + +Next == + \E n \in NID: + \/ ActionSendGossip(n) + \/ ActionVoteQuorum(n) + \/ ActionVoteTimeout(n) + \/ ActionOpen(n) + \/ ActionJoin(n) + +Spec == Init /\ [][Next]_vars + +InvNoTimeoutNoFork == + (\A m \in vote_msgs: m.kind = "quorum") + => + (Cardinality({n \in NID: next_step[n] = "open"}) <= 1) + +InvCorrectState == ~\A n \in NID: next_step[n] \in {"open", "join"} + +\* We optimally should be unable to reach a deadlock state +\* where every node is blocked but it may be impossible with timeouts 
+InvNoDeadlockStates == + (\A n \in NID: next_step[n] = "open/join") + => + ( + \E n \in NID: + \/ ENABLED ActionOpen(n) + \/ ENABLED ActionJoin(n) + ) + +InvNoTimeoutNoDeadlock == + (\A m \in vote_msgs: m.kind = "quorum") + => InvNoDeadlockStates + +==== \ No newline at end of file From 859d30d8a07559ec0fe421ded4a237e01650bc9e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 9 May 2025 18:55:20 +0100 Subject: [PATCH 002/197] Update spec to refine safety property --- tla/disaster-recovery/autoopen.cfg | 10 ++-- tla/disaster-recovery/autoopen.tla | 76 +++++++++++++++++++----------- 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/tla/disaster-recovery/autoopen.cfg b/tla/disaster-recovery/autoopen.cfg index 93ff374f0eff..3d16b6ceaeb9 100644 --- a/tla/disaster-recovery/autoopen.cfg +++ b/tla/disaster-recovery/autoopen.cfg @@ -3,8 +3,12 @@ SPECIFICATION INVARIANTS TypeOk - InvNoTimeoutNoFork - InvNoTimeoutNoDeadlock + InvNoFork + InvUnanimousLiveVotesNoDeadlock + InvNonUnanimousOpen CONSTANTS - NID = {0, 1, 2, 3, 4} \ No newline at end of file + NID = {r1, r2, r3, r4, r4} + +SYMMETRY + Symmetry \ No newline at end of file diff --git a/tla/disaster-recovery/autoopen.tla b/tla/disaster-recovery/autoopen.tla index fcd4a6a2be44..e0d709af4087 100644 --- a/tla/disaster-recovery/autoopen.tla +++ b/tla/disaster-recovery/autoopen.tla @@ -1,20 +1,21 @@ ---- MODULE autoopen ---- -EXTENDS Integers, Sequences, FiniteSets +EXTENDS Integers, Sequences, FiniteSets, TLC CONSTANTS NID -MAJ_QUORUM_LIMIT == (Cardinality(NID) + 1) \div 2 +MAJ_QUORUM_LIMIT == (Cardinality(NID)) \div 2 + 1 VARIABLES next_step, txids, gossip_msgs, + recv_gossips, vote_msgs, open_msgs -vars == <> +vars == <> TypeOk == /\ next_step \in [NID -> {"gossip", "vote", "open/join", "open", "join"}] @@ -23,49 +24,62 @@ TypeOk == src : NID, txid : Nat ] + /\ recv_gossips \in [NID -> SUBSET gossip_msgs] /\ vote_msgs \subseteq [ src : NID, vote : NID, - kind : {"quorum", "timeout"} + recv : SUBSET NID ] 
/\ open_msgs \subseteq [ src : NID ] +TXID == + CHOOSE F \in [NID -> 1..Cardinality(NID)]: + \A k1, k2 \in DOMAIN F: F[k1] = F[k2] => k1 = k2 + Init == /\ next_step = [n \in NID |-> "gossip"] - /\ txids = [n \in NID |-> n] + /\ txids = [n \in NID|-> TXID[n]] /\ gossip_msgs = {} + /\ recv_gossips = [n \in NID |-> {}] /\ vote_msgs = {} /\ open_msgs = {} ActionSendGossip(n) == + LET msg == [src |-> n, txid |-> txids[n]] IN /\ next_step[n] = "gossip" /\ next_step' = [next_step EXCEPT ![n] = "vote"] - /\ gossip_msgs' = gossip_msgs \cup {[src |-> n, txid |-> txids[n]]} + /\ recv_gossips' = [recv_gossips EXCEPT ![n] = recv_gossips[n] \cup {msg}] + /\ gossip_msgs' = gossip_msgs \cup {msg} /\ UNCHANGED << txids, vote_msgs, open_msgs >> -Vote(n, gossips, kind) == - LET max_txid_gossip == - CHOOSE g \in gossips: - \A g1 \in gossips: g.txid >= g1.txid - vote == [src |-> n, vote |-> max_txid_gossip.src, kind |-> kind] +ActionRecvGossip(n) == + \E m \in gossip_msgs: + /\ m \notin recv_gossips[n] + /\ recv_gossips' = [recv_gossips EXCEPT ![n] = recv_gossips[n] \cup {m}] + /\ UNCHANGED << next_step, txids, gossip_msgs, vote_msgs, open_msgs >> + +Vote(n) == + LET recv_nodes == {g.src : g \in recv_gossips[n]} + max_txid_gossip == + CHOOSE g \in recv_gossips[n]: + \A g1 \in recv_gossips[n]: g.txid >= g1.txid + vote == [src |-> n, vote |-> max_txid_gossip.src, recv |-> recv_nodes] IN /\ next_step[n] = "vote" /\ next_step' = [next_step EXCEPT ![n] = "open/join"] /\ vote_msgs' = vote_msgs \cup {vote} - /\ UNCHANGED << txids, gossip_msgs, open_msgs >> + /\ UNCHANGED << txids, gossip_msgs, recv_gossips, open_msgs >> ActionVoteQuorum(n) == - \E gossips \in SUBSET gossip_msgs: \* Non-Unanimous gossips can cause deadlocks - /\ {g.src : g \in gossips} = NID - /\ Vote(n, gossips \cup {[src |-> n, txid |-> txids[n]]}, "quorum") + /\ {g.src : g \in recv_gossips[n]} = NID + /\ Vote(n) ActionVoteTimeout(n) == - \E gossips \in SUBSET gossip_msgs: - /\ Cardinality({g.src : g \in gossips}) >= 
MAJ_QUORUM_LIMIT - /\ Vote(n, gossips \cup {[src |-> n, txid |-> txids[n]]}, "timeout") + /\ Cardinality({g.src : g \in recv_gossips[n]}) >= MAJ_QUORUM_LIMIT + /\ Vote(n) ActionOpen(n) == \E Vs \in SUBSET {v \in vote_msgs: v.vote = n}: @@ -73,18 +87,19 @@ ActionOpen(n) == /\ next_step[n] = "open/join" /\ next_step' = [next_step EXCEPT ![n] = "open"] /\ open_msgs' = open_msgs \cup {[src |-> n]} - /\ UNCHANGED << txids, gossip_msgs, vote_msgs >> + /\ UNCHANGED << txids, gossip_msgs, recv_gossips, vote_msgs >> ActionJoin(n) == \E o \in open_msgs: /\ next_step[n] = "open/join" /\ next_step' = [next_step EXCEPT ![n] = "join"] - /\ UNCHANGED << txids, gossip_msgs, vote_msgs, open_msgs >> + /\ UNCHANGED << txids, gossip_msgs, recv_gossips, vote_msgs, open_msgs >> Next == \E n \in NID: \/ ActionSendGossip(n) + \/ ActionRecvGossip(n) \/ ActionVoteQuorum(n) \/ ActionVoteTimeout(n) \/ ActionOpen(n) @@ -92,15 +107,13 @@ Next == Spec == Init /\ [][Next]_vars -InvNoTimeoutNoFork == - (\A m \in vote_msgs: m.kind = "quorum") - => +InvNoFork == (Cardinality({n \in NID: next_step[n] = "open"}) <= 1) InvCorrectState == ~\A n \in NID: next_step[n] \in {"open", "join"} \* We optimally should be unable to reach a deadlock state -\* where every node is blocked but it may be impossible with timeouts +\* where every node is blocked but it may be impossible due to timeouts InvNoDeadlockStates == (\A n \in NID: next_step[n] = "open/join") => @@ -110,8 +123,17 @@ InvNoDeadlockStates == \/ ENABLED ActionJoin(n) ) -InvNoTimeoutNoDeadlock == - (\A m \in vote_msgs: m.kind = "quorum") - => InvNoDeadlockStates +InvUnanimousLiveVotesNoDeadlock == + LET live_nid == {n \in NID: next_step[n] /= "gossip"} IN + (\A m \in vote_msgs: m.recv = live_nid) => InvNoDeadlockStates + +InvNonUnanimousOpen == +LET live_nid == {n \in NID: next_step[n] /= "gossip"} IN + ~ /\ \E n \in NID: next_step[n] = "gossip" + /\ \E n \in NID: next_step[n] = "open" + /\ \A m \in vote_msgs: m.recv = live_nid + /\ \A n \in NID: 
next_step[n] \in {"gossip", "open", "join"} + +Symmetry == Permutations(NID) ==== \ No newline at end of file From 9eb305dd48ad8af94b76663f1db26051e5067760 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 22 May 2025 11:35:26 +0100 Subject: [PATCH 003/197] Add basic fizzbee spec --- tla/disaster-recovery/autoopen.fizz | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tla/disaster-recovery/autoopen.fizz diff --git a/tla/disaster-recovery/autoopen.fizz b/tla/disaster-recovery/autoopen.fizz new file mode 100644 index 000000000000..49705045dda8 --- /dev/null +++ b/tla/disaster-recovery/autoopen.fizz @@ -0,0 +1,31 @@ +---- +options: + maxActions: 10 + +deadlock_detection: false +---- +NUM_NODES = 2 + +NextSteps = enum("GOSSIP", "VOTE", "OPENJOIN", "OPEN", "JOIN") + +role Node: + atomic action Init: + self.next_step = NextSteps.GOSSIP + self.recv_gossips = {} + + action Gossip: + if self.next_step == NextSteps.GOSSIP: + self.next_step = NextSteps.VOTE + self.gossip(self.__id__, self.txid) + for n in nodes: + if n.__id__ != self.__id__: + n.gossip(self.__id__, self.txid) + + func gossip(src_id, txid): + self.recv_gossips[src_id] = txid + +atomic action Init: + nodes = [] + for i in range(0, NUM_NODES): + node = Node(txid=i) + nodes.append(node) \ No newline at end of file From 0bf26f97252c3c3004449abf798ca99c0efa7353 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 28 May 2025 12:09:16 +0100 Subject: [PATCH 004/197] Add stateright model --- tla/disaster-recovery/stateright/.gitignore | 1 + tla/disaster-recovery/stateright/Cargo.lock | 609 ++++++++++++++++++ tla/disaster-recovery/stateright/Cargo.toml | 7 + tla/disaster-recovery/stateright/src/main.rs | 123 ++++ tla/disaster-recovery/stateright/src/model.rs | 243 +++++++ 5 files changed, 983 insertions(+) create mode 100644 tla/disaster-recovery/stateright/.gitignore create mode 100644 tla/disaster-recovery/stateright/Cargo.lock create mode 100644 
tla/disaster-recovery/stateright/Cargo.toml create mode 100644 tla/disaster-recovery/stateright/src/main.rs create mode 100644 tla/disaster-recovery/stateright/src/model.rs diff --git a/tla/disaster-recovery/stateright/.gitignore b/tla/disaster-recovery/stateright/.gitignore new file mode 100644 index 000000000000..eb5a316cbd19 --- /dev/null +++ b/tla/disaster-recovery/stateright/.gitignore @@ -0,0 +1 @@ +target diff --git a/tla/disaster-recovery/stateright/Cargo.lock b/tla/disaster-recovery/stateright/Cargo.lock new file mode 100644 index 000000000000..d5641abbdc0c --- /dev/null +++ b/tla/disaster-recovery/stateright/Cargo.lock @@ -0,0 +1,609 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.3", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys", +] + +[[package]] +name = "ascii" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "ccf-autodr" +version = "0.0.0" +dependencies = [ + "clap", + "stateright", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "choice" +version = "0.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3b71fc821deaf602a933ada5c845d088156d0cdf2ebf43ede390afe93466553" + +[[package]] +name = "chunked_transfer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901" + +[[package]] +name = "clap" +version = "4.5.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.38" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi 0.14.2+wasi-0.2.4", +] + +[[package]] +name = "hashbrown" +version = "0.14.5" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "id-set" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9633fadf6346456cf8531119ba4838bc6d82ac4ce84d9852126dd2aa34d49264" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = 
"nohash-hasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.16", +] + +[[package]] +name = "redox_syscall" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" +dependencies = [ + "bitflags", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = 
"1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "smallvec" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" + +[[package]] +name = "stateright" +version = "0.30.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebd37c74ff38ca9e5d370efb7af3c49ecab91cb5644affa23dc54a061d0f3a59" +dependencies = [ + "ahash", + "choice", + "crossbeam-utils", + "dashmap", + "id-set", + "log", + "nohash-hasher", + "parking_lot", + "rand", + "serde", + "serde_json", + "tiny_http", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tiny_http" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82" +dependencies = [ + "ascii", + "chunked_transfer", + "httpdate", + "log", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = 
"0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] + +[[package]] +name = "zerocopy" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/tla/disaster-recovery/stateright/Cargo.toml b/tla/disaster-recovery/stateright/Cargo.toml new file mode 100644 index 000000000000..b2c2414ae798 --- /dev/null +++ b/tla/disaster-recovery/stateright/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "ccf-autodr" +version = 
"0.0.0" + +[dependencies] +clap = { version = "4.5.38", features = ["derive"] } +stateright = "0.30.2" diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/stateright/src/main.rs new file mode 100644 index 000000000000..3a3ebe147634 --- /dev/null +++ b/tla/disaster-recovery/stateright/src/main.rs @@ -0,0 +1,123 @@ +extern crate clap; +extern crate stateright; +use clap::Parser; +mod model; +use model::{ModelCfg, Msg, NextStep, Node, State}; +use stateright::{actor::*, report::WriteReporter, util::HashableHashSet, Checker, Model}; +use std::sync::Arc; + +#[derive(Parser, Debug)] +#[command(version, about = "CCF auto-open model", long_about = None)] +struct CliArgs { + #[clap(short, long, default_value = "3")] + n_nodes: usize, + + #[command(subcommand)] + command: Commands, +} + +#[derive(Parser, Debug)] +enum Commands { + /// Check the model + Check, + /// Serve the model on localhost:8080 + Serve, +} + +fn check(model: ActorModel) { + let checker = model + .checker() + .spawn_bfs() + .join_and_report(&mut WriteReporter::new(&mut std::io::stderr())); + checker.assert_properties(); +} + +fn serve(model: ActorModel) { + let checker = model.checker(); + println!("Serving model on http://localhost:8080"); + checker.serve("localhost:8080"); +} + +fn reached_open(_model: &ActorModel, state: &ActorModelState) -> bool { + // Check if the open state is reached + state + .actor_states + .iter() + .any(|actor_state: &Arc| actor_state.next_step == NextStep::Open) +} + +fn non_unanimous_gossip( + model: &ActorModel, + state: &ActorModelState, +) -> bool { + // Check if there is any non-unanimous gossip + let peers: HashableHashSet = (0..model.cfg.n_nodes) + .map(|i| Id::from(i as usize)) + .collect(); + state.actor_states.iter().any(|actor_state: &Arc| { + actor_state.submitted_vote.is_some() + && actor_state.submitted_vote.clone().unwrap().1.recv != peers + }) +} + +fn main() { + let args = CliArgs::parse(); + + let model = ModelCfg { + n_nodes: 
args.n_nodes, + } + .into_model(); + + let model = model.property( + stateright::Expectation::Eventually, + "Unanimous votes => no deadlock", + |model: &ActorModel, state: &ActorModelState| { + // state where any vote is made using non-unanimous gossips + // OR + // open reached + reached_open(model, state) || non_unanimous_gossip(model, state) + }, + ); + + let model = model.property( + stateright::Expectation::Always, + "No fork", + |_model: &ActorModel, state: &ActorModelState| { + // Check if there is no fork in the state + state + .actor_states + .iter() + .filter(|actor_state: &&Arc| actor_state.next_step == NextStep::Open) + .count() + <= 1 + }, + ); + + let model = model + .property( + stateright::Expectation::Sometimes, + "Open state reached", + |model: &ActorModel, state: &ActorModelState| { + reached_open(model, state) && !non_unanimous_gossip(model, state) + }, + ) + .property( + stateright::Expectation::Sometimes, + "Deadlock reached", + |_model: &ActorModel, state: &ActorModelState| { + let all_open_join = state + .actor_states + .iter() + .all(|actor_state: &Arc| actor_state.next_step == NextStep::OpenJoin); + let all_votes_delivered = state.network.iter_all().filter(|msg| { + matches!(msg, Envelope { src:_, dst:_, msg: Msg::Vote(_)}) + }).count() == 0; + all_open_join && all_votes_delivered + }, + ); + + match args.command { + Commands::Check => check(model), + Commands::Serve => serve(model), + } +} diff --git a/tla/disaster-recovery/stateright/src/model.rs b/tla/disaster-recovery/stateright/src/model.rs new file mode 100644 index 000000000000..0ed22e02af62 --- /dev/null +++ b/tla/disaster-recovery/stateright/src/model.rs @@ -0,0 +1,243 @@ +extern crate stateright; +use stateright::{actor::*, util::HashableHashSet, Rewrite, RewritePlan}; +use std::borrow::Cow; + +type Txid = u64; + +#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct Gossip { + pub src: Id, + pub txid: Txid, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq, 
PartialOrd, Ord)] +pub struct Vote { + pub src: Id, + pub recv: HashableHashSet, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub enum Msg { + Gossip(Gossip), + Vote(Vote), + Open, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub enum Timer { + ElectionTimeout, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub enum NextStep { + // Gossip, + Vote, + OpenJoin, + Open, + Join, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub struct State { + pub next_step: NextStep, + pub gossips: HashableHashSet, + pub votes: HashableHashSet, + pub submitted_vote: Option<(Id, Vote)>, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct Node { + pub txid: Txid, + pub peers: HashableHashSet, +} + +impl Node { + fn vote_for_max<'a, I>(gossips: &I, id: Id) -> (Id, Vote) + where + I: Iterator + Clone, + { + let dst = gossips + .clone() + .max_by(|a, b| a.txid.cmp(&b.txid)) + .unwrap() + .src; + let vote = Vote { + src: id, + recv: gossips.clone().map(|g| g.src.clone()).collect(), + }; + return (dst, vote); + } + + fn other_peers(&self, id: Id) -> Vec { + self.peers.iter().filter(|&&p| p != id).cloned().collect() + } + + fn advance_step(&self, state: &mut State, o: &mut Out, id: Id, timeout: bool) { + match state.next_step { + NextStep::Vote if state.gossips.len() == self.peers.len() || timeout => { + let (dst, vote) = Node::vote_for_max(&state.gossips.iter(), id); + state.submitted_vote = Some((dst, vote.clone())); + o.send(dst, Msg::Vote(vote)); + state.next_step = NextStep::OpenJoin; + } + NextStep::OpenJoin if state.votes.len() >= (self.peers.len() + 1) / 2 => { + state.next_step = NextStep::Open; + o.broadcast(&self.other_peers(id), &Msg::Open); + } + _ => {} + } + } +} + +impl Actor for Node { + type Msg = Msg; + type State = State; + type Timer = Timer; + + fn on_start(&self, id: Id, o: &mut Out) -> Self::State { + let mut gossips = HashableHashSet::new(); + gossips.insert(Gossip { + src: id, + txid: self.txid, + 
}); + let state = State { + next_step: NextStep::Vote, + gossips: gossips, + votes: HashableHashSet::new(), + submitted_vote: None, + }; + o.broadcast( + &self.other_peers(id), + &Msg::Gossip(Gossip { + src: id, + txid: self.txid, + }), + ); + o.set_timer(Timer::ElectionTimeout, model_timeout()); + return state; + } + + fn on_timeout(&self, id: Id, state: &mut Cow, timer: &Timer, o: &mut Out) { + match timer { + Timer::ElectionTimeout => { + if state.next_step == NextStep::Vote && !state.gossips.is_empty() { + let state = state.to_mut(); + self.advance_step(state, o, id, true); + } else { + o.set_timer(Timer::ElectionTimeout, model_timeout()); + } + } + } + } + + fn on_msg( + &self, + id: Id, + state: &mut Cow, + _src: Id, + msg: Self::Msg, + o: &mut Out, + ) { + let state = state.to_mut(); + match msg { + Msg::Gossip(gossip) => { + // Freeze gossip collection after voting is submitted + if !state.gossips.contains(&gossip) && state.submitted_vote.is_none() { + state.gossips.insert(gossip.clone()); + } + } + Msg::Vote(vote) => { + if !state.votes.contains(&vote) { + state.votes.insert(vote); + } + } + Msg::Open => { + state.next_step = NextStep::Join; + } + }; + self.advance_step(state, o, id, false); + } +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +pub struct ModelCfg { + pub n_nodes: usize, +} + +impl ModelCfg { + pub fn into_model(self) -> ActorModel { + let peers: HashableHashSet = (0..self.n_nodes).map(|i| Id::from(i as usize)).collect(); + ActorModel::new(self.clone(), ()) + .actors( + (0..self.n_nodes) + .map(|i| Node { + peers: peers.clone(), + txid: i as u64, + }) + .collect::>(), + ) + //.init_network(Network::new_ordered([])) + .init_network(Network::new_unordered_nonduplicating([])) + .lossy_network(LossyNetwork::No) + } +} + +impl Rewrite for Gossip { + fn rewrite(&self, plan: &RewritePlan) -> Self { + Gossip { + src: self.src.rewrite(plan), + txid: self.txid, + } + } +} + +impl Rewrite for Vote { + fn rewrite(&self, plan: &RewritePlan) -> Self { 
+ Vote { + src: self.src.rewrite(plan), + recv: self.recv.iter().map(|r| r.rewrite(plan)).collect(), + } + } +} + +impl Rewrite for Msg { + fn rewrite(&self, plan: &RewritePlan) -> Self { + match self { + Msg::Gossip(gossip) => Msg::Gossip(gossip.rewrite(plan)), + Msg::Vote(vote) => Msg::Vote(vote.rewrite(plan)), + Msg::Open => Msg::Open, + } + } +} + +impl Rewrite for Node { + fn rewrite(&self, plan: &RewritePlan) -> Self { + Node { + txid: self.txid, + peers: self.peers.iter().map(|p| p.rewrite(plan)).collect(), + } + } +} + +impl Rewrite for State { + fn rewrite(&self, plan: &RewritePlan) -> Self { + State { + next_step: self.next_step.clone(), + gossips: self.gossips.iter().map(|g| g.rewrite(plan)).collect(), + votes: self.votes.iter().map(|v| v.rewrite(plan)).collect(), + submitted_vote: None, + } + } +} + +impl Rewrite for Timer { + fn rewrite(&self, _plan: &RewritePlan) -> Self { + self.clone() + } +} + +impl Rewrite for ModelCfg { + fn rewrite(&self, _plan: &RewritePlan) -> Self { + self.clone() + } +} From 2b8a1d60eab780f054a238745816c9a007d8cc70 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 28 May 2025 12:20:12 +0100 Subject: [PATCH 005/197] Update stateright dr spec --- tla/disaster-recovery/stateright/Readme.md | 13 +++++++++++++ tla/disaster-recovery/stateright/src/main.rs | 15 ++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 tla/disaster-recovery/stateright/Readme.md diff --git a/tla/disaster-recovery/stateright/Readme.md b/tla/disaster-recovery/stateright/Readme.md new file mode 100644 index 000000000000..a23ea32d445c --- /dev/null +++ b/tla/disaster-recovery/stateright/Readme.md @@ -0,0 +1,13 @@ +# Auto-open specification in (stateright)[] + +The properties are specified in (main.rs)[./src/main.rs], while the model is specified in (model.rs)[./src/model.rs]. + +Due to stateright being executable, there is little syntactic sugar, and so there is quite a bit of boilerplate. 
+The functional parts of the specification are in `advance_step`, `on_start`, `on_timeout` and `on_msg`. + +The specification can be checked from the command line via `cargo run check`. + +However, a more useful UX is via the web-view which is hosted locally via `cargo run serve`. +This allows you to explore the specification actions interactively, and the checker can be exhaustively run using the `Run to completion` button, which should find several useful examples of states where the network is opened, and where a deadlock is reached. + + diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/stateright/src/main.rs index 3a3ebe147634..146410fa51d2 100644 --- a/tla/disaster-recovery/stateright/src/main.rs +++ b/tla/disaster-recovery/stateright/src/main.rs @@ -96,22 +96,27 @@ fn main() { let model = model .property( stateright::Expectation::Sometimes, - "Open state reached", + "Open", |model: &ActorModel, state: &ActorModelState| { reached_open(model, state) && !non_unanimous_gossip(model, state) }, ) .property( stateright::Expectation::Sometimes, - "Deadlock reached", + "Deadlock", |_model: &ActorModel, state: &ActorModelState| { let all_open_join = state .actor_states .iter() .all(|actor_state: &Arc| actor_state.next_step == NextStep::OpenJoin); - let all_votes_delivered = state.network.iter_all().filter(|msg| { - matches!(msg, Envelope { src:_, dst:_, msg: Msg::Vote(_)}) - }).count() == 0; + let all_votes_delivered = state + .network + .iter_all() + .filter(|msg| { + matches!(msg.msg, Msg::Vote(_)) + }) + .count() + == 0; all_open_join && all_votes_delivered }, ); From bad8a138a7d05c85c961ebd1c5cf28868ab47808 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 28 May 2025 12:21:21 +0100 Subject: [PATCH 006/197] Update Readme.md --- tla/disaster-recovery/stateright/Readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tla/disaster-recovery/stateright/Readme.md b/tla/disaster-recovery/stateright/Readme.md index 
a23ea32d445c..488da6c2e782 100644 --- a/tla/disaster-recovery/stateright/Readme.md +++ b/tla/disaster-recovery/stateright/Readme.md @@ -1,4 +1,4 @@ -# Auto-open specification in (stateright)[] +# Auto-open specification in (stateright)[https://github.com/stateright/stateright] The properties are specified in (main.rs)[./src/main.rs], while the model is specified in (model.rs)[./src/model.rs]. From 1965453c6b94c05bada8aa189854e8ba381b0f6a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 28 May 2025 12:22:11 +0100 Subject: [PATCH 007/197] Update Readme.md --- tla/disaster-recovery/stateright/Readme.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tla/disaster-recovery/stateright/Readme.md b/tla/disaster-recovery/stateright/Readme.md index 488da6c2e782..30e19f7addfc 100644 --- a/tla/disaster-recovery/stateright/Readme.md +++ b/tla/disaster-recovery/stateright/Readme.md @@ -1,6 +1,6 @@ -# Auto-open specification in (stateright)[https://github.com/stateright/stateright] +# Auto-open specification in [stateright](https://github.com/stateright/stateright) -The properties are specified in (main.rs)[./src/main.rs], while the model is specified in (model.rs)[./src/model.rs]. +The properties are specified in [main.rs](./src/main.rs), while the model is specified in [model.rs](./src/model.rs). Due to stateright being executable, there is little syntactic sugar, and so there is quite a bit of boilerplate. The functional parts of the specification are in `advance_step`, `on_start`, `on_timeout` and `on_msg`. 
From 5f066e5150b39214aec8a905e173e31244a84a6b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 3 Jun 2025 16:55:09 +0100 Subject: [PATCH 008/197] broken version --- tla/disaster-recovery/stateright/Cargo.lock | 20 +-- tla/disaster-recovery/stateright/Cargo.toml | 2 +- tla/disaster-recovery/stateright/src/main.rs | 9 +- tla/disaster-recovery/stateright/src/model.rs | 152 +++++++++--------- 4 files changed, 94 insertions(+), 89 deletions(-) diff --git a/tla/disaster-recovery/stateright/Cargo.lock b/tla/disaster-recovery/stateright/Cargo.lock index d5641abbdc0c..762bfea34d07 100644 --- a/tla/disaster-recovery/stateright/Cargo.lock +++ b/tla/disaster-recovery/stateright/Cargo.lock @@ -163,11 +163,12 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "dashmap" -version = "5.5.3" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", + "crossbeam-utils", "hashbrown", "lock_api", "once_cell", @@ -241,9 +242,9 @@ checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "lock_api" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" dependencies = [ "autocfg", "scopeguard", @@ -281,9 +282,9 @@ checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" [[package]] name = "parking_lot" -version = "0.12.3" +version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" 
dependencies = [ "lock_api", "parking_lot_core", @@ -291,9 +292,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.10" +version = "0.9.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" dependencies = [ "cfg-if", "libc", @@ -427,8 +428,7 @@ checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] name = "stateright" version = "0.30.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd37c74ff38ca9e5d370efb7af3c49ecab91cb5644affa23dc54a061d0f3a59" +source = "git+https://github.com/stateright/stateright?rev=4c385128baa589c6760e675ca32219fb722aedd3#4c385128baa589c6760e675ca32219fb722aedd3" dependencies = [ "ahash", "choice", diff --git a/tla/disaster-recovery/stateright/Cargo.toml b/tla/disaster-recovery/stateright/Cargo.toml index b2c2414ae798..30cd5a2b60d6 100644 --- a/tla/disaster-recovery/stateright/Cargo.toml +++ b/tla/disaster-recovery/stateright/Cargo.toml @@ -4,4 +4,4 @@ version = "0.0.0" [dependencies] clap = { version = "4.5.38", features = ["derive"] } -stateright = "0.30.2" +stateright = { git = "https://github.com/stateright/stateright", rev = "4c385128baa589c6760e675ca32219fb722aedd3", version = "0.30.2" } diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/stateright/src/main.rs index 146410fa51d2..1048802ad883 100644 --- a/tla/disaster-recovery/stateright/src/main.rs +++ b/tla/disaster-recovery/stateright/src/main.rs @@ -38,6 +38,7 @@ fn serve(model: ActorModel) { checker.serve("localhost:8080"); } + fn reached_open(_model: &ActorModel, state: &ActorModelState) -> bool { // Check if the open state is reached state @@ -75,7 +76,7 @@ fn main() { // state where any vote is made using non-unanimous gossips // OR // open reached - reached_open(model, state) || 
non_unanimous_gossip(model, state) + reached_open(model, state) }, ); @@ -112,9 +113,7 @@ fn main() { let all_votes_delivered = state .network .iter_all() - .filter(|msg| { - matches!(msg.msg, Msg::Vote(_)) - }) + .filter(|msg| matches!(msg.msg, Msg::Vote(_))) .count() == 0; all_open_join && all_votes_delivered @@ -125,4 +124,4 @@ fn main() { Commands::Check => check(model), Commands::Serve => serve(model), } -} +} \ No newline at end of file diff --git a/tla/disaster-recovery/stateright/src/model.rs b/tla/disaster-recovery/stateright/src/model.rs index 0ed22e02af62..a6577150d466 100644 --- a/tla/disaster-recovery/stateright/src/model.rs +++ b/tla/disaster-recovery/stateright/src/model.rs @@ -5,21 +5,25 @@ use std::borrow::Cow; type Txid = u64; #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] -pub struct Gossip { +pub struct GossipStruct { pub src: Id, pub txid: Txid, } #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] -pub struct Vote { +pub struct VoteStruct { pub src: Id, pub recv: HashableHashSet, } +fn toHashSet(ids: Vec) -> HashableHashSet { + ids.into_iter().collect() +} + #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum Msg { - Gossip(Gossip), - Vote(Vote), + Gossip(GossipStruct), + Vote(VoteStruct), Open, } @@ -40,9 +44,9 @@ pub enum NextStep { #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct State { pub next_step: NextStep, - pub gossips: HashableHashSet, - pub votes: HashableHashSet, - pub submitted_vote: Option<(Id, Vote)>, + pub gossips: HashableHashSet, + pub votes: HashableHashSet, + pub submitted_vote: Option<(Id, VoteStruct)>, } #[derive(Debug, Clone, Hash, PartialEq, Eq)] @@ -52,16 +56,16 @@ pub struct Node { } impl Node { - fn vote_for_max<'a, I>(gossips: &I, id: Id) -> (Id, Vote) + fn vote_for_max<'a, I>(gossips: &I, id: Id) -> (Id, VoteStruct) where - I: Iterator + Clone, + I: Iterator + Clone, { let dst = gossips .clone() .max_by(|a, b| a.txid.cmp(&b.txid)) .unwrap() .src; - let vote = Vote 
{ + let vote = VoteStruct { src: id, recv: gossips.clone().map(|g| g.src.clone()).collect(), }; @@ -93,10 +97,12 @@ impl Actor for Node { type Msg = Msg; type State = State; type Timer = Timer; + type Storage = (); + type Random = (); - fn on_start(&self, id: Id, o: &mut Out) -> Self::State { + fn on_start(&self, id: Id, _storage: &Option, o: &mut Out) -> Self::State { let mut gossips = HashableHashSet::new(); - gossips.insert(Gossip { + gossips.insert(GossipStruct { src: id, txid: self.txid, }); @@ -108,7 +114,7 @@ impl Actor for Node { }; o.broadcast( &self.other_peers(id), - &Msg::Gossip(Gossip { + &Msg::Gossip(GossipStruct { src: id, txid: self.txid, }), @@ -124,7 +130,7 @@ impl Actor for Node { let state = state.to_mut(); self.advance_step(state, o, id, true); } else { - o.set_timer(Timer::ElectionTimeout, model_timeout()); + o.set_timer(Timer::ElectionTimeout, model_timeout()); } } } @@ -182,62 +188,62 @@ impl ModelCfg { } } -impl Rewrite for Gossip { - fn rewrite(&self, plan: &RewritePlan) -> Self { - Gossip { - src: self.src.rewrite(plan), - txid: self.txid, - } - } -} - -impl Rewrite for Vote { - fn rewrite(&self, plan: &RewritePlan) -> Self { - Vote { - src: self.src.rewrite(plan), - recv: self.recv.iter().map(|r| r.rewrite(plan)).collect(), - } - } -} - -impl Rewrite for Msg { - fn rewrite(&self, plan: &RewritePlan) -> Self { - match self { - Msg::Gossip(gossip) => Msg::Gossip(gossip.rewrite(plan)), - Msg::Vote(vote) => Msg::Vote(vote.rewrite(plan)), - Msg::Open => Msg::Open, - } - } -} - -impl Rewrite for Node { - fn rewrite(&self, plan: &RewritePlan) -> Self { - Node { - txid: self.txid, - peers: self.peers.iter().map(|p| p.rewrite(plan)).collect(), - } - } -} - -impl Rewrite for State { - fn rewrite(&self, plan: &RewritePlan) -> Self { - State { - next_step: self.next_step.clone(), - gossips: self.gossips.iter().map(|g| g.rewrite(plan)).collect(), - votes: self.votes.iter().map(|v| v.rewrite(plan)).collect(), - submitted_vote: None, - } - } -} - -impl 
Rewrite for Timer { - fn rewrite(&self, _plan: &RewritePlan) -> Self { - self.clone() - } -} - -impl Rewrite for ModelCfg { - fn rewrite(&self, _plan: &RewritePlan) -> Self { - self.clone() - } -} +//impl Rewrite for Gossip { +// fn rewrite(&self, plan: &RewritePlan) -> Self { +// Gossip { +// src: self.src.rewrite(plan), +// txid: self.txid, +// } +// } +//} +// +//impl Rewrite for Vote { +// fn rewrite(&self, plan: &RewritePlan) -> Self { +// Vote { +// src: self.src.rewrite(plan), +// recv: self.recv.iter().map(|r| r.rewrite(plan)).collect(), +// } +// } +//} +// +//impl Rewrite for Msg { +// fn rewrite(&self, plan: &RewritePlan) -> Self { +// match self { +// Msg::Gossip(gossip) => Msg::Gossip(gossip.rewrite(plan)), +// Msg::Vote(vote) => Msg::Vote(vote.rewrite(plan)), +// Msg::Open => Msg::Open, +// } +// } +//} +// +//impl Rewrite for Node { +// fn rewrite(&self, plan: &RewritePlan) -> Self { +// Node { +// txid: self.txid, +// peers: self.peers.iter().map(|p| p.rewrite(plan)).collect(), +// } +// } +//} +// +//impl Rewrite for State { +// fn rewrite(&self, plan: &RewritePlan) -> Self { +// State { +// next_step: self.next_step.clone(), +// gossips: self.gossips.iter().map(|g| g.rewrite(plan)).collect(), +// votes: self.votes.iter().map(|v| v.rewrite(plan)).collect(), +// submitted_vote: None, +// } +// } +//} +// +//impl Rewrite for Timer { +// fn rewrite(&self, _plan: &RewritePlan) -> Self { +// self.clone() +// } +//} +// +//impl Rewrite for ModelCfg { +// fn rewrite(&self, _plan: &RewritePlan) -> Self { +// self.clone() +// } +//} From 09388e784da9b2ef36bc6f6b20ac87744fb49e5f Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 4 Jun 2025 11:44:31 +0100 Subject: [PATCH 009/197] refactor --- tla/disaster-recovery/stateright/Cargo.lock | 2 +- tla/disaster-recovery/stateright/Cargo.toml | 3 +- tla/disaster-recovery/stateright/src/main.rs | 139 ++++++++++-------- tla/disaster-recovery/stateright/src/model.rs | 64 -------- 4 files changed, 84 insertions(+), 
124 deletions(-) diff --git a/tla/disaster-recovery/stateright/Cargo.lock b/tla/disaster-recovery/stateright/Cargo.lock index 762bfea34d07..12eb8287670e 100644 --- a/tla/disaster-recovery/stateright/Cargo.lock +++ b/tla/disaster-recovery/stateright/Cargo.lock @@ -428,7 +428,7 @@ checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" [[package]] name = "stateright" version = "0.30.2" -source = "git+https://github.com/stateright/stateright?rev=4c385128baa589c6760e675ca32219fb722aedd3#4c385128baa589c6760e675ca32219fb722aedd3" +source = "git+https://github.com/cjen1-msft/stateright?branch=eventually-fix#c59ac159513b88830f9bab58ce094afd58c7cb11" dependencies = [ "ahash", "choice", diff --git a/tla/disaster-recovery/stateright/Cargo.toml b/tla/disaster-recovery/stateright/Cargo.toml index 30cd5a2b60d6..74cf461168ec 100644 --- a/tla/disaster-recovery/stateright/Cargo.toml +++ b/tla/disaster-recovery/stateright/Cargo.toml @@ -4,4 +4,5 @@ version = "0.0.0" [dependencies] clap = { version = "4.5.38", features = ["derive"] } -stateright = { git = "https://github.com/stateright/stateright", rev = "4c385128baa589c6760e675ca32219fb722aedd3", version = "0.30.2" } +#stateright = { git = "https://github.com/stateright/stateright", rev = "4c385128baa589c6760e675ca32219fb722aedd3", version = "0.30.2" } +stateright = { git = "https://github.com/cjen1-msft/stateright", branch="eventually-fix" } diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/stateright/src/main.rs index 1048802ad883..fc5b632e8e23 100644 --- a/tla/disaster-recovery/stateright/src/main.rs +++ b/tla/disaster-recovery/stateright/src/main.rs @@ -3,103 +3,75 @@ extern crate stateright; use clap::Parser; mod model; use model::{ModelCfg, Msg, NextStep, Node, State}; -use stateright::{actor::*, report::WriteReporter, util::HashableHashSet, Checker, Model}; +use stateright::{ + actor::*, + report::WriteReporter, + util::{HashableHashMap, HashableHashSet}, + Checker, Model, 
+}; use std::sync::Arc; -#[derive(Parser, Debug)] -#[command(version, about = "CCF auto-open model", long_about = None)] -struct CliArgs { - #[clap(short, long, default_value = "3")] - n_nodes: usize, - - #[command(subcommand)] - command: Commands, -} - -#[derive(Parser, Debug)] -enum Commands { - /// Check the model - Check, - /// Serve the model on localhost:8080 - Serve, -} - -fn check(model: ActorModel) { - let checker = model - .checker() - .spawn_bfs() - .join_and_report(&mut WriteReporter::new(&mut std::io::stderr())); - checker.assert_properties(); +fn implies(a: bool, b: bool) -> bool { + !a || b } -fn serve(model: ActorModel) { - let checker = model.checker(); - println!("Serving model on http://localhost:8080"); - checker.serve("localhost:8080"); -} - - -fn reached_open(_model: &ActorModel, state: &ActorModelState) -> bool { - // Check if the open state is reached +fn reached_open(model: &ActorModel, state: &ActorModelState) -> bool { state .actor_states .iter() .any(|actor_state: &Arc| actor_state.next_step == NextStep::Open) } -fn non_unanimous_gossip( - model: &ActorModel, - state: &ActorModelState, -) -> bool { - // Check if there is any non-unanimous gossip +fn unanimous_votes(model: &ActorModel, state: &ActorModelState) -> bool { let peers: HashableHashSet = (0..model.cfg.n_nodes) .map(|i| Id::from(i as usize)) .collect(); - state.actor_states.iter().any(|actor_state: &Arc| { + state.actor_states.iter().all(|actor_state: &Arc| { actor_state.submitted_vote.is_some() - && actor_state.submitted_vote.clone().unwrap().1.recv != peers + && actor_state.submitted_vote.clone().unwrap().1.recv == peers }) } -fn main() { - let args = CliArgs::parse(); - - let model = ModelCfg { - n_nodes: args.n_nodes, - } - .into_model(); - +fn liveness_properties(model: ActorModel) -> ActorModel { let model = model.property( stateright::Expectation::Eventually, "Unanimous votes => no deadlock", |model: &ActorModel, state: &ActorModelState| { - // state where any vote is 
made using non-unanimous gossips - // OR - // open reached - reached_open(model, state) + // Define deadlock as a path which does not reach open + // Hence unanimous votes => reach open + // Hence on every path unanimous votes => <> reached open + // Since votes are not forgotten on a node, we check for a state where unanimous votes => reached open + //return implies(unanimous_votes(model, state), reached_open(model, state)); + return implies(true, reached_open(model, state)); }, ); + return model; +} +fn invariant_properties(model: ActorModel) -> ActorModel { let model = model.property( stateright::Expectation::Always, "No fork", |_model: &ActorModel, state: &ActorModelState| { // Check if there is no fork in the state - state + let open_node_count = state .actor_states .iter() .filter(|actor_state: &&Arc| actor_state.next_step == NextStep::Open) - .count() - <= 1 + .count(); + open_node_count <= 1 }, ); + return model; +} +fn reachable_properties(model: ActorModel) -> ActorModel { let model = model .property( stateright::Expectation::Sometimes, "Open", |model: &ActorModel, state: &ActorModelState| { - reached_open(model, state) && !non_unanimous_gossip(model, state) + reached_open(model, state) }, ) .property( @@ -119,9 +91,60 @@ fn main() { all_open_join && all_votes_delivered }, ); + return model; +} + +fn properties(model: ActorModel) -> ActorModel { + let model = liveness_properties(model); + let model = invariant_properties(model); + let model = reachable_properties(model); + return model; +} + +#[derive(Parser, Debug)] +#[command(version, about = "CCF auto-open model", long_about = None)] +struct CliArgs { + #[clap(short, long, default_value = "3")] + n_nodes: usize, + + #[command(subcommand)] + command: Commands, +} + +#[derive(Parser, Debug)] +enum Commands { + /// Check the model + Check, + /// Serve the model on localhost:8080 + Serve, +} + +fn check(model: ActorModel) { + let checker = model + .checker() + .spawn_dfs() + .join_and_report(&mut 
WriteReporter::new(&mut std::io::stderr())); + checker.assert_properties(); +} + +fn serve(model: ActorModel) { + let checker = model.checker(); + println!("Serving model on http://localhost:8080"); + checker.serve("localhost:8080"); +} + +fn main() { + let args = CliArgs::parse(); + + let model = ModelCfg { + n_nodes: args.n_nodes, + } + .into_model(); + + let model = properties(model); match args.command { Commands::Check => check(model), Commands::Serve => serve(model), } -} \ No newline at end of file +} diff --git a/tla/disaster-recovery/stateright/src/model.rs b/tla/disaster-recovery/stateright/src/model.rs index a6577150d466..b8679c06451e 100644 --- a/tla/disaster-recovery/stateright/src/model.rs +++ b/tla/disaster-recovery/stateright/src/model.rs @@ -16,10 +16,6 @@ pub struct VoteStruct { pub recv: HashableHashSet, } -fn toHashSet(ids: Vec) -> HashableHashSet { - ids.into_iter().collect() -} - #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum Msg { Gossip(GossipStruct), @@ -187,63 +183,3 @@ impl ModelCfg { .lossy_network(LossyNetwork::No) } } - -//impl Rewrite for Gossip { -// fn rewrite(&self, plan: &RewritePlan) -> Self { -// Gossip { -// src: self.src.rewrite(plan), -// txid: self.txid, -// } -// } -//} -// -//impl Rewrite for Vote { -// fn rewrite(&self, plan: &RewritePlan) -> Self { -// Vote { -// src: self.src.rewrite(plan), -// recv: self.recv.iter().map(|r| r.rewrite(plan)).collect(), -// } -// } -//} -// -//impl Rewrite for Msg { -// fn rewrite(&self, plan: &RewritePlan) -> Self { -// match self { -// Msg::Gossip(gossip) => Msg::Gossip(gossip.rewrite(plan)), -// Msg::Vote(vote) => Msg::Vote(vote.rewrite(plan)), -// Msg::Open => Msg::Open, -// } -// } -//} -// -//impl Rewrite for Node { -// fn rewrite(&self, plan: &RewritePlan) -> Self { -// Node { -// txid: self.txid, -// peers: self.peers.iter().map(|p| p.rewrite(plan)).collect(), -// } -// } -//} -// -//impl Rewrite for State { -// fn rewrite(&self, plan: &RewritePlan) -> Self { -// State { 
-// next_step: self.next_step.clone(), -// gossips: self.gossips.iter().map(|g| g.rewrite(plan)).collect(), -// votes: self.votes.iter().map(|v| v.rewrite(plan)).collect(), -// submitted_vote: None, -// } -// } -//} -// -//impl Rewrite for Timer { -// fn rewrite(&self, _plan: &RewritePlan) -> Self { -// self.clone() -// } -//} -// -//impl Rewrite for ModelCfg { -// fn rewrite(&self, _plan: &RewritePlan) -> Self { -// self.clone() -// } -//} From b991b9de17bb976a7a972e6a259eee74f59a8b26 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 16 Jun 2025 10:26:03 +0100 Subject: [PATCH 010/197] Restore correct liveness property. --- tla/disaster-recovery/stateright/src/main.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/stateright/src/main.rs index fc5b632e8e23..6145bb30ede3 100644 --- a/tla/disaster-recovery/stateright/src/main.rs +++ b/tla/disaster-recovery/stateright/src/main.rs @@ -41,8 +41,7 @@ fn liveness_properties(model: ActorModel) -> ActorModel reach open // Hence on every path unanimous votes => <> reached open // Since votes are not forgotten on a node, we check for a state where unanimous votes => reached open - //return implies(unanimous_votes(model, state), reached_open(model, state)); - return implies(true, reached_open(model, state)); + return implies(unanimous_votes(model, state), reached_open(model, state)); }, ); return model; From 4f6de457f11bb164b8beadae6ab65ea4a1696232 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 25 Jun 2025 11:55:30 +0100 Subject: [PATCH 011/197] Add more checked conditions --- tla/disaster-recovery/stateright/Cargo.lock | 72 ++++---- tla/disaster-recovery/stateright/Cargo.toml | 3 +- tla/disaster-recovery/stateright/src/main.rs | 174 ++++++++++++++---- tla/disaster-recovery/stateright/src/model.rs | 95 +++++----- 4 files changed, 225 insertions(+), 119 deletions(-) diff --git a/tla/disaster-recovery/stateright/Cargo.lock 
b/tla/disaster-recovery/stateright/Cargo.lock index 12eb8287670e..d7f54e1881b4 100644 --- a/tla/disaster-recovery/stateright/Cargo.lock +++ b/tla/disaster-recovery/stateright/Cargo.lock @@ -17,9 +17,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.18" +version = "0.6.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" dependencies = [ "anstyle", "anstyle-parse", @@ -32,33 +32,33 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" -version = "3.0.8" +version = "3.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", @@ -93,9 +93,9 @@ dependencies = [ [[package]] name = "cfg-if" -version = "1.0.0" +version = "1.0.1" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" [[package]] name = "choice" @@ -111,9 +111,9 @@ checksum = "6e4de3bc4ea267985becf712dc6d9eed8b04c953b3fcfb339ebc87acd9804901" [[package]] name = "clap" -version = "4.5.38" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed93b9805f8ba930df42c2590f05453d5ec36cbb85d018868a5b24d31f6ac000" +checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" dependencies = [ "clap_builder", "clap_derive", @@ -121,9 +121,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.38" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "379026ff283facf611b0ea629334361c4211d1b12ee01024eec1591133b04120" +checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" dependencies = [ "anstream", "anstyle", @@ -133,9 +133,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.32" +version = "4.5.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09176aae279615badda0765c0c0b3f6ed53f4709118af73cf4655d85d1530cd7" +checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" dependencies = [ "heck", "proc-macro2", @@ -145,15 +145,15 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "colorchoice" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +checksum = 
"b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "crossbeam-utils" @@ -183,7 +183,7 @@ checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi 0.11.1+wasi-snapshot-preview1", ] [[package]] @@ -236,9 +236,9 @@ checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" [[package]] name = "libc" -version = "0.2.172" +version = "0.2.173" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +checksum = "d8cfeafaffdbc32176b64fb251369d52ea9f0a8fbc6f8759edffef7b525d64bb" [[package]] name = "lock_api" @@ -258,9 +258,9 @@ checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "memchr" -version = "2.7.4" +version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "nohash-hasher" @@ -368,9 +368,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.12" +version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" +checksum = "0d04b7d0ee6b4a0207a0a7adb104d23ecb0b47d6beae7152d0fa34b692b29fd6" dependencies = [ "bitflags", ] @@ -421,14 +421,14 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.15.0" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" [[package]] name = "stateright" version = "0.30.2" -source = 
"git+https://github.com/cjen1-msft/stateright?branch=eventually-fix#c59ac159513b88830f9bab58ce094afd58c7cb11" +source = "git+https://github.com/cjen1-msft/stateright?branch=master#01959045c9c0cd69f7aa0498bf8dc48982eacdee" dependencies = [ "ahash", "choice", @@ -452,9 +452,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "syn" -version = "2.0.101" +version = "2.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = "e4307e30089d6fd6aff212f2da3a1f9e32f3223b1f010fb09b7c95f90f3ca1e8" dependencies = [ "proc-macro2", "quote", @@ -493,9 +493,9 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" +version = "0.11.1+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasi" diff --git a/tla/disaster-recovery/stateright/Cargo.toml b/tla/disaster-recovery/stateright/Cargo.toml index 74cf461168ec..9588f0cc93a2 100644 --- a/tla/disaster-recovery/stateright/Cargo.toml +++ b/tla/disaster-recovery/stateright/Cargo.toml @@ -4,5 +4,4 @@ version = "0.0.0" [dependencies] clap = { version = "4.5.38", features = ["derive"] } -#stateright = { git = "https://github.com/stateright/stateright", rev = "4c385128baa589c6760e675ca32219fb722aedd3", version = "0.30.2" } -stateright = { git = "https://github.com/cjen1-msft/stateright", branch="eventually-fix" } +stateright = { git = "https://github.com/cjen1-msft/stateright", branch="master" } \ No newline at end of file diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/stateright/src/main.rs index 6145bb30ede3..5fac13bf5b74 100644 --- 
a/tla/disaster-recovery/stateright/src/main.rs +++ b/tla/disaster-recovery/stateright/src/main.rs @@ -3,23 +3,27 @@ extern crate stateright; use clap::Parser; mod model; use model::{ModelCfg, Msg, NextStep, Node, State}; -use stateright::{ - actor::*, - report::WriteReporter, - util::{HashableHashMap, HashableHashSet}, - Checker, Model, -}; +use stateright::{actor::*, report::WriteReporter, util::HashableHashSet, Checker, Model}; use std::sync::Arc; fn implies(a: bool, b: bool) -> bool { !a || b } -fn reached_open(model: &ActorModel, state: &ActorModelState) -> bool { +fn reached_open(state: &ActorModelState) -> bool { state .actor_states .iter() - .any(|actor_state: &Arc| actor_state.next_step == NextStep::Open) + .any(|actor_state: &Arc| matches!(actor_state.next_step, NextStep::Open { .. })) +} + +fn reached_open_timeout(state: &ActorModelState, expected_to_timeout: bool) -> bool { + state.actor_states.iter().any(|actor_state: &Arc| { + matches! ( + actor_state.next_step, + NextStep::Open {timeout} if timeout == expected_to_timeout + ) + }) } fn unanimous_votes(model: &ActorModel, state: &ActorModelState) -> bool { @@ -28,39 +32,122 @@ fn unanimous_votes(model: &ActorModel, state: &ActorModelState| { actor_state.submitted_vote.is_some() - && actor_state.submitted_vote.clone().unwrap().1.recv == peers + && peers.iter().all(|peer| { + actor_state + .submitted_vote + .clone() + .unwrap() + .1 + .recv + .iter() + .any(|g| g.src == *peer) + }) }) } +fn majority_have_same_maximum(state: &ActorModelState) -> bool { + // get the chosen replica of each replica into a vector and sort that vector + // that there is only one value up to the n/2th index + let mut chosen_replicas: Vec = state + .actor_states + .iter() + .filter(|actor_state: &&Arc| actor_state.submitted_vote.is_some()) + .map(|actor_state| { + actor_state + .submitted_vote + .clone() + .unwrap() + .1 + .recv + .iter() + .max_by_key(|g| (g.txid, g.src)) + .unwrap() + .src + }) + .collect(); + 
chosen_replicas.sort(); + let majority_idx = state.actor_states.len() / 2; + let majority_chosen_replica = chosen_replicas.get(majority_idx); + majority_chosen_replica.is_some() + && chosen_replicas[0..majority_idx] + .iter() + .all(|&r| r == *majority_chosen_replica.unwrap()) +} + fn liveness_properties(model: ActorModel) -> ActorModel { - let model = model.property( - stateright::Expectation::Eventually, - "Unanimous votes => no deadlock", - |model: &ActorModel, state: &ActorModelState| { - // Define deadlock as a path which does not reach open - // Hence unanimous votes => reach open - // Hence on every path unanimous votes => <> reached open - // Since votes are not forgotten on a node, we check for a state where unanimous votes => reached open - return implies(unanimous_votes(model, state), reached_open(model, state)); - }, - ); + let model = model + .property( + stateright::Expectation::Eventually, + "Unanimous votes => no chance of a fork", + |model: &ActorModel, state: &ActorModelState| { + // Define deadlock as a path which does not reach open without + // Hence unanimous votes => reach open + // Hence on every path unanimous votes => <> reached open + // Since votes are not forgotten on a node, we check for a state where unanimous votes => reached open + return implies( + unanimous_votes(model, state), + reached_open_timeout(state, false), + ); + }, + ) + .property( + stateright::Expectation::Eventually, + "Open", + |_, state: &ActorModelState| { + // all runs should eventually open, either via the reliable method, or via the failover timeout + reached_open(state) + }, + ) + .property( + stateright::Expectation::Eventually, + "Majority votes => no fork", + |_, state: &ActorModelState| { + return implies( + majority_have_same_maximum(state), + reached_open_timeout(state, false), + ); + }, + ); return model; } fn invariant_properties(model: ActorModel) -> ActorModel { - let model = model.property( - stateright::Expectation::Always, - "No fork", - |_model: 
&ActorModel, state: &ActorModelState| { - // Check if there is no fork in the state - let open_node_count = state - .actor_states - .iter() - .filter(|actor_state: &&Arc| actor_state.next_step == NextStep::Open) - .count(); - open_node_count <= 1 - }, - ); + let model = model + .property( + stateright::Expectation::Always, + "No open with timeout, no fork", + |_model: &ActorModel, state: &ActorModelState| { + // Check if there is no fork in the state + let open_node_count = state + .actor_states + .iter() + .filter(|actor_state: &&Arc| { + matches!(actor_state.next_step, NextStep::Open { .. }) + }) + .count(); + implies(!reached_open_timeout(state, true), open_node_count <= 1) + }, + ) + .property( + stateright::Expectation::Always, + "Persist committed txs", + |_model: &ActorModel, state: &ActorModelState| { + let majority_idx = state.actor_states.len() / 2; + let commit_txid = state + .actor_states + .iter() + .map(|actor_state| actor_state.txid) + .collect::>()[majority_idx]; + let cond = state + .actor_states + .iter() + .filter(|actor_state: &&Arc| { + matches!(actor_state.next_step, NextStep::Open { .. 
}) + }) + .all(|actor_state: &Arc| actor_state.txid >= commit_txid); + implies(!reached_open_timeout(state, true), cond) + }, + ); return model; } @@ -68,15 +155,18 @@ fn reachable_properties(model: ActorModel) -> ActorModel, state: &ActorModelState| { - reached_open(model, state) - }, + "Open is possible", + |_, state| implies(state.actor_states.len() > 1, reached_open(state)), + ) + .property( + stateright::Expectation::Sometimes, + "Unsafe open with timeout", + |_, state| reached_open_timeout(state, true), ) .property( stateright::Expectation::Sometimes, "Deadlock", - |_model: &ActorModel, state: &ActorModelState| { + |_model, state| { let all_open_join = state .actor_states .iter() @@ -89,6 +179,11 @@ fn reachable_properties(model: ActorModel) -> ActorModel) { let checker = model .checker() - .spawn_dfs() + //.symmetry() + .spawn_bfs() .join_and_report(&mut WriteReporter::new(&mut std::io::stderr())); checker.assert_properties(); } diff --git a/tla/disaster-recovery/stateright/src/model.rs b/tla/disaster-recovery/stateright/src/model.rs index b8679c06451e..497679b86ffe 100644 --- a/tla/disaster-recovery/stateright/src/model.rs +++ b/tla/disaster-recovery/stateright/src/model.rs @@ -1,5 +1,5 @@ extern crate stateright; -use stateright::{actor::*, util::HashableHashSet, Rewrite, RewritePlan}; +use stateright::{actor::*, util::HashableHashSet}; use std::borrow::Cow; type Txid = u64; @@ -13,14 +13,14 @@ pub struct GossipStruct { #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] pub struct VoteStruct { pub src: Id, - pub recv: HashableHashSet, + pub recv: HashableHashSet, } #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub enum Msg { Gossip(GossipStruct), Vote(VoteStruct), - Open, + IAmOpen(Id), } #[derive(Debug, Clone, Hash, PartialEq, Eq)] @@ -30,10 +30,9 @@ pub enum Timer { #[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)] pub enum NextStep { - // Gossip, Vote, OpenJoin, - Open, + Open { timeout: bool }, Join, } @@ -43,27 +42,26 @@ pub 
struct State { pub gossips: HashableHashSet, pub votes: HashableHashSet, pub submitted_vote: Option<(Id, VoteStruct)>, + pub txid: Txid, } #[derive(Debug, Clone, Hash, PartialEq, Eq)] pub struct Node { - pub txid: Txid, pub peers: HashableHashSet, } impl Node { - fn vote_for_max<'a, I>(gossips: &I, id: Id) -> (Id, VoteStruct) - where - I: Iterator + Clone, - { + fn vote_for_max<'a>(gossips: &HashableHashSet, id: Id) -> (Id, VoteStruct) +where { let dst = gossips + .iter() .clone() - .max_by(|a, b| a.txid.cmp(&b.txid)) + .max_by_key(|g| (g.txid, g.src)) .unwrap() .src; let vote = VoteStruct { src: id, - recv: gossips.clone().map(|g| g.src.clone()).collect(), + recv: gossips.clone(), }; return (dst, vote); } @@ -72,19 +70,30 @@ impl Node { self.peers.iter().filter(|&&p| p != id).cloned().collect() } - fn advance_step(&self, state: &mut State, o: &mut Out, id: Id, timeout: bool) { + fn advance_step(&self, state: &mut State, o: &mut Out, id: Id, timeout: bool) -> bool { match state.next_step { NextStep::Vote if state.gossips.len() == self.peers.len() || timeout => { - let (dst, vote) = Node::vote_for_max(&state.gossips.iter(), id); + let (dst, vote) = Node::vote_for_max(&state.gossips, id); state.submitted_vote = Some((dst, vote.clone())); - o.send(dst, Msg::Vote(vote)); + if dst == id { + state.votes.insert(vote); + } else { + o.send(dst, Msg::Vote(vote)); + } state.next_step = NextStep::OpenJoin; + return true; } - NextStep::OpenJoin if state.votes.len() >= (self.peers.len() + 1) / 2 => { - state.next_step = NextStep::Open; - o.broadcast(&self.other_peers(id), &Msg::Open); + NextStep::OpenJoin if state.votes.len() >= (self.peers.len() + 1) / 2 || timeout => { + state.next_step = NextStep::Open { timeout }; + o.broadcast(&self.other_peers(id), &Msg::IAmOpen(id)); + return true; } - _ => {} + _ => false, + } + } + + fn advance_several(&self, state: &mut State, o: &mut Out, id: Id, timeout: bool) { + while self.advance_step(state, o, id, timeout) { } } } @@ -97,38 
+106,39 @@ impl Actor for Node { type Random = (); fn on_start(&self, id: Id, _storage: &Option, o: &mut Out) -> Self::State { + let txid = usize::from(id) as Txid; // Use id as txid for simplicity + let gossip = GossipStruct { src: id, txid }; let mut gossips = HashableHashSet::new(); - gossips.insert(GossipStruct { - src: id, - txid: self.txid, - }); - let state = State { + gossips.insert(gossip.clone()); + let mut state = State { next_step: NextStep::Vote, - gossips: gossips, + gossips, votes: HashableHashSet::new(), submitted_vote: None, + txid: usize::from(id) as Txid, }; - o.broadcast( - &self.other_peers(id), - &Msg::Gossip(GossipStruct { - src: id, - txid: self.txid, - }), - ); + o.broadcast(&self.other_peers(id), &Msg::Gossip(gossip)); o.set_timer(Timer::ElectionTimeout, model_timeout()); + self.advance_several(&mut state, o, id, false); return state; } fn on_timeout(&self, id: Id, state: &mut Cow, timer: &Timer, o: &mut Out) { match timer { - Timer::ElectionTimeout => { - if state.next_step == NextStep::Vote && !state.gossips.is_empty() { + Timer::ElectionTimeout => match state.next_step { + NextStep::Vote if !state.gossips.is_empty() => { let state = state.to_mut(); - self.advance_step(state, o, id, true); - } else { + self.advance_several(state, o, id, true); o.set_timer(Timer::ElectionTimeout, model_timeout()); } - } + NextStep::OpenJoin if !state.votes.is_empty() => { + let state = state.to_mut(); + self.advance_several(state, o, id, true); + } + _ => { + o.set_timer(Timer::ElectionTimeout, model_timeout()); + } + }, } } @@ -153,11 +163,13 @@ impl Actor for Node { state.votes.insert(vote); } } - Msg::Open => { - state.next_step = NextStep::Join; + Msg::IAmOpen(_) => { + if !matches!(state.next_step, NextStep::Open { .. 
}) { + state.next_step = NextStep::Join; + } } }; - self.advance_step(state, o, id, false); + self.advance_several(state, o, id, false); } } @@ -172,9 +184,8 @@ impl ModelCfg { ActorModel::new(self.clone(), ()) .actors( (0..self.n_nodes) - .map(|i| Node { + .map(|_| Node { peers: peers.clone(), - txid: i as u64, }) .collect::>(), ) From 5a98922ca48f29dad8ad2b2a144ca104fd683613 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 3 Jul 2025 19:05:03 +0100 Subject: [PATCH 012/197] Add reasonably clean curlm support --- src/http/curl.h | 207 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index 599f92d18e2a..74db6c1a0257 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -25,6 +25,17 @@ #define CHECK_CURL_EASY_GETINFO(handle, info, arg) \ CHECK_CURL_EASY(curl_easy_getinfo, handle, info, arg) +#define CHECK_CURL_MULTI(fn, ...) \ + do \ + { \ + const auto res = fn(__VA_ARGS__); \ + if (res != CURLM_OK) \ + { \ + throw std::runtime_error(fmt::format( \ + "Error calling " #fn ": {} ({})", res, curl_multi_strerror(res))); \ + } \ + } while (0) + namespace ccf::curl { @@ -48,6 +59,26 @@ namespace ccf::curl } }; + class UniqueCURLM + { + protected: + std::unique_ptr p; + + public: + UniqueCURLM() : p(curl_multi_init(), [](auto x) { curl_multi_cleanup(x); }) + { + if (!p.get()) + { + throw std::runtime_error("Error initialising curl multi request"); + } + } + + operator CURLM*() const + { + return p.get(); + } + }; + class UniqueSlist { protected: @@ -67,4 +98,180 @@ namespace ccf::curl } }; + class RequestBody + { + std::vector buffer_vec; + std::span buffer_span; + + public: + RequestBody(std::vector& buffer) : buffer_vec(std::move(buffer)) + { + buffer_span = + std::span(buffer_vec.data(), buffer_vec.size()); + } + + template + RequestBody(Jsonable jsonable) + { + auto json_str = nlohmann::json(jsonable).dump(); + buffer_vec = std::vector( + json_str.begin(), json_str.end()); // Convert to 
vector of bytes + buffer_span = + std::span(buffer_vec.data(), buffer_vec.size()); + } + + static size_t send_data( + char* ptr, size_t size, size_t nitems, void* userdata) + { + auto* data = static_cast(userdata); + auto bytes_to_copy = std::min(data->buffer_span.size(), size * nitems); + memcpy(ptr, data->buffer_span.data(), bytes_to_copy); + data->buffer_span = data->buffer_span.subspan(bytes_to_copy); + return bytes_to_copy; + } + + void attach_to_curl(CURL* curl) + { + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_READDATA, this); + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_READFUNCTION, send_data); + CHECK_CURL_EASY_SETOPT( + curl, CURLOPT_INFILESIZE, static_cast(buffer_span.size())); + } + }; + + class ResponseBody + { + public: + std::vector buffer; + + static size_t write_response_chunk( + char* ptr, size_t size, size_t nmemb, void* userdata) + { + auto* data = static_cast(userdata); + auto bytes_to_copy = size * nmemb; + data->buffer.insert( + data->buffer.end(), (uint8_t*)ptr, (uint8_t*)ptr + bytes_to_copy); + // Should probably set a maximum response size here + return bytes_to_copy; + } + + void attach_to_curl(CURL* curl) + { + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, this); + // Called one or more times to add more data + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, write_response_chunk); + } + }; + + class CurlRequest + { + public: + UniqueCURL curl_handle; + std::string url; + std::unique_ptr request_body = nullptr; + std::unique_ptr response_body = nullptr; + ccf::curl::UniqueSlist headers; + std::optional> response_callback = + std::nullopt; + + void attach_to_curl() const + { + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_URL, url.c_str()); + if (request_body != nullptr) + { + request_body->attach_to_curl(curl_handle); + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); + } + if (response_body != nullptr) + { + response_body->attach_to_curl(curl_handle); + } + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); + } + + 
void set_url(const std::string& new_url) + { + url = new_url; + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_URL, url.c_str()); + } + + void set_blob_opt(auto option, const uint8_t* data, size_t length) + { + struct curl_blob blob + { + .data = const_cast(data), .len = length, + .flags = CURL_BLOB_COPY, + }; + + CHECK_CURL_EASY_SETOPT(curl_handle, option, blob); + } + + void set_response_callback( + std::function callback) + { + if (response_body != nullptr || response_callback.has_value()) + { + throw std::logic_error( + "Only one response callback can be set for a request."); + } + response_callback = std::move(callback); + response_body = std::make_unique(); + } + + static void attach_to_multi_curl( + CURLM* curl_multi, std::unique_ptr& request) + { + request->attach_to_curl(); + CURL* curl_handle = request->curl_handle; + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_PRIVATE, request.release()); + CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi, curl_handle); + } + }; + + inline int iter_CURLM_CurlRequest(UniqueCURLM& p) + { + int running_handles = 0; + CHECK_CURL_MULTI(curl_multi_perform, p, &running_handles); + + // handle all completed curl requests + int msgq = 0; + CURLMsg* msg = nullptr; + do + { + msg = curl_multi_info_read(p, &msgq); + + if ((msg != nullptr) && msg->msg == CURLMSG_DONE) + { + auto* easy = msg->easy_handle; + auto result = msg->data.result; + + LOG_INFO_FMT( + "CURL request response handling with result: {} ({})", + result, + curl_easy_strerror(result)); + + // retrieve the request data and attach a lifetime to it + ccf::curl::CurlRequest* request_data = nullptr; + curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request_data); + std::unique_ptr request_data_ptr(request_data); + + // Clean up the easy handle and corresponding resources + curl_multi_remove_handle(p, easy); + if (request_data->response_callback.has_value()) + { + if (request_data->response_body != nullptr) + { + request_data->response_callback.value()( + 
*request_data->response_body); + } + } + // Handled by the destructor of CurlRequest + LOG_INFO_FMT( + "Finished handling CURLMSG: msg_nullptr: {}, remaining: {}", + msg != nullptr, + msgq); + } + } while (msgq > 0); + return running_handles; + } } // namespace ccf::curl \ No newline at end of file From 9edf637612c72150f5aa6ee318d65f84afce5d52 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 4 Jul 2025 15:11:00 +0100 Subject: [PATCH 013/197] Add proper curl and libuv interaction --- src/http/curl.h | 247 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 231 insertions(+), 16 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 74db6c1a0257..56287da43dcd 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -5,9 +5,13 @@ #include "ccf/ds/logger.h" #include "ccf/ds/nonstd.h" +#include #include +#include #include #include +#include +#include #define CHECK_CURL_EASY(fn, ...) \ do \ @@ -41,13 +45,13 @@ namespace ccf::curl class UniqueCURL { - protected: + private: std::unique_ptr p; public: UniqueCURL() : p(curl_easy_init(), [](auto x) { curl_easy_cleanup(x); }) { - if (!p.get()) + if (!p) { throw std::runtime_error("Error initialising curl easy request"); } @@ -61,13 +65,13 @@ namespace ccf::curl class UniqueCURLM { - protected: + private: std::unique_ptr p; public: UniqueCURLM() : p(curl_multi_init(), [](auto x) { curl_multi_cleanup(x); }) { - if (!p.get()) + if (!p) { throw std::runtime_error("Error initialising curl multi request"); } @@ -81,7 +85,7 @@ namespace ccf::curl class UniqueSlist { - protected: + private: std::unique_ptr p; public: @@ -92,7 +96,7 @@ namespace ccf::curl p.reset(curl_slist_append(p.release(), str)); } - curl_slist* get() const + [[nodiscard]] curl_slist* get() const { return p.get(); } @@ -145,12 +149,11 @@ namespace ccf::curl std::vector buffer; static size_t write_response_chunk( - char* ptr, size_t size, size_t nmemb, void* userdata) + uint8_t* ptr, size_t size, size_t nmemb, void* userdata) { auto* data = 
static_cast(userdata); auto bytes_to_copy = size * nmemb; - data->buffer.insert( - data->buffer.end(), (uint8_t*)ptr, (uint8_t*)ptr + bytes_to_copy); + data->buffer.insert(data->buffer.end(), ptr, ptr + bytes_to_copy); // Should probably set a maximum response size here return bytes_to_copy; } @@ -163,6 +166,28 @@ namespace ccf::curl } }; + // Use in conjunction with the iter_CURLM_CurlRequest function + // to force only requests with the corresponding CurlRequest private data + class CurlRequestCURLM + { + private: + CURLM* curl_multi; + + public: + CurlRequestCURLM(CURLM* curl_multi) : curl_multi(curl_multi) + { + if (curl_multi == nullptr) + { + throw std::runtime_error("CURLM handle cannot be null"); + } + } + + [[nodiscard]] CURLM* get() const + { + return curl_multi; + } + }; + class CurlRequest { public: @@ -219,33 +244,33 @@ namespace ccf::curl } static void attach_to_multi_curl( - CURLM* curl_multi, std::unique_ptr& request) + const CurlRequestCURLM& curl_multi, std::unique_ptr& request) { request->attach_to_curl(); CURL* curl_handle = request->curl_handle; CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_PRIVATE, request.release()); - CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi, curl_handle); + CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi.get(), curl_handle); } }; - inline int iter_CURLM_CurlRequest(UniqueCURLM& p) + inline int iter_CURLM_CurlRequest(const CurlRequestCURLM& p) { int running_handles = 0; - CHECK_CURL_MULTI(curl_multi_perform, p, &running_handles); + CHECK_CURL_MULTI(curl_multi_perform, p.get(), &running_handles); // handle all completed curl requests int msgq = 0; CURLMsg* msg = nullptr; do { - msg = curl_multi_info_read(p, &msgq); + msg = curl_multi_info_read(p.get(), &msgq); if ((msg != nullptr) && msg->msg == CURLMSG_DONE) { auto* easy = msg->easy_handle; auto result = msg->data.result; - LOG_INFO_FMT( + LOG_TRACE_FMT( "CURL request response handling with result: {} ({})", result, curl_easy_strerror(result)); @@ -253,10 +278,15 
@@ namespace ccf::curl // retrieve the request data and attach a lifetime to it ccf::curl::CurlRequest* request_data = nullptr; curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request_data); + if (request_data == nullptr) + { + throw std::runtime_error( + "CURLMSG_DONE received with no associated request data"); + } std::unique_ptr request_data_ptr(request_data); // Clean up the easy handle and corresponding resources - curl_multi_remove_handle(p, easy); + curl_multi_remove_handle(p.get(), easy); if (request_data->response_callback.has_value()) { if (request_data->response_body != nullptr) @@ -274,4 +304,189 @@ namespace ccf::curl } while (msgq > 0); return running_handles; } + + class CurlmLibuvContext + { + uv_loop_t* loop; + uv_timer_t timeout_tracker{}; + // lifetime handler of curl_multi interface + UniqueCURLM curl_multi; + // utility class to enforce type safety on accesses to curl_multi + CurlRequestCURLM curl_request_curlm; + + struct RequestContext + { + uv_poll_t poll_handle; + curl_socket_t socket; + CurlmLibuvContext* context; + }; + + public: + void handle_request_messages() + { + iter_CURLM_CurlRequest(curl_request_curlm); + } + + static void libuv_timeout_callback(uv_timer_t* handle) + { + auto* self = static_cast(handle->data); + if (self == nullptr) + { + throw std::logic_error( + "libuv_timeout_callback called with null self pointer"); + } + + int running_handles = 0; + CHECK_CURL_MULTI( + curl_multi_socket_action, + self->curl_multi, + CURL_SOCKET_TIMEOUT, + 0, + &running_handles); + self->handle_request_messages(); + } + + static int curl_timeout_callback( + CURLM* multi, long timeout_ms, CurlmLibuvContext* self) + { + (void)multi; + if (self == nullptr) + { + throw std::logic_error( + "libuv_timeout_callback called with null self pointer"); + } + + if (timeout_ms < 0) + { + // No timeout set, stop the timer + uv_timer_stop(&self->timeout_tracker); + } + else + { + // If timeout is zero, this will trigger on the next uv loop iteration + 
uv_timer_start(&self->timeout_tracker, libuv_timeout_callback, 0, 0); + } + return 0; + } + + // Called when libuv detects a socket event + static void libuv_socket_poll_callback( + uv_poll_t* req, int status, int events) + { + if (status < 0) + { + LOG_FAIL_FMT("Socket poll error: {}", uv_strerror(status)); + return; + } + + auto* request_context = static_cast(req->data); + if (request_context == nullptr) + { + throw std::logic_error( + "libuv_socket_poll_callback called with null request context"); + } + + auto* self = request_context->context; + if (self == nullptr) + { + throw std::logic_error( + "libuv_socket_poll_callback called with null self pointer"); + } + + int action = 0; + action |= ((events & UV_READABLE) != 0) ? CURL_CSELECT_IN : 0; + action |= ((events & UV_WRITABLE) != 0) ? CURL_CSELECT_OUT : 0; + int running_handles = 0; + CHECK_CURL_MULTI( + curl_multi_socket_action, + self->curl_multi, + request_context->socket, + action, + &running_handles); + self->handle_request_messages(); + } + + // Called when the status of a socket changes (creation/deletion) + static int curl_socket_callback( + CURL* easy, + curl_socket_t s, + int action, + CurlmLibuvContext* self, + RequestContext* request_context) + { + (void)easy; + switch (action) + { + case CURL_POLL_IN: + case CURL_POLL_OUT: + case CURL_POLL_INOUT: + { + if (request_context == nullptr) + { + auto request_context_ptr = std::make_unique(); + request_context_ptr->context = self; + request_context_ptr->socket = s; + uv_poll_init_socket( + self->loop, &request_context_ptr->poll_handle, s); + request_context_ptr->poll_handle.data = + request_context_ptr.get(); // Attach the context + // attach the lifetime to the socket handle + request_context = request_context_ptr.release(); + CHECK_CURL_MULTI( + curl_multi_assign, self->curl_multi, s, request_context); + } + + int events = 0; + events |= (action == CURL_POLL_IN) ? 0 : UV_WRITABLE; + events |= (action == CURL_POLL_OUT) ? 
0 : UV_READABLE; + uv_poll_start( + &request_context->poll_handle, events, libuv_socket_poll_callback); + break; + } + case CURL_POLL_REMOVE: + if (request_context != nullptr) + { + uv_poll_stop(&request_context->poll_handle); + std::unique_ptr request_context_ptr( + request_context); + curl_multi_assign(self->curl_multi, s, nullptr); + } + break; + default: + throw std::runtime_error("Unknown action in curl_socket_callback"); + } + return 0; + } + + CurlmLibuvContext(uv_loop_t* loop) : + loop(loop), + curl_request_curlm(curl_multi) + { + uv_timer_init(loop, &timeout_tracker); + timeout_tracker.data = this; // Attach this instance to the timer + + // attach timeouts + CHECK_CURL_MULTI(curl_multi_setopt, curl_multi, CURLMOPT_TIMERDATA, this); + CHECK_CURL_MULTI( + curl_multi_setopt, + curl_multi, + CURLMOPT_TIMERFUNCTION, + curl_timeout_callback); + + // attach socket events + CHECK_CURL_MULTI( + curl_multi_setopt, curl_multi, CURLMOPT_SOCKETDATA, this); + CHECK_CURL_MULTI( + curl_multi_setopt, + curl_multi, + CURLMOPT_SOCKETFUNCTION, + curl_socket_callback); + } + + // should this return a reference or a pointer? 
+ [[nodiscard]] const CurlRequestCURLM& curlm() const + { + return curl_request_curlm; + } + }; } // namespace ccf::curl \ No newline at end of file From c745ade1dfe5239a457b56523f8c1dde51b17c3b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 4 Jul 2025 15:56:15 +0100 Subject: [PATCH 014/197] Pass curl singleton over enclave barrier --- src/common/configuration.h | 3 +++ src/enclave/main.cpp | 16 ++++++++++++++++ src/host/main.cpp | 9 +++++++++ src/http/curl.h | 19 +++++++++++++++++++ 4 files changed, 47 insertions(+) diff --git a/src/common/configuration.h b/src/common/configuration.h index 49855b2b0de1..496ccd592d35 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -16,6 +16,7 @@ #include "common/enclave_interface_types.h" #include "consensus/consensus_types.h" #include "ds/oversized.h" +#include "http/curl.h" #include "service/tables/config.h" #include @@ -39,6 +40,8 @@ struct EnclaveConfig ringbuffer::Offsets* from_enclave_buffer_offsets; oversized::WriterConfig writer_config = {}; + + ccf::curl::CurlmLibuvContext* curl_libuv_context; }; static constexpr auto node_to_node_interface_name = "node_to_node_interface"; diff --git a/src/enclave/main.cpp b/src/enclave/main.cpp index 03db31cde942..1540ab0c437a 100644 --- a/src/enclave/main.cpp +++ b/src/enclave/main.cpp @@ -8,6 +8,7 @@ #include "common/enclave_interface_types.h" #include "enclave.h" #include "enclave_time.h" +#include "http/curl.h" #include "ringbuffer_logger.h" #include @@ -75,6 +76,21 @@ extern "C" auto writer_factory = std::make_unique( *basic_writer_factory, ec.writer_config); + auto& curl_context = ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe(); + if (curl_context == nullptr) + { + LOG_FAIL_FMT( + "Curl context singleton already initialized"); + return CreateNodeStatus::InternalError; + } + if (ec.curl_libuv_context == nullptr) + { + LOG_FAIL_FMT( + "Enclave config curl context is null"); + return CreateNodeStatus::InternalError; + } + curl_context = 
ec.curl_libuv_context; + // Note: because logger uses ringbuffer, logger can only be initialised once // ringbuffer memory has been verified auto new_logger = std::make_unique(*writer_factory); diff --git a/src/host/main.cpp b/src/host/main.cpp index a021cd09f419..136d8668869b 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -28,6 +28,7 @@ #include "enclave.h" #include "handle_ring_buffer.h" #include "host/env.h" +#include "http/curl.h" #include "json_schema.h" #include "lfs_file_handler.h" #include "load_monitor.h" @@ -356,6 +357,11 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) // Write PID to disk files::dump(fmt::format("{}", ::getpid()), config.output_files.pid_file); + // Initialise curlm libuv interface + ccf::curl::CurlmLibuvContext curl_libuv_context(uv_default_loop()); + ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe() = + &curl_libuv_context; + // set the host log level ccf::logger::config::level() = config.logging.host_level; @@ -602,6 +608,9 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) enclave_config.writer_config = writer_config; + enclave_config.curl_libuv_context = + &ccf::curl::CurlmLibuvContextSingleton::get_instance(); + ccf::StartupConfig startup_config(config); if (startup_config.attestation.snp_security_policy_file.has_value()) diff --git a/src/http/curl.h b/src/http/curl.h index 56287da43dcd..1aea19c8fa60 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -489,4 +489,23 @@ namespace ccf::curl return curl_request_curlm; } }; + + class CurlmLibuvContextSingleton + { + static CurlmLibuvContext* curlm_libuv_context_instance; + public: + static CurlmLibuvContext*& get_instance_unsafe() + { + return curlm_libuv_context_instance; + } + static CurlmLibuvContext& get_instance() + { + if (curlm_libuv_context_instance == nullptr) + { + throw std::logic_error( + "CurlmLibuvContextSingleton instance not initialized"); + } + return *curlm_libuv_context_instance; + } + }; } // 
namespace ccf::curl \ No newline at end of file From 71c1fb3650b57dee3d293e1884f21e1149051564 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 4 Jul 2025 15:59:19 +0100 Subject: [PATCH 015/197] Ensure singleton is initialised --- src/http/curl.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index 1aea19c8fa60..44c4b7211388 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -508,4 +508,7 @@ namespace ccf::curl return *curlm_libuv_context_instance; } }; + + inline CurlmLibuvContext* CurlmLibuvContextSingleton::curlm_libuv_context_instance = + nullptr; } // namespace ccf::curl \ No newline at end of file From 695f3516c06ffd5eb4748e4bcb14c36eba8106bf Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 4 Jul 2025 18:03:09 +0100 Subject: [PATCH 016/197] Make quote endorsement client use curl_multi --- src/enclave/main.cpp | 11 +- src/http/curl.h | 93 +++++++-- src/node/quote_endorsements_client.h | 299 ++++++++++++++------------- 3 files changed, 227 insertions(+), 176 deletions(-) diff --git a/src/enclave/main.cpp b/src/enclave/main.cpp index 1540ab0c437a..8024691eb48e 100644 --- a/src/enclave/main.cpp +++ b/src/enclave/main.cpp @@ -76,17 +76,16 @@ extern "C" auto writer_factory = std::make_unique( *basic_writer_factory, ec.writer_config); - auto& curl_context = ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe(); + auto& curl_context = + ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe(); if (curl_context == nullptr) { - LOG_FAIL_FMT( - "Curl context singleton already initialized"); + LOG_FAIL_FMT("Curl context singleton already initialized"); return CreateNodeStatus::InternalError; - } + } if (ec.curl_libuv_context == nullptr) { - LOG_FAIL_FMT( - "Enclave config curl context is null"); + LOG_FAIL_FMT("Enclave config curl context is null"); return CreateNodeStatus::InternalError; } curl_context = ec.curl_libuv_context; diff --git a/src/http/curl.h b/src/http/curl.h index 44c4b7211388..f58485dd08f1 
100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -143,26 +143,62 @@ namespace ccf::curl } }; - class ResponseBody + class Response { public: std::vector buffer; + using HeaderMap = std::unordered_map; + HeaderMap headers; + long status_code = 0; static size_t write_response_chunk( uint8_t* ptr, size_t size, size_t nmemb, void* userdata) { - auto* data = static_cast(userdata); + auto* data = static_cast(userdata); auto bytes_to_copy = size * nmemb; data->buffer.insert(data->buffer.end(), ptr, ptr + bytes_to_copy); // Should probably set a maximum response size here return bytes_to_copy; } + static size_t append_header( + char* buffer, size_t size, size_t nitems, Response* response) + { + if (size != 1) + { + LOG_FAIL_FMT( + "Unexpected value in curl HEADERFUNCTION callback: size = {}", size); + return 0; + } + + const std::string_view header = + ccf::nonstd::trim(std::string_view(buffer, nitems)); + + // Ignore HTTP status line, and empty line + if (!header.empty() && !header.starts_with("HTTP/1.1")) + { + const auto [field, value] = ccf::nonstd::split_1(header, ": "); + if (!value.empty()) + { + response->headers[std::string(field)] = ccf::nonstd::trim(value); + } + else + { + LOG_INFO_FMT("Ignoring invalid-looking HTTP Header '{}'", header); + } + } + + return nitems * size; + } + void attach_to_curl(CURL* curl) { + // Body CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, this); - // Called one or more times to add more data CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, write_response_chunk); + // Headers + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERDATA, this); + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERFUNCTION, append_header); } }; @@ -194,9 +230,9 @@ namespace ccf::curl UniqueCURL curl_handle; std::string url; std::unique_ptr request_body = nullptr; - std::unique_ptr response_body = nullptr; + std::unique_ptr response = nullptr; ccf::curl::UniqueSlist headers; - std::optional> response_callback = + std::optional> response_callback = std::nullopt; 
void attach_to_curl() const @@ -207,9 +243,9 @@ namespace ccf::curl request_body->attach_to_curl(curl_handle); CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); } - if (response_body != nullptr) + if (response != nullptr) { - response_body->attach_to_curl(curl_handle); + response->attach_to_curl(curl_handle); } CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); } @@ -231,16 +267,25 @@ namespace ccf::curl CHECK_CURL_EASY_SETOPT(curl_handle, option, blob); } - void set_response_callback( - std::function callback) + void set_response_callback(std::function callback) { - if (response_body != nullptr || response_callback.has_value()) + if (response != nullptr || response_callback.has_value()) { throw std::logic_error( "Only one response callback can be set for a request."); } response_callback = std::move(callback); - response_body = std::make_unique(); + response = std::make_unique(); + } + + void set_header(const std::string& key, const std::string& value) + { + headers.append(fmt::format("{}: {}", key, value).c_str()); + } + + [[nodiscard]] CURL* get_easy_handle() const + { + return curl_handle; } static void attach_to_multi_curl( @@ -276,23 +321,28 @@ namespace ccf::curl curl_easy_strerror(result)); // retrieve the request data and attach a lifetime to it - ccf::curl::CurlRequest* request_data = nullptr; - curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request_data); - if (request_data == nullptr) + ccf::curl::CurlRequest* request = nullptr; + curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); + if (request == nullptr) { throw std::runtime_error( "CURLMSG_DONE received with no associated request data"); } - std::unique_ptr request_data_ptr(request_data); + std::unique_ptr request_data_ptr(request); + + if (request->response != nullptr) + { + CHECK_CURL_EASY_GETINFO( + easy, CURLINFO_RESPONSE_CODE, &request->response->status_code); + } // Clean up the easy handle and corresponding resources curl_multi_remove_handle(p.get(), easy); - if 
(request_data->response_callback.has_value()) + if (request->response_callback.has_value()) { - if (request_data->response_body != nullptr) + if (request->response != nullptr) { - request_data->response_callback.value()( - *request_data->response_body); + request->response_callback.value()(*request); } } // Handled by the destructor of CurlRequest @@ -493,6 +543,7 @@ namespace ccf::curl class CurlmLibuvContextSingleton { static CurlmLibuvContext* curlm_libuv_context_instance; + public: static CurlmLibuvContext*& get_instance_unsafe() { @@ -509,6 +560,6 @@ namespace ccf::curl } }; - inline CurlmLibuvContext* CurlmLibuvContextSingleton::curlm_libuv_context_instance = - nullptr; + inline CurlmLibuvContext* + CurlmLibuvContextSingleton::curlm_libuv_context_instance = nullptr; } // namespace ccf::curl \ No newline at end of file diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 16916e0045e3..5a492a5ca686 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -4,6 +4,7 @@ #include "ccf/pal/attestation.h" #include "enclave/rpc_sessions.h" +#include "http/curl.h" namespace ccf { @@ -82,103 +83,6 @@ namespace ccf size_t request_id; }; - std::shared_ptr create_unauthenticated_client() - { - // Note: server CA is not checked here as this client is not sending - // private data. If the server was malicious and the certificate chain was - // bogus, the verification of the endorsement of the quote would fail - // anyway. 
- return rpcsessions->create_client(std::make_shared<::tls::Cert>( - nullptr, std::nullopt, std::nullopt, std::nullopt, false)); - } - - std::shared_ptr create_unencrypted_client() - { - return rpcsessions->create_unencrypted_client(); - } - - void send_request( - const std::shared_ptr& client, - const EndpointInfo& endpoint) - { - { - ::http::Request r(endpoint.uri, HTTP_GET); - for (auto const& [k, v] : endpoint.params) - { - r.set_query_param(k, v); - } - for (auto const& [k, v] : endpoint.headers) - { - r.set_header(k, v); - } - r.set_header(http::headers::HOST, endpoint.host); - - LOG_INFO_FMT( - "Fetching endorsements for attestation report at {}{}{}", - endpoint, - r.get_path(), - r.get_formatted_query()); - client->send_request(std::move(r)); - } - - // Start watchdog to send request on new server if it is unresponsive - auto msg = std::make_unique< - ::threading::Tmsg>( - [](std::unique_ptr<::threading::Tmsg> - msg) { - std::lock_guard guard(msg->data.self->lock); - if (msg->data.self->has_completed) - { - return; - } - if (msg->data.request_id >= msg->data.self->last_received_request_id) - { - auto& servers = msg->data.self->config.servers; - // Should always contain at least one server, - // installed by ccf::pal::make_endorsement_endpoint_configuration() - if (servers.empty()) - { - throw std::logic_error( - "No server specified to fetch endorsements"); - } - - msg->data.self->server_retries_count++; - if ( - msg->data.self->server_retries_count >= - max_retries_count(servers.front())) - { - if (servers.size() > 1) - { - // Move on to next server if we have passed max retries count - servers.pop_front(); - } - else - { - auto& server = servers.front(); - LOG_FAIL_FMT( - "Giving up retrying fetching attestation endorsements from " - "{} after {} attempts", - server.front().host, - server.front().max_retries_count); - throw ccf::pal::AttestationCollateralFetchingTimeout( - "Timed out fetching attestation endorsements from all " - "configured servers"); - 
return; - } - } - - msg->data.self->fetch(servers.front()); - } - }, - shared_from_this(), - endpoint, - last_received_request_id); - - ::threading::ThreadMessaging::instance().add_task_after( - std::move(msg), - std::chrono::milliseconds(server_connection_timeout_s * 1000)); - } - void handle_success_response( std::vector&& data, const EndpointInfo& response_endpoint) { @@ -233,77 +137,174 @@ namespace ccf } } + std::string get_formatted_query( + const std::map params) const + { + std::string formatted_query; + bool first = true; + for (const auto& it : params) + { + formatted_query += + fmt::format("{}{}={}", (first ? '?' : '&'), it.first, it.second); + first = false; + } + return formatted_query; + } + void fetch(const Server& server) { auto endpoint = server.front(); - auto c = endpoint.tls ? create_unauthenticated_client() : - create_unencrypted_client(); - c->connect( + std::unique_ptr request; + + // set curl get + CHECK_CURL_EASY_SETOPT(request->get_easy_handle(), CURLOPT_HTTPGET, 1L); + + request->url = fmt::format( + "{}://{}:{}/{}{}", + endpoint.tls ? "https" : "http", endpoint.host, endpoint.port, - [this, server, endpoint]( - ccf::http_status status, - http::HeaderMap&& headers, - std::vector&& data) { - std::lock_guard guard(this->lock); + endpoint.uri, + get_formatted_query(endpoint.params)); + + if (endpoint.tls) + { + // Note: server CA is not checked here as this client is not sending + // private data. If the server was malicious and the certificate chain + // was bogus, the verification of the endorsement of the quote would + // fail anyway. 
+ CHECK_CURL_EASY_SETOPT( + request->get_easy_handle(), CURLOPT_SSL_VERIFYHOST, 0L); + CHECK_CURL_EASY_SETOPT( + request->get_easy_handle(), CURLOPT_SSL_VERIFYPEER, 0L); + CHECK_CURL_EASY_SETOPT( + request->get_easy_handle(), CURLOPT_SSL_VERIFYSTATUS, 0L); + } + + for (auto const& [k, v] : endpoint.headers) + { + request->set_header(k, v); + } + request->set_header(http::headers::HOST, endpoint.host); + + request->set_response_callback([this, server, endpoint]( + curl::CurlRequest& request) { + std::lock_guard guard(this->lock); + auto response = *request.response.release(); - last_received_request_id++; + last_received_request_id++; - if (status == HTTP_STATUS_OK) + if (request.response->status_code == HTTP_STATUS_OK) + { + LOG_INFO_FMT( + "Successfully retrieved endorsements for attestation report: " + "{} bytes", + request.response->buffer.size()); + + handle_success_response(std::move(response.buffer), endpoint); + return; + } + + LOG_DEBUG_FMT( + "Error fetching endorsements for attestation report: {}", + response.status_code); + if (response.status_code == HTTP_STATUS_TOO_MANY_REQUESTS) + { + constexpr size_t default_retry_after_s = 3; + size_t retry_after_s = default_retry_after_s; + auto h = response.headers.find(http::headers::RETRY_AFTER); + if (h != response.headers.end()) { - LOG_INFO_FMT( - "Successfully retrieved endorsements for attestation report: " - "{} bytes", - data.size()); + const auto& retry_after_value = h->second; + // If value is invalid, retry_after_s is unchanged + std::from_chars( + retry_after_value.data(), + retry_after_value.data() + retry_after_value.size(), + retry_after_s); + } - handle_success_response(std::move(data), endpoint); + auto msg = + std::make_unique<::threading::Tmsg>( + [](std::unique_ptr<::threading::Tmsg> + msg) { msg->data.self->fetch(msg->data.server); }, + shared_from_this(), + server); + + LOG_INFO_FMT( + "{} endorsements endpoint had too many requests. 
Retrying " + "in {}s", + endpoint, + retry_after_s); + + ::threading::ThreadMessaging::instance().add_task_after( + std::move(msg), std::chrono::milliseconds(retry_after_s * 1000)); + } + return; + }); + + // Start watchdog to send request on new server if it is unresponsive + auto msg = std::make_unique< + ::threading::Tmsg>( + [](std::unique_ptr<::threading::Tmsg> + msg) { + std::lock_guard guard(msg->data.self->lock); + if (msg->data.self->has_completed) + { return; } - - LOG_DEBUG_FMT( - "Error fetching endorsements for attestation report: {}", status); - if (status == HTTP_STATUS_TOO_MANY_REQUESTS) + if (msg->data.request_id >= msg->data.self->last_received_request_id) { - constexpr size_t default_retry_after_s = 3; - size_t retry_after_s = default_retry_after_s; - auto h = headers.find(http::headers::RETRY_AFTER); - if (h != headers.end()) + auto& servers = msg->data.self->config.servers; + // Should always contain at least one server, + // installed by ccf::pal::make_endorsement_endpoint_configuration() + if (servers.empty()) { - const auto& retry_after_value = h->second; - // If value is invalid, retry_after_s is unchanged - std::from_chars( - retry_after_value.data(), - retry_after_value.data() + retry_after_value.size(), - retry_after_s); + throw std::logic_error( + "No server specified to fetch endorsements"); } - auto msg = - std::make_unique<::threading::Tmsg>( - []( - std::unique_ptr<::threading::Tmsg> - msg) { msg->data.self->fetch(msg->data.server); }, - shared_from_this(), - server); - - LOG_INFO_FMT( - "{} endorsements endpoint had too many requests. 
Retrying " - "in {}s", - endpoint, - retry_after_s); + msg->data.self->server_retries_count++; + if ( + msg->data.self->server_retries_count >= + max_retries_count(servers.front())) + { + if (servers.size() > 1) + { + // Move on to next server if we have passed max retries count + servers.pop_front(); + } + else + { + auto& server = servers.front(); + LOG_FAIL_FMT( + "Giving up retrying fetching attestation endorsements from " + "{} after {} attempts", + server.front().host, + server.front().max_retries_count); + throw ccf::pal::AttestationCollateralFetchingTimeout( + "Timed out fetching attestation endorsements from all " + "configured servers"); + return; + } + } - ::threading::ThreadMessaging::instance().add_task_after( - std::move(msg), std::chrono::milliseconds(retry_after_s * 1000)); + msg->data.self->fetch(servers.front()); } - return; }, - [endpoint](const std::string& error_msg) { - LOG_FAIL_FMT( - "TLS error when connecting to quote endorsements endpoint {}: {}", - endpoint, - error_msg); - }); - send_request(c, endpoint); + shared_from_this(), + endpoint, + last_received_request_id); + + ::threading::ThreadMessaging::instance().add_task_after( + std::move(msg), + std::chrono::milliseconds(server_connection_timeout_s * 1000)); + + LOG_INFO_FMT( + "Fetching endorsements for attestation report at {}", request->url); + + curl::CurlRequest::attach_to_multi_curl( + curl::CurlmLibuvContextSingleton::get_instance().curlm(), request); } public: From 2956c3872a8b1f3e181a61a76bb564c763c29914 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 7 Jul 2025 11:06:21 +0100 Subject: [PATCH 017/197] Add curl to public ccf linked libraryes --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index d123fe1867ff..6d629c490075 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -585,6 +585,7 @@ if(COMPILE_TARGET STREQUAL "snp") ccf_kv.snp nghttp2 ${CMAKE_THREAD_LIBS_INIT} + curl ) set_property(TARGET ccf.snp PROPERTY 
POSITION_INDEPENDENT_CODE ON) @@ -637,6 +638,7 @@ elseif(COMPILE_TARGET STREQUAL "virtual") ccf_kv.host nghttp2 ${CMAKE_THREAD_LIBS_INIT} + curl ) set_property(TARGET ccf.virtual PROPERTY POSITION_INDEPENDENT_CODE ON) From 709228f95dc1ac662270f2a02b9cf1fd80bc17d2 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 7 Jul 2025 12:48:54 +0100 Subject: [PATCH 018/197] fix cond --- src/enclave/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/enclave/main.cpp b/src/enclave/main.cpp index 8024691eb48e..804789533cef 100644 --- a/src/enclave/main.cpp +++ b/src/enclave/main.cpp @@ -78,7 +78,7 @@ extern "C" auto& curl_context = ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe(); - if (curl_context == nullptr) + if (curl_context != nullptr) { LOG_FAIL_FMT("Curl context singleton already initialized"); return CreateNodeStatus::InternalError; From 32d1361e4183c25a81c1fddcd03dbc2d9cff1693 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 7 Jul 2025 13:18:30 +0100 Subject: [PATCH 019/197] Initialise request --- src/node/quote_endorsements_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 5a492a5ca686..e1f01c0ce294 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -155,7 +155,7 @@ namespace ccf { auto endpoint = server.front(); - std::unique_ptr request; + auto request = std::make_unique(); // set curl get CHECK_CURL_EASY_SETOPT(request->get_easy_handle(), CURLOPT_HTTPGET, 1L); From f88d7b504e2a826411be67ff31d243d61a9b7031 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 7 Jul 2025 13:26:44 +0100 Subject: [PATCH 020/197] Fix handler --- src/node/quote_endorsements_client.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index e1f01c0ce294..510db25b77f7 100644 --- 
a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -195,12 +195,12 @@ namespace ccf last_received_request_id++; - if (request.response->status_code == HTTP_STATUS_OK) + if (response.status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " "{} bytes", - request.response->buffer.size()); + response.buffer.size()); handle_success_response(std::move(response.buffer), endpoint); return; From 4458c8b5bd9e11ae40fe0904d7874a17f7ffac8a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 7 Jul 2025 13:31:16 +0100 Subject: [PATCH 021/197] fiddle with pointers --- src/node/quote_endorsements_client.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 510db25b77f7..4effc84bf5ad 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -191,30 +191,30 @@ namespace ccf request->set_response_callback([this, server, endpoint]( curl::CurlRequest& request) { std::lock_guard guard(this->lock); - auto response = *request.response.release(); + auto* response = request.response.get(); last_received_request_id++; - if (response.status_code == HTTP_STATUS_OK) + if (response->status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " "{} bytes", - response.buffer.size()); + response->buffer.size()); - handle_success_response(std::move(response.buffer), endpoint); + handle_success_response(std::move(response->buffer), endpoint); return; } LOG_DEBUG_FMT( "Error fetching endorsements for attestation report: {}", - response.status_code); - if (response.status_code == HTTP_STATUS_TOO_MANY_REQUESTS) + response->status_code); + if (response->status_code == HTTP_STATUS_TOO_MANY_REQUESTS) { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; - auto h = 
response.headers.find(http::headers::RETRY_AFTER); - if (h != response.headers.end()) + auto h = response->headers.find(http::headers::RETRY_AFTER); + if (h != response->headers.end()) { const auto& retry_after_value = h->second; // If value is invalid, retry_after_s is unchanged From cdebe299b5b9de5c2e6e9771d6a9c0546d23324d Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 7 Jul 2025 13:45:11 +0100 Subject: [PATCH 022/197] Fix timeout --- src/http/curl.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index f58485dd08f1..86877ac6f2b9 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -413,8 +413,9 @@ namespace ccf::curl } else { - // If timeout is zero, this will trigger on the next uv loop iteration - uv_timer_start(&self->timeout_tracker, libuv_timeout_callback, 0, 0); + // If timeout is zero, this will trigger immediately + timeout_ms = std::max(timeout_ms, 1L); + uv_timer_start(&self->timeout_tracker, libuv_timeout_callback, timeout_ms, 0); } return 0; } From 4ea2bb7d7c50420fcd33ea3002ca52e6dc299d6f Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 7 Jul 2025 17:23:16 +0100 Subject: [PATCH 023/197] Maybe fix issue? --- src/node/quote_endorsements_client.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 4effc84bf5ad..146e2dbcf687 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -51,6 +51,7 @@ namespace ccf // requests in series, after receiving the response to each one or after a // long timeout. 
size_t last_received_request_id = 0; + size_t last_submitted_request_id = 0; bool has_completed = false; size_t server_retries_count = 0; @@ -153,6 +154,7 @@ namespace ccf void fetch(const Server& server) { + auto request_id = ++last_submitted_request_id; auto endpoint = server.front(); auto request = std::make_unique(); @@ -253,7 +255,7 @@ namespace ccf { return; } - if (msg->data.request_id >= msg->data.self->last_received_request_id) + if (msg->data.request_id >= msg->data.self->last_submitted_request_id) { auto& servers = msg->data.self->config.servers; // Should always contain at least one server, @@ -294,7 +296,7 @@ namespace ccf }, shared_from_this(), endpoint, - last_received_request_id); + request_id); ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), From 6214b6c3926009e7f46ab9bf01d6e1774e24e7b3 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 21 Jul 2025 09:52:40 +0100 Subject: [PATCH 024/197] refmt --- src/http/curl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index 86877ac6f2b9..f63eec5133d1 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -415,7 +415,8 @@ namespace ccf::curl { // If timeout is zero, this will trigger immediately timeout_ms = std::max(timeout_ms, 1L); - uv_timer_start(&self->timeout_tracker, libuv_timeout_callback, timeout_ms, 0); + uv_timer_start( + &self->timeout_tracker, libuv_timeout_callback, timeout_ms, 0); } return 0; } From fce77da35232a67e1e182ef17682d68757f0283d Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 21 Jul 2025 17:55:11 +0100 Subject: [PATCH 025/197] Update --- src/http/curl.h | 249 ++++++++++++++++----------- src/node/quote_endorsements_client.h | 5 +- 2 files changed, 147 insertions(+), 107 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index f63eec5133d1..1f2d6ec7851c 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ 
-104,42 +105,53 @@ namespace ccf::curl class RequestBody { - std::vector buffer_vec; - std::span buffer_span; + std::vector buffer; + std::span unsent; public: - RequestBody(std::vector& buffer) : buffer_vec(std::move(buffer)) + RequestBody(std::vector& buffer) : buffer(buffer) { - buffer_span = - std::span(buffer_vec.data(), buffer_vec.size()); + unsent = std::span(buffer.data(), buffer.size()); } - template - RequestBody(Jsonable jsonable) + RequestBody(std::vector&& buffer) : buffer(std::move(buffer)) { - auto json_str = nlohmann::json(jsonable).dump(); - buffer_vec = std::vector( + unsent = std::span(buffer.data(), buffer.size()); + } + + RequestBody(nlohmann::json json) + { + auto json_str = json.dump(); + buffer = std::vector( json_str.begin(), json_str.end()); // Convert to vector of bytes - buffer_span = - std::span(buffer_vec.data(), buffer_vec.size()); + unsent = std::span(buffer.data(), buffer.size()); } static size_t send_data( - char* ptr, size_t size, size_t nitems, void* userdata) + char* ptr, size_t size, size_t nitems, RequestBody* data) { - auto* data = static_cast(userdata); - auto bytes_to_copy = std::min(data->buffer_span.size(), size * nitems); - memcpy(ptr, data->buffer_span.data(), bytes_to_copy); - data->buffer_span = data->buffer_span.subspan(bytes_to_copy); + if (data == nullptr) + { + LOG_FAIL_FMT("send_data called with null userdata"); + return 0; + } + auto bytes_to_copy = std::min(data->unsent.size(), size * nitems); + memcpy(ptr, data->unsent.data(), bytes_to_copy); + data->unsent = data->unsent.subspan(bytes_to_copy); return bytes_to_copy; } void attach_to_curl(CURL* curl) { + if (curl == nullptr) + { + throw std::logic_error( + "Cannot attach request body to a null CURL handle"); + } CHECK_CURL_EASY_SETOPT(curl, CURLOPT_READDATA, this); CHECK_CURL_EASY_SETOPT(curl, CURLOPT_READFUNCTION, send_data); CHECK_CURL_EASY_SETOPT( - curl, CURLOPT_INFILESIZE, static_cast(buffer_span.size())); + curl, CURLOPT_INFILESIZE, 
static_cast(unsent.size())); } }; @@ -152,35 +164,46 @@ namespace ccf::curl long status_code = 0; static size_t write_response_chunk( - uint8_t* ptr, size_t size, size_t nmemb, void* userdata) + uint8_t* ptr, size_t size, size_t nmemb, Response* response) { - auto* data = static_cast(userdata); + if (response == nullptr) + { + LOG_FAIL_FMT( + "write_response_chunk called with a null response pointer"); + return 0; + } auto bytes_to_copy = size * nmemb; - data->buffer.insert(data->buffer.end(), ptr, ptr + bytes_to_copy); + response->buffer.insert(response->buffer.end(), ptr, ptr + bytes_to_copy); // Should probably set a maximum response size here return bytes_to_copy; } - static size_t append_header( + static size_t recv_header_line( char* buffer, size_t size, size_t nitems, Response* response) { - if (size != 1) + if (response == nullptr) { - LOG_FAIL_FMT( - "Unexpected value in curl HEADERFUNCTION callback: size = {}", size); + LOG_FAIL_FMT("recv_header_line called with a null response pointer"); return 0; } + auto bytes_to_read = size * nitems; + std::string_view header(buffer, bytes_to_read); - const std::string_view header = - ccf::nonstd::trim(std::string_view(buffer, nitems)); + // strip /r/n etc + header = ccf::nonstd::trim(header); - // Ignore HTTP status line, and empty line - if (!header.empty() && !header.starts_with("HTTP/1.1")) + // Ignore empty headers, and the http response line (e.g. 
"HTTP/1.1 200") + static const std::regex http_status_line_regex(R"(^HTTP\/[1-9]+.*)"); + if ( + !header.empty() && + !std::regex_match(std::string(header), http_status_line_regex)) { const auto [field, value] = ccf::nonstd::split_1(header, ": "); if (!value.empty()) { - response->headers[std::string(field)] = ccf::nonstd::trim(value); + std::string field_str(field); + nonstd::to_lower(field_str); + response->headers[field_str] = ccf::nonstd::trim(value); } else { @@ -188,39 +211,21 @@ namespace ccf::curl } } - return nitems * size; + return bytes_to_read; } void attach_to_curl(CURL* curl) { + if (curl == nullptr) + { + throw std::logic_error("Cannot attach response to a null CURL handle"); + } // Body CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, this); CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, write_response_chunk); // Headers CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERDATA, this); - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERFUNCTION, append_header); - } - }; - - // Use in conjunction with the iter_CURLM_CurlRequest function - // to force only requests with the corresponding CurlRequest private data - class CurlRequestCURLM - { - private: - CURLM* curl_multi; - - public: - CurlRequestCURLM(CURLM* curl_multi) : curl_multi(curl_multi) - { - if (curl_multi == nullptr) - { - throw std::runtime_error("CURLM handle cannot be null"); - } - } - - [[nodiscard]] CURLM* get() const - { - return curl_multi; + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERFUNCTION, recv_header_line); } }; @@ -287,76 +292,110 @@ namespace ccf::curl { return curl_handle; } + }; + + // non-owning wrapper around a CURLM handle which supports CurlRequest + class CurlRequestCURLM + { + private: + CURLM* curl_multi; + + CurlRequestCURLM(CURLM* curl_multi) : curl_multi(curl_multi) + { + if (curl_multi == nullptr) + { + throw std::runtime_error("CURLM handle cannot be null"); + } + } + + public: + [[nodiscard]] CURLM* get() const + { + return curl_multi; + } - static void 
attach_to_multi_curl( - const CurlRequestCURLM& curl_multi, std::unique_ptr& request) + void attach_curl_request(std::unique_ptr& request) { + if(request == nullptr) + { + throw std::logic_error("Cannot attach a null CurlRequest"); + } request->attach_to_curl(); CURL* curl_handle = request->curl_handle; CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_PRIVATE, request.release()); - CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi.get(), curl_handle); + CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi, curl_handle); } - }; - inline int iter_CURLM_CurlRequest(const CurlRequestCURLM& p) - { - int running_handles = 0; - CHECK_CURL_MULTI(curl_multi_perform, p.get(), &running_handles); + static CurlRequestCURLM create_unsafe(CURLM* curl_multi) + { + if(curl_multi == nullptr) + { + throw std::runtime_error("CURLM handle cannot be null"); + } + return {curl_multi}; + } - // handle all completed curl requests - int msgq = 0; - CURLMsg* msg = nullptr; - do + int perform_unsafe() { - msg = curl_multi_info_read(p.get(), &msgq); + int running_handles = 0; + CHECK_CURL_MULTI(curl_multi_perform, curl_multi, &running_handles); - if ((msg != nullptr) && msg->msg == CURLMSG_DONE) + // handle all completed curl requests + int msgq = 0; + CURLMsg* msg = nullptr; + do { - auto* easy = msg->easy_handle; - auto result = msg->data.result; - - LOG_TRACE_FMT( - "CURL request response handling with result: {} ({})", - result, - curl_easy_strerror(result)); - - // retrieve the request data and attach a lifetime to it - ccf::curl::CurlRequest* request = nullptr; - curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); - if (request == nullptr) - { - throw std::runtime_error( - "CURLMSG_DONE received with no associated request data"); - } - std::unique_ptr request_data_ptr(request); + msg = curl_multi_info_read(curl_multi, &msgq); - if (request->response != nullptr) + if ((msg != nullptr) && msg->msg == CURLMSG_DONE) { - CHECK_CURL_EASY_GETINFO( - easy, CURLINFO_RESPONSE_CODE, 
&request->response->status_code); - } + auto* easy = msg->easy_handle; + auto result = msg->data.result; + + LOG_TRACE_FMT( + "CURL request response handling with result: {} ({})", + result, + curl_easy_strerror(result)); + + // retrieve the request data and attach a lifetime to it + ccf::curl::CurlRequest* request = nullptr; + curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); + if (request == nullptr) + { + throw std::runtime_error( + "CURLMSG_DONE received with no associated request data"); + } + std::unique_ptr request_data_ptr(request); - // Clean up the easy handle and corresponding resources - curl_multi_remove_handle(p.get(), easy); - if (request->response_callback.has_value()) - { if (request->response != nullptr) { - request->response_callback.value()(*request); + CHECK_CURL_EASY_GETINFO( + easy, CURLINFO_RESPONSE_CODE, &request->response->status_code); + } + + // Clean up the easy handle and corresponding resources + curl_multi_remove_handle(curl_multi, easy); + if (request->response_callback.has_value()) + { + if (request->response != nullptr) + { + request->response_callback.value()(*request); + } } + // Handled by the destructor of CurlRequest + LOG_INFO_FMT( + "Finished handling CURLMSG: msg_nullptr: {}, remaining: {}", + msg != nullptr, + msgq); } - // Handled by the destructor of CurlRequest - LOG_INFO_FMT( - "Finished handling CURLMSG: msg_nullptr: {}, remaining: {}", - msg != nullptr, - msgq); - } - } while (msgq > 0); - return running_handles; - } + } while (msgq > 0); + return running_handles; + } + }; class CurlmLibuvContext { + private: uv_loop_t* loop; uv_timer_t timeout_tracker{}; // lifetime handler of curl_multi interface @@ -374,7 +413,7 @@ namespace ccf::curl public: void handle_request_messages() { - iter_CURLM_CurlRequest(curl_request_curlm); + curl_request_curlm.perform_unsafe(); } static void libuv_timeout_callback(uv_timer_t* handle) @@ -512,7 +551,7 @@ namespace ccf::curl CurlmLibuvContext(uv_loop_t* loop) : loop(loop), - 
curl_request_curlm(curl_multi) + curl_request_curlm(CurlRequestCURLM::create_unsafe(curl_multi)) { uv_timer_init(loop, &timeout_tracker); timeout_tracker.data = this; // Attach this instance to the timer @@ -536,7 +575,7 @@ namespace ccf::curl } // should this return a reference or a pointer? - [[nodiscard]] const CurlRequestCURLM& curlm() const + [[nodiscard]] CurlRequestCURLM& curlm() { return curl_request_curlm; } diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 134f9b7912a0..ccb2aacb7bac 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -302,8 +302,9 @@ namespace ccf LOG_INFO_FMT( "Fetching endorsements for attestation report at {}", request->url); - curl::CurlRequest::attach_to_multi_curl( - curl::CurlmLibuvContextSingleton::get_instance().curlm(), request); + curl::CurlmLibuvContextSingleton::get_instance_unsafe() + ->curlm() + .attach_curl_request(request); } public: From 58eb20cb798d06616938ec76f340fc7cc2f2dbcf Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 21 Jul 2025 17:56:51 +0100 Subject: [PATCH 026/197] fmt --- src/http/curl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 1f2d6ec7851c..4a7c577094fd 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -316,7 +316,7 @@ namespace ccf::curl void attach_curl_request(std::unique_ptr& request) { - if(request == nullptr) + if (request == nullptr) { throw std::logic_error("Cannot attach a null CurlRequest"); } @@ -328,7 +328,7 @@ namespace ccf::curl static CurlRequestCURLM create_unsafe(CURLM* curl_multi) { - if(curl_multi == nullptr) + if (curl_multi == nullptr) { throw std::runtime_error("CURLM handle cannot be null"); } From 5b52e3d78db00448041227c0df076eb5192ca02e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 11:11:31 +0100 Subject: [PATCH 027/197] remove static_cast --- src/http/curl.h | 3 +-- 1 file changed, 1 insertion(+), 2 
deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 4a7c577094fd..e417101798b5 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -150,8 +150,7 @@ namespace ccf::curl } CHECK_CURL_EASY_SETOPT(curl, CURLOPT_READDATA, this); CHECK_CURL_EASY_SETOPT(curl, CURLOPT_READFUNCTION, send_data); - CHECK_CURL_EASY_SETOPT( - curl, CURLOPT_INFILESIZE, static_cast(unsent.size())); + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_INFILESIZE, unsent.size()); } }; From b876ccaa2f7a2460bd0ce6673ca3e3847366fc08 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 11:11:59 +0100 Subject: [PATCH 028/197] Fix url query --- src/node/quote_endorsements_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index ccb2aacb7bac..eb714671448b 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -162,7 +162,7 @@ namespace ccf CHECK_CURL_EASY_SETOPT(request->get_easy_handle(), CURLOPT_HTTPGET, 1L); request->url = fmt::format( - "{}://{}:{}/{}{}", + "{}://{}:{}{}{}", endpoint.tls ? 
"https" : "http", endpoint.host, endpoint.port, From 68aff99637fa0dafd018699d9561f39b2b65361a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 11:36:08 +0100 Subject: [PATCH 029/197] Add kickstart for curlm and document interaction between libuv and curlm --- src/http/curl.h | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index e417101798b5..7bd3a54ef216 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -394,6 +394,28 @@ namespace ccf::curl class CurlmLibuvContext { + /* Very high level: + * CURLM triggers timeout callback with some delay for libuv + * libuv calls the timeout callback which then triggers the curl socket + * action + * curl calls the socket callback to register the libuv polling + * libuv waits on the socket events and calls the socket poll callback + * socket poll callback triggers relevant libuv action + * etc. + * + * Example flow: + * + * Initially a CURL* is attached to the curl_multi CURLM* handle + * This calls the curl_multi's timeout function curl_timeout_callback with 0 + * delay + * which then registers the libuv timeout callback with 0 delay + * libuv_timeout_callback then registers a timeout socket_action with curl + * which then registers the socket polling at the libuv level + * + * At this point, either the relevant timeout will fire and call the + * relevant timeout callbacks, or the socket polling will trigger allowing + * data to be sent/received + */ private: uv_loop_t* loop; uv_timer_t timeout_tracker{}; @@ -571,6 +593,19 @@ namespace ccf::curl curl_multi, CURLMOPT_SOCKETFUNCTION, curl_socket_callback); + + LOG_INFO_FMT("Created CURLM libuv context"); + + // kickstart timeout, probably a no-op but allows curl to initialise + int running_handles = 0; + CHECK_CURL_MULTI( + curl_multi_socket_action, + curl_multi, + CURL_SOCKET_TIMEOUT, + 0, + &running_handles); + + LOG_INFO_FMT("Kickstarted CURLM libuv context"); } // should this return a 
reference or a pointer? From 934010f7f50d6f61f211ba864c032fd0b7bf79f5 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 15:21:22 +0100 Subject: [PATCH 030/197] Refactor interface to make checks more careful. --- src/http/curl.h | 119 ++++++++++++++++----------- src/node/quote_endorsements_client.h | 15 ++-- 2 files changed, 79 insertions(+), 55 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 7bd3a54ef216..5f33d63ab53e 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -91,12 +91,26 @@ namespace ccf::curl public: UniqueSlist() : p(nullptr, [](auto x) { curl_slist_free_all(x); }) {} + ~UniqueSlist() = default; + UniqueSlist(const UniqueSlist&) = delete; + UniqueSlist& operator=(const UniqueSlist&) = delete; + UniqueSlist(UniqueSlist&& other) noexcept : p(std::move(other.p)) {} + UniqueSlist& operator=(UniqueSlist&& other) noexcept + { + p = std::move(other.p); + return *this; + } void append(const char* str) { p.reset(curl_slist_append(p.release(), str)); } + void append(const std::string& key, const std::string& value) + { + append(fmt::format("{}: {}", key, value).c_str()); + } + [[nodiscard]] curl_slist* get() const { return p.get(); @@ -230,7 +244,7 @@ namespace ccf::curl class CurlRequest { - public: + private: UniqueCURL curl_handle; std::string url; std::unique_ptr request_body = nullptr; @@ -239,25 +253,44 @@ namespace ccf::curl std::optional> response_callback = std::nullopt; - void attach_to_curl() const + public: + void set_url(const std::string& new_url) { + if (new_url.empty()) + { + throw std::invalid_argument("URL cannot be empty"); + } + url = new_url; CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_URL, url.c_str()); - if (request_body != nullptr) + } + + void set_body(std::unique_ptr body) + { + if (body == nullptr) { - request_body->attach_to_curl(curl_handle); - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); + throw std::invalid_argument("Request body cannot be null"); } - if (response != nullptr) + 
request_body = std::move(body); + request_body->attach_to_curl(curl_handle); + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); + } + + void set_response_callback(std::function callback) + { + if (response != nullptr || response_callback.has_value()) { - response->attach_to_curl(curl_handle); + throw std::logic_error( + "Only one response callback can be set for a request."); } - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); + response_callback = std::move(callback); + response = std::make_unique(); + response->attach_to_curl(curl_handle); } - void set_url(const std::string& new_url) + void set_headers(UniqueSlist&& new_headers) { - url = new_url; - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_URL, url.c_str()); + headers = std::move(new_headers); + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); } void set_blob_opt(auto option, const uint8_t* data, size_t length) @@ -271,25 +304,31 @@ namespace ccf::curl CHECK_CURL_EASY_SETOPT(curl_handle, option, blob); } - void set_response_callback(std::function callback) + void handle_response() { - if (response != nullptr || response_callback.has_value()) + if (response_callback.has_value()) { - throw std::logic_error( - "Only one response callback can be set for a request."); + response_callback.value()(*this); } - response_callback = std::move(callback); - response = std::make_unique(); } - void set_header(const std::string& key, const std::string& value) + [[nodiscard]] CURL* get_easy_handle() const + { + return curl_handle; + } + + [[nodiscard]] std::string get_url() const { - headers.append(fmt::format("{}: {}", key, value).c_str()); + return url; + } + [[nodiscard]] const ccf::curl::UniqueSlist& get_headers() const + { + return headers; } - [[nodiscard]] CURL* get_easy_handle() const + [[nodiscard]] Response* get_response() const { - return curl_handle; + return response.get(); } }; @@ -319,8 +358,7 @@ namespace ccf::curl { throw std::logic_error("Cannot attach a 
null CurlRequest"); } - request->attach_to_curl(); - CURL* curl_handle = request->curl_handle; + CURL* curl_handle = request->get_easy_handle(); CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_PRIVATE, request.release()); CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi, curl_handle); } @@ -334,7 +372,7 @@ namespace ccf::curl return {curl_multi}; } - int perform_unsafe() + int perform() { int running_handles = 0; CHECK_CURL_MULTI(curl_multi_perform, curl_multi, &running_handles); @@ -351,11 +389,6 @@ namespace ccf::curl auto* easy = msg->easy_handle; auto result = msg->data.result; - LOG_TRACE_FMT( - "CURL request response handling with result: {} ({})", - result, - curl_easy_strerror(result)); - // retrieve the request data and attach a lifetime to it ccf::curl::CurlRequest* request = nullptr; curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); @@ -366,26 +399,18 @@ namespace ccf::curl } std::unique_ptr request_data_ptr(request); - if (request->response != nullptr) + if (request->get_response() != nullptr) { CHECK_CURL_EASY_GETINFO( - easy, CURLINFO_RESPONSE_CODE, &request->response->status_code); + easy, + CURLINFO_RESPONSE_CODE, + &request->get_response()->status_code); } - // Clean up the easy handle and corresponding resources + // detach the easy handle such that it can be cleaned up with the + // destructor of CurlRequest curl_multi_remove_handle(curl_multi, easy); - if (request->response_callback.has_value()) - { - if (request->response != nullptr) - { - request->response_callback.value()(*request); - } - } - // Handled by the destructor of CurlRequest - LOG_INFO_FMT( - "Finished handling CURLMSG: msg_nullptr: {}, remaining: {}", - msg != nullptr, - msgq); + request->handle_response(); } } while (msgq > 0); return running_handles; @@ -434,7 +459,7 @@ namespace ccf::curl public: void handle_request_messages() { - curl_request_curlm.perform_unsafe(); + curl_request_curlm.perform(); } static void libuv_timeout_callback(uv_timer_t* handle) @@ -594,8 +619,6 @@ 
namespace ccf::curl CURLMOPT_SOCKETFUNCTION, curl_socket_callback); - LOG_INFO_FMT("Created CURLM libuv context"); - // kickstart timeout, probably a no-op but allows curl to initialise int running_handles = 0; CHECK_CURL_MULTI( @@ -604,8 +627,6 @@ namespace ccf::curl CURL_SOCKET_TIMEOUT, 0, &running_handles); - - LOG_INFO_FMT("Kickstarted CURLM libuv context"); } // should this return a reference or a pointer? diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index eb714671448b..6b70361c916f 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -161,13 +161,13 @@ namespace ccf // set curl get CHECK_CURL_EASY_SETOPT(request->get_easy_handle(), CURLOPT_HTTPGET, 1L); - request->url = fmt::format( + request->set_url(fmt::format( "{}://{}:{}{}{}", endpoint.tls ? "https" : "http", endpoint.host, endpoint.port, endpoint.uri, - get_formatted_query(endpoint.params)); + get_formatted_query(endpoint.params))); if (endpoint.tls) { @@ -183,16 +183,18 @@ namespace ccf request->get_easy_handle(), CURLOPT_SSL_VERIFYSTATUS, 0L); } + auto headers = ccf::curl::UniqueSlist(); for (auto const& [k, v] : endpoint.headers) { - request->set_header(k, v); + headers.append(k, v); } - request->set_header(http::headers::HOST, endpoint.host); + headers.append(http::headers::HOST, endpoint.host); + request->set_headers(std::move(headers)); request->set_response_callback([this, server, endpoint]( curl::CurlRequest& request) { std::lock_guard guard(this->lock); - auto* response = request.response.get(); + auto* response = request.get_response(); if (response->status_code == HTTP_STATUS_OK) { @@ -300,7 +302,8 @@ namespace ccf std::chrono::milliseconds(server_connection_timeout_s * 1000)); LOG_INFO_FMT( - "Fetching endorsements for attestation report at {}", request->url); + "Fetching endorsements for attestation report at {}", + request->get_url()); curl::CurlmLibuvContextSingleton::get_instance_unsafe() ->curlm() 
From c84ba3f17ae450dcc747f46e8ab27677c9a89c91 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 15:47:43 +0100 Subject: [PATCH 031/197] move to a constructor pattern --- src/http/curl.h | 82 ++++++++++++++-------------- src/node/quote_endorsements_client.h | 31 ++++++----- 2 files changed, 58 insertions(+), 55 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 5f33d63ab53e..a9a309ce4e6f 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -62,6 +62,22 @@ namespace ccf::curl { return p.get(); } + + void set_blob_opt(auto option, const uint8_t* data, size_t length) + { + struct curl_blob blob + { + .data = const_cast(data), .len = length, + .flags = CURL_BLOB_COPY, + }; + + CHECK_CURL_EASY_SETOPT(p.get(), option, blob); + } + + void set_opt(auto option, auto value) + { + CHECK_CURL_EASY_SETOPT(p.get(), option, value); + } }; class UniqueCURLM @@ -247,61 +263,47 @@ namespace ccf::curl private: UniqueCURL curl_handle; std::string url; + ccf::curl::UniqueSlist headers; std::unique_ptr request_body = nullptr; std::unique_ptr response = nullptr; - ccf::curl::UniqueSlist headers; std::optional> response_callback = - std::nullopt; + nullptr; public: - void set_url(const std::string& new_url) - { - if (new_url.empty()) + CurlRequest( + UniqueCURL&& curl_handle_, + std::string&& url_, + UniqueSlist&& headers_, + std::unique_ptr&& request_body_, + std::optional>&& response_callback_) : + curl_handle(std::move(curl_handle_)), + url(std::move(url_)), + headers(std::move(headers_)), + request_body(std::move(request_body_)), + response_callback(std::move(response_callback_)) + { + if (url.empty()) { throw std::invalid_argument("URL cannot be empty"); } - url = new_url; CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_URL, url.c_str()); - } - void set_body(std::unique_ptr body) - { - if (body == nullptr) + if (request_body != nullptr) { - throw std::invalid_argument("Request body cannot be null"); + request_body->attach_to_curl(curl_handle); + 
CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); } - request_body = std::move(body); - request_body->attach_to_curl(curl_handle); - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); - } - void set_response_callback(std::function callback) - { - if (response != nullptr || response_callback.has_value()) + if (response_callback != std::nullopt) { - throw std::logic_error( - "Only one response callback can be set for a request."); + response = std::make_unique(); + response->attach_to_curl(curl_handle); } - response_callback = std::move(callback); - response = std::make_unique(); - response->attach_to_curl(curl_handle); - } - void set_headers(UniqueSlist&& new_headers) - { - headers = std::move(new_headers); - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); - } - - void set_blob_opt(auto option, const uint8_t* data, size_t length) - { - struct curl_blob blob + if (headers.get() != nullptr) { - .data = const_cast(data), .len = length, - .flags = CURL_BLOB_COPY, - }; - - CHECK_CURL_EASY_SETOPT(curl_handle, option, blob); + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); + } } void handle_response() @@ -321,10 +323,6 @@ namespace ccf::curl { return url; } - [[nodiscard]] const ccf::curl::UniqueSlist& get_headers() const - { - return headers; - } [[nodiscard]] Response* get_response() const { diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 6b70361c916f..5e6535bc762b 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -6,6 +6,8 @@ #include "enclave/rpc_sessions.h" #include "http/curl.h" +#include + namespace ccf { using QuoteEndorsementsFetchedCallback = @@ -156,18 +158,18 @@ namespace ccf auto request_id = ++last_submitted_request_id; auto endpoint = server.front(); - auto request = std::make_unique(); + curl::UniqueCURL curl_handle; // set curl get - CHECK_CURL_EASY_SETOPT(request->get_easy_handle(), CURLOPT_HTTPGET, 
1L); + curl_handle.set_opt(CURLOPT_HTTPGET, 1L); - request->set_url(fmt::format( + auto url = fmt::format( "{}://{}:{}{}{}", endpoint.tls ? "https" : "http", endpoint.host, endpoint.port, endpoint.uri, - get_formatted_query(endpoint.params))); + get_formatted_query(endpoint.params)); if (endpoint.tls) { @@ -175,12 +177,9 @@ namespace ccf // private data. If the server was malicious and the certificate chain // was bogus, the verification of the endorsement of the quote would // fail anyway. - CHECK_CURL_EASY_SETOPT( - request->get_easy_handle(), CURLOPT_SSL_VERIFYHOST, 0L); - CHECK_CURL_EASY_SETOPT( - request->get_easy_handle(), CURLOPT_SSL_VERIFYPEER, 0L); - CHECK_CURL_EASY_SETOPT( - request->get_easy_handle(), CURLOPT_SSL_VERIFYSTATUS, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); } auto headers = ccf::curl::UniqueSlist(); @@ -189,10 +188,9 @@ namespace ccf headers.append(k, v); } headers.append(http::headers::HOST, endpoint.host); - request->set_headers(std::move(headers)); - request->set_response_callback([this, server, endpoint]( - curl::CurlRequest& request) { + auto response_callback = ([this, server, endpoint]( + curl::CurlRequest& request) { std::lock_guard guard(this->lock); auto* response = request.get_response(); @@ -244,6 +242,13 @@ namespace ccf return; }); + auto request = std::make_unique( + std::move(curl_handle), + std::move(url), + std::move(headers), + nullptr, + std::move(response_callback)); + // Start watchdog to send request on new server if it is unresponsive auto msg = std::make_unique< ::threading::Tmsg>( From 594f53621583bcc455313454141c89b80bad3eab Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 15:54:08 +0100 Subject: [PATCH 032/197] Add missing nullptr check in curl_socket_callback --- src/http/curl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index 
a9a309ce4e6f..6e5f673b53f3 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -549,6 +549,11 @@ namespace ccf::curl CurlmLibuvContext* self, RequestContext* request_context) { + if (self == nullptr) + { + throw std::logic_error( + "curl_socket_callback called with null self pointer"); + } (void)easy; switch (action) { From 333c4274d20c299498f14a0dc07cb90ea771756c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 16:27:28 +0100 Subject: [PATCH 033/197] Update src/http/curl.h Co-authored-by: Eddy Ashton --- src/http/curl.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 6e5f673b53f3..3a7add9b5f71 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -641,24 +641,21 @@ namespace ccf::curl class CurlmLibuvContextSingleton { - static CurlmLibuvContext* curlm_libuv_context_instance; - public: static CurlmLibuvContext*& get_instance_unsafe() - { + { + static CurlmLibuvContext* curlm_libuv_context_instance = nullptr; return curlm_libuv_context_instance; } static CurlmLibuvContext& get_instance() { - if (curlm_libuv_context_instance == nullptr) + auto*& instance = get_instance_unsafe(); + if (instance == nullptr) { throw std::logic_error( "CurlmLibuvContextSingleton instance not initialized"); } - return *curlm_libuv_context_instance; + return *instance; } }; - - inline CurlmLibuvContext* - CurlmLibuvContextSingleton::curlm_libuv_context_instance = nullptr; } // namespace ccf::curl \ No newline at end of file From 12edb67aaeab761ebc041d82929f44a8e8f91ec7 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 23 Jul 2025 15:32:49 +0100 Subject: [PATCH 034/197] Add check and warn of duplicate headers in responses --- src/http/curl.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index 3a7add9b5f71..61d6fe29e6f5 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -232,6 +232,15 @@ namespace ccf::curl { std::string field_str(field); 
nonstd::to_lower(field_str); + if (response->headers.contains(field_str)) + { + auto current = response->headers[field_str]; + LOG_FAIL_FMT( + "Duplicate header for '{}', current = '{}', new = '{}'", + field_str, + current, + value); + } response->headers[field_str] = ccf::nonstd::trim(value); } else From 14d827b8721379847f4f08d9465588ec85da8b5b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 24 Jul 2025 11:48:19 +0100 Subject: [PATCH 035/197] Migrate fetch.h to new interface --- src/http/curl.h | 80 +++++++++--- src/snapshots/fetch.h | 275 +++++++++++++++--------------------------- 2 files changed, 158 insertions(+), 197 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 61d6fe29e6f5..7502d34ba8d7 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -4,11 +4,13 @@ #include "ccf/ds/logger.h" #include "ccf/ds/nonstd.h" +#include "ccf/rest_verb.h" #include #include #include #include +#include #include #include #include @@ -63,7 +65,7 @@ namespace ccf::curl return p.get(); } - void set_blob_opt(auto option, const uint8_t* data, size_t length) + void set_blob_opt(auto option, const auto* data, size_t length) { struct curl_blob blob { @@ -190,7 +192,6 @@ namespace ccf::curl std::vector buffer; using HeaderMap = std::unordered_map; HeaderMap headers; - long status_code = 0; static size_t write_response_chunk( uint8_t* ptr, size_t size, size_t nmemb, Response* response) @@ -271,21 +272,25 @@ namespace ccf::curl { private: UniqueCURL curl_handle; + RESTVerb method = HTTP_GET; std::string url; ccf::curl::UniqueSlist headers; std::unique_ptr request_body = nullptr; std::unique_ptr response = nullptr; - std::optional> response_callback = + std::optional> response_callback = nullptr; public: CurlRequest( UniqueCURL&& curl_handle_, + RESTVerb method_, std::string&& url_, UniqueSlist&& headers_, std::unique_ptr&& request_body_, - std::optional>&& response_callback_) : + std::optional>&& + response_callback_) : curl_handle(std::move(curl_handle_)), + 
method(method_), url(std::move(url_)), headers(std::move(headers_)), request_body(std::move(request_body_)), @@ -300,10 +305,9 @@ namespace ccf::curl if (request_body != nullptr) { request_body->attach_to_curl(curl_handle); - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); } - if (response_callback != std::nullopt) + if (response_callback.has_value()) { response = std::make_unique(); response->attach_to_curl(curl_handle); @@ -313,14 +317,61 @@ namespace ccf::curl { CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); } + + if (!method.get_http_method().has_value()) + { + throw std::logic_error( + fmt::format("Unsupported HTTP method: {}", method.c_str())); + } + switch (method.get_http_method().value()) + { + case HTTP_GET: + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPGET, 1L); + break; + case HTTP_HEAD: + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_NOBODY, 1L); + break; + case HTTP_PUT: + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); + break; + case HTTP_POST: + // libcurl sets the post verb when CURLOPT_POSTFIELDS is set, so we + // skip doing so here, and we assume that the user has already set + // these fields + break; + default: + throw std::logic_error( + fmt::format("Unsupported HTTP method: {}", method.c_str())); + } } void handle_response() { if (response_callback.has_value()) { - response_callback.value()(*this); + long status_code = 0; + CHECK_CURL_EASY_GETINFO( + curl_handle, CURLINFO_RESPONSE_CODE, &status_code); + response_callback.value()(*this, status_code); + } + } + + long syncronous_perform() + { + if (curl_handle == nullptr) + { + throw std::logic_error( + "Cannot curl_easy_perform on a null CURL handle"); } + + CHECK_CURL_EASY(curl_easy_perform, curl_handle); + + handle_response(); // handle the response callback if set + + long status_code = 0; + CHECK_CURL_EASY_GETINFO( + curl_handle, CURLINFO_RESPONSE_CODE, &status_code); + return status_code; } [[nodiscard]] CURL* get_easy_handle() const @@ 
-328,6 +379,11 @@ namespace ccf::curl return curl_handle; } + [[nodiscard]] RESTVerb get_method() const + { + return method; + } + [[nodiscard]] std::string get_url() const { return url; @@ -406,14 +462,6 @@ namespace ccf::curl } std::unique_ptr request_data_ptr(request); - if (request->get_response() != nullptr) - { - CHECK_CURL_EASY_GETINFO( - easy, - CURLINFO_RESPONSE_CODE, - &request->get_response()->status_code); - } - // detach the easy handle such that it can be cleaned up with the // destructor of CurlRequest curl_multi_remove_handle(curl_multi, easy); @@ -652,7 +700,7 @@ namespace ccf::curl { public: static CurlmLibuvContext*& get_instance_unsafe() - { + { static CurlmLibuvContext* curlm_libuv_context_instance = nullptr; return curlm_libuv_context_instance; } diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index 55b2e8aa1eb9..3eb5046538b8 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -10,152 +10,29 @@ #include #include +#include #include #include +#include #include #include -#define EXPECT_HTTP_RESPONSE_STATUS(request, response, expected) \ +#define EXPECT_HTTP_RESPONSE_STATUS(request, status_code, expected) \ do \ { \ - if (response.status_code != expected) \ + if (status_code != expected) \ { \ throw std::runtime_error(fmt::format( \ "Expected {} response from {} {}, instead received {}", \ ccf::http_status_str(expected), \ - request.method.c_str(), \ - request.url, \ - response.status_code)); \ + request.get_method().c_str(), \ + request.get_url(), \ + status_code)); \ } \ } while (0) namespace snapshots { - // Using curl 7.68.0, so missing niceties like curl_easy_header - - using HeaderMap = std::unordered_map; - size_t append_header(char* buffer, size_t size, size_t nitems, void* userdata) - { - HeaderMap& headers = *(HeaderMap*)userdata; - - if (size != 1) - { - LOG_FAIL_FMT( - "Unexpected value in curl HEADERFUNCTION callback: size = {}", size); - return 0; - } - - const std::string_view header = - 
ccf::nonstd::trim(std::string_view(buffer, nitems)); - - // Ignore HTTP status line, and empty line - if (!header.empty() && !header.starts_with("HTTP/1.1")) - { - const auto [field, value] = ccf::nonstd::split_1(header, ": "); - if (!value.empty()) - { - headers[std::string(field)] = ccf::nonstd::trim(value); - } - else - { - LOG_INFO_FMT("Ignoring invalid-looking HTTP Header '{}'", header); - } - } - - return nitems * size; - } - - using BodyHandler = std::function&)>; - size_t curl_write_callback( - char* ptr, size_t size, size_t nmemb, void* user_data) - { - BodyHandler& body_handler = *(BodyHandler*)user_data; - - if (size != 1) - { - LOG_FAIL_FMT( - "Unexpected value in curl WRITEFUNCTION callback: size = {}", size); - return 0; - } - - std::span data((const uint8_t*)ptr, size * nmemb); - - body_handler(data); - - return size * nmemb; - } - - struct SimpleHTTPRequest - { - ccf::RESTVerb method; - std::string url; - HeaderMap headers; - std::string ca_path; - BodyHandler body_handler = nullptr; - }; - - struct SimpleHTTPResponse - { - long status_code; - HeaderMap headers; - }; - static inline SimpleHTTPResponse make_curl_request( - const SimpleHTTPRequest& request) - { - ccf::curl::UniqueCURL curl; - - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_URL, request.url.c_str()); - if (request.method == HTTP_HEAD) - { - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_NOBODY, 1L); - } - else if (request.method == HTTP_GET) - { - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HTTPGET, 1L); - } - else - { - throw std::logic_error( - fmt::format("Unsupported HTTP method: {}", request.method.c_str())); - } - - SimpleHTTPResponse response; - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERDATA, &response.headers); - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERFUNCTION, append_header); - - curl_easy_setopt(curl, CURLOPT_CAINFO, request.ca_path.c_str()); - - ccf::curl::UniqueSlist list; - for (const auto& [k, v] : request.headers) - { - list.append(fmt::format("{}: {}", k, v).c_str()); - } - - 
CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HTTPHEADER, list.get()); - - if (request.body_handler != nullptr) - { - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, &request.body_handler); - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, curl_write_callback); - } - - LOG_TRACE_FMT( - "Sending curl request {} {}", request.method.c_str(), request.url); - - CHECK_CURL_EASY(curl_easy_perform, curl); - - CHECK_CURL_EASY_GETINFO( - curl, CURLINFO_RESPONSE_CODE, &response.status_code); - - LOG_TRACE_FMT( - "{} {} returned {}", - request.method.c_str(), - request.url, - response.status_code); - - return response; - } - struct SnapshotResponse { std::string snapshot_name; @@ -173,38 +50,48 @@ namespace snapshots // snapshot std::string snapshot_url; { - const auto initial_url = fmt::format( + ccf::curl::UniqueCURL curl_easy; + curl_easy.set_opt(CURLOPT_CAINFO, path_to_peer_cert.c_str()); + + auto initial_url = fmt::format( "https://{}/node/snapshot?since={}", peer_address, latest_local_snapshot); - SimpleHTTPRequest initial_request; - initial_request.method = HTTP_HEAD; - initial_request.url = initial_url; - initial_request.ca_path = path_to_peer_cert; + ccf::curl::UniqueSlist headers; - const auto initial_response = make_curl_request(initial_request); - if (initial_response.status_code == HTTP_STATUS_NOT_FOUND) + auto request = ccf::curl::CurlRequest( + std::move(curl_easy), + HTTP_HEAD, + std::move(initial_url), + std::move(headers), + nullptr, // No request body + true, // Expect a response + std::nullopt // No response callback + ); + + const auto status_code = request.syncronous_perform(); + if (status_code == HTTP_STATUS_NOT_FOUND) { LOG_INFO_FMT( "Peer has no snapshot newer than {}", latest_local_snapshot); return std::nullopt; } - else if (initial_response.status_code != HTTP_STATUS_PERMANENT_REDIRECT) + if (status_code != HTTP_STATUS_PERMANENT_REDIRECT) { EXPECT_HTTP_RESPONSE_STATUS( - initial_request, initial_response, HTTP_STATUS_PERMANENT_REDIRECT); + request, 
status_code, HTTP_STATUS_PERMANENT_REDIRECT); } - auto location_it = - initial_response.headers.find(ccf::http::headers::LOCATION); - if (location_it == initial_response.headers.end()) + auto* response = request.get_response(); + auto location_it = response->headers.find(ccf::http::headers::LOCATION); + if (location_it == response->headers.end()) { throw std::runtime_error(fmt::format( "Expected {} header in redirect response from {} {}, none found", ccf::http::headers::LOCATION, - initial_request.method.c_str(), - initial_request.url)); + request.get_method().c_str(), + request.get_url())); } LOG_TRACE_FMT("Snapshot fetch redirected to {}", location_it->second); @@ -214,28 +101,40 @@ namespace snapshots } // Make follow-up request to redirected URL, to fetch total content size - size_t content_size; + size_t content_size = 0; { - SimpleHTTPRequest snapshot_size_request; - snapshot_size_request.method = HTTP_HEAD; - snapshot_size_request.url = snapshot_url; - snapshot_size_request.ca_path = path_to_peer_cert; + ccf::curl::UniqueCURL curl_easy; + curl_easy.set_opt(CURLOPT_CAINFO, path_to_peer_cert.c_str()); + + ccf::curl::UniqueSlist headers; - const auto snapshot_size_response = - make_curl_request(snapshot_size_request); + ccf::curl::CurlRequest snapshot_size_request( + std::move(curl_easy), + HTTP_HEAD, + std::move(snapshot_url), + std::move(headers), + nullptr, // No request body + std::nullopt // No response callback + ); + + auto snapshot_size_status_code = + snapshot_size_request.syncronous_perform(); EXPECT_HTTP_RESPONSE_STATUS( - snapshot_size_request, snapshot_size_response, HTTP_STATUS_OK); + snapshot_size_request, snapshot_size_status_code, HTTP_STATUS_OK); + + auto* snapshot_size_response = snapshot_size_request.get_response(); - auto content_size_it = snapshot_size_response.headers.find( + auto content_size_it = snapshot_size_response->headers.find( ccf::http::headers::CONTENT_LENGTH); - if (content_size_it == snapshot_size_response.headers.end()) + + 
if (content_size_it == snapshot_size_response->headers.end()) { throw std::runtime_error(fmt::format( - "Expected {} header in redirect response from {} {}, none found", + "Expected {} header in response from {} {}, none found", ccf::http::headers::CONTENT_LENGTH, - snapshot_size_request.method.c_str(), - snapshot_size_request.url)); + snapshot_size_request.get_method().c_str(), + snapshot_size_request.get_url())); } const auto& content_size_s = content_size_it->second; @@ -246,16 +145,16 @@ namespace snapshots if (ec != std::errc()) { throw std::runtime_error(fmt::format( - "Invalid {} header in redirect response from {} {}: {}", + "Failed to parse {} header in response from {} {}: {}", ccf::http::headers::CONTENT_LENGTH, - snapshot_size_request.method.c_str(), - snapshot_size_request.url, + snapshot_size_request.get_method().c_str(), + snapshot_size_request.get_url(), ec)); } } // Fetch 4MB chunks at a time - constexpr size_t range_size = 4 * 1024 * 1024; + constexpr size_t range_size = 4L * 1024 * 1024; LOG_TRACE_FMT( "Preparing to fetch {}-byte snapshot from peer, {} bytes per-request", content_size, @@ -269,28 +168,42 @@ namespace snapshots while (true) { - SimpleHTTPRequest snapshot_range_request; - snapshot_range_request.method = HTTP_GET; - snapshot_range_request.url = snapshot_url; - snapshot_range_request.headers["range"] = - fmt::format("bytes={}-{}", range_start, range_end); - snapshot_range_request.ca_path = path_to_peer_cert; - - snapshot_range_request.body_handler = [&](const auto& data) { - LOG_TRACE_FMT( - "Copying {} bytes into snapshot, starting at {}", - range_size, - range_start); - memcpy(snapshot.data() + range_start, data.data(), data.size()); - range_start += data.size(); - }; - - const auto range_response = make_curl_request(snapshot_range_request); - + ccf::curl::UniqueCURL curl_easy; + curl_easy.set_opt(CURLOPT_CAINFO, path_to_peer_cert.c_str()); + + ccf::curl::UniqueSlist headers; + headers.append( + "Range", fmt::format("bytes={}-{}", 
range_start, range_end)); + + ccf::curl::CurlRequest snapshot_range_request( + std::move(curl_easy), + HTTP_GET, + std::move(snapshot_url), + std::move(headers), + nullptr, // No request body + nullptr // No response callback + ); + + auto snapshot_range_status_code = + snapshot_range_request.syncronous_perform(); EXPECT_HTTP_RESPONSE_STATUS( snapshot_range_request, - range_response, - HTTP_STATUS_PARTIAL_CONTENT); + snapshot_range_status_code, + HTTP_STATUS_PARTIAL_CONTENT) + + LOG_TRACE_FMT( + "Received {}-byte chunk from {}: {} bytes", + range_end - range_start, + snapshot_range_request.get_url(), + snapshot_range_status_code); + + auto* snapshot_range_response = snapshot_range_request.get_response(); + // This is an extra copy which would be good to avoid, but avoiding it + // with the current response interface is very messy... + memcpy( + snapshot.data() + range_start, + snapshot_range_response->buffer.data(), + snapshot_range_response->buffer.size()); if (range_end == content_size) { From 6c44deb98fff204f9b686c383d496e21bcea60b7 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 24 Jul 2025 14:38:23 +0100 Subject: [PATCH 036/197] fix --- src/node/quote_endorsements_client.h | 10 ++++++---- src/snapshots/fetch.h | 3 +-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 5e6535bc762b..fd9a44eca6f7 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -190,11 +190,12 @@ namespace ccf headers.append(http::headers::HOST, endpoint.host); auto response_callback = ([this, server, endpoint]( - curl::CurlRequest& request) { + curl::CurlRequest& request, + long status_code) { std::lock_guard guard(this->lock); auto* response = request.get_response(); - if (response->status_code == HTTP_STATUS_OK) + if (status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " @@ -207,8 +208,8 @@ 
namespace ccf LOG_DEBUG_FMT( "Error fetching endorsements for attestation report: {}", - response->status_code); - if (response->status_code == HTTP_STATUS_TOO_MANY_REQUESTS) + status_code); + if (status_code == HTTP_STATUS_TOO_MANY_REQUESTS) { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; @@ -244,6 +245,7 @@ namespace ccf auto request = std::make_unique( std::move(curl_handle), + HTTP_GET, std::move(url), std::move(headers), nullptr, diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index 3eb5046538b8..c1367044ada7 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -66,7 +66,6 @@ namespace snapshots std::move(initial_url), std::move(headers), nullptr, // No request body - true, // Expect a response std::nullopt // No response callback ); @@ -189,7 +188,7 @@ namespace snapshots EXPECT_HTTP_RESPONSE_STATUS( snapshot_range_request, snapshot_range_status_code, - HTTP_STATUS_PARTIAL_CONTENT) + HTTP_STATUS_PARTIAL_CONTENT); LOG_TRACE_FMT( "Received {}-byte chunk from {}: {} bytes", From 1fbd0150c65e0f85d6497072abe7cff5a6cfb171 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 16:06:58 +0100 Subject: [PATCH 037/197] Pass through config bits for self-heal-open --- include/ccf/node/startup_config.h | 2 ++ include/ccf/service/tables/self_heal_open.h | 39 +++++++++++++++++++++ src/common/configuration.h | 9 +++-- src/host/configuration.h | 5 ++- src/host/main.cpp | 2 ++ 5 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 include/ccf/service/tables/self_heal_open.h diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h index bd2684459d85..7dcfd9055194 100644 --- a/include/ccf/node/startup_config.h +++ b/include/ccf/node/startup_config.h @@ -146,6 +146,8 @@ namespace ccf std::nullopt; std::optional previous_sealed_ledger_secret_location = std::nullopt; + std::optional> + self_heal_open_addresses = std::nullopt; }; Recover recover = {}; }; diff --git 
a/include/ccf/service/tables/self_heal_open.h b/include/ccf/service/tables/self_heal_open.h new file mode 100644 index 000000000000..5920c5206001 --- /dev/null +++ b/include/ccf/service/tables/self_heal_open.h @@ -0,0 +1,39 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. +#pragma once + +#include "ccf/ds/json.h" +#include "ccf/ds/quote_info.h" +#include "ccf/service/map.h" +#include "node/identity.h" + +using IntrinsicIdentifier = std::string; + +struct SelfHealOpenNodeInfo +{ + ccf::QuoteInfo quote_info; + std::string published_network_address; + std::vector cert_der; + IntrinsicIdentifier intrinsic_id; +}; + +DECLARE_JSON_TYPE(SelfHealOpenNodeInfo); +DECLARE_JSON_REQUIRED_FIELDS( + SelfHealOpenNodeInfo, quote_info, published_network_address, cert_der); + +namespace ccf +{ + using SelfHealOpenNodeState = ServiceMap; + using SelfHealOpenGossipState = ServiceMap; + using SelfHealOpenChosenReplica = ServiceValue; + using SelfHealOpenVotes = ServiceSet; + + namespace Tables + { + static constexpr auto SELF_HEAL_OPEN_NODES = "public:ccf.gov.selfhealopen.nodes"; + static constexpr auto SELF_HEAL_OPEN_GOSSIP_STATE = "public:ccf.gov.selfhealopen.gossip"; + static constexpr auto SELF_HEAL_OPEN_CHOSEN_REPLICA = + "public:ccf.gov.selfhealopen.chosen_replica"; + static constexpr auto SELF_HEAL_OPEN_VOTES = "public:ccf.gov.selfhealopen.votes"; + } +} diff --git a/src/common/configuration.h b/src/common/configuration.h index 27fa738198ff..ccfc7bd4faca 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -5,6 +5,7 @@ #include "ccf/crypto/curve.h" #include "ccf/crypto/pem.h" +#include "ccf/ds/json.h" #include "ccf/ds/logger.h" #include "ccf/ds/unit_strings.h" #include "ccf/node/startup_config.h" @@ -128,11 +129,13 @@ namespace ccf service_cert, follow_redirect); - DECLARE_JSON_TYPE(StartupConfig::Recover); + DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(StartupConfig::Recover); 
DECLARE_JSON_REQUIRED_FIELDS( + StartupConfig::Recover, previous_service_identity); + DECLARE_JSON_OPTIONAL_FIELDS( StartupConfig::Recover, - previous_service_identity, - previous_sealed_ledger_secret_location); + previous_sealed_ledger_secret_location, + self_heal_open_addresses); DECLARE_JSON_TYPE_WITH_BASE(StartupConfig, CCFConfig); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/host/configuration.h b/src/host/configuration.h index 1995c63d5a1a..d336f358bc2b 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -141,6 +141,8 @@ namespace host std::string previous_service_identity_file; std::optional previous_sealed_ledger_secret_location = std::nullopt; + std::optional> self_heal_open_addresses = + std::nullopt; bool operator==(const Recover&) const = default; }; Recover recover = {}; @@ -195,7 +197,8 @@ namespace host CCHostConfig::Command::Recover, initial_service_certificate_validity_days, previous_service_identity_file, - previous_sealed_ledger_secret_location); + previous_sealed_ledger_secret_location, + self_heal_open_addresses); DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCHostConfig::Command); DECLARE_JSON_REQUIRED_FIELDS(CCHostConfig::Command, type); diff --git a/src/host/main.cpp b/src/host/main.cpp index afe17b77fdab..635589d478c8 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -825,6 +825,8 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) startup_config.recover.previous_sealed_ledger_secret_location = config.command.recover.previous_sealed_ledger_secret_location; } + startup_config.recover.self_heal_open_addresses = + config.command.recover.self_heal_open_addresses; } else { From c790f4ddad4250e454304f8572cde7feac847250 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 22 Jul 2025 17:02:37 +0100 Subject: [PATCH 038/197] Update test infra to test self-healing-open --- tests/config.jinja | 9 +++- tests/e2e_operations.py | 38 +++++++++++++++ tests/infra/network.py | 102 
+++++++++++++++++++++++++++++++++------- tests/infra/remote.py | 2 + 4 files changed, 132 insertions(+), 19 deletions(-) diff --git a/tests/config.jinja b/tests/config.jinja index 08ae37370f8d..0930b7bc5405 100644 --- a/tests/config.jinja +++ b/tests/config.jinja @@ -56,10 +56,15 @@ }, "recover": { "initial_service_certificate_validity_days": {{ initial_service_cert_validity_days }}, - {% if previous_sealed_ledger_secret_location %} + "previous_service_identity_file": "{{ previous_service_identity_file }}" {% if previous_sealed_ledger_secret_location %}, "previous_sealed_ledger_secret_location": "{{ previous_sealed_ledger_secret_location }}", + {% endif %} {% if self_heal_open_addresses %}, + "self_heal_open_addresses" : [ + {% for address in self_heal_open_addresses %} + "{{ address }}" {% if not loop.last %},{% endif %} + {% endfor %} + ] {% endif %} - "previous_service_identity_file": "{{ previous_service_identity_file }}" } }, "ledger": diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index e779888e09a5..96ad5730d2c6 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1437,6 +1437,44 @@ def run(self, src_dir, dst_dir): recovery_network.stop_all_nodes() prev_network = recovery_network +def run_self_healing_open(args): + args.nodes = infra.e2e_args.min_nodes() + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + args.perf_nodes, + ) as network: + LOG.info("Start a network and stop it") + network.start_and_open(args) + old_common = infra.network.get_common_folder_name(args.workspace, args.label) + network.stop_all_nodes() + + ledger_dirs = {} + committed_ledger_dirs = {} + for i, node in enumerate(network.nodes): + l, c = node.get_ledger() + ledger_dirs[i] = l + committed_ledger_dirs[i] = c + + LOG.info("Start a recovery network and stop it") + recovered_network = infra.network.Network( + args.nodes, + args.binary_dir, + args.debug_nodes, + args.perf_nodes, + existing_network=network, + ) + 
args.previous_service_identity_file = os.path.join( + old_common, "service_cert.pem" + ) + recovered_network.start_in_auto_dr( + args, + ledger_dirs=ledger_dirs, + committed_ledger_dirs=committed_ledger_dirs, + common_dir=network.common_dir, + ) + recovered_network.stop_all_nodes() def run_read_ledger_on_testdata(args): for testdata_dir in os.scandir(args.historical_testdata): diff --git a/tests/infra/network.py b/tests/infra/network.py index 39fc48797cfb..142e128ddb7b 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -435,6 +435,7 @@ def _start_all_nodes( self, args, recovery=False, + self_heal_open=False, ledger_dir=None, read_only_ledger_dirs=None, snapshots_dir=None, @@ -456,12 +457,16 @@ def _start_all_nodes( for arg in infra.network.Network.node_args_to_forward } + self_heal_open_addresses = [ + node.get_public_rpc_address() for node in self.nodes + ] + for i, node in enumerate(self.nodes): forwarded_args_with_overrides = forwarded_args.copy() forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) try: - if i == 0: - if not recovery: + if i == 0 or self_heal_open: + if not (recovery or self_heal_open): node.start( lib_name=args.package, workspace=args.workspace, @@ -473,17 +478,19 @@ def _start_all_nodes( **kwargs, ) else: - node.recover( - lib_name=args.package, - workspace=args.workspace, - label=args.label, - common_dir=self.common_dir, - ledger_dir=ledger_dir, - read_only_ledger_dirs=read_only_ledger_dirs, - snapshots_dir=snapshots_dir, - **forwarded_args_with_overrides, - **kwargs, - ) + node_kwargs = { + "lib_name": args.package, + "workspace": args.workspace, + "label": args.label, + "common_dir": self.common_dir, + "ledger_dir": ledger_dir, + "read_only_ledger_dirs": read_only_ledger_dirs, + "snapshots_dir": snapshots_dir, + } + self_heal_open_kwargs = {"self_heal_open_addresses": self_heal_open_addresses} + # If a kwarg is passed in override automatically set variants + node_kwargs = node_kwargs | 
self_heal_open_kwargs | forwarded_args_with_overrides | kwargs + node.recover(**node_kwargs) self.wait_for_state( node, infra.node.State.PART_OF_PUBLIC_NETWORK.value, @@ -758,6 +765,62 @@ def start_in_recovery( self.wait_for_all_nodes_to_commit(primary=primary, timeout=20) LOG.success("All nodes joined public network") + def start_in_self_healing_open( + self, + args, + ledger_dirs, + committed_ledger_dirs= None, + snapshot_dirs= None, + common_dir=None, + set_authenticate_session=None, + **kwargs, + ): + self.common_dir = common_dir or get_common_folder_name( + args.workspace, args.label + ) + + self.per_node_args_override = self.per_node_args_override or {i: {} for i in range(len(self.nodes))} + committed_ledger_dirs = committed_ledger_dirs or {i: None for i in range(len(self.nodes))} + snapshot_dirs = snapshot_dirs or {i: None for i in range(len(self.nodes))} + self.per_node_args_override = { + i: + (d | { + "ledger_dir" : ledger_dirs[i], + "read_only_ledger_dirs" : committed_ledger_dirs[i] or [], + "snapshots_dir" : snapshot_dirs[i] or None, + }) + for i, d in self.per_node_args_override.items() + } + + + for i, node in enumerate(self.nodes): + node.host.get_primary_interface().port = 5000 + (i + 1) + node.host.get_primary_interface().public_port = 5000 + (i + 1) + + LOG.info(f"Set up nodes") + for node in self.nodes: + LOG.info(node.host) + + primary = self._start_all_nodes( + args, + recovery=True, + self_heal_open=True, + **kwargs, + ) + + if set_authenticate_session is not None: + self.consortium.set_authenticate_session(set_authenticate_session) + + for node in self.get_joined_nodes(): + self.wait_for_state( + node, + infra.node.State.PART_OF_PUBLIC_NETWORK.value, + timeout=args.ledger_recovery_timeout, + ) + # Catch-up in recovery can take a long time, so extend this timeout + self.wait_for_all_nodes_to_commit(primary=primary, timeout=20) + LOG.success("All nodes joined public network") + def recover( self, args, @@ -1207,24 +1270,29 @@ def 
get_live_nodes(self): def get_f(self): return infra.e2e_args.max_f(self.args, len(self.nodes)) - def wait_for_state(self, node, state, timeout=3): + def wait_for_states(self, node, states, timeout=3): end_time = time.time() + timeout + final_state = None while time.time() < end_time: try: with node.client(connection_timeout=timeout) as c: r = c.get("/node/state").body.json() - if r["state"] == state: + if r["state"] in states: + final_state = r["state"] break except ConnectionRefusedError: pass time.sleep(0.1) else: raise TimeoutError( - f"Timed out waiting for state {state} on node {node.node_id}" + f"Timed out waiting for a state in {states} on node {node.node_id}" ) - if state == infra.node.State.PART_OF_NETWORK.value: + if final_state == infra.node.State.PART_OF_NETWORK.value: self.status = ServiceStatus.OPEN + def wait_for_state(self, node, state, timeout=3): + self.wait_for_states(node, [state], timeout=timeout) + def _wait_for_app_open(self, node, timeout=3): end_time = time.time() + timeout logs = [] diff --git a/tests/infra/remote.py b/tests/infra/remote.py index 4f7e85a1242d..e579b6741f4a 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -319,6 +319,7 @@ def __init__( cose_signatures_subject="ledger.signature", sealed_ledger_secret_location=None, previous_sealed_ledger_secret_location=None, + self_heal_open_addresses=None, **kwargs, ): """ @@ -537,6 +538,7 @@ def __init__( historical_cache_soft_limit=historical_cache_soft_limit, cose_signatures_issuer=cose_signatures_issuer, cose_signatures_subject=cose_signatures_subject, + self_heal_open_addresses=self_heal_open_addresses, **auto_dr_args, **kwargs, ) From b153e53ab93ffcd80eb0fe9588777d86b575a157 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 28 Jul 2025 16:10:36 +0100 Subject: [PATCH 039/197] Fix undefined request body and multi-threaded access to curl --- src/http/curl.h | 56 +++++++++++++++++++--------- src/node/quote_endorsements_client.h | 12 +++--- src/snapshots/fetch.h | 54 
++++++++++++++++++++++++--- 3 files changed, 95 insertions(+), 27 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 7502d34ba8d7..3a9b7837edc8 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -65,15 +66,26 @@ namespace ccf::curl return p.get(); } - void set_blob_opt(auto option, const auto* data, size_t length) + void set_blob_opt(auto option, const uint8_t* data, size_t length) { + if (data == nullptr || length == 0) + { + throw std::invalid_argument( + "Data pointer cannot be null or length zero"); + } + + if (p == nullptr) + { + throw std::logic_error("Cannot set option on a null CURL handle"); + } + struct curl_blob blob { .data = const_cast(data), .len = length, .flags = CURL_BLOB_COPY, }; - CHECK_CURL_EASY_SETOPT(p.get(), option, blob); + CHECK_CURL_EASY_SETOPT(p.get(), option, &blob); } void set_opt(auto option, auto value) @@ -270,6 +282,10 @@ namespace ccf::curl class CurlRequest { + public: + using ResponseCallback = std::function; + private: UniqueCURL curl_handle; RESTVerb method = HTTP_GET; @@ -277,8 +293,7 @@ namespace ccf::curl ccf::curl::UniqueSlist headers; std::unique_ptr request_body = nullptr; std::unique_ptr response = nullptr; - std::optional> response_callback = - nullptr; + std::optional response_callback = nullptr; public: CurlRequest( @@ -287,8 +302,7 @@ namespace ccf::curl std::string&& url_, UniqueSlist&& headers_, std::unique_ptr&& request_body_, - std::optional>&& - response_callback_) : + std::optional&& response_callback_) : curl_handle(std::move(curl_handle_)), method(method_), url(std::move(url_)), @@ -345,18 +359,18 @@ namespace ccf::curl } } - void handle_response() + void handle_response(CURLcode curl_response_code) { if (response_callback.has_value()) { long status_code = 0; CHECK_CURL_EASY_GETINFO( curl_handle, CURLINFO_RESPONSE_CODE, &status_code); - response_callback.value()(*this, status_code); + 
response_callback.value()(*this, curl_response_code, status_code); } } - long syncronous_perform() + void synchronous_perform(CURLcode& curl_code, long& status_code) { if (curl_handle == nullptr) { @@ -364,14 +378,12 @@ namespace ccf::curl "Cannot curl_easy_perform on a null CURL handle"); } - CHECK_CURL_EASY(curl_easy_perform, curl_handle); + curl_code = curl_easy_perform(curl_handle); - handle_response(); // handle the response callback if set + handle_response(curl_code); // handle the response callback if set - long status_code = 0; CHECK_CURL_EASY_GETINFO( curl_handle, CURLINFO_RESPONSE_CODE, &status_code); - return status_code; } [[nodiscard]] CURL* get_easy_handle() const @@ -465,7 +477,7 @@ namespace ccf::curl // detach the easy handle such that it can be cleaned up with the // destructor of CurlRequest curl_multi_remove_handle(curl_multi, easy); - request->handle_response(); + request->handle_response(result); } } while (msgq > 0); return running_handles; @@ -503,6 +515,13 @@ namespace ccf::curl UniqueCURLM curl_multi; // utility class to enforce type safety on accesses to curl_multi CurlRequestCURLM curl_request_curlm; + // We need a lock to prevent a client thread calling curl_multi_add_handle + // while the libuv thread is processing a curl callback + // + // Note that since the a client callback can call curl_multi_add_handle, but + // that will be difficult/impossible to detect, we need curlm_lock to be + // recursive. + std::recursive_mutex curlm_lock; struct RequestContext { @@ -525,6 +544,7 @@ namespace ccf::curl throw std::logic_error( "libuv_timeout_callback called with null self pointer"); } + std::lock_guard lock(self->curlm_lock); int running_handles = 0; CHECK_CURL_MULTI( @@ -584,6 +604,7 @@ namespace ccf::curl throw std::logic_error( "libuv_socket_poll_callback called with null self pointer"); } + std::lock_guard lock(self->curlm_lock); int action = 0; action |= ((events & UV_READABLE) != 0) ? 
CURL_CSELECT_IN : 0; @@ -689,11 +710,12 @@ namespace ccf::curl &running_handles); } - // should this return a reference or a pointer? - [[nodiscard]] CurlRequestCURLM& curlm() + void attach_request(std::unique_ptr& request) { - return curl_request_curlm; + std::lock_guard lock(curlm_lock); + curl_request_curlm.attach_curl_request(request); } + }; class CurlmLibuvContextSingleton diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index fd9a44eca6f7..73d52a50995a 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -191,11 +191,12 @@ namespace ccf auto response_callback = ([this, server, endpoint]( curl::CurlRequest& request, + CURLcode curl_response, long status_code) { std::lock_guard guard(this->lock); auto* response = request.get_response(); - if (status_code == HTTP_STATUS_OK) + if (curl_response == CURLE_OK && status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " @@ -207,9 +208,11 @@ namespace ccf } LOG_DEBUG_FMT( - "Error fetching endorsements for attestation report: {}", + "Error fetching endorsements for attestation report: {} ({}) {}", + curl_easy_strerror(curl_response), + curl_response, status_code); - if (status_code == HTTP_STATUS_TOO_MANY_REQUESTS) + if (curl_response == CURLE_OK && status_code == HTTP_STATUS_TOO_MANY_REQUESTS) { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; @@ -313,8 +316,7 @@ namespace ccf request->get_url()); curl::CurlmLibuvContextSingleton::get_instance_unsafe() - ->curlm() - .attach_curl_request(request); + ->attach_request(request); } public: diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index c1367044ada7..c9c431ed41a6 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -69,7 +69,17 @@ namespace snapshots std::nullopt // No response callback ); - const auto status_code = request.syncronous_perform(); + long status_code = 0; 
+ CURLcode curl_response = CURLE_OK; + request.synchronous_perform(curl_response, status_code); + if (curl_response != CURLE_OK) + { + throw std::runtime_error(fmt::format( + "Error fetching snapshot redirect from {}: {} ({})", + request.get_url(), + curl_easy_strerror(curl_response), + status_code)); + } if (status_code == HTTP_STATUS_NOT_FOUND) { LOG_INFO_FMT( @@ -116,8 +126,19 @@ namespace snapshots std::nullopt // No response callback ); - auto snapshot_size_status_code = - snapshot_size_request.syncronous_perform(); + CURLcode snapshot_size_curl_code = CURLE_OK; + long snapshot_size_status_code = 0; + snapshot_size_request.synchronous_perform( + snapshot_size_curl_code, snapshot_size_status_code); + + if (snapshot_size_curl_code != CURLE_OK) + { + throw std::runtime_error(fmt::format( + "Error fetching snapshot size from {}: {} ({})", + snapshot_size_request.get_url(), + curl_easy_strerror(snapshot_size_curl_code), + snapshot_size_status_code)); + } EXPECT_HTTP_RESPONSE_STATUS( snapshot_size_request, snapshot_size_status_code, HTTP_STATUS_OK); @@ -174,6 +195,19 @@ namespace snapshots headers.append( "Range", fmt::format("bytes={}-{}", range_start, range_end)); + auto response_callback = []( + ccf::curl::CurlRequest& request, + CURLcode curl_response_code, + long status_code) { + if (curl_response_code != CURLE_OK) + { + throw std::runtime_error(fmt::format( + "Error fetching snapshot chunk: {} ({})", + curl_easy_strerror(curl_response_code), + status_code)); + } + }; + ccf::curl::CurlRequest snapshot_range_request( std::move(curl_easy), HTTP_GET, @@ -183,8 +217,18 @@ namespace snapshots nullptr // No response callback ); - auto snapshot_range_status_code = - snapshot_range_request.syncronous_perform(); + CURLcode curl_response = CURLE_OK; + long snapshot_range_status_code = 0; + snapshot_range_request.synchronous_perform( + curl_response, snapshot_range_status_code); + if (curl_response != CURLE_OK) + { + throw std::runtime_error(fmt::format( + "Error fetching 
snapshot chunk range from {}: {} ({})", + snapshot_range_request.get_url(), + curl_easy_strerror(curl_response), + snapshot_range_status_code)); + } EXPECT_HTTP_RESPONSE_STATUS( snapshot_range_request, snapshot_range_status_code, From fee3559e725af6713805ef396f4066a221497a41 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 28 Jul 2025 19:05:01 +0100 Subject: [PATCH 040/197] Runnable checkpoint --- doc/host_config_schema/cchost_config.json | 7 + include/ccf/node/startup_config.h | 2 +- include/ccf/service/tables/self_heal_open.h | 52 +++- src/common/configuration.h | 2 +- src/host/configuration.h | 4 +- src/host/main.cpp | 4 +- src/node/node_state.h | 189 +++++++++++++++ src/node/rpc/node_frontend.h | 251 ++++++++++++++++++++ src/node/rpc/node_interface.h | 1 + src/node/rpc/node_operation.h | 5 + src/node/rpc/node_operation_interface.h | 2 + src/node/rpc/test/node_stub.h | 5 + src/node/self_healing_open.h | 116 +++++++++ src/service/network_tables.h | 13 + tests/config.jinja | 6 +- tests/e2e_operations.py | 8 +- tests/infra/network.py | 14 +- tests/infra/remote.py | 4 +- 18 files changed, 651 insertions(+), 34 deletions(-) create mode 100644 src/node/self_healing_open.h diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index db118e65e277..49e1c242dfa0 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -441,6 +441,13 @@ "previous_sealed_ledger_secret_location": { "type": ["string"], "description": "Path to the sealed ledger secret folder, the ledger secrets for the recovered service will be unsealed from here instead of reconstructed from recovery shares." 
+ }, + "self_healing_open_addresses": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of addresses (host:port) of the cluster that should open via self-healing-open" } }, "required": ["previous_service_identity_file"], diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h index 7dcfd9055194..0f370ac37f14 100644 --- a/include/ccf/node/startup_config.h +++ b/include/ccf/node/startup_config.h @@ -147,7 +147,7 @@ namespace ccf std::optional previous_sealed_ledger_secret_location = std::nullopt; std::optional> - self_heal_open_addresses = std::nullopt; + self_healing_open_addresses = std::nullopt; }; Recover recover = {}; }; diff --git a/include/ccf/service/tables/self_heal_open.h b/include/ccf/service/tables/self_heal_open.h index 5920c5206001..86b4b56941d8 100644 --- a/include/ccf/service/tables/self_heal_open.h +++ b/include/ccf/service/tables/self_heal_open.h @@ -6,10 +6,11 @@ #include "ccf/ds/quote_info.h" #include "ccf/service/map.h" #include "node/identity.h" +#include "ccf/ds/enum_formatter.h" using IntrinsicIdentifier = std::string; -struct SelfHealOpenNodeInfo +struct SelfHealingOpenNodeInfo_t { ccf::QuoteInfo quote_info; std::string published_network_address; @@ -17,23 +18,50 @@ struct SelfHealOpenNodeInfo IntrinsicIdentifier intrinsic_id; }; -DECLARE_JSON_TYPE(SelfHealOpenNodeInfo); +DECLARE_JSON_TYPE(SelfHealingOpenNodeInfo_t); DECLARE_JSON_REQUIRED_FIELDS( - SelfHealOpenNodeInfo, quote_info, published_network_address, cert_der); + SelfHealingOpenNodeInfo_t, + quote_info, + published_network_address, + cert_der, + intrinsic_id); + +enum class SelfHealingOpenSM{ + GOSSIPPING = 0, + VOTING, + OPENING, // by chosen replica + JOINING, // by all other replicas + OPEN, +}; + +DECLARE_JSON_ENUM( + SelfHealingOpenSM, + {{SelfHealingOpenSM::GOSSIPPING, "Gossipping"}, + {SelfHealingOpenSM::VOTING, "Voting"}, + {SelfHealingOpenSM::OPENING, "Opening"}, + {SelfHealingOpenSM::JOINING, "Joining"}}); namespace 
ccf { - using SelfHealOpenNodeState = ServiceMap; - using SelfHealOpenGossipState = ServiceMap; - using SelfHealOpenChosenReplica = ServiceValue; - using SelfHealOpenVotes = ServiceSet; + using SelfHealingOpenNodeInfo = + ServiceMap; + using SelfHealingOpenGossipState = + ServiceMap; + using SelfHealingOpenChosenReplica = ServiceValue; + using SelfHealingOpenVotes = ServiceSet; + using SelfHealingOpenSMState = ServiceValue; namespace Tables { - static constexpr auto SELF_HEAL_OPEN_NODES = "public:ccf.gov.selfhealopen.nodes"; - static constexpr auto SELF_HEAL_OPEN_GOSSIP_STATE = "public:ccf.gov.selfhealopen.gossip"; - static constexpr auto SELF_HEAL_OPEN_CHOSEN_REPLICA = - "public:ccf.gov.selfhealopen.chosen_replica"; - static constexpr auto SELF_HEAL_OPEN_VOTES = "public:ccf.gov.selfhealopen.votes"; + static constexpr auto SELF_HEALING_OPEN_NODES = + "public:ccf.gov.selfhealingopen.nodes"; + static constexpr auto SELF_HEALING_OPEN_GOSSIP_STATE = + "public:ccf.gov.selfhealingopen.gossip"; + static constexpr auto SELF_HEALING_OPEN_CHOSEN_REPLICA = + "public:ccf.gov.selfhealingopen.chosen_replica"; + static constexpr auto SELF_HEALING_OPEN_VOTES = + "public:ccf.gov.selfhealingopen.votes"; + static constexpr auto SELF_HEALING_OPEN_SM_STATE = + "public:ccf.gov.selfhealingopen.sm_state"; } } diff --git a/src/common/configuration.h b/src/common/configuration.h index ccfc7bd4faca..6e9627cd9395 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -135,7 +135,7 @@ namespace ccf DECLARE_JSON_OPTIONAL_FIELDS( StartupConfig::Recover, previous_sealed_ledger_secret_location, - self_heal_open_addresses); + self_healing_open_addresses); DECLARE_JSON_TYPE_WITH_BASE(StartupConfig, CCFConfig); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/host/configuration.h b/src/host/configuration.h index d336f358bc2b..38e995116bba 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -141,7 +141,7 @@ namespace host std::string 
previous_service_identity_file; std::optional previous_sealed_ledger_secret_location = std::nullopt; - std::optional> self_heal_open_addresses = + std::optional> self_healing_open_addresses = std::nullopt; bool operator==(const Recover&) const = default; }; @@ -198,7 +198,7 @@ namespace host initial_service_certificate_validity_days, previous_service_identity_file, previous_sealed_ledger_secret_location, - self_heal_open_addresses); + self_healing_open_addresses); DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCHostConfig::Command); DECLARE_JSON_REQUIRED_FIELDS(CCHostConfig::Command, type); diff --git a/src/host/main.cpp b/src/host/main.cpp index 635589d478c8..6a299497fa68 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -825,8 +825,8 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) startup_config.recover.previous_sealed_ledger_secret_location = config.command.recover.previous_sealed_ledger_secret_location; } - startup_config.recover.self_heal_open_addresses = - config.command.recover.self_heal_open_addresses; + startup_config.recover.self_healing_open_addresses = + config.command.recover.self_healing_open_addresses; } else { diff --git a/src/node/node_state.h b/src/node/node_state.h index 8fe7581f19c0..7cd392320a10 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -8,6 +8,7 @@ #include "ccf/crypto/verifier.h" #include "ccf/ds/json.h" #include "ccf/ds/logger.h" +#include "ccf/ds/unit_strings.h" #include "ccf/js/core/context.h" #include "ccf/node/cose_signatures_config.h" #include "ccf/pal/attestation_sev_snp.h" @@ -18,6 +19,7 @@ #include "ccf/service/node_info_network.h" #include "ccf/service/reconfiguration_type.h" #include "ccf/service/tables/acme_certificates.h" +#include "ccf/service/tables/self_heal_open.h" #include "ccf/service/tables/service.h" #include "ccf/tx.h" #include "ccf_acme_client.h" @@ -27,6 +29,7 @@ #include "ds/ccf_assert.h" #include "ds/files.h" #include "ds/state_machine.h" +#include 
"ds/thread_messaging.h" #include "enclave/rpc_sessions.h" #include "encryptor.h" #include "history.h" @@ -41,6 +44,7 @@ #include "node/ledger_secrets.h" #include "node/local_sealing.h" #include "node/node_to_node_channel_manager.h" +#include "node/self_healing_open.h" #include "node/snapshotter.h" #include "node_to_node.h" #include "pal/quote_generation.h" @@ -1981,6 +1985,78 @@ namespace ccf return history->get_cose_signatures_config(); } + void self_healing_open_start_retry_timer() override + { + auto timer_msg = std::make_unique<::threading::Tmsg>( + [](std::unique_ptr<::threading::Tmsg> msg) { + std::lock_guard guard(msg->data.self.lock); + // Keep doing this until the node is no longer in recovery + if (msg->data.self.sm.check(NodeStartupState::partOfNetwork)) + { + return; + } + + auto tx = msg->data.self.network.tables->create_read_only_tx(); + auto* sm_state_handle = + tx.ro(msg->data.self.network.self_healing_open_sm_state); + if (!sm_state_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open state not set, cannot retry " + "self-healing-open"); + } + auto sm_state = sm_state_handle->get().value(); + + switch (sm_state) + { + case SelfHealingOpenSM::GOSSIPPING: + msg->data.self.self_healing_open_gossip_unsafe(); + break; + case SelfHealingOpenSM::VOTING: + { + auto* node_info_handle = + tx.ro(msg->data.self.network.self_healing_open_node_info); + auto* chosen_replica_handle = + tx.ro(msg->data.self.network.self_healing_open_chosen_replica); + if (!chosen_replica_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open chosen node not set, cannot vote"); + } + auto chosen_node_info = + node_info_handle->get(chosen_replica_handle->get().value()); + if (!chosen_node_info.has_value()) + { + throw std::logic_error(fmt::format( + "Self-healing-open chosen node {} not found", + chosen_replica_handle->get().value())); + } + msg->data.self.self_healing_open_vote_unsafe( + chosen_node_info.value()); + // keep gossiping 
to allow lagging nodes to eventually vote + msg->data.self.self_healing_open_gossip_unsafe(); + break; + } + case SelfHealingOpenSM::OPENING: + msg->data.self.self_healing_open_iamopen_unsafe(); + break; + case SelfHealingOpenSM::JOINING: + return; + default: + throw std::logic_error(fmt::format( + "Unknown self-healing-open state: {}", + static_cast(sm_state))); + } + + auto delay = msg->data.self.config.join.retry_timeout; + ::threading::ThreadMessaging::instance().add_task_after( + std::move(msg), delay); + }, + *this); + ::threading::ThreadMessaging::instance().add_task_after( + std::move(timer_msg), ds::TimeString("0ms")); + } + private: bool is_ip(const std::string_view& hostname) { @@ -2915,6 +2991,119 @@ namespace ccf max_version); } + void self_healing_open_gossip_unsafe() + { + // Caller must ensure that the current node's quote_info is populated: + // ie not yet reached partOfNetwork + if (!config.recover.self_healing_open_addresses.has_value()) + { + LOG_TRACE_FMT( + "Self-healing-open addresses not set, cannot start gossip retries"); + return; + } + + LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); + + self_healing_open::GossipRequest request{ + .info = + self_healing_open::RequestNodeInfo{ + .quote_info = quote_info, + .published_network_address = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + .intrinsic_id = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + }, + // TODO fix: This isn't quite right, as it should be the highest txid + // with a signature,before the recovery txs + .txid = network.tables->current_version(), + }; + + for (auto& target_address : + config.recover.self_healing_open_addresses.value()) + { + self_healing_open::dispatch_authenticated_message( + std::move(request), + target_address, + "gossip", + self_signed_node_cert, + node_sign_kp->private_key_pem()); + } + } + + void self_healing_open_vote_unsafe(SelfHealingOpenNodeInfo_t& node_info) + { + // 
Caller must ensure that the current node's quote_info is populated: + // ie not yet reached partOfNetwork + LOG_TRACE_FMT( + "Sending self-healing-open vote to {} at {}", + node_info.intrinsic_id, + node_info.published_network_address); + + self_healing_open::VoteRequest request{ + .info = self_healing_open::RequestNodeInfo{ + .quote_info = quote_info, + .published_network_address = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + .intrinsic_id = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + }}; + + self_healing_open::dispatch_authenticated_message( + std::move(request), + node_info.published_network_address, + "vote", + self_signed_node_cert, + node_sign_kp->private_key_pem()); + } + + void self_healing_open_iamopen_unsafe() + { + // Caller must ensure that the current node's quote_info is populated: + // ie not yet reached partOfNetwork + if (!config.recover.self_healing_open_addresses.has_value()) + { + LOG_TRACE_FMT( + "Self-healing-open addresses not set, cannot send iamopen"); + return; + } + + LOG_TRACE_FMT("Sending self-healing-open iamopen"); + + self_healing_open::IAmOpenRequest request{ + .info = self_healing_open::RequestNodeInfo{ + .quote_info = quote_info, + .published_network_address = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + .intrinsic_id = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + }}; + + for (auto& target_address : + config.recover.self_healing_open_addresses.value()) + { + if ( + target_address == + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address) + { + // Don't send to self + continue; + } + self_healing_open::dispatch_authenticated_message( + std::move(request), + target_address, + "iamopen", + self_signed_node_cert, + node_sign_kp->private_key_pem()); + } + } + public: void set_n2n_message_limit(size_t message_limit) { diff --git 
a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 7fb994352ce1..94954c9b6669 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -12,6 +12,7 @@ #include "ccf/pal/attestation.h" #include "ccf/pal/mem.h" #include "ccf/service/reconfiguration_type.h" +#include "ccf/service/tables/self_heal_open.h" #include "ccf/version.h" #include "crypto/certs.h" #include "crypto/csr.h" @@ -22,12 +23,15 @@ #include "node/rpc/jwt_management.h" #include "node/rpc/no_create_tx_claims_digest.cpp" #include "node/rpc/serialization.h" +#include "node/self_healing_open.h" #include "node/session_metrics.h" #include "node_interface.h" #include "service/internal_tables_access.h" #include "service/tables/previous_service_identity.h" #include "snapshots/filenames.h" +#include + namespace ccf { struct Quote @@ -414,6 +418,68 @@ namespace ccf } } + std::optional> + self_healing_open_validate_and_store_node_info( + endpoints::EndpointContext& args, + ccf::kv::Tx& tx, + const self_healing_open::RequestNodeInfo& in) + { + auto cert_der = ccf::crypto::public_key_der_from_cert( + args.rpc_ctx->get_session_context()->caller_cert); + + pal::PlatformAttestationMeasurement measurement; + QuoteVerificationResult verify_result = this->node_operation.verify_quote( + args.tx, in.quote_info, cert_der, measurement); + if (verify_result != QuoteVerificationResult::Verified) + { + const auto [code, message] = quote_verification_error(verify_result); + LOG_FAIL_FMT( + "Self-healing-open gossip from intrinsic id {} is invalid: {} ({})", + in.intrinsic_id, + code, + message); + return std::make_tuple(code, message); + } + + LOG_TRACE_FMT( + "Self-healing-open gossip from intrinsic id {}'s quote is valid", + in.intrinsic_id); + + // Validating that we haven't heard from this node before, of if we have + // that the cert hasn't changed + auto* node_info_handle = tx.rw(this->network.self_healing_open_node_info); + auto existing_node_info = 
node_info_handle->get(in.intrinsic_id); + + if (existing_node_info.has_value()) + { + // If we have seen this node before, check that the cert is the same + if (existing_node_info->cert_der != cert_der) + { + LOG_FAIL_FMT( + "Self-healing-open gossip from intrinsic id {} is invalid: " + "certificate has changed", + in.intrinsic_id); + return std::make_tuple( + HTTP_STATUS_BAD_REQUEST, + "Self-healing-open gossip from intrinsic id is invalid: " + "certificate has changed"); + } + } + else + { + SelfHealingOpenNodeInfo_t src_info{ + .quote_info = in.quote_info, + .published_network_address = in.published_network_address, + .cert_der = cert_der, + .intrinsic_id = in.intrinsic_id}; + node_info_handle->put(in.intrinsic_id, src_info); + } + + // TODO validate that this gossip is for the same network + + return std::nullopt; + }; + public: NodeEndpoints(NetworkState& network_, ccf::AbstractNodeContext& context_) : CommonEndpointRegistry(get_actor_prefix(ActorsType::nodes), context_), @@ -1657,6 +1723,17 @@ namespace ccf ctx.rpc_ctx->set_claims_digest(std::move(digest_value)); } + { + ccf::kv::Tx& tx = ctx.tx; + // TODO properly gate this + auto* self_healing_open_state_handle = + tx.rw( + Tables::SELF_HEALING_OPEN_SM_STATE); + self_healing_open_state_handle->put( + SelfHealingOpenSM::GOSSIPPING); + this->node_operation.self_healing_open_start_retry_timer(); + } + LOG_INFO_FMT("Created service"); return make_success(true); }; @@ -2192,6 +2269,180 @@ namespace ccf .set_forwarding_required(endpoints::ForwardingRequired::Never) .set_openapi_hidden(true) .install(); + + auto self_healing_open_gossip = [this]( + auto& args, + const nlohmann::json& params) { + auto node_configuration_subsystem = + this->context.get_subsystem(); + if (!node_configuration_subsystem) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } + + auto in = params.get(); + + auto valid = 
self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (valid.has_value()) + { + auto [code, message] = valid.value(); + return make_error(code, ccf::errors::InvalidQuote, message); + } + + LOG_TRACE_FMT("Processing self-healing-open gossip RPC"); + LOG_TRACE_FMT("Self-healing-open gossip params: {}", params.dump()); + + auto chosen_replica = + args.tx.rw(this->network.self_healing_open_chosen_replica); + // This freezes the gossips at the point where it votes + if (chosen_replica->get().has_value()) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "This replica has already voted"); + } + + auto gossip_handle = args.tx.rw(this->network.self_healing_open_gossip); + if (gossip_handle->get(in.info.intrinsic_id).has_value()) + { + LOG_INFO_FMT("Node {} already gossiped", in.info.intrinsic_id); + return make_success( + fmt::format("Node {} already gossiped", in.info.intrinsic_id)); + } + gossip_handle->put(in.info.intrinsic_id, in.txid); + + // TODO properly configure this limit + if ( + gossip_handle->size() == + node_configuration_subsystem->get() + .node_config.recover.self_healing_open_addresses.value() + .size()) + { + // Use commit handle on this to trigger the thread message to vote and + // ensure no rollbacks are possible + std::optional> min_iid; + gossip_handle->foreach([&min_iid](const auto& iid, const auto& txid) { + if ( + !min_iid.has_value() || min_iid->second < txid || + (min_iid->second == txid && min_iid->first > iid)) + { + min_iid = std::make_pair(iid, txid); + } + return true; + }); + chosen_replica->put(min_iid->first); + + auto* sm_state_handle = + args.tx.rw(this->network.self_healing_open_sm_state); + sm_state_handle->put(SelfHealingOpenSM::VOTING); + } + + return make_success(fmt::format( + "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); + }; + make_endpoint( + "/self_healing_open/gossip", + HTTP_PUT, + json_adapter(self_healing_open_gossip), + 
no_auth_required) + .set_forwarding_required(endpoints::ForwardingRequired::Never) + .set_openapi_hidden(true) + .install(); + + auto self_healing_open_vote = + [this](auto& args, const nlohmann::json& params) { + auto node_configuration_subsystem = + this->context.get_subsystem(); + if (!node_configuration_subsystem) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } + + auto in = params.get(); + auto valid = self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (valid.has_value()) + { + auto [code, message] = valid.value(); + return make_error(code, ccf::errors::InvalidQuote, message); + } + LOG_TRACE_FMT("Processing self-healing-open vote RPC"); + LOG_TRACE_FMT("Self-healing-open vote params: {}", params.dump()); + + auto votes = args.tx.rw(this->network.self_healing_open_votes); + votes->insert(in.info.intrinsic_id); + + // if sufficient votes, then we can open the network + // TODO configure this limit + if ( + votes->size() >= + node_configuration_subsystem->get() + .node_config.recover.self_healing_open_addresses.value() + .size() / + 2 + + 1) + { + LOG_INFO_FMT("******************************"); + LOG_INFO_FMT( + "Self-healing-open suceeded we should open the network", + in.info.intrinsic_id); + LOG_INFO_FMT("******************************"); + + auto* sm_state_handle = + args.tx.rw(this->network.self_healing_open_sm_state); + sm_state_handle->put(SelfHealingOpenSM::OPENING); + + // TODO open the network + // have a utility function that kicks off the opening + } + return make_success(fmt::format( + "Node {} voted for self-healing-open", in.info.intrinsic_id)); + }; + make_endpoint( + "/self_healing_open/vote", + HTTP_PUT, + json_adapter(self_healing_open_vote), + no_auth_required) + .set_forwarding_required(endpoints::ForwardingRequired::Never) + .set_openapi_hidden(true) + .install(); + + auto self_healing_open_iamopen = + [this](auto& 
args, const nlohmann::json& params) { + LOG_TRACE_FMT("Processing self-healing-open iamopen RPC"); + + auto in = params.get(); + auto valid = self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (valid.has_value()) + { + auto [code, message] = valid.value(); + return make_error(code, ccf::errors::InvalidQuote, message); + } + + LOG_INFO_FMT("******************************"); + LOG_INFO_FMT("Self-healing-open is JOINING {}", in.info.intrinsic_id); + LOG_INFO_FMT("******************************"); + + return make_success(fmt::format( + "Node {} is joining self-healing-open", in.info.intrinsic_id)); + }; + make_endpoint( + "/self_healing_open/iamopen", + HTTP_PUT, + json_adapter(self_healing_open_iamopen), + no_auth_required) + .set_forwarding_required(endpoints::ForwardingRequired::Never) + .set_openapi_hidden(true) + .install(); } }; diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index becfa93fefbd..97de9df250be 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -64,6 +64,7 @@ namespace ccf virtual size_t get_jwt_attempts() = 0; virtual ccf::crypto::Pem get_self_signed_certificate() = 0; virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; + virtual void self_healing_open_start_retry_timer() = 0; virtual const ccf::StartupConfig& get_node_config() const = 0; virtual ccf::crypto::Pem get_network_cert() = 0; virtual void stop_notice() = 0; diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index ccd11843dd61..1023805ac2d4 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -109,5 +109,10 @@ namespace ccf { return impl.get_cose_signatures_config(); } + + void self_healing_open_start_retry_timer() override + { + impl.self_healing_open_start_retry_timer(); + } }; } \ No newline at end of file diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index 
3667f5438918..46a30bffcb00 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ -60,5 +60,7 @@ namespace ccf virtual ccf::crypto::Pem get_self_signed_node_certificate() = 0; virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; + + virtual void self_healing_open_start_retry_timer() = 0; }; } \ No newline at end of file diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index eae0d03fc54c..b4bcf32cfc4d 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -110,6 +110,11 @@ namespace ccf { return cose_signatures_config; } + + void self_healing_open_start_retry_timer() override + { + // No-op for stub + } }; class StubGovernanceEffects : public ccf::AbstractGovernanceEffects diff --git a/src/node/self_healing_open.h b/src/node/self_healing_open.h new file mode 100644 index 000000000000..aff8f960652f --- /dev/null +++ b/src/node/self_healing_open.h @@ -0,0 +1,116 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. 
+ +#pragma once + +#include "ccf/crypto/pem.h" +#include "ccf/ds/json.h" +#include "ccf/ds/quote_info.h" +#include "ccf/kv/version.h" +#include "ds/actors.h" +#include "http/curl.h" + +#include +#include +#include + +namespace ccf::self_healing_open +{ + struct RequestNodeInfo + { + QuoteInfo quote_info; + std::string published_network_address; + std::string intrinsic_id; + }; + DECLARE_JSON_TYPE(RequestNodeInfo); + DECLARE_JSON_REQUIRED_FIELDS( + RequestNodeInfo, quote_info, published_network_address, intrinsic_id); + + struct GossipRequest + { + RequestNodeInfo info; + ccf::kv::Version txid; + }; + DECLARE_JSON_TYPE(GossipRequest); + DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid, info); + + struct VoteRequest + { + RequestNodeInfo info; + }; + DECLARE_JSON_TYPE(VoteRequest); + DECLARE_JSON_REQUIRED_FIELDS(VoteRequest, info); + + struct IAmOpenRequest + { + RequestNodeInfo info; + }; + DECLARE_JSON_TYPE(IAmOpenRequest); + DECLARE_JSON_REQUIRED_FIELDS(IAmOpenRequest, info); + + inline void dispatch_authenticated_message( + nlohmann::json&& request, + const std::string& target_address, + const std::string& endpoint, + const crypto::Pem& self_signed_node_cert, + const crypto::Pem& privkey_pem) + { + curl::UniqueCURL curl_handle; + + // diable SSL verification as no private information is sent + curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); + + curl_handle.set_blob_opt( + CURLOPT_SSLCERT_BLOB, + self_signed_node_cert.data(), + self_signed_node_cert.size()); + curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); + + curl_handle.set_blob_opt( + CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); + + auto url = fmt::format( + "https://{}/{}/self_healing_open/{}", + target_address, + get_actor_prefix(ActorsType::nodes), + endpoint); + + curl::UniqueSlist headers; + headers.append("Content-Type", "application/json"); + + auto body = std::make_unique(request); 
+ + auto response_callback = []( + const ccf::curl::CurlRequest& request, + CURLcode curl_code, + long status_code) { + LOG_TRACE_FMT( + "Response received for {} to {}: curl_result {} ({}), status code {}", + request.get_method().c_str(), + request.get_url(), + curl_easy_strerror(curl_code), + curl_code, + status_code); + }; + + auto curl_request = std::make_unique( + std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + std::move(body), + std::move(response_callback)); + + LOG_TRACE_FMT( + "Dispatching attested message for {} to {}: {}", + curl_request->get_method().c_str(), + curl_request->get_url(), + request.dump()); + + curl::CurlmLibuvContextSingleton::get_instance().attach_request( + curl_request); + } + +} \ No newline at end of file diff --git a/src/service/network_tables.h b/src/service/network_tables.h index a52cd5dc2c72..0aac37dd78b3 100644 --- a/src/service/network_tables.h +++ b/src/service/network_tables.h @@ -16,6 +16,7 @@ #include "ccf/service/tables/modules.h" #include "ccf/service/tables/nodes.h" #include "ccf/service/tables/proposals.h" +#include "ccf/service/tables/self_heal_open.h" #include "ccf/service/tables/service.h" #include "ccf/service/tables/snp_measurements.h" #include "ccf/service/tables/tcb_verification.h" @@ -245,6 +246,18 @@ namespace ccf return std::make_tuple(signatures, serialise_tree); } + // Self-healing open tables + const SelfHealingOpenNodeInfo self_healing_open_node_info = { + Tables::SELF_HEALING_OPEN_NODES}; + const SelfHealingOpenGossipState self_healing_open_gossip = { + Tables::SELF_HEALING_OPEN_GOSSIP_STATE}; + const SelfHealingOpenChosenReplica self_healing_open_chosen_replica = { + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA}; + const SelfHealingOpenVotes self_healing_open_votes = { + Tables::SELF_HEALING_OPEN_VOTES}; + const SelfHealingOpenSMState self_healing_open_sm_state = { + Tables::SELF_HEALING_OPEN_SM_STATE}; + inline auto get_all_internal_tables() const { return std::tuple_cat( diff 
--git a/tests/config.jinja b/tests/config.jinja index 0930b7bc5405..9e7222909a88 100644 --- a/tests/config.jinja +++ b/tests/config.jinja @@ -58,9 +58,9 @@ "initial_service_certificate_validity_days": {{ initial_service_cert_validity_days }}, "previous_service_identity_file": "{{ previous_service_identity_file }}" {% if previous_sealed_ledger_secret_location %}, "previous_sealed_ledger_secret_location": "{{ previous_sealed_ledger_secret_location }}", - {% endif %} {% if self_heal_open_addresses %}, - "self_heal_open_addresses" : [ - {% for address in self_heal_open_addresses %} + {% endif %} {% if self_healing_open_addresses %}, + "self_healing_open_addresses" : [ + {% for address in self_healing_open_addresses %} "{{ address }}" {% if not loop.last %},{% endif %} {% endfor %} ] diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 96ad5730d2c6..7918449148e7 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1067,7 +1067,7 @@ def run_initial_uvm_descriptor_checks(args): with recovered_primary.client() as c: r = c.get("/node/network").body.json() recovery_seqno = int(r["current_service_create_txid"].split(".")[1]) - network.stop_all_nodes() + recovered_network.stop_all_nodes() ledger = ccf.ledger.Ledger( recovered_primary.remote.ledger_paths(), committed_only=False, @@ -1144,7 +1144,7 @@ def run_initial_tcb_version_checks(args): with recovered_primary.client() as c: r = c.get("/node/network").body.json() recovery_seqno = int(r["current_service_create_txid"].split(".")[1]) - network.stop_all_nodes() + recovered_network.stop_all_nodes() ledger = ccf.ledger.Ledger( recovered_primary.remote.ledger_paths(), committed_only=False, @@ -1438,7 +1438,7 @@ def run(self, src_dir, dst_dir): prev_network = recovery_network def run_self_healing_open(args): - args.nodes = infra.e2e_args.min_nodes() + args.nodes = infra.e2e_args.min_nodes(args, f=1) with infra.network.network( args.nodes, args.binary_dir, @@ -1468,7 +1468,7 @@ def 
run_self_healing_open(args): args.previous_service_identity_file = os.path.join( old_common, "service_cert.pem" ) - recovered_network.start_in_auto_dr( + recovered_network.start_in_self_healing_open( args, ledger_dirs=ledger_dirs, committed_ledger_dirs=committed_ledger_dirs, diff --git a/tests/infra/network.py b/tests/infra/network.py index 142e128ddb7b..16ac5bf75562 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -435,7 +435,7 @@ def _start_all_nodes( self, args, recovery=False, - self_heal_open=False, + self_healing_open=False, ledger_dir=None, read_only_ledger_dirs=None, snapshots_dir=None, @@ -457,7 +457,7 @@ def _start_all_nodes( for arg in infra.network.Network.node_args_to_forward } - self_heal_open_addresses = [ + self_healing_open_addresses = [ node.get_public_rpc_address() for node in self.nodes ] @@ -465,8 +465,8 @@ def _start_all_nodes( forwarded_args_with_overrides = forwarded_args.copy() forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) try: - if i == 0 or self_heal_open: - if not (recovery or self_heal_open): + if i == 0 or self_healing_open: + if not (recovery or self_healing_open): node.start( lib_name=args.package, workspace=args.workspace, @@ -487,9 +487,9 @@ def _start_all_nodes( "read_only_ledger_dirs": read_only_ledger_dirs, "snapshots_dir": snapshots_dir, } - self_heal_open_kwargs = {"self_heal_open_addresses": self_heal_open_addresses} + self_healing_open_kwargs = {"self_healing_open_addresses": self_healing_open_addresses} # If a kwarg is passed in override automatically set variants - node_kwargs = node_kwargs | self_heal_open_kwargs | forwarded_args_with_overrides | kwargs + node_kwargs = node_kwargs | self_healing_open_kwargs | forwarded_args_with_overrides | kwargs node.recover(**node_kwargs) self.wait_for_state( node, @@ -804,7 +804,7 @@ def start_in_self_healing_open( primary = self._start_all_nodes( args, recovery=True, - self_heal_open=True, + self_healing_open=True, **kwargs, ) diff 
--git a/tests/infra/remote.py b/tests/infra/remote.py index e579b6741f4a..aa49bd09e8fe 100644 --- a/tests/infra/remote.py +++ b/tests/infra/remote.py @@ -319,7 +319,7 @@ def __init__( cose_signatures_subject="ledger.signature", sealed_ledger_secret_location=None, previous_sealed_ledger_secret_location=None, - self_heal_open_addresses=None, + self_healing_open_addresses=None, **kwargs, ): """ @@ -538,7 +538,7 @@ def __init__( historical_cache_soft_limit=historical_cache_soft_limit, cose_signatures_issuer=cose_signatures_issuer, cose_signatures_subject=cose_signatures_subject, - self_heal_open_addresses=self_heal_open_addresses, + self_healing_open_addresses=self_healing_open_addresses, **auto_dr_args, **kwargs, ) From dc6a7ee52710f12a167fef92ba67834c70ac3511 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 29 Jul 2025 15:45:07 +0100 Subject: [PATCH 041/197] Config changes --- doc/host_config_schema/cchost_config.json | 10 ++++++++++ include/ccf/node/startup_config.h | 2 ++ src/common/configuration.h | 4 +++- src/host/configuration.h | 6 +++++- src/host/main.cpp | 4 ++++ 5 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index 49e1c242dfa0..8df938bdb087 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -448,6 +448,16 @@ "type": "string" }, "description": "List of addresses (host:port) of the cluster that should open via self-healing-open" + }, + "self_healing_open_retry_timeout": { + "type": "string", + "default": "100ms", + "description": "Interval (time string) at which the node re-sends self-healing-open messages. 
This should be leass than 'self_healing_open_timeout'" + }, + "self_healing_open_timeout": { + "type": "string", + "default": "2000ms", + "description": "Interval (time string) after which the node forcibly advances to the next phase of the self-healing-open protocol" } }, "required": ["previous_service_identity_file"], diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h index 0f370ac37f14..29199bfb433b 100644 --- a/include/ccf/node/startup_config.h +++ b/include/ccf/node/startup_config.h @@ -148,6 +148,8 @@ namespace ccf std::nullopt; std::optional> self_healing_open_addresses = std::nullopt; + ccf::ds::TimeString self_healing_open_retry_timeout = {"100ms"}; + ccf::ds::TimeString self_healing_open_timeout = {"2000ms"}; }; Recover recover = {}; }; diff --git a/src/common/configuration.h b/src/common/configuration.h index 6e9627cd9395..ffeed98aa994 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -135,7 +135,9 @@ namespace ccf DECLARE_JSON_OPTIONAL_FIELDS( StartupConfig::Recover, previous_sealed_ledger_secret_location, - self_healing_open_addresses); + self_healing_open_addresses, + self_healing_open_retry_timeout, + self_healing_open_timeout); DECLARE_JSON_TYPE_WITH_BASE(StartupConfig, CCFConfig); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/host/configuration.h b/src/host/configuration.h index 38e995116bba..9ffdcc0603d5 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -143,6 +143,8 @@ namespace host std::nullopt; std::optional> self_healing_open_addresses = std::nullopt; + ccf::ds::TimeString self_healing_open_retry_timeout = {"100ms"}; + ccf::ds::TimeString self_healing_open_timeout = {"2000ms"}; bool operator==(const Recover&) const = default; }; Recover recover = {}; @@ -198,7 +200,9 @@ namespace host initial_service_certificate_validity_days, previous_service_identity_file, previous_sealed_ledger_secret_location, - self_healing_open_addresses); + self_healing_open_addresses, + 
self_healing_open_retry_timeout, + self_healing_open_timeout); DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCHostConfig::Command); DECLARE_JSON_REQUIRED_FIELDS(CCHostConfig::Command, type); diff --git a/src/host/main.cpp b/src/host/main.cpp index 6a299497fa68..1e4e73c32a06 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -827,6 +827,10 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) } startup_config.recover.self_healing_open_addresses = config.command.recover.self_healing_open_addresses; + startup_config.recover.self_healing_open_retry_timeout = + config.command.recover.self_healing_open_retry_timeout; + startup_config.recover.self_healing_open_timeout = + config.command.recover.self_healing_open_timeout; } else { From 20581806d42e2eca86f5c75fcf140198fe4e8415 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 29 Jul 2025 15:46:41 +0100 Subject: [PATCH 042/197] Add timeouts --- include/ccf/service/tables/self_heal_open.h | 9 +- src/node/node_state.h | 185 +++++++++++++- src/node/rpc/node_frontend.h | 256 ++++++++++++-------- src/node/rpc/node_interface.h | 4 +- src/node/rpc/node_operation.h | 10 +- src/node/rpc/node_operation_interface.h | 5 +- src/node/rpc/test/node_stub.h | 8 +- src/node/self_healing_open.h | 1 + src/service/network_tables.h | 4 +- 9 files changed, 365 insertions(+), 117 deletions(-) diff --git a/include/ccf/service/tables/self_heal_open.h b/include/ccf/service/tables/self_heal_open.h index 86b4b56941d8..af421f12b076 100644 --- a/include/ccf/service/tables/self_heal_open.h +++ b/include/ccf/service/tables/self_heal_open.h @@ -2,11 +2,11 @@ // Licensed under the Apache 2.0 License. 
#pragma once +#include "ccf/ds/enum_formatter.h" #include "ccf/ds/json.h" #include "ccf/ds/quote_info.h" #include "ccf/service/map.h" #include "node/identity.h" -#include "ccf/ds/enum_formatter.h" using IntrinsicIdentifier = std::string; @@ -26,7 +26,8 @@ DECLARE_JSON_REQUIRED_FIELDS( cert_der, intrinsic_id); -enum class SelfHealingOpenSM{ +enum class SelfHealingOpenSM +{ GOSSIPPING = 0, VOTING, OPENING, // by chosen replica @@ -45,7 +46,7 @@ namespace ccf { using SelfHealingOpenNodeInfo = ServiceMap; - using SelfHealingOpenGossipState = + using SelfHealingOpenGossips = ServiceMap; using SelfHealingOpenChosenReplica = ServiceValue; using SelfHealingOpenVotes = ServiceSet; @@ -55,7 +56,7 @@ namespace ccf { static constexpr auto SELF_HEALING_OPEN_NODES = "public:ccf.gov.selfhealingopen.nodes"; - static constexpr auto SELF_HEALING_OPEN_GOSSIP_STATE = + static constexpr auto SELF_HEALING_OPEN_GOSSIPS = "public:ccf.gov.selfhealingopen.gossip"; static constexpr auto SELF_HEALING_OPEN_CHOSEN_REPLICA = "public:ccf.gov.selfhealingopen.chosen_replica"; diff --git a/src/node/node_state.h b/src/node/node_state.h index 7cd392320a10..7a1c5d83f7d9 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -10,7 +10,9 @@ #include "ccf/ds/logger.h" #include "ccf/ds/unit_strings.h" #include "ccf/js/core/context.h" +#include "ccf/json_handler.h" #include "ccf/node/cose_signatures_config.h" +#include "ccf/odata_error.h" #include "ccf/pal/attestation_sev_snp.h" #include "ccf/pal/locking.h" #include "ccf/pal/platform.h" @@ -21,6 +23,7 @@ #include "ccf/service/tables/acme_certificates.h" #include "ccf/service/tables/self_heal_open.h" #include "ccf/service/tables/service.h" +#include "ccf/threading/thread_ids.h" #include "ccf/tx.h" #include "ccf_acme_client.h" #include "consensus/aft/raft.h" @@ -33,6 +36,7 @@ #include "enclave/rpc_sessions.h" #include "encryptor.h" #include "history.h" +#include "http/curl.h" #include "http/http_parser.h" #include "indexing/indexer.h" #include 
"js/global_class_ids.h" @@ -1985,16 +1989,24 @@ namespace ccf return history->get_cose_signatures_config(); } - void self_healing_open_start_retry_timer() override + void self_healing_open_try_start_timers( + ccf::kv::Tx& tx, bool recovering) override { - auto timer_msg = std::make_unique<::threading::Tmsg>( + if ( + !recovering || !config.recover.self_healing_open_addresses.has_value()) + { + LOG_TRACE_FMT( + "Not recovering, or no self-healing-open addresses configured, " + "not starting self-healing-open timers"); + return; + } + + auto* state_handle = tx.rw(network.self_healing_open_sm_state); + state_handle->put(SelfHealingOpenSM::GOSSIPPING); + + auto retry_timer_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { std::lock_guard guard(msg->data.self.lock); - // Keep doing this until the node is no longer in recovery - if (msg->data.self.sm.check(NodeStartupState::partOfNetwork)) - { - return; - } auto tx = msg->data.self.network.tables->create_read_only_tx(); auto* sm_state_handle = @@ -2007,6 +2019,14 @@ namespace ccf } auto sm_state = sm_state_handle->get().value(); + // Keep doing this until the node is no longer in recovery + if ( + msg->data.self.sm.check(NodeStartupState::partOfNetwork) || + sm_state == SelfHealingOpenSM::OPEN) + { + return; + } + switch (sm_state) { case SelfHealingOpenSM::GOSSIPPING: @@ -2048,13 +2068,158 @@ namespace ccf static_cast(sm_state))); } - auto delay = msg->data.self.config.join.retry_timeout; + auto delay = + msg->data.self.config.recover.self_healing_open_retry_timeout; + ::threading::ThreadMessaging::instance().add_task_after( + std::move(msg), delay); + }, + *this); + // kick this off asynchronously as this can be called from a curl callback + ::threading::ThreadMessaging::instance().add_task( + threading::get_current_thread_id(), std::move(retry_timer_msg)); + + // Dispatch timeouts + auto timeout_msg = std::make_unique<::threading::Tmsg>( + [](std::unique_ptr<::threading::Tmsg> 
msg) { + std::lock_guard guard(msg->data.self.lock); + LOG_TRACE_FMT( + "Self-healing-open timeout, sending timeout to internal handlers"); + + curl::UniqueCURL curl_handle; + + auto cert = msg->data.self.self_signed_node_cert; + curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); + + curl_handle.set_blob_opt( + CURLOPT_SSLCERT_BLOB, cert.data(), cert.size()); + curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); + + auto privkey_pem = msg->data.self.node_sign_kp->private_key_pem(); + curl_handle.set_blob_opt( + CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); + curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); + + auto url = fmt::format( + "https://{}/{}/self_healing_open/timeout", + msg->data.self.config.network.rpc_interfaces + .at("primary_rpc_interface") + .published_address, + get_actor_prefix(ActorsType::nodes)); + + curl::UniqueSlist headers; + headers.append("Content-Type: application/json"); + + // This is simpler than going via the internal handlers... 
+ auto curl_request = std::make_unique( + std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + nullptr, + std::nullopt); + curl::CurlmLibuvContextSingleton::get_instance().attach_request( + curl_request); + + auto delay = msg->data.self.config.recover.self_healing_open_timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, *this); ::threading::ThreadMessaging::instance().add_task_after( - std::move(timer_msg), ds::TimeString("0ms")); + std::move(timeout_msg), config.recover.self_healing_open_timeout); + } + + void self_healing_open_advance( + ccf::kv::Tx& tx, + const ccf::StartupConfig& node_config, + bool timeout) override + { + auto* sm_state_handle = tx.rw(network.self_healing_open_sm_state); + if (!sm_state_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open state not set, cannot advance self-healing-open"); + } + + switch (sm_state_handle->get().value()) + { + case SelfHealingOpenSM::GOSSIPPING: + { + auto* gossip_handle = tx.ro(network.self_healing_open_gossip); + if ( + gossip_handle->size() == + node_config.recover.self_healing_open_addresses.value().size() || + timeout) + { + if (gossip_handle->size() == 0) + { + throw std::logic_error("No gossip addresses provided yet"); + } + + std::optional> min_iid; + gossip_handle->foreach( + [&min_iid](const auto& iid, const auto& txid) { + if ( + !min_iid.has_value() || min_iid->second < txid || + (min_iid->second == txid && min_iid->first > iid)) + { + min_iid = std::make_pair(iid, txid); + } + return true; + }); + + auto* chosen_replica = + tx.rw(network.self_healing_open_chosen_replica); + chosen_replica->put(min_iid->first); + sm_state_handle->put(SelfHealingOpenSM::VOTING); + } + return; + } + case SelfHealingOpenSM::VOTING: + { + auto* votes = tx.rw(network.self_healing_open_votes); + if ( + votes->size() >= + node_config.recover.self_healing_open_addresses.value().size() / + 2 + + 1 || + timeout) + { + if (votes->size() 
== 0) + { + throw std::logic_error( + "We didn't even vote for ourselves, so why should we open?"); + } + LOG_INFO_FMT("******************************"); + LOG_INFO_FMT( + "Self-healing-open suceeded we should open the network"); + LOG_INFO_FMT("******************************"); + + sm_state_handle->put(SelfHealingOpenSM::OPENING); + + // TODO open the network + // have a utility function that kicks off the opening + } + return; + } + case SelfHealingOpenSM::JOINING: + { + // TODO restart in join + return; + } + case SelfHealingOpenSM::OPENING: + case SelfHealingOpenSM::OPEN: + { + // Nothing to do here, we are already opening or open + return; + } + default: + throw std::logic_error(fmt::format( + "Unknown self-healing-open state: {}", + static_cast(sm_state_handle->get().value()))); + } } private: @@ -2976,7 +3141,7 @@ namespace ccf { CCF_ASSERT( snp_tcb_version.has_value(), - "TCB version must be set when unsealing ledger secret"); + "TCB version must be set when unsealing ledger sec/ret"); CCF_ASSERT( config.recover.previous_sealed_ledger_secret_location.has_value(), diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 94954c9b6669..b41e3b09e7ef 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -1724,14 +1724,27 @@ namespace ccf } { + // Reset the self-healing-open state ccf::kv::Tx& tx = ctx.tx; - // TODO properly gate this - auto* self_healing_open_state_handle = - tx.rw( - Tables::SELF_HEALING_OPEN_SM_STATE); - self_healing_open_state_handle->put( - SelfHealingOpenSM::GOSSIPPING); - this->node_operation.self_healing_open_start_retry_timer(); + auto* state_handle = tx.rw( + Tables::SELF_HEALING_OPEN_SM_STATE); + state_handle->clear(); + auto* node_info_handle = tx.rw( + Tables::SELF_HEALING_OPEN_NODES); + node_info_handle->clear(); + auto* gossip_state_handle = tx.rw( + Tables::SELF_HEALING_OPEN_GOSSIPS); + gossip_state_handle->clear(); + auto* chosen_replica = tx.rw( + 
Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); + chosen_replica->clear(); + auto* votes = + tx.rw(Tables::SELF_HEALING_OPEN_VOTES); + votes->clear(); + + // Start timers if necessary + this->node_operation.self_healing_open_try_start_timers( + tx, recovering); } LOG_INFO_FMT("Created service"); @@ -2270,81 +2283,71 @@ namespace ccf .set_openapi_hidden(true) .install(); - auto self_healing_open_gossip = [this]( - auto& args, - const nlohmann::json& params) { - auto node_configuration_subsystem = - this->context.get_subsystem(); - if (!node_configuration_subsystem) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - - auto in = params.get(); + auto self_healing_open_gossip = + [this](auto& args, const nlohmann::json& params) { + auto node_configuration_subsystem = + this->context.get_subsystem(); + if (!node_configuration_subsystem) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } - auto valid = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (valid.has_value()) - { - auto [code, message] = valid.value(); - return make_error(code, ccf::errors::InvalidQuote, message); - } + auto in = params.get(); - LOG_TRACE_FMT("Processing self-healing-open gossip RPC"); - LOG_TRACE_FMT("Self-healing-open gossip params: {}", params.dump()); + auto valid = self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (valid.has_value()) + { + auto [code, message] = valid.value(); + return make_error(code, ccf::errors::InvalidQuote, message); + } - auto chosen_replica = - args.tx.rw(this->network.self_healing_open_chosen_replica); - // This freezes the gossips at the point where it votes - if (chosen_replica->get().has_value()) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "This replica has already 
voted"); - } + LOG_TRACE_FMT("Processing self-healing-open gossip RPC"); + LOG_TRACE_FMT("Self-healing-open gossip params: {}", params.dump()); - auto gossip_handle = args.tx.rw(this->network.self_healing_open_gossip); - if (gossip_handle->get(in.info.intrinsic_id).has_value()) - { - LOG_INFO_FMT("Node {} already gossiped", in.info.intrinsic_id); - return make_success( - fmt::format("Node {} already gossiped", in.info.intrinsic_id)); - } - gossip_handle->put(in.info.intrinsic_id, in.txid); + auto chosen_replica = + args.tx.rw(this->network.self_healing_open_chosen_replica); + // This freezes the gossips at the point where it votes + if (chosen_replica->get().has_value()) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "This replica has already voted"); + } - // TODO properly configure this limit - if ( - gossip_handle->size() == - node_configuration_subsystem->get() - .node_config.recover.self_healing_open_addresses.value() - .size()) - { - // Use commit handle on this to trigger the thread message to vote and - // ensure no rollbacks are possible - std::optional> min_iid; - gossip_handle->foreach([&min_iid](const auto& iid, const auto& txid) { - if ( - !min_iid.has_value() || min_iid->second < txid || - (min_iid->second == txid && min_iid->first > iid)) - { - min_iid = std::make_pair(iid, txid); - } - return true; - }); - chosen_replica->put(min_iid->first); + auto gossip_handle = + args.tx.rw(this->network.self_healing_open_gossip); + if (gossip_handle->get(in.info.intrinsic_id).has_value()) + { + LOG_INFO_FMT("Node {} already gossiped", in.info.intrinsic_id); + return make_success( + fmt::format("Node {} already gossiped", in.info.intrinsic_id)); + } + gossip_handle->put(in.info.intrinsic_id, in.txid); - auto* sm_state_handle = - args.tx.rw(this->network.self_healing_open_sm_state); - sm_state_handle->put(SelfHealingOpenSM::VOTING); - } + try + { + this->node_operation.self_healing_open_advance( + args.tx, 
node_configuration_subsystem->get().node_config, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( + "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } - return make_success(fmt::format( - "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); - }; + return make_success(fmt::format( + "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); + }; make_endpoint( "/self_healing_open/gossip", HTTP_PUT, @@ -2380,29 +2383,23 @@ namespace ccf auto votes = args.tx.rw(this->network.self_healing_open_votes); votes->insert(in.info.intrinsic_id); - // if sufficient votes, then we can open the network - // TODO configure this limit - if ( - votes->size() >= - node_configuration_subsystem->get() - .node_config.recover.self_healing_open_addresses.value() - .size() / - 2 + - 1) + try + { + this->node_operation.self_healing_open_advance( + args.tx, node_configuration_subsystem->get().node_config, false); + } + catch (const std::logic_error& e) { - LOG_INFO_FMT("******************************"); - LOG_INFO_FMT( - "Self-healing-open suceeded we should open the network", - in.info.intrinsic_id); - LOG_INFO_FMT("******************************"); - - auto* sm_state_handle = - args.tx.rw(this->network.self_healing_open_sm_state); - sm_state_handle->put(SelfHealingOpenSM::OPENING); - - // TODO open the network - // have a utility function that kicks off the opening + LOG_FAIL_FMT( + "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); } + + // if sufficient votes, then we can open the network return make_success(fmt::format( "Node {} voted for self-healing-open", in.info.intrinsic_id)); }; @@ 
-2443,6 +2440,73 @@ namespace ccf .set_forwarding_required(endpoints::ForwardingRequired::Never) .set_openapi_hidden(true) .install(); + + auto self_healing_open_timeout = [this]( + auto& args, + const nlohmann::json& params) { + (void)params; // Unused, but required by the adapter + + LOG_TRACE_FMT("Self-healing-open timeout received"); + + auto node_configuration_subsystem = + this->context.get_subsystem(); + if (!node_configuration_subsystem) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } + + // Must ensure that the request originates from the primary + auto primary_id = consensus->primary(); + if (!primary_id.has_value()) + { + LOG_FAIL_FMT("self-healing-open timeout: primary unknown"); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "Primary is unknown"); + } + const auto& sig_auth_ident = + args.template get_caller(); + if (primary_id.value() != sig_auth_ident.node_id) + { + LOG_FAIL_FMT( + "self-healing-open timeout: request does not originate from " + "primary"); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "Request does not originate from primary."); + } + + try + { + this->node_operation.self_healing_open_advance( + args.tx, node_configuration_subsystem->get().node_config, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( + "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } + return make_success("Self-healing-open timeout processed successfully"); + }; + + make_endpoint( + "/self_healing_open/timeout", + HTTP_POST, + json_adapter(self_healing_open_timeout), + {std::make_shared()}) + .set_forwarding_required(endpoints::ForwardingRequired::Never) + .set_openapi_hidden(true) 
+ .install(); } }; diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index 97de9df250be..5fe39f05af6b 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -4,6 +4,7 @@ #include "ccf/crypto/pem.h" #include "ccf/ds/quote_info.h" +#include "ccf/node/startup_config.h" #include "ccf/node_startup_state.h" #include "ccf/service/acme_client_config.h" #include "ccf/service/node_info_network.h" @@ -64,7 +65,8 @@ namespace ccf virtual size_t get_jwt_attempts() = 0; virtual ccf::crypto::Pem get_self_signed_certificate() = 0; virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; - virtual void self_healing_open_start_retry_timer() = 0; + virtual void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) = 0; + virtual void self_healing_open_advance(ccf::kv::Tx&, const ccf::StartupConfig&, bool) = 0; virtual const ccf::StartupConfig& get_node_config() const = 0; virtual ccf::crypto::Pem get_network_cert() = 0; virtual void stop_notice() = 0; diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index 1023805ac2d4..93542a7171ed 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -110,9 +110,15 @@ namespace ccf return impl.get_cose_signatures_config(); } - void self_healing_open_start_retry_timer() override + void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering); { - impl.self_healing_open_start_retry_timer(); + impl.self_healing_open_try_start_timers(tx, recovering); + } + + void self_healing_open_advance( + ccf::kv::Tx& tx, const ccf::StartupConfig& startup_config, bool is_recovery) override + { + impl.self_healing_open_advance(tx, startup_config, is_recovery); } }; } \ No newline at end of file diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index 46a30bffcb00..c006cfc17f8e 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ 
-6,6 +6,7 @@ #include "ccf/ds/quote_info.h" #include "ccf/node/cose_signatures_config.h" #include "ccf/node/quote.h" +#include "ccf/node/startup_config.h" #include "ccf/node_startup_state.h" #include "ccf/node_subsystem_interface.h" #include "ccf/service/tables/code_id.h" @@ -61,6 +62,8 @@ namespace ccf virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; - virtual void self_healing_open_start_retry_timer() = 0; + virtual void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) = 0; + virtual void self_healing_open_advance( + ccf::kv::Tx&, const ccf::StartupConfig&, bool) = 0; }; } \ No newline at end of file diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index b4bcf32cfc4d..ea847f7b454b 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -111,7 +111,13 @@ namespace ccf return cose_signatures_config; } - void self_healing_open_start_retry_timer() override + void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) override + { + // No-op for stub + } + + void self_healing_open_advance( + ccf::kv::Tx& tx, const ccf::StartupConfig& config, bool timeout) override { // No-op for stub } diff --git a/src/node/self_healing_open.h b/src/node/self_healing_open.h index aff8f960652f..904600cd17da 100644 --- a/src/node/self_healing_open.h +++ b/src/node/self_healing_open.h @@ -70,6 +70,7 @@ namespace ccf::self_healing_open curl_handle.set_blob_opt( CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); + curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); auto url = fmt::format( "https://{}/{}/self_healing_open/{}", diff --git a/src/service/network_tables.h b/src/service/network_tables.h index 0aac37dd78b3..88a3c9564c0b 100644 --- a/src/service/network_tables.h +++ b/src/service/network_tables.h @@ -249,8 +249,8 @@ namespace ccf // Self-healing open tables const SelfHealingOpenNodeInfo self_healing_open_node_info = { Tables::SELF_HEALING_OPEN_NODES}; - const 
SelfHealingOpenGossipState self_healing_open_gossip = { - Tables::SELF_HEALING_OPEN_GOSSIP_STATE}; + const SelfHealingOpenGossips self_healing_open_gossip = { + Tables::SELF_HEALING_OPEN_GOSSIPS}; const SelfHealingOpenChosenReplica self_healing_open_chosen_replica = { Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA}; const SelfHealingOpenVotes self_healing_open_votes = { From f8981ae508fe01f82e1fd854bedf8dd7f6f11b0c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 30 Jul 2025 11:46:06 +0100 Subject: [PATCH 043/197] Fix curl put with empty body issue --- src/http/curl.h | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 3a9b7837edc8..14b91d61d6e4 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -316,22 +316,6 @@ namespace ccf::curl } CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_URL, url.c_str()); - if (request_body != nullptr) - { - request_body->attach_to_curl(curl_handle); - } - - if (response_callback.has_value()) - { - response = std::make_unique(); - response->attach_to_curl(curl_handle); - } - - if (headers.get() != nullptr) - { - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); - } - if (!method.get_http_method().has_value()) { throw std::logic_error( @@ -345,8 +329,14 @@ namespace ccf::curl case HTTP_HEAD: CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_NOBODY, 1L); break; - case HTTP_PUT: + case HTTP_PUT: { CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); + if (request_body == nullptr) + { + // If no request body is provided, curl will try reading from stdin, which causes a blockage + request_body = std::make_unique(std::vector()); + } + } break; case HTTP_POST: // libcurl sets the post verb when CURLOPT_POSTFIELDS is set, so we @@ -357,6 +347,23 @@ namespace ccf::curl throw std::logic_error( fmt::format("Unsupported HTTP method: {}", method.c_str())); } + + if (request_body != nullptr) + { + 
request_body->attach_to_curl(curl_handle); + } + + if (response_callback.has_value()) + { + response = std::make_unique(); + response->attach_to_curl(curl_handle); + } + + if (headers.get() != nullptr) + { + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); + } + } void handle_response(CURLcode curl_response_code) @@ -715,7 +722,6 @@ namespace ccf::curl std::lock_guard lock(curlm_lock); curl_request_curlm.attach_curl_request(request); } - }; class CurlmLibuvContextSingleton From 963b6c143495a8bf7915548c1a57399c53b5ddc9 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 30 Jul 2025 11:53:46 +0100 Subject: [PATCH 044/197] Add test for timeouts --- src/node/rpc/node_frontend.h | 4 +-- src/node/rpc/node_operation.h | 2 +- tests/e2e_operations.py | 61 +++++++++++++++++++++++++++++++++++ tests/infra/network.py | 52 ++++++++++++++++++++++++++--- 4 files changed, 111 insertions(+), 8 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index b41e3b09e7ef..6a15bfcb211b 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -2484,7 +2484,7 @@ namespace ccf try { this->node_operation.self_healing_open_advance( - args.tx, node_configuration_subsystem->get().node_config, false); + args.tx, node_configuration_subsystem->get().node_config, true); } catch (const std::logic_error& e) { @@ -2501,7 +2501,7 @@ namespace ccf make_endpoint( "/self_healing_open/timeout", - HTTP_POST, + HTTP_PUT, json_adapter(self_healing_open_timeout), {std::make_shared()}) .set_forwarding_required(endpoints::ForwardingRequired::Never) diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index 93542a7171ed..b56dda849be0 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -110,7 +110,7 @@ namespace ccf return impl.get_cose_signatures_config(); } - void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering); + void 
self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) override { impl.self_healing_open_try_start_timers(tx, recovering); } diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 7918449148e7..8c1b8c43ea7d 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1474,6 +1474,67 @@ def run_self_healing_open(args): committed_ledger_dirs=committed_ledger_dirs, common_dir=network.common_dir, ) + + # Wait for all replicas to report being part of the network + for node in recovered_network.nodes(): + recovered_network.wait_for_state( + node, + infra.node.State.PART_OF_NETWORK.value, + timeout=10, + ) + recovered_network._wait_for_app_open(node) + + recovered_network.stop_all_nodes() + +def run_self_healing_open_single_replica(args): + args.nodes = infra.e2e_args.min_nodes(args, f=1) + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + args.perf_nodes, + ) as network: + LOG.info("Start a network and stop it") + network.start_and_open(args) + old_common = infra.network.get_common_folder_name(args.workspace, args.label) + network.stop_all_nodes() + + ledger_dirs = {} + committed_ledger_dirs = {} + for i, node in enumerate(network.nodes): + l, c = node.get_ledger() + ledger_dirs[i] = l + committed_ledger_dirs[i] = c + + LOG.info("Start a recovery network and stop it") + recovered_network = infra.network.Network( + args.nodes, + args.binary_dir, + args.debug_nodes, + args.perf_nodes, + existing_network=network, + ) + args.previous_service_identity_file = os.path.join( + old_common, "service_cert.pem" + ) + + recovered_network.start_in_self_healing_open( + args, + ledger_dirs=ledger_dirs, + committed_ledger_dirs=committed_ledger_dirs, + common_dir=network.common_dir, + start_all_nodes=False, + ) + + # Wait for all replicas to report being part of the network + for node in recovered_network.nodes[0:1]: + recovered_network.wait_for_state( + node, + infra.node.State.PART_OF_NETWORK.value, + timeout=30, + ) 
+ recovered_network._wait_for_app_open(node) + recovered_network.stop_all_nodes() def run_read_ledger_on_testdata(args): diff --git a/tests/infra/network.py b/tests/infra/network.py index 16ac5bf75562..92827bdaf81e 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -773,6 +773,7 @@ def start_in_self_healing_open( snapshot_dirs= None, common_dir=None, set_authenticate_session=None, + start_all_nodes=True, **kwargs, ): self.common_dir = common_dir or get_common_folder_name( @@ -801,11 +802,52 @@ def start_in_self_healing_open( for node in self.nodes: LOG.info(node.host) - primary = self._start_all_nodes( - args, - recovery=True, - self_healing_open=True, - **kwargs, + self.status = ServiceStatus.RECOVERING + LOG.debug(f"Opening CCF service on {self.hosts}") + + forwarded_args = { + arg: getattr(args, arg, None) + for arg in infra.network.Network.node_args_to_forward + } + self_healing_open_addresses = [ + node.get_public_rpc_address() for node in self.nodes + ] + + for i, node in enumerate(self.nodes): + forwarded_args_with_overrides = forwarded_args.copy() + forwarded_args_with_overrides.update( + self.per_node_args_override.get(i, {}) + ) + if not start_all_nodes and i > 0: + break + + try: + node_kwargs = { + "lib_name": args.package, + "workspace": args.workspace, + "label": args.label, + "common_dir": self.common_dir, + } + self_healing_open_kwargs = {"self_healing_open_addresses": self_healing_open_addresses} + # If a kwarg is passed in override automatically set variants + node_kwargs = node_kwargs | self_healing_open_kwargs | forwarded_args_with_overrides | kwargs + node.recover(**node_kwargs) + self.wait_for_state( + node, + infra.node.State.PART_OF_PUBLIC_NETWORK.value, + timeout=args.ledger_recovery_timeout, + ) + except Exception: + LOG.exception(f"Failed to start node {node.local_node_id}") + raise + + self.election_duration = args.election_timeout_ms / 1000 + self.observed_election_duration = self.election_duration + 1 + + LOG.info("All 
nodes started") + + primary, _ = self.find_primary( + timeout=args.ledger_recovery_timeout ) if set_authenticate_session is not None: From 4d22d820ac06bde920628008fc0fb4ef96080fc6 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 30 Jul 2025 15:11:49 +0100 Subject: [PATCH 045/197] Get open working --- src/node/node_state.h | 24 ++++++++++++++++++------ tests/e2e_operations.py | 18 ++++++++++++++++++ tests/infra/network.py | 19 +++++++++++++++++++ 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 7a1c5d83f7d9..5dc76a921576 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2192,20 +2192,32 @@ namespace ccf throw std::logic_error( "We didn't even vote for ourselves, so why should we open?"); } - LOG_INFO_FMT("******************************"); - LOG_INFO_FMT( - "Self-healing-open suceeded we should open the network"); - LOG_INFO_FMT("******************************"); + LOG_INFO_FMT("Self-healing-open succeeded, now opening network"); sm_state_handle->put(SelfHealingOpenSM::OPENING); - // TODO open the network - // have a utility function that kicks off the opening + auto* service = tx.ro(Tables::SERVICE); + auto service_info = service->get(); + if (!service_info.has_value()) + { + throw std::logic_error( + "Service information cannot be found to transition service to " + "open"); + } + const auto prev_ident = + tx.ro(Tables::PREVIOUS_SERVICE_IDENTITY) + ->get(); + AbstractGovernanceEffects::ServiceIdentities identities{ + .previous = prev_ident, .next = service_info->cert}; + + transition_service_to_open(tx, identities); } return; } case SelfHealingOpenSM::JOINING: { + LOG_INFO_FMT( + "Self-healing-open in JOINING state, but no logic implemented"); // TODO restart in join return; } diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 8c1b8c43ea7d..2eed73282b80 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1475,6 +1475,15 @@ def 
run_self_healing_open(args): common_dir=network.common_dir, ) + # Wait for the first node to be in RecoveryShares + for node in recovered_network.nodes[0:1]: + recovered_network.wait_for_statuses( + node, + ["WaitingForRecoveryShares", "Open"], + timeout=30, + ) + recovered_network.consortium.recover_with_shares(recovered_network.find_random_node()) + # Wait for all replicas to report being part of the network for node in recovered_network.nodes(): recovered_network.wait_for_state( @@ -1526,6 +1535,15 @@ def run_self_healing_open_single_replica(args): start_all_nodes=False, ) + # Wait for the first node to be in RecoveryShares + for node in recovered_network.nodes[0:1]: + recovered_network.wait_for_statuses( + node, + ["WaitingForRecoveryShares", "Open"], + timeout=30, + ) + recovered_network.consortium.recover_with_shares(recovered_network.find_random_node()) + # Wait for all replicas to report being part of the network for node in recovered_network.nodes[0:1]: recovered_network.wait_for_state( diff --git a/tests/infra/network.py b/tests/infra/network.py index 92827bdaf81e..12ffa314620c 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -1335,6 +1335,25 @@ def wait_for_states(self, node, states, timeout=3): def wait_for_state(self, node, state, timeout=3): self.wait_for_states(node, [state], timeout=timeout) + def wait_for_statuses(self, node, statuses, timeout=3): + end_time = time.time() + timeout + while time.time() < end_time: + try: + with node.client(connection_timeout=timeout) as c: + r = c.get("/node/network").body.json() + if r["service_status"] in statuses: + break + except ConnectionRefusedError: + pass + time.sleep(0.1) + else: + raise TimeoutError( + f"Timed out waiting for a network status in {statuses} on node {node.node_id}" + ) + + def wait_for_status(self, node, status, timeout=3): + self.wait_for_statuses(node, [status], timeout=timeout) + def _wait_for_app_open(self, node, timeout=3): end_time = time.time() + timeout logs = [] 
From c67f032eff4e34e5b8b53f9299a83a35c9857214 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 1 Aug 2025 12:25:03 +0100 Subject: [PATCH 046/197] Get join working (still requires trusting of replacement nodes) --- include/ccf/service/tables/self_heal_open.h | 2 + src/enclave/interface.h | 6 +- src/host/configuration.h | 4 + src/host/handle_ring_buffer.h | 11 +++ src/host/main.cpp | 68 +++++++++++++++- src/host/self_healing_open.h | 72 +++++++++++++++++ src/node/node_state.h | 87 ++++++++++++--------- src/node/rpc/node_frontend.h | 64 +++++++-------- src/node/rpc/node_interface.h | 2 +- src/node/rpc/node_operation.h | 4 +- src/node/rpc/node_operation_interface.h | 2 +- src/node/rpc/test/node_stub.h | 2 +- src/node/self_healing_open.h | 9 ++- 13 files changed, 249 insertions(+), 84 deletions(-) create mode 100644 src/host/self_healing_open.h diff --git a/include/ccf/service/tables/self_heal_open.h b/include/ccf/service/tables/self_heal_open.h index af421f12b076..0a4fc7402566 100644 --- a/include/ccf/service/tables/self_heal_open.h +++ b/include/ccf/service/tables/self_heal_open.h @@ -15,6 +15,7 @@ struct SelfHealingOpenNodeInfo_t ccf::QuoteInfo quote_info; std::string published_network_address; std::vector cert_der; + std::string service_identity; IntrinsicIdentifier intrinsic_id; }; @@ -24,6 +25,7 @@ DECLARE_JSON_REQUIRED_FIELDS( quote_info, published_network_address, cert_der, + service_identity, intrinsic_id); enum class SelfHealingOpenSM diff --git a/src/enclave/interface.h b/src/enclave/interface.h index 29451f670257..7142d4bcb6f8 100644 --- a/src/enclave/interface.h +++ b/src/enclave/interface.h @@ -30,7 +30,10 @@ enum AdminMessage : ringbuffer::Message DEFINE_RINGBUFFER_MSG_TYPE(tick), /// Notify the host of work done since last message. 
Enclave -> Host - DEFINE_RINGBUFFER_MSG_TYPE(work_stats) + DEFINE_RINGBUFFER_MSG_TYPE(work_stats), + + /// Notify the host that it should restart in join + DEFINE_RINGBUFFER_MSG_TYPE(restart_and_join) }; DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( @@ -48,6 +51,7 @@ DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::stop_notice); DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::stopped); DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::tick); DECLARE_RINGBUFFER_MESSAGE_PAYLOAD(AdminMessage::work_stats, std::string); +DECLARE_RINGBUFFER_MESSAGE_PAYLOAD(AdminMessage::restart_and_join, std::string, std::string); /// Messages sent from app endpoints enum AppMessage : ringbuffer::Message diff --git a/src/host/configuration.h b/src/host/configuration.h index 9ffdcc0603d5..dbe0d22ab0a0 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -145,6 +145,10 @@ namespace host std::nullopt; ccf::ds::TimeString self_healing_open_retry_timeout = {"100ms"}; ccf::ds::TimeString self_healing_open_timeout = {"2000ms"}; + std::string self_healing_open_join_config_file = + "self_healing_open_join_config.json"; + std::string self_healing_open_join_service_identity_file = + "self_healing_open_join_service_identity.pem"; bool operator==(const Recover&) const = default; }; Recover recover = {}; diff --git a/src/host/handle_ring_buffer.h b/src/host/handle_ring_buffer.h index 311d13552ad6..eae61848a516 100644 --- a/src/host/handle_ring_buffer.h +++ b/src/host/handle_ring_buffer.h @@ -5,6 +5,8 @@ #include "../ds/files.h" #include "../enclave/interface.h" #include "ccf/ds/logger.h" +#include "ds/non_blocking.h" +#include "self_healing_open.h" #include "timer.h" #include @@ -99,6 +101,15 @@ namespace asynchost uv_stop(uv_default_loop()); LOG_INFO_FMT("Host stopped successfully"); }); + + DISPATCHER_SET_MESSAGE_HANDLER( + bp, + AdminMessage::restart_and_join, + [&](const uint8_t* data, size_t size) { + auto [url, service_identity] = ringbuffer::read_message( + data, size); + 
ccf::SelfHealingOpenSingleton::instance()->trigger_restart_and_join_url(url, service_identity); + }); } void on_timer() diff --git a/src/host/main.cpp b/src/host/main.cpp index 1e4e73c32a06..8d83a61c9fa5 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -28,6 +28,7 @@ #include "enclave.h" #include "handle_ring_buffer.h" #include "host/env.h" +#include "host/self_healing_open.h" #include "http/curl.h" #include "json_schema.h" #include "lfs_file_handler.h" @@ -541,6 +542,8 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) rpc_udp->behaviour.register_udp_message_handlers( buffer_processor.get_dispatcher()); + ccf::SelfHealingOpenSingleton::initialise(writer_factory); + ResolvedAddresses resolved_rpc_addresses; for (auto& [name, interface] : config.network.rpc_interfaces) { @@ -1014,6 +1017,8 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) while ((uv_loop_alive(uv_default_loop()) != 0) && (close_iterations > 0)) { uv_run(uv_default_loop(), UV_RUN_NOWAIT); + const uint millisecond = 1000; + usleep(millisecond); close_iterations--; } LOG_INFO_FMT( @@ -1027,5 +1032,66 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) } ccf::crypto::openssl_sha256_shutdown(); - return loop_close_rc; + if (!ccf::SelfHealingOpenSingleton::instance()->join_info.has_value()) + { + return loop_close_rc; + } + + auto join_info = ccf::SelfHealingOpenSingleton::instance()->join_info.value(); + LOG_INFO_FMT( + "Self-healing open URL: {}, {}", + join_info.url, + ccf::ds::to_hex(join_info.service_identity)); + + files::dump( + join_info.service_identity, + config.command.recover.self_healing_open_join_service_identity_file); + + host::CCHostConfig::Command::Join join_config; + join_config.target_rpc_address = join_info.url; + + host::CCHostConfig::Command command_config; + command_config.type = StartType::Join; + command_config.service_certificate_file = + config.command.recover.self_healing_open_join_service_identity_file; + 
command_config.join = join_config; + + auto new_config_json = config_json; + new_config_json["command"] = command_config; + new_config_json["output_files"]["pid_file"] = + "self_healing_open_join.pid"; + + files::dump( + new_config_json.dump(), + config.command.recover.self_healing_open_join_config_file); + + std::vector new_argv{ + argv[0], // The executable name + "--config", + config.command.recover.self_healing_open_join_config_file.c_str(), + "--log-level", + ccf::logger::to_string(log_level), + "--enclave-file", + enclave_file_path.c_str(), + }; + + std::string cmd = fmt::format("\"{}\"", fmt::join(new_argv, "\" \"")); + LOG_INFO_FMT("Joining network via an exec of: {}", cmd); + + // null terminator for execve + new_argv.push_back(nullptr); + + if (fflush(stdout) != 0) + { + LOG_FAIL_FMT("Failed to flush stdout"); + } + if (fflush(stderr) != 0) + { + LOG_FAIL_FMT("Failed to flush stderr"); + } + + int rc = execve(argv[0], const_cast(new_argv.data()), environ); + + LOG_FAIL_FMT("Failed to execve new process: {}. Exiting.", strerror(rc)); + return 1; } diff --git a/src/host/self_healing_open.h b/src/host/self_healing_open.h new file mode 100644 index 000000000000..8835f018ec6e --- /dev/null +++ b/src/host/self_healing_open.h @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. 
+#pragma once + +#include "../enclave/interface.h" +#include "ds/ring_buffer_types.h" + +#include +#include +#include +#include +namespace ccf +{ + struct SelfHealingOpenJoinInfo + { + std::string url; + std::string service_identity; + }; + + class SelfHealingOpen + { + public: + ringbuffer::WriterPtr to_enclave; + std::optional join_info; + + SelfHealingOpen(ringbuffer::AbstractWriterFactory& writer_factory) : + to_enclave(writer_factory.create_writer_to_inside()), + join_info(std::nullopt) + {} + + void trigger_restart_and_join_url( + const std::string& url, const std::string& service_identity) + { + join_info = SelfHealingOpenJoinInfo{ + .url = url, .service_identity = service_identity}; + RINGBUFFER_WRITE_MESSAGE(AdminMessage::stop, to_enclave); + } + }; + + class SelfHealingOpenSingleton + { + private: + static std::unique_ptr& instance_unsafe() + { + static std::unique_ptr instance = nullptr; + return instance; + } + + public: + static std::unique_ptr& instance() + { + auto& instance = instance_unsafe(); + if (instance == nullptr) + { + throw std::logic_error( + "SelfHealingOpenSingleton instance not initialized"); + } + return instance; + } + + static void initialise(ringbuffer::AbstractWriterFactory& writer_factory) + { + auto& instance = instance_unsafe(); + if (instance != nullptr) + { + throw std::logic_error( + "SelfHealingOpenSingleton instance already initialized"); + } + instance = std::make_unique(writer_factory); + } + }; +} \ No newline at end of file diff --git a/src/node/node_state.h b/src/node/node_state.h index 5dc76a921576..c317da674be4 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -31,8 +31,10 @@ #include "crypto/certs.h" #include "ds/ccf_assert.h" #include "ds/files.h" +#include "ds/ring_buffer_types.h" #include "ds/state_machine.h" #include "ds/thread_messaging.h" +#include "enclave/interface.h" #include "enclave/rpc_sessions.h" #include "encryptor.h" #include "history.h" @@ -2131,10 +2133,7 @@ namespace ccf 
std::move(timeout_msg), config.recover.self_healing_open_timeout); } - void self_healing_open_advance( - ccf::kv::Tx& tx, - const ccf::StartupConfig& node_config, - bool timeout) override + void self_healing_open_advance(ccf::kv::Tx& tx, bool timeout) override { auto* sm_state_handle = tx.rw(network.self_healing_open_sm_state); if (!sm_state_handle->get().has_value()) @@ -2150,7 +2149,7 @@ namespace ccf auto* gossip_handle = tx.ro(network.self_healing_open_gossip); if ( gossip_handle->size() == - node_config.recover.self_healing_open_addresses.value().size() || + config.recover.self_healing_open_addresses.value().size() || timeout) { if (gossip_handle->size() == 0) @@ -2182,8 +2181,7 @@ namespace ccf auto* votes = tx.rw(network.self_healing_open_votes); if ( votes->size() >= - node_config.recover.self_healing_open_addresses.value().size() / - 2 + + config.recover.self_healing_open_addresses.value().size() / 2 + 1 || timeout) { @@ -2216,15 +2214,37 @@ namespace ccf } case SelfHealingOpenSM::JOINING: { + auto chosen_replica = + tx.ro(network.self_healing_open_chosen_replica)->get(); + if (!chosen_replica.has_value()) + { + throw std::logic_error( + "Self-healing-open chosen node not set, cannot join"); + } + auto node_config = tx.ro(this->network.self_healing_open_node_info) + ->get(chosen_replica.value()); + if (!node_config.has_value()) + { + throw std::logic_error(fmt::format( + "Self-healing-open chosen node {} not found", + chosen_replica.value())); + } + LOG_INFO_FMT( - "Self-healing-open in JOINING state, but no logic implemented"); - // TODO restart in join - return; + "Self-healing-open joining {} with service identity {}", + node_config->published_network_address, + node_config->service_identity); + + RINGBUFFER_WRITE_MESSAGE( + AdminMessage::restart_and_join, + to_host, + node_config->published_network_address, + node_config->service_identity); } case SelfHealingOpenSM::OPENING: case SelfHealingOpenSM::OPEN: { - // Nothing to do here, we are already 
opening or open + // Nothing to do here, we are already opening or open or joining return; } default: @@ -3168,6 +3188,20 @@ namespace ccf max_version); } + self_healing_open::RequestNodeInfo self_healing_open_node_info() + { + return { + .quote_info = quote_info, + .published_network_address = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + .intrinsic_id = + config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + .service_identity = network.identity->cert.str(), + }; + } + void self_healing_open_gossip_unsafe() { // Caller must ensure that the current node's quote_info is populated: @@ -3182,16 +3216,7 @@ namespace ccf LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); self_healing_open::GossipRequest request{ - .info = - self_healing_open::RequestNodeInfo{ - .quote_info = quote_info, - .published_network_address = - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - .intrinsic_id = - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - }, + .info = self_healing_open_node_info(), // TODO fix: This isn't quite right, as it should be the highest txid // with a signature,before the recovery txs .txid = network.tables->current_version(), @@ -3219,15 +3244,7 @@ namespace ccf node_info.published_network_address); self_healing_open::VoteRequest request{ - .info = self_healing_open::RequestNodeInfo{ - .quote_info = quote_info, - .published_network_address = - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - .intrinsic_id = - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - }}; + .info = self_healing_open_node_info()}; self_healing_open::dispatch_authenticated_message( std::move(request), @@ -3251,15 +3268,7 @@ namespace ccf LOG_TRACE_FMT("Sending self-healing-open iamopen"); self_healing_open::IAmOpenRequest request{ - .info = self_healing_open::RequestNodeInfo{ - .quote_info = 
quote_info, - .published_network_address = - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - .intrinsic_id = - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - }}; + .info = self_healing_open_node_info()}; for (auto& target_address : config.recover.self_healing_open_addresses.value()) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 6a15bfcb211b..0132b1502027 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -17,6 +17,7 @@ #include "crypto/certs.h" #include "crypto/csr.h" #include "ds/files.h" +#include "ds/ring_buffer_types.h" #include "ds/std_formatters.h" #include "frontend.h" #include "node/network_state.h" @@ -31,6 +32,7 @@ #include "snapshots/filenames.h" #include +#include namespace ccf { @@ -471,6 +473,7 @@ namespace ccf .quote_info = in.quote_info, .published_network_address = in.published_network_address, .cert_der = cert_der, + .service_identity = in.service_identity, .intrinsic_id = in.intrinsic_id}; node_info_handle->put(in.intrinsic_id, src_info); } @@ -2285,16 +2288,6 @@ namespace ccf auto self_healing_open_gossip = [this](auto& args, const nlohmann::json& params) { - auto node_configuration_subsystem = - this->context.get_subsystem(); - if (!node_configuration_subsystem) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - auto in = params.get(); auto valid = self_healing_open_validate_and_store_node_info( @@ -2331,8 +2324,7 @@ namespace ccf try { - this->node_operation.self_healing_open_advance( - args.tx, node_configuration_subsystem->get().node_config, false); + this->node_operation.self_healing_open_advance(args.tx, false); } catch (const std::logic_error& e) { @@ -2359,16 +2351,6 @@ namespace ccf auto self_healing_open_vote = [this](auto& args, const nlohmann::json& params) { - auto node_configuration_subsystem = - 
this->context.get_subsystem(); - if (!node_configuration_subsystem) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - auto in = params.get(); auto valid = self_healing_open_validate_and_store_node_info( args, args.tx, in.info); @@ -2385,8 +2367,7 @@ namespace ccf try { - this->node_operation.self_healing_open_advance( - args.tx, node_configuration_subsystem->get().node_config, false); + this->node_operation.self_healing_open_advance(args.tx, false); } catch (const std::logic_error& e) { @@ -2425,10 +2406,32 @@ namespace ccf return make_error(code, ccf::errors::InvalidQuote, message); } + auto* sm_state = args.tx.rw(this->network.self_healing_open_sm_state); + sm_state->put(SelfHealingOpenSM::JOINING); + LOG_INFO_FMT("******************************"); LOG_INFO_FMT("Self-healing-open is JOINING {}", in.info.intrinsic_id); LOG_INFO_FMT("******************************"); + auto* chosen_replica = + args.tx.rw(this->network.self_healing_open_chosen_replica); + chosen_replica->put(in.info.intrinsic_id); + + try + { + this->node_operation.self_healing_open_advance(args.tx, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( + "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } + return make_success(fmt::format( "Node {} is joining self-healing-open", in.info.intrinsic_id)); }; @@ -2448,16 +2451,6 @@ namespace ccf LOG_TRACE_FMT("Self-healing-open timeout received"); - auto node_configuration_subsystem = - this->context.get_subsystem(); - if (!node_configuration_subsystem) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - // Must ensure that the request originates from the primary auto 
primary_id = consensus->primary(); if (!primary_id.has_value()) @@ -2483,8 +2476,7 @@ namespace ccf try { - this->node_operation.self_healing_open_advance( - args.tx, node_configuration_subsystem->get().node_config, true); + this->node_operation.self_healing_open_advance(args.tx, true); } catch (const std::logic_error& e) { diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index 5fe39f05af6b..a6cb8d792f3f 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -66,7 +66,7 @@ namespace ccf virtual ccf::crypto::Pem get_self_signed_certificate() = 0; virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; virtual void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) = 0; - virtual void self_healing_open_advance(ccf::kv::Tx&, const ccf::StartupConfig&, bool) = 0; + virtual void self_healing_open_advance(ccf::kv::Tx&, bool) = 0; virtual const ccf::StartupConfig& get_node_config() const = 0; virtual ccf::crypto::Pem get_network_cert() = 0; virtual void stop_notice() = 0; diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index b56dda849be0..7271b0b34a53 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -116,9 +116,9 @@ namespace ccf } void self_healing_open_advance( - ccf::kv::Tx& tx, const ccf::StartupConfig& startup_config, bool is_recovery) override + ccf::kv::Tx& tx, bool is_recovery) override { - impl.self_healing_open_advance(tx, startup_config, is_recovery); + impl.self_healing_open_advance(tx, is_recovery); } }; } \ No newline at end of file diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index c006cfc17f8e..8cf3adee98be 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ -64,6 +64,6 @@ namespace ccf virtual void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) = 0; virtual void self_healing_open_advance( 
- ccf::kv::Tx&, const ccf::StartupConfig&, bool) = 0; + ccf::kv::Tx&, bool) = 0; }; } \ No newline at end of file diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index ea847f7b454b..86cf9e4ff569 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -117,7 +117,7 @@ namespace ccf } void self_healing_open_advance( - ccf::kv::Tx& tx, const ccf::StartupConfig& config, bool timeout) override + ccf::kv::Tx& tx, bool timeout) override { // No-op for stub } diff --git a/src/node/self_healing_open.h b/src/node/self_healing_open.h index 904600cd17da..166d50915c5c 100644 --- a/src/node/self_healing_open.h +++ b/src/node/self_healing_open.h @@ -21,10 +21,15 @@ namespace ccf::self_healing_open QuoteInfo quote_info; std::string published_network_address; std::string intrinsic_id; + std::string service_identity; }; DECLARE_JSON_TYPE(RequestNodeInfo); DECLARE_JSON_REQUIRED_FIELDS( - RequestNodeInfo, quote_info, published_network_address, intrinsic_id); + RequestNodeInfo, + quote_info, + published_network_address, + intrinsic_id, + service_identity); struct GossipRequest { @@ -70,7 +75,7 @@ namespace ccf::self_healing_open curl_handle.set_blob_opt( CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); - curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); + curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); auto url = fmt::format( "https://{}/{}/self_healing_open/{}", From 207b142dd3be091afa241a7057bafbe2bb328b80 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 12 Aug 2025 11:09:54 +0100 Subject: [PATCH 047/197] Changes to prevent repeated joins --- include/ccf/service/tables/self_heal_open.h | 6 +- src/node/node_state.h | 80 ++++++--- src/node/rpc/node_frontend.h | 169 ++++++++++++++------ src/service/network_tables.h | 2 + 4 files changed, 186 insertions(+), 71 deletions(-) diff --git a/include/ccf/service/tables/self_heal_open.h b/include/ccf/service/tables/self_heal_open.h index 0a4fc7402566..22819d6bc5f4 100644 --- 
a/include/ccf/service/tables/self_heal_open.h +++ b/include/ccf/service/tables/self_heal_open.h @@ -42,7 +42,8 @@ DECLARE_JSON_ENUM( {{SelfHealingOpenSM::GOSSIPPING, "Gossipping"}, {SelfHealingOpenSM::VOTING, "Voting"}, {SelfHealingOpenSM::OPENING, "Opening"}, - {SelfHealingOpenSM::JOINING, "Joining"}}); + {SelfHealingOpenSM::JOINING, "Joining"}, + {SelfHealingOpenSM::OPEN, "Open"}}); namespace ccf { @@ -53,6 +54,7 @@ namespace ccf using SelfHealingOpenChosenReplica = ServiceValue; using SelfHealingOpenVotes = ServiceSet; using SelfHealingOpenSMState = ServiceValue; + using SelfHealingOpenTimeoutSMState = ServiceValue; namespace Tables { @@ -66,5 +68,7 @@ namespace ccf "public:ccf.gov.selfhealingopen.votes"; static constexpr auto SELF_HEALING_OPEN_SM_STATE = "public:ccf.gov.selfhealingopen.sm_state"; + static constexpr auto SELF_HEALING_OPEN_TIMEOUT_SM_STATE = + "public:ccf.gov.selfhealingopen.timeout_sm_state"; } } diff --git a/src/node/node_state.h b/src/node/node_state.h index c317da674be4..b4c084557df7 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2005,6 +2005,9 @@ namespace ccf auto* state_handle = tx.rw(network.self_healing_open_sm_state); state_handle->put(SelfHealingOpenSM::GOSSIPPING); + auto* timeout_state_handle = + tx.rw(network.self_healing_open_timeout_sm_state); + timeout_state_handle->put(SelfHealingOpenSM::GOSSIPPING); auto retry_timer_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { @@ -2023,9 +2026,9 @@ namespace ccf // Keep doing this until the node is no longer in recovery if ( - msg->data.self.sm.check(NodeStartupState::partOfNetwork) || sm_state == SelfHealingOpenSM::OPEN) { + LOG_INFO_FMT("Self-healing-open complete, stopping timers."); return; } @@ -2136,12 +2139,40 @@ namespace ccf void self_healing_open_advance(ccf::kv::Tx& tx, bool timeout) override { auto* sm_state_handle = tx.rw(network.self_healing_open_sm_state); - if (!sm_state_handle->get().has_value()) + auto* 
timeout_state_handle = + tx.rw(network.self_healing_open_timeout_sm_state); + if ( + !sm_state_handle->get().has_value() || + !timeout_state_handle->get().has_value()) { throw std::logic_error( "Self-healing-open state not set, cannot advance self-healing-open"); } + bool valid_timeout = timeout && + timeout_state_handle->get().value() == sm_state_handle->get().value(); + + // Advance timeout SM + if (timeout) + { + switch (timeout_state_handle->get().value()) + { + case SelfHealingOpenSM::GOSSIPPING: + LOG_TRACE_FMT("Advancing timeout SM to VOTING"); + timeout_state_handle->put(SelfHealingOpenSM::VOTING); + break; + case SelfHealingOpenSM::VOTING: + LOG_TRACE_FMT("Advancing timeout SM to OPENING"); + timeout_state_handle->put(SelfHealingOpenSM::OPENING); + break; + case SelfHealingOpenSM::OPENING: + case SelfHealingOpenSM::JOINING: + case SelfHealingOpenSM::OPEN: + default: + LOG_TRACE_FMT("Timeout SM complete"); + } + } + switch (sm_state_handle->get().value()) { case SelfHealingOpenSM::GOSSIPPING: @@ -2150,7 +2181,7 @@ namespace ccf if ( gossip_handle->size() == config.recover.self_healing_open_addresses.value().size() || - timeout) + valid_timeout) { if (gossip_handle->size() == 0) { @@ -2183,7 +2214,7 @@ namespace ccf votes->size() >= config.recover.self_healing_open_addresses.value().size() / 2 + 1 || - timeout) + valid_timeout) { if (votes->size() == 0) { @@ -2193,22 +2224,6 @@ namespace ccf LOG_INFO_FMT("Self-healing-open succeeded, now opening network"); sm_state_handle->put(SelfHealingOpenSM::OPENING); - - auto* service = tx.ro(Tables::SERVICE); - auto service_info = service->get(); - if (!service_info.has_value()) - { - throw std::logic_error( - "Service information cannot be found to transition service to " - "open"); - } - const auto prev_ident = - tx.ro(Tables::PREVIOUS_SERVICE_IDENTITY) - ->get(); - AbstractGovernanceEffects::ServiceIdentities identities{ - .previous = prev_ident, .next = service_info->cert}; - - transition_service_to_open(tx, 
identities); } return; } @@ -2242,6 +2257,31 @@ namespace ccf node_config->service_identity); } case SelfHealingOpenSM::OPENING: + { + // TODO: Add fast path if enough replicas have joined already + // THIS IS POSSIBLY DANGEROUS as these joining replicas are not signed + // off... + if (valid_timeout) + { + auto* service = tx.ro(Tables::SERVICE); + auto service_info = service->get(); + if (!service_info.has_value()) + { + throw std::logic_error( + "Service information cannot be found to transition service to " + "open"); + } + const auto prev_ident = + tx.ro(Tables::PREVIOUS_SERVICE_IDENTITY) + ->get(); + AbstractGovernanceEffects::ServiceIdentities identities{ + .previous = prev_ident, .next = service_info->cert}; + + sm_state_handle->put(SelfHealingOpenSM::OPEN); + + transition_service_to_open(tx, identities); + } + } case SelfHealingOpenSM::OPEN: { // Nothing to do here, we are already opening or open or joining diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 0132b1502027..349197a3f41b 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -1732,6 +1732,9 @@ namespace ccf auto* state_handle = tx.rw( Tables::SELF_HEALING_OPEN_SM_STATE); state_handle->clear(); + auto* timeout_state_handle = + tx.rw( + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE); auto* node_info_handle = tx.rw( Tables::SELF_HEALING_OPEN_NODES); node_info_handle->clear(); @@ -2286,60 +2289,76 @@ namespace ccf .set_openapi_hidden(true) .install(); - auto self_healing_open_gossip = - [this](auto& args, const nlohmann::json& params) { - auto in = params.get(); + auto self_healing_open_gossip = [this]( + auto& args, + const nlohmann::json& params) { + auto config = this->context.get_subsystem(); + if (!config) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } + if (!config->get() + .node_config.recover.self_healing_open_addresses.has_value()) + { + 
return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Self-healing-open addresses are not configured"); + } - auto valid = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (valid.has_value()) - { - auto [code, message] = valid.value(); - return make_error(code, ccf::errors::InvalidQuote, message); - } + auto in = params.get(); + auto valid = self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (valid.has_value()) + { + auto [code, message] = valid.value(); + return make_error(code, ccf::errors::InvalidQuote, message); + } - LOG_TRACE_FMT("Processing self-healing-open gossip RPC"); - LOG_TRACE_FMT("Self-healing-open gossip params: {}", params.dump()); + LOG_TRACE_FMT("Processing self-healing-open gossip RPC"); + LOG_TRACE_FMT("Self-healing-open gossip params: {}", params.dump()); - auto chosen_replica = - args.tx.rw(this->network.self_healing_open_chosen_replica); - // This freezes the gossips at the point where it votes - if (chosen_replica->get().has_value()) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "This replica has already voted"); - } + auto chosen_replica = + args.tx.rw(this->network.self_healing_open_chosen_replica); + // This freezes the gossips at the point where it votes + if (chosen_replica->get().has_value()) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "This replica has already voted"); + } - auto gossip_handle = - args.tx.rw(this->network.self_healing_open_gossip); - if (gossip_handle->get(in.info.intrinsic_id).has_value()) - { - LOG_INFO_FMT("Node {} already gossiped", in.info.intrinsic_id); - return make_success( - fmt::format("Node {} already gossiped", in.info.intrinsic_id)); - } - gossip_handle->put(in.info.intrinsic_id, in.txid); + auto gossip_handle = args.tx.rw(this->network.self_healing_open_gossip); + if (gossip_handle->get(in.info.intrinsic_id).has_value()) 
+ { + LOG_INFO_FMT("Node {} already gossiped", in.info.intrinsic_id); + return make_success( + fmt::format("Node {} already gossiped", in.info.intrinsic_id)); + } + gossip_handle->put(in.info.intrinsic_id, in.txid); - try - { - this->node_operation.self_healing_open_advance(args.tx, false); - } - catch (const std::logic_error& e) - { - LOG_FAIL_FMT( - "Self-healing-open gossip failed to advance state: {}", e.what()); - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - fmt::format( - "Failed to advance self-healing-open state: {}", e.what())); - } + try + { + this->node_operation.self_healing_open_advance(args.tx, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( + "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } - return make_success(fmt::format( - "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); - }; + return make_success(fmt::format( + "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); + }; make_endpoint( "/self_healing_open/gossip", HTTP_PUT, @@ -2351,6 +2370,24 @@ namespace ccf auto self_healing_open_vote = [this](auto& args, const nlohmann::json& params) { + auto config = + this->context.get_subsystem(); + if (!config) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } + if (!config->get() + .node_config.recover.self_healing_open_addresses.has_value()) + { + return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Self-healing-open addresses are not configured"); + } + auto in = params.get(); auto valid = self_healing_open_validate_and_store_node_info( args, args.tx, in.info); @@ -2395,7 +2432,23 @@ namespace ccf auto self_healing_open_iamopen = [this](auto& args, const 
nlohmann::json& params) { - LOG_TRACE_FMT("Processing self-healing-open iamopen RPC"); + auto config = + this->context.get_subsystem(); + if (!config) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } + if (!config->get() + .node_config.recover.self_healing_open_addresses.has_value()) + { + return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Self-healing-open addresses are not configured"); + } auto in = params.get(); auto valid = self_healing_open_validate_and_store_node_info( @@ -2447,7 +2500,23 @@ namespace ccf auto self_healing_open_timeout = [this]( auto& args, const nlohmann::json& params) { - (void)params; // Unused, but required by the adapter + (void)params; + auto config = this->context.get_subsystem(); + if (!config) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + "NodeConfigurationSubsystem is not available"); + } + if (!config->get() + .node_config.recover.self_healing_open_addresses.has_value()) + { + return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Self-healing-open addresses are not configured"); + } LOG_TRACE_FMT("Self-healing-open timeout received"); diff --git a/src/service/network_tables.h b/src/service/network_tables.h index 88a3c9564c0b..b5ae614fa52f 100644 --- a/src/service/network_tables.h +++ b/src/service/network_tables.h @@ -257,6 +257,8 @@ namespace ccf Tables::SELF_HEALING_OPEN_VOTES}; const SelfHealingOpenSMState self_healing_open_sm_state = { Tables::SELF_HEALING_OPEN_SM_STATE}; + const SelfHealingOpenSMState self_healing_open_timeout_sm_state = { + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE}; inline auto get_all_internal_tables() const { From 107317741fa3dc85d40fe5ce70de38eaf581cd68 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 12 Aug 2025 11:11:36 +0100 Subject: [PATCH 048/197] curl client fixes --- tests/infra/clients.py | 
11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/infra/clients.py b/tests/infra/clients.py index 6a5b492f64b7..0e80e1b49822 100644 --- a/tests/infra/clients.py +++ b/tests/infra/clients.py @@ -486,7 +486,10 @@ def __init__( assert signing_auth is None, signing_auth self.cose_signing_auth = cose_signing_auth self.common_headers = common_headers or {} - self.ca_curve = get_curve(self.ca) + if self.ca: + self.ca_curve = get_curve(self.ca) + else: + self.ca_curve = None self.protocol = kwargs.get("protocol") if "protocol" in kwargs else "https" self.extra_args = [] if kwargs.get("http2"): @@ -579,6 +582,8 @@ def request( if self.session_auth: cmd.extend(["--key", self.session_auth.key]) cmd.extend(["--cert", self.session_auth.cert]) + if not self.ca and not self.session_auth: + cmd.extend(["-k"]) # Allow insecure connections for arg in self.extra_args: cmd.append(arg) @@ -600,9 +605,11 @@ def request( if rc.returncode != 0: if rc.returncode in [ + 7, 35, + 55, 60, - ]: # PEER_FAILED_VERIFICATION, SSL_CONNECT_ERROR + ]: # COULDNT_CONNECT, PEER_FAILED_VERIFICATION, SEND_ERROR, SSL_CONNECT_ERROR raise CCFConnectionException if rc.returncode == 28: # OPERATION_TIMEDOUT raise TimeoutError From 9d95055f13fb5b960caae9e4a5e060e3d47e00cd Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 12 Aug 2025 11:15:09 +0100 Subject: [PATCH 049/197] Update network to better integrate with volatile node identities --- tests/infra/network.py | 88 ++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/tests/infra/network.py b/tests/infra/network.py index 12ffa314620c..d3b9a1e769a6 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -5,7 +5,7 @@ from contextlib import contextmanager from enum import Enum, IntEnum, auto -from infra.clients import flush_info +from infra.clients import flush_info, CCFConnectionException import infra.member import infra.path import infra.proc @@ -814,54 +814,46 
@@ def start_in_self_healing_open( ] for i, node in enumerate(self.nodes): - forwarded_args_with_overrides = forwarded_args.copy() - forwarded_args_with_overrides.update( - self.per_node_args_override.get(i, {}) - ) - if not start_all_nodes and i > 0: - break + forwarded_args_with_overrides = forwarded_args.copy() + forwarded_args_with_overrides.update( + self.per_node_args_override.get(i, {}) + ) + if not start_all_nodes and i > 0: + break - try: - node_kwargs = { - "lib_name": args.package, - "workspace": args.workspace, - "label": args.label, - "common_dir": self.common_dir, - } - self_healing_open_kwargs = {"self_healing_open_addresses": self_healing_open_addresses} - # If a kwarg is passed in override automatically set variants - node_kwargs = node_kwargs | self_healing_open_kwargs | forwarded_args_with_overrides | kwargs - node.recover(**node_kwargs) - self.wait_for_state( - node, - infra.node.State.PART_OF_PUBLIC_NETWORK.value, - timeout=args.ledger_recovery_timeout, - ) - except Exception: - LOG.exception(f"Failed to start node {node.local_node_id}") - raise + try: + node_kwargs = { + "lib_name": args.package, + "workspace": args.workspace, + "label": args.label, + "common_dir": self.common_dir, + } + self_healing_open_kwargs = {"self_healing_open_addresses": self_healing_open_addresses} + # If a kwarg is passed in override automatically set variants + node_kwargs = node_kwargs | self_healing_open_kwargs | forwarded_args_with_overrides | kwargs + node.recover(**node_kwargs) + except Exception: + LOG.exception(f"Failed to start node {node.local_node_id}") + raise self.election_duration = args.election_timeout_ms / 1000 self.observed_election_duration = self.election_duration + 1 - LOG.info("All nodes started") - - primary, _ = self.find_primary( - timeout=args.ledger_recovery_timeout - ) + for i, node in enumerate(self.nodes): + while True: + try: + self.wait_for_states( + node, + [infra.node.State.PART_OF_PUBLIC_NETWORK.value, 
infra.node.State.PART_OF_NETWORK], + timeout=args.ledger_recovery_timeout, + verify_ca=False, # Certs are volatile until the recovery is complete + ) + break + except CCFConnectionException: + time.sleep(0.1) - if set_authenticate_session is not None: - self.consortium.set_authenticate_session(set_authenticate_session) + LOG.info("All nodes started") - for node in self.get_joined_nodes(): - self.wait_for_state( - node, - infra.node.State.PART_OF_PUBLIC_NETWORK.value, - timeout=args.ledger_recovery_timeout, - ) - # Catch-up in recovery can take a long time, so extend this timeout - self.wait_for_all_nodes_to_commit(primary=primary, timeout=20) - LOG.success("All nodes joined public network") def recover( self, @@ -1312,12 +1304,12 @@ def get_live_nodes(self): def get_f(self): return infra.e2e_args.max_f(self.args, len(self.nodes)) - def wait_for_states(self, node, states, timeout=3): + def wait_for_states(self, node, states, timeout=3, **client_kwargs): end_time = time.time() + timeout final_state = None while time.time() < end_time: try: - with node.client(connection_timeout=timeout) as c: + with node.client(connection_timeout=timeout, **client_kwargs) as c: r = c.get("/node/state").body.json() if r["state"] in states: final_state = r["state"] @@ -1335,16 +1327,18 @@ def wait_for_states(self, node, states, timeout=3): def wait_for_state(self, node, state, timeout=3): self.wait_for_states(node, [state], timeout=timeout) - def wait_for_statuses(self, node, statuses, timeout=3): + def wait_for_statuses(self, node, statuses, timeout=3, **client_kwargs): end_time = time.time() + timeout while time.time() < end_time: try: - with node.client(connection_timeout=timeout) as c: + with node.client(connection_timeout=timeout, **client_kwargs) as c: r = c.get("/node/network").body.json() if r["service_status"] in statuses: break except ConnectionRefusedError: pass + except CCFConnectionException: + pass time.sleep(0.1) else: raise TimeoutError( @@ -1846,7 +1840,7 @@ def 
refresh_service_identity_file(self, args): connections pick up the new service certificate. """ primary = self.find_random_node() - with primary.client() as c: + with primary.client(verify_ca=False) as c: r = c.get("/node/network") assert r.status_code == 200, r new_service_identity = r.body.json()["service_certificate"] From b3a1f9becfbafd16ea2e1d394c54413f6e25c47d Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 12 Aug 2025 11:17:32 +0100 Subject: [PATCH 050/197] fmt --- include/ccf/service/tables/self_heal_open.h | 1 - src/enclave/interface.h | 3 ++- src/host/handle_ring_buffer.h | 8 +++++--- src/host/main.cpp | 3 +-- src/http/curl.h | 12 +++++++----- src/node/node_state.h | 3 +-- src/node/quote_endorsements_client.h | 8 +++++--- src/node/rpc/node_interface.h | 3 ++- src/node/rpc/node_operation.h | 6 +++--- src/node/rpc/node_operation_interface.h | 6 +++--- src/node/rpc/test/node_stub.h | 6 +++--- 11 files changed, 32 insertions(+), 27 deletions(-) diff --git a/include/ccf/service/tables/self_heal_open.h b/include/ccf/service/tables/self_heal_open.h index 22819d6bc5f4..7461b113f1ee 100644 --- a/include/ccf/service/tables/self_heal_open.h +++ b/include/ccf/service/tables/self_heal_open.h @@ -6,7 +6,6 @@ #include "ccf/ds/json.h" #include "ccf/ds/quote_info.h" #include "ccf/service/map.h" -#include "node/identity.h" using IntrinsicIdentifier = std::string; diff --git a/src/enclave/interface.h b/src/enclave/interface.h index 7142d4bcb6f8..1f6ab391dd09 100644 --- a/src/enclave/interface.h +++ b/src/enclave/interface.h @@ -51,7 +51,8 @@ DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::stop_notice); DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::stopped); DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::tick); DECLARE_RINGBUFFER_MESSAGE_PAYLOAD(AdminMessage::work_stats, std::string); -DECLARE_RINGBUFFER_MESSAGE_PAYLOAD(AdminMessage::restart_and_join, std::string, std::string); +DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( + AdminMessage::restart_and_join, 
std::string, std::string); /// Messages sent from app endpoints enum AppMessage : ringbuffer::Message diff --git a/src/host/handle_ring_buffer.h b/src/host/handle_ring_buffer.h index eae61848a516..f618fe5dd5c2 100644 --- a/src/host/handle_ring_buffer.h +++ b/src/host/handle_ring_buffer.h @@ -106,9 +106,11 @@ namespace asynchost bp, AdminMessage::restart_and_join, [&](const uint8_t* data, size_t size) { - auto [url, service_identity] = ringbuffer::read_message( - data, size); - ccf::SelfHealingOpenSingleton::instance()->trigger_restart_and_join_url(url, service_identity); + auto [url, service_identity] = + ringbuffer::read_message( + data, size); + ccf::SelfHealingOpenSingleton::instance() + ->trigger_restart_and_join_url(url, service_identity); }); } diff --git a/src/host/main.cpp b/src/host/main.cpp index 8d83a61c9fa5..e973c5e72233 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -1058,8 +1058,7 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) auto new_config_json = config_json; new_config_json["command"] = command_config; - new_config_json["output_files"]["pid_file"] = - "self_healing_open_join.pid"; + new_config_json["output_files"]["pid_file"] = "self_healing_open_join.pid"; files::dump( new_config_json.dump(), diff --git a/src/http/curl.h b/src/http/curl.h index 14b91d61d6e4..85e061a4fd65 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -329,15 +329,18 @@ namespace ccf::curl case HTTP_HEAD: CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_NOBODY, 1L); break; - case HTTP_PUT: { + case HTTP_PUT: + { CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); if (request_body == nullptr) { - // If no request body is provided, curl will try reading from stdin, which causes a blockage - request_body = std::make_unique(std::vector()); + // If no request body is provided, curl will try reading from stdin, + // which causes a blockage + request_body = + std::make_unique(std::vector()); } } - break; + break; case HTTP_POST: // libcurl sets 
the post verb when CURLOPT_POSTFIELDS is set, so we // skip doing so here, and we assume that the user has already set @@ -363,7 +366,6 @@ namespace ccf::curl { CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); } - } void handle_response(CURLcode curl_response_code) diff --git a/src/node/node_state.h b/src/node/node_state.h index b4c084557df7..39f9c25a0d00 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2025,8 +2025,7 @@ namespace ccf auto sm_state = sm_state_handle->get().value(); // Keep doing this until the node is no longer in recovery - if ( - sm_state == SelfHealingOpenSM::OPEN) + if (sm_state == SelfHealingOpenSM::OPEN) { LOG_INFO_FMT("Self-healing-open complete, stopping timers."); return; diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 73d52a50995a..d8ee731b7c5e 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -212,7 +212,9 @@ namespace ccf curl_easy_strerror(curl_response), curl_response, status_code); - if (curl_response == CURLE_OK && status_code == HTTP_STATUS_TOO_MANY_REQUESTS) + if ( + curl_response == CURLE_OK && + status_code == HTTP_STATUS_TOO_MANY_REQUESTS) { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; @@ -315,8 +317,8 @@ namespace ccf "Fetching endorsements for attestation report at {}", request->get_url()); - curl::CurlmLibuvContextSingleton::get_instance_unsafe() - ->attach_request(request); + curl::CurlmLibuvContextSingleton::get_instance_unsafe()->attach_request( + request); } public: diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index a6cb8d792f3f..b7715616dcc9 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -65,7 +65,8 @@ namespace ccf virtual size_t get_jwt_attempts() = 0; virtual ccf::crypto::Pem get_self_signed_certificate() = 0; virtual const ccf::COSESignaturesConfig& 
get_cose_signatures_config() = 0; - virtual void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) = 0; + virtual void self_healing_open_try_start_timers( + ccf::kv::Tx& tx, bool recovering) = 0; virtual void self_healing_open_advance(ccf::kv::Tx&, bool) = 0; virtual const ccf::StartupConfig& get_node_config() const = 0; virtual ccf::crypto::Pem get_network_cert() = 0; diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index 7271b0b34a53..7338134bc56d 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -110,13 +110,13 @@ namespace ccf return impl.get_cose_signatures_config(); } - void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) override + void self_healing_open_try_start_timers( + ccf::kv::Tx& tx, bool recovering) override { impl.self_healing_open_try_start_timers(tx, recovering); } - void self_healing_open_advance( - ccf::kv::Tx& tx, bool is_recovery) override + void self_healing_open_advance(ccf::kv::Tx& tx, bool is_recovery) override { impl.self_healing_open_advance(tx, is_recovery); } diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index 8cf3adee98be..010ff707b117 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ -62,8 +62,8 @@ namespace ccf virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; - virtual void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) = 0; - virtual void self_healing_open_advance( - ccf::kv::Tx&, bool) = 0; + virtual void self_healing_open_try_start_timers( + ccf::kv::Tx& tx, bool recovering) = 0; + virtual void self_healing_open_advance(ccf::kv::Tx&, bool) = 0; }; } \ No newline at end of file diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index 86cf9e4ff569..9cf7ac897dbf 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -111,13 +111,13 @@ 
namespace ccf return cose_signatures_config; } - void self_healing_open_try_start_timers(ccf::kv::Tx& tx, bool recovering) override + void self_healing_open_try_start_timers( + ccf::kv::Tx& tx, bool recovering) override { // No-op for stub } - void self_healing_open_advance( - ccf::kv::Tx& tx, bool timeout) override + void self_healing_open_advance(ccf::kv::Tx& tx, bool timeout) override { // No-op for stub } From 39da991687c79275442c84c438c8271a2b3e692f Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 11:13:38 +0100 Subject: [PATCH 051/197] Changes to curl to make it close carefully --- src/host/main.cpp | 1 + src/http/curl.h | 127 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 95 insertions(+), 33 deletions(-) diff --git a/src/host/main.cpp b/src/host/main.cpp index e973c5e72233..054b6b23b5b5 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -1008,6 +1008,7 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) } process_launcher.stop(); + curl_libuv_context.stop(); // Continue running the loop long enough for the on_close // callbacks to be despatched, so as to avoid memory being diff --git a/src/http/curl.h b/src/http/curl.h index 85e061a4fd65..86510b597a05 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -108,6 +108,15 @@ namespace ccf::curl } } + CURLM* release() + { + if (!p) + { + return p.release(); + } + return nullptr; + } + operator CURLM*() const { return p.get(); @@ -420,15 +429,7 @@ namespace ccf::curl class CurlRequestCURLM { private: - CURLM* curl_multi; - - CurlRequestCURLM(CURLM* curl_multi) : curl_multi(curl_multi) - { - if (curl_multi == nullptr) - { - throw std::runtime_error("CURLM handle cannot be null"); - } - } + UniqueCURLM curl_multi; public: [[nodiscard]] CURLM* get() const @@ -447,15 +448,6 @@ namespace ccf::curl CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi, curl_handle); } - static CurlRequestCURLM create_unsafe(CURLM* curl_multi) - { - if (curl_multi == nullptr) - { 
- throw std::runtime_error("CURLM handle cannot be null"); - } - return {curl_multi}; - } - int perform() { int running_handles = 0; @@ -491,6 +483,11 @@ namespace ccf::curl } while (msgq > 0); return running_handles; } + + CURLM* release() + { + return curl_multi.release(); + } }; class CurlmLibuvContext @@ -520,10 +517,10 @@ namespace ccf::curl private: uv_loop_t* loop; uv_timer_t timeout_tracker{}; - // lifetime handler of curl_multi interface - UniqueCURLM curl_multi; - // utility class to enforce type safety on accesses to curl_multi + // utility class to enforce type safety on accesses to curl_multi wrapping a + // UniqueCURLM CurlRequestCURLM curl_request_curlm; + // We need a lock to prevent a client thread calling curl_multi_add_handle // while the libuv thread is processing a curl callback // @@ -540,6 +537,53 @@ namespace ccf::curl }; public: + // Stop all curl transfers and remove handles from libuv + void stop() + { + std::lock_guard lock(curlm_lock); + LOG_INFO_FMT("Stopping curl transfers and removing handles from libuv"); + if (curl_request_curlm.get() == nullptr) + { + throw std::logic_error( + "Cannot stop curl transfers on a null CURLM handle"); + } + // Stop all curl easy handles + { + CURL** easy_handles = curl_multi_get_handles(curl_request_curlm.get()); + for (int i = 0; easy_handles[i] != nullptr; ++i) + { + auto* easy = easy_handles[i]; + curl_multi_remove_handle(curl_request_curlm.get(), easy); + if (easy != nullptr) + { + // attach a lifetime to the request + ccf::curl::CurlRequest* request = nullptr; + curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); + if (request == nullptr) + { + LOG_FAIL_FMT( + "CURLMSG_DONE received with no associated request data"); + } + std::unique_ptr request_data_ptr(request); + curl_multi_remove_handle(curl_request_curlm.get(), easy); + curl_easy_cleanup(easy); + } + } + curl_free(easy_handles); + auto* curlm = curl_request_curlm.release(); + if (curlm != nullptr) + { + // calls socket callbacks to remove 
the handles from libuv + LOG_INFO_FMT("Cleaning up CURLM handle"); + curl_multi_cleanup(curlm); + } + } + + // There should be no more sockets from curl in libuv, so we can stop the + // timeout + uv_close(reinterpret_cast(&timeout_tracker), nullptr); + } + void handle_request_messages() { curl_request_curlm.perform(); @@ -555,10 +599,16 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); + if (self->curl_request_curlm.get() == nullptr) + { + LOG_FAIL_FMT("libuv_timeout_callback called with null CURLM handle"); + return; + } + int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - self->curl_multi, + self->curl_request_curlm.get(), CURL_SOCKET_TIMEOUT, 0, &running_handles); @@ -615,13 +665,20 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); + if (self->curl_request_curlm.get() == nullptr) + { + LOG_FAIL_FMT( + "libuv_socket_poll_callback called with null CURLM handle"); + return; + } + int action = 0; action |= ((events & UV_READABLE) != 0) ? CURL_CSELECT_IN : 0; action |= ((events & UV_WRITABLE) != 0) ? 
CURL_CSELECT_OUT : 0; int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - self->curl_multi, + self->curl_request_curlm.get(), request_context->socket, action, &running_handles); @@ -660,7 +717,10 @@ namespace ccf::curl // attach the lifetime to the socket handle request_context = request_context_ptr.release(); CHECK_CURL_MULTI( - curl_multi_assign, self->curl_multi, s, request_context); + curl_multi_assign, + self->curl_request_curlm.get(), + s, + request_context); } int events = 0; @@ -673,10 +733,12 @@ namespace ccf::curl case CURL_POLL_REMOVE: if (request_context != nullptr) { + LOG_TRACE_FMT( + "Removing socket {} from libuv", request_context->socket); uv_poll_stop(&request_context->poll_handle); std::unique_ptr request_context_ptr( request_context); - curl_multi_assign(self->curl_multi, s, nullptr); + curl_multi_assign(self->curl_request_curlm.get(), s, nullptr); } break; default: @@ -685,27 +747,26 @@ namespace ccf::curl return 0; } - CurlmLibuvContext(uv_loop_t* loop) : - loop(loop), - curl_request_curlm(CurlRequestCURLM::create_unsafe(curl_multi)) + CurlmLibuvContext(uv_loop_t* loop) : loop(loop) { uv_timer_init(loop, &timeout_tracker); timeout_tracker.data = this; // Attach this instance to the timer // attach timeouts - CHECK_CURL_MULTI(curl_multi_setopt, curl_multi, CURLMOPT_TIMERDATA, this); + CHECK_CURL_MULTI( + curl_multi_setopt, curl_request_curlm.get(), CURLMOPT_TIMERDATA, this); CHECK_CURL_MULTI( curl_multi_setopt, - curl_multi, + curl_request_curlm.get(), CURLMOPT_TIMERFUNCTION, curl_timeout_callback); // attach socket events CHECK_CURL_MULTI( - curl_multi_setopt, curl_multi, CURLMOPT_SOCKETDATA, this); + curl_multi_setopt, curl_request_curlm.get(), CURLMOPT_SOCKETDATA, this); CHECK_CURL_MULTI( curl_multi_setopt, - curl_multi, + curl_request_curlm.get(), CURLMOPT_SOCKETFUNCTION, curl_socket_callback); @@ -713,7 +774,7 @@ namespace ccf::curl int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - curl_multi, + 
curl_request_curlm.get(), CURL_SOCKET_TIMEOUT, 0, &running_handles); From 64e3dc8476845bfdeee993770f5cd50841dcc49d Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 11:15:19 +0100 Subject: [PATCH 052/197] e2e sho test --- tests/e2e_operations.py | 47 +++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 2eed73282b80..b1de080fc541 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1448,8 +1448,11 @@ def run_self_healing_open(args): LOG.info("Start a network and stop it") network.start_and_open(args) old_common = infra.network.get_common_folder_name(args.workspace, args.label) + network.save_service_identity(args) network.stop_all_nodes() + recovery_args = copy.deepcopy(args) + ledger_dirs = {} committed_ledger_dirs = {} for i, node in enumerate(network.nodes): @@ -1459,40 +1462,52 @@ def run_self_healing_open(args): LOG.info("Start a recovery network and stop it") recovered_network = infra.network.Network( - args.nodes, - args.binary_dir, - args.debug_nodes, - args.perf_nodes, + recovery_args.nodes, + recovery_args.binary_dir, + recovery_args.debug_nodes, + recovery_args.perf_nodes, existing_network=network, ) - args.previous_service_identity_file = os.path.join( - old_common, "service_cert.pem" - ) recovered_network.start_in_self_healing_open( - args, + recovery_args, ledger_dirs=ledger_dirs, committed_ledger_dirs=committed_ledger_dirs, - common_dir=network.common_dir, ) - # Wait for the first node to be in RecoveryShares - for node in recovered_network.nodes[0:1]: + def cycle(items): + while True: + for item in items: + yield item + + # Wait for any node to be waiting for RecoveryShares, ie it opened + for node in cycle(recovered_network.nodes): + try: recovered_network.wait_for_statuses( node, ["WaitingForRecoveryShares", "Open"], - timeout=30, + timeout=1, + verify_ca=False ) + break + except TimeoutError: + 
LOG.info(f"Failed to get the status of {node.local_node_id}, retrying...") + continue + + recovered_network.refresh_service_identity_file(recovery_args) + recovered_network.consortium.recover_with_shares(recovered_network.find_random_node()) - # Wait for all replicas to report being part of the network - for node in recovered_network.nodes(): - recovered_network.wait_for_state( + # Wait for all replicas to report being part of the opened network + for node in recovered_network.nodes: + recovered_network.wait_for_status( node, - infra.node.State.PART_OF_NETWORK.value, + "Open", timeout=10, ) recovered_network._wait_for_app_open(node) + LOG.info("Completed self-healing open successfully") + recovered_network.stop_all_nodes() def run_self_healing_open_single_replica(args): From bba91ce6c4a77b05cab4a246e721e4878443adc6 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 28 Jul 2025 16:10:36 +0100 Subject: [PATCH 053/197] Fix undefined request body and multi-threaded access to curl --- src/http/curl.h | 56 +++++++++++++++++++--------- src/node/quote_endorsements_client.h | 12 +++--- src/snapshots/fetch.h | 54 ++++++++++++++++++++++++--- 3 files changed, 95 insertions(+), 27 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 7502d34ba8d7..3a9b7837edc8 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -65,15 +66,26 @@ namespace ccf::curl return p.get(); } - void set_blob_opt(auto option, const auto* data, size_t length) + void set_blob_opt(auto option, const uint8_t* data, size_t length) { + if (data == nullptr || length == 0) + { + throw std::invalid_argument( + "Data pointer cannot be null or length zero"); + } + + if (p == nullptr) + { + throw std::logic_error("Cannot set option on a null CURL handle"); + } + struct curl_blob blob { .data = const_cast(data), .len = length, .flags = CURL_BLOB_COPY, }; - CHECK_CURL_EASY_SETOPT(p.get(), option, blob); + 
CHECK_CURL_EASY_SETOPT(p.get(), option, &blob); } void set_opt(auto option, auto value) @@ -270,6 +282,10 @@ namespace ccf::curl class CurlRequest { + public: + using ResponseCallback = std::function; + private: UniqueCURL curl_handle; RESTVerb method = HTTP_GET; @@ -277,8 +293,7 @@ namespace ccf::curl ccf::curl::UniqueSlist headers; std::unique_ptr request_body = nullptr; std::unique_ptr response = nullptr; - std::optional> response_callback = - nullptr; + std::optional response_callback = nullptr; public: CurlRequest( @@ -287,8 +302,7 @@ namespace ccf::curl std::string&& url_, UniqueSlist&& headers_, std::unique_ptr&& request_body_, - std::optional>&& - response_callback_) : + std::optional&& response_callback_) : curl_handle(std::move(curl_handle_)), method(method_), url(std::move(url_)), @@ -345,18 +359,18 @@ namespace ccf::curl } } - void handle_response() + void handle_response(CURLcode curl_response_code) { if (response_callback.has_value()) { long status_code = 0; CHECK_CURL_EASY_GETINFO( curl_handle, CURLINFO_RESPONSE_CODE, &status_code); - response_callback.value()(*this, status_code); + response_callback.value()(*this, curl_response_code, status_code); } } - long syncronous_perform() + void synchronous_perform(CURLcode& curl_code, long& status_code) { if (curl_handle == nullptr) { @@ -364,14 +378,12 @@ namespace ccf::curl "Cannot curl_easy_perform on a null CURL handle"); } - CHECK_CURL_EASY(curl_easy_perform, curl_handle); + curl_code = curl_easy_perform(curl_handle); - handle_response(); // handle the response callback if set + handle_response(curl_code); // handle the response callback if set - long status_code = 0; CHECK_CURL_EASY_GETINFO( curl_handle, CURLINFO_RESPONSE_CODE, &status_code); - return status_code; } [[nodiscard]] CURL* get_easy_handle() const @@ -465,7 +477,7 @@ namespace ccf::curl // detach the easy handle such that it can be cleaned up with the // destructor of CurlRequest curl_multi_remove_handle(curl_multi, easy); - 
request->handle_response(); + request->handle_response(result); } } while (msgq > 0); return running_handles; @@ -503,6 +515,13 @@ namespace ccf::curl UniqueCURLM curl_multi; // utility class to enforce type safety on accesses to curl_multi CurlRequestCURLM curl_request_curlm; + // We need a lock to prevent a client thread calling curl_multi_add_handle + // while the libuv thread is processing a curl callback + // + // Note that since the a client callback can call curl_multi_add_handle, but + // that will be difficult/impossible to detect, we need curlm_lock to be + // recursive. + std::recursive_mutex curlm_lock; struct RequestContext { @@ -525,6 +544,7 @@ namespace ccf::curl throw std::logic_error( "libuv_timeout_callback called with null self pointer"); } + std::lock_guard lock(self->curlm_lock); int running_handles = 0; CHECK_CURL_MULTI( @@ -584,6 +604,7 @@ namespace ccf::curl throw std::logic_error( "libuv_socket_poll_callback called with null self pointer"); } + std::lock_guard lock(self->curlm_lock); int action = 0; action |= ((events & UV_READABLE) != 0) ? CURL_CSELECT_IN : 0; @@ -689,11 +710,12 @@ namespace ccf::curl &running_handles); } - // should this return a reference or a pointer? 
- [[nodiscard]] CurlRequestCURLM& curlm() + void attach_request(std::unique_ptr& request) { - return curl_request_curlm; + std::lock_guard lock(curlm_lock); + curl_request_curlm.attach_curl_request(request); } + }; class CurlmLibuvContextSingleton diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index fd9a44eca6f7..73d52a50995a 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -191,11 +191,12 @@ namespace ccf auto response_callback = ([this, server, endpoint]( curl::CurlRequest& request, + CURLcode curl_response, long status_code) { std::lock_guard guard(this->lock); auto* response = request.get_response(); - if (status_code == HTTP_STATUS_OK) + if (curl_response == CURLE_OK && status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " @@ -207,9 +208,11 @@ namespace ccf } LOG_DEBUG_FMT( - "Error fetching endorsements for attestation report: {}", + "Error fetching endorsements for attestation report: {} ({}) {}", + curl_easy_strerror(curl_response), + curl_response, status_code); - if (status_code == HTTP_STATUS_TOO_MANY_REQUESTS) + if (curl_response == CURLE_OK && status_code == HTTP_STATUS_TOO_MANY_REQUESTS) { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; @@ -313,8 +316,7 @@ namespace ccf request->get_url()); curl::CurlmLibuvContextSingleton::get_instance_unsafe() - ->curlm() - .attach_curl_request(request); + ->attach_request(request); } public: diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index c1367044ada7..c9c431ed41a6 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -69,7 +69,17 @@ namespace snapshots std::nullopt // No response callback ); - const auto status_code = request.syncronous_perform(); + long status_code = 0; + CURLcode curl_response = CURLE_OK; + request.synchronous_perform(curl_response, status_code); + if (curl_response != CURLE_OK) + { 
+ throw std::runtime_error(fmt::format( + "Error fetching snapshot redirect from {}: {} ({})", + request.get_url(), + curl_easy_strerror(curl_response), + status_code)); + } if (status_code == HTTP_STATUS_NOT_FOUND) { LOG_INFO_FMT( @@ -116,8 +126,19 @@ namespace snapshots std::nullopt // No response callback ); - auto snapshot_size_status_code = - snapshot_size_request.syncronous_perform(); + CURLcode snapshot_size_curl_code = CURLE_OK; + long snapshot_size_status_code = 0; + snapshot_size_request.synchronous_perform( + snapshot_size_curl_code, snapshot_size_status_code); + + if (snapshot_size_curl_code != CURLE_OK) + { + throw std::runtime_error(fmt::format( + "Error fetching snapshot size from {}: {} ({})", + snapshot_size_request.get_url(), + curl_easy_strerror(snapshot_size_curl_code), + snapshot_size_status_code)); + } EXPECT_HTTP_RESPONSE_STATUS( snapshot_size_request, snapshot_size_status_code, HTTP_STATUS_OK); @@ -174,6 +195,19 @@ namespace snapshots headers.append( "Range", fmt::format("bytes={}-{}", range_start, range_end)); + auto response_callback = []( + ccf::curl::CurlRequest& request, + CURLcode curl_response_code, + long status_code) { + if (curl_response_code != CURLE_OK) + { + throw std::runtime_error(fmt::format( + "Error fetching snapshot chunk: {} ({})", + curl_easy_strerror(curl_response_code), + status_code)); + } + }; + ccf::curl::CurlRequest snapshot_range_request( std::move(curl_easy), HTTP_GET, @@ -183,8 +217,18 @@ namespace snapshots nullptr // No response callback ); - auto snapshot_range_status_code = - snapshot_range_request.syncronous_perform(); + CURLcode curl_response = CURLE_OK; + long snapshot_range_status_code = 0; + snapshot_range_request.synchronous_perform( + curl_response, snapshot_range_status_code); + if (curl_response != CURLE_OK) + { + throw std::runtime_error(fmt::format( + "Error fetching snapshot chunk range from {}: {} ({})", + snapshot_range_request.get_url(), + curl_easy_strerror(curl_response), + 
snapshot_range_status_code)); + } EXPECT_HTTP_RESPONSE_STATUS( snapshot_range_request, snapshot_range_status_code, From d28af46c71cbf8d6ad93b4e0efbb8099c6f456aa Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 30 Jul 2025 11:46:06 +0100 Subject: [PATCH 054/197] Fix curl put with empty body issue --- src/http/curl.h | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 3a9b7837edc8..14b91d61d6e4 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -316,22 +316,6 @@ namespace ccf::curl } CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_URL, url.c_str()); - if (request_body != nullptr) - { - request_body->attach_to_curl(curl_handle); - } - - if (response_callback.has_value()) - { - response = std::make_unique(); - response->attach_to_curl(curl_handle); - } - - if (headers.get() != nullptr) - { - CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); - } - if (!method.get_http_method().has_value()) { throw std::logic_error( @@ -345,8 +329,14 @@ namespace ccf::curl case HTTP_HEAD: CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_NOBODY, 1L); break; - case HTTP_PUT: + case HTTP_PUT: { CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); + if (request_body == nullptr) + { + // If no request body is provided, curl will try reading from stdin, which causes a blockage + request_body = std::make_unique(std::vector()); + } + } break; case HTTP_POST: // libcurl sets the post verb when CURLOPT_POSTFIELDS is set, so we @@ -357,6 +347,23 @@ namespace ccf::curl throw std::logic_error( fmt::format("Unsupported HTTP method: {}", method.c_str())); } + + if (request_body != nullptr) + { + request_body->attach_to_curl(curl_handle); + } + + if (response_callback.has_value()) + { + response = std::make_unique(); + response->attach_to_curl(curl_handle); + } + + if (headers.get() != nullptr) + { + CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); + } + } 
void handle_response(CURLcode curl_response_code) @@ -715,7 +722,6 @@ namespace ccf::curl std::lock_guard lock(curlm_lock); curl_request_curlm.attach_curl_request(request); } - }; class CurlmLibuvContextSingleton From b4e1d16bef0fbc551e10a07f530daf7cc2359909 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 11:13:38 +0100 Subject: [PATCH 055/197] Changes to curl to make it close carefully --- src/host/main.cpp | 1 + src/http/curl.h | 127 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 95 insertions(+), 33 deletions(-) diff --git a/src/host/main.cpp b/src/host/main.cpp index afe17b77fdab..99283e0a7528 100644 --- a/src/host/main.cpp +++ b/src/host/main.cpp @@ -999,6 +999,7 @@ int main(int argc, char** argv) // NOLINT(bugprone-exception-escape) } process_launcher.stop(); + curl_libuv_context.stop(); // Continue running the loop long enough for the on_close // callbacks to be despatched, so as to avoid memory being diff --git a/src/http/curl.h b/src/http/curl.h index 14b91d61d6e4..398536ee55dd 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -108,6 +108,15 @@ namespace ccf::curl } } + CURLM* release() + { + if (!p) + { + return p.release(); + } + return nullptr; + } + operator CURLM*() const { return p.get(); @@ -418,15 +427,7 @@ namespace ccf::curl class CurlRequestCURLM { private: - CURLM* curl_multi; - - CurlRequestCURLM(CURLM* curl_multi) : curl_multi(curl_multi) - { - if (curl_multi == nullptr) - { - throw std::runtime_error("CURLM handle cannot be null"); - } - } + UniqueCURLM curl_multi; public: [[nodiscard]] CURLM* get() const @@ -445,15 +446,6 @@ namespace ccf::curl CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi, curl_handle); } - static CurlRequestCURLM create_unsafe(CURLM* curl_multi) - { - if (curl_multi == nullptr) - { - throw std::runtime_error("CURLM handle cannot be null"); - } - return {curl_multi}; - } - int perform() { int running_handles = 0; @@ -489,6 +481,11 @@ namespace ccf::curl } while (msgq > 0); return 
running_handles; } + + CURLM* release() + { + return curl_multi.release(); + } }; class CurlmLibuvContext @@ -518,10 +515,10 @@ namespace ccf::curl private: uv_loop_t* loop; uv_timer_t timeout_tracker{}; - // lifetime handler of curl_multi interface - UniqueCURLM curl_multi; - // utility class to enforce type safety on accesses to curl_multi + // utility class to enforce type safety on accesses to curl_multi wrapping a + // UniqueCURLM CurlRequestCURLM curl_request_curlm; + // We need a lock to prevent a client thread calling curl_multi_add_handle // while the libuv thread is processing a curl callback // @@ -538,6 +535,53 @@ namespace ccf::curl }; public: + // Stop all curl transfers and remove handles from libuv + void stop() + { + std::lock_guard lock(curlm_lock); + LOG_INFO_FMT("Stopping curl transfers and removing handles from libuv"); + if (curl_request_curlm.get() == nullptr) + { + throw std::logic_error( + "Cannot stop curl transfers on a null CURLM handle"); + } + // Stop all curl easy handles + { + CURL** easy_handles = curl_multi_get_handles(curl_request_curlm.get()); + for (int i = 0; easy_handles[i] != nullptr; ++i) + { + auto* easy = easy_handles[i]; + curl_multi_remove_handle(curl_request_curlm.get(), easy); + if (easy != nullptr) + { + // attach a lifetime to the request + ccf::curl::CurlRequest* request = nullptr; + curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); + if (request == nullptr) + { + LOG_FAIL_FMT( + "CURLMSG_DONE received with no associated request data"); + } + std::unique_ptr request_data_ptr(request); + curl_multi_remove_handle(curl_request_curlm.get(), easy); + curl_easy_cleanup(easy); + } + } + curl_free(easy_handles); + auto* curlm = curl_request_curlm.release(); + if (curlm != nullptr) + { + // calls socket callbacks to remove the handles from libuv + LOG_INFO_FMT("Cleaning up CURLM handle"); + curl_multi_cleanup(curlm); + } + } + + // There should be no more sockets from curl in libuv, so we can stop the + // timeout + 
uv_close(reinterpret_cast(&timeout_tracker), nullptr); + } + void handle_request_messages() { curl_request_curlm.perform(); @@ -553,10 +597,16 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); + if (self->curl_request_curlm.get() == nullptr) + { + LOG_FAIL_FMT("libuv_timeout_callback called with null CURLM handle"); + return; + } + int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - self->curl_multi, + self->curl_request_curlm.get(), CURL_SOCKET_TIMEOUT, 0, &running_handles); @@ -613,13 +663,20 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); + if (self->curl_request_curlm.get() == nullptr) + { + LOG_FAIL_FMT( + "libuv_socket_poll_callback called with null CURLM handle"); + return; + } + int action = 0; action |= ((events & UV_READABLE) != 0) ? CURL_CSELECT_IN : 0; action |= ((events & UV_WRITABLE) != 0) ? CURL_CSELECT_OUT : 0; int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - self->curl_multi, + self->curl_request_curlm.get(), request_context->socket, action, &running_handles); @@ -658,7 +715,10 @@ namespace ccf::curl // attach the lifetime to the socket handle request_context = request_context_ptr.release(); CHECK_CURL_MULTI( - curl_multi_assign, self->curl_multi, s, request_context); + curl_multi_assign, + self->curl_request_curlm.get(), + s, + request_context); } int events = 0; @@ -671,10 +731,12 @@ namespace ccf::curl case CURL_POLL_REMOVE: if (request_context != nullptr) { + LOG_TRACE_FMT( + "Removing socket {} from libuv", request_context->socket); uv_poll_stop(&request_context->poll_handle); std::unique_ptr request_context_ptr( request_context); - curl_multi_assign(self->curl_multi, s, nullptr); + curl_multi_assign(self->curl_request_curlm.get(), s, nullptr); } break; default: @@ -683,27 +745,26 @@ namespace ccf::curl return 0; } - CurlmLibuvContext(uv_loop_t* loop) : - loop(loop), - curl_request_curlm(CurlRequestCURLM::create_unsafe(curl_multi)) + CurlmLibuvContext(uv_loop_t* loop) : 
loop(loop) { uv_timer_init(loop, &timeout_tracker); timeout_tracker.data = this; // Attach this instance to the timer // attach timeouts - CHECK_CURL_MULTI(curl_multi_setopt, curl_multi, CURLMOPT_TIMERDATA, this); + CHECK_CURL_MULTI( + curl_multi_setopt, curl_request_curlm.get(), CURLMOPT_TIMERDATA, this); CHECK_CURL_MULTI( curl_multi_setopt, - curl_multi, + curl_request_curlm.get(), CURLMOPT_TIMERFUNCTION, curl_timeout_callback); // attach socket events CHECK_CURL_MULTI( - curl_multi_setopt, curl_multi, CURLMOPT_SOCKETDATA, this); + curl_multi_setopt, curl_request_curlm.get(), CURLMOPT_SOCKETDATA, this); CHECK_CURL_MULTI( curl_multi_setopt, - curl_multi, + curl_request_curlm.get(), CURLMOPT_SOCKETFUNCTION, curl_socket_callback); @@ -711,7 +772,7 @@ namespace ccf::curl int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - curl_multi, + curl_request_curlm.get(), CURL_SOCKET_TIMEOUT, 0, &running_handles); From 5626b86dd16f41c78812972ca78c9e2e246fbbe1 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 14:20:10 +0100 Subject: [PATCH 056/197] Stop passing the singleton over the enclave boundary --- src/common/configuration.h | 2 -- src/enclave/main.cpp | 15 --------------- src/host/run.cpp | 3 --- 3 files changed, 20 deletions(-) diff --git a/src/common/configuration.h b/src/common/configuration.h index be2c3f389160..c55dc73edef1 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -41,8 +41,6 @@ struct EnclaveConfig ringbuffer::Offsets* from_enclave_buffer_offsets; oversized::WriterConfig writer_config = {}; - - ccf::curl::CurlmLibuvContext* curl_libuv_context; }; static constexpr auto node_to_node_interface_name = "node_to_node_interface"; diff --git a/src/enclave/main.cpp b/src/enclave/main.cpp index 93c0ed4fe0dd..ab727c22e6de 100644 --- a/src/enclave/main.cpp +++ b/src/enclave/main.cpp @@ -63,21 +63,6 @@ extern "C" auto writer_factory = std::make_unique( *basic_writer_factory, enclave_config.writer_config); - 
auto& curl_context = - ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe(); - if (curl_context != nullptr) - { - LOG_FAIL_FMT("Curl context singleton already initialized"); - return CreateNodeStatus::InternalError; - } - if (enclave_config.curl_libuv_context == nullptr) - { - LOG_FAIL_FMT("Enclave config curl context is null"); - return CreateNodeStatus::InternalError; - } - curl_context = enclave_config.curl_libuv_context; - - { num_pending_threads = (uint16_t)num_worker_threads + 1; diff --git a/src/host/run.cpp b/src/host/run.cpp index 365042af9208..639f2f5778f4 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -597,9 +597,6 @@ namespace ccf enclave_config.writer_config = writer_config; - enclave_config.curl_libuv_context = - &ccf::curl::CurlmLibuvContextSingleton::get_instance(); - ccf::StartupConfig startup_config(config); if (startup_config.attestation.snp_security_policy_file.has_value()) From 7b72ea7dad0a92e5adf3006dc78c34dfa2d03c31 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 15:50:40 +0100 Subject: [PATCH 057/197] refactor and format curl response interface --- src/http/curl.h | 24 +++++++++++------------- src/node/quote_endorsements_client.h | 18 ++++++++++-------- src/snapshots/fetch.h | 26 ++++++++++++-------------- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 398536ee55dd..c82393b66682 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -301,7 +301,7 @@ namespace ccf::curl std::string url; ccf::curl::UniqueSlist headers; std::unique_ptr request_body = nullptr; - std::unique_ptr response = nullptr; + ccf::curl::Response response; std::optional response_callback = nullptr; public: @@ -338,15 +338,18 @@ namespace ccf::curl case HTTP_HEAD: CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_NOBODY, 1L); break; - case HTTP_PUT: { + case HTTP_PUT: + { CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); if (request_body == nullptr) { - // If no request body is 
provided, curl will try reading from stdin, which causes a blockage - request_body = std::make_unique(std::vector()); + // If no request body is provided, curl will try reading from stdin, + // which causes a blockage + request_body = + std::make_unique(std::vector()); } } - break; + break; case HTTP_POST: // libcurl sets the post verb when CURLOPT_POSTFIELDS is set, so we // skip doing so here, and we assume that the user has already set @@ -362,17 +365,12 @@ namespace ccf::curl request_body->attach_to_curl(curl_handle); } - if (response_callback.has_value()) - { - response = std::make_unique(); - response->attach_to_curl(curl_handle); - } + response.attach_to_curl(curl_handle); if (headers.get() != nullptr) { CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_HTTPHEADER, headers.get()); } - } void handle_response(CURLcode curl_response_code) @@ -417,9 +415,9 @@ namespace ccf::curl return url; } - [[nodiscard]] Response* get_response() const + [[nodiscard]] ccf::curl::Response& get_response() { - return response.get(); + return response; } }; diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 73d52a50995a..92ed7ebb39fd 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -194,16 +194,16 @@ namespace ccf CURLcode curl_response, long status_code) { std::lock_guard guard(this->lock); - auto* response = request.get_response(); + auto response = request.get_response(); if (curl_response == CURLE_OK && status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " "{} bytes", - response->buffer.size()); + response.buffer.size()); - handle_success_response(std::move(response->buffer), endpoint); + handle_success_response(std::move(response.buffer), endpoint); return; } @@ -212,12 +212,14 @@ namespace ccf curl_easy_strerror(curl_response), curl_response, status_code); - if (curl_response == CURLE_OK && status_code == HTTP_STATUS_TOO_MANY_REQUESTS) 
+ if ( + curl_response == CURLE_OK && + status_code == HTTP_STATUS_TOO_MANY_REQUESTS) { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; - auto h = response->headers.find(http::headers::RETRY_AFTER); - if (h != response->headers.end()) + auto h = response.headers.find(http::headers::RETRY_AFTER); + if (h != response.headers.end()) { const auto& retry_after_value = h->second; // If value is invalid, retry_after_s is unchanged @@ -315,8 +317,8 @@ namespace ccf "Fetching endorsements for attestation report at {}", request->get_url()); - curl::CurlmLibuvContextSingleton::get_instance_unsafe() - ->attach_request(request); + curl::CurlmLibuvContextSingleton::get_instance_unsafe()->attach_request( + request); } public: diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index c9c431ed41a6..59363e75d861 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -86,15 +86,12 @@ namespace snapshots "Peer has no snapshot newer than {}", latest_local_snapshot); return std::nullopt; } - if (status_code != HTTP_STATUS_PERMANENT_REDIRECT) - { - EXPECT_HTTP_RESPONSE_STATUS( - request, status_code, HTTP_STATUS_PERMANENT_REDIRECT); - } + EXPECT_HTTP_RESPONSE_STATUS( + request, status_code, HTTP_STATUS_PERMANENT_REDIRECT); - auto* response = request.get_response(); - auto location_it = response->headers.find(ccf::http::headers::LOCATION); - if (location_it == response->headers.end()) + auto& response = request.get_response(); + auto location_it = response.headers.find(ccf::http::headers::LOCATION); + if (location_it == response.headers.end()) { throw std::runtime_error(fmt::format( "Expected {} header in redirect response from {} {}, none found", @@ -143,12 +140,12 @@ namespace snapshots EXPECT_HTTP_RESPONSE_STATUS( snapshot_size_request, snapshot_size_status_code, HTTP_STATUS_OK); - auto* snapshot_size_response = snapshot_size_request.get_response(); + auto snapshot_size_response = snapshot_size_request.get_response(); - auto 
content_size_it = snapshot_size_response->headers.find( + auto content_size_it = snapshot_size_response.headers.find( ccf::http::headers::CONTENT_LENGTH); - if (content_size_it == snapshot_size_response->headers.end()) + if (content_size_it == snapshot_size_response.headers.end()) { throw std::runtime_error(fmt::format( "Expected {} header in response from {} {}, none found", @@ -199,6 +196,7 @@ namespace snapshots ccf::curl::CurlRequest& request, CURLcode curl_response_code, long status_code) { + (void)request; if (curl_response_code != CURLE_OK) { throw std::runtime_error(fmt::format( @@ -240,13 +238,13 @@ namespace snapshots snapshot_range_request.get_url(), snapshot_range_status_code); - auto* snapshot_range_response = snapshot_range_request.get_response(); + auto snapshot_range_response = snapshot_range_request.get_response(); // This is an extra copy which would be good to avoid, but avoiding it // with the current response interface is very messy... memcpy( snapshot.data() + range_start, - snapshot_range_response->buffer.data(), - snapshot_range_response->buffer.size()); + snapshot_range_response.buffer.data(), + snapshot_range_response.buffer.size()); if (range_end == content_size) { From 54f0823cdacbd19faaf2eea42677e46cfc8b0b7c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 19:04:22 +0100 Subject: [PATCH 058/197] Add and fix for e2e test --- CMakeLists.txt | 10 +++ cmake/common.cmake | 2 +- src/host/run.cpp | 3 + src/http/curl.h | 2 - src/http/test/curl_test.cpp | 141 ++++++++++++++++++++++++++++++++++++ tests/e2e_curl.py | 54 ++++++++++++++ tests/requirements.txt | 3 +- 7 files changed, 211 insertions(+), 4 deletions(-) create mode 100644 src/http/test/curl_test.cpp create mode 100644 tests/e2e_curl.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 3291b3cbb1de..5bb2107ff3ee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1216,6 +1216,16 @@ if(BUILD_TESTS) NAME historical_query_cache_test PYTHON_SCRIPT 
${CMAKE_SOURCE_DIR}/tests/historical_query_cache.py ) + + add_test_bin( + curl_test ${CMAKE_CURRENT_SOURCE_DIR}/src/http/test/curl_test.cpp + ) + target_link_libraries(curl_test PRIVATE curl uv http_parser) + + add_e2e_test( + NAME e2e_curl + PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_curl.py + ) endif() endif() diff --git a/cmake/common.cmake b/cmake/common.cmake index 74780db4319a..2ee35d60eb99 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -53,7 +53,7 @@ endfunction() function(add_test_bin name) add_executable(${name} ${CCF_DIR}/src/enclave/thread_local.cpp ${ARGN}) target_compile_options(${name} PRIVATE ${COMPILE_LIBCXX}) - target_include_directories(${name} PRIVATE src ${CCFCRYPTO_INC}) + target_include_directories(${name} PRIVATE src ${CCFCRYPTO_INC} ${CCF_DIR}/3rdparty/test) enable_coverage(${name}) target_link_libraries(${name} PRIVATE ${LINK_LIBCXX} ccfcrypto) add_san(${name}) diff --git a/src/host/run.cpp b/src/host/run.cpp index 639f2f5778f4..b0316d340dc2 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -359,6 +360,7 @@ namespace ccf files::dump(fmt::format("{}", ::getpid()), config.output_files.pid_file); // Initialise curlm libuv interface + curl_global_init(CURL_GLOBAL_DEFAULT); ccf::curl::CurlmLibuvContext curl_libuv_context(uv_default_loop()); ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe() = &curl_libuv_context; @@ -1047,6 +1049,7 @@ namespace ccf LOG_FAIL_FMT( "Failed to close uv loop cleanly: {}", uv_err_name(loop_close_rc)); } + curl_global_cleanup(); ccf::crypto::openssl_sha256_shutdown(); return loop_close_rc; diff --git a/src/http/curl.h b/src/http/curl.h index c82393b66682..6d998ab38761 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -729,8 +729,6 @@ namespace ccf::curl case CURL_POLL_REMOVE: if (request_context != nullptr) { - LOG_TRACE_FMT( - "Removing socket {} from libuv", request_context->socket); 
uv_poll_stop(&request_context->poll_handle); std::unique_ptr request_context_ptr( request_context); diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp new file mode 100644 index 000000000000..19fc54d5034b --- /dev/null +++ b/src/http/test/curl_test.cpp @@ -0,0 +1,141 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. + +#define CCF_LOGGER_NO_DEPRECATE + +#include "ccf/ds/json.h" +#include "ccf/ds/logger.h" +#include "curl/curl.h" +#include "http/curl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DOCTEST_CONFIG_IMPLEMENT +#include + +struct Data +{ + std::string foo; + std::string bar; + uint8_t iter = 0; +}; + +DECLARE_JSON_TYPE(Data); +DECLARE_JSON_REQUIRED_FIELDS(Data, foo, bar, iter); + +constexpr size_t number_requests = 1000; + +TEST_CASE("Synchronous") +{ + Data data = {.foo = "alpha", .bar = "beta"}; + size_t response_count = 0; + constexpr size_t sync_number_requests = number_requests / 10; + for (int i = 0; i < sync_number_requests; ++i) + { + data.iter = i; + std::string url = fmt::format("http://localhost:8080/{}", i); + auto body = std::make_unique(data); + + auto headers = ccf::curl::UniqueSlist(); + headers.append("Content-Type", "application/json"); + + auto curl_handle = ccf::curl::UniqueCURL(); + + auto request = std::make_unique( + std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + std::move(body), + std::nullopt); + + CURLcode curl_code = CURLE_OK; + long status_code = 0; + + request->synchronous_perform(curl_code, status_code); + constexpr size_t HTTP_SUCCESS = 200; + if (curl_code == CURLE_OK && status_code == HTTP_SUCCESS) + { + response_count++; + } + } + REQUIRE(response_count == sync_number_requests); +} + +static size_t response_count = 0; + +TEST_CASE("CurlmLibuvContext") +{ + auto load_generator = [](uv_work_t* req) { + thread_local std::random_device rd; + thread_local 
std::mt19937 gen(rd()); + constexpr size_t max_delay_ms = 10; + thread_local std::uniform_int_distribution<> uniform_dist(1, max_delay_ms); + (void)req; + Data data = {.foo = "alpha", .bar = "beta"}; + for (int i = 0; i < number_requests; ++i) + { + auto delay = uniform_dist(gen); + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + + data.iter = i; + std::string url = fmt::format("http://localhost:8080/{}", i); + auto body = std::make_unique(data); + + auto headers = ccf::curl::UniqueSlist(); + headers.append("Content-Type", "application/json"); + + auto curl_handle = ccf::curl::UniqueCURL(); + + auto response_callback = []( + ccf::curl::CurlRequest& request, + CURLcode curl_response, + long status_code) { + (void)request; + constexpr size_t HTTP_SUCCESS = 200; + if (curl_response == CURLE_OK && status_code == HTTP_SUCCESS) + { + response_count++; + } + }; + + auto request = std::make_unique( + std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + std::move(body), + std::move(response_callback)); + + ccf::curl::CurlmLibuvContextSingleton::get_instance().attach_request( + request); + } + }; + + ccf::curl::CurlmLibuvContext context(uv_default_loop()); + ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe() = &context; + + uv_work_t work_req; + uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); + uv_run(uv_default_loop(), UV_RUN_DEFAULT); + REQUIRE(response_count == number_requests); +} + +int main(int argc, char** argv) +{ + ccf::logger::config::default_init(); + curl_global_init(CURL_GLOBAL_DEFAULT); + doctest::Context context; + context.applyCommandLine(argc, argv); + int res = context.run(); + curl_global_cleanup(); + return res; +} diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py new file mode 100644 index 000000000000..b02301bdbcf6 --- /dev/null +++ b/tests/e2e_curl.py @@ -0,0 +1,54 @@ +from aiohttp import web +import json +from datetime import datetime, UTC +import asyncio +import random + 
+async def echo_handler(request): + # Extract headers as list of [name, value] pairs + headers = [[name, value] for name, value in request.headers.items()] + + # Read body + body = await request.text() + + time_received = datetime.now(UTC) + + # Add random delay between 0 and 1 second + delay = random.random() / 100 # Returns float between 0.0 and 1.0 + await asyncio.sleep(delay) + + # Build response data + response_data = { + "headers": headers, + "body": body, + "metadata": { + "method": request.method, + "path": request.path_qs, + "timestamp": time_received.isoformat(), + "delay_seconds": delay + } + } + + return web.json_response(response_data) + +async def main(): + app = web.Application() + app.router.add_route('*', '/{path:.*}', echo_handler) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, '127.0.0.1', 8080) + await site.start() + + print('Echo server running on http://127.0.0.1:8080') + + # call ./curl_test to run the load generator + cmd = "./curl_test" + process = await asyncio.create_subprocess_shell(cmd) + await process.wait() + + exit(process.returncode) + +if __name__ == '__main__': + import asyncio + asyncio.run(main()) \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index 59dc817f413d..0688c150e9bc 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -24,4 +24,5 @@ prettytable==3.* polars plotext boofuzz -numpy<2 \ No newline at end of file +numpy<2 +aiohttp \ No newline at end of file From e55f9df24e803a3c0b6c8ffb4a9b467c6fc82175 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 19:05:21 +0100 Subject: [PATCH 059/197] Add license --- tests/e2e_curl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index b02301bdbcf6..fc5f9264a76c 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -1,3 +1,5 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the Apache 2.0 License. 
from aiohttp import web import json from datetime import datetime, UTC From af81c105bff524f93b85c14ee47b3391e080b113 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 15 Aug 2025 19:07:32 +0100 Subject: [PATCH 060/197] fmt --- CMakeLists.txt | 3 +-- cmake/common.cmake | 4 +++- tests/e2e_curl.py | 31 +++++++++++++++++-------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bb2107ff3ee..9464cc37d80c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1223,8 +1223,7 @@ if(BUILD_TESTS) target_link_libraries(curl_test PRIVATE curl uv http_parser) add_e2e_test( - NAME e2e_curl - PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_curl.py + NAME e2e_curl PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_curl.py ) endif() endif() diff --git a/cmake/common.cmake b/cmake/common.cmake index 2ee35d60eb99..5f52c4e3a3d9 100644 --- a/cmake/common.cmake +++ b/cmake/common.cmake @@ -53,7 +53,9 @@ endfunction() function(add_test_bin name) add_executable(${name} ${CCF_DIR}/src/enclave/thread_local.cpp ${ARGN}) target_compile_options(${name} PRIVATE ${COMPILE_LIBCXX}) - target_include_directories(${name} PRIVATE src ${CCFCRYPTO_INC} ${CCF_DIR}/3rdparty/test) + target_include_directories( + ${name} PRIVATE src ${CCFCRYPTO_INC} ${CCF_DIR}/3rdparty/test + ) enable_coverage(${name}) target_link_libraries(${name} PRIVATE ${LINK_LIBCXX} ccfcrypto) add_san(${name}) diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index fc5f9264a76c..6a8a2ee3ce52 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -1,24 +1,24 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the Apache 2.0 License. 
from aiohttp import web -import json from datetime import datetime, UTC import asyncio import random + async def echo_handler(request): # Extract headers as list of [name, value] pairs headers = [[name, value] for name, value in request.headers.items()] - + # Read body body = await request.text() time_received = datetime.now(UTC) - + # Add random delay between 0 and 1 second delay = random.random() / 100 # Returns float between 0.0 and 1.0 await asyncio.sleep(delay) - + # Build response data response_data = { "headers": headers, @@ -27,22 +27,23 @@ async def echo_handler(request): "method": request.method, "path": request.path_qs, "timestamp": time_received.isoformat(), - "delay_seconds": delay - } + "delay_seconds": delay, + }, } - + return web.json_response(response_data) + async def main(): app = web.Application() - app.router.add_route('*', '/{path:.*}', echo_handler) - + app.router.add_route("*", "/{path:.*}", echo_handler) + runner = web.AppRunner(app) await runner.setup() - site = web.TCPSite(runner, '127.0.0.1', 8080) + site = web.TCPSite(runner, "127.0.0.1", 8080) await site.start() - - print('Echo server running on http://127.0.0.1:8080') + + print("Echo server running on http://127.0.0.1:8080") # call ./curl_test to run the load generator cmd = "./curl_test" @@ -51,6 +52,8 @@ async def main(): exit(process.returncode) -if __name__ == '__main__': + +if __name__ == "__main__": import asyncio - asyncio.run(main()) \ No newline at end of file + + asyncio.run(main()) From eb0dcd00eff143507ed12fd30b7a4784d5220980 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 18 Aug 2025 11:50:20 +0100 Subject: [PATCH 061/197] Fix bug in fetch code --- src/http/curl.h | 2 +- src/snapshots/fetch.h | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 6d998ab38761..becadceaa76a 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -302,7 +302,7 @@ namespace ccf::curl ccf::curl::UniqueSlist headers; 
std::unique_ptr request_body = nullptr; ccf::curl::Response response; - std::optional response_callback = nullptr; + std::optional response_callback = std::nullopt; public: CurlRequest( diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index 59363e75d861..927fb65afd6e 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -114,10 +114,12 @@ namespace snapshots ccf::curl::UniqueSlist headers; + std::string current_snapshot_url = snapshot_url; + ccf::curl::CurlRequest snapshot_size_request( std::move(curl_easy), HTTP_HEAD, - std::move(snapshot_url), + std::move(current_snapshot_url), std::move(headers), nullptr, // No request body std::nullopt // No response callback @@ -206,13 +208,15 @@ namespace snapshots } }; + std::string current_snapshot_url = snapshot_url; + ccf::curl::CurlRequest snapshot_range_request( std::move(curl_easy), HTTP_GET, - std::move(snapshot_url), + std::move(current_snapshot_url), std::move(headers), nullptr, // No request body - nullptr // No response callback + std::nullopt // No response callback ); CURLcode curl_response = CURLE_OK; From 86f624f96727209d5c8da5adc2cc8fd41aca6a13 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 18 Aug 2025 13:40:30 +0100 Subject: [PATCH 062/197] Reuse response to skip a copy --- src/http/curl.h | 18 ++++++++--- src/node/quote_endorsements_client.h | 10 +++---- src/snapshots/fetch.h | 45 +++++++++------------------- 3 files changed, 33 insertions(+), 40 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index becadceaa76a..f807cc28be32 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -301,7 +301,7 @@ namespace ccf::curl std::string url; ccf::curl::UniqueSlist headers; std::unique_ptr request_body = nullptr; - ccf::curl::Response response; + std::unique_ptr response; std::optional response_callback = std::nullopt; public: @@ -311,12 +311,16 @@ namespace ccf::curl std::string&& url_, UniqueSlist&& headers_, std::unique_ptr&& request_body_, - std::optional&& 
response_callback_) : + std::optional&& response_callback_, + std::unique_ptr&& _response = nullptr) : curl_handle(std::move(curl_handle_)), method(method_), url(std::move(url_)), headers(std::move(headers_)), request_body(std::move(request_body_)), + response( + _response != nullptr ? std::move(_response) : + std::make_unique()), response_callback(std::move(response_callback_)) { if (url.empty()) @@ -330,6 +334,7 @@ namespace ccf::curl throw std::logic_error( fmt::format("Unsupported HTTP method: {}", method.c_str())); } + switch (method.get_http_method().value()) { case HTTP_GET: @@ -365,7 +370,7 @@ namespace ccf::curl request_body->attach_to_curl(curl_handle); } - response.attach_to_curl(curl_handle); + response->attach_to_curl(curl_handle); if (headers.get() != nullptr) { @@ -415,7 +420,12 @@ namespace ccf::curl return url; } - [[nodiscard]] ccf::curl::Response& get_response() + [[nodiscard]] ccf::curl::Response* get_response() + { + return response.get(); + } + + [[nodiscard]] std::unique_ptr& get_response_ptr() { return response; } diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 92ed7ebb39fd..d8ee731b7c5e 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -194,16 +194,16 @@ namespace ccf CURLcode curl_response, long status_code) { std::lock_guard guard(this->lock); - auto response = request.get_response(); + auto* response = request.get_response(); if (curl_response == CURLE_OK && status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " "{} bytes", - response.buffer.size()); + response->buffer.size()); - handle_success_response(std::move(response.buffer), endpoint); + handle_success_response(std::move(response->buffer), endpoint); return; } @@ -218,8 +218,8 @@ namespace ccf { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; - auto h = 
response.headers.find(http::headers::RETRY_AFTER); - if (h != response.headers.end()) + auto h = response->headers.find(http::headers::RETRY_AFTER); + if (h != response->headers.end()) { const auto& retry_after_value = h->second; // If value is invalid, retry_after_s is unchanged diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index 927fb65afd6e..86427ad0d39c 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -89,9 +89,9 @@ namespace snapshots EXPECT_HTTP_RESPONSE_STATUS( request, status_code, HTTP_STATUS_PERMANENT_REDIRECT); - auto& response = request.get_response(); - auto location_it = response.headers.find(ccf::http::headers::LOCATION); - if (location_it == response.headers.end()) + auto* response = request.get_response(); + auto location_it = response->headers.find(ccf::http::headers::LOCATION); + if (location_it == response->headers.end()) { throw std::runtime_error(fmt::format( "Expected {} header in redirect response from {} {}, none found", @@ -142,12 +142,12 @@ namespace snapshots EXPECT_HTTP_RESPONSE_STATUS( snapshot_size_request, snapshot_size_status_code, HTTP_STATUS_OK); - auto snapshot_size_response = snapshot_size_request.get_response(); + auto* snapshot_size_response = snapshot_size_request.get_response(); - auto content_size_it = snapshot_size_response.headers.find( + auto content_size_it = snapshot_size_response->headers.find( ccf::http::headers::CONTENT_LENGTH); - if (content_size_it == snapshot_size_response.headers.end()) + if (content_size_it == snapshot_size_response->headers.end()) { throw std::runtime_error(fmt::format( "Expected {} header in response from {} {}, none found", @@ -179,7 +179,7 @@ namespace snapshots content_size, range_size); - std::vector snapshot(content_size); + auto snapshot_response = std::make_unique(); { auto range_start = 0; @@ -194,30 +194,17 @@ namespace snapshots headers.append( "Range", fmt::format("bytes={}-{}", range_start, range_end)); - auto response_callback = []( - 
ccf::curl::CurlRequest& request, - CURLcode curl_response_code, - long status_code) { - (void)request; - if (curl_response_code != CURLE_OK) - { - throw std::runtime_error(fmt::format( - "Error fetching snapshot chunk: {} ({})", - curl_easy_strerror(curl_response_code), - status_code)); - } - }; - std::string current_snapshot_url = snapshot_url; + snapshot_response->headers.clear(); ccf::curl::CurlRequest snapshot_range_request( std::move(curl_easy), HTTP_GET, std::move(current_snapshot_url), std::move(headers), nullptr, // No request body - std::nullopt // No response callback - ); + std::nullopt, // No response callback + std::move(snapshot_response)); CURLcode curl_response = CURLE_OK; long snapshot_range_status_code = 0; @@ -242,13 +229,8 @@ namespace snapshots snapshot_range_request.get_url(), snapshot_range_status_code); - auto snapshot_range_response = snapshot_range_request.get_response(); - // This is an extra copy which would be good to avoid, but avoiding it - // with the current response interface is very messy... 
- memcpy( - snapshot.data() + range_start, - snapshot_range_response.buffer.data(), - snapshot_range_response.buffer.size()); + snapshot_response = + std::move(snapshot_range_request.get_response_ptr()); if (range_end == content_size) { @@ -263,7 +245,8 @@ namespace snapshots const auto url_components = ccf::nonstd::split(snapshot_url, "/"); const std::string snapshot_name(url_components.back()); - return SnapshotResponse{snapshot_name, std::move(snapshot)}; + return SnapshotResponse{ + snapshot_name, std::move(snapshot_response->buffer)}; } catch (const std::exception& e) { From ebf73b472926dc0e24d1b9c340a8e8c133f711d2 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 18 Aug 2025 13:55:52 +0100 Subject: [PATCH 063/197] tidy --- src/http/curl.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index f807cc28be32..58da6c8bb3c8 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -297,12 +297,12 @@ namespace ccf::curl private: UniqueCURL curl_handle; - RESTVerb method = HTTP_GET; + RESTVerb method; std::string url; ccf::curl::UniqueSlist headers; - std::unique_ptr request_body = nullptr; + std::unique_ptr request_body; std::unique_ptr response; - std::optional response_callback = std::nullopt; + std::optional response_callback; public: CurlRequest( @@ -523,12 +523,11 @@ namespace ccf::curl private: uv_loop_t* loop; uv_timer_t timeout_tracker{}; - // utility class to enforce type safety on accesses to curl_multi wrapping a - // UniqueCURLM CurlRequestCURLM curl_request_curlm; - // We need a lock to prevent a client thread calling curl_multi_add_handle - // while the libuv thread is processing a curl callback + // We need a lock to prevent a client in another thread calling + // curl_multi_add_handle while the libuv thread is processing a curl + // callback // // Note that since the a client callback can call curl_multi_add_handle, but // that will be difficult/impossible to detect, we need curlm_lock 
to be From 0152a4e2519fdeae38c14e15f312ef746b143378 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 18 Aug 2025 14:12:29 +0100 Subject: [PATCH 064/197] Tidy up --- src/http/curl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 58da6c8bb3c8..882d0ae76b27 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -543,10 +543,11 @@ namespace ccf::curl public: // Stop all curl transfers and remove handles from libuv + // Cannot be safely called while the uv loop is running void stop() { std::lock_guard lock(curlm_lock); - LOG_INFO_FMT("Stopping curl transfers and removing handles from libuv"); + LOG_TRACE_FMT("Stopping curl transfers and removing handles from libuv"); if (curl_request_curlm.get() == nullptr) { throw std::logic_error( @@ -554,6 +555,7 @@ namespace ccf::curl } // Stop all curl easy handles { + // returns the handles as a null-terminated array CURL** easy_handles = curl_multi_get_handles(curl_request_curlm.get()); for (int i = 0; easy_handles[i] != nullptr; ++i) { @@ -570,7 +572,6 @@ namespace ccf::curl "CURLMSG_DONE received with no associated request data"); } std::unique_ptr request_data_ptr(request); - curl_multi_remove_handle(curl_request_curlm.get(), easy); curl_easy_cleanup(easy); } } @@ -579,7 +580,7 @@ namespace ccf::curl if (curlm != nullptr) { // calls socket callbacks to remove the handles from libuv - LOG_INFO_FMT("Cleaning up CURLM handle"); + LOG_TRACE_FMT("Cleaning up CURLM handle"); curl_multi_cleanup(curlm); } } From 64da5795d129b37578aa8f6a454e7878ad27ab34 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 19 Aug 2025 15:50:03 +0100 Subject: [PATCH 065/197] Testing changes to test testing infra --- tests/e2e_operations.py | 33 +++++++++++++++++++++------------ tests/infra/network.py | 8 +++++++- tests/infra/node.py | 14 ++++++++++++++ 3 files changed, 42 insertions(+), 13 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 
fb6c2b5ba315..90e14108a2cb 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1424,7 +1424,6 @@ def run_self_healing_open(args): args.nodes, args.binary_dir, args.debug_nodes, - args.perf_nodes, ) as network: LOG.info("Start a network and stop it") network.start_and_open(args) @@ -1446,7 +1445,6 @@ def run_self_healing_open(args): recovery_args.nodes, recovery_args.binary_dir, recovery_args.debug_nodes, - recovery_args.perf_nodes, existing_network=network, ) recovered_network.start_in_self_healing_open( @@ -1474,18 +1472,31 @@ def cycle(items): LOG.info(f"Failed to get the status of {node.local_node_id}, retrying...") continue + # Refresh the the declared state of nodes which have shut themselves down to join. + for node in recovered_network.nodes: + node.refresh_network_state(verify_ca=False) + recovered_network.refresh_service_identity_file(recovery_args) recovered_network.consortium.recover_with_shares(recovered_network.find_random_node()) - # Wait for all replicas to report being part of the opened network - for node in recovered_network.nodes: - recovered_network.wait_for_status( - node, - "Open", - timeout=10, - ) - recovered_network._wait_for_app_open(node) + LOG.info("Submitted recovery shares") + + # Wait for all live replicas to report being part of the opened network + successfully_opened = 0 + for node in recovered_network.get_joined_nodes(): + try: + recovered_network.wait_for_status( + node, + "Open", + timeout=10, + ) + recovered_network._wait_for_app_open(node) + successfully_opened += 1 + except TimeoutError as e: + pass + + assert successfully_opened == 1 LOG.info("Completed self-healing open successfully") @@ -1497,7 +1508,6 @@ def run_self_healing_open_single_replica(args): args.nodes, args.binary_dir, args.debug_nodes, - args.perf_nodes, ) as network: LOG.info("Start a network and stop it") network.start_and_open(args) @@ -1516,7 +1526,6 @@ def run_self_healing_open_single_replica(args): args.nodes, args.binary_dir, 
args.debug_nodes, - args.perf_nodes, existing_network=network, ) args.previous_service_identity_file = os.path.join( diff --git a/tests/infra/network.py b/tests/infra/network.py index 417a9083cfab..af0c96f2476b 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -767,6 +767,7 @@ def start_in_self_healing_open( common_dir=None, set_authenticate_session=None, start_all_nodes=True, + timeout=10, **kwargs, ): self.common_dir = common_dir or get_common_folder_name( @@ -833,7 +834,9 @@ def start_in_self_healing_open( self.observed_election_duration = self.election_duration + 1 for i, node in enumerate(self.nodes): - while True: + end_time = time.time() + timeout + success = False + while time.time() < end_time: try: self.wait_for_states( node, @@ -841,9 +844,12 @@ def start_in_self_healing_open( timeout=args.ledger_recovery_timeout, verify_ca=False, # Certs are volatile until the recovery is complete ) + success = True break except CCFConnectionException: time.sleep(0.1) + if not success: + raise TimeoutError(f"Failed to get state of node {node.local_node_id} after {timeout} seconds") LOG.info("All nodes started") diff --git a/tests/infra/node.py b/tests/infra/node.py index a04ebb61e2b3..ecc4f56b8b15 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -858,6 +858,20 @@ def wait_for_leadership_state(self, min_view, leadership_states, timeout=3): f"Node {self.local_node_id} was not in leadership states {leadership_states} in view > {min_view} after {timeout}s: {r}" ) + def refresh_network_state(self, **client_kwargs): + try: + with self.client(**client_kwargs) as c: + LOG.info(f"Trying to refresh using {c}") + r = c.get(f"/node/network/nodes/{self.node_id}").body.json() + LOG.info(r) + + if r["status"] == "Pending": + self.network_state = NodeNetworkState.started + elif r["status"] == "Trusted": + self.network_state = NodeNetworkState.joined + except Exception as e: + LOG.debug(f"Failed to connect {e}") + self.network_state = 
NodeNetworkState.stopped @contextmanager def node( From fa52e9b8cca37175195ee49fd1db15da587e3b8c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 19 Aug 2025 17:08:10 +0100 Subject: [PATCH 066/197] transition_to_open immediately on OPENING rather than waiting for a timeout --- src/node/node_state.h | 79 +++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 32 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index d5927e9e7a3f..f51db4dc177c 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2090,6 +2090,24 @@ namespace ccf LOG_TRACE_FMT( "Self-healing-open timeout, sending timeout to internal handlers"); + // Stop the timer if the node has completed its self-healing-open + auto tx = msg->data.self.network.tables->create_read_only_tx(); + auto* sm_state_handle = + tx.ro(msg->data.self.network.self_healing_open_sm_state); + if (!sm_state_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open state not set, cannot retry " + "self-healing-open"); + } + auto sm_state = sm_state_handle->get().value(); + if (sm_state == SelfHealingOpenSM::OPEN) + { + LOG_INFO_FMT("Self-healing-open complete, stopping timers."); + return; + } + + // Send a timeout to the internal handlers curl::UniqueCURL curl_handle; auto cert = msg->data.self.self_signed_node_cert; @@ -2173,6 +2191,7 @@ namespace ccf } } + // Advance self-healing-open SM switch (sm_state_handle->get().value()) { case SelfHealingOpenSM::GOSSIPPING: @@ -2188,21 +2207,20 @@ namespace ccf throw std::logic_error("No gossip addresses provided yet"); } - std::optional> min_iid; - gossip_handle->foreach( - [&min_iid](const auto& iid, const auto& txid) { - if ( - !min_iid.has_value() || min_iid->second < txid || - (min_iid->second == txid && min_iid->first > iid)) - { - min_iid = std::make_pair(iid, txid); - } - return true; - }); + // Lexographically maximum pair + std::optional> maximum; + gossip_handle->foreach([&maximum]( + const auto& 
iid, const auto& txid) { + if (!maximum.has_value() || maximum.value() < std::tie(txid, iid)) + { + maximum = std::make_pair(iid, txid); + } + return true; + }); auto* chosen_replica = tx.rw(network.self_healing_open_chosen_replica); - chosen_replica->put(min_iid->first); + chosen_replica->put(maximum->second); sm_state_handle->put(SelfHealingOpenSM::VOTING); } return; @@ -2223,7 +2241,23 @@ namespace ccf } LOG_INFO_FMT("Self-healing-open succeeded, now opening network"); - sm_state_handle->put(SelfHealingOpenSM::OPENING); + auto* service = tx.ro(Tables::SERVICE); + auto service_info = service->get(); + if (!service_info.has_value()) + { + throw std::logic_error( + "Service information cannot be found to transition service to " + "open"); + } + const auto prev_ident = + tx.ro(Tables::PREVIOUS_SERVICE_IDENTITY) + ->get(); + AbstractGovernanceEffects::ServiceIdentities identities{ + .previous = prev_ident, .next = service_info->cert}; + + sm_state_handle->put(SelfHealingOpenSM::OPEN); + + transition_service_to_open(tx, identities); } return; } @@ -2258,28 +2292,9 @@ namespace ccf } case SelfHealingOpenSM::OPENING: { - // TODO: Add fast path if enough replicas have joined already - // THIS IS POSSIBLY DANGEROUS as these joining replicas are not signed - // off... 
if (valid_timeout) { - auto* service = tx.ro(Tables::SERVICE); - auto service_info = service->get(); - if (!service_info.has_value()) - { - throw std::logic_error( - "Service information cannot be found to transition service to " - "open"); - } - const auto prev_ident = - tx.ro(Tables::PREVIOUS_SERVICE_IDENTITY) - ->get(); - AbstractGovernanceEffects::ServiceIdentities identities{ - .previous = prev_ident, .next = service_info->cert}; - sm_state_handle->put(SelfHealingOpenSM::OPEN); - - transition_service_to_open(tx, identities); } } case SelfHealingOpenSM::OPEN: From 5205458b60d0dc3a29d81e3960e460e126483ec3 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 20 Aug 2025 12:33:17 +0100 Subject: [PATCH 067/197] Update src/http/curl.h Co-authored-by: Amaury Chamayou --- src/http/curl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index 882d0ae76b27..162ffead20ac 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -557,7 +557,7 @@ namespace ccf::curl { // returns the handles as a null-terminated array CURL** easy_handles = curl_multi_get_handles(curl_request_curlm.get()); - for (int i = 0; easy_handles[i] != nullptr; ++i) + for (size_t i = 0; easy_handles[i] != nullptr; ++i) { auto* easy = easy_handles[i]; curl_multi_remove_handle(curl_request_curlm.get(), easy); From 080ded9dc36a3a409bd8060bad2589db72aa8c8b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 20 Aug 2025 12:36:12 +0100 Subject: [PATCH 068/197] Ensure opening replica sends iamopen messages --- src/node/node_state.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index f51db4dc177c..79bee71e22a5 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2211,9 +2211,9 @@ namespace ccf std::optional> maximum; gossip_handle->foreach([&maximum]( const auto& iid, const auto& txid) { - if (!maximum.has_value() || maximum.value() < std::tie(txid, iid)) + if 
(!maximum.has_value() || maximum.value() < std::make_pair(txid, iid)) { - maximum = std::make_pair(iid, txid); + maximum = std::make_pair(txid, iid); } return true; }); @@ -2255,7 +2255,7 @@ namespace ccf AbstractGovernanceEffects::ServiceIdentities identities{ .previous = prev_ident, .next = service_info->cert}; - sm_state_handle->put(SelfHealingOpenSM::OPEN); + sm_state_handle->put(SelfHealingOpenSM::OPENING); transition_service_to_open(tx, identities); } From 60a66c3853585a0e57c19befdda3859b0cfecbaa Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 20 Aug 2025 17:36:42 +0100 Subject: [PATCH 069/197] Make ownership more explicit. Specifically: - Owning initialiser for the singleton - proxy_ptr integration to get the right ordering of closing for CurlmLibuvContextImpl --- src/host/run.cpp | 12 +- src/http/curl.h | 251 ++++++++++++++++----------- src/http/test/curl_test.cpp | 13 +- src/node/quote_endorsements_client.h | 3 +- 4 files changed, 158 insertions(+), 121 deletions(-) diff --git a/src/host/run.cpp b/src/host/run.cpp index b0316d340dc2..8e372c6e5606 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include @@ -359,12 +358,6 @@ namespace ccf // Write PID to disk files::dump(fmt::format("{}", ::getpid()), config.output_files.pid_file); - // Initialise curlm libuv interface - curl_global_init(CURL_GLOBAL_DEFAULT); - ccf::curl::CurlmLibuvContext curl_libuv_context(uv_default_loop()); - ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe() = - &curl_libuv_context; - // set the host log level ccf::logger::config::level() = log_level; @@ -532,6 +525,10 @@ namespace ccf rpc_udp->behaviour.register_udp_message_handlers( buffer_processor.get_dispatcher()); + // Initialise the curlm singleton + curl_global_init(CURL_GLOBAL_DEFAULT); + auto curl_libuv_context = curl::CurlmLibuvContextSingleton(uv_default_loop()); + ResolvedAddresses resolved_rpc_addresses; for (auto& [name, 
interface] : config.network.rpc_interfaces) { @@ -1025,7 +1022,6 @@ namespace ccf } process_launcher.stop(); - curl_libuv_context.stop(); constexpr size_t max_close_iterations = 1000; size_t close_iterations = max_close_iterations; diff --git a/src/http/curl.h b/src/http/curl.h index 162ffead20ac..1e620bfb161c 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -5,6 +5,7 @@ #include "ccf/ds/logger.h" #include "ccf/ds/nonstd.h" #include "ccf/rest_verb.h" +#include "host/proxy.h" #include #include @@ -96,7 +97,7 @@ namespace ccf::curl class UniqueCURLM { - private: + protected: std::unique_ptr p; public: @@ -108,13 +109,21 @@ namespace ccf::curl } } - CURLM* release() + ~UniqueCURLM() = default; + UniqueCURLM(const UniqueCURLM&) = delete; + UniqueCURLM& operator=(const UniqueCURLM&) = delete; + UniqueCURLM(UniqueCURLM&& other) noexcept : p(std::move(other.p)) {} + UniqueCURLM& operator=(UniqueCURLM&& other) noexcept { - if (!p) - { - return p.release(); - } - return nullptr; + p = std::move(other.p); + return *this; + } + + // Transfers ownership of the CURLM* to the caller. + // Caller is responsible for eventually calling curl_multi_cleanup(). 
+ [[nodiscard]] CURLM* release() noexcept + { + return p.release(); } operator CURLM*() const @@ -432,39 +441,41 @@ namespace ccf::curl }; // non-owning wrapper around a CURLM handle which supports CurlRequest - class CurlRequestCURLM + class CurlRequestCURLM : public UniqueCURLM { - private: - UniqueCURLM curl_multi; - public: - [[nodiscard]] CURLM* get() const - { - return curl_multi; - } - void attach_curl_request(std::unique_ptr& request) { + if (p == nullptr) + { + throw std::logic_error( + "Cannot attach CurlRequest to a null CURLM handle"); + } if (request == nullptr) { throw std::logic_error("Cannot attach a null CurlRequest"); } CURL* curl_handle = request->get_easy_handle(); CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_PRIVATE, request.release()); - CHECK_CURL_MULTI(curl_multi_add_handle, curl_multi, curl_handle); + CHECK_CURL_MULTI(curl_multi_add_handle, p.get(), curl_handle); } int perform() { + if (p == nullptr) + { + throw std::logic_error("Cannot perform on a null CURLM handle"); + } + int running_handles = 0; - CHECK_CURL_MULTI(curl_multi_perform, curl_multi, &running_handles); + CHECK_CURL_MULTI(curl_multi_perform, p.get(), &running_handles); // handle all completed curl requests int msgq = 0; CURLMsg* msg = nullptr; do { - msg = curl_multi_info_read(curl_multi, &msgq); + msg = curl_multi_info_read(p.get(), &msgq); if ((msg != nullptr) && msg->msg == CURLMSG_DONE) { @@ -483,20 +494,15 @@ namespace ccf::curl // detach the easy handle such that it can be cleaned up with the // destructor of CurlRequest - curl_multi_remove_handle(curl_multi, easy); + curl_multi_remove_handle(p.get(), easy); request->handle_response(result); } } while (msgq > 0); return running_handles; } - - CURLM* release() - { - return curl_multi.release(); - } }; - class CurlmLibuvContext + class CurlmLibuvContextImpl { /* Very high level: * CURLM triggers timeout callback with some delay for libuv @@ -522,7 +528,7 @@ namespace ccf::curl */ private: uv_loop_t* loop; - uv_timer_t 
timeout_tracker{}; + uv_timer_t uv_handle{}; CurlRequestCURLM curl_request_curlm; // We need a lock to prevent a client in another thread calling @@ -538,58 +544,10 @@ namespace ccf::curl { uv_poll_t poll_handle; curl_socket_t socket; - CurlmLibuvContext* context; + CurlmLibuvContextImpl* context; }; public: - // Stop all curl transfers and remove handles from libuv - // Cannot be safely called while the uv loop is running - void stop() - { - std::lock_guard lock(curlm_lock); - LOG_TRACE_FMT("Stopping curl transfers and removing handles from libuv"); - if (curl_request_curlm.get() == nullptr) - { - throw std::logic_error( - "Cannot stop curl transfers on a null CURLM handle"); - } - // Stop all curl easy handles - { - // returns the handles as a null-terminated array - CURL** easy_handles = curl_multi_get_handles(curl_request_curlm.get()); - for (size_t i = 0; easy_handles[i] != nullptr; ++i) - { - auto* easy = easy_handles[i]; - curl_multi_remove_handle(curl_request_curlm.get(), easy); - if (easy != nullptr) - { - // attach a lifetime to the request - ccf::curl::CurlRequest* request = nullptr; - curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); - if (request == nullptr) - { - LOG_FAIL_FMT( - "CURLMSG_DONE received with no associated request data"); - } - std::unique_ptr request_data_ptr(request); - curl_easy_cleanup(easy); - } - } - curl_free(easy_handles); - auto* curlm = curl_request_curlm.release(); - if (curlm != nullptr) - { - // calls socket callbacks to remove the handles from libuv - LOG_TRACE_FMT("Cleaning up CURLM handle"); - curl_multi_cleanup(curlm); - } - } - - // There should be no more sockets from curl in libuv, so we can stop the - // timeout - uv_close(reinterpret_cast(&timeout_tracker), nullptr); - } - void handle_request_messages() { curl_request_curlm.perform(); @@ -597,7 +555,7 @@ namespace ccf::curl static void libuv_timeout_callback(uv_timer_t* handle) { - auto* self = static_cast(handle->data); + auto* self = static_cast(handle->data); 
if (self == nullptr) { throw std::logic_error( @@ -605,7 +563,7 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); - if (self->curl_request_curlm.get() == nullptr) + if (self->curl_request_curlm == nullptr) { LOG_FAIL_FMT("libuv_timeout_callback called with null CURLM handle"); return; @@ -614,7 +572,7 @@ namespace ccf::curl int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - self->curl_request_curlm.get(), + self->curl_request_curlm, CURL_SOCKET_TIMEOUT, 0, &running_handles); @@ -622,7 +580,7 @@ namespace ccf::curl } static int curl_timeout_callback( - CURLM* multi, long timeout_ms, CurlmLibuvContext* self) + CURLM* multi, long timeout_ms, CurlmLibuvContextImpl* self) { (void)multi; if (self == nullptr) @@ -634,14 +592,13 @@ namespace ccf::curl if (timeout_ms < 0) { // No timeout set, stop the timer - uv_timer_stop(&self->timeout_tracker); + uv_timer_stop(&self->uv_handle); } else { // If timeout is zero, this will trigger immediately timeout_ms = std::max(timeout_ms, 1L); - uv_timer_start( - &self->timeout_tracker, libuv_timeout_callback, timeout_ms, 0); + uv_timer_start(&self->uv_handle, libuv_timeout_callback, timeout_ms, 0); } return 0; } @@ -671,7 +628,7 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); - if (self->curl_request_curlm.get() == nullptr) + if (self->curl_request_curlm == nullptr) { LOG_FAIL_FMT( "libuv_socket_poll_callback called with null CURLM handle"); @@ -684,7 +641,7 @@ namespace ccf::curl int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - self->curl_request_curlm.get(), + self->curl_request_curlm, request_context->socket, action, &running_handles); @@ -696,7 +653,7 @@ namespace ccf::curl CURL* easy, curl_socket_t s, int action, - CurlmLibuvContext* self, + CurlmLibuvContextImpl* self, RequestContext* request_context) { if (self == nullptr) @@ -723,10 +680,7 @@ namespace ccf::curl // attach the lifetime to the socket handle request_context = request_context_ptr.release(); 
CHECK_CURL_MULTI( - curl_multi_assign, - self->curl_request_curlm.get(), - s, - request_context); + curl_multi_assign, self->curl_request_curlm, s, request_context); } int events = 0; @@ -742,7 +696,7 @@ namespace ccf::curl uv_poll_stop(&request_context->poll_handle); std::unique_ptr request_context_ptr( request_context); - curl_multi_assign(self->curl_request_curlm.get(), s, nullptr); + curl_multi_assign(self->curl_request_curlm, s, nullptr); } break; default: @@ -751,26 +705,26 @@ namespace ccf::curl return 0; } - CurlmLibuvContext(uv_loop_t* loop) : loop(loop) + CurlmLibuvContextImpl(uv_loop_t* loop) : loop(loop) { - uv_timer_init(loop, &timeout_tracker); - timeout_tracker.data = this; // Attach this instance to the timer + uv_timer_init(loop, &uv_handle); + uv_handle.data = this; // Attach this instance to the timer // attach timeouts CHECK_CURL_MULTI( - curl_multi_setopt, curl_request_curlm.get(), CURLMOPT_TIMERDATA, this); + curl_multi_setopt, curl_request_curlm, CURLMOPT_TIMERDATA, this); CHECK_CURL_MULTI( curl_multi_setopt, - curl_request_curlm.get(), + curl_request_curlm, CURLMOPT_TIMERFUNCTION, curl_timeout_callback); // attach socket events CHECK_CURL_MULTI( - curl_multi_setopt, curl_request_curlm.get(), CURLMOPT_SOCKETDATA, this); + curl_multi_setopt, curl_request_curlm, CURLMOPT_SOCKETDATA, this); CHECK_CURL_MULTI( curl_multi_setopt, - curl_request_curlm.get(), + curl_request_curlm, CURLMOPT_SOCKETFUNCTION, curl_socket_callback); @@ -778,7 +732,7 @@ namespace ccf::curl int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, - curl_request_curlm.get(), + curl_request_curlm, CURL_SOCKET_TIMEOUT, 0, &running_handles); @@ -789,25 +743,112 @@ namespace ccf::curl std::lock_guard lock(curlm_lock); curl_request_curlm.attach_curl_request(request); } + + private: + // Interface to allow the proxy pointer to close and delete this safely + // Make the templated asynchost::close_ptr a friend so it can call close() + template + friend class 
::asynchost::close_ptr; + + // called by the close_ptr within the destructor of the proxy_ptr + void close() + { + std::lock_guard lock(curlm_lock); + + // Prevent multiple close calls + if (curl_request_curlm == nullptr) + { + LOG_INFO_FMT( + "CurlmLibuvContext already closed, nothing to stop or remove"); + return; + } + UniqueCURLM curlm(std::move(curl_request_curlm)); + + // remove, stop and cleanup all curl easy handles + std::unique_ptr easy_handles( + curl_multi_get_handles(curlm), + [](CURL** handles) { curl_free(handles); }); + // curl_multi_get_handles returns the handles as a null-terminated array + for (size_t i = 0; easy_handles.get()[i] != nullptr; ++i) + { + auto* easy = easy_handles.get()[i]; + curl_multi_remove_handle(curlm, easy); + if (easy != nullptr) + { + // attach a lifetime to the request + ccf::curl::CurlRequest* request = nullptr; + curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); + if (request == nullptr) + { + LOG_FAIL_FMT( + "CURLMSG_DONE received with no associated request data"); + } + std::unique_ptr request_data_ptr(request); + curl_easy_cleanup(easy); + } + } + // Dispatch uv_close to asynchronously close the timer handle + uv_close(reinterpret_cast(&uv_handle), on_close); + } + static void on_close(uv_handle_t* handle) + { + static_cast(handle->data)->on_close(); + } + void on_close() + { + // We are being notified asynchronously that libuv has finished closing + delete this; + } }; + // Required destructor sequence: + // 1. Detach CURLM handle from this object and clean up all CURL handles. + // Detaching prevents new handles being added. + // curl_multi_cleanup detaches all sockets from libuv + // 2. Close the libuv timer handle via the with_uv_handle. + // Prevents any further callbacks from the libuv timer + // 3. 
Delete CurlmLibuvContextImpl via the with_uv_handle on_close callback + using CurlmLibuvContext = asynchost::proxy_ptr; + class CurlmLibuvContextSingleton { - public: - static CurlmLibuvContext*& get_instance_unsafe() + private: + static std::unique_ptr& instance() { - static CurlmLibuvContext* curlm_libuv_context_instance = nullptr; + static std::unique_ptr curlm_libuv_context_instance = + nullptr; return curlm_libuv_context_instance; } + + public: static CurlmLibuvContext& get_instance() { - auto*& instance = get_instance_unsafe(); - if (instance == nullptr) + if (instance() == nullptr) { throw std::logic_error( "CurlmLibuvContextSingleton instance not initialized"); } - return *instance; + return *instance(); + } + CurlmLibuvContextSingleton(uv_loop_t* loop) + { + if (instance() != nullptr) + { + throw std::logic_error( + "CurlmLibuvContextSingleton instance already initialized"); + } + instance() = std::make_unique(loop); } + ~CurlmLibuvContextSingleton() + { + instance().reset(); // Clean up the instance + } + + CurlmLibuvContextSingleton(const CurlmLibuvContextSingleton&) = delete; + CurlmLibuvContextSingleton& operator=(const CurlmLibuvContextSingleton&) = + delete; + CurlmLibuvContextSingleton(CurlmLibuvContextSingleton&&) = default; + CurlmLibuvContextSingleton& operator=(CurlmLibuvContextSingleton&&) = + default; }; } // namespace ccf::curl \ No newline at end of file diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 19fc54d5034b..b047e135a2d3 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -115,17 +115,18 @@ TEST_CASE("CurlmLibuvContext") std::move(body), std::move(response_callback)); - ccf::curl::CurlmLibuvContextSingleton::get_instance().attach_request( + ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( request); } }; - ccf::curl::CurlmLibuvContext context(uv_default_loop()); - ccf::curl::CurlmLibuvContextSingleton::get_instance_unsafe() = &context; + { + 
ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); - uv_work_t work_req; - uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); - uv_run(uv_default_loop(), UV_RUN_DEFAULT); + uv_work_t work_req; + uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); + uv_run(uv_default_loop(), UV_RUN_DEFAULT); + } REQUIRE(response_count == number_requests); } diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index d8ee731b7c5e..400b2d9b67dd 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -317,8 +317,7 @@ namespace ccf "Fetching endorsements for attestation report at {}", request->get_url()); - curl::CurlmLibuvContextSingleton::get_instance_unsafe()->attach_request( - request); + curl::CurlmLibuvContextSingleton::get_instance()->attach_request(request); } public: From d0fe1f3599427c16bd6d49ae4e49e3b9f20ad7ae Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 20 Aug 2025 17:39:00 +0100 Subject: [PATCH 070/197] Fix clang-tidy gripe --- src/http/curl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 1e620bfb161c..89f07d80265d 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -56,7 +56,7 @@ namespace ccf::curl public: UniqueCURL() : p(curl_easy_init(), [](auto x) { curl_easy_cleanup(x); }) { - if (!p) + if (p == nullptr) { throw std::runtime_error("Error initialising curl easy request"); } @@ -103,7 +103,7 @@ namespace ccf::curl public: UniqueCURLM() : p(curl_multi_init(), [](auto x) { curl_multi_cleanup(x); }) { - if (!p) + if (p == nullptr) { throw std::runtime_error("Error initialising curl multi request"); } From e6bfb0b5b9c976edf9d6ca95773e58cd7c669830 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 20 Aug 2025 18:02:50 +0100 Subject: [PATCH 071/197] Separate response_body from response_headers --- src/host/run.cpp | 3 +- src/http/curl.h | 57 ++++++++++++++++++---------- 
src/node/quote_endorsements_client.h | 11 +++--- src/snapshots/fetch.h | 17 +++++---- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/src/host/run.cpp b/src/host/run.cpp index 8e372c6e5606..8336b5cf3b3e 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -527,7 +527,8 @@ namespace ccf // Initialise the curlm singleton curl_global_init(CURL_GLOBAL_DEFAULT); - auto curl_libuv_context = curl::CurlmLibuvContextSingleton(uv_default_loop()); + auto curl_libuv_context = + curl::CurlmLibuvContextSingleton(uv_default_loop()); ResolvedAddresses resolved_rpc_addresses; for (auto& [name, interface] : config.network.rpc_interfaces) diff --git a/src/http/curl.h b/src/http/curl.h index 89f07d80265d..73ef2f83a833 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -119,9 +119,7 @@ namespace ccf::curl return *this; } - // Transfers ownership of the CURLM* to the caller. - // Caller is responsible for eventually calling curl_multi_cleanup(). - [[nodiscard]] CURLM* release() noexcept + [[nodiscard]] CURLM* release() { return p.release(); } @@ -216,15 +214,12 @@ namespace ccf::curl } }; - class Response + class ResponseBody { public: std::vector buffer; - using HeaderMap = std::unordered_map; - HeaderMap headers; - static size_t write_response_chunk( - uint8_t* ptr, size_t size, size_t nmemb, Response* response) + uint8_t* ptr, size_t size, size_t nmemb, ResponseBody* response) { if (response == nullptr) { @@ -238,8 +233,25 @@ namespace ccf::curl return bytes_to_copy; } + void attach_to_curl(CURL* curl) + { + if (curl == nullptr) + { + throw std::logic_error("Cannot attach response to a null CURL handle"); + } + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, this); + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, write_response_chunk); + } + }; + + class ResponseHeaders + { + public: + using HeaderMap = std::unordered_map; + HeaderMap data; + static size_t recv_header_line( - char* buffer, size_t size, size_t nitems, Response* response) + char* buffer, 
size_t size, size_t nitems, ResponseHeaders* response) { if (response == nullptr) { @@ -263,16 +275,16 @@ namespace ccf::curl { std::string field_str(field); nonstd::to_lower(field_str); - if (response->headers.contains(field_str)) + if (response->data.contains(field_str)) { - auto current = response->headers[field_str]; + auto current = response->data[field_str]; LOG_FAIL_FMT( "Duplicate header for '{}', current = '{}', new = '{}'", field_str, current, value); } - response->headers[field_str] = ccf::nonstd::trim(value); + response->data[field_str] = ccf::nonstd::trim(value); } else { @@ -289,10 +301,6 @@ namespace ccf::curl { throw std::logic_error("Cannot attach response to a null CURL handle"); } - // Body - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, this); - CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, write_response_chunk); - // Headers CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERDATA, this); CHECK_CURL_EASY_SETOPT(curl, CURLOPT_HEADERFUNCTION, recv_header_line); } @@ -310,7 +318,8 @@ namespace ccf::curl std::string url; ccf::curl::UniqueSlist headers; std::unique_ptr request_body; - std::unique_ptr response; + std::unique_ptr response; + ResponseHeaders response_headers; std::optional response_callback; public: @@ -321,7 +330,7 @@ namespace ccf::curl UniqueSlist&& headers_, std::unique_ptr&& request_body_, std::optional&& response_callback_, - std::unique_ptr&& _response = nullptr) : + std::unique_ptr&& _response = nullptr) : curl_handle(std::move(curl_handle_)), method(method_), url(std::move(url_)), @@ -329,7 +338,7 @@ namespace ccf::curl request_body(std::move(request_body_)), response( _response != nullptr ? 
std::move(_response) : - std::make_unique()), + std::make_unique()), response_callback(std::move(response_callback_)) { if (url.empty()) @@ -380,6 +389,7 @@ namespace ccf::curl } response->attach_to_curl(curl_handle); + response_headers.attach_to_curl(curl_handle); if (headers.get() != nullptr) { @@ -429,15 +439,20 @@ namespace ccf::curl return url; } - [[nodiscard]] ccf::curl::Response* get_response() + [[nodiscard]] ResponseBody* get_response_body() { return response.get(); } - [[nodiscard]] std::unique_ptr& get_response_ptr() + [[nodiscard]] std::unique_ptr& get_response_ptr() { return response; } + + [[nodiscard]] ResponseHeaders& get_response_headers() + { + return response_headers; + } }; // non-owning wrapper around a CURLM handle which supports CurlRequest diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 400b2d9b67dd..438a6693c795 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -194,16 +194,17 @@ namespace ccf CURLcode curl_response, long status_code) { std::lock_guard guard(this->lock); - auto* response = request.get_response(); + auto* response_body = request.get_response_body(); + auto& response_headers = request.get_response_headers(); if (curl_response == CURLE_OK && status_code == HTTP_STATUS_OK) { LOG_INFO_FMT( "Successfully retrieved endorsements for attestation report: " "{} bytes", - response->buffer.size()); + response_body->buffer.size()); - handle_success_response(std::move(response->buffer), endpoint); + handle_success_response(std::move(response_body->buffer), endpoint); return; } @@ -218,8 +219,8 @@ namespace ccf { constexpr size_t default_retry_after_s = 3; size_t retry_after_s = default_retry_after_s; - auto h = response->headers.find(http::headers::RETRY_AFTER); - if (h != response->headers.end()) + auto h = response_headers.data.find(http::headers::RETRY_AFTER); + if (h != response_headers.data.end()) { const auto& retry_after_value = h->second; 
// If value is invalid, retry_after_s is unchanged diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index 86427ad0d39c..bb2ee9374c47 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -89,9 +89,10 @@ namespace snapshots EXPECT_HTTP_RESPONSE_STATUS( request, status_code, HTTP_STATUS_PERMANENT_REDIRECT); - auto* response = request.get_response(); - auto location_it = response->headers.find(ccf::http::headers::LOCATION); - if (location_it == response->headers.end()) + auto& response_headers = request.get_response_headers(); + auto location_it = + response_headers.data.find(ccf::http::headers::LOCATION); + if (location_it == response_headers.data.end()) { throw std::runtime_error(fmt::format( "Expected {} header in redirect response from {} {}, none found", @@ -142,12 +143,13 @@ namespace snapshots EXPECT_HTTP_RESPONSE_STATUS( snapshot_size_request, snapshot_size_status_code, HTTP_STATUS_OK); - auto* snapshot_size_response = snapshot_size_request.get_response(); + auto& snapshot_size_response_headers = + snapshot_size_request.get_response_headers(); - auto content_size_it = snapshot_size_response->headers.find( + auto content_size_it = snapshot_size_response_headers.data.find( ccf::http::headers::CONTENT_LENGTH); - if (content_size_it == snapshot_size_response->headers.end()) + if (content_size_it == snapshot_size_response_headers.data.end()) { throw std::runtime_error(fmt::format( "Expected {} header in response from {} {}, none found", @@ -179,7 +181,7 @@ namespace snapshots content_size, range_size); - auto snapshot_response = std::make_unique(); + auto snapshot_response = std::make_unique(); { auto range_start = 0; @@ -196,7 +198,6 @@ namespace snapshots std::string current_snapshot_url = snapshot_url; - snapshot_response->headers.clear(); ccf::curl::CurlRequest snapshot_range_request( std::move(curl_easy), HTTP_GET, From d716c8b7077845bc5774f4d23900f012abbefd34 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 20 Aug 2025 18:06:16 +0100 
Subject: [PATCH 072/197] Remove easy handle before throwing an error. --- src/http/curl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/http/curl.h b/src/http/curl.h index 73ef2f83a833..3dabb239b8b6 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -502,6 +502,7 @@ namespace ccf::curl curl_easy_getinfo(easy, CURLINFO_PRIVATE, &request); if (request == nullptr) { + curl_multi_remove_handle(p.get(), easy); throw std::runtime_error( "CURLMSG_DONE received with no associated request data"); } From b6352c0f43579e20f5b86c63975eb66d39d5c15f Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 21 Aug 2025 10:15:08 +0100 Subject: [PATCH 073/197] Update src/http/curl.h Co-authored-by: Amaury Chamayou --- src/http/curl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index 3dabb239b8b6..edc55e95c527 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -261,7 +261,7 @@ namespace ccf::curl auto bytes_to_read = size * nitems; std::string_view header(buffer, bytes_to_read); - // strip /r/n etc + // strip \r\n etc header = ccf::nonstd::trim(header); // Ignore empty headers, and the http response line (e.g. 
"HTTP/1.1 200") From a23a88294b20e6eb5373379d3d88519c8239e9e4 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 21 Aug 2025 10:15:28 +0100 Subject: [PATCH 074/197] Update src/http/curl.h Co-authored-by: Amaury Chamayou --- src/http/curl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index edc55e95c527..5fb5ae28bd5e 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -369,7 +369,7 @@ namespace ccf::curl // If no request body is provided, curl will try reading from stdin, // which causes a blockage request_body = - std::make_unique(std::vector()); + std::make_unique({}); } } break; From 7f55f13a8d6c779a62d17b598b3950e19f1ed969 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 21 Aug 2025 13:48:10 +0100 Subject: [PATCH 075/197] Snagging --- src/common/configuration.h | 1 - src/enclave/main.cpp | 1 - 2 files changed, 2 deletions(-) diff --git a/src/common/configuration.h b/src/common/configuration.h index 3e3cedf55189..d82a54fb5498 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -16,7 +16,6 @@ #include "common/enclave_interface_types.h" #include "consensus/consensus_types.h" #include "ds/oversized.h" -#include "http/curl.h" #include "service/tables/config.h" #include diff --git a/src/enclave/main.cpp b/src/enclave/main.cpp index ab727c22e6de..cae0c6925c77 100644 --- a/src/enclave/main.cpp +++ b/src/enclave/main.cpp @@ -8,7 +8,6 @@ #include "common/enclave_interface_types.h" #include "enclave.h" #include "enclave_time.h" -#include "http/curl.h" #include #include From 5282f198eafe50bb37fe7c818463793782a08b0e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 21 Aug 2025 13:51:21 +0100 Subject: [PATCH 076/197] Snags --- src/http/curl.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 5fb5ae28bd5e..4f7138d30e18 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -366,10 +366,10 @@ namespace 
ccf::curl CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_UPLOAD, 1L); if (request_body == nullptr) { - // If no request body is provided, curl will try reading from stdin, - // which causes a blockage + // If no request body is provided, curl will try reading from + // stdin, which causes a blockage request_body = - std::make_unique({}); + std::make_unique(std::vector()); } } break; @@ -532,9 +532,8 @@ namespace ccf::curl * Example flow: * * Initially a CURL* is attached to the curl_multi CURLM* handle - * This calls the curl_multi's timeout function curl_timeout_callback with 0 - * delay - * which then registers the libuv timeout callback with 0 delay + * This calls the curl_multi's timeout function curl_timeout_callback with + * 0 delay which then registers the libuv timeout callback with 0 delay * libuv_timeout_callback then registers a timeout socket_action with curl * which then registers the socket polling at the libuv level * @@ -551,9 +550,9 @@ namespace ccf::curl // curl_multi_add_handle while the libuv thread is processing a curl // callback // - // Note that since the a client callback can call curl_multi_add_handle, but - // that will be difficult/impossible to detect, we need curlm_lock to be - // recursive. + // Note that since the a client callback can call curl_multi_add_handle, + // but that will be difficult/impossible to detect, we need curlm_lock to + // be recursive. std::recursive_mutex curlm_lock; struct RequestContext @@ -817,13 +816,13 @@ namespace ccf::curl } }; - // Required destructor sequence: - // 1. Detach CURLM handle from this object and clean up all CURL handles. - // Detaching prevents new handles being added. + // Required destructor sequence triggered by proxy_ptr calling close + // 1. Detach CURLM handle from this object and clean up all easy handles. + // Detaching prevents new easy handles being added. // curl_multi_cleanup detaches all sockets from libuv - // 2. Close the libuv timer handle via the with_uv_handle. + // 2. 
Close the libuv timer handle. // Prevents any further callbacks from the libuv timer - // 3. Delete CurlmLibuvContextImpl via the with_uv_handle on_close callback + // 3. Delete CurlmLibuvContextImpl via the on_close callback using CurlmLibuvContext = asynchost::proxy_ptr; class CurlmLibuvContextSingleton From 03a8d5dac7ff3278a5945d7320e6bc71145bc982 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 21 Aug 2025 13:51:30 +0100 Subject: [PATCH 077/197] Rejig logic around header processing --- src/http/curl.h | 54 +++++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 4f7138d30e18..7a103b127e10 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -248,6 +248,7 @@ namespace ccf::curl { public: using HeaderMap = std::unordered_map; + bool is_first_header = true; HeaderMap data; static size_t recv_header_line( @@ -264,31 +265,44 @@ namespace ccf::curl // strip \r\n etc header = ccf::nonstd::trim(header); - // Ignore empty headers, and the http response line (e.g. "HTTP/1.1 200") + // Ignore the http status line (e.g. 
"HTTP/1.1 200") which should be the + // first header static const std::regex http_status_line_regex(R"(^HTTP\/[1-9]+.*)"); - if ( - !header.empty() && - !std::regex_match(std::string(header), http_status_line_regex)) + if (response->is_first_header) { - const auto [field, value] = ccf::nonstd::split_1(header, ": "); - if (!value.empty()) + response->is_first_header = false; + if (!std::regex_match(std::string(header), http_status_line_regex)) { - std::string field_str(field); - nonstd::to_lower(field_str); - if (response->data.contains(field_str)) - { - auto current = response->data[field_str]; - LOG_FAIL_FMT( - "Duplicate header for '{}', current = '{}', new = '{}'", - field_str, - current, - value); - } - response->data[field_str] = ccf::nonstd::trim(value); + LOG_FAIL_FMT( + "Expected HTTP status line as first header, got '{}'", header); + return bytes_to_read; // Not a valid HTTP response } - else + } + else + { + // ignore empty headers + if (!header.empty()) { - LOG_INFO_FMT("Ignoring invalid-looking HTTP Header '{}'", header); + const auto [field, value] = ccf::nonstd::split_1(header, ": "); + if (!value.empty()) + { + std::string field_str(field); + nonstd::to_lower(field_str); + if (response->data.contains(field_str)) + { + auto current = response->data[field_str]; + LOG_FAIL_FMT( + "Duplicate header for '{}', current = '{}', new = '{}'", + field_str, + current, + value); + } + response->data[field_str] = ccf::nonstd::trim(value); + } + else + { + LOG_INFO_FMT("Ignoring invalid-looking HTTP Header '{}'", header); + } } } From 8cfb10439d66dd7981796b528b31d68d9be7857a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 21 Aug 2025 14:01:21 +0100 Subject: [PATCH 078/197] Set a 1mb default maximum size --- src/http/curl.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index 7a103b127e10..ad1d19875202 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -7,6 +7,7 @@ #include 
"ccf/rest_verb.h" #include "host/proxy.h" +#include #include #include #include @@ -218,6 +219,10 @@ namespace ccf::curl { public: std::vector buffer; + static constexpr size_t mb = 1024L * 1024L; + // Maximum size of the response body, set to 0 to disable limit + size_t maximum_size = 1 * mb; + static size_t write_response_chunk( uint8_t* ptr, size_t size, size_t nmemb, ResponseBody* response) { @@ -225,9 +230,20 @@ namespace ccf::curl { LOG_FAIL_FMT( "write_response_chunk called with a null response pointer"); - return 0; + return CURL_WRITEFUNC_ERROR; } auto bytes_to_copy = size * nmemb; + if ( + response->maximum_size > 0 && + response->buffer.size() + bytes_to_copy > response->maximum_size) + { + LOG_FAIL_FMT( + "Response size limit exceeded: {} bytes, maximum is {} bytes", + response->buffer.size() + bytes_to_copy, + response->maximum_size); + return CURL_WRITEFUNC_ERROR; + } + response->buffer.insert(response->buffer.end(), ptr, ptr + bytes_to_copy); // Should probably set a maximum response size here return bytes_to_copy; From 9b59d25f2690c96c018d363984a2d748462fd10c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 21 Aug 2025 17:05:15 +0100 Subject: [PATCH 079/197] fix maximum sizing to be sane but not yet configurable for quote endorsements --- src/http/curl.h | 52 +++++++++++++++++++++------- src/http/test/curl_test.cpp | 2 ++ src/node/quote_endorsements_client.h | 2 ++ src/snapshots/fetch.h | 9 +++-- 4 files changed, 49 insertions(+), 16 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index ad1d19875202..56ef74dcef37 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -219,9 +219,13 @@ namespace ccf::curl { public: std::vector buffer; - static constexpr size_t mb = 1024L * 1024L; - // Maximum size of the response body, set to 0 to disable limit - size_t maximum_size = 1 * mb; + size_t maximum_size; + + // Ensure there is always a maximum size set + ResponseBody() = delete; + + // _max_size is the maximum size of the response body + 
ResponseBody(size_t max_size_) : maximum_size(max_size_) {} static size_t write_response_chunk( uint8_t* ptr, size_t size, size_t nmemb, ResponseBody* response) @@ -233,9 +237,7 @@ namespace ccf::curl return CURL_WRITEFUNC_ERROR; } auto bytes_to_copy = size * nmemb; - if ( - response->maximum_size > 0 && - response->buffer.size() + bytes_to_copy > response->maximum_size) + if (response->buffer.size() + bytes_to_copy > response->maximum_size) { LOG_FAIL_FMT( "Response size limit exceeded: {} bytes, maximum is {} bytes", @@ -245,7 +247,6 @@ namespace ccf::curl } response->buffer.insert(response->buffer.end(), ptr, ptr + bytes_to_copy); - // Should probably set a maximum response size here return bytes_to_copy; } @@ -258,6 +259,25 @@ namespace ccf::curl CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, this); CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, write_response_chunk); } + + static size_t noop_write_function( + uint8_t* ptr, size_t size, size_t nmemb, ResponseBody* response) + { + (void)ptr; + (void)response; + return size * nmemb; + } + + static void attach_noop_response(CURL* curl) + { + if (curl == nullptr) + { + throw std::logic_error( + "Cannot attach noop response to a null CURL handle"); + } + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEDATA, nullptr); + CHECK_CURL_EASY_SETOPT(curl, CURLOPT_WRITEFUNCTION, noop_write_function); + } }; class ResponseHeaders @@ -359,16 +379,14 @@ namespace ccf::curl std::string&& url_, UniqueSlist&& headers_, std::unique_ptr&& request_body_, - std::optional&& response_callback_, - std::unique_ptr&& _response = nullptr) : + std::unique_ptr&& response_, + std::optional&& response_callback_) : curl_handle(std::move(curl_handle_)), method(method_), url(std::move(url_)), headers(std::move(headers_)), request_body(std::move(request_body_)), - response( - _response != nullptr ? 
std::move(_response) : - std::make_unique()), + response(std::move(response_)), response_callback(std::move(response_callback_)) { if (url.empty()) @@ -418,7 +436,15 @@ namespace ccf::curl request_body->attach_to_curl(curl_handle); } - response->attach_to_curl(curl_handle); + if (response != nullptr) + { + response->attach_to_curl(curl_handle); + } + else + { + ResponseBody::attach_noop_response(curl_handle); + } + response_headers.attach_to_curl(curl_handle); if (headers.get() != nullptr) diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index b047e135a2d3..93094f7f63c3 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -55,6 +55,7 @@ TEST_CASE("Synchronous") std::move(url), std::move(headers), std::move(body), + std::make_unique(SIZE_MAX), std::nullopt); CURLcode curl_code = CURLE_OK; @@ -113,6 +114,7 @@ TEST_CASE("CurlmLibuvContext") std::move(url), std::move(headers), std::move(body), + std::make_unique(SIZE_MAX), std::move(response_callback)); ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 438a6693c795..6b0763df809b 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -249,12 +249,14 @@ namespace ccf return; }); + constexpr size_t max_response_size = 100L * 1024 * 1024; // 100MB auto request = std::make_unique( std::move(curl_handle), HTTP_GET, std::move(url), std::move(headers), nullptr, + std::make_unique(max_response_size), std::move(response_callback)); // Start watchdog to send request on new server if it is unresponsive diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index bb2ee9374c47..8acbf55c6c45 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -66,6 +66,7 @@ namespace snapshots std::move(initial_url), std::move(headers), nullptr, // No request body + nullptr, // No response body std::nullopt // No response callback 
); @@ -123,6 +124,7 @@ namespace snapshots std::move(current_snapshot_url), std::move(headers), nullptr, // No request body + nullptr, // No response body std::nullopt // No response callback ); @@ -181,7 +183,7 @@ namespace snapshots content_size, range_size); - auto snapshot_response = std::make_unique(); + auto snapshot_response = std::make_unique(content_size); { auto range_start = 0; @@ -204,8 +206,9 @@ namespace snapshots std::move(current_snapshot_url), std::move(headers), nullptr, // No request body - std::nullopt, // No response callback - std::move(snapshot_response)); + std::move(snapshot_response), + std::nullopt // No response callback + ); CURLcode curl_response = CURLE_OK; long snapshot_range_status_code = 0; From ef1f464d45f7daf4c82ceec91ab8408d0dda6257 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 22 Aug 2025 11:09:06 +0100 Subject: [PATCH 080/197] Make quote endorsements maximum response size configurable. --- include/ccf/pal/attestation_sev_snp.h | 23 ++++++++++++++---- .../pal/attestation_sev_snp_endorsements.h | 24 +++++++++++++++---- src/node/quote_endorsements_client.h | 3 +-- 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/include/ccf/pal/attestation_sev_snp.h b/include/ccf/pal/attestation_sev_snp.h index 912438d2dc53..45eb74d96732 100644 --- a/include/ccf/pal/attestation_sev_snp.h +++ b/include/ccf/pal/attestation_sev_snp.h @@ -4,6 +4,7 @@ #include "ccf/ds/enum_formatter.h" #include "ccf/ds/json.h" +#include "ccf/ds/unit_strings.h" #include "ccf/pal/attestation_sev_snp_endorsements.h" #include "ccf/pal/measurement.h" #include "ccf/pal/report_data.h" @@ -428,6 +429,8 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== auto reported_tcb = fmt::format("{:0x}", *(uint64_t*)("e.reported_tcb)); constexpr size_t default_max_retries_count = 10; + static const ds::SizeString default_max_response_size = + ds::SizeString("100mb"); if (endorsements_servers.empty()) { @@ -436,7 +439,8 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== 
default_azure_endorsements_endpoint, chip_id_hex, reported_tcb, - default_max_retries_count)); + default_max_retries_count, + default_max_response_size)); return config; } @@ -444,6 +448,8 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== { size_t max_retries_count = server.max_retries_count.value_or(default_max_retries_count); + size_t max_response_size = + server.max_response_size.value_or(default_max_response_size); switch (server.type) { case EndorsementsEndpointType::Azure: @@ -451,7 +457,11 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== auto loc = get_endpoint_loc(server, default_azure_endorsements_endpoint); config.servers.emplace_back(make_azure_endorsements_server( - loc, chip_id_hex, reported_tcb, max_retries_count)); + loc, + chip_id_hex, + reported_tcb, + max_retries_count, + max_response_size)); break; } case EndorsementsEndpointType::AMD: @@ -501,7 +511,8 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== snp, microcode, product, - max_retries_count)); + max_retries_count, + max_response_size)); break; } case EndorsementsEndpointType::THIM: @@ -509,7 +520,11 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== auto loc = get_endpoint_loc(server, default_thim_endorsements_endpoint); config.servers.emplace_back(make_thim_endorsements_server( - loc, chip_id_hex, reported_tcb, max_retries_count)); + loc, + chip_id_hex, + reported_tcb, + max_retries_count, + max_response_size)); break; } default: diff --git a/include/ccf/pal/attestation_sev_snp_endorsements.h b/include/ccf/pal/attestation_sev_snp_endorsements.h index f4854c7e1196..f4d8a21bd9a3 100644 --- a/include/ccf/pal/attestation_sev_snp_endorsements.h +++ b/include/ccf/pal/attestation_sev_snp_endorsements.h @@ -3,6 +3,7 @@ #pragma once #include "ccf/ds/json.h" +#include "ccf/ds/unit_strings.h" #include "ccf/pal/sev_snp_cpuid.h" #include @@ -47,6 +48,7 @@ namespace ccf::pal::snp std::map headers = {}; bool tls = true; size_t max_retries_count = 3; + size_t max_response_size = SIZE_MAX; bool operator==(const EndpointInfo&) const = default; }; @@ -74,13 +76,14 
@@ namespace ccf::pal::snp EndorsementsEndpointType type = Azure; std::optional url = std::nullopt; std::optional max_retries_count = std::nullopt; + std::optional max_response_size = std::nullopt; bool operator==(const EndorsementsServer&) const = default; }; DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(EndorsementsServer); DECLARE_JSON_REQUIRED_FIELDS(EndorsementsServer); DECLARE_JSON_OPTIONAL_FIELDS( - EndorsementsServer, type, url, max_retries_count); + EndorsementsServer, type, url, max_retries_count, max_response_size); using EndorsementsServers = std::vector; struct HostPort @@ -97,7 +100,9 @@ namespace ccf::pal::snp const HostPort& endpoint, const std::string& chip_id_hex, const std::string& reported_tcb, - size_t max_retries_count) + size_t max_retries_count, + size_t max_response_size + ) { std::map params; params["api-version"] = "2020-10-15-preview"; @@ -105,9 +110,11 @@ namespace ccf::pal::snp endpoint.host, endpoint.port, fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb), - params}; + params, + }; info.max_retries_count = max_retries_count; + info.max_response_size = max_response_size; return {info}; } @@ -125,7 +132,9 @@ namespace ccf::pal::snp const std::string& snp, const std::string& microcode, const ProductName& product_name, - size_t max_retries_count) + size_t max_retries_count, + size_t max_response_size + ) { std::map params; params["blSPL"] = boot_loader; @@ -142,12 +151,14 @@ namespace ccf::pal::snp true // DER }; leaf.max_retries_count = max_retries_count; + leaf.max_response_size = max_response_size; EndorsementEndpointsConfiguration::EndpointInfo chain{ endpoint.host, endpoint.port, fmt::format("/vcek/v1/{}/cert_chain", to_string(product_name)), {}}; chain.max_retries_count = max_retries_count; + chain.max_response_size = max_response_size; server.push_back(leaf); server.push_back(chain); @@ -162,7 +173,9 @@ namespace ccf::pal::snp const HostPort& endpoint, const std::string& chip_id_hex, const std::string& reported_tcb, - 
size_t max_retries_count) + size_t max_retries_count, + size_t max_response_size + ) { std::map params; params["tcbVersion"] = reported_tcb; @@ -178,6 +191,7 @@ namespace ccf::pal::snp false // No TLS }; info.max_retries_count = max_retries_count; + info.max_response_size = max_response_size; return {info}; } diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 6b0763df809b..7029b881bd69 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -249,14 +249,13 @@ namespace ccf return; }); - constexpr size_t max_response_size = 100L * 1024 * 1024; // 100MB auto request = std::make_unique( std::move(curl_handle), HTTP_GET, std::move(url), std::move(headers), nullptr, - std::make_unique(max_response_size), + std::make_unique(endpoint.max_response_size), std::move(response_callback)); // Start watchdog to send request on new server if it is unresponsive From acbdcb1d05382a13fdefe1e068745354b5b7656d Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 22 Aug 2025 11:35:15 +0100 Subject: [PATCH 081/197] fmt --- include/ccf/pal/attestation_sev_snp_endorsements.h | 11 ++++------- src/snapshots/fetch.h | 3 ++- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/ccf/pal/attestation_sev_snp_endorsements.h b/include/ccf/pal/attestation_sev_snp_endorsements.h index f4d8a21bd9a3..01cc3fecf402 100644 --- a/include/ccf/pal/attestation_sev_snp_endorsements.h +++ b/include/ccf/pal/attestation_sev_snp_endorsements.h @@ -101,8 +101,7 @@ namespace ccf::pal::snp const std::string& chip_id_hex, const std::string& reported_tcb, size_t max_retries_count, - size_t max_response_size - ) + size_t max_response_size) { std::map params; params["api-version"] = "2020-10-15-preview"; @@ -111,7 +110,7 @@ namespace ccf::pal::snp endpoint.port, fmt::format("/SevSnpVM/certificates/{}/{}", chip_id_hex, reported_tcb), params, - }; + }; info.max_retries_count = max_retries_count; info.max_response_size = 
max_response_size; @@ -133,8 +132,7 @@ namespace ccf::pal::snp const std::string& microcode, const ProductName& product_name, size_t max_retries_count, - size_t max_response_size - ) + size_t max_response_size) { std::map params; params["blSPL"] = boot_loader; @@ -174,8 +172,7 @@ namespace ccf::pal::snp const std::string& chip_id_hex, const std::string& reported_tcb, size_t max_retries_count, - size_t max_response_size - ) + size_t max_response_size) { std::map params; params["tcbVersion"] = reported_tcb; diff --git a/src/snapshots/fetch.h b/src/snapshots/fetch.h index 8acbf55c6c45..4497117e4d4a 100644 --- a/src/snapshots/fetch.h +++ b/src/snapshots/fetch.h @@ -183,7 +183,8 @@ namespace snapshots content_size, range_size); - auto snapshot_response = std::make_unique(content_size); + auto snapshot_response = + std::make_unique(content_size); { auto range_start = 0; From 2e1089c5779145b99770c116205ce97efa34f68a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 22 Aug 2025 13:32:10 +0100 Subject: [PATCH 082/197] Rephrase --- include/ccf/pal/attestation_sev_snp.h | 14 ++++++------- .../pal/attestation_sev_snp_endorsements.h | 20 +++++++++---------- src/node/quote_endorsements_client.h | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/ccf/pal/attestation_sev_snp.h b/include/ccf/pal/attestation_sev_snp.h index 45eb74d96732..6c11940ebf49 100644 --- a/include/ccf/pal/attestation_sev_snp.h +++ b/include/ccf/pal/attestation_sev_snp.h @@ -429,7 +429,7 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== auto reported_tcb = fmt::format("{:0x}", *(uint64_t*)("e.reported_tcb)); constexpr size_t default_max_retries_count = 10; - static const ds::SizeString default_max_response_size = + static const ds::SizeString default_max_client_response_size = ds::SizeString("100mb"); if (endorsements_servers.empty()) @@ -440,7 +440,7 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== chip_id_hex, reported_tcb, default_max_retries_count, - default_max_response_size)); + 
default_max_client_response_size)); return config; } @@ -448,8 +448,8 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== { size_t max_retries_count = server.max_retries_count.value_or(default_max_retries_count); - size_t max_response_size = - server.max_response_size.value_or(default_max_response_size); + size_t max_client_response_size = + server.max_client_response_size.value_or(default_max_client_response_size); switch (server.type) { case EndorsementsEndpointType::Azure: @@ -461,7 +461,7 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== chip_id_hex, reported_tcb, max_retries_count, - max_response_size)); + max_client_response_size)); break; } case EndorsementsEndpointType::AMD: @@ -512,7 +512,7 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== microcode, product, max_retries_count, - max_response_size)); + max_client_response_size)); break; } case EndorsementsEndpointType::THIM: @@ -524,7 +524,7 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== chip_id_hex, reported_tcb, max_retries_count, - max_response_size)); + max_client_response_size)); break; } default: diff --git a/include/ccf/pal/attestation_sev_snp_endorsements.h b/include/ccf/pal/attestation_sev_snp_endorsements.h index 01cc3fecf402..48d846a55ac6 100644 --- a/include/ccf/pal/attestation_sev_snp_endorsements.h +++ b/include/ccf/pal/attestation_sev_snp_endorsements.h @@ -48,7 +48,7 @@ namespace ccf::pal::snp std::map headers = {}; bool tls = true; size_t max_retries_count = 3; - size_t max_response_size = SIZE_MAX; + size_t max_client_response_size = SIZE_MAX; bool operator==(const EndpointInfo&) const = default; }; @@ -76,14 +76,14 @@ namespace ccf::pal::snp EndorsementsEndpointType type = Azure; std::optional url = std::nullopt; std::optional max_retries_count = std::nullopt; - std::optional max_response_size = std::nullopt; + std::optional max_client_response_size = std::nullopt; bool operator==(const EndorsementsServer&) const = default; }; DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(EndorsementsServer); DECLARE_JSON_REQUIRED_FIELDS(EndorsementsServer); 
DECLARE_JSON_OPTIONAL_FIELDS( - EndorsementsServer, type, url, max_retries_count, max_response_size); + EndorsementsServer, type, url, max_retries_count, max_client_response_size); using EndorsementsServers = std::vector; struct HostPort @@ -101,7 +101,7 @@ namespace ccf::pal::snp const std::string& chip_id_hex, const std::string& reported_tcb, size_t max_retries_count, - size_t max_response_size) + size_t max_client_response_size) { std::map params; params["api-version"] = "2020-10-15-preview"; @@ -113,7 +113,7 @@ namespace ccf::pal::snp }; info.max_retries_count = max_retries_count; - info.max_response_size = max_response_size; + info.max_client_response_size = max_client_response_size; return {info}; } @@ -132,7 +132,7 @@ namespace ccf::pal::snp const std::string& microcode, const ProductName& product_name, size_t max_retries_count, - size_t max_response_size) + size_t max_client_response_size) { std::map params; params["blSPL"] = boot_loader; @@ -149,14 +149,14 @@ namespace ccf::pal::snp true // DER }; leaf.max_retries_count = max_retries_count; - leaf.max_response_size = max_response_size; + leaf.max_client_response_size = max_client_response_size; EndorsementEndpointsConfiguration::EndpointInfo chain{ endpoint.host, endpoint.port, fmt::format("/vcek/v1/{}/cert_chain", to_string(product_name)), {}}; chain.max_retries_count = max_retries_count; - chain.max_response_size = max_response_size; + chain.max_client_response_size = max_client_response_size; server.push_back(leaf); server.push_back(chain); @@ -172,7 +172,7 @@ namespace ccf::pal::snp const std::string& chip_id_hex, const std::string& reported_tcb, size_t max_retries_count, - size_t max_response_size) + size_t max_client_response_size) { std::map params; params["tcbVersion"] = reported_tcb; @@ -188,7 +188,7 @@ namespace ccf::pal::snp false // No TLS }; info.max_retries_count = max_retries_count; - info.max_response_size = max_response_size; + info.max_client_response_size = max_client_response_size; 
return {info}; } diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index 7029b881bd69..eaa7dba49071 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -255,7 +255,7 @@ namespace ccf std::move(url), std::move(headers), nullptr, - std::make_unique(endpoint.max_response_size), + std::make_unique(endpoint.max_client_response_size), std::move(response_callback)); // Start watchdog to send request on new server if it is unresponsive From 6e05563b0a0235c91262a91484addaeb2f4f9ef9 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 22 Aug 2025 13:33:27 +0100 Subject: [PATCH 083/197] reboop --- include/ccf/pal/attestation_sev_snp.h | 3 ++- src/node/quote_endorsements_client.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/ccf/pal/attestation_sev_snp.h b/include/ccf/pal/attestation_sev_snp.h index 6c11940ebf49..307ebaa1610d 100644 --- a/include/ccf/pal/attestation_sev_snp.h +++ b/include/ccf/pal/attestation_sev_snp.h @@ -449,7 +449,8 @@ pRb21iI1NlNCfOGUPIhVpWECAwEAAQ== size_t max_retries_count = server.max_retries_count.value_or(default_max_retries_count); size_t max_client_response_size = - server.max_client_response_size.value_or(default_max_client_response_size); + server.max_client_response_size.value_or( + default_max_client_response_size); switch (server.type) { case EndorsementsEndpointType::Azure: diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index eaa7dba49071..a52e5b26165e 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -255,7 +255,8 @@ namespace ccf std::move(url), std::move(headers), nullptr, - std::make_unique(endpoint.max_client_response_size), + std::make_unique( + endpoint.max_client_response_size), std::move(response_callback)); // Start watchdog to send request on new server if it is unresponsive From 6fa1587a93393578dbc900b508cb18365b750cde Mon Sep 17 
00:00:00 2001 From: cjen1-msft Date: Fri, 22 Aug 2025 17:41:09 +0100 Subject: [PATCH 084/197] Ensure attaching request check curl_request_curlm --- src/http/curl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index 56ef74dcef37..7b5653051f89 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -812,6 +812,10 @@ namespace ccf::curl void attach_request(std::unique_ptr& request) { std::lock_guard lock(curlm_lock); + if (curl_request_curlm == nullptr) { + LOG_FAIL_FMT("CurlmLibuvContext already closed, cannot attach request"); + return; + } curl_request_curlm.attach_curl_request(request); } From 51b4f3b90431fb85b4a4b97e6e4faa765548e758 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 10:38:35 +0100 Subject: [PATCH 085/197] Reformat --- src/http/curl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index 7b5653051f89..498c8f697747 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -812,7 +812,8 @@ namespace ccf::curl void attach_request(std::unique_ptr& request) { std::lock_guard lock(curlm_lock); - if (curl_request_curlm == nullptr) { + if (curl_request_curlm == nullptr) + { LOG_FAIL_FMT("CurlmLibuvContext already closed, cannot attach request"); return; } From 5a8de4af6599ad252d782f1d0dda9bb5c34f3792 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 11:23:31 +0100 Subject: [PATCH 086/197] Add trace logging of timeout actions --- src/http/curl.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index 498c8f697747..dd4fdc4d460d 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -640,6 +640,8 @@ namespace ccf::curl return; } + LOG_TRACE_FMT("Curlm Libuv timeout"); + int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, @@ -660,6 +662,8 @@ namespace ccf::curl "libuv_timeout_callback called with null self pointer"); } + LOG_TRACE_FMT("Processing curl timeout: {}ms", 
timeout_ms); + if (timeout_ms < 0) { // No timeout set, stop the timer @@ -667,7 +671,8 @@ namespace ccf::curl } else { - // If timeout is zero, this will trigger immediately + // If timeout is zero, this will trigger immediately, possibly within a + // callback so clamp it to at least 1ms timeout_ms = std::max(timeout_ms, 1L); uv_timer_start(&self->uv_handle, libuv_timeout_callback, timeout_ms, 0); } @@ -831,6 +836,8 @@ namespace ccf::curl { std::lock_guard lock(curlm_lock); + LOG_TRACE_FMT("Closing CurlmLibuvContext"); + // Prevent multiple close calls if (curl_request_curlm == nullptr) { From 02bb53366153af03a6cf3749702496ece199950c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 11:25:29 +0100 Subject: [PATCH 087/197] Add tests for slow requests and timed out requests. --- src/http/test/curl_test.cpp | 150 ++++++++++++++++++++++++++++++++++-- 1 file changed, 142 insertions(+), 8 deletions(-) diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 93094f7f63c3..cd49e6324a1c 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -31,13 +31,11 @@ struct Data DECLARE_JSON_TYPE(Data); DECLARE_JSON_REQUIRED_FIELDS(Data, foo, bar, iter); -constexpr size_t number_requests = 1000; - TEST_CASE("Synchronous") { Data data = {.foo = "alpha", .bar = "beta"}; size_t response_count = 0; - constexpr size_t sync_number_requests = number_requests / 10; + constexpr size_t sync_number_requests = 100; for (int i = 0; i < sync_number_requests; ++i) { data.iter = i; @@ -71,16 +69,16 @@ TEST_CASE("Synchronous") REQUIRE(response_count == sync_number_requests); } -static size_t response_count = 0; - TEST_CASE("CurlmLibuvContext") { + size_t response_count = 0; + constexpr size_t number_requests = 1000; auto load_generator = [](uv_work_t* req) { thread_local std::random_device rd; thread_local std::mt19937 gen(rd()); constexpr size_t max_delay_ms = 10; thread_local std::uniform_int_distribution<> uniform_dist(1, max_delay_ms); - 
(void)req; + auto* response_count_ptr = reinterpret_cast(req->data); Data data = {.foo = "alpha", .bar = "beta"}; for (int i = 0; i < number_requests; ++i) { @@ -96,7 +94,7 @@ TEST_CASE("CurlmLibuvContext") auto curl_handle = ccf::curl::UniqueCURL(); - auto response_callback = []( + auto response_callback = [response_count_ptr]( ccf::curl::CurlRequest& request, CURLcode curl_response, long status_code) { @@ -104,7 +102,7 @@ TEST_CASE("CurlmLibuvContext") constexpr size_t HTTP_SUCCESS = 200; if (curl_response == CURLE_OK && status_code == HTTP_SUCCESS) { - response_count++; + (*response_count_ptr)++; } }; @@ -126,12 +124,148 @@ TEST_CASE("CurlmLibuvContext") ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); uv_work_t work_req; + work_req.data = &response_count; uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); uv_run(uv_default_loop(), UV_RUN_DEFAULT); } REQUIRE(response_count == number_requests); } +TEST_CASE("CurlmLibuvContext slow") +{ + size_t response_count = 0; + constexpr size_t slow_number_requests = 10; + auto load_generator = [](uv_work_t* req) { + thread_local std::random_device rd; + thread_local std::mt19937 gen(rd()); + constexpr size_t max_delay_ms = 2000; + thread_local std::uniform_int_distribution<> uniform_dist(1, max_delay_ms); + auto* response_count_ptr = reinterpret_cast(req->data); + (void)req; + Data data = {.foo = "alpha", .bar = "beta"}; + for (int i = 0; i < slow_number_requests; ++i) + { + auto delay = uniform_dist(gen); + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + + data.iter = i; + std::string url = fmt::format("http://localhost:8080/{}", i); + auto body = std::make_unique(data); + + auto headers = ccf::curl::UniqueSlist(); + headers.append("Content-Type", "application/json"); + + auto curl_handle = ccf::curl::UniqueCURL(); + + auto response_callback = [response_count_ptr]( + ccf::curl::CurlRequest& request, + CURLcode curl_response, + long status_code) { + (void)request; + 
constexpr size_t HTTP_SUCCESS = 200; + if (curl_response == CURLE_OK && status_code == HTTP_SUCCESS) + { + (*response_count_ptr)++; + } + }; + + auto request = std::make_unique( + std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + std::move(body), + std::make_unique(SIZE_MAX), + std::move(response_callback)); + + ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + request); + } + }; + + { + ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); + + uv_work_t work_req; + work_req.data = &response_count; + uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); + uv_run(uv_default_loop(), UV_RUN_DEFAULT); + } + REQUIRE(response_count == slow_number_requests); +} + +TEST_CASE("CurlmLibuvContext timeouts") +{ + size_t response_count = 0; + constexpr size_t number_requests = 1000; + + auto load_generator = [](uv_work_t* req) { + thread_local std::random_device rd; + thread_local std::mt19937 gen(rd()); + constexpr size_t max_delay_ms = 40; + thread_local std::uniform_int_distribution<> uniform_dist(1, max_delay_ms); + auto* response_count_ptr = reinterpret_cast(req->data); + (void)req; + + Data data = {.foo = "alpha", .bar = "beta"}; + for (int i = 0; i < number_requests; ++i) + { + auto delay = uniform_dist(gen); + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + + data.iter = i; + + // 192.0.2.0/24 (TEST-NET-1) is reserved (RFC 5737) and should be + // unroutable. 
+ const std::string unreachable_base = "http://192.0.2.1:65535"; + std::string url = fmt::format("{}/{}", unreachable_base, i); + auto body = std::make_unique(data); + + auto headers = ccf::curl::UniqueSlist(); + headers.append("Content-Type", "application/json"); + + auto curl_handle = ccf::curl::UniqueCURL(); + curl_handle.set_opt(CURLOPT_TIMEOUT_MS, max_delay_ms); + + auto response_callback = [response_count_ptr]( + ccf::curl::CurlRequest& request, + CURLcode curl_response, + long status_code) { + (void)request; + // We expect all to fail to connect; count only unexpected successes. + constexpr size_t HTTP_SUCCESS = 200; + if (curl_response == CURLE_OK && status_code == HTTP_SUCCESS) + { + (*response_count_ptr)++; + } + }; + + auto request = std::make_unique( + std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + std::move(body), + std::make_unique(SIZE_MAX), + std::move(response_callback)); + + ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + request); + } + }; + + { + ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); + + uv_work_t work_req; + work_req.data = &response_count; + uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); + uv_run(uv_default_loop(), UV_RUN_DEFAULT); + } + // All should fail to reach the unreachable host. 
+ REQUIRE(response_count == 0); +} + int main(int argc, char** argv) { ccf::logger::config::default_init(); From ff72e769856d2f6925fe4a606b3a565c88c5c6cb Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 11:26:27 +0100 Subject: [PATCH 088/197] Make e2e_curl a long test --- CMakeLists.txt | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1bfcd7d101bc..889022297592 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1182,9 +1182,11 @@ if(BUILD_TESTS) ) target_link_libraries(curl_test PRIVATE curl uv http_parser) - add_e2e_test( - NAME e2e_curl PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_curl.py - ) + if(LONG_TESTS) + add_e2e_test( + NAME e2e_curl PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_curl.py + ) + endif() endif() endif() From 1dc290c4d8134bd7ff680e7509f0bbb2855aaac6 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 13:47:15 +0100 Subject: [PATCH 089/197] Add logging on all curl requests --- src/http/curl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index dd4fdc4d460d..7097b294b7fc 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -455,6 +455,7 @@ namespace ccf::curl void handle_response(CURLcode curl_response_code) { + LOG_TRACE_FMT("Handling response for {}", url); if (response_callback.has_value()) { long status_code = 0; @@ -526,6 +527,7 @@ namespace ccf::curl { throw std::logic_error("Cannot attach a null CurlRequest"); } + LOG_TRACE_FMT("Attaching CurlRequest to {} to Curlm", request->get_url()); CURL* curl_handle = request->get_easy_handle(); CHECK_CURL_EASY_SETOPT(curl_handle, CURLOPT_PRIVATE, request.release()); CHECK_CURL_MULTI(curl_multi_add_handle, p.get(), curl_handle); From fd39352293644965bf3b35a7ffea012ee0199f08 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 13:52:10 +0100 Subject: [PATCH 090/197] Add debug print for all unclosed uv handles --- src/host/run.cpp | 11 ++++++++++- 1 file 
changed, 10 insertions(+), 1 deletion(-) diff --git a/src/host/run.cpp b/src/host/run.cpp index 87f460f899c1..2a227cc3d57d 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -1001,7 +1001,7 @@ namespace ccf { thread.join(); } - } + @} process_launcher.stop(); @@ -1026,6 +1026,15 @@ namespace ccf { LOG_FAIL_FMT( "Failed to close uv loop cleanly: {}", uv_err_name(loop_close_rc)); + // walk loop to diagnose unclosed handles + auto cb = [](uv_handle_t* handle, void* arg) { + (void)arg; + LOG_FAIL_FMT( + "Leaked handle: type={}, ptr={}", + uv_handle_type_name(handle->type), + fmt::ptr(handle)); + }; + uv_walk(uv_default_loop(), cb, nullptr); } curl_global_cleanup(); ccf::crypto::openssl_sha256_shutdown(); From 699445c6c33b5dca513ea61ea7538147cbd82a2f Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 13:53:09 +0100 Subject: [PATCH 091/197] fix --- src/host/run.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/host/run.cpp b/src/host/run.cpp index 2a227cc3d57d..fa5542a3c374 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -1001,7 +1001,7 @@ namespace ccf { thread.join(); } - @} + } process_launcher.stop(); From 6fa7169c523151ac2d400a195ee450ad6e8fc69c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 14:07:42 +0100 Subject: [PATCH 092/197] Refactor closing logic --- src/http/curl.h | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 7097b294b7fc..ddaed969746d 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -603,6 +603,7 @@ namespace ccf::curl uv_loop_t* loop; uv_timer_t uv_handle{}; CurlRequestCURLM curl_request_curlm; + std::atomic is_stopping = false; // We need a lock to prevent a client in another thread calling // curl_multi_add_handle while the libuv thread is processing a curl @@ -636,9 +637,9 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); - if (self->curl_request_curlm == nullptr) + if 
(self->is_stopping) { - LOG_FAIL_FMT("libuv_timeout_callback called with null CURLM handle"); + LOG_FAIL_FMT("libuv_timeout_callback called while stopping"); return; } @@ -664,6 +665,12 @@ namespace ccf::curl "libuv_timeout_callback called with null self pointer"); } + if (self->is_stopping) + { + LOG_FAIL_FMT("curl_timeout_callback called while stopping"); + return 0; + } + LOG_TRACE_FMT("Processing curl timeout: {}ms", timeout_ms); if (timeout_ms < 0) @@ -706,10 +713,9 @@ namespace ccf::curl } std::lock_guard lock(self->curlm_lock); - if (self->curl_request_curlm == nullptr) + if (self->is_stopping) { - LOG_FAIL_FMT( - "libuv_socket_poll_callback called with null CURLM handle"); + LOG_FAIL_FMT("libuv_socket_poll_callback called while stopping"); return; } @@ -740,12 +746,20 @@ namespace ccf::curl "curl_socket_callback called with null self pointer"); } (void)easy; + switch (action) { case CURL_POLL_IN: case CURL_POLL_OUT: case CURL_POLL_INOUT: { + // Possibly called during shutdown + if (self->is_stopping) + { + LOG_FAIL_FMT("curl_socket_callback called while stopping"); + return 0; + } + if (request_context == nullptr) { auto request_context_ptr = std::make_unique(); @@ -819,7 +833,7 @@ namespace ccf::curl void attach_request(std::unique_ptr& request) { std::lock_guard lock(curlm_lock); - if (curl_request_curlm == nullptr) + if (is_stopping) { LOG_FAIL_FMT("CurlmLibuvContext already closed, cannot attach request"); return; @@ -841,23 +855,23 @@ namespace ccf::curl LOG_TRACE_FMT("Closing CurlmLibuvContext"); // Prevent multiple close calls - if (curl_request_curlm == nullptr) + if (is_stopping) { LOG_INFO_FMT( "CurlmLibuvContext already closed, nothing to stop or remove"); return; } - UniqueCURLM curlm(std::move(curl_request_curlm)); + is_stopping = true; // remove, stop and cleanup all curl easy handles std::unique_ptr easy_handles( - curl_multi_get_handles(curlm), + curl_multi_get_handles(curl_request_curlm), [](CURL** handles) { curl_free(handles); }); // 
curl_multi_get_handles returns the handles as a null-terminated array for (size_t i = 0; easy_handles.get()[i] != nullptr; ++i) { auto* easy = easy_handles.get()[i]; - curl_multi_remove_handle(curlm, easy); + curl_multi_remove_handle(curl_request_curlm, easy); if (easy != nullptr) { // attach a lifetime to the request From 068cc59d10850b1a4ae528132212f8976079b1bd Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 15:11:57 +0100 Subject: [PATCH 093/197] Improve lifetime handling of the requestcontext uv_handle --- src/http/curl.h | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index ddaed969746d..3b2b055c84b5 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -576,6 +576,16 @@ namespace ccf::curl } }; + class CurlmLibuvContextImpl; + + class CurlmLibuvRequestContextImpl : asynchost::with_uv_handle{ + friend class CurlmLibuvContextImpl; + public: + curl_socket_t socket; + CurlmLibuvContextImpl* context; + }; + using CurlmLibuvRequestContext = std::shared_ptr; + class CurlmLibuvContextImpl { /* Very high level: @@ -614,13 +624,6 @@ namespace ccf::curl // be recursive. 
std::recursive_mutex curlm_lock; - struct RequestContext - { - uv_poll_t poll_handle; - curl_socket_t socket; - CurlmLibuvContextImpl* context; - }; - public: void handle_request_messages() { @@ -698,14 +701,14 @@ namespace ccf::curl return; } - auto* request_context = static_cast(req->data); + auto* request_context = static_cast(req->data); if (request_context == nullptr) { throw std::logic_error( "libuv_socket_poll_callback called with null request context"); } - auto* self = request_context->context; + auto* self = request_context->get()->context; if (self == nullptr) { throw std::logic_error( @@ -726,7 +729,7 @@ namespace ccf::curl CHECK_CURL_MULTI( curl_multi_socket_action, self->curl_request_curlm, - request_context->socket, + request_context->get()->socket, action, &running_handles); self->handle_request_messages(); @@ -738,7 +741,7 @@ namespace ccf::curl curl_socket_t s, int action, CurlmLibuvContextImpl* self, - RequestContext* request_context) + CurlmLibuvRequestContext* request_context) { if (self == nullptr) { @@ -762,12 +765,12 @@ namespace ccf::curl if (request_context == nullptr) { - auto request_context_ptr = std::make_unique(); - request_context_ptr->context = self; - request_context_ptr->socket = s; + auto request_context_ptr = std::make_unique(); + request_context_ptr->get()->context = self; + request_context_ptr->get()->socket = s; uv_poll_init_socket( - self->loop, &request_context_ptr->poll_handle, s); - request_context_ptr->poll_handle.data = + self->loop, &request_context_ptr->get()->uv_handle, s); + request_context_ptr->get()->uv_handle.data = request_context_ptr.get(); // Attach the context // attach the lifetime to the socket handle request_context = request_context_ptr.release(); @@ -779,14 +782,14 @@ namespace ccf::curl events |= (action == CURL_POLL_IN) ? 0 : UV_WRITABLE; events |= (action == CURL_POLL_OUT) ? 
0 : UV_READABLE; uv_poll_start( - &request_context->poll_handle, events, libuv_socket_poll_callback); + &request_context->get()->uv_handle, events, libuv_socket_poll_callback); break; } case CURL_POLL_REMOVE: if (request_context != nullptr) { - uv_poll_stop(&request_context->poll_handle); - std::unique_ptr request_context_ptr( + uv_poll_stop(&request_context->get()->uv_handle); + std::unique_ptr request_context_ptr( request_context); curl_multi_assign(self->curl_request_curlm, s, nullptr); } From 9e0ae3df7a585fde7e57c5d240515f11a992f380 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 16:35:31 +0100 Subject: [PATCH 094/197] Revert "Improve lifetime handling of the requestcontext uv_handle" This reverts commit 068cc59d10850b1a4ae528132212f8976079b1bd. --- src/http/curl.h | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 3b2b055c84b5..ddaed969746d 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -576,16 +576,6 @@ namespace ccf::curl } }; - class CurlmLibuvContextImpl; - - class CurlmLibuvRequestContextImpl : asynchost::with_uv_handle{ - friend class CurlmLibuvContextImpl; - public: - curl_socket_t socket; - CurlmLibuvContextImpl* context; - }; - using CurlmLibuvRequestContext = std::shared_ptr; - class CurlmLibuvContextImpl { /* Very high level: @@ -624,6 +614,13 @@ namespace ccf::curl // be recursive. 
std::recursive_mutex curlm_lock; + struct RequestContext + { + uv_poll_t poll_handle; + curl_socket_t socket; + CurlmLibuvContextImpl* context; + }; + public: void handle_request_messages() { @@ -701,14 +698,14 @@ namespace ccf::curl return; } - auto* request_context = static_cast(req->data); + auto* request_context = static_cast(req->data); if (request_context == nullptr) { throw std::logic_error( "libuv_socket_poll_callback called with null request context"); } - auto* self = request_context->get()->context; + auto* self = request_context->context; if (self == nullptr) { throw std::logic_error( @@ -729,7 +726,7 @@ namespace ccf::curl CHECK_CURL_MULTI( curl_multi_socket_action, self->curl_request_curlm, - request_context->get()->socket, + request_context->socket, action, &running_handles); self->handle_request_messages(); @@ -741,7 +738,7 @@ namespace ccf::curl curl_socket_t s, int action, CurlmLibuvContextImpl* self, - CurlmLibuvRequestContext* request_context) + RequestContext* request_context) { if (self == nullptr) { @@ -765,12 +762,12 @@ namespace ccf::curl if (request_context == nullptr) { - auto request_context_ptr = std::make_unique(); - request_context_ptr->get()->context = self; - request_context_ptr->get()->socket = s; + auto request_context_ptr = std::make_unique(); + request_context_ptr->context = self; + request_context_ptr->socket = s; uv_poll_init_socket( - self->loop, &request_context_ptr->get()->uv_handle, s); - request_context_ptr->get()->uv_handle.data = + self->loop, &request_context_ptr->poll_handle, s); + request_context_ptr->poll_handle.data = request_context_ptr.get(); // Attach the context // attach the lifetime to the socket handle request_context = request_context_ptr.release(); @@ -782,14 +779,14 @@ namespace ccf::curl events |= (action == CURL_POLL_IN) ? 0 : UV_WRITABLE; events |= (action == CURL_POLL_OUT) ? 
0 : UV_READABLE; uv_poll_start( - &request_context->get()->uv_handle, events, libuv_socket_poll_callback); + &request_context->poll_handle, events, libuv_socket_poll_callback); break; } case CURL_POLL_REMOVE: if (request_context != nullptr) { - uv_poll_stop(&request_context->get()->uv_handle); - std::unique_ptr request_context_ptr( + uv_poll_stop(&request_context->poll_handle); + std::unique_ptr request_context_ptr( request_context); curl_multi_assign(self->curl_request_curlm, s, nullptr); } From 6830d02f054541476c8cf0a906ca08aa6e9da747 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 26 Aug 2025 16:39:08 +0100 Subject: [PATCH 095/197] Just close the handle when closing the socket --- src/http/curl.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index ddaed969746d..bbccab12938f 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -785,7 +785,11 @@ namespace ccf::curl case CURL_POLL_REMOVE: if (request_context != nullptr) { + // close then delete the poll handle uv_poll_stop(&request_context->poll_handle); + uv_close( + reinterpret_cast(&request_context->poll_handle), + nullptr); std::unique_ptr request_context_ptr( request_context); curl_multi_assign(self->curl_request_curlm, s, nullptr); From a4235ac75c98ceebef85671c8a6af1bdca274e9a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 14:10:21 +0100 Subject: [PATCH 096/197] Use a queue to manage curl requests. the uv_timer_t is not thread-safe and even though there was a lock around it, caused the timer to not trigger. 
--- src/http/curl.h | 152 +++++++++++++++++++++--------------- src/http/test/curl_test.cpp | 9 ++- tests/e2e_curl.py | 10 ++- 3 files changed, 103 insertions(+), 68 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index bbccab12938f..7a2a5cc93136 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -516,7 +516,7 @@ namespace ccf::curl class CurlRequestCURLM : public UniqueCURLM { public: - void attach_curl_request(std::unique_ptr& request) + void attach_curl_request(std::unique_ptr&& request) { if (p == nullptr) { @@ -576,6 +576,7 @@ namespace ccf::curl } }; + // Must be created on the same thread as the uv loop is running class CurlmLibuvContextImpl { /* Very high level: @@ -605,28 +606,49 @@ namespace ccf::curl CurlRequestCURLM curl_request_curlm; std::atomic is_stopping = false; - // We need a lock to prevent a client in another thread calling - // curl_multi_add_handle while the libuv thread is processing a curl - // callback - // - // Note that since the a client callback can call curl_multi_add_handle, - // but that will be difficult/impossible to detect, we need curlm_lock to - // be recursive. 
- std::recursive_mutex curlm_lock; - - struct RequestContext + class SocketContext : public asynchost::with_uv_handle { - uv_poll_t poll_handle; - curl_socket_t socket; - CurlmLibuvContextImpl* context; + friend class CurlmLibuvContextImpl; + + public: + curl_socket_t socket{}; + CurlmLibuvContextImpl* context = nullptr; }; - public: - void handle_request_messages() + uv_async_t async_requests_handle{}; + std::mutex requests_mutex; + std::deque> pending_requests; + + static void async_requests_callback(uv_async_t* handle) { - curl_request_curlm.perform(); + auto* self = static_cast(handle->data); + if (self == nullptr) + { + throw std::logic_error( + "async_requests_callback called with null self pointer"); + } + + if (self->is_stopping) + { + LOG_FAIL_FMT("async_requests_callback called while stopping"); + return; + } + + LOG_TRACE_FMT("Libuv: processing pending curl requests"); + + std::deque> requests_to_add; + { + std::lock_guard requests_lock(self->requests_mutex); + requests_to_add.swap(self->pending_requests); + } + + for (auto& req : requests_to_add) + { + self->curl_request_curlm.attach_curl_request(std::move(req)); + } } + public: static void libuv_timeout_callback(uv_timer_t* handle) { auto* self = static_cast(handle->data); @@ -635,7 +657,6 @@ namespace ccf::curl throw std::logic_error( "libuv_timeout_callback called with null self pointer"); } - std::lock_guard lock(self->curlm_lock); if (self->is_stopping) { @@ -643,7 +664,7 @@ namespace ccf::curl return; } - LOG_TRACE_FMT("Curlm Libuv timeout"); + LOG_TRACE_FMT("Libuv timeout"); int running_handles = 0; CHECK_CURL_MULTI( @@ -652,7 +673,7 @@ namespace ccf::curl CURL_SOCKET_TIMEOUT, 0, &running_handles); - self->handle_request_messages(); + self->curl_request_curlm.perform(); } static int curl_timeout_callback( @@ -671,7 +692,7 @@ namespace ccf::curl return 0; } - LOG_TRACE_FMT("Processing curl timeout: {}ms", timeout_ms); + LOG_TRACE_FMT("Curl timeout {}ms", timeout_ms); if (timeout_ms < 0) { @@ 
-698,20 +719,19 @@ namespace ccf::curl return; } - auto* request_context = static_cast(req->data); - if (request_context == nullptr) + auto* socket_context = static_cast(req->data); + if (socket_context == nullptr) { throw std::logic_error( "libuv_socket_poll_callback called with null request context"); } - auto* self = request_context->context; + auto* self = socket_context->context; if (self == nullptr) { throw std::logic_error( "libuv_socket_poll_callback called with null self pointer"); } - std::lock_guard lock(self->curlm_lock); if (self->is_stopping) { @@ -719,6 +739,11 @@ namespace ccf::curl return; } + LOG_TRACE_FMT( + "Libuv socket poll callback on {}: {}", + static_cast(socket_context->socket), + static_cast(events)); + int action = 0; action |= ((events & UV_READABLE) != 0) ? CURL_CSELECT_IN : 0; action |= ((events & UV_WRITABLE) != 0) ? CURL_CSELECT_OUT : 0; @@ -726,10 +751,10 @@ namespace ccf::curl CHECK_CURL_MULTI( curl_multi_socket_action, self->curl_request_curlm, - request_context->socket, + socket_context->socket, action, &running_handles); - self->handle_request_messages(); + self->curl_request_curlm.perform(); } // Called when the status of a socket changes (creation/deletion) @@ -738,7 +763,7 @@ namespace ccf::curl curl_socket_t s, int action, CurlmLibuvContextImpl* self, - RequestContext* request_context) + SocketContext* socket_context) { if (self == nullptr) { @@ -760,39 +785,39 @@ namespace ccf::curl return 0; } - if (request_context == nullptr) + LOG_INFO_FMT( + "Curl socket callback: listen on socket {}", static_cast(s)); + if (socket_context == nullptr) { - auto request_context_ptr = std::make_unique(); + auto request_context_ptr = std::make_unique(); request_context_ptr->context = self; request_context_ptr->socket = s; - uv_poll_init_socket( - self->loop, &request_context_ptr->poll_handle, s); - request_context_ptr->poll_handle.data = + uv_poll_init_socket(self->loop, &request_context_ptr->uv_handle, s); + 
request_context_ptr->uv_handle.data = request_context_ptr.get(); // Attach the context // attach the lifetime to the socket handle - request_context = request_context_ptr.release(); + socket_context = request_context_ptr.release(); CHECK_CURL_MULTI( - curl_multi_assign, self->curl_request_curlm, s, request_context); + curl_multi_assign, self->curl_request_curlm, s, socket_context); } int events = 0; - events |= (action == CURL_POLL_IN) ? 0 : UV_WRITABLE; - events |= (action == CURL_POLL_OUT) ? 0 : UV_READABLE; + events |= (action != CURL_POLL_IN) ? UV_WRITABLE : 0; + events |= (action != CURL_POLL_OUT) ? UV_READABLE : 0; + uv_poll_start( - &request_context->poll_handle, events, libuv_socket_poll_callback); + &socket_context->uv_handle, events, libuv_socket_poll_callback); break; } case CURL_POLL_REMOVE: - if (request_context != nullptr) + if (socket_context != nullptr) { - // close then delete the poll handle - uv_poll_stop(&request_context->poll_handle); - uv_close( - reinterpret_cast(&request_context->poll_handle), - nullptr); - std::unique_ptr request_context_ptr( - request_context); + LOG_INFO_FMT( + "CurlmLibuv: curl socket callback: remove socket {}", + static_cast(s)); curl_multi_assign(self->curl_request_curlm, s, nullptr); + uv_poll_stop(&socket_context->uv_handle); + std::unique_ptr socket_context_ptr(socket_context); } break; default: @@ -806,6 +831,12 @@ namespace ccf::curl uv_timer_init(loop, &uv_handle); uv_handle.data = this; // Attach this instance to the timer + uv_async_init(loop, &async_requests_handle, async_requests_callback); + async_requests_handle.data = this; + uv_unref(reinterpret_cast( + &async_requests_handle)); // allow the loop to exit if this is the only + // active handle + // attach timeouts CHECK_CURL_MULTI( curl_multi_setopt, curl_request_curlm, CURLMOPT_TIMERDATA, this); @@ -823,26 +854,19 @@ namespace ccf::curl curl_request_curlm, CURLMOPT_SOCKETFUNCTION, curl_socket_callback); - - // kickstart timeout, probably a no-op but 
allows curl to initialise - int running_handles = 0; - CHECK_CURL_MULTI( - curl_multi_socket_action, - curl_request_curlm, - CURL_SOCKET_TIMEOUT, - 0, - &running_handles); } - void attach_request(std::unique_ptr& request) + void attach_request(std::unique_ptr&& request) { - std::lock_guard lock(curlm_lock); if (is_stopping) { LOG_FAIL_FMT("CurlmLibuvContext already closed, cannot attach request"); return; } - curl_request_curlm.attach_curl_request(request); + LOG_INFO_FMT("Adding request to {} to queue", request->get_url()); + std::lock_guard requests_lock(requests_mutex); + pending_requests.push_back(std::move(request)); + uv_async_send(&async_requests_handle); } private: @@ -850,12 +874,11 @@ namespace ccf::curl // Make the templated asynchost::close_ptr a friend so it can call close() template friend class ::asynchost::close_ptr; + size_t closed_uv_handle_count = 0; // called by the close_ptr within the destructor of the proxy_ptr void close() { - std::lock_guard lock(curlm_lock); - LOG_TRACE_FMT("Closing CurlmLibuvContext"); // Prevent multiple close calls @@ -891,11 +914,18 @@ namespace ccf::curl } } // Dispatch uv_close to asynchronously close the timer handle + uv_close( + reinterpret_cast(&async_requests_handle), on_close); uv_close(reinterpret_cast(&uv_handle), on_close); } static void on_close(uv_handle_t* handle) { - static_cast(handle->data)->on_close(); + auto& close_count = + static_cast(handle->data)->closed_uv_handle_count; + if (close_count++ >= 2) + { + static_cast(handle->data)->on_close(); + } } void on_close() { diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index cd49e6324a1c..84a1699f1960 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -93,6 +93,7 @@ TEST_CASE("CurlmLibuvContext") headers.append("Content-Type", "application/json"); auto curl_handle = ccf::curl::UniqueCURL(); + curl_handle.set_opt(CURLOPT_FORBID_REUSE, 1L); auto response_callback = [response_count_ptr]( 
ccf::curl::CurlRequest& request, @@ -116,7 +117,7 @@ TEST_CASE("CurlmLibuvContext") std::move(response_callback)); ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( - request); + std::move(request)); } }; @@ -156,6 +157,7 @@ TEST_CASE("CurlmLibuvContext slow") headers.append("Content-Type", "application/json"); auto curl_handle = ccf::curl::UniqueCURL(); + curl_handle.set_opt(CURLOPT_FORBID_REUSE, 1L); auto response_callback = [response_count_ptr]( ccf::curl::CurlRequest& request, @@ -179,7 +181,7 @@ TEST_CASE("CurlmLibuvContext slow") std::move(response_callback)); ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( - request); + std::move(request)); } }; @@ -226,6 +228,7 @@ TEST_CASE("CurlmLibuvContext timeouts") auto curl_handle = ccf::curl::UniqueCURL(); curl_handle.set_opt(CURLOPT_TIMEOUT_MS, max_delay_ms); + curl_handle.set_opt(CURLOPT_FORBID_REUSE, 1L); auto response_callback = [response_count_ptr]( ccf::curl::CurlRequest& request, @@ -250,7 +253,7 @@ TEST_CASE("CurlmLibuvContext timeouts") std::move(response_callback)); ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( - request); + std::move(request)); } }; diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index 6a8a2ee3ce52..061da037a780 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -15,10 +15,12 @@ async def echo_handler(request): time_received = datetime.now(UTC) - # Add random delay between 0 and 1 second - delay = random.random() / 100 # Returns float between 0.0 and 1.0 + # Add random delay between 0 and 10 millisecond + delay = random.random() / 100 await asyncio.sleep(delay) + #print(f"{datetime.now(UTC)}: Replying to request to {request.path_qs} after {delay:.3f}s delay") + # Build response data response_data = { "headers": headers, @@ -40,10 +42,10 @@ async def main(): runner = web.AppRunner(app) await runner.setup() - site = web.TCPSite(runner, "127.0.0.1", 8080) + site = web.TCPSite(runner, "::1", 8080) await site.start() 
- print("Echo server running on http://127.0.0.1:8080") + print("Echo server running on http://::1:8080") # call ./curl_test to run the load generator cmd = "./curl_test" From aeeef5f96d207ab7a1d86bb64d7af663d574fea3 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 14:25:57 +0100 Subject: [PATCH 097/197] Fix test --- src/http/test/curl_test.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 84a1699f1960..4cb2be89cef4 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -122,8 +122,6 @@ TEST_CASE("CurlmLibuvContext") }; { - ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); - uv_work_t work_req; work_req.data = &response_count; uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); @@ -186,8 +184,6 @@ TEST_CASE("CurlmLibuvContext slow") }; { - ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); - uv_work_t work_req; work_req.data = &response_count; uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); @@ -258,8 +254,6 @@ TEST_CASE("CurlmLibuvContext timeouts") }; { - ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); - uv_work_t work_req; work_req.data = &response_count; uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); @@ -273,9 +267,12 @@ int main(int argc, char** argv) { ccf::logger::config::default_init(); curl_global_init(CURL_GLOBAL_DEFAULT); + ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); + doctest::Context context; context.applyCommandLine(argc, argv); int res = context.run(); + curl_global_cleanup(); return res; } From a04a755b5374803d918e669c6851b952b16a01d1 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 14:26:53 +0100 Subject: [PATCH 098/197] move for attachment --- src/node/quote_endorsements_client.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index a52e5b26165e..f9cd11c7cd1f 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -320,7 +320,7 @@ namespace ccf "Fetching endorsements for attestation report at {}", request->get_url()); - curl::CurlmLibuvContextSingleton::get_instance()->attach_request(request); + curl::CurlmLibuvContextSingleton::get_instance()->attach_request(std::move(request)); } public: From 20041eaaede0ede34bb39e55c2b7a1ff624e5ffd Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 14:28:04 +0100 Subject: [PATCH 099/197] fmt --- src/http/curl.h | 4 ++-- src/node/quote_endorsements_client.h | 3 ++- tests/e2e_curl.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 7a2a5cc93136..256b5d52c79b 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -920,8 +920,8 @@ namespace ccf::curl } static void on_close(uv_handle_t* handle) { - auto& close_count = - static_cast(handle->data)->closed_uv_handle_count; + auto& close_count = static_cast(handle->data) + ->closed_uv_handle_count; if (close_count++ >= 2) { static_cast(handle->data)->on_close(); diff --git a/src/node/quote_endorsements_client.h b/src/node/quote_endorsements_client.h index f9cd11c7cd1f..48052b6a066c 100644 --- a/src/node/quote_endorsements_client.h +++ b/src/node/quote_endorsements_client.h @@ -320,7 +320,8 @@ namespace ccf "Fetching endorsements for attestation report at {}", request->get_url()); - curl::CurlmLibuvContextSingleton::get_instance()->attach_request(std::move(request)); + curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + std::move(request)); } public: diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index 061da037a780..565fdd1ec156 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -19,7 +19,7 @@ async def echo_handler(request): delay = random.random() / 100 await asyncio.sleep(delay) - 
#print(f"{datetime.now(UTC)}: Replying to request to {request.path_qs} after {delay:.3f}s delay") + # print(f"{datetime.now(UTC)}: Replying to request to {request.path_qs} after {delay:.3f}s delay") # Build response data response_data = { From 9a2ba9aaa8bdd680bf413457bfa514f4e3359fa8 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 15:53:24 +0100 Subject: [PATCH 100/197] Revert "Fix test" This reverts commit aeeef5f96d207ab7a1d86bb64d7af663d574fea3. --- src/http/test/curl_test.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 4cb2be89cef4..84a1699f1960 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -122,6 +122,8 @@ TEST_CASE("CurlmLibuvContext") }; { + ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); + uv_work_t work_req; work_req.data = &response_count; uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); @@ -184,6 +186,8 @@ TEST_CASE("CurlmLibuvContext slow") }; { + ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); + uv_work_t work_req; work_req.data = &response_count; uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); @@ -254,6 +258,8 @@ TEST_CASE("CurlmLibuvContext timeouts") }; { + ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); + uv_work_t work_req; work_req.data = &response_count; uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); @@ -267,12 +273,9 @@ int main(int argc, char** argv) { ccf::logger::config::default_init(); curl_global_init(CURL_GLOBAL_DEFAULT); - ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); - doctest::Context context; context.applyCommandLine(argc, argv); int res = context.run(); - curl_global_cleanup(); return res; } From 03f0590fb9260f68f84cce2162feb92ef09848cf Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 17:11:41 +0100 Subject: [PATCH 101/197] All 
instantiating new proxy_ptrs from a pointer --- src/host/proxy.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/host/proxy.h b/src/host/proxy.h index 9234816b6522..bc9d1c030084 100644 --- a/src/host/proxy.h +++ b/src/host/proxy.h @@ -25,6 +25,7 @@ namespace asynchost { raw = new T(std::forward(args)...); } + close_ptr(T* that) : raw(that) {} ~close_ptr() { @@ -56,6 +57,7 @@ namespace asynchost proxy_ptr(const proxy_ptr& that) : internal(that.internal) {} proxy_ptr(proxy_ptr&& that) : internal(std::move(that.internal)) {} proxy_ptr(std::nullptr_t that) : internal(that) {} + proxy_ptr(T* that) : internal(std::make_shared>(that)) {} template proxy_ptr(Args&&... args) : From a4f234c4648e7f2efa159a1bed8c7e20b294beb4 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 17:13:01 +0100 Subject: [PATCH 102/197] Fix asan errors --- src/http/curl.h | 27 +++++++------ src/http/test/curl_test.cpp | 79 +++++++++++++++++++++++++++++++++++++ tests/e2e_curl.py | 13 ++++-- 3 files changed, 104 insertions(+), 15 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 256b5d52c79b..a0add789782e 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -606,7 +606,7 @@ namespace ccf::curl CurlRequestCURLM curl_request_curlm; std::atomic is_stopping = false; - class SocketContext : public asynchost::with_uv_handle + class SocketContextImpl : public asynchost::with_uv_handle { friend class CurlmLibuvContextImpl; @@ -615,6 +615,8 @@ namespace ccf::curl CurlmLibuvContextImpl* context = nullptr; }; + using SocketContext = asynchost::proxy_ptr; + uv_async_t async_requests_handle{}; std::mutex requests_mutex; std::deque> pending_requests; @@ -719,7 +721,7 @@ namespace ccf::curl return; } - auto* socket_context = static_cast(req->data); + auto* socket_context = static_cast(req->data); if (socket_context == nullptr) { throw std::logic_error( @@ -763,7 +765,7 @@ namespace ccf::curl curl_socket_t s, int action, CurlmLibuvContextImpl* self, - SocketContext* 
socket_context) + SocketContextImpl* socket_context) { if (self == nullptr) { @@ -789,14 +791,14 @@ namespace ccf::curl "Curl socket callback: listen on socket {}", static_cast(s)); if (socket_context == nullptr) { - auto request_context_ptr = std::make_unique(); - request_context_ptr->context = self; - request_context_ptr->socket = s; - uv_poll_init_socket(self->loop, &request_context_ptr->uv_handle, s); - request_context_ptr->uv_handle.data = - request_context_ptr.get(); // Attach the context + auto socket_context_ptr = std::make_unique(); + socket_context_ptr->context = self; + socket_context_ptr->socket = s; + uv_poll_init_socket(self->loop, &socket_context_ptr->uv_handle, s); + socket_context_ptr->uv_handle.data = + socket_context_ptr.get(); // Attach the context // attach the lifetime to the socket handle - socket_context = request_context_ptr.release(); + socket_context = socket_context_ptr.release(); CHECK_CURL_MULTI( curl_multi_assign, self->curl_request_curlm, s, socket_context); } @@ -815,9 +817,10 @@ namespace ccf::curl LOG_INFO_FMT( "CurlmLibuv: curl socket callback: remove socket {}", static_cast(s)); - curl_multi_assign(self->curl_request_curlm, s, nullptr); + SocketContext socket_context_ptr(socket_context); uv_poll_stop(&socket_context->uv_handle); - std::unique_ptr socket_context_ptr(socket_context); + CHECK_CURL_MULTI( + curl_multi_assign, self->curl_request_curlm, s, nullptr); } break; default: diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 84a1699f1960..5cb253c4f229 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -269,6 +269,85 @@ TEST_CASE("CurlmLibuvContext timeouts") REQUIRE(response_count == 0); } +TEST_CASE("CurlmLibuvContext double init") +{ + size_t response_count = 0; + constexpr size_t number_iterations = 10; + constexpr size_t number_requests = 10; + + auto load_generator = [](uv_work_t* req) { + thread_local std::random_device rd; + thread_local std::mt19937 gen(rd()); + 
constexpr size_t max_delay_ms = 40; + thread_local std::uniform_int_distribution<> uniform_dist(1, max_delay_ms); + auto* response_count_ptr = reinterpret_cast(req->data); + (void)req; + + Data data = {.foo = "alpha", .bar = "beta"}; + for (int i = 0; i < number_requests; ++i) + { + auto delay = uniform_dist(gen); + std::this_thread::sleep_for(std::chrono::milliseconds(delay)); + + data.iter = i; + + std::string url = fmt::format("http://localhost:8080/{}", i); + auto body = std::make_unique(data); + + auto headers = ccf::curl::UniqueSlist(); + headers.append("Content-Type", "application/json"); + + auto curl_handle = ccf::curl::UniqueCURL(); + curl_handle.set_opt(CURLOPT_TIMEOUT_MS, max_delay_ms); + curl_handle.set_opt(CURLOPT_FORBID_REUSE, 1L); + + auto response_callback = [response_count_ptr]( + ccf::curl::CurlRequest& request, + CURLcode curl_response, + long status_code) { + //(void)request; + LOG_INFO_FMT( + "Request to {} completed: {} ({}) {}", + request.get_url(), + curl_easy_strerror(curl_response), + curl_response, + status_code); + + // We expect all to fail to connect; count only unexpected successes. + constexpr size_t HTTP_SUCCESS = 200; + if (curl_response == CURLE_OK && status_code == HTTP_SUCCESS) + { + (*response_count_ptr)++; + } + }; + + auto request = std::make_unique( + std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + std::move(body), + std::make_unique(SIZE_MAX), + std::move(response_callback)); + + ccf::curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + std::move(request)); + } + }; + + for (int i = 0; i < number_iterations; ++i) + { + ccf::curl::CurlmLibuvContextSingleton singleton(uv_default_loop()); + + uv_work_t work_req; + work_req.data = &response_count; + uv_queue_work(uv_default_loop(), &work_req, load_generator, nullptr); + uv_run(uv_default_loop(), UV_RUN_DEFAULT); + } + // All should fail to reach the unreachable host. 
+ REQUIRE(response_count == number_iterations * number_requests); +} + int main(int argc, char** argv) { ccf::logger::config::default_init(); diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index 565fdd1ec156..b800988f18a7 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -36,7 +36,7 @@ async def echo_handler(request): return web.json_response(response_data) -async def main(): +async def main(debug): app = web.Application() app.router.add_route("*", "/{path:.*}", echo_handler) @@ -47,8 +47,11 @@ async def main(): print("Echo server running on http://::1:8080") - # call ./curl_test to run the load generator cmd = "./curl_test" + if (debug): + print(f"Run '{cmd}' to run the load generator") + # wait forever + await asyncio.Event().wait() process = await asyncio.create_subprocess_shell(cmd) await process.wait() @@ -57,5 +60,9 @@ async def main(): if __name__ == "__main__": import asyncio + import argparse + parser = argparse.ArgumentParser(description="Run echo server") + parser.add_argument("-d", "--debug", action="store_true", help="Enable debug logging") + args = parser.parse_args() - asyncio.run(main()) + asyncio.run(main(args.debug)) From 6a083b3006c66c2f2a63e04281ebbe88a968a75b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 17:15:04 +0100 Subject: [PATCH 103/197] refmt --- tests/e2e_curl.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index b800988f18a7..4e01060a7e74 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -48,7 +48,7 @@ async def main(debug): print("Echo server running on http://::1:8080") cmd = "./curl_test" - if (debug): + if debug: print(f"Run '{cmd}' to run the load generator") # wait forever await asyncio.Event().wait() @@ -61,8 +61,11 @@ async def main(debug): if __name__ == "__main__": import asyncio import argparse + parser = argparse.ArgumentParser(description="Run echo server") - parser.add_argument("-d", "--debug", action="store_true", 
help="Enable debug logging") + parser.add_argument( + "-d", "--debug", action="store_true", help="Enable debug logging" + ) args = parser.parse_args() asyncio.run(main(args.debug)) From 8b7cedd47a819f92cc1aae26df7ab4e29ea3a615 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 20:33:19 +0100 Subject: [PATCH 104/197] Don't have a default... --- tests/e2e_curl.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index 4e01060a7e74..91e5c0c11a5e 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -36,7 +36,7 @@ async def echo_handler(request): return web.json_response(response_data) -async def main(debug): +async def main(): app = web.Application() app.router.add_route("*", "/{path:.*}", echo_handler) @@ -48,10 +48,6 @@ async def main(debug): print("Echo server running on http://::1:8080") cmd = "./curl_test" - if debug: - print(f"Run '{cmd}' to run the load generator") - # wait forever - await asyncio.Event().wait() process = await asyncio.create_subprocess_shell(cmd) await process.wait() @@ -62,10 +58,4 @@ async def main(debug): import asyncio import argparse - parser = argparse.ArgumentParser(description="Run echo server") - parser.add_argument( - "-d", "--debug", action="store_true", help="Enable debug logging" - ) - args = parser.parse_args() - - asyncio.run(main(args.debug)) + asyncio.run(main()) From 31e1b5fd2d983f075bb08b5ac78449181a75d499 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 27 Aug 2025 20:52:00 +0100 Subject: [PATCH 105/197] Explicitly drain deque --- src/http/curl.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index a0add789782e..4d89c0196c92 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -916,6 +916,12 @@ namespace ccf::curl curl_easy_cleanup(easy); } } + // Drain the deque rather than letting it destruct + std::deque> requests_to_cleanup; + { + std::lock_guard requests_lock(requests_mutex); + 
requests_to_cleanup.swap(pending_requests); + } // Dispatch uv_close to asynchronously close the timer handle uv_close( reinterpret_cast(&async_requests_handle), on_close); From 29d6a9da1307ee20e18817f1c1482f7fb89e9253 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 28 Aug 2025 10:37:34 +0100 Subject: [PATCH 106/197] fmt --- tests/e2e_curl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index 91e5c0c11a5e..a8d4c73aa97c 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -56,6 +56,5 @@ async def main(): if __name__ == "__main__": import asyncio - import argparse asyncio.run(main()) From 4b910b8fd7d88c10e58522edb470871f030e8004 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 28 Aug 2025 10:52:28 +0100 Subject: [PATCH 107/197] Fix asan failure --- src/http/curl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index 4d89c0196c92..e93ab0f4a619 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -931,7 +931,8 @@ namespace ccf::curl { auto& close_count = static_cast(handle->data) ->closed_uv_handle_count; - if (close_count++ >= 2) + close_count++; + if (close_count >= 2) { static_cast(handle->data)->on_close(); } From 27be8e92098417dc557408b277cb4e68e5bde795 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 28 Aug 2025 12:02:20 +0100 Subject: [PATCH 108/197] Snags --- src/http/curl.h | 3 +-- src/http/test/curl_test.cpp | 2 +- tests/e2e_curl.py | 2 -- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index e93ab0f4a619..22fd0ecdc980 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -311,7 +311,7 @@ namespace ccf::curl { LOG_FAIL_FMT( "Expected HTTP status line as first header, got '{}'", header); - return bytes_to_read; // Not a valid HTTP response + return bytes_to_read; } } else @@ -512,7 +512,6 @@ namespace ccf::curl } }; - // non-owning wrapper around a CURLM handle which supports CurlRequest class 
CurlRequestCURLM : public UniqueCURLM { public: diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 5cb253c4f229..002f02b30c80 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -269,7 +269,7 @@ TEST_CASE("CurlmLibuvContext timeouts") REQUIRE(response_count == 0); } -TEST_CASE("CurlmLibuvContext double init") +TEST_CASE("CurlmLibuvContext multiple init") { size_t response_count = 0; constexpr size_t number_iterations = 10; diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index a8d4c73aa97c..86b80fb476f4 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -19,8 +19,6 @@ async def echo_handler(request): delay = random.random() / 100 await asyncio.sleep(delay) - # print(f"{datetime.now(UTC)}: Replying to request to {request.path_qs} after {delay:.3f}s delay") - # Build response data response_data = { "headers": headers, From 259522dd8e8ac84a1c632dd1414a305b574e6fb3 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 28 Aug 2025 14:26:15 +0100 Subject: [PATCH 109/197] Bump js max_execution_time from 1s to 5s --- include/ccf/service/tables/jsengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ccf/service/tables/jsengine.h b/include/ccf/service/tables/jsengine.h index 4d01ce543926..63c6f3ae4a18 100644 --- a/include/ccf/service/tables/jsengine.h +++ b/include/ccf/service/tables/jsengine.h @@ -14,7 +14,7 @@ namespace ccf { static constexpr size_t max_heap_bytes = 100 * 1024 * 1024; static constexpr size_t max_stack_bytes = 1024 * 1024; - static constexpr uint64_t max_execution_time_ms = 1000; + static constexpr uint64_t max_execution_time_ms = 5000; static constexpr bool log_exception_details = false; static constexpr bool return_exception_details = false; static constexpr size_t max_cached_interpreters = 10; From 47aff71f39cd2cd4d82b941874dd628e3179ad2b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 28 Aug 2025 16:36:05 +0100 Subject: [PATCH 110/197] Also bump limits test 
limit --- tests/js-custom-authorization/custom_authorization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/js-custom-authorization/custom_authorization.py b/tests/js-custom-authorization/custom_authorization.py index a5e570e82d4b..6adf671db920 100644 --- a/tests/js-custom-authorization/custom_authorization.py +++ b/tests/js-custom-authorization/custom_authorization.py @@ -224,7 +224,7 @@ def test_execution_time_limit(network, args): primary, _ = network.find_nodes() safe_time = 50 - unsafe_time = 5000 + unsafe_time = 10000 with primary.client("user0") as c: r = c.post("/app/sleep", body={"time": safe_time}) From ed0d4f09ea6dd1d2698547274490930dfa10a371 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 29 Aug 2025 09:56:12 +0100 Subject: [PATCH 111/197] e2e_curl should use a random port --- src/http/test/curl_test.cpp | 17 +++++++++++++---- tests/e2e_curl.py | 17 +++++++++++++---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 002f02b30c80..56ce24bd42e5 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -8,6 +8,7 @@ #include "curl/curl.h" #include "http/curl.h" +#include #include #include #include @@ -21,6 +22,8 @@ #define DOCTEST_CONFIG_IMPLEMENT #include +static std::string server_port = "8080"; + struct Data { std::string foo; @@ -39,7 +42,7 @@ TEST_CASE("Synchronous") for (int i = 0; i < sync_number_requests; ++i) { data.iter = i; - std::string url = fmt::format("http://localhost:8080/{}", i); + std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); auto body = std::make_unique(data); auto headers = ccf::curl::UniqueSlist(); @@ -86,7 +89,7 @@ TEST_CASE("CurlmLibuvContext") std::this_thread::sleep_for(std::chrono::milliseconds(delay)); data.iter = i; - std::string url = fmt::format("http://localhost:8080/{}", i); + std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); auto body = 
std::make_unique(data); auto headers = ccf::curl::UniqueSlist(); @@ -150,7 +153,7 @@ TEST_CASE("CurlmLibuvContext slow") std::this_thread::sleep_for(std::chrono::milliseconds(delay)); data.iter = i; - std::string url = fmt::format("http://localhost:8080/{}", i); + std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); auto body = std::make_unique(data); auto headers = ccf::curl::UniqueSlist(); @@ -291,7 +294,7 @@ TEST_CASE("CurlmLibuvContext multiple init") data.iter = i; - std::string url = fmt::format("http://localhost:8080/{}", i); + std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); auto body = std::make_unique(data); auto headers = ccf::curl::UniqueSlist(); @@ -350,6 +353,12 @@ TEST_CASE("CurlmLibuvContext multiple init") int main(int argc, char** argv) { + // NOLINTNEXTLINE(concurrency-mt-unsafe) + auto* port_ptr = std::getenv("ECHO_SERVER_PORT"); + if (port_ptr != nullptr) + { + server_port = std::string(port_ptr); + } ccf::logger::config::default_init(); curl_global_init(CURL_GLOBAL_DEFAULT); doctest::Context context; diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index 86b80fb476f4..f0bd682065d8 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -4,6 +4,7 @@ from datetime import datetime, UTC import asyncio import random +import os async def echo_handler(request): @@ -40,15 +41,23 @@ async def main(): runner = web.AppRunner(app) await runner.setup() - site = web.TCPSite(runner, "::1", 8080) + + site = web.TCPSite(runner, "::1", 0) await site.start() - print("Echo server running on http://::1:8080") + sockets = site._server.sockets + if not sockets: + raise RuntimeError("Failed to start server") + port = sockets[0].getsockname()[1] + + print(f"Echo server running on http://[::1]:{port}") + + env = os.environ.copy() + env["ECHO_SERVER_PORT"] = str(port) cmd = "./curl_test" - process = await asyncio.create_subprocess_shell(cmd) + process = await asyncio.create_subprocess_shell(cmd, env=env) await process.wait() - 
exit(process.returncode) From b93ab79d3ef556c9a0e8e72f114e617cb646637f Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 29 Aug 2025 13:49:36 +0100 Subject: [PATCH 112/197] Use ipv4 (127.0.0.1) and a random port --- src/http/test/curl_test.cpp | 16 ++++++++-------- tests/e2e_curl.py | 8 +++++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/http/test/curl_test.cpp b/src/http/test/curl_test.cpp index 56ce24bd42e5..e162ce22151d 100644 --- a/src/http/test/curl_test.cpp +++ b/src/http/test/curl_test.cpp @@ -22,7 +22,7 @@ #define DOCTEST_CONFIG_IMPLEMENT #include -static std::string server_port = "8080"; +static std::string server_address = "127.0.0.1:8080"; struct Data { @@ -42,7 +42,7 @@ TEST_CASE("Synchronous") for (int i = 0; i < sync_number_requests; ++i) { data.iter = i; - std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); + std::string url = fmt::format("http://{}/{}", server_address, i); auto body = std::make_unique(data); auto headers = ccf::curl::UniqueSlist(); @@ -89,7 +89,7 @@ TEST_CASE("CurlmLibuvContext") std::this_thread::sleep_for(std::chrono::milliseconds(delay)); data.iter = i; - std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); + std::string url = fmt::format("http://{}/{}", server_address, i); auto body = std::make_unique(data); auto headers = ccf::curl::UniqueSlist(); @@ -153,7 +153,7 @@ TEST_CASE("CurlmLibuvContext slow") std::this_thread::sleep_for(std::chrono::milliseconds(delay)); data.iter = i; - std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); + std::string url = fmt::format("http://{}/{}", server_address, i); auto body = std::make_unique(data); auto headers = ccf::curl::UniqueSlist(); @@ -294,7 +294,7 @@ TEST_CASE("CurlmLibuvContext multiple init") data.iter = i; - std::string url = fmt::format("http://[::1]:{}/{}", server_port, i); + std::string url = fmt::format("http://{}/{}", server_address, i); auto body = std::make_unique(data); auto headers = 
ccf::curl::UniqueSlist(); @@ -354,10 +354,10 @@ TEST_CASE("CurlmLibuvContext multiple init") int main(int argc, char** argv) { // NOLINTNEXTLINE(concurrency-mt-unsafe) - auto* port_ptr = std::getenv("ECHO_SERVER_PORT"); - if (port_ptr != nullptr) + auto* addr_ptr = std::getenv("ECHO_SERVER_ADDR"); + if (addr_ptr != nullptr) { - server_port = std::string(port_ptr); + server_address = std::string(addr_ptr); } ccf::logger::config::default_init(); curl_global_init(CURL_GLOBAL_DEFAULT); diff --git a/tests/e2e_curl.py b/tests/e2e_curl.py index f0bd682065d8..93b0409b4c1e 100644 --- a/tests/e2e_curl.py +++ b/tests/e2e_curl.py @@ -42,18 +42,20 @@ async def main(): runner = web.AppRunner(app) await runner.setup() - site = web.TCPSite(runner, "::1", 0) + base_addr = "127.0.0.1" + site = web.TCPSite(runner, base_addr, 0) await site.start() sockets = site._server.sockets if not sockets: raise RuntimeError("Failed to start server") port = sockets[0].getsockname()[1] + addr = f"{base_addr}:{port}" - print(f"Echo server running on http://[::1]:{port}") + print(f"Echo server running on http://{addr}") env = os.environ.copy() - env["ECHO_SERVER_PORT"] = str(port) + env["ECHO_SERVER_ADDR"] = str(addr) cmd = "./curl_test" process = await asyncio.create_subprocess_shell(cmd, env=env) From 2ead59387c5299af12b6f795e27171b4d21bc00d Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 14:29:26 +0100 Subject: [PATCH 113/197] fmt --- src/node/node_state.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index db2bafa89b30..a55a36f0967e 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -30,8 +30,8 @@ #include "crypto/certs.h" #include "ds/ccf_assert.h" #include "ds/files.h" -#include "ds/ring_buffer_types.h" #include "ds/internal_logger.h" +#include "ds/ring_buffer_types.h" #include "ds/state_machine.h" #include "ds/thread_messaging.h" #include "enclave/interface.h" @@ -2214,14 
+2214,16 @@ namespace ccf // Lexographically maximum pair std::optional> maximum; - gossip_handle->foreach([&maximum]( - const auto& iid, const auto& txid) { - if (!maximum.has_value() || maximum.value() < std::make_pair(txid, iid)) - { - maximum = std::make_pair(txid, iid); - } - return true; - }); + gossip_handle->foreach( + [&maximum](const auto& iid, const auto& txid) { + if ( + !maximum.has_value() || + maximum.value() < std::make_pair(txid, iid)) + { + maximum = std::make_pair(txid, iid); + } + return true; + }); auto* chosen_replica = tx.rw(network.self_healing_open_chosen_replica); From 3ecfd7e4e22a3fc8598355f714704d5384072d16 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 15:24:09 +0100 Subject: [PATCH 114/197] Refactor sho out of recovery config --- src/host/configuration.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/host/configuration.h b/src/host/configuration.h index 100d01beccf2..9b002988acea 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -112,20 +112,21 @@ namespace host }; Join join = {}; + struct SelfHealingOpen + { + std::vector addresses{}; + ccf::ds::TimeString retry_timeout = {"100ms"}; + ccf::ds::TimeString timeout = {"2000ms"}; + bool operator==(const SelfHealingOpen&) const = default; + }; + struct Recover { size_t initial_service_certificate_validity_days = 1; std::string previous_service_identity_file; std::optional previous_sealed_ledger_secret_location = std::nullopt; - std::optional> self_healing_open_addresses = - std::nullopt; - ccf::ds::TimeString self_healing_open_retry_timeout = {"100ms"}; - ccf::ds::TimeString self_healing_open_timeout = {"2000ms"}; - std::string self_healing_open_join_config_file = - "self_healing_open_join_config.json"; - std::string self_healing_open_join_service_identity_file = - "self_healing_open_join_service_identity.pem"; + std::optional self_healing_open = std::nullopt; bool operator==(const Recover&) const 
= default; }; Recover recover = {}; @@ -170,6 +171,13 @@ namespace host follow_redirect, fetch_recent_snapshot); + DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS( + CCHostConfig::Command::SelfHealingOpen); + DECLARE_JSON_REQUIRED_FIELDS( + CCHostConfig::Command::SelfHealingOpen, addresses); + DECLARE_JSON_OPTIONAL_FIELDS( + CCHostConfig::Command::SelfHealingOpen, retry_timeout, timeout); + DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCHostConfig::Command::Recover); DECLARE_JSON_REQUIRED_FIELDS(CCHostConfig::Command::Recover); DECLARE_JSON_OPTIONAL_FIELDS( @@ -177,9 +185,7 @@ namespace host initial_service_certificate_validity_days, previous_service_identity_file, previous_sealed_ledger_secret_location, - self_healing_open_addresses, - self_healing_open_retry_timeout, - self_healing_open_timeout); + self_healing_open); DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCHostConfig::Command); DECLARE_JSON_REQUIRED_FIELDS(CCHostConfig::Command, type); From 8384d1ae74baddbddaa1330e597cdc315cd9e6cf Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 15:24:20 +0100 Subject: [PATCH 115/197] Fixup curl calls --- src/node/node_state.h | 6 +++--- src/node/self_healing_open.h | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index a55a36f0967e..8d96c91e9fb3 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2139,16 +2139,16 @@ namespace ccf curl::UniqueSlist headers; headers.append("Content-Type: application/json"); - // This is simpler than going via the internal handlers... 
auto curl_request = std::make_unique( std::move(curl_handle), HTTP_PUT, std::move(url), std::move(headers), nullptr, + nullptr, std::nullopt); - curl::CurlmLibuvContextSingleton::get_instance().attach_request( - curl_request); + curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + std::move(curl_request)); auto delay = msg->data.self.config.recover.self_healing_open_timeout; ::threading::ThreadMessaging::instance().add_task_after( diff --git a/src/node/self_healing_open.h b/src/node/self_healing_open.h index 166d50915c5c..abe3825f880b 100644 --- a/src/node/self_healing_open.h +++ b/src/node/self_healing_open.h @@ -107,6 +107,7 @@ namespace ccf::self_healing_open std::move(url), std::move(headers), std::move(body), + nullptr, std::move(response_callback)); LOG_TRACE_FMT( @@ -115,8 +116,8 @@ namespace ccf::self_healing_open curl_request->get_url(), request.dump()); - curl::CurlmLibuvContextSingleton::get_instance().attach_request( - curl_request); + curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + std::move(curl_request)); } } \ No newline at end of file From 5a9fa04097e350d817e0501157098c2cceb63e69 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 15:33:03 +0100 Subject: [PATCH 116/197] Just stop when recv iamopen --- src/enclave/interface.h | 5 ++--- src/host/handle_ring_buffer.h | 9 +++------ src/host/self_healing_open.h | 15 ++------------- src/node/node_state.h | 6 +----- 4 files changed, 8 insertions(+), 27 deletions(-) diff --git a/src/enclave/interface.h b/src/enclave/interface.h index 231e7c6def84..324da11c4510 100644 --- a/src/enclave/interface.h +++ b/src/enclave/interface.h @@ -30,7 +30,7 @@ enum AdminMessage : ringbuffer::Message DEFINE_RINGBUFFER_MSG_TYPE(work_stats), /// Notify the host that it should restart in join - DEFINE_RINGBUFFER_MSG_TYPE(restart_and_join) + DEFINE_RINGBUFFER_MSG_TYPE(restart) }; DECLARE_RINGBUFFER_MESSAGE_PAYLOAD(AdminMessage::fatal_error_msg, std::string); @@ -39,8 +39,7 @@ 
DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::stop_notice); DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::stopped); DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::tick); DECLARE_RINGBUFFER_MESSAGE_PAYLOAD(AdminMessage::work_stats, std::string); -DECLARE_RINGBUFFER_MESSAGE_PAYLOAD( - AdminMessage::restart_and_join, std::string, std::string); +DECLARE_RINGBUFFER_MESSAGE_NO_PAYLOAD(AdminMessage::restart); /// Messages sent from app endpoints enum AppMessage : ringbuffer::Message diff --git a/src/host/handle_ring_buffer.h b/src/host/handle_ring_buffer.h index c6cb03d4f214..d49b6dee95ae 100644 --- a/src/host/handle_ring_buffer.h +++ b/src/host/handle_ring_buffer.h @@ -58,13 +58,10 @@ namespace asynchost DISPATCHER_SET_MESSAGE_HANDLER( bp, - AdminMessage::restart_and_join, - [&](const uint8_t* data, size_t size) { - auto [url, service_identity] = - ringbuffer::read_message( - data, size); + AdminMessage::restart, + [&](const uint8_t*, size_t) { ccf::SelfHealingOpenSingleton::instance() - ->trigger_restart_and_join_url(url, service_identity); + ->trigger_restart(); }); } diff --git a/src/host/self_healing_open.h b/src/host/self_healing_open.h index 8835f018ec6e..84f31d61f6e4 100644 --- a/src/host/self_healing_open.h +++ b/src/host/self_healing_open.h @@ -11,28 +11,17 @@ #include namespace ccf { - struct SelfHealingOpenJoinInfo - { - std::string url; - std::string service_identity; - }; - class SelfHealingOpen { public: ringbuffer::WriterPtr to_enclave; - std::optional join_info; SelfHealingOpen(ringbuffer::AbstractWriterFactory& writer_factory) : - to_enclave(writer_factory.create_writer_to_inside()), - join_info(std::nullopt) + to_enclave(writer_factory.create_writer_to_inside()) {} - void trigger_restart_and_join_url( - const std::string& url, const std::string& service_identity) + void trigger_restart() { - join_info = SelfHealingOpenJoinInfo{ - .url = url, .service_identity = service_identity}; RINGBUFFER_WRITE_MESSAGE(AdminMessage::stop, to_enclave); 
} }; diff --git a/src/node/node_state.h b/src/node/node_state.h index 8d96c91e9fb3..3020935a95d5 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2291,11 +2291,7 @@ namespace ccf node_config->published_network_address, node_config->service_identity); - RINGBUFFER_WRITE_MESSAGE( - AdminMessage::restart_and_join, - to_host, - node_config->published_network_address, - node_config->service_identity); + RINGBUFFER_WRITE_MESSAGE(AdminMessage::restart, to_host); } case SelfHealingOpenSM::OPENING: { From efe59bb1cc0e4e2b148c59739860172c86b5c9ef Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 15:39:21 +0100 Subject: [PATCH 117/197] refactor config --- include/ccf/node/startup_config.h | 13 +++++++++---- src/common/configuration.h | 4 +--- src/host/configuration.h | 10 +--------- src/host/run.cpp | 7 +------ src/node/node_state.h | 24 ++++++++++-------------- 5 files changed, 22 insertions(+), 36 deletions(-) diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h index 29199bfb433b..9ce5b9a31384 100644 --- a/include/ccf/node/startup_config.h +++ b/include/ccf/node/startup_config.h @@ -102,6 +102,14 @@ namespace ccf Snapshots snapshots = {}; }; + struct SelfHealingOpen + { + std::vector addresses; + ccf::ds::TimeString retry_timeout = {"100ms"}; + ccf::ds::TimeString timeout = {"2000ms"}; + bool operator==(const SelfHealingOpen&) const = default; + }; + struct StartupConfig : CCFConfig { StartupConfig() = default; @@ -146,10 +154,7 @@ namespace ccf std::nullopt; std::optional previous_sealed_ledger_secret_location = std::nullopt; - std::optional> - self_healing_open_addresses = std::nullopt; - ccf::ds::TimeString self_healing_open_retry_timeout = {"100ms"}; - ccf::ds::TimeString self_healing_open_timeout = {"2000ms"}; + std::optional self_healing_open = std::nullopt; }; Recover recover = {}; }; diff --git a/src/common/configuration.h b/src/common/configuration.h index f3d516383a49..89c275807009 100644 --- 
a/src/common/configuration.h +++ b/src/common/configuration.h @@ -131,9 +131,7 @@ namespace ccf DECLARE_JSON_OPTIONAL_FIELDS( StartupConfig::Recover, previous_sealed_ledger_secret_location, - self_healing_open_addresses, - self_healing_open_retry_timeout, - self_healing_open_timeout); + self_healing_open); DECLARE_JSON_TYPE_WITH_BASE(StartupConfig, CCFConfig); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/host/configuration.h b/src/host/configuration.h index 9b002988acea..d5bb5f4f7889 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -112,21 +112,13 @@ namespace host }; Join join = {}; - struct SelfHealingOpen - { - std::vector addresses{}; - ccf::ds::TimeString retry_timeout = {"100ms"}; - ccf::ds::TimeString timeout = {"2000ms"}; - bool operator==(const SelfHealingOpen&) const = default; - }; - struct Recover { size_t initial_service_certificate_validity_days = 1; std::string previous_service_identity_file; std::optional previous_sealed_ledger_secret_location = std::nullopt; - std::optional self_healing_open = std::nullopt; + std::optional self_healing_open = std::nullopt; bool operator==(const Recover&) const = default; }; Recover recover = {}; diff --git a/src/host/run.cpp b/src/host/run.cpp index 68e15bd98fdc..9e37527285ac 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -830,12 +830,7 @@ namespace ccf startup_config.recover.previous_sealed_ledger_secret_location = config.command.recover.previous_sealed_ledger_secret_location; } - startup_config.recover.self_healing_open_addresses = - config.command.recover.self_healing_open_addresses; - startup_config.recover.self_healing_open_retry_timeout = - config.command.recover.self_healing_open_retry_timeout; - startup_config.recover.self_healing_open_timeout = - config.command.recover.self_healing_open_timeout; + startup_config.recover.self_healing_open = config.command.recover.self_healing_open; } else { diff --git a/src/node/node_state.h b/src/node/node_state.h index 
3020935a95d5..d4a1f875940a 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -2000,8 +2000,7 @@ namespace ccf void self_healing_open_try_start_timers( ccf::kv::Tx& tx, bool recovering) override { - if ( - !recovering || !config.recover.self_healing_open_addresses.has_value()) + if (!recovering || !config.recover.self_healing_open.has_value()) { LOG_TRACE_FMT( "Not recovering, or no self-healing-open addresses configured, " @@ -2079,7 +2078,7 @@ namespace ccf } auto delay = - msg->data.self.config.recover.self_healing_open_retry_timeout; + msg->data.self.config.recover.self_healing_open->retry_timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, @@ -2150,13 +2149,13 @@ namespace ccf curl::CurlmLibuvContextSingleton::get_instance()->attach_request( std::move(curl_request)); - auto delay = msg->data.self.config.recover.self_healing_open_timeout; + auto delay = msg->data.self.config.recover.self_healing_open->timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, *this); ::threading::ThreadMessaging::instance().add_task_after( - std::move(timeout_msg), config.recover.self_healing_open_timeout); + std::move(timeout_msg), config.recover.self_healing_open->timeout); } void self_healing_open_advance(ccf::kv::Tx& tx, bool timeout) override @@ -2204,7 +2203,7 @@ namespace ccf auto* gossip_handle = tx.ro(network.self_healing_open_gossip); if ( gossip_handle->size() == - config.recover.self_healing_open_addresses.value().size() || + config.recover.self_healing_open->addresses.size() || valid_timeout) { if (gossip_handle->size() == 0) @@ -2237,8 +2236,7 @@ namespace ccf auto* votes = tx.rw(network.self_healing_open_votes); if ( votes->size() >= - config.recover.self_healing_open_addresses.value().size() / 2 + - 1 || + config.recover.self_healing_open->addresses.size() / 2 + 1 || valid_timeout) { if (votes->size() == 0) @@ -3264,7 +3262,7 @@ namespace ccf { // Caller must ensure that 
the current node's quote_info is populated: // ie not yet reached partOfNetwork - if (!config.recover.self_healing_open_addresses.has_value()) + if (!config.recover.self_healing_open.has_value()) { LOG_TRACE_FMT( "Self-healing-open addresses not set, cannot start gossip retries"); @@ -3280,8 +3278,7 @@ namespace ccf .txid = network.tables->current_version(), }; - for (auto& target_address : - config.recover.self_healing_open_addresses.value()) + for (auto& target_address : config.recover.self_healing_open->addresses) { self_healing_open::dispatch_authenticated_message( std::move(request), @@ -3316,7 +3313,7 @@ namespace ccf { // Caller must ensure that the current node's quote_info is populated: // ie not yet reached partOfNetwork - if (!config.recover.self_healing_open_addresses.has_value()) + if (!config.recover.self_healing_open.has_value()) { LOG_TRACE_FMT( "Self-healing-open addresses not set, cannot send iamopen"); @@ -3328,8 +3325,7 @@ namespace ccf self_healing_open::IAmOpenRequest request{ .info = self_healing_open_node_info()}; - for (auto& target_address : - config.recover.self_healing_open_addresses.value()) + for (auto& target_address : config.recover.self_healing_open->addresses) { if ( target_address == From 64779bdd6d7dcc13fca98c6065b580eac13a0714 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 15:53:25 +0100 Subject: [PATCH 118/197] Make build --- src/common/configuration.h | 7 +++++++ src/host/configuration.h | 7 ------- src/host/handle_ring_buffer.h | 2 +- src/host/run.cpp | 2 +- src/host/self_healing_open.h | 14 +++++++------- src/node/rpc/node_frontend.h | 8 ++++---- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/common/configuration.h b/src/common/configuration.h index 89c275807009..25515a203360 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -113,6 +113,13 @@ namespace ccf node_to_node_message_limit, historical_cache_soft_limit); + DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS( + 
SelfHealingOpen); + DECLARE_JSON_REQUIRED_FIELDS( + SelfHealingOpen, addresses); + DECLARE_JSON_OPTIONAL_FIELDS( + SelfHealingOpen, retry_timeout, timeout); + DECLARE_JSON_TYPE(StartupConfig::Start); DECLARE_JSON_REQUIRED_FIELDS( StartupConfig::Start, members, constitution, service_configuration); diff --git a/src/host/configuration.h b/src/host/configuration.h index d5bb5f4f7889..29f47a322c1d 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -163,13 +163,6 @@ namespace host follow_redirect, fetch_recent_snapshot); - DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS( - CCHostConfig::Command::SelfHealingOpen); - DECLARE_JSON_REQUIRED_FIELDS( - CCHostConfig::Command::SelfHealingOpen, addresses); - DECLARE_JSON_OPTIONAL_FIELDS( - CCHostConfig::Command::SelfHealingOpen, retry_timeout, timeout); - DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(CCHostConfig::Command::Recover); DECLARE_JSON_REQUIRED_FIELDS(CCHostConfig::Command::Recover); DECLARE_JSON_OPTIONAL_FIELDS( diff --git a/src/host/handle_ring_buffer.h b/src/host/handle_ring_buffer.h index d49b6dee95ae..88bc4eb78f8e 100644 --- a/src/host/handle_ring_buffer.h +++ b/src/host/handle_ring_buffer.h @@ -60,7 +60,7 @@ namespace asynchost bp, AdminMessage::restart, [&](const uint8_t*, size_t) { - ccf::SelfHealingOpenSingleton::instance() + ccf::SelfHealingOpenRBHandlerSingleton::instance() ->trigger_restart(); }); } diff --git a/src/host/run.cpp b/src/host/run.cpp index 9e37527285ac..940f7dfe1e57 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -525,7 +525,7 @@ namespace ccf auto curl_libuv_context = curl::CurlmLibuvContextSingleton(uv_default_loop()); - ccf::SelfHealingOpenSingleton::initialise(writer_factory); + ccf::SelfHealingOpenRBHandlerSingleton::initialise(writer_factory); ResolvedAddresses resolved_rpc_addresses; for (auto& [name, interface] : config.network.rpc_interfaces) diff --git a/src/host/self_healing_open.h b/src/host/self_healing_open.h index 84f31d61f6e4..408079772f03 100644 --- 
a/src/host/self_healing_open.h +++ b/src/host/self_healing_open.h @@ -11,12 +11,12 @@ #include namespace ccf { - class SelfHealingOpen + class SelfHealingOpenRBHandler { public: ringbuffer::WriterPtr to_enclave; - SelfHealingOpen(ringbuffer::AbstractWriterFactory& writer_factory) : + SelfHealingOpenRBHandler(ringbuffer::AbstractWriterFactory& writer_factory) : to_enclave(writer_factory.create_writer_to_inside()) {} @@ -26,17 +26,17 @@ namespace ccf } }; - class SelfHealingOpenSingleton + class SelfHealingOpenRBHandlerSingleton { private: - static std::unique_ptr& instance_unsafe() + static std::unique_ptr& instance_unsafe() { - static std::unique_ptr instance = nullptr; + static std::unique_ptr instance = nullptr; return instance; } public: - static std::unique_ptr& instance() + static std::unique_ptr& instance() { auto& instance = instance_unsafe(); if (instance == nullptr) @@ -55,7 +55,7 @@ namespace ccf throw std::logic_error( "SelfHealingOpenSingleton instance already initialized"); } - instance = std::make_unique(writer_factory); + instance = std::make_unique(writer_factory); } }; } \ No newline at end of file diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index f16fa600f2a3..74d94a21c9f2 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -2296,7 +2296,7 @@ namespace ccf "NodeConfigurationSubsystem is not available"); } if (!config->get() - .node_config.recover.self_healing_open_addresses.has_value()) + .node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, @@ -2375,7 +2375,7 @@ namespace ccf "NodeConfigurationSubsystem is not available"); } if (!config->get() - .node_config.recover.self_healing_open_addresses.has_value()) + .node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, @@ -2437,7 +2437,7 @@ namespace ccf "NodeConfigurationSubsystem is not available"); } if (!config->get() - 
.node_config.recover.self_healing_open_addresses.has_value()) + .node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, @@ -2505,7 +2505,7 @@ namespace ccf "NodeConfigurationSubsystem is not available"); } if (!config->get() - .node_config.recover.self_healing_open_addresses.has_value()) + .node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, From ef70b52f8f84d04bc4d5e9909a93d0b25839d9ce Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 15:55:01 +0100 Subject: [PATCH 119/197] refmt --- src/common/configuration.h | 9 +-- src/host/handle_ring_buffer.h | 9 +-- src/host/run.cpp | 3 +- src/host/self_healing_open.h | 3 +- src/node/rpc/node_frontend.h | 12 ++-- tests/e2e_operations.py | 68 ++++++++++-------- tests/infra/clients.py | 6 +- tests/infra/network.py | 127 ++++++++++++++++++++-------------- tests/infra/node.py | 27 ++++---- 9 files changed, 143 insertions(+), 121 deletions(-) diff --git a/src/common/configuration.h b/src/common/configuration.h index 25515a203360..f592a4097fc6 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -113,12 +113,9 @@ namespace ccf node_to_node_message_limit, historical_cache_soft_limit); - DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS( - SelfHealingOpen); - DECLARE_JSON_REQUIRED_FIELDS( - SelfHealingOpen, addresses); - DECLARE_JSON_OPTIONAL_FIELDS( - SelfHealingOpen, retry_timeout, timeout); + DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(SelfHealingOpen); + DECLARE_JSON_REQUIRED_FIELDS(SelfHealingOpen, addresses); + DECLARE_JSON_OPTIONAL_FIELDS(SelfHealingOpen, retry_timeout, timeout); DECLARE_JSON_TYPE(StartupConfig::Start); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/host/handle_ring_buffer.h b/src/host/handle_ring_buffer.h index 88bc4eb78f8e..8aee28cc1cba 100644 --- a/src/host/handle_ring_buffer.h +++ b/src/host/handle_ring_buffer.h @@ -4,9 +4,9 @@ #include "../ds/files.h" #include "../enclave/interface.h" +#include 
"ds/internal_logger.h" #include "ds/non_blocking.h" #include "self_healing_open.h" -#include "ds/internal_logger.h" #include "timer.h" #include @@ -57,11 +57,8 @@ namespace asynchost }); DISPATCHER_SET_MESSAGE_HANDLER( - bp, - AdminMessage::restart, - [&](const uint8_t*, size_t) { - ccf::SelfHealingOpenRBHandlerSingleton::instance() - ->trigger_restart(); + bp, AdminMessage::restart, [&](const uint8_t*, size_t) { + ccf::SelfHealingOpenRBHandlerSingleton::instance()->trigger_restart(); }); } diff --git a/src/host/run.cpp b/src/host/run.cpp index 940f7dfe1e57..2496e70a66c9 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -830,7 +830,8 @@ namespace ccf startup_config.recover.previous_sealed_ledger_secret_location = config.command.recover.previous_sealed_ledger_secret_location; } - startup_config.recover.self_healing_open = config.command.recover.self_healing_open; + startup_config.recover.self_healing_open = + config.command.recover.self_healing_open; } else { diff --git a/src/host/self_healing_open.h b/src/host/self_healing_open.h index 408079772f03..4ae3d41db0d0 100644 --- a/src/host/self_healing_open.h +++ b/src/host/self_healing_open.h @@ -16,7 +16,8 @@ namespace ccf public: ringbuffer::WriterPtr to_enclave; - SelfHealingOpenRBHandler(ringbuffer::AbstractWriterFactory& writer_factory) : + SelfHealingOpenRBHandler( + ringbuffer::AbstractWriterFactory& writer_factory) : to_enclave(writer_factory.create_writer_to_inside()) {} diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 74d94a21c9f2..56e19b4b8e2a 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -2295,8 +2295,7 @@ namespace ccf ccf::errors::InternalError, "NodeConfigurationSubsystem is not available"); } - if (!config->get() - .node_config.recover.self_healing_open.has_value()) + if (!config->get().node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, @@ -2374,8 +2373,7 @@ namespace ccf 
ccf::errors::InternalError, "NodeConfigurationSubsystem is not available"); } - if (!config->get() - .node_config.recover.self_healing_open.has_value()) + if (!config->get().node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, @@ -2436,8 +2434,7 @@ namespace ccf ccf::errors::InternalError, "NodeConfigurationSubsystem is not available"); } - if (!config->get() - .node_config.recover.self_healing_open.has_value()) + if (!config->get().node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, @@ -2504,8 +2501,7 @@ namespace ccf ccf::errors::InternalError, "NodeConfigurationSubsystem is not available"); } - if (!config->get() - .node_config.recover.self_healing_open.has_value()) + if (!config->get().node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 08a3a67cf80a..0c44b44d5cbe 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1429,6 +1429,7 @@ def run(self, src_dir, dst_dir): recovery_network.stop_all_nodes() prev_network = recovery_network + def run_self_healing_open(args): args.nodes = infra.e2e_args.min_nodes(args, f=1) with infra.network.network( @@ -1438,7 +1439,6 @@ def run_self_healing_open(args): ) as network: LOG.info("Start a network and stop it") network.start_and_open(args) - old_common = infra.network.get_common_folder_name(args.workspace, args.label) network.save_service_identity(args) network.stop_all_nodes() @@ -1465,47 +1465,51 @@ def run_self_healing_open(args): ) def cycle(items): - while True: - for item in items: - yield item + while True: + for item in items: + yield item # Wait for any node to be waiting for RecoveryShares, ie it opened for node in cycle(recovered_network.nodes): - try: - recovered_network.wait_for_statuses( - node, - ["WaitingForRecoveryShares", "Open"], - timeout=1, - verify_ca=False - ) - break - except 
TimeoutError: - LOG.info(f"Failed to get the status of {node.local_node_id}, retrying...") - continue + try: + recovered_network.wait_for_statuses( + node, + ["WaitingForRecoveryShares", "Open"], + timeout=1, + verify_ca=False, + ) + break + except TimeoutError: + LOG.info( + f"Failed to get the status of {node.local_node_id}, retrying..." + ) + continue # Refresh the the declared state of nodes which have shut themselves down to join. for node in recovered_network.nodes: - node.refresh_network_state(verify_ca=False) + node.refresh_network_state(verify_ca=False) recovered_network.refresh_service_identity_file(recovery_args) - recovered_network.consortium.recover_with_shares(recovered_network.find_random_node()) + recovered_network.consortium.recover_with_shares( + recovered_network.find_random_node() + ) LOG.info("Submitted recovery shares") # Wait for all live replicas to report being part of the opened network successfully_opened = 0 for node in recovered_network.get_joined_nodes(): - try: - recovered_network.wait_for_status( - node, - "Open", - timeout=10, - ) - recovered_network._wait_for_app_open(node) - successfully_opened += 1 - except TimeoutError as e: - pass + try: + recovered_network.wait_for_status( + node, + "Open", + timeout=10, + ) + recovered_network._wait_for_app_open(node) + successfully_opened += 1 + except TimeoutError: + pass assert successfully_opened == 1 @@ -1513,6 +1517,7 @@ def cycle(items): recovered_network.stop_all_nodes() + def run_self_healing_open_single_replica(args): args.nodes = infra.e2e_args.min_nodes(args, f=1) with infra.network.network( @@ -1528,8 +1533,8 @@ def run_self_healing_open_single_replica(args): ledger_dirs = {} committed_ledger_dirs = {} for i, node in enumerate(network.nodes): - l, c = node.get_ledger() - ledger_dirs[i] = l + l_dir, c = node.get_ledger() + ledger_dirs[i] = l_dir committed_ledger_dirs[i] = c LOG.info("Start a recovery network and stop it") @@ -1558,7 +1563,9 @@ def 
run_self_healing_open_single_replica(args): ["WaitingForRecoveryShares", "Open"], timeout=30, ) - recovered_network.consortium.recover_with_shares(recovered_network.find_random_node()) + recovered_network.consortium.recover_with_shares( + recovered_network.find_random_node() + ) # Wait for all replicas to report being part of the network for node in recovered_network.nodes[0:1]: @@ -1571,6 +1578,7 @@ def run_self_healing_open_single_replica(args): recovered_network.stop_all_nodes() + def run_read_ledger_on_testdata(args): for testdata_dir in os.scandir(args.historical_testdata): assert testdata_dir.is_dir() diff --git a/tests/infra/clients.py b/tests/infra/clients.py index 4d0c5d256931..d1a3a486e2b7 100644 --- a/tests/infra/clients.py +++ b/tests/infra/clients.py @@ -487,9 +487,9 @@ def __init__( self.cose_signing_auth = cose_signing_auth self.common_headers = common_headers or {} if self.ca: - self.ca_curve = get_curve(self.ca) + self.ca_curve = get_curve(self.ca) else: - self.ca_curve = None + self.ca_curve = None self.protocol = kwargs.get("protocol") if "protocol" in kwargs else "https" self.extra_args = [] if kwargs.get("http2"): @@ -583,7 +583,7 @@ def request( cmd.extend(["--key", self.session_auth.key]) cmd.extend(["--cert", self.session_auth.cert]) if not self.ca and not self.session_auth: - cmd.extend(["-k"]) # Allow insecure connections + cmd.extend(["-k"]) # Allow insecure connections for arg in self.extra_args: cmd.append(arg) diff --git a/tests/infra/network.py b/tests/infra/network.py index 87cb6525c6d0..c43d7d67fb11 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -490,9 +490,16 @@ def _start_all_nodes( "read_only_ledger_dirs": read_only_ledger_dirs, "snapshots_dir": snapshots_dir, } - self_healing_open_kwargs = {"self_healing_open_addresses": self_healing_open_addresses} + self_healing_open_kwargs = { + "self_healing_open_addresses": self_healing_open_addresses + } # If a kwarg is passed in override automatically set variants - 
node_kwargs = node_kwargs | self_healing_open_kwargs | forwarded_args_with_overrides | kwargs + node_kwargs = ( + node_kwargs + | self_healing_open_kwargs + | forwarded_args_with_overrides + | kwargs + ) node.recover(**node_kwargs) self.wait_for_state( node, @@ -772,8 +779,8 @@ def start_in_self_healing_open( self, args, ledger_dirs, - committed_ledger_dirs= None, - snapshot_dirs= None, + committed_ledger_dirs=None, + snapshot_dirs=None, common_dir=None, set_authenticate_session=None, start_all_nodes=True, @@ -784,29 +791,34 @@ def start_in_self_healing_open( args.workspace, args.label ) - self.per_node_args_override = self.per_node_args_override or {i: {} for i in range(len(self.nodes))} - committed_ledger_dirs = committed_ledger_dirs or {i: None for i in range(len(self.nodes))} + self.per_node_args_override = self.per_node_args_override or { + i: {} for i in range(len(self.nodes)) + } + committed_ledger_dirs = committed_ledger_dirs or { + i: None for i in range(len(self.nodes)) + } snapshot_dirs = snapshot_dirs or {i: None for i in range(len(self.nodes))} self.per_node_args_override = { - i: - (d | { - "ledger_dir" : ledger_dirs[i], - "read_only_ledger_dirs" : committed_ledger_dirs[i] or [], - "snapshots_dir" : snapshot_dirs[i] or None, - }) + i: ( + d + | { + "ledger_dir": ledger_dirs[i], + "read_only_ledger_dirs": committed_ledger_dirs[i] or [], + "snapshots_dir": snapshot_dirs[i] or None, + } + ) for i, d in self.per_node_args_override.items() } - for i, node in enumerate(self.nodes): node.host.get_primary_interface().port = 5000 + (i + 1) node.host.get_primary_interface().public_port = 5000 + (i + 1) - LOG.info(f"Set up nodes") + LOG.info("Set up nodes") for node in self.nodes: - LOG.info(node.host) + LOG.info(node.host) - self.status = ServiceStatus.RECOVERING + self.status = ServiceStatus.RECOVERING LOG.debug(f"Opening CCF service on {self.hosts}") forwarded_args = { @@ -814,14 +826,12 @@ def start_in_self_healing_open( for arg in 
infra.network.Network.node_args_to_forward } self_healing_open_addresses = [ - node.get_public_rpc_address() for node in self.nodes + node.get_public_rpc_address() for node in self.nodes ] for i, node in enumerate(self.nodes): forwarded_args_with_overrides = forwarded_args.copy() - forwarded_args_with_overrides.update( - self.per_node_args_override.get(i, {}) - ) + forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) if not start_all_nodes and i > 0: break @@ -832,9 +842,16 @@ def start_in_self_healing_open( "label": args.label, "common_dir": self.common_dir, } - self_healing_open_kwargs = {"self_healing_open_addresses": self_healing_open_addresses} + self_healing_open_kwargs = { + "self_healing_open_addresses": self_healing_open_addresses + } # If a kwarg is passed in override automatically set variants - node_kwargs = node_kwargs | self_healing_open_kwargs | forwarded_args_with_overrides | kwargs + node_kwargs = ( + node_kwargs + | self_healing_open_kwargs + | forwarded_args_with_overrides + | kwargs + ) node.recover(**node_kwargs) except Exception: LOG.exception(f"Failed to start node {node.local_node_id}") @@ -844,26 +861,30 @@ def start_in_self_healing_open( self.observed_election_duration = self.election_duration + 1 for i, node in enumerate(self.nodes): - end_time = time.time() + timeout - success = False - while time.time() < end_time: - try: - self.wait_for_states( - node, - [infra.node.State.PART_OF_PUBLIC_NETWORK.value, infra.node.State.PART_OF_NETWORK], - timeout=args.ledger_recovery_timeout, - verify_ca=False, # Certs are volatile until the recovery is complete - ) - success = True - break - except CCFConnectionException: - time.sleep(0.1) - if not success: - raise TimeoutError(f"Failed to get state of node {node.local_node_id} after {timeout} seconds") + end_time = time.time() + timeout + success = False + while time.time() < end_time: + try: + self.wait_for_states( + node, + [ + infra.node.State.PART_OF_PUBLIC_NETWORK.value, + 
infra.node.State.PART_OF_NETWORK, + ], + timeout=args.ledger_recovery_timeout, + verify_ca=False, # Certs are volatile until the recovery is complete + ) + success = True + break + except CCFConnectionException: + time.sleep(0.1) + if not success: + raise TimeoutError( + f"Failed to get state of node {node.local_node_id} after {timeout} seconds" + ) LOG.info("All nodes started") - def recover( self, args, @@ -1347,21 +1368,21 @@ def wait_for_state(self, node, state, timeout=3): def wait_for_statuses(self, node, statuses, timeout=3, **client_kwargs): end_time = time.time() + timeout while time.time() < end_time: - try: - with node.client(connection_timeout=timeout, **client_kwargs) as c: - r = c.get("/node/network").body.json() - if r["service_status"] in statuses: - break - except ConnectionRefusedError: - pass - except CCFConnectionException: - pass - time.sleep(0.1) + try: + with node.client(connection_timeout=timeout, **client_kwargs) as c: + r = c.get("/node/network").body.json() + if r["service_status"] in statuses: + break + except ConnectionRefusedError: + pass + except CCFConnectionException: + pass + time.sleep(0.1) else: - raise TimeoutError( - f"Timed out waiting for a network status in {statuses} on node {node.node_id}" - ) - + raise TimeoutError( + f"Timed out waiting for a network status in {statuses} on node {node.node_id}" + ) + def wait_for_status(self, node, status, timeout=3): self.wait_for_statuses(node, [status], timeout=timeout) diff --git a/tests/infra/node.py b/tests/infra/node.py index dc10e4368f67..a3fd3970deeb 100644 --- a/tests/infra/node.py +++ b/tests/infra/node.py @@ -855,19 +855,20 @@ def wait_for_leadership_state(self, min_view, leadership_states, timeout=3): ) def refresh_network_state(self, **client_kwargs): - try: - with self.client(**client_kwargs) as c: - LOG.info(f"Trying to refresh using {c}") - r = c.get(f"/node/network/nodes/{self.node_id}").body.json() - LOG.info(r) - - if r["status"] == "Pending": - self.network_state = 
NodeNetworkState.started - elif r["status"] == "Trusted": - self.network_state = NodeNetworkState.joined - except Exception as e: - LOG.debug(f"Failed to connect {e}") - self.network_state = NodeNetworkState.stopped + try: + with self.client(**client_kwargs) as c: + LOG.info(f"Trying to refresh using {c}") + r = c.get(f"/node/network/nodes/{self.node_id}").body.json() + LOG.info(r) + + if r["status"] == "Pending": + self.network_state = NodeNetworkState.started + elif r["status"] == "Trusted": + self.network_state = NodeNetworkState.joined + except Exception as e: + LOG.debug(f"Failed to connect {e}") + self.network_state = NodeNetworkState.stopped + @contextmanager def node( From c1a7aed1da1c5c29a5285117ad404b056dc51fb4 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 16:08:36 +0100 Subject: [PATCH 120/197] Get a single test to pass! woop woop --- doc/host_config_schema/cchost_config.json | 37 +++++++++------- tests/config.jinja | 12 +++--- tests/e2e_operations.py | 51 ++++++++++++----------- tests/schema.py | 42 +++++++++---------- 4 files changed, 75 insertions(+), 67 deletions(-) diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index 1a6f044ae463..d116f5658429 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -418,22 +418,27 @@ "type": ["string"], "description": "Path to the sealed ledger secret folder, the ledger secrets for the recovered service will be unsealed from here instead of reconstructed from recovery shares." }, - "self_healing_open_addresses": { - "type": "array", - "items": { - "type": "string" - }, - "description": "List of addresses (host:port) of the cluster that should open via self-healing-open" - }, - "self_healing_open_retry_timeout": { - "type": "string", - "default": "100ms", - "description": "Interval (time string) at which the node re-sends self-healing-open messages. 
This should be leass than 'self_healing_open_timeout'" - }, - "self_healing_open_timeout": { - "type": "string", - "default": "2000ms", - "description": "Interval (time string) after which the node forcibly advances to the next phase of the self-healing-open protocol" + "self_healing_open": { + "type": "object", + "properties": { + "self_healing_open_addresses": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of addresses (host:port) of the cluster that should open via self-healing-open" + }, + "self_healing_open_retry_timeout": { + "type": "string", + "default": "100ms", + "description": "Interval (time string) at which the node re-sends self-healing-open messages. This should be leass than 'self_healing_open_timeout'" + }, + "self_healing_open_timeout": { + "type": "string", + "default": "2000ms", + "description": "Interval (time string) after which the node forcibly advances to the next phase of the self-healing-open protocol" + } + } } }, "required": ["previous_service_identity_file"], diff --git a/tests/config.jinja b/tests/config.jinja index f8a18aa27b59..84b92b500d75 100644 --- a/tests/config.jinja +++ b/tests/config.jinja @@ -54,11 +54,13 @@ "initial_service_certificate_validity_days": {{ initial_service_cert_validity_days }}, "previous_service_identity_file": "{{ previous_service_identity_file }}" {% if previous_sealed_ledger_secret_location %}, "previous_sealed_ledger_secret_location": "{{ previous_sealed_ledger_secret_location }}"{% endif %} {% if self_healing_open_addresses %}, - "self_healing_open_addresses" : [ - {% for address in self_healing_open_addresses %} - "{{ address }}" {% if not loop.last %},{% endif %} - {% endfor %} - ] {% endif %} + "self_healing_open": { + "addresses" : [ + {% for address in self_healing_open_addresses %} + "{{ address }}" {% if not loop.last %},{% endif %} + {% endfor %} + ] + } {% endif %} } }, "ledger": diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 
0c44b44d5cbe..9671ac9ab8f5 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1864,28 +1864,29 @@ def test_error_message_on_failure_to_read_aci_sec_context(args): def run(args): - run_max_uncommitted_tx_count(args) - run_file_operations(args) - run_tls_san_checks(args) - run_config_timeout_check(args) - run_configuration_file_checks(args) - run_pid_file_check(args) - run_preopen_readiness_check(args) - run_sighup_check(args) - run_service_subject_name_check(args) - run_cose_signatures_config_check(args) - run_late_mounted_ledger_check(args) - run_empty_ledger_dir_check(args) - - if infra.platform_detection.is_snp(): - run_initial_uvm_descriptor_checks(args) - run_initial_tcb_version_checks(args) - run_recovery_local_unsealing(args) - run_recovery_local_unsealing(args, rekey=True) - run_recovery_local_unsealing(args, recovery_shares_refresh=True) - run_recovery_local_unsealing(args, recovery_f=1) - run_recovery_unsealing_corrupt(args) - run_recovery_unsealing_validate_audit(args) - test_error_message_on_failure_to_read_aci_sec_context(args) - run_read_ledger_on_testdata(args) - run_ledger_chunk_bytes_check(args) + #run_max_uncommitted_tx_count(args) + #run_file_operations(args) + #run_tls_san_checks(args) + #run_config_timeout_check(args) + #run_configuration_file_checks(args) + #run_pid_file_check(args) + #run_preopen_readiness_check(args) + #run_sighup_check(args) + #run_service_subject_name_check(args) + #run_cose_signatures_config_check(args) + #run_late_mounted_ledger_check(args) + #run_empty_ledger_dir_check(args) + + #if infra.platform_detection.is_snp(): + # run_initial_uvm_descriptor_checks(args) + # run_initial_tcb_version_checks(args) + # run_recovery_local_unsealing(args) + # run_recovery_local_unsealing(args, rekey=True) + # run_recovery_local_unsealing(args, recovery_shares_refresh=True) + # run_recovery_local_unsealing(args, recovery_f=1) + # run_recovery_unsealing_corrupt(args) + # run_recovery_unsealing_validate_audit(args) + # 
test_error_message_on_failure_to_read_aci_sec_context(args) + #run_read_ledger_on_testdata(args) + #run_ledger_chunk_bytes_check(args) + run_self_healing_open(args) diff --git a/tests/schema.py b/tests/schema.py index 8b969c09edf0..1d5740451e47 100644 --- a/tests/schema.py +++ b/tests/schema.py @@ -204,27 +204,27 @@ def add(parser): cr = ConcurrentRunner(add) - cr.add( - "schema", - run, - package="samples/apps/logging/logging", - nodes=infra.e2e_args.nodes(cr.args, 1), - ) - - cr.add( - "nobuiltins", - run_nobuiltins, - package="samples/apps/nobuiltins/nobuiltins", - nodes=infra.e2e_args.min_nodes(cr.args, f=1), - ) - - cr.add( - "tutorial", - e2e_tutorial.run, - package="samples/apps/logging/logging", - nodes=["local://127.0.0.1:8000"], - initial_member_count=1, - ) + #cr.add( + # "schema", + # run, + # package="samples/apps/logging/logging", + # nodes=infra.e2e_args.nodes(cr.args, 1), + #) + + #cr.add( + # "nobuiltins", + # run_nobuiltins, + # package="samples/apps/nobuiltins/nobuiltins", + # nodes=infra.e2e_args.min_nodes(cr.args, f=1), + #) + + #cr.add( + # "tutorial", + # e2e_tutorial.run, + # package="samples/apps/logging/logging", + # nodes=["local://127.0.0.1:8000"], + # initial_member_count=1, + #) cr.add( "operations", From 30344fda639c3f47c4f11520a5b9642b3f2d07ab Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 16:09:13 +0100 Subject: [PATCH 121/197] And do the other tests as well... 
--- tests/e2e_operations.py | 50 ++++++++++++++++++++--------------------- tests/schema.py | 42 +++++++++++++++++----------------- 2 files changed, 46 insertions(+), 46 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 9671ac9ab8f5..c4158b9a4bba 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1864,29 +1864,29 @@ def test_error_message_on_failure_to_read_aci_sec_context(args): def run(args): - #run_max_uncommitted_tx_count(args) - #run_file_operations(args) - #run_tls_san_checks(args) - #run_config_timeout_check(args) - #run_configuration_file_checks(args) - #run_pid_file_check(args) - #run_preopen_readiness_check(args) - #run_sighup_check(args) - #run_service_subject_name_check(args) - #run_cose_signatures_config_check(args) - #run_late_mounted_ledger_check(args) - #run_empty_ledger_dir_check(args) - - #if infra.platform_detection.is_snp(): - # run_initial_uvm_descriptor_checks(args) - # run_initial_tcb_version_checks(args) - # run_recovery_local_unsealing(args) - # run_recovery_local_unsealing(args, rekey=True) - # run_recovery_local_unsealing(args, recovery_shares_refresh=True) - # run_recovery_local_unsealing(args, recovery_f=1) - # run_recovery_unsealing_corrupt(args) - # run_recovery_unsealing_validate_audit(args) - # test_error_message_on_failure_to_read_aci_sec_context(args) - #run_read_ledger_on_testdata(args) - #run_ledger_chunk_bytes_check(args) + run_max_uncommitted_tx_count(args) + run_file_operations(args) + run_tls_san_checks(args) + run_config_timeout_check(args) + run_configuration_file_checks(args) + run_pid_file_check(args) + run_preopen_readiness_check(args) + run_sighup_check(args) + run_service_subject_name_check(args) + run_cose_signatures_config_check(args) + run_late_mounted_ledger_check(args) + run_empty_ledger_dir_check(args) + + if infra.platform_detection.is_snp(): + run_initial_uvm_descriptor_checks(args) + run_initial_tcb_version_checks(args) + run_recovery_local_unsealing(args) + 
run_recovery_local_unsealing(args, rekey=True) + run_recovery_local_unsealing(args, recovery_shares_refresh=True) + run_recovery_local_unsealing(args, recovery_f=1) + run_recovery_unsealing_corrupt(args) + run_recovery_unsealing_validate_audit(args) + test_error_message_on_failure_to_read_aci_sec_context(args) + run_read_ledger_on_testdata(args) + run_ledger_chunk_bytes_check(args) run_self_healing_open(args) diff --git a/tests/schema.py b/tests/schema.py index 1d5740451e47..8b969c09edf0 100644 --- a/tests/schema.py +++ b/tests/schema.py @@ -204,27 +204,27 @@ def add(parser): cr = ConcurrentRunner(add) - #cr.add( - # "schema", - # run, - # package="samples/apps/logging/logging", - # nodes=infra.e2e_args.nodes(cr.args, 1), - #) - - #cr.add( - # "nobuiltins", - # run_nobuiltins, - # package="samples/apps/nobuiltins/nobuiltins", - # nodes=infra.e2e_args.min_nodes(cr.args, f=1), - #) - - #cr.add( - # "tutorial", - # e2e_tutorial.run, - # package="samples/apps/logging/logging", - # nodes=["local://127.0.0.1:8000"], - # initial_member_count=1, - #) + cr.add( + "schema", + run, + package="samples/apps/logging/logging", + nodes=infra.e2e_args.nodes(cr.args, 1), + ) + + cr.add( + "nobuiltins", + run_nobuiltins, + package="samples/apps/nobuiltins/nobuiltins", + nodes=infra.e2e_args.min_nodes(cr.args, f=1), + ) + + cr.add( + "tutorial", + e2e_tutorial.run, + package="samples/apps/logging/logging", + nodes=["local://127.0.0.1:8000"], + initial_member_count=1, + ) cr.add( "operations", From 84960c09c56eea602aae0197bc83e8b521552a77 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 12 Sep 2025 16:11:01 +0100 Subject: [PATCH 122/197] snags --- src/enclave/interface.h | 2 +- tests/e2e_operations.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/enclave/interface.h b/src/enclave/interface.h index 324da11c4510..bc2f58fdc3b8 100644 --- a/src/enclave/interface.h +++ b/src/enclave/interface.h @@ -29,7 +29,7 @@ enum AdminMessage : ringbuffer::Message /// 
Notify the host of work done since last message. Enclave -> Host DEFINE_RINGBUFFER_MSG_TYPE(work_stats), - /// Notify the host that it should restart in join + /// Notify the host that it should restart DEFINE_RINGBUFFER_MSG_TYPE(restart) }; diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index c4158b9a4bba..71b9f4ed1b5b 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1447,8 +1447,8 @@ def run_self_healing_open(args): ledger_dirs = {} committed_ledger_dirs = {} for i, node in enumerate(network.nodes): - l, c = node.get_ledger() - ledger_dirs[i] = l + l_dir, c = node.get_ledger() + ledger_dirs[i] = l_dir committed_ledger_dirs[i] = c LOG.info("Start a recovery network and stop it") From b55ac60ab7a1e1671ec93a859848d9b745d704d0 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 15 Sep 2025 11:30:06 +0100 Subject: [PATCH 123/197] Cleanup --- tla/disaster-recovery/autoopen.fizz | 31 ------------------- .../{autoopen.cfg => selfhealingopen.cfg} | 0 .../{autoopen.tla => selfhealingopen.tla} | 2 +- 3 files changed, 1 insertion(+), 32 deletions(-) delete mode 100644 tla/disaster-recovery/autoopen.fizz rename tla/disaster-recovery/{autoopen.cfg => selfhealingopen.cfg} (100%) rename tla/disaster-recovery/{autoopen.tla => selfhealingopen.tla} (99%) diff --git a/tla/disaster-recovery/autoopen.fizz b/tla/disaster-recovery/autoopen.fizz deleted file mode 100644 index 49705045dda8..000000000000 --- a/tla/disaster-recovery/autoopen.fizz +++ /dev/null @@ -1,31 +0,0 @@ ----- -options: - maxActions: 10 - -deadlock_detection: false ----- -NUM_NODES = 2 - -NextSteps = enum("GOSSIP", "VOTE", "OPENJOIN", "OPEN", "JOIN") - -role Node: - atomic action Init: - self.next_step = NextSteps.GOSSIP - self.recv_gossips = {} - - action Gossip: - if self.next_step == NextSteps.GOSSIP: - self.next_step = NextSteps.VOTE - self.gossip(self.__id__, self.txid) - for n in nodes: - if n.__id__ != self.__id__: - n.gossip(self.__id__, self.txid) - - func gossip(src_id, 
txid): - self.recv_gossips[src_id] = txid - -atomic action Init: - nodes = [] - for i in range(0, NUM_NODES): - node = Node(txid=i) - nodes.append(node) \ No newline at end of file diff --git a/tla/disaster-recovery/autoopen.cfg b/tla/disaster-recovery/selfhealingopen.cfg similarity index 100% rename from tla/disaster-recovery/autoopen.cfg rename to tla/disaster-recovery/selfhealingopen.cfg diff --git a/tla/disaster-recovery/autoopen.tla b/tla/disaster-recovery/selfhealingopen.tla similarity index 99% rename from tla/disaster-recovery/autoopen.tla rename to tla/disaster-recovery/selfhealingopen.tla index e0d709af4087..c68902e6e26c 100644 --- a/tla/disaster-recovery/autoopen.tla +++ b/tla/disaster-recovery/selfhealingopen.tla @@ -1,4 +1,4 @@ ----- MODULE autoopen ---- +---- MODULE selfhealingopen ---- EXTENDS Integers, Sequences, FiniteSets, TLC From e102af927121ee159a65d6f56b6b07a739629186 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 15 Sep 2025 11:34:52 +0100 Subject: [PATCH 124/197] Cleanup --- tla/disaster-recovery/stateright/Cargo.lock | 4 +-- tla/disaster-recovery/stateright/Cargo.toml | 2 +- tla/disaster-recovery/stateright/Readme.md | 2 +- tla/disaster-recovery/stateright/src/main.rs | 36 ++++++++++---------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tla/disaster-recovery/stateright/Cargo.lock b/tla/disaster-recovery/stateright/Cargo.lock index d7f54e1881b4..b339a5f99ee6 100644 --- a/tla/disaster-recovery/stateright/Cargo.lock +++ b/tla/disaster-recovery/stateright/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "ahash" @@ -84,7 +84,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" [[package]] -name = "ccf-autodr" +name = "ccf-selfhealingopen" version = "0.0.0" dependencies = [ "clap", diff --git a/tla/disaster-recovery/stateright/Cargo.toml b/tla/disaster-recovery/stateright/Cargo.toml index 9588f0cc93a2..6769d9a6349f 100644 --- a/tla/disaster-recovery/stateright/Cargo.toml +++ b/tla/disaster-recovery/stateright/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "ccf-autodr" +name = "ccf-selfhealingopen" version = "0.0.0" [dependencies] diff --git a/tla/disaster-recovery/stateright/Readme.md b/tla/disaster-recovery/stateright/Readme.md index 30e19f7addfc..637c987fd78f 100644 --- a/tla/disaster-recovery/stateright/Readme.md +++ b/tla/disaster-recovery/stateright/Readme.md @@ -1,4 +1,4 @@ -# Auto-open specification in [stateright](https://github.com/stateright/stateright) +# Self-healing-open specification in [stateright](https://github.com/stateright/stateright) The properties are specified in [main.rs](./src/main.rs), while the model is specified in [model.rs](./src/model.rs). 
diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/stateright/src/main.rs index 5fac13bf5b74..e6676d08d7c2 100644 --- a/tla/disaster-recovery/stateright/src/main.rs +++ b/tla/disaster-recovery/stateright/src/main.rs @@ -128,6 +128,23 @@ fn invariant_properties(model: ActorModel) -> ActorModel| actor_state.next_step == NextStep::OpenJoin); + let all_votes_delivered = state + .network + .iter_all() + .filter(|msg| matches!(msg.msg, Msg::Vote(_))) + .count() + == 0; + !(all_open_join && all_votes_delivered) + }, + ) .property( stateright::Expectation::Always, "Persist committed txs", @@ -163,23 +180,6 @@ fn reachable_properties(model: ActorModel) -> ActorModel| actor_state.next_step == NextStep::OpenJoin); - let all_votes_delivered = state - .network - .iter_all() - .filter(|msg| matches!(msg.msg, Msg::Vote(_))) - .count() - == 0; - all_open_join && all_votes_delivered - }, - ) .property( stateright::Expectation::Sometimes, "Majority vote still opens without timeout", @@ -196,7 +196,7 @@ fn properties(model: ActorModel) -> ActorModel Date: Wed, 17 Sep 2025 17:42:58 +0100 Subject: [PATCH 125/197] Large refactor to pull out the self_healing_open code from node_state.h --- CMakeLists.txt | 4 + include/ccf/node/startup_config.h | 6 +- src/common/configuration.h | 6 +- src/host/configuration.h | 2 +- src/node/node_state.h | 426 +------------------ src/node/rpc/node_frontend.h | 38 +- src/node/rpc/node_interface.h | 5 +- src/node/rpc/node_operation.h | 11 +- src/node/rpc/node_operation_interface.h | 5 +- src/node/rpc/test/node_stub.h | 11 +- src/node/self_healing_open.h | 123 ------ src/node/self_healing_open_impl.cpp | 525 ++++++++++++++++++++++++ src/node/self_healing_open_impl.h | 40 ++ src/node/self_healing_open_types.h | 56 +++ 14 files changed, 660 insertions(+), 598 deletions(-) delete mode 100644 src/node/self_healing_open.h create mode 100644 src/node/self_healing_open_impl.cpp create mode 100644 src/node/self_healing_open_impl.h 
create mode 100644 src/node/self_healing_open_types.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 8dbfa652ba5f..56dc0488f5aa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -372,6 +372,7 @@ endif() set(CCF_IMPL_SOURCE ${CCF_DIR}/src/enclave/main.cpp ${CCF_DIR}/src/enclave/thread_local.cpp ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp + ${CCF_DIR}/src/node/self_healing_open_impl.cpp ) add_ccf_static_library( @@ -667,6 +668,8 @@ if(BUILD_TESTS) frontend_test ${CMAKE_CURRENT_SOURCE_DIR}/src/node/rpc/test/frontend_test.cpp ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/node/self_healing_open_impl.cpp + ) target_link_libraries( frontend_test PRIVATE ${CMAKE_THREAD_LIBS_INIT} http_parser ccf_js @@ -697,6 +700,7 @@ if(BUILD_TESTS) node_frontend_test ${CMAKE_CURRENT_SOURCE_DIR}/src/node/rpc/test/node_frontend_test.cpp ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/node/self_healing_open_impl.cpp ) target_link_libraries( node_frontend_test PRIVATE ${CMAKE_THREAD_LIBS_INIT} http_parser ccf_js diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h index 9ce5b9a31384..3eed69867024 100644 --- a/include/ccf/node/startup_config.h +++ b/include/ccf/node/startup_config.h @@ -102,12 +102,12 @@ namespace ccf Snapshots snapshots = {}; }; - struct SelfHealingOpen + struct SelfHealingOpenConfig { std::vector addresses; ccf::ds::TimeString retry_timeout = {"100ms"}; ccf::ds::TimeString timeout = {"2000ms"}; - bool operator==(const SelfHealingOpen&) const = default; + bool operator==(const SelfHealingOpenConfig&) const = default; }; struct StartupConfig : CCFConfig @@ -154,7 +154,7 @@ namespace ccf std::nullopt; std::optional previous_sealed_ledger_secret_location = std::nullopt; - std::optional self_healing_open = std::nullopt; + std::optional self_healing_open = std::nullopt; }; Recover recover = {}; }; diff 
--git a/src/common/configuration.h b/src/common/configuration.h index f592a4097fc6..271f4183ecf8 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -113,9 +113,9 @@ namespace ccf node_to_node_message_limit, historical_cache_soft_limit); - DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(SelfHealingOpen); - DECLARE_JSON_REQUIRED_FIELDS(SelfHealingOpen, addresses); - DECLARE_JSON_OPTIONAL_FIELDS(SelfHealingOpen, retry_timeout, timeout); + DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(SelfHealingOpenConfig); + DECLARE_JSON_REQUIRED_FIELDS(SelfHealingOpenConfig, addresses); + DECLARE_JSON_OPTIONAL_FIELDS(SelfHealingOpenConfig, retry_timeout, timeout); DECLARE_JSON_TYPE(StartupConfig::Start); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/host/configuration.h b/src/host/configuration.h index 29f47a322c1d..4499e79627d3 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -118,7 +118,7 @@ namespace host std::string previous_service_identity_file; std::optional previous_sealed_ledger_secret_location = std::nullopt; - std::optional self_healing_open = std::nullopt; + std::optional self_healing_open = std::nullopt; bool operator==(const Recover&) const = default; }; Recover recover = {}; diff --git a/src/node/node_state.h b/src/node/node_state.h index d4a1f875940a..7e00a11a7d1d 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -50,7 +50,6 @@ #include "node/ledger_secrets.h" #include "node/local_sealing.h" #include "node/node_to_node_channel_manager.h" -#include "node/self_healing_open.h" #include "node/snapshotter.h" #include "node_to_node.h" #include "pal/quote_generation.h" @@ -96,6 +95,8 @@ namespace ccf class NodeState : public AbstractNodeState { + friend class SelfHealingOpenService; + private: // // this node's core state @@ -243,6 +244,8 @@ namespace ccf last_recovered_signed_idx = last_recovered_idx; } + std::unique_ptr self_healing_open_impl; + public: NodeState( ringbuffer::AbstractWriterFactory& writer_factory, @@ -258,7 
+261,8 @@ namespace ccf to_host(writer_factory.create_writer_to_outside()), network(network), rpcsessions(rpcsessions), - share_manager(network.ledger_secrets) + share_manager(network.ledger_secrets), + self_healing_open_impl(std::make_unique(this)) {} QuoteVerificationResult verify_quote( @@ -1997,319 +2001,6 @@ namespace ccf return history->get_cose_signatures_config(); } - void self_healing_open_try_start_timers( - ccf::kv::Tx& tx, bool recovering) override - { - if (!recovering || !config.recover.self_healing_open.has_value()) - { - LOG_TRACE_FMT( - "Not recovering, or no self-healing-open addresses configured, " - "not starting self-healing-open timers"); - return; - } - - auto* state_handle = tx.rw(network.self_healing_open_sm_state); - state_handle->put(SelfHealingOpenSM::GOSSIPPING); - auto* timeout_state_handle = - tx.rw(network.self_healing_open_timeout_sm_state); - timeout_state_handle->put(SelfHealingOpenSM::GOSSIPPING); - - auto retry_timer_msg = std::make_unique<::threading::Tmsg>( - [](std::unique_ptr<::threading::Tmsg> msg) { - std::lock_guard guard(msg->data.self.lock); - - auto tx = msg->data.self.network.tables->create_read_only_tx(); - auto* sm_state_handle = - tx.ro(msg->data.self.network.self_healing_open_sm_state); - if (!sm_state_handle->get().has_value()) - { - throw std::logic_error( - "Self-healing-open state not set, cannot retry " - "self-healing-open"); - } - auto sm_state = sm_state_handle->get().value(); - - // Keep doing this until the node is no longer in recovery - if (sm_state == SelfHealingOpenSM::OPEN) - { - LOG_INFO_FMT("Self-healing-open complete, stopping timers."); - return; - } - - switch (sm_state) - { - case SelfHealingOpenSM::GOSSIPPING: - msg->data.self.self_healing_open_gossip_unsafe(); - break; - case SelfHealingOpenSM::VOTING: - { - auto* node_info_handle = - tx.ro(msg->data.self.network.self_healing_open_node_info); - auto* chosen_replica_handle = - tx.ro(msg->data.self.network.self_healing_open_chosen_replica); - 
if (!chosen_replica_handle->get().has_value()) - { - throw std::logic_error( - "Self-healing-open chosen node not set, cannot vote"); - } - auto chosen_node_info = - node_info_handle->get(chosen_replica_handle->get().value()); - if (!chosen_node_info.has_value()) - { - throw std::logic_error(fmt::format( - "Self-healing-open chosen node {} not found", - chosen_replica_handle->get().value())); - } - msg->data.self.self_healing_open_vote_unsafe( - chosen_node_info.value()); - // keep gossiping to allow lagging nodes to eventually vote - msg->data.self.self_healing_open_gossip_unsafe(); - break; - } - case SelfHealingOpenSM::OPENING: - msg->data.self.self_healing_open_iamopen_unsafe(); - break; - case SelfHealingOpenSM::JOINING: - return; - default: - throw std::logic_error(fmt::format( - "Unknown self-healing-open state: {}", - static_cast(sm_state))); - } - - auto delay = - msg->data.self.config.recover.self_healing_open->retry_timeout; - ::threading::ThreadMessaging::instance().add_task_after( - std::move(msg), delay); - }, - *this); - // kick this off asynchronously as this can be called from a curl callback - ::threading::ThreadMessaging::instance().add_task( - threading::get_current_thread_id(), std::move(retry_timer_msg)); - - // Dispatch timeouts - auto timeout_msg = std::make_unique<::threading::Tmsg>( - [](std::unique_ptr<::threading::Tmsg> msg) { - std::lock_guard guard(msg->data.self.lock); - LOG_TRACE_FMT( - "Self-healing-open timeout, sending timeout to internal handlers"); - - // Stop the timer if the node has completed its self-healing-open - auto tx = msg->data.self.network.tables->create_read_only_tx(); - auto* sm_state_handle = - tx.ro(msg->data.self.network.self_healing_open_sm_state); - if (!sm_state_handle->get().has_value()) - { - throw std::logic_error( - "Self-healing-open state not set, cannot retry " - "self-healing-open"); - } - auto sm_state = sm_state_handle->get().value(); - if (sm_state == SelfHealingOpenSM::OPEN) - { - 
LOG_INFO_FMT("Self-healing-open complete, stopping timers."); - return; - } - - // Send a timeout to the internal handlers - curl::UniqueCURL curl_handle; - - auto cert = msg->data.self.self_signed_node_cert; - curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); - curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); - curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); - - curl_handle.set_blob_opt( - CURLOPT_SSLCERT_BLOB, cert.data(), cert.size()); - curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); - - auto privkey_pem = msg->data.self.node_sign_kp->private_key_pem(); - curl_handle.set_blob_opt( - CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); - curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); - - auto url = fmt::format( - "https://{}/{}/self_healing_open/timeout", - msg->data.self.config.network.rpc_interfaces - .at("primary_rpc_interface") - .published_address, - get_actor_prefix(ActorsType::nodes)); - - curl::UniqueSlist headers; - headers.append("Content-Type: application/json"); - - auto curl_request = std::make_unique( - std::move(curl_handle), - HTTP_PUT, - std::move(url), - std::move(headers), - nullptr, - nullptr, - std::nullopt); - curl::CurlmLibuvContextSingleton::get_instance()->attach_request( - std::move(curl_request)); - - auto delay = msg->data.self.config.recover.self_healing_open->timeout; - ::threading::ThreadMessaging::instance().add_task_after( - std::move(msg), delay); - }, - *this); - ::threading::ThreadMessaging::instance().add_task_after( - std::move(timeout_msg), config.recover.self_healing_open->timeout); - } - - void self_healing_open_advance(ccf::kv::Tx& tx, bool timeout) override - { - auto* sm_state_handle = tx.rw(network.self_healing_open_sm_state); - auto* timeout_state_handle = - tx.rw(network.self_healing_open_timeout_sm_state); - if ( - !sm_state_handle->get().has_value() || - !timeout_state_handle->get().has_value()) - { - throw std::logic_error( - "Self-healing-open state not set, cannot advance self-healing-open"); - } - 
- bool valid_timeout = timeout && - timeout_state_handle->get().value() == sm_state_handle->get().value(); - - // Advance timeout SM - if (timeout) - { - switch (timeout_state_handle->get().value()) - { - case SelfHealingOpenSM::GOSSIPPING: - LOG_TRACE_FMT("Advancing timeout SM to VOTING"); - timeout_state_handle->put(SelfHealingOpenSM::VOTING); - break; - case SelfHealingOpenSM::VOTING: - LOG_TRACE_FMT("Advancing timeout SM to OPENING"); - timeout_state_handle->put(SelfHealingOpenSM::OPENING); - break; - case SelfHealingOpenSM::OPENING: - case SelfHealingOpenSM::JOINING: - case SelfHealingOpenSM::OPEN: - default: - LOG_TRACE_FMT("Timeout SM complete"); - } - } - - // Advance self-healing-open SM - switch (sm_state_handle->get().value()) - { - case SelfHealingOpenSM::GOSSIPPING: - { - auto* gossip_handle = tx.ro(network.self_healing_open_gossip); - if ( - gossip_handle->size() == - config.recover.self_healing_open->addresses.size() || - valid_timeout) - { - if (gossip_handle->size() == 0) - { - throw std::logic_error("No gossip addresses provided yet"); - } - - // Lexographically maximum pair - std::optional> maximum; - gossip_handle->foreach( - [&maximum](const auto& iid, const auto& txid) { - if ( - !maximum.has_value() || - maximum.value() < std::make_pair(txid, iid)) - { - maximum = std::make_pair(txid, iid); - } - return true; - }); - - auto* chosen_replica = - tx.rw(network.self_healing_open_chosen_replica); - chosen_replica->put(maximum->second); - sm_state_handle->put(SelfHealingOpenSM::VOTING); - } - return; - } - case SelfHealingOpenSM::VOTING: - { - auto* votes = tx.rw(network.self_healing_open_votes); - if ( - votes->size() >= - config.recover.self_healing_open->addresses.size() / 2 + 1 || - valid_timeout) - { - if (votes->size() == 0) - { - throw std::logic_error( - "We didn't even vote for ourselves, so why should we open?"); - } - LOG_INFO_FMT("Self-healing-open succeeded, now opening network"); - - auto* service = tx.ro(Tables::SERVICE); - auto 
service_info = service->get(); - if (!service_info.has_value()) - { - throw std::logic_error( - "Service information cannot be found to transition service to " - "open"); - } - const auto prev_ident = - tx.ro(Tables::PREVIOUS_SERVICE_IDENTITY) - ->get(); - AbstractGovernanceEffects::ServiceIdentities identities{ - .previous = prev_ident, .next = service_info->cert}; - - sm_state_handle->put(SelfHealingOpenSM::OPENING); - - transition_service_to_open(tx, identities); - } - return; - } - case SelfHealingOpenSM::JOINING: - { - auto chosen_replica = - tx.ro(network.self_healing_open_chosen_replica)->get(); - if (!chosen_replica.has_value()) - { - throw std::logic_error( - "Self-healing-open chosen node not set, cannot join"); - } - auto node_config = tx.ro(this->network.self_healing_open_node_info) - ->get(chosen_replica.value()); - if (!node_config.has_value()) - { - throw std::logic_error(fmt::format( - "Self-healing-open chosen node {} not found", - chosen_replica.value())); - } - - LOG_INFO_FMT( - "Self-healing-open joining {} with service identity {}", - node_config->published_network_address, - node_config->service_identity); - - RINGBUFFER_WRITE_MESSAGE(AdminMessage::restart, to_host); - } - case SelfHealingOpenSM::OPENING: - { - if (valid_timeout) - { - sm_state_handle->put(SelfHealingOpenSM::OPEN); - } - } - case SelfHealingOpenSM::OPEN: - { - // Nothing to do here, we are already opening or open or joining - return; - } - default: - throw std::logic_error(fmt::format( - "Unknown self-healing-open state: {}", - static_cast(sm_state_handle->get().value()))); - } - } - private: bool is_ip(const std::string_view& hostname) { @@ -3244,106 +2935,6 @@ namespace ccf max_version); } - self_healing_open::RequestNodeInfo self_healing_open_node_info() - { - return { - .quote_info = quote_info, - .published_network_address = - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - .intrinsic_id = - 
config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address, - .service_identity = network.identity->cert.str(), - }; - } - - void self_healing_open_gossip_unsafe() - { - // Caller must ensure that the current node's quote_info is populated: - // ie not yet reached partOfNetwork - if (!config.recover.self_healing_open.has_value()) - { - LOG_TRACE_FMT( - "Self-healing-open addresses not set, cannot start gossip retries"); - return; - } - - LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); - - self_healing_open::GossipRequest request{ - .info = self_healing_open_node_info(), - // TODO fix: This isn't quite right, as it should be the highest txid - // with a signature,before the recovery txs - .txid = network.tables->current_version(), - }; - - for (auto& target_address : config.recover.self_healing_open->addresses) - { - self_healing_open::dispatch_authenticated_message( - std::move(request), - target_address, - "gossip", - self_signed_node_cert, - node_sign_kp->private_key_pem()); - } - } - - void self_healing_open_vote_unsafe(SelfHealingOpenNodeInfo_t& node_info) - { - // Caller must ensure that the current node's quote_info is populated: - // ie not yet reached partOfNetwork - LOG_TRACE_FMT( - "Sending self-healing-open vote to {} at {}", - node_info.intrinsic_id, - node_info.published_network_address); - - self_healing_open::VoteRequest request{ - .info = self_healing_open_node_info()}; - - self_healing_open::dispatch_authenticated_message( - std::move(request), - node_info.published_network_address, - "vote", - self_signed_node_cert, - node_sign_kp->private_key_pem()); - } - - void self_healing_open_iamopen_unsafe() - { - // Caller must ensure that the current node's quote_info is populated: - // ie not yet reached partOfNetwork - if (!config.recover.self_healing_open.has_value()) - { - LOG_TRACE_FMT( - "Self-healing-open addresses not set, cannot send iamopen"); - return; - } - - LOG_TRACE_FMT("Sending self-healing-open iamopen"); - - 
self_healing_open::IAmOpenRequest request{ - .info = self_healing_open_node_info()}; - - for (auto& target_address : config.recover.self_healing_open->addresses) - { - if ( - target_address == - config.network.rpc_interfaces.at("primary_rpc_interface") - .published_address) - { - // Don't send to self - continue; - } - self_healing_open::dispatch_authenticated_message( - std::move(request), - target_address, - "iamopen", - self_signed_node_cert, - node_sign_kp->private_key_pem()); - } - } - public: void set_n2n_message_limit(size_t message_limit) { @@ -3422,5 +3013,10 @@ namespace ccf { return writer_factory; } + + SelfHealingOpenService& self_healing_open() override + { + return *self_healing_open_impl; + } }; } diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 56e19b4b8e2a..419cc4f63f51 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -24,7 +24,8 @@ #include "node/rpc/jwt_management.h" #include "node/rpc/no_create_tx_claims_digest.cpp" #include "node/rpc/serialization.h" -#include "node/self_healing_open.h" +#include "node/self_healing_open_impl.h" +#include "node/self_healing_open_types.h" #include "node/session_metrics.h" #include "node_interface.h" #include "service/internal_tables_access.h" @@ -1721,32 +1722,7 @@ namespace ccf ctx.rpc_ctx->set_claims_digest(std::move(digest_value)); } - { - // Reset the self-healing-open state - ccf::kv::Tx& tx = ctx.tx; - auto* state_handle = tx.rw( - Tables::SELF_HEALING_OPEN_SM_STATE); - state_handle->clear(); - auto* timeout_state_handle = - tx.rw( - Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE); - auto* node_info_handle = tx.rw( - Tables::SELF_HEALING_OPEN_NODES); - node_info_handle->clear(); - auto* gossip_state_handle = tx.rw( - Tables::SELF_HEALING_OPEN_GOSSIPS); - gossip_state_handle->clear(); - auto* chosen_replica = tx.rw( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); - chosen_replica->clear(); - auto* votes = - tx.rw(Tables::SELF_HEALING_OPEN_VOTES); - 
votes->clear(); - - // Start timers if necessary - this->node_operation.self_healing_open_try_start_timers( - tx, recovering); - } + this->node_operation.self_healing_open().try_start(ctx.tx, recovering); LOG_INFO_FMT("Created service"); return make_success(true); @@ -2337,7 +2313,7 @@ namespace ccf try { - this->node_operation.self_healing_open_advance(args.tx, false); + this->node_operation.self_healing_open().advance(args.tx, false); } catch (const std::logic_error& e) { @@ -2397,7 +2373,7 @@ namespace ccf try { - this->node_operation.self_healing_open_advance(args.tx, false); + this->node_operation.self_healing_open().advance(args.tx, false); } catch (const std::logic_error& e) { @@ -2464,7 +2440,7 @@ namespace ccf try { - this->node_operation.self_healing_open_advance(args.tx, false); + this->node_operation.self_healing_open().advance(args.tx, false); } catch (const std::logic_error& e) { @@ -2536,7 +2512,7 @@ namespace ccf try { - this->node_operation.self_healing_open_advance(args.tx, true); + this->node_operation.self_healing_open().advance(args.tx, true); } catch (const std::logic_error& e) { diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index b7715616dcc9..cc692e48d192 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -16,6 +16,7 @@ #include "node/ledger_secret.h" #include "node/rpc/gov_effects_interface.h" #include "node/rpc/node_operation_interface.h" +#include "node/self_healing_open_impl.h" #include "node/session_metrics.h" namespace ccf @@ -65,9 +66,7 @@ namespace ccf virtual size_t get_jwt_attempts() = 0; virtual ccf::crypto::Pem get_self_signed_certificate() = 0; virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; - virtual void self_healing_open_try_start_timers( - ccf::kv::Tx& tx, bool recovering) = 0; - virtual void self_healing_open_advance(ccf::kv::Tx&, bool) = 0; + virtual SelfHealingOpenService& self_healing_open() = 0; virtual const ccf::StartupConfig& 
get_node_config() const = 0; virtual ccf::crypto::Pem get_network_cert() = 0; virtual void stop_notice() = 0; diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index 7338134bc56d..b3d3ad495980 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -4,6 +4,7 @@ #include "node/rpc/node_interface.h" #include "node/rpc/node_operation_interface.h" +#include "node/self_healing_open_impl.h" namespace ccf { @@ -110,15 +111,9 @@ namespace ccf return impl.get_cose_signatures_config(); } - void self_healing_open_try_start_timers( - ccf::kv::Tx& tx, bool recovering) override + SelfHealingOpenService& self_healing_open() override { - impl.self_healing_open_try_start_timers(tx, recovering); - } - - void self_healing_open_advance(ccf::kv::Tx& tx, bool is_recovery) override - { - impl.self_healing_open_advance(tx, is_recovery); + return impl.self_healing_open(); } }; } \ No newline at end of file diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index 010ff707b117..b3ace7f004cf 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ -11,6 +11,7 @@ #include "ccf/node_subsystem_interface.h" #include "ccf/service/tables/code_id.h" #include "ccf/tx.h" +#include "node/self_healing_open_impl.h" #include "node/session_metrics.h" namespace ccf @@ -62,8 +63,6 @@ namespace ccf virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; - virtual void self_healing_open_try_start_timers( - ccf::kv::Tx& tx, bool recovering) = 0; - virtual void self_healing_open_advance(ccf::kv::Tx&, bool) = 0; + virtual SelfHealingOpenService& self_healing_open() = 0; }; } \ No newline at end of file diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index 9cf7ac897dbf..fe89f21f4778 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -8,6 +8,7 @@ #include "node/rpc/gov_effects_interface.h" #include 
"node/rpc/node_interface.h" #include "node/rpc/node_operation_interface.h" +#include "node/self_healing_open_impl.h" namespace ccf { @@ -111,15 +112,9 @@ namespace ccf return cose_signatures_config; } - void self_healing_open_try_start_timers( - ccf::kv::Tx& tx, bool recovering) override + SelfHealingOpenService& self_healing_open() override { - // No-op for stub - } - - void self_healing_open_advance(ccf::kv::Tx& tx, bool timeout) override - { - // No-op for stub + throw std::logic_error("Unimplemented"); } }; diff --git a/src/node/self_healing_open.h b/src/node/self_healing_open.h deleted file mode 100644 index abe3825f880b..000000000000 --- a/src/node/self_healing_open.h +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the Apache 2.0 License. - -#pragma once - -#include "ccf/crypto/pem.h" -#include "ccf/ds/json.h" -#include "ccf/ds/quote_info.h" -#include "ccf/kv/version.h" -#include "ds/actors.h" -#include "http/curl.h" - -#include -#include -#include - -namespace ccf::self_healing_open -{ - struct RequestNodeInfo - { - QuoteInfo quote_info; - std::string published_network_address; - std::string intrinsic_id; - std::string service_identity; - }; - DECLARE_JSON_TYPE(RequestNodeInfo); - DECLARE_JSON_REQUIRED_FIELDS( - RequestNodeInfo, - quote_info, - published_network_address, - intrinsic_id, - service_identity); - - struct GossipRequest - { - RequestNodeInfo info; - ccf::kv::Version txid; - }; - DECLARE_JSON_TYPE(GossipRequest); - DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid, info); - - struct VoteRequest - { - RequestNodeInfo info; - }; - DECLARE_JSON_TYPE(VoteRequest); - DECLARE_JSON_REQUIRED_FIELDS(VoteRequest, info); - - struct IAmOpenRequest - { - RequestNodeInfo info; - }; - DECLARE_JSON_TYPE(IAmOpenRequest); - DECLARE_JSON_REQUIRED_FIELDS(IAmOpenRequest, info); - - inline void dispatch_authenticated_message( - nlohmann::json&& request, - const std::string& target_address, - const 
std::string& endpoint, - const crypto::Pem& self_signed_node_cert, - const crypto::Pem& privkey_pem) - { - curl::UniqueCURL curl_handle; - - // diable SSL verification as no private information is sent - curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); - curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); - curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); - - curl_handle.set_blob_opt( - CURLOPT_SSLCERT_BLOB, - self_signed_node_cert.data(), - self_signed_node_cert.size()); - curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); - - curl_handle.set_blob_opt( - CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); - curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); - - auto url = fmt::format( - "https://{}/{}/self_healing_open/{}", - target_address, - get_actor_prefix(ActorsType::nodes), - endpoint); - - curl::UniqueSlist headers; - headers.append("Content-Type", "application/json"); - - auto body = std::make_unique(request); - - auto response_callback = []( - const ccf::curl::CurlRequest& request, - CURLcode curl_code, - long status_code) { - LOG_TRACE_FMT( - "Response received for {} to {}: curl_result {} ({}), status code {}", - request.get_method().c_str(), - request.get_url(), - curl_easy_strerror(curl_code), - curl_code, - status_code); - }; - - auto curl_request = std::make_unique( - std::move(curl_handle), - HTTP_PUT, - std::move(url), - std::move(headers), - std::move(body), - nullptr, - std::move(response_callback)); - - LOG_TRACE_FMT( - "Dispatching attested message for {} to {}: {}", - curl_request->get_method().c_str(), - curl_request->get_url(), - request.dump()); - - curl::CurlmLibuvContextSingleton::get_instance()->attach_request( - std::move(curl_request)); - } - -} \ No newline at end of file diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp new file mode 100644 index 000000000000..5017d6e7826c --- /dev/null +++ b/src/node/self_healing_open_impl.cpp @@ -0,0 +1,525 @@ +// Copyright (c) Microsoft Corporation. 
All rights reserved. +// Licensed under the Apache 2.0 License. + +#include "self_healing_open_impl.h" + +#include "node_state.h" + +namespace ccf +{ + void SelfHealingOpenService::try_start(ccf::kv::Tx& tx, bool recovering) + { + if ( + !recovering || !node_state->config.recover.self_healing_open.has_value()) + { + LOG_TRACE_FMT( + "Not recovering, or no self-healing-open addresses configured, " + "not starting self-healing-open timers"); + return; + } + + // Reset the self-healing-open state + auto* state_handle = + tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE); + state_handle->clear(); + auto* timeout_state_handle = tx.rw( + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE); + auto* node_info_handle = + tx.rw(Tables::SELF_HEALING_OPEN_NODES); + node_info_handle->clear(); + auto* gossip_state_handle = + tx.rw(Tables::SELF_HEALING_OPEN_GOSSIPS); + gossip_state_handle->clear(); + auto* chosen_replica = tx.rw( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); + chosen_replica->clear(); + auto* votes = + tx.rw(Tables::SELF_HEALING_OPEN_VOTES); + votes->clear(); + + start_message_retry_timers(); + start_failover_timers(tx); + } + + void SelfHealingOpenService::advance(ccf::kv::Tx& tx, bool timeout) + { + auto* sm_state_handle = + tx.rw(node_state->network.self_healing_open_sm_state); + auto* timeout_state_handle = + tx.rw(node_state->network.self_healing_open_timeout_sm_state); + if ( + !sm_state_handle->get().has_value() || + !timeout_state_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open state not set, cannot advance self-healing-open"); + } + + bool valid_timeout = timeout && + timeout_state_handle->get().value() == sm_state_handle->get().value(); + + // Advance timeout SM + if (timeout) + { + switch (timeout_state_handle->get().value()) + { + case SelfHealingOpenSM::GOSSIPPING: + LOG_TRACE_FMT("Advancing timeout SM to VOTING"); + timeout_state_handle->put(SelfHealingOpenSM::VOTING); + break; + case SelfHealingOpenSM::VOTING: + 
LOG_TRACE_FMT("Advancing timeout SM to OPENING"); + timeout_state_handle->put(SelfHealingOpenSM::OPENING); + break; + case SelfHealingOpenSM::OPENING: + case SelfHealingOpenSM::JOINING: + case SelfHealingOpenSM::OPEN: + default: + LOG_TRACE_FMT("Timeout SM complete"); + } + } + + // Advance self-healing-open SM + switch (sm_state_handle->get().value()) + { + case SelfHealingOpenSM::GOSSIPPING: + { + auto* gossip_handle = + tx.ro(node_state->network.self_healing_open_gossip); + auto quorum_size = + node_state->config.recover.self_healing_open->addresses.size(); + if (gossip_handle->size() >= quorum_size || valid_timeout) + { + if (gossip_handle->size() == 0) + { + throw std::logic_error("No gossip addresses provided yet"); + } + + // Lexographically maximum pair + std::optional> maximum; + gossip_handle->foreach([&maximum](const auto& iid, const auto& txid) { + if ( + !maximum.has_value() || + maximum.value() < std::make_pair(txid, iid)) + { + maximum = std::make_pair(txid, iid); + } + return true; + }); + + auto* chosen_replica = + tx.rw(node_state->network.self_healing_open_chosen_replica); + chosen_replica->put(maximum->second); + sm_state_handle->put(SelfHealingOpenSM::VOTING); + } + return; + } + case SelfHealingOpenSM::VOTING: + { + auto* votes = tx.rw(node_state->network.self_healing_open_votes); + if ( + votes->size() >= + node_state->config.recover.self_healing_open->addresses.size() / 2 + + 1 || + valid_timeout) + { + if (votes->size() == 0) + { + throw std::logic_error( + "We didn't even vote for ourselves, so why should we open?"); + } + LOG_INFO_FMT("Self-healing-open succeeded, now opening network"); + + auto* service = tx.ro(Tables::SERVICE); + auto service_info = service->get(); + if (!service_info.has_value()) + { + throw std::logic_error( + "Service information cannot be found to transition service to " + "open"); + } + const auto prev_ident = + tx.ro(Tables::PREVIOUS_SERVICE_IDENTITY) + ->get(); + AbstractGovernanceEffects::ServiceIdentities 
identities{ + .previous = prev_ident, .next = service_info->cert}; + + sm_state_handle->put(SelfHealingOpenSM::OPENING); + + node_state->transition_service_to_open(tx, identities); + } + return; + } + case SelfHealingOpenSM::JOINING: + { + auto chosen_replica = + tx.ro(node_state->network.self_healing_open_chosen_replica)->get(); + if (!chosen_replica.has_value()) + { + throw std::logic_error( + "Self-healing-open chosen node not set, cannot join"); + } + auto node_config = + tx.ro(node_state->network.self_healing_open_node_info) + ->get(chosen_replica.value()); + if (!node_config.has_value()) + { + throw std::logic_error(fmt::format( + "Self-healing-open chosen node {} not found", + chosen_replica.value())); + } + + LOG_INFO_FMT( + "Self-healing-open joining {} with service identity {}", + node_config->published_network_address, + node_config->service_identity); + + RINGBUFFER_WRITE_MESSAGE(AdminMessage::restart, node_state->to_host); + } + case SelfHealingOpenSM::OPENING: + { + if (valid_timeout) + { + sm_state_handle->put(SelfHealingOpenSM::OPEN); + } + } + case SelfHealingOpenSM::OPEN: + { + // Nothing to do here, we are already opening or open or joining + return; + } + default: + throw std::logic_error(fmt::format( + "Unknown self-healing-open state: {}", + static_cast(sm_state_handle->get().value()))); + } + } + + void SelfHealingOpenService::start_message_retry_timers() + { + auto retry_timer_msg = std::make_unique<::threading::Tmsg>( + [](std::unique_ptr<::threading::Tmsg> msg) { + std::lock_guard guard(msg->data.self.node_state->lock); + + auto tx = + msg->data.self.node_state->network.tables->create_read_only_tx(); + auto* sm_state_handle = + tx.ro(msg->data.self.node_state->network.self_healing_open_sm_state); + if (!sm_state_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open state not set, cannot retry " + "self-healing-open"); + } + auto sm_state = sm_state_handle->get().value(); + + // Keep doing this until the node is no 
longer in recovery + if (sm_state == SelfHealingOpenSM::OPEN) + { + LOG_INFO_FMT("Self-healing-open complete, stopping timers."); + return; + } + + switch (sm_state) + { + case SelfHealingOpenSM::GOSSIPPING: + msg->data.self.send_gossip_unsafe(); + break; + case SelfHealingOpenSM::VOTING: + { + auto* node_info_handle = tx.ro( + msg->data.self.node_state->network.self_healing_open_node_info); + auto* chosen_replica_handle = + tx.ro(msg->data.self.node_state->network + .self_healing_open_chosen_replica); + if (!chosen_replica_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open chosen node not set, cannot vote"); + } + auto chosen_node_info = + node_info_handle->get(chosen_replica_handle->get().value()); + if (!chosen_node_info.has_value()) + { + throw std::logic_error(fmt::format( + "Self-healing-open chosen node {} not found", + chosen_replica_handle->get().value())); + } + msg->data.self.send_vote_unsafe(chosen_node_info.value()); + // keep gossiping to allow lagging nodes to eventually vote + msg->data.self.send_gossip_unsafe(); + break; + } + case SelfHealingOpenSM::OPENING: + msg->data.self.send_iamopen_unsafe(); + break; + case SelfHealingOpenSM::JOINING: + return; + default: + throw std::logic_error(fmt::format( + "Unknown self-healing-open state: {}", + static_cast(sm_state))); + } + + auto delay = msg->data.self.node_state->config.recover + .self_healing_open->retry_timeout; + ::threading::ThreadMessaging::instance().add_task_after( + std::move(msg), delay); + }, + *node_state); + // kick this off asynchronously as this can be called from a curl callback + ::threading::ThreadMessaging::instance().add_task( + threading::get_current_thread_id(), std::move(retry_timer_msg)); + } + + void SelfHealingOpenService::start_failover_timers(ccf::kv::Tx& tx) + { + auto* state_handle = tx.rw(node_state->network.self_healing_open_sm_state); + state_handle->put(SelfHealingOpenSM::GOSSIPPING); + auto* timeout_state_handle = + 
tx.rw(node_state->network.self_healing_open_timeout_sm_state); + timeout_state_handle->put(SelfHealingOpenSM::GOSSIPPING); + + // Dispatch timeouts + auto timeout_msg = std::make_unique<::threading::Tmsg>( + [](std::unique_ptr<::threading::Tmsg> msg) { + std::lock_guard guard(msg->data.self.node_state->lock); + LOG_TRACE_FMT( + "Self-healing-open timeout, sending timeout to internal handlers"); + + // Stop the timer if the node has completed its self-healing-open + auto tx = + msg->data.self.node_state->network.tables->create_read_only_tx(); + auto* sm_state_handle = + tx.ro(msg->data.self.node_state->network.self_healing_open_sm_state); + if (!sm_state_handle->get().has_value()) + { + throw std::logic_error( + "Self-healing-open state not set, cannot retry " + "self-healing-open"); + } + auto sm_state = sm_state_handle->get().value(); + if (sm_state == SelfHealingOpenSM::OPEN) + { + LOG_INFO_FMT("Self-healing-open complete, stopping timers."); + return; + } + + // Send a timeout to the internal handlers + curl::UniqueCURL curl_handle; + + auto cert = msg->data.self.node_state->self_signed_node_cert; + curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); + + curl_handle.set_blob_opt( + CURLOPT_SSLCERT_BLOB, cert.data(), cert.size()); + curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); + + auto privkey_pem = + msg->data.self.node_state->node_sign_kp->private_key_pem(); + curl_handle.set_blob_opt( + CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); + curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); + + auto url = fmt::format( + "https://{}/{}/self_healing_open/timeout", + msg->data.self.node_state->config.network.rpc_interfaces + .at("primary_rpc_interface") + .published_address, + get_actor_prefix(ActorsType::nodes)); + + curl::UniqueSlist headers; + headers.append("Content-Type: application/json"); + + auto curl_request = std::make_unique( + 
std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + nullptr, + nullptr, + std::nullopt); + curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + std::move(curl_request)); + + auto delay = + msg->data.self.node_state->config.recover.self_healing_open->timeout; + ::threading::ThreadMessaging::instance().add_task_after( + std::move(msg), delay); + }, + *this); + ::threading::ThreadMessaging::instance().add_task_after( + std::move(timeout_msg), + node_state->config.recover.self_healing_open->timeout); + } + + inline void dispatch_authenticated_message( + nlohmann::json&& request, + const std::string& target_address, + const std::string& endpoint, + const crypto::Pem& self_signed_node_cert, + const crypto::Pem& privkey_pem) + { + curl::UniqueCURL curl_handle; + + // diable SSL verification as no private information is sent + curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); + curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); + + curl_handle.set_blob_opt( + CURLOPT_SSLCERT_BLOB, + self_signed_node_cert.data(), + self_signed_node_cert.size()); + curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); + + curl_handle.set_blob_opt( + CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); + curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); + + auto url = fmt::format( + "https://{}/{}/self_healing_open/{}", + target_address, + get_actor_prefix(ActorsType::nodes), + endpoint); + + curl::UniqueSlist headers; + headers.append("Content-Type", "application/json"); + + auto body = std::make_unique(request); + + auto response_callback = []( + const ccf::curl::CurlRequest& request, + CURLcode curl_code, + long status_code) { + LOG_TRACE_FMT( + "Response received for {} to {}: curl_result {} ({}), status code {}", + request.get_method().c_str(), + request.get_url(), + curl_easy_strerror(curl_code), + curl_code, + status_code); + }; + + auto curl_request = std::make_unique( + 
std::move(curl_handle), + HTTP_PUT, + std::move(url), + std::move(headers), + std::move(body), + nullptr, + std::move(response_callback)); + + LOG_TRACE_FMT( + "Dispatching attested message for {} to {}: {}", + curl_request->get_method().c_str(), + curl_request->get_url(), + request.dump()); + + curl::CurlmLibuvContextSingleton::get_instance()->attach_request( + std::move(curl_request)); + } + + self_healing_open::RequestNodeInfo SelfHealingOpenService::make_node_info() + { + return { + .quote_info = node_state->quote_info, + .published_network_address = + node_state->config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + .intrinsic_id = + node_state->config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address, + .service_identity = node_state->network.identity->cert.str(), + }; + } + + void SelfHealingOpenService::send_gossip_unsafe() + { + // Caller must ensure that the current node's quote_info is populated: + // ie not yet reached partOfNetwork + if (!node_state->config.recover.self_healing_open.has_value()) + { + LOG_TRACE_FMT( + "Self-healing-open addresses not set, cannot start gossip retries"); + return; + } + + LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); + + self_healing_open::GossipRequest request{ + .info = make_node_info(), + // TODO fix: This isn't quite right, as it should be the highest txid + // with a signature,before the recovery txs + .txid = node_state->network.tables->current_version(), + }; + + for (auto& target_address : + node_state->config.recover.self_healing_open->addresses) + { + dispatch_authenticated_message( + std::move(request), + target_address, + "gossip", + node_state->self_signed_node_cert, + node_state->node_sign_kp->private_key_pem()); + } + } + + void SelfHealingOpenService::send_vote_unsafe( + const SelfHealingOpenNodeInfo_t& node_info) + { + // Caller must ensure that the current node's quote_info is populated: + // ie not yet reached partOfNetwork + LOG_TRACE_FMT( + 
"Sending self-healing-open vote to {} at {}", + node_info.intrinsic_id, + node_info.published_network_address); + + self_healing_open::VoteRequest request{.info = make_node_info()}; + + dispatch_authenticated_message( + std::move(request), + node_info.published_network_address, + "vote", + node_state->self_signed_node_cert, + node_state->node_sign_kp->private_key_pem()); + } + + void SelfHealingOpenService::send_iamopen_unsafe() + { + // Caller must ensure that the current node's quote_info is populated: + // ie not yet reached partOfNetwork + if (!node_state->config.recover.self_healing_open.has_value()) + { + LOG_TRACE_FMT("Self-healing-open addresses not set, cannot send iamopen"); + return; + } + + LOG_TRACE_FMT("Sending self-healing-open iamopen"); + + self_healing_open::IAmOpenRequest request{.info = make_node_info()}; + + for (auto& target_address : + node_state->config.recover.self_healing_open->addresses) + { + if ( + target_address == + node_state->config.network.rpc_interfaces.at("primary_rpc_interface") + .published_address) + { + // Don't send to self + continue; + } + dispatch_authenticated_message( + std::move(request), + target_address, + "iamopen", + node_state->self_signed_node_cert, + node_state->node_sign_kp->private_key_pem()); + } + } + +} \ No newline at end of file diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h new file mode 100644 index 000000000000..9e77d680ac44 --- /dev/null +++ b/src/node/self_healing_open_impl.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. 
+#pragma once + +#include "ccf/service/tables/self_heal_open.h" +#include "ccf/tx.h" +#include "self_healing_open_types.h" + +namespace ccf +{ + class NodeState; + class SelfHealingOpenService + { + private: + // SelfHealingOpenService is owned by NodeState + NodeState* node_state; + + public: + SelfHealingOpenService(NodeState* node_state) : node_state(node_state) {} + void try_start(ccf::kv::Tx& tx, bool recovering); + void advance(ccf::kv::Tx& tx, bool timeout); + + private: + struct SHOMsg + { + SHOMsg(SelfHealingOpenService& self_) : self(self_) {} + SelfHealingOpenService& self; + }; + + // Start path + void start_message_retry_timers(); + void start_failover_timers(ccf::kv::Tx& tx); + + // Steady state operations + self_healing_open::RequestNodeInfo make_node_info(); + void send_gossip_unsafe(); + void send_vote_unsafe(const SelfHealingOpenNodeInfo_t&); + void send_iamopen_unsafe(); + }; +} \ No newline at end of file diff --git a/src/node/self_healing_open_types.h b/src/node/self_healing_open_types.h new file mode 100644 index 000000000000..3bc99c225650 --- /dev/null +++ b/src/node/self_healing_open_types.h @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the Apache 2.0 License. 
+ +#pragma once + +#include "ccf/crypto/pem.h" +#include "ccf/ds/json.h" +#include "ccf/ds/quote_info.h" +#include "ccf/kv/version.h" +#include "ds/actors.h" +#include "http/curl.h" + +#include +#include +#include + +namespace ccf::self_healing_open +{ + struct RequestNodeInfo + { + QuoteInfo quote_info; + std::string published_network_address; + std::string intrinsic_id; + std::string service_identity; + }; + DECLARE_JSON_TYPE(RequestNodeInfo); + DECLARE_JSON_REQUIRED_FIELDS( + RequestNodeInfo, + quote_info, + published_network_address, + intrinsic_id, + service_identity); + + struct GossipRequest + { + RequestNodeInfo info; + ccf::kv::Version txid; + }; + DECLARE_JSON_TYPE(GossipRequest); + DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid, info); + + struct VoteRequest + { + RequestNodeInfo info; + }; + DECLARE_JSON_TYPE(VoteRequest); + DECLARE_JSON_REQUIRED_FIELDS(VoteRequest, info); + + struct IAmOpenRequest + { + RequestNodeInfo info; + }; + DECLARE_JSON_TYPE(IAmOpenRequest); + DECLARE_JSON_REQUIRED_FIELDS(IAmOpenRequest, info); + +} \ No newline at end of file From f16c4526f2d342be8d574ef1c640a89f7f2b3ddc Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 17 Sep 2025 18:02:16 +0100 Subject: [PATCH 126/197] Inline to prevent ODR violations --- src/crypto/csr.h | 2 +- src/node/node_state.h | 2 +- src/service/internal_tables_access.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/crypto/csr.h b/src/crypto/csr.h index 73cc2789f533..f9123196de82 100644 --- a/src/crypto/csr.h +++ b/src/crypto/csr.h @@ -13,7 +13,7 @@ namespace ccf::crypto * @param signing_request CSR to extract the public key from * @return extracted public key */ - Pem public_key_pem_from_csr(const Pem& signing_request) + inline Pem public_key_pem_from_csr(const Pem& signing_request) { X509* icrt = NULL; OpenSSL::Unique_BIO mem(signing_request); diff --git a/src/node/node_state.h b/src/node/node_state.h index 7e00a11a7d1d..7122811c09b8 100644 --- 
a/src/node/node_state.h +++ b/src/node/node_state.h @@ -87,7 +87,7 @@ namespace ccf ccf::crypto::Pem service_cert; }; - void reset_data(std::vector& data) + inline void reset_data(std::vector& data) { data.clear(); data.shrink_to_fit(); diff --git a/src/service/internal_tables_access.h b/src/service/internal_tables_access.h index ba121473ef19..1a81abefc6cf 100644 --- a/src/service/internal_tables_access.h +++ b/src/service/internal_tables_access.h @@ -29,12 +29,12 @@ namespace ccf { /* We can't query the past epochs' TXs if the service hasn't been opened * yet. We do guess values based on epoch value and seqno changing rules. */ - ccf::TxID previous_tx_if_recovery(ccf::TxID txid) + inline ccf::TxID previous_tx_if_recovery(ccf::TxID txid) { return ccf::TxID{ .view = txid.view - aft::starting_view_change, .seqno = txid.seqno - 1}; } - ccf::TxID next_tx_if_recovery(ccf::TxID txid) + inline ccf::TxID next_tx_if_recovery(ccf::TxID txid) { return ccf::TxID{ .view = txid.view + aft::starting_view_change, .seqno = txid.seqno + 1}; From 8003292d864ce2e4e2878384357283cdf7becf71 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 17 Sep 2025 18:02:35 +0100 Subject: [PATCH 127/197] make cmake happy --- CMakeLists.txt | 4 ++-- src/node/self_healing_open_impl.cpp | 4 +--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56dc0488f5aa..7cfbb13a1826 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -673,7 +673,7 @@ if(BUILD_TESTS) ) target_link_libraries( frontend_test PRIVATE ${CMAKE_THREAD_LIBS_INIT} http_parser ccf_js - ccf_endpoints ccfcrypto ccf_kv + ccf_endpoints ccfcrypto ccf_kv uv curl ) add_unit_test( @@ -704,7 +704,7 @@ if(BUILD_TESTS) ) target_link_libraries( node_frontend_test PRIVATE ${CMAKE_THREAD_LIBS_INIT} http_parser ccf_js - ccf_endpoints ccfcrypto ccf_kv + ccf_endpoints ccfcrypto ccf_kv uv curl ) add_unit_test( diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 
5017d6e7826c..ff0f652b3332 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -1,8 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the Apache 2.0 License. -#include "self_healing_open_impl.h" - #include "node_state.h" namespace ccf @@ -266,7 +264,7 @@ namespace ccf ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, - *node_state); + *this); // kick this off asynchronously as this can be called from a curl callback ::threading::ThreadMessaging::instance().add_task( threading::get_current_thread_id(), std::move(retry_timer_msg)); From d21d71fd209aa78abdd8a93807b95c3dc720da66 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 17 Sep 2025 18:05:58 +0100 Subject: [PATCH 128/197] fmt --- CMakeLists.txt | 29 ++++++++++++++++------ src/host/configuration.h | 3 ++- tla/disaster-recovery/stateright/Readme.md | 2 -- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cfbb13a1826..f8694cbd6c34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -667,13 +667,20 @@ if(BUILD_TESTS) add_unit_test( frontend_test ${CMAKE_CURRENT_SOURCE_DIR}/src/node/rpc/test/frontend_test.cpp - ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp + ${CCF_DIR}/src/node/quote.cpp + ${CCF_DIR}/src/node/uvm_endorsements.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/node/self_healing_open_impl.cpp - ) target_link_libraries( - frontend_test PRIVATE ${CMAKE_THREAD_LIBS_INIT} http_parser ccf_js - ccf_endpoints ccfcrypto ccf_kv uv curl + frontend_test + PRIVATE ${CMAKE_THREAD_LIBS_INIT} + http_parser + ccf_js + ccf_endpoints + ccfcrypto + ccf_kv + uv + curl ) add_unit_test( @@ -699,12 +706,20 @@ if(BUILD_TESTS) add_unit_test( node_frontend_test ${CMAKE_CURRENT_SOURCE_DIR}/src/node/rpc/test/node_frontend_test.cpp - ${CCF_DIR}/src/node/quote.cpp ${CCF_DIR}/src/node/uvm_endorsements.cpp + ${CCF_DIR}/src/node/quote.cpp + 
${CCF_DIR}/src/node/uvm_endorsements.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/node/self_healing_open_impl.cpp ) target_link_libraries( - node_frontend_test PRIVATE ${CMAKE_THREAD_LIBS_INIT} http_parser ccf_js - ccf_endpoints ccfcrypto ccf_kv uv curl + node_frontend_test + PRIVATE ${CMAKE_THREAD_LIBS_INIT} + http_parser + ccf_js + ccf_endpoints + ccfcrypto + ccf_kv + uv + curl ) add_unit_test( diff --git a/src/host/configuration.h b/src/host/configuration.h index 4499e79627d3..985503603372 100644 --- a/src/host/configuration.h +++ b/src/host/configuration.h @@ -118,7 +118,8 @@ namespace host std::string previous_service_identity_file; std::optional previous_sealed_ledger_secret_location = std::nullopt; - std::optional self_healing_open = std::nullopt; + std::optional self_healing_open = + std::nullopt; bool operator==(const Recover&) const = default; }; Recover recover = {}; diff --git a/tla/disaster-recovery/stateright/Readme.md b/tla/disaster-recovery/stateright/Readme.md index 637c987fd78f..e337787e0027 100644 --- a/tla/disaster-recovery/stateright/Readme.md +++ b/tla/disaster-recovery/stateright/Readme.md @@ -9,5 +9,3 @@ The specification can be checked from the command line via `cargo run check`. However, a more useful UX is via the web-view which is hosted locally via `cargo run serve`. This allows you to explore the specification actions interactively, and the checker can be exhaustively run using the `Run to completion` button, which should find several useful examples of states where the network is opened, and where a deadlock is reached. 
- - From 1dc62d0f842668070399ec41c4250ee93fa0899c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 17 Sep 2025 18:07:08 +0100 Subject: [PATCH 129/197] Fixup todo --- src/node/self_healing_open_impl.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index ff0f652b3332..737a51948e44 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -449,9 +449,7 @@ namespace ccf self_healing_open::GossipRequest request{ .info = make_node_info(), - // TODO fix: This isn't quite right, as it should be the highest txid - // with a signature,before the recovery txs - .txid = node_state->network.tables->current_version(), + .txid = node_state->get_last_recovered_signed_idx(), }; for (auto& target_address : From 3e04eb424cad17420b9c4006f907120bfa2506d9 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 17 Sep 2025 18:10:18 +0100 Subject: [PATCH 130/197] clean imports diff --- src/node/node_state.h | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 7122811c09b8..b8f1a8690c85 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -7,11 +7,8 @@ #include "ccf/crypto/symmetric_key.h" #include "ccf/crypto/verifier.h" #include "ccf/ds/json.h" -#include "ccf/ds/unit_strings.h" #include "ccf/js/core/context.h" -#include "ccf/json_handler.h" #include "ccf/node/cose_signatures_config.h" -#include "ccf/odata_error.h" #include "ccf/pal/attestation_sev_snp.h" #include "ccf/pal/locking.h" #include "ccf/pal/platform.h" @@ -22,7 +19,6 @@ #include "ccf/service/tables/acme_certificates.h" #include "ccf/service/tables/self_heal_open.h" #include "ccf/service/tables/service.h" -#include "ccf/threading/thread_ids.h" #include "ccf/tx.h" #include "ccf_acme_client.h" #include "consensus/aft/raft.h" @@ -31,14 +27,10 @@ #include "ds/ccf_assert.h" #include "ds/files.h" #include 
"ds/internal_logger.h" -#include "ds/ring_buffer_types.h" #include "ds/state_machine.h" -#include "ds/thread_messaging.h" -#include "enclave/interface.h" #include "enclave/rpc_sessions.h" #include "encryptor.h" #include "history.h" -#include "http/curl.h" #include "http/http_parser.h" #include "indexing/indexer.h" #include "js/global_class_ids.h" @@ -2920,7 +2912,7 @@ namespace ccf { CCF_ASSERT( snp_tcb_version.has_value(), - "TCB version must be set when unsealing ledger sec/ret"); + "TCB version must be set when unsealing ledger secret"); CCF_ASSERT( config.recover.previous_sealed_ledger_secret_location.has_value(), From 9b5d6b619c10b0b7dc12cdcc3512cdc35156bc70 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Thu, 18 Sep 2025 15:36:22 +0100 Subject: [PATCH 131/197] Fix clang-tidy errors --- src/node/self_healing_open_impl.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 737a51948e44..42af0407cb33 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -353,8 +353,8 @@ namespace ccf node_state->config.recover.self_healing_open->timeout); } - inline void dispatch_authenticated_message( - nlohmann::json&& request, + void dispatch_authenticated_message( + nlohmann::json& request, const std::string& target_address, const std::string& endpoint, const crypto::Pem& self_signed_node_cert, @@ -451,12 +451,13 @@ namespace ccf .info = make_node_info(), .txid = node_state->get_last_recovered_signed_idx(), }; + nlohmann::json request_json = request; for (auto& target_address : node_state->config.recover.self_healing_open->addresses) { dispatch_authenticated_message( - std::move(request), + request_json, target_address, "gossip", node_state->self_signed_node_cert, @@ -476,8 +477,10 @@ namespace ccf self_healing_open::VoteRequest request{.info = make_node_info()}; + nlohmann::json request_json = request; + 
dispatch_authenticated_message( - std::move(request), + request_json, node_info.published_network_address, "vote", node_state->self_signed_node_cert, @@ -497,6 +500,7 @@ namespace ccf LOG_TRACE_FMT("Sending self-healing-open iamopen"); self_healing_open::IAmOpenRequest request{.info = make_node_info()}; + nlohmann::json request_json = request; for (auto& target_address : node_state->config.recover.self_healing_open->addresses) @@ -510,7 +514,7 @@ namespace ccf continue; } dispatch_authenticated_message( - std::move(request), + request_json, target_address, "iamopen", node_state->self_signed_node_cert, From 7950eb7e3e10acffbbef234509600d2adae855e2 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 19 Sep 2025 10:24:46 +0100 Subject: [PATCH 132/197] error reporter imports --- src/http/error_reporter.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/http/error_reporter.h b/src/http/error_reporter.h index b79a32c2390d..779e474a17ec 100644 --- a/src/http/error_reporter.h +++ b/src/http/error_reporter.h @@ -2,6 +2,8 @@ // Licensed under the Apache 2.0 License. 
#pragma once +#include "ccf/rpc_context.h" + namespace http { class ErrorReporter From 45f54f041fe3f87ccfb78904ea893db1dcadec42 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 19 Sep 2025 10:25:13 +0100 Subject: [PATCH 133/197] remove extra e2e_curl --- CMakeLists.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f8694cbd6c34..e3d8557a17be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1174,10 +1174,6 @@ if(BUILD_TESTS) 10000 --use-jwt ) - add_test_bin( - curl_test ${CMAKE_CURRENT_SOURCE_DIR}/src/http/test/curl_test.cpp - ) - target_link_libraries(curl_test PRIVATE curl uv http_parser) if(LONG_TESTS) add_e2e_test( @@ -1188,12 +1184,6 @@ if(BUILD_TESTS) curl_test ${CMAKE_CURRENT_SOURCE_DIR}/src/http/test/curl_test.cpp ) target_link_libraries(curl_test PRIVATE curl uv http_parser) - - if(LONG_TESTS) - add_e2e_test( - NAME e2e_curl PYTHON_SCRIPT ${CMAKE_SOURCE_DIR}/tests/e2e_curl.py - ) - endif() endif() endif() From f0175d99c9f6693ff177a0375f64479471217a90 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 19 Sep 2025 17:41:42 +0100 Subject: [PATCH 134/197] Basic running test --- src/node/rpc/node_frontend.h | 10 +++--- src/node/self_healing_open_impl.cpp | 4 ++- tests/e2e_operations.py | 21 ------------ tests/infra/network.py | 53 ++++++++++++++++------------- 4 files changed, 36 insertions(+), 52 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 419cc4f63f51..840d5f7f7801 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -437,7 +437,7 @@ namespace ccf { const auto [code, message] = quote_verification_error(verify_result); LOG_FAIL_FMT( - "Self-healing-open gossip from intrinsic id {} is invalid: {} ({})", + "Self-healing-open message from intrinsic id {} is invalid: {} ({})", in.intrinsic_id, code, message); @@ -445,7 +445,7 @@ namespace ccf } LOG_TRACE_FMT( - "Self-healing-open gossip from intrinsic id {}'s quote is valid", + 
"Self-healing-open message from intrinsic id {}'s quote is valid", in.intrinsic_id); // Validating that we haven't heard from this node before, of if we have @@ -459,12 +459,12 @@ namespace ccf if (existing_node_info->cert_der != cert_der) { LOG_FAIL_FMT( - "Self-healing-open gossip from intrinsic id {} is invalid: " + "Self-healing-open message from intrinsic id {} is invalid: " "certificate has changed", in.intrinsic_id); return std::make_tuple( HTTP_STATUS_BAD_REQUEST, - "Self-healing-open gossip from intrinsic id is invalid: " + "Self-healing-open message from intrinsic id is invalid: " "certificate has changed"); } } @@ -479,8 +479,6 @@ namespace ccf node_info_handle->put(in.intrinsic_id, src_info); } - // TODO validate that this gossip is for the same network - return std::nullopt; }; diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 42af0407cb33..7c327357a560 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -16,6 +16,8 @@ namespace ccf return; } + LOG_INFO_FMT("Starting self-healing-open"); + // Reset the self-healing-open state auto* state_handle = tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE); @@ -449,7 +451,7 @@ namespace ccf self_healing_open::GossipRequest request{ .info = make_node_info(), - .txid = node_state->get_last_recovered_signed_idx(), + .txid = node_state->last_recovered_signed_idx, }; nlohmann::json request_json = request; diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index a0ca0e29b801..8270b4dbd2dc 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1499,27 +1499,6 @@ def run_self_healing_open(args): committed_ledger_dirs=committed_ledger_dirs, ) - def cycle(items): - while True: - for item in items: - yield item - - # Wait for any node to be waiting for RecoveryShares, ie it opened - for node in cycle(recovered_network.nodes): - try: - recovered_network.wait_for_statuses( - node, - ["WaitingForRecoveryShares", "Open"], - 
timeout=1, - verify_ca=False, - ) - break - except TimeoutError: - LOG.info( - f"Failed to get the status of {node.local_node_id}, retrying..." - ) - continue - # Refresh the the declared state of nodes which have shut themselves down to join. for node in recovered_network.nodes: node.refresh_network_state(verify_ca=False) diff --git a/tests/infra/network.py b/tests/infra/network.py index 9cd0eba78df0..e5d4b264314f 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -796,6 +796,8 @@ def start_in_self_healing_open( i: None for i in range(len(self.nodes)) } snapshot_dirs = snapshot_dirs or {i: None for i in range(len(self.nodes))} + + # separate out all starting nodes' directories such that they recover independently self.per_node_args_override = { i: ( d @@ -808,9 +810,11 @@ def start_in_self_healing_open( for i, d in self.per_node_args_override.items() } + # Fix the port numbers to make all nodes _well known_ for i, node in enumerate(self.nodes): - node.host.get_primary_interface().port = 5000 + (i + 1) - node.host.get_primary_interface().public_port = 5000 + (i + 1) + port = 1000 + random.randint(0, 64534) + node.host.get_primary_interface().port = port + node.host.get_primary_interface().public_port = port LOG.info("Set up nodes") for node in self.nodes: @@ -858,30 +862,31 @@ def start_in_self_healing_open( self.election_duration = args.election_timeout_ms / 1000 self.observed_election_duration = self.election_duration + 1 - for i, node in enumerate(self.nodes): - end_time = time.time() + timeout - success = False - while time.time() < end_time: - try: - self.wait_for_states( - node, - [ - infra.node.State.PART_OF_PUBLIC_NETWORK.value, - infra.node.State.PART_OF_NETWORK, - ], - timeout=args.ledger_recovery_timeout, - verify_ca=False, # Certs are volatile until the recovery is complete - ) - success = True - break - except CCFConnectionException: - time.sleep(0.1) - if not success: - raise TimeoutError( - f"Failed to get state of node 
{node.local_node_id} after {timeout} seconds" + def cycle(items): + while True: + for item in items: + yield item + + # Waiting for any node to transition-to-open + end_time = time.time() + timeout + for i, node in cycle(enumerate(self.nodes)): + if time.time() > end_time: + raise TimeoutError("Timed out waiting for any node to open") + try: + self.wait_for_statuses( + node, + ["WaitingForRecoveryShares", "Open"], + timeout=0.5, + verify_ca=False, ) + break + except TimeoutError: + LOG.info( + f"Failed to get the status of {node.local_node_id}, retrying..." + ) + continue - LOG.info("All nodes started") + LOG.info("One node opened") def recover( self, From e9cb10dd889bc952b06288f0fa43cd2f7fd85d30 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 19 Sep 2025 19:45:01 +0100 Subject: [PATCH 135/197] Allow curl handles to fix themselves during shutdown. --- src/http/curl.h | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index 25e9cdbb38a2..f3882425fa1b 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -714,12 +714,6 @@ namespace ccf::curl static void libuv_socket_poll_callback( uv_poll_t* req, int status, int events) { - if (status < 0) - { - LOG_FAIL_FMT("Socket poll error: {}", uv_strerror(status)); - return; - } - auto* socket_context = static_cast(req->data); if (socket_context == nullptr) { @@ -736,7 +730,27 @@ namespace ccf::curl if (self->is_stopping) { - LOG_FAIL_FMT("libuv_socket_poll_callback called while stopping"); + LOG_FAIL_FMT( + "libuv_socket_poll_callback called on {} while stopped", + socket_context->socket); + return; + } + + if (status < 0) + { + LOG_INFO_FMT( + "Socket poll error on {}: {}", + socket_context->socket, + uv_strerror(status)); + + // Notify curl of the error + CHECK_CURL_MULTI( + curl_multi_socket_action, + self->curl_request_curlm, + socket_context->socket, + CURL_CSELECT_ERR, + nullptr); + self->curl_request_curlm.perform(); return; } 
@@ -779,15 +793,17 @@ namespace ccf::curl case CURL_POLL_OUT: case CURL_POLL_INOUT: { - // Possibly called during shutdown + LOG_INFO_FMT( + "Curl socket callback: listen on socket {}, {}", + static_cast(s), + static_cast(action)); + + // During shutdown ignore requests to add new sockets if (self->is_stopping) { - LOG_FAIL_FMT("curl_socket_callback called while stopping"); return 0; } - LOG_INFO_FMT( - "Curl socket callback: listen on socket {}", static_cast(s)); if (socket_context == nullptr) { auto socket_context_ptr = std::make_unique(); From 8c8816c6ff032eca93ade22acc8d58a36d1487ae Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 19 Sep 2025 19:47:19 +0100 Subject: [PATCH 136/197] Allow nodes to restart before refreshing network state --- tests/e2e_operations.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 8270b4dbd2dc..d3a53429bdd2 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1499,6 +1499,10 @@ def run_self_healing_open(args): committed_ledger_dirs=committed_ledger_dirs, ) + # Wait until all relevant nodes have restarted + + time.sleep(3) + # Refresh the the declared state of nodes which have shut themselves down to join. 
for node in recovered_network.nodes: node.refresh_network_state(verify_ca=False) From 1c8d6cb8f9c8217d0703904b66f6fca8b0956292 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Fri, 19 Sep 2025 19:47:31 +0100 Subject: [PATCH 137/197] Log restart --- src/host/self_healing_open.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/host/self_healing_open.h b/src/host/self_healing_open.h index 4ae3d41db0d0..350cf511f311 100644 --- a/src/host/self_healing_open.h +++ b/src/host/self_healing_open.h @@ -23,6 +23,7 @@ namespace ccf void trigger_restart() { + LOG_INFO_FMT("Received request to restart enclave, sending stops"); RINGBUFFER_WRITE_MESSAGE(AdminMessage::stop, to_enclave); } }; From 753d51132782264358f91135aadc00046c733eb1 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 10:37:50 +0100 Subject: [PATCH 138/197] Test timeout path --- tests/e2e_operations.py | 66 ++++++++++++++++++++++++----------------- tests/infra/network.py | 4 +-- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index d3a53429bdd2..3da085af66f3 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1536,7 +1536,7 @@ def run_self_healing_open(args): recovered_network.stop_all_nodes() -def run_self_healing_open_single_replica(args): +def run_self_healing_open_timeout_path(args): args.nodes = infra.e2e_args.min_nodes(args, f=1) with infra.network.network( args.nodes, @@ -1545,9 +1545,11 @@ def run_self_healing_open_single_replica(args): ) as network: LOG.info("Start a network and stop it") network.start_and_open(args) - old_common = infra.network.get_common_folder_name(args.workspace, args.label) + network.save_service_identity(args) network.stop_all_nodes() + recovery_args = copy.deepcopy(args) + ledger_dirs = {} committed_ledger_dirs = {} for i, node in enumerate(network.nodes): @@ -1557,42 +1559,51 @@ def run_self_healing_open_single_replica(args): LOG.info("Start a recovery network and stop it") 
recovered_network = infra.network.Network( - args.nodes, - args.binary_dir, - args.debug_nodes, + recovery_args.nodes, + recovery_args.binary_dir, + recovery_args.debug_nodes, existing_network=network, + starting_nodes = 1, # Force timeout path by starting only one node ) - args.previous_service_identity_file = os.path.join( - old_common, "service_cert.pem" - ) - recovered_network.start_in_self_healing_open( - args, + recovery_args, ledger_dirs=ledger_dirs, committed_ledger_dirs=committed_ledger_dirs, - common_dir=network.common_dir, - start_all_nodes=False, ) - # Wait for the first node to be in RecoveryShares - for node in recovered_network.nodes[0:1]: - recovered_network.wait_for_statuses( - node, - ["WaitingForRecoveryShares", "Open"], - timeout=30, - ) + # Wait until all relevant nodes have restarted + + time.sleep(3) + + # Refresh the the declared state of nodes which have shut themselves down to join. + for node in recovered_network.nodes: + node.refresh_network_state(verify_ca=False) + + recovered_network.refresh_service_identity_file(recovery_args) + recovered_network.consortium.recover_with_shares( recovered_network.find_random_node() ) - # Wait for all replicas to report being part of the network - for node in recovered_network.nodes[0:1]: - recovered_network.wait_for_state( - node, - infra.node.State.PART_OF_NETWORK.value, - timeout=30, - ) - recovered_network._wait_for_app_open(node) + LOG.info("Submitted recovery shares") + + # Wait for all live replicas to report being part of the opened network + successfully_opened = 0 + for node in recovered_network.get_joined_nodes(): + try: + recovered_network.wait_for_status( + node, + "Open", + timeout=10, + ) + recovered_network._wait_for_app_open(node) + successfully_opened += 1 + except TimeoutError: + pass + + assert successfully_opened == 1 + + LOG.info("Completed self-healing open successfully") recovered_network.stop_all_nodes() @@ -1908,3 +1919,4 @@ def run(args): run_read_ledger_on_testdata(args) 
run_ledger_chunk_bytes_check(args) run_self_healing_open(args) + run_self_healing_open_timeout_path(args) diff --git a/tests/infra/network.py b/tests/infra/network.py index e5d4b264314f..9105978854e4 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -781,7 +781,7 @@ def start_in_self_healing_open( snapshot_dirs=None, common_dir=None, set_authenticate_session=None, - start_all_nodes=True, + starting_nodes=None, timeout=10, **kwargs, ): @@ -834,7 +834,7 @@ def start_in_self_healing_open( for i, node in enumerate(self.nodes): forwarded_args_with_overrides = forwarded_args.copy() forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) - if not start_all_nodes and i > 0: + if not starting_nodes and i > starting_nodes: break try: From 58ffb4d25d0426a133d2b901104c97b4bedb0052 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 11:01:42 +0100 Subject: [PATCH 139/197] Local sealing self-healing-open --- tests/e2e_operations.py | 76 +++++++++++++++++++++++++++++++++++++++-- tests/infra/network.py | 11 ++++-- 2 files changed, 82 insertions(+), 5 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 3da085af66f3..5b181001bc79 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1500,7 +1500,6 @@ def run_self_healing_open(args): ) # Wait until all relevant nodes have restarted - time.sleep(3) # Refresh the the declared state of nodes which have shut themselves down to join. @@ -1572,7 +1571,6 @@ def run_self_healing_open_timeout_path(args): ) # Wait until all relevant nodes have restarted - time.sleep(3) # Refresh the the declared state of nodes which have shut themselves down to join. 
@@ -1607,6 +1605,79 @@ def run_self_healing_open_timeout_path(args): recovered_network.stop_all_nodes() +def run_self_healing_open_local_unsealing(args): + args.nodes = infra.e2e_args.min_nodes(args, f=1) + with infra.network.network( + args.nodes, + args.binary_dir, + args.debug_nodes, + ) as network: + LOG.info("Start a network and stop it") + network.start_and_open(args) + network.save_service_identity(args) + node_secrets = [ + node.save_sealed_ledger_secret() + for node in network.nodes + ] + network.stop_all_nodes() + + recovery_args = copy.deepcopy(args) + + ledger_dirs = {} + committed_ledger_dirs = {} + for i, node in enumerate(network.nodes): + l_dir, c = node.get_ledger() + ledger_dirs[i] = l_dir + committed_ledger_dirs[i] = c + + LOG.info("Start a recovery network") + recovered_network = infra.network.Network( + recovery_args.nodes, + recovery_args.binary_dir, + recovery_args.debug_nodes, + existing_network=network, + ) + recovered_network.start_in_self_healing_open( + recovery_args, + ledger_dirs=ledger_dirs, + committed_ledger_dirs=committed_ledger_dirs, + sealed_ledger_secrets=node_secrets, + ) + + # Wait until all relevant nodes have restarted + time.sleep(3) + + # Refresh the the declared state of nodes which have shut themselves down to join. 
+ for node in recovered_network.nodes: + node.refresh_network_state(verify_ca=False) + + recovered_network.refresh_service_identity_file(recovery_args) + + recovered_network.consortium.recover_with_shares( + recovered_network.find_random_node() + ) + + LOG.info("Submitted recovery shares") + + # Wait for all live replicas to report being part of the opened network + successfully_opened = 0 + for node in recovered_network.get_joined_nodes(): + try: + recovered_network.wait_for_status( + node, + "Open", + timeout=10, + ) + recovered_network._wait_for_app_open(node) + successfully_opened += 1 + except TimeoutError: + pass + + assert successfully_opened == 1 + + LOG.info("Completed self-healing open successfully") + + recovered_network.stop_all_nodes() def run_read_ledger_on_testdata(args): for testdata_dir in os.scandir(args.historical_testdata): @@ -1920,3 +1991,4 @@ def run(args): run_ledger_chunk_bytes_check(args) run_self_healing_open(args) run_self_healing_open_timeout_path(args) + run_self_healing_open_local_unsealing(args) diff --git a/tests/infra/network.py b/tests/infra/network.py index 9105978854e4..4523a5f4ecc3 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -783,6 +783,7 @@ def start_in_self_healing_open( set_authenticate_session=None, starting_nodes=None, timeout=10, + sealed_ledger_secrets=None, **kwargs, ): self.common_dir = common_dir or get_common_folder_name( @@ -832,11 +833,15 @@ def start_in_self_healing_open( ] for i, node in enumerate(self.nodes): - forwarded_args_with_overrides = forwarded_args.copy() - forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) - if not starting_nodes and i > starting_nodes: + if starting_nodes is not None and i > starting_nodes: break + forwarded_args_with_overrides = forwarded_args.copy() + forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) + if sealed_ledger_secrets is not None and i in sealed_ledger_secrets: + 
forwarded_args_with_overrides["previous_sealed_ledger_secret_location"] = ( + sealed_ledger_secrets[i] + ) try: node_kwargs = { "lib_name": args.package, From bf08f9c82f38905c2f05436029a5c5688ef2e2e5 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 11:14:25 +0100 Subject: [PATCH 140/197] fmt --- tests/e2e_operations.py | 15 ++++----------- tests/infra/network.py | 14 +++++++------- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 5b181001bc79..fc59ed1d12f5 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1562,7 +1562,7 @@ def run_self_healing_open_timeout_path(args): recovery_args.binary_dir, recovery_args.debug_nodes, existing_network=network, - starting_nodes = 1, # Force timeout path by starting only one node + starting_nodes=1, # Force timeout path by starting only one node ) recovered_network.start_in_self_healing_open( recovery_args, @@ -1605,6 +1605,7 @@ def run_self_healing_open_timeout_path(args): recovered_network.stop_all_nodes() + def run_self_healing_open_local_unsealing(args): args.nodes = infra.e2e_args.min_nodes(args, f=1) with infra.network.network( @@ -1615,10 +1616,7 @@ def run_self_healing_open_local_unsealing(args): LOG.info("Start a network and stop it") network.start_and_open(args) network.save_service_identity(args) - node_secrets = [ - node.save_sealed_ledger_secret() - for node in network.nodes - ] + node_secrets = [node.save_sealed_ledger_secret() for node in network.nodes] network.stop_all_nodes() recovery_args = copy.deepcopy(args) @@ -1653,12 +1651,6 @@ def run_self_healing_open_local_unsealing(args): recovered_network.refresh_service_identity_file(recovery_args) - recovered_network.consortium.recover_with_shares( - recovered_network.find_random_node() - ) - - LOG.info("Submitted recovery shares") - # Wait for all live replicas to report being part of the opened network successfully_opened = 0 for node in 
recovered_network.get_joined_nodes(): @@ -1679,6 +1671,7 @@ def run_self_healing_open_local_unsealing(args): recovered_network.stop_all_nodes() + def run_read_ledger_on_testdata(args): for testdata_dir in os.scandir(args.historical_testdata): assert testdata_dir.is_dir() diff --git a/tests/infra/network.py b/tests/infra/network.py index 4523a5f4ecc3..f337f0cd18fc 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -811,7 +811,7 @@ def start_in_self_healing_open( for i, d in self.per_node_args_override.items() } - # Fix the port numbers to make all nodes _well known_ + # Fix the port numbers to make all nodes _well known_ for i, node in enumerate(self.nodes): port = 1000 + random.randint(0, 64534) node.host.get_primary_interface().port = port @@ -839,9 +839,9 @@ def start_in_self_healing_open( forwarded_args_with_overrides = forwarded_args.copy() forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) if sealed_ledger_secrets is not None and i in sealed_ledger_secrets: - forwarded_args_with_overrides["previous_sealed_ledger_secret_location"] = ( - sealed_ledger_secrets[i] - ) + forwarded_args_with_overrides[ + "previous_sealed_ledger_secret_location" + ] = sealed_ledger_secrets[i] try: node_kwargs = { "lib_name": args.package, @@ -868,9 +868,9 @@ def start_in_self_healing_open( self.observed_election_duration = self.election_duration + 1 def cycle(items): - while True: - for item in items: - yield item + while True: + for item in items: + yield item # Waiting for any node to transition-to-open end_time = time.time() + timeout From abeccda92707f6f604038090c0082a410247e313 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 11:35:17 +0100 Subject: [PATCH 141/197] fixup test --- tests/e2e_operations.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index fc59ed1d12f5..335cf70a1e0d 100644 --- a/tests/e2e_operations.py +++ 
b/tests/e2e_operations.py @@ -1465,7 +1465,8 @@ def run(self, src_dir, dst_dir): prev_network = recovery_network -def run_self_healing_open(args): +def run_self_healing_open(const_args): + args = copy.deepcopy(const_args) args.nodes = infra.e2e_args.min_nodes(args, f=1) with infra.network.network( args.nodes, @@ -1535,7 +1536,8 @@ def run_self_healing_open(args): recovered_network.stop_all_nodes() -def run_self_healing_open_timeout_path(args): +def run_self_healing_open_timeout_path(const_args): + args = copy.deepcopy(const_args) args.nodes = infra.e2e_args.min_nodes(args, f=1) with infra.network.network( args.nodes, @@ -1606,8 +1608,11 @@ def run_self_healing_open_timeout_path(args): recovered_network.stop_all_nodes() -def run_self_healing_open_local_unsealing(args): +def run_self_healing_open_local_unsealing(const_args): + args = copy.deepcopy(const_args) args.nodes = infra.e2e_args.min_nodes(args, f=1) + args.enable_local_sealing = True + with infra.network.network( args.nodes, args.binary_dir, @@ -1980,8 +1985,8 @@ def run(args): run_recovery_unsealing_corrupt(args) run_recovery_unsealing_validate_audit(args) test_error_message_on_failure_to_read_aci_sec_context(args) + run_self_healing_open_local_unsealing(args) run_read_ledger_on_testdata(args) run_ledger_chunk_bytes_check(args) run_self_healing_open(args) run_self_healing_open_timeout_path(args) - run_self_healing_open_local_unsealing(args) From 53a91391203c2af912fd8bf752f5c181ae57e327 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 11:42:56 +0100 Subject: [PATCH 142/197] Ensure sealed secrets are passed --- tests/infra/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/infra/network.py b/tests/infra/network.py index f337f0cd18fc..37340f949f9f 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -838,7 +838,7 @@ def start_in_self_healing_open( forwarded_args_with_overrides = forwarded_args.copy() 
forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) - if sealed_ledger_secrets is not None and i in sealed_ledger_secrets: + if sealed_ledger_secrets is not None and i < len(sealed_ledger_secrets): forwarded_args_with_overrides[ "previous_sealed_ledger_secret_location" ] = sealed_ledger_secrets[i] From 6b82bf34f523d1b3cd9ba2794441a2cbf193e856 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 11:43:41 +0100 Subject: [PATCH 143/197] fixup timeout path --- tests/e2e_operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 335cf70a1e0d..8854eda5fc1f 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1564,12 +1564,12 @@ def run_self_healing_open_timeout_path(const_args): recovery_args.binary_dir, recovery_args.debug_nodes, existing_network=network, - starting_nodes=1, # Force timeout path by starting only one node ) recovered_network.start_in_self_healing_open( recovery_args, ledger_dirs=ledger_dirs, committed_ledger_dirs=committed_ledger_dirs, + starting_nodes=1, # Force timeout path by starting only one node ) # Wait until all relevant nodes have restarted From db6fb56e9b54c255247c7b959896a51db72a13f9 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 11:57:32 +0100 Subject: [PATCH 144/197] Improve test infra --- src/node/rpc/node_frontend.h | 4 ---- tests/infra/network.py | 11 ++++++++++- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 840d5f7f7801..906c2d5942f0 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -2287,7 +2287,6 @@ namespace ccf } LOG_TRACE_FMT("Processing self-healing-open gossip RPC"); - LOG_TRACE_FMT("Self-healing-open gossip params: {}", params.dump()); auto chosen_replica = args.tx.rw(this->network.self_healing_open_chosen_replica); @@ -2364,7 +2363,6 @@ namespace ccf return 
make_error(code, ccf::errors::InvalidQuote, message); } LOG_TRACE_FMT("Processing self-healing-open vote RPC"); - LOG_TRACE_FMT("Self-healing-open vote params: {}", params.dump()); auto votes = args.tx.rw(this->network.self_healing_open_votes); votes->insert(in.info.intrinsic_id); @@ -2428,9 +2426,7 @@ namespace ccf auto* sm_state = args.tx.rw(this->network.self_healing_open_sm_state); sm_state->put(SelfHealingOpenSM::JOINING); - LOG_INFO_FMT("******************************"); LOG_INFO_FMT("Self-healing-open is JOINING {}", in.info.intrinsic_id); - LOG_INFO_FMT("******************************"); auto* chosen_replica = args.tx.rw(this->network.self_healing_open_chosen_replica); diff --git a/tests/infra/network.py b/tests/infra/network.py index 37340f949f9f..6cad0c271b7a 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -874,8 +874,10 @@ def cycle(items): # Waiting for any node to transition-to-open end_time = time.time() + timeout - for i, node in cycle(enumerate(self.nodes)): + for node in cycle(self.nodes): + LOG.info(f"Seeing if node {node.local_node_id} has opened") if time.time() > end_time: + Log.error("Timed out waiting for any node to open") raise TimeoutError("Timed out waiting for any node to open") try: self.wait_for_statuses( @@ -890,6 +892,13 @@ def cycle(items): f"Failed to get the status of {node.local_node_id}, retrying..." ) continue + except RuntimeError as e: + if "node is stopped" in str(e).lower(): + LOG.info( + f"Failed to get the status of {node.local_node_id} with error {e}, retrying..." 
+ ) + continue + raise e LOG.info("One node opened") From 4c527e28c607d716e57f86521ebc23175f7afc74 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 12:07:06 +0100 Subject: [PATCH 145/197] fixup --- tests/infra/network.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tests/infra/network.py b/tests/infra/network.py index 6cad0c271b7a..3519693030cb 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -26,6 +26,7 @@ from datetime import datetime, timedelta, timezone from infra.consortium import slurp_file from collections import deque +from clients import CCFIOException from loguru import logger as LOG @@ -877,7 +878,7 @@ def cycle(items): for node in cycle(self.nodes): LOG.info(f"Seeing if node {node.local_node_id} has opened") if time.time() > end_time: - Log.error("Timed out waiting for any node to open") + LOG.error("Timed out waiting for any node to open") raise TimeoutError("Timed out waiting for any node to open") try: self.wait_for_statuses( @@ -887,17 +888,14 @@ def cycle(items): verify_ca=False, ) break - except TimeoutError: - LOG.info( - f"Failed to get the status of {node.local_node_id}, retrying..." - ) - continue - except RuntimeError as e: - if "node is stopped" in str(e).lower(): - LOG.info( - f"Failed to get the status of {node.local_node_id} with error {e}, retrying..." - ) - continue + except Exception as e: + if isinstance(e, (CCFIOException, TimeoutError)) or ( + isinstance(e, RuntimeError) and "node is stopped" in str(e).lower() + ): + LOG.info( + f"Failed to get the status of {node.local_node_id}, retrying..." 
+ ) + continue raise e LOG.info("One node opened") From d05b7459d3dc72c80bcf5a47fb00f144c17c88a0 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 12:55:14 +0100 Subject: [PATCH 146/197] imports --- tests/infra/network.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/infra/network.py b/tests/infra/network.py index 3519693030cb..e77c812ee523 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -5,7 +5,7 @@ from contextlib import contextmanager from enum import Enum, IntEnum, auto -from infra.clients import flush_info, CCFConnectionException +from infra.clients import flush_info, CCFConnectionException, CCFIOException import infra.member import infra.path import infra.proc @@ -26,7 +26,6 @@ from datetime import datetime, timedelta, timezone from infra.consortium import slurp_file from collections import deque -from clients import CCFIOException from loguru import logger as LOG From a4b6d83e9d8be24921cd2678af8d10645b94a71a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 13:58:38 +0100 Subject: [PATCH 147/197] Make NodeState a shared_ptr --- src/enclave/enclave.h | 4 +- src/node/node_state.h | 2 +- src/node/self_healing_open_impl.cpp | 79 ++++++++++++++--------------- src/node/self_healing_open_impl.h | 6 +-- 4 files changed, 45 insertions(+), 46 deletions(-) diff --git a/src/enclave/enclave.h b/src/enclave/enclave.h index 4c699a5295e8..7eaa94a263e3 100644 --- a/src/enclave/enclave.h +++ b/src/enclave/enclave.h @@ -48,7 +48,7 @@ namespace ccf ccf::NetworkState network; std::shared_ptr rpc_map; std::shared_ptr rpcsessions; - std::unique_ptr node; + std::shared_ptr node; ringbuffer::WriterPtr to_host = nullptr; std::chrono::high_resolution_clock::time_point last_tick_time; @@ -102,7 +102,7 @@ namespace ccf std::make_shared(chunk_threshold)); LOG_TRACE_FMT("Creating node"); - node = std::make_unique( + node = std::make_shared( *writer_factory, network, rpcsessions, curve_id); 
LOG_TRACE_FMT("Creating context"); diff --git a/src/node/node_state.h b/src/node/node_state.h index b8f1a8690c85..4169802305eb 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -85,7 +85,7 @@ namespace ccf data.shrink_to_fit(); } - class NodeState : public AbstractNodeState + class NodeState : public AbstractNodeState, public std::enable_shared_from_this { friend class SelfHealingOpenService; diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 7c327357a560..d387ee09a1c3 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -7,6 +7,7 @@ namespace ccf { void SelfHealingOpenService::try_start(ccf::kv::Tx& tx, bool recovering) { + auto node_state = this->weak_node_state.lock(); if ( !recovering || !node_state->config.recover.self_healing_open.has_value()) { @@ -19,23 +20,19 @@ namespace ccf LOG_INFO_FMT("Starting self-healing-open"); // Reset the self-healing-open state - auto* state_handle = - tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE); - state_handle->clear(); - auto* timeout_state_handle = tx.rw( - Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE); - auto* node_info_handle = - tx.rw(Tables::SELF_HEALING_OPEN_NODES); - node_info_handle->clear(); - auto* gossip_state_handle = - tx.rw(Tables::SELF_HEALING_OPEN_GOSSIPS); - gossip_state_handle->clear(); - auto* chosen_replica = tx.rw( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); - chosen_replica->clear(); - auto* votes = - tx.rw(Tables::SELF_HEALING_OPEN_VOTES); - votes->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE) + ->clear(); + tx.rw( + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE) + ->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_NODES) + ->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_GOSSIPS) + ->clear(); + tx.rw( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + ->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_VOTES)->clear(); start_message_retry_timers(); start_failover_timers(tx); @@ -43,13 +40,14 @@ namespace ccf void 
SelfHealingOpenService::advance(ccf::kv::Tx& tx, bool timeout) { + auto node_state = this->weak_node_state.lock(); auto* sm_state_handle = tx.rw(node_state->network.self_healing_open_sm_state); auto* timeout_state_handle = tx.rw(node_state->network.self_healing_open_timeout_sm_state); if ( - !sm_state_handle->get().has_value() || - !timeout_state_handle->get().has_value()) + (!sm_state_handle->get().has_value()) || + (!timeout_state_handle->get().has_value())) { throw std::logic_error( "Self-healing-open state not set, cannot advance self-healing-open"); @@ -199,12 +197,12 @@ namespace ccf { auto retry_timer_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { - std::lock_guard guard(msg->data.self.node_state->lock); + auto node_state = msg->data.self.weak_node_state.lock(); + std::lock_guard guard(node_state->lock); - auto tx = - msg->data.self.node_state->network.tables->create_read_only_tx(); + auto tx = node_state->network.tables->create_read_only_tx(); auto* sm_state_handle = - tx.ro(msg->data.self.node_state->network.self_healing_open_sm_state); + tx.ro(node_state->network.self_healing_open_sm_state); if (!sm_state_handle->get().has_value()) { throw std::logic_error( @@ -227,11 +225,10 @@ namespace ccf break; case SelfHealingOpenSM::VOTING: { - auto* node_info_handle = tx.ro( - msg->data.self.node_state->network.self_healing_open_node_info); + auto* node_info_handle = + tx.ro(node_state->network.self_healing_open_node_info); auto* chosen_replica_handle = - tx.ro(msg->data.self.node_state->network - .self_healing_open_chosen_replica); + tx.ro(node_state->network.self_healing_open_chosen_replica); if (!chosen_replica_handle->get().has_value()) { throw std::logic_error( @@ -261,8 +258,8 @@ namespace ccf static_cast(sm_state))); } - auto delay = msg->data.self.node_state->config.recover - .self_healing_open->retry_timeout; + auto delay = + node_state->config.recover.self_healing_open->retry_timeout; 
::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, @@ -274,6 +271,7 @@ namespace ccf void SelfHealingOpenService::start_failover_timers(ccf::kv::Tx& tx) { + auto node_state = this->weak_node_state.lock(); auto* state_handle = tx.rw(node_state->network.self_healing_open_sm_state); state_handle->put(SelfHealingOpenSM::GOSSIPPING); auto* timeout_state_handle = @@ -283,15 +281,15 @@ namespace ccf // Dispatch timeouts auto timeout_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { - std::lock_guard guard(msg->data.self.node_state->lock); + auto node_state = msg->data.self.weak_node_state.lock(); + std::lock_guard guard(node_state->lock); LOG_TRACE_FMT( "Self-healing-open timeout, sending timeout to internal handlers"); // Stop the timer if the node has completed its self-healing-open - auto tx = - msg->data.self.node_state->network.tables->create_read_only_tx(); + auto tx = node_state->network.tables->create_read_only_tx(); auto* sm_state_handle = - tx.ro(msg->data.self.node_state->network.self_healing_open_sm_state); + tx.ro(node_state->network.self_healing_open_sm_state); if (!sm_state_handle->get().has_value()) { throw std::logic_error( @@ -308,7 +306,7 @@ namespace ccf // Send a timeout to the internal handlers curl::UniqueCURL curl_handle; - auto cert = msg->data.self.node_state->self_signed_node_cert; + auto cert = node_state->self_signed_node_cert; curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); @@ -317,16 +315,14 @@ namespace ccf CURLOPT_SSLCERT_BLOB, cert.data(), cert.size()); curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); - auto privkey_pem = - msg->data.self.node_state->node_sign_kp->private_key_pem(); + auto privkey_pem = node_state->node_sign_kp->private_key_pem(); curl_handle.set_blob_opt( CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); 
curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); auto url = fmt::format( "https://{}/{}/self_healing_open/timeout", - msg->data.self.node_state->config.network.rpc_interfaces - .at("primary_rpc_interface") + node_state->config.network.rpc_interfaces.at("primary_rpc_interface") .published_address, get_actor_prefix(ActorsType::nodes)); @@ -344,8 +340,7 @@ namespace ccf curl::CurlmLibuvContextSingleton::get_instance()->attach_request( std::move(curl_request)); - auto delay = - msg->data.self.node_state->config.recover.self_healing_open->timeout; + auto delay = node_state->config.recover.self_healing_open->timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, @@ -424,6 +419,7 @@ namespace ccf self_healing_open::RequestNodeInfo SelfHealingOpenService::make_node_info() { + auto node_state = this->weak_node_state.lock(); return { .quote_info = node_state->quote_info, .published_network_address = @@ -438,6 +434,7 @@ namespace ccf void SelfHealingOpenService::send_gossip_unsafe() { + auto node_state = this->weak_node_state.lock(); // Caller must ensure that the current node's quote_info is populated: // ie not yet reached partOfNetwork if (!node_state->config.recover.self_healing_open.has_value()) @@ -470,6 +467,7 @@ namespace ccf void SelfHealingOpenService::send_vote_unsafe( const SelfHealingOpenNodeInfo_t& node_info) { + auto node_state = this->weak_node_state.lock(); // Caller must ensure that the current node's quote_info is populated: // ie not yet reached partOfNetwork LOG_TRACE_FMT( @@ -491,6 +489,7 @@ namespace ccf void SelfHealingOpenService::send_iamopen_unsafe() { + auto node_state = this->weak_node_state.lock(); // Caller must ensure that the current node's quote_info is populated: // ie not yet reached partOfNetwork if (!node_state->config.recover.self_healing_open.has_value()) diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 9e77d680ac44..62456c63272b 100644 --- 
a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -12,11 +12,11 @@ namespace ccf class SelfHealingOpenService { private: - // SelfHealingOpenService is owned by NodeState - NodeState* node_state; + // SelfHealingOpenService is solely owned by NodeState + std::weak_ptr weak_node_state; public: - SelfHealingOpenService(NodeState* node_state) : node_state(node_state) {} + SelfHealingOpenService(std::shared_ptr& node_state) : weak_node_state(node_state) {} void try_start(ccf::kv::Tx& tx, bool recovering); void advance(ccf::kv::Tx& tx, bool timeout); From 300b13b842cbfeb70fbc446016ef2a8332ec7e1e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 14:01:21 +0100 Subject: [PATCH 148/197] Make clang-tidy happy --- src/node/self_healing_open_impl.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index d387ee09a1c3..c5be0d91608f 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -105,6 +105,10 @@ namespace ccf return true; }); + if (!maximum.has_value()) + { + throw std::logic_error("No valid gossip addresses provided"); + } auto* chosen_replica = tx.rw(node_state->network.self_healing_open_chosen_replica); chosen_replica->put(maximum->second); From cae83b78ec4210f5121d0095e2d425678ef35b03 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 14:04:27 +0100 Subject: [PATCH 149/197] Pass shared_ptr --- src/node/node_state.h | 2 +- src/node/self_healing_open_impl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index 4169802305eb..df8cdc9d3778 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -254,7 +254,7 @@ namespace ccf network(network), rpcsessions(rpcsessions), share_manager(network.ledger_secrets), - self_healing_open_impl(std::make_unique(this)) + self_healing_open_impl(std::make_unique(shared_from_this())) {} 
QuoteVerificationResult verify_quote( diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 62456c63272b..c2e2955c1b77 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -16,7 +16,7 @@ namespace ccf std::weak_ptr weak_node_state; public: - SelfHealingOpenService(std::shared_ptr& node_state) : weak_node_state(node_state) {} + SelfHealingOpenService(std::shared_ptr node_state) : weak_node_state(node_state) {} void try_start(ccf::kv::Tx& tx, bool recovering); void advance(ccf::kv::Tx& tx, bool timeout); From f0b0fd5b6395318a3134e58776596c58e173f43b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 15:21:28 +0100 Subject: [PATCH 150/197] tidying --- src/node/node_state.h | 10 ++- src/node/rpc/node_frontend.h | 5 +- src/node/self_healing_open_impl.cpp | 119 +++++++++++++++------------- src/node/self_healing_open_impl.h | 8 +- 4 files changed, 79 insertions(+), 63 deletions(-) diff --git a/src/node/node_state.h b/src/node/node_state.h index df8cdc9d3778..1d8a5f9c2d64 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -42,6 +42,7 @@ #include "node/ledger_secrets.h" #include "node/local_sealing.h" #include "node/node_to_node_channel_manager.h" +#include "node/self_healing_open_impl.h" #include "node/snapshotter.h" #include "node_to_node.h" #include "pal/quote_generation.h" @@ -85,7 +86,8 @@ namespace ccf data.shrink_to_fit(); } - class NodeState : public AbstractNodeState, public std::enable_shared_from_this + class NodeState : public AbstractNodeState, + public std::enable_shared_from_this { friend class SelfHealingOpenService; @@ -236,7 +238,7 @@ namespace ccf last_recovered_signed_idx = last_recovered_idx; } - std::unique_ptr self_healing_open_impl; + SelfHealingOpenService self_healing_open_impl; public: NodeState( @@ -254,7 +256,7 @@ namespace ccf network(network), rpcsessions(rpcsessions), share_manager(network.ledger_secrets), - 
self_healing_open_impl(std::make_unique(shared_from_this())) + self_healing_open_impl(shared_from_this()) {} QuoteVerificationResult verify_quote( @@ -3008,7 +3010,7 @@ namespace ccf SelfHealingOpenService& self_healing_open() override { - return *self_healing_open_impl; + return self_healing_open_impl; } }; } diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 906c2d5942f0..a42dd240f1f0 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -1720,7 +1720,10 @@ namespace ccf ctx.rpc_ctx->set_claims_digest(std::move(digest_value)); } - this->node_operation.self_healing_open().try_start(ctx.tx, recovering); + if (recovering) + { + this->node_operation.self_healing_open().try_start(ctx.tx); + } LOG_INFO_FMT("Created service"); return make_success(true); diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index c5be0d91608f..cc11f2ffc055 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -1,20 +1,26 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the Apache 2.0 License. 
+#include "node/self_healing_open_impl.h" + #include "node_state.h" +#include + namespace ccf { - void SelfHealingOpenService::try_start(ccf::kv::Tx& tx, bool recovering) + + SelfHealingOpenService::SelfHealingOpenService( + std::shared_ptr node_state) : + weak_node_state(node_state), + config(node_state->config.recover.self_healing_open) + {} + + void SelfHealingOpenService::try_start(ccf::kv::Tx& tx) { - auto node_state = this->weak_node_state.lock(); - if ( - !recovering || !node_state->config.recover.self_healing_open.has_value()) + if (!config.has_value()) { - LOG_TRACE_FMT( - "Not recovering, or no self-healing-open addresses configured, " - "not starting self-healing-open timers"); - return; + LOG_INFO_FMT("Self-healing-open not configured, skipping"); } LOG_INFO_FMT("Starting self-healing-open"); @@ -35,31 +41,38 @@ namespace ccf tx.rw(Tables::SELF_HEALING_OPEN_VOTES)->clear(); start_message_retry_timers(); - start_failover_timers(tx); + start_failover_timers(); } void SelfHealingOpenService::advance(ccf::kv::Tx& tx, bool timeout) { + if (!config.has_value()) + { + throw std::logic_error("Self-healing-open not configured"); + } + auto node_state = this->weak_node_state.lock(); auto* sm_state_handle = tx.rw(node_state->network.self_healing_open_sm_state); auto* timeout_state_handle = tx.rw(node_state->network.self_healing_open_timeout_sm_state); - if ( - (!sm_state_handle->get().has_value()) || - (!timeout_state_handle->get().has_value())) + + auto sm_state_opt = sm_state_handle->get(); + auto timeout_state_opt = timeout_state_handle->get(); + if ((!sm_state_opt.has_value()) || (!timeout_state_opt.has_value())) { throw std::logic_error( "Self-healing-open state not set, cannot advance self-healing-open"); } + auto& sm_state = sm_state_opt.value(); + auto& timeout_state = timeout_state_opt.value(); - bool valid_timeout = timeout && - timeout_state_handle->get().value() == sm_state_handle->get().value(); + bool valid_timeout = timeout && sm_state == 
timeout_state; // Advance timeout SM if (timeout) { - switch (timeout_state_handle->get().value()) + switch (timeout_state) { case SelfHealingOpenSM::GOSSIPPING: LOG_TRACE_FMT("Advancing timeout SM to VOTING"); @@ -78,14 +91,13 @@ namespace ccf } // Advance self-healing-open SM - switch (sm_state_handle->get().value()) + switch (sm_state) { case SelfHealingOpenSM::GOSSIPPING: { auto* gossip_handle = tx.ro(node_state->network.self_healing_open_gossip); - auto quorum_size = - node_state->config.recover.self_healing_open->addresses.size(); + auto quorum_size = config->addresses.size(); if (gossip_handle->size() >= quorum_size || valid_timeout) { if (gossip_handle->size() == 0) @@ -119,11 +131,7 @@ namespace ccf case SelfHealingOpenSM::VOTING: { auto* votes = tx.rw(node_state->network.self_healing_open_votes); - if ( - votes->size() >= - node_state->config.recover.self_healing_open->addresses.size() / 2 + - 1 || - valid_timeout) + if (votes->size() >= config->addresses.size() / 2 + 1 || valid_timeout) { if (votes->size() == 0) { @@ -192,8 +200,7 @@ namespace ccf } default: throw std::logic_error(fmt::format( - "Unknown self-healing-open state: {}", - static_cast(sm_state_handle->get().value()))); + "Unknown self-healing-open state: {}", static_cast(sm_state))); } } @@ -201,19 +208,25 @@ namespace ccf { auto retry_timer_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { + if (!msg->data.self.config.has_value()) + { + throw std::logic_error("Self-healing-open not configured"); + } auto node_state = msg->data.self.weak_node_state.lock(); std::lock_guard guard(node_state->lock); auto tx = node_state->network.tables->create_read_only_tx(); auto* sm_state_handle = tx.ro(node_state->network.self_healing_open_sm_state); - if (!sm_state_handle->get().has_value()) + + auto sm_state_opt = sm_state_handle->get(); + if (sm_state_opt.has_value()) { throw std::logic_error( "Self-healing-open state not set, cannot retry " "self-healing-open"); } - 
auto sm_state = sm_state_handle->get().value(); + auto& sm_state = sm_state_opt.value(); // Keep doing this until the node is no longer in recovery if (sm_state == SelfHealingOpenSM::OPEN) @@ -262,8 +275,7 @@ namespace ccf static_cast(sm_state))); } - auto delay = - node_state->config.recover.self_healing_open->retry_timeout; + auto delay = msg->data.self.config->retry_timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, @@ -273,18 +285,21 @@ namespace ccf threading::get_current_thread_id(), std::move(retry_timer_msg)); } - void SelfHealingOpenService::start_failover_timers(ccf::kv::Tx& tx) + void SelfHealingOpenService::start_failover_timers() { + if (!config.has_value()) + { + throw std::logic_error("Self-healing-open not configured"); + } auto node_state = this->weak_node_state.lock(); - auto* state_handle = tx.rw(node_state->network.self_healing_open_sm_state); - state_handle->put(SelfHealingOpenSM::GOSSIPPING); - auto* timeout_state_handle = - tx.rw(node_state->network.self_healing_open_timeout_sm_state); - timeout_state_handle->put(SelfHealingOpenSM::GOSSIPPING); // Dispatch timeouts auto timeout_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { + if (!msg->data.self.config.has_value()) + { + throw std::logic_error("Self-healing-open not configured"); + } auto node_state = msg->data.self.weak_node_state.lock(); std::lock_guard guard(node_state->lock); LOG_TRACE_FMT( @@ -344,14 +359,13 @@ namespace ccf curl::CurlmLibuvContextSingleton::get_instance()->attach_request( std::move(curl_request)); - auto delay = node_state->config.recover.self_healing_open->timeout; + auto delay = msg->data.self.config->timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, *this); ::threading::ThreadMessaging::instance().add_task_after( - std::move(timeout_msg), - node_state->config.recover.self_healing_open->timeout); + std::move(timeout_msg), config->timeout); } 
void dispatch_authenticated_message( @@ -438,15 +452,11 @@ namespace ccf void SelfHealingOpenService::send_gossip_unsafe() { - auto node_state = this->weak_node_state.lock(); - // Caller must ensure that the current node's quote_info is populated: - // ie not yet reached partOfNetwork - if (!node_state->config.recover.self_healing_open.has_value()) + if (!config.has_value()) { - LOG_TRACE_FMT( - "Self-healing-open addresses not set, cannot start gossip retries"); - return; + throw std::logic_error("Self-healing-open not configured"); } + auto node_state = this->weak_node_state.lock(); LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); @@ -471,9 +481,12 @@ namespace ccf void SelfHealingOpenService::send_vote_unsafe( const SelfHealingOpenNodeInfo_t& node_info) { + if (!config.has_value()) + { + throw std::logic_error("Self-healing-open not configured"); + } auto node_state = this->weak_node_state.lock(); - // Caller must ensure that the current node's quote_info is populated: - // ie not yet reached partOfNetwork + LOG_TRACE_FMT( "Sending self-healing-open vote to {} at {}", node_info.intrinsic_id, @@ -493,22 +506,18 @@ namespace ccf void SelfHealingOpenService::send_iamopen_unsafe() { - auto node_state = this->weak_node_state.lock(); - // Caller must ensure that the current node's quote_info is populated: - // ie not yet reached partOfNetwork - if (!node_state->config.recover.self_healing_open.has_value()) + if (!config.has_value()) { - LOG_TRACE_FMT("Self-healing-open addresses not set, cannot send iamopen"); - return; + throw std::logic_error("Self-healing-open not configured"); } + auto node_state = this->weak_node_state.lock(); LOG_TRACE_FMT("Sending self-healing-open iamopen"); self_healing_open::IAmOpenRequest request{.info = make_node_info()}; nlohmann::json request_json = request; - for (auto& target_address : - node_state->config.recover.self_healing_open->addresses) + for (auto& target_address : config->addresses) { if ( target_address == diff --git 
a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index c2e2955c1b77..7517d64ba922 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -2,6 +2,7 @@ // Licensed under the Apache 2.0 License. #pragma once +#include "ccf/node/startup_config.h" #include "ccf/service/tables/self_heal_open.h" #include "ccf/tx.h" #include "self_healing_open_types.h" @@ -14,10 +15,11 @@ namespace ccf private: // SelfHealingOpenService is solely owned by NodeState std::weak_ptr weak_node_state; + std::optional config; public: - SelfHealingOpenService(std::shared_ptr node_state) : weak_node_state(node_state) {} - void try_start(ccf::kv::Tx& tx, bool recovering); + SelfHealingOpenService(std::shared_ptr node_state); + void try_start(ccf::kv::Tx& tx); void advance(ccf::kv::Tx& tx, bool timeout); private: @@ -29,7 +31,7 @@ namespace ccf // Start path void start_message_retry_timers(); - void start_failover_timers(ccf::kv::Tx& tx); + void start_failover_timers(); // Steady state operations self_healing_open::RequestNodeInfo make_node_info(); From 2cf40b23ebfaa1a8c627ce17bbe6cb0372f9322e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 15:32:52 +0100 Subject: [PATCH 151/197] tidying 2 --- src/node/self_healing_open_impl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index cc11f2ffc055..b810662e5d09 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -466,8 +466,7 @@ namespace ccf }; nlohmann::json request_json = request; - for (auto& target_address : - node_state->config.recover.self_healing_open->addresses) + for (auto& target_address :config->addresses) { dispatch_authenticated_message( request_json, From 2e4ca7c77c43061d5012d90d07034e3fe856c0f0 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 15:53:49 +0100 Subject: [PATCH 152/197] Revert shared_ptr node_state --- 
src/enclave/enclave.h | 4 ++-- src/node/node_state.h | 5 ++-- src/node/self_healing_open_impl.cpp | 36 +++++++++++++++-------------- src/node/self_healing_open_impl.h | 8 +++---- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/enclave/enclave.h b/src/enclave/enclave.h index 7eaa94a263e3..4c699a5295e8 100644 --- a/src/enclave/enclave.h +++ b/src/enclave/enclave.h @@ -48,7 +48,7 @@ namespace ccf ccf::NetworkState network; std::shared_ptr rpc_map; std::shared_ptr rpcsessions; - std::shared_ptr node; + std::unique_ptr node; ringbuffer::WriterPtr to_host = nullptr; std::chrono::high_resolution_clock::time_point last_tick_time; @@ -102,7 +102,7 @@ namespace ccf std::make_shared(chunk_threshold)); LOG_TRACE_FMT("Creating node"); - node = std::make_shared( + node = std::make_unique( *writer_factory, network, rpcsessions, curve_id); LOG_TRACE_FMT("Creating context"); diff --git a/src/node/node_state.h b/src/node/node_state.h index 1d8a5f9c2d64..44fbe0c0a50b 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -86,8 +86,7 @@ namespace ccf data.shrink_to_fit(); } - class NodeState : public AbstractNodeState, - public std::enable_shared_from_this + class NodeState : public AbstractNodeState { friend class SelfHealingOpenService; @@ -256,7 +255,7 @@ namespace ccf network(network), rpcsessions(rpcsessions), share_manager(network.ledger_secrets), - self_healing_open_impl(shared_from_this()) + self_healing_open_impl(this) {} QuoteVerificationResult verify_quote( diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index b810662e5d09..d771398a207d 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -10,14 +10,13 @@ namespace ccf { - SelfHealingOpenService::SelfHealingOpenService( - std::shared_ptr node_state) : - weak_node_state(node_state), - config(node_state->config.recover.self_healing_open) + SelfHealingOpenService::SelfHealingOpenService(NodeState* node_state_) : + 
node_state(node_state_) {} void SelfHealingOpenService::try_start(ccf::kv::Tx& tx) { + auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) { LOG_INFO_FMT("Self-healing-open not configured, skipping"); @@ -46,12 +45,12 @@ namespace ccf void SelfHealingOpenService::advance(ccf::kv::Tx& tx, bool timeout) { + auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) { throw std::logic_error("Self-healing-open not configured"); } - auto node_state = this->weak_node_state.lock(); auto* sm_state_handle = tx.rw(node_state->network.self_healing_open_sm_state); auto* timeout_state_handle = @@ -208,11 +207,13 @@ namespace ccf { auto retry_timer_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { - if (!msg->data.self.config.has_value()) + auto& config = + msg->data.self.node_state->config.recover.self_healing_open; + if (config) { throw std::logic_error("Self-healing-open not configured"); } - auto node_state = msg->data.self.weak_node_state.lock(); + auto& node_state = msg->data.self.node_state; std::lock_guard guard(node_state->lock); auto tx = node_state->network.tables->create_read_only_tx(); @@ -275,7 +276,7 @@ namespace ccf static_cast(sm_state))); } - auto delay = msg->data.self.config->retry_timeout; + auto delay = config->retry_timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, @@ -287,20 +288,22 @@ namespace ccf void SelfHealingOpenService::start_failover_timers() { + auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) { throw std::logic_error("Self-healing-open not configured"); } - auto node_state = this->weak_node_state.lock(); // Dispatch timeouts auto timeout_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { - if (!msg->data.self.config.has_value()) + auto& config = + msg->data.self.node_state->config.recover.self_healing_open; + if 
(!config.has_value()) { throw std::logic_error("Self-healing-open not configured"); } - auto node_state = msg->data.self.weak_node_state.lock(); + auto* node_state = msg->data.self.node_state; std::lock_guard guard(node_state->lock); LOG_TRACE_FMT( "Self-healing-open timeout, sending timeout to internal handlers"); @@ -359,7 +362,7 @@ namespace ccf curl::CurlmLibuvContextSingleton::get_instance()->attach_request( std::move(curl_request)); - auto delay = msg->data.self.config->timeout; + auto delay = config->retry_timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, @@ -437,7 +440,6 @@ namespace ccf self_healing_open::RequestNodeInfo SelfHealingOpenService::make_node_info() { - auto node_state = this->weak_node_state.lock(); return { .quote_info = node_state->quote_info, .published_network_address = @@ -452,11 +454,11 @@ namespace ccf void SelfHealingOpenService::send_gossip_unsafe() { + auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) { throw std::logic_error("Self-healing-open not configured"); } - auto node_state = this->weak_node_state.lock(); LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); @@ -466,7 +468,7 @@ namespace ccf }; nlohmann::json request_json = request; - for (auto& target_address :config->addresses) + for (auto& target_address : config->addresses) { dispatch_authenticated_message( request_json, @@ -480,11 +482,11 @@ namespace ccf void SelfHealingOpenService::send_vote_unsafe( const SelfHealingOpenNodeInfo_t& node_info) { + auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) { throw std::logic_error("Self-healing-open not configured"); } - auto node_state = this->weak_node_state.lock(); LOG_TRACE_FMT( "Sending self-healing-open vote to {} at {}", @@ -505,11 +507,11 @@ namespace ccf void SelfHealingOpenService::send_iamopen_unsafe() { + auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) { throw 
std::logic_error("Self-healing-open not configured"); } - auto node_state = this->weak_node_state.lock(); LOG_TRACE_FMT("Sending self-healing-open iamopen"); diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 7517d64ba922..42cb236b5341 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -13,12 +13,12 @@ namespace ccf class SelfHealingOpenService { private: - // SelfHealingOpenService is solely owned by NodeState - std::weak_ptr weak_node_state; - std::optional config; + // SelfHealingOpenService is solely owned by NodeState, and all tasks should + // finish before NodeState is destroyed + NodeState* node_state; public: - SelfHealingOpenService(std::shared_ptr node_state); + SelfHealingOpenService(NodeState* node_state); void try_start(ccf::kv::Tx& tx); void advance(ccf::kv::Tx& tx, bool timeout); From cbc15136683f89e7db78c93b53f9510e4c997e14 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 15:58:20 +0100 Subject: [PATCH 153/197] Stop skipping timers --- src/node/self_healing_open_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index d771398a207d..037dfac02f9d 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -209,7 +209,7 @@ namespace ccf [](std::unique_ptr<::threading::Tmsg> msg) { auto& config = msg->data.self.node_state->config.recover.self_healing_open; - if (config) + if (!config.has_value()) { throw std::logic_error("Self-healing-open not configured"); } From 2430e1777290b9a60ded4b0304f4c0afde9638c6 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 16:16:39 +0100 Subject: [PATCH 154/197] Ensure we initialise self-healing-open state --- src/node/rpc/node_frontend.h | 5 +---- src/node/self_healing_open_impl.cpp | 30 ++++++++++++++++++----------- src/node/self_healing_open_impl.h | 2 +- 3 files changed, 21 
insertions(+), 16 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index a42dd240f1f0..906c2d5942f0 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -1720,10 +1720,7 @@ namespace ccf ctx.rpc_ctx->set_claims_digest(std::move(digest_value)); } - if (recovering) - { - this->node_operation.self_healing_open().try_start(ctx.tx); - } + this->node_operation.self_healing_open().try_start(ctx.tx, recovering); LOG_INFO_FMT("Created service"); return make_success(true); diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 037dfac02f9d..959ec73b9a01 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -14,17 +14,9 @@ namespace ccf node_state(node_state_) {} - void SelfHealingOpenService::try_start(ccf::kv::Tx& tx) + void SelfHealingOpenService::try_start(ccf::kv::Tx& tx, bool recovering) { - auto& config = node_state->config.recover.self_healing_open; - if (!config.has_value()) - { - LOG_INFO_FMT("Self-healing-open not configured, skipping"); - } - - LOG_INFO_FMT("Starting self-healing-open"); - - // Reset the self-healing-open state + // Clear any previous state tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE) ->clear(); tx.rw( @@ -39,6 +31,20 @@ namespace ccf ->clear(); tx.rw(Tables::SELF_HEALING_OPEN_VOTES)->clear(); + auto& config = node_state->config.recover.self_healing_open; + if (!recovering || !config.has_value()) + { + LOG_INFO_FMT("Not recovering or self-healing-open not configured, skipping self-healing-open"); + } + + LOG_INFO_FMT("Starting self-healing-open"); + + tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE) + ->put(SelfHealingOpenSM::GOSSIPPING); + tx.rw( + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE) + ->put(SelfHealingOpenSM::GOSSIPPING); + start_message_retry_timers(); start_failover_timers(); } @@ -205,6 +211,7 @@ namespace ccf void SelfHealingOpenService::start_message_retry_timers() { + LOG_TRACE_FMT("Self-healing-open: 
Setting up retry timers"); auto retry_timer_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { auto& config = @@ -221,7 +228,7 @@ namespace ccf tx.ro(node_state->network.self_healing_open_sm_state); auto sm_state_opt = sm_state_handle->get(); - if (sm_state_opt.has_value()) + if (!sm_state_opt.has_value()) { throw std::logic_error( "Self-healing-open state not set, cannot retry " @@ -294,6 +301,7 @@ namespace ccf throw std::logic_error("Self-healing-open not configured"); } + LOG_TRACE_FMT("Self-healing-open: Setting up failover timers"); // Dispatch timeouts auto timeout_msg = std::make_unique<::threading::Tmsg>( [](std::unique_ptr<::threading::Tmsg> msg) { diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 42cb236b5341..5ba52830a589 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -19,7 +19,7 @@ namespace ccf public: SelfHealingOpenService(NodeState* node_state); - void try_start(ccf::kv::Tx& tx); + void try_start(ccf::kv::Tx& tx, bool recovering); void advance(ccf::kv::Tx& tx, bool timeout); private: From ec72561abf38160b111817b24997924c3e1e3dfe Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 16:27:57 +0100 Subject: [PATCH 155/197] Ensure we use the correct timeout for failovers --- src/node/self_healing_open_impl.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 959ec73b9a01..a21ee168453d 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -34,7 +34,9 @@ namespace ccf auto& config = node_state->config.recover.self_healing_open; if (!recovering || !config.has_value()) { - LOG_INFO_FMT("Not recovering or self-healing-open not configured, skipping self-healing-open"); + LOG_INFO_FMT( + "Not recovering or self-healing-open not configured, skipping " + "self-healing-open"); } 
LOG_INFO_FMT("Starting self-healing-open"); @@ -370,7 +372,7 @@ namespace ccf curl::CurlmLibuvContextSingleton::get_instance()->attach_request( std::move(curl_request)); - auto delay = config->retry_timeout; + auto delay = config->timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, @@ -437,10 +439,9 @@ namespace ccf std::move(response_callback)); LOG_TRACE_FMT( - "Dispatching attested message for {} to {}: {}", + "Dispatching attested {} message to {}", curl_request->get_method().c_str(), - curl_request->get_url(), - request.dump()); + curl_request->get_url()); curl::CurlmLibuvContextSingleton::get_instance()->attach_request( std::move(curl_request)); From d5e96e03be9ccb365ccf932283dca1cd80ee4306 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 16:31:12 +0100 Subject: [PATCH 156/197] sigh --- src/node/self_healing_open_impl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index a21ee168453d..c02db521d7be 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -37,6 +37,7 @@ namespace ccf LOG_INFO_FMT( "Not recovering or self-healing-open not configured, skipping " "self-healing-open"); + return; } LOG_INFO_FMT("Starting self-healing-open"); From 29b4de4b1585ce7726fa3dd9c93c45868eb9c292 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 16:37:58 +0100 Subject: [PATCH 157/197] Update cchost_config --- doc/host_config_schema/cchost_config.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index d116f5658429..3895c3c11466 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -421,19 +421,19 @@ "self_healing_open": { "type": "object", "properties": { - "self_healing_open_addresses": { + "addresses": { "type": "array", 
"items": { "type": "string" }, "description": "List of addresses (host:port) of the cluster that should open via self-healing-open" }, - "self_healing_open_retry_timeout": { + "retry_timeout": { "type": "string", "default": "100ms", - "description": "Interval (time string) at which the node re-sends self-healing-open messages. This should be leass than 'self_healing_open_timeout'" + "description": "Interval (time string) at which the node re-sends self-healing-open messages. This should be leass than 'timeout'" }, - "self_healing_open_timeout": { + "timeout": { "type": "string", "default": "2000ms", "description": "Interval (time string) after which the node forcibly advances to the next phase of the self-healing-open protocol" From 85538ec0c6a181110fe0d6b0d7092468f7cb82ba Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 17:03:40 +0100 Subject: [PATCH 158/197] Reformat frontend --- src/node/rpc/node_frontend.h | 85 ++++++++++++++---------------------- 1 file changed, 32 insertions(+), 53 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 906c2d5942f0..ca1313b7a370 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -2262,34 +2262,30 @@ namespace ccf auto& args, const nlohmann::json& params) { auto config = this->context.get_subsystem(); - if (!config) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - if (!config->get().node_config.recover.self_healing_open.has_value()) + if ( + config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, ccf::errors::InvalidNodeState, - "Self-healing-open addresses are not configured"); + "Unable to get self-healing-open configuration"); } auto in = params.get(); - auto valid = self_healing_open_validate_and_store_node_info( + auto is_invalid = 
self_healing_open_validate_and_store_node_info( args, args.tx, in.info); - if (valid.has_value()) + if (is_invalid.has_value()) { - auto [code, message] = valid.value(); + auto [code, message] = is_invalid.value(); return make_error(code, ccf::errors::InvalidQuote, message); } - LOG_TRACE_FMT("Processing self-healing-open gossip RPC"); + LOG_TRACE_FMT( + "Self-healing-open: recieve gossip from {}", in.info.intrinsic_id); auto chosen_replica = - args.tx.rw(this->network.self_healing_open_chosen_replica); + args.tx.ro(this->network.self_healing_open_chosen_replica); // This freezes the gossips at the point where it votes if (chosen_replica->get().has_value()) { @@ -2302,7 +2298,8 @@ namespace ccf auto gossip_handle = args.tx.rw(this->network.self_healing_open_gossip); if (gossip_handle->get(in.info.intrinsic_id).has_value()) { - LOG_INFO_FMT("Node {} already gossiped", in.info.intrinsic_id); + LOG_INFO_FMT( + "Node {} already gossiped, skipping", in.info.intrinsic_id); return make_success( fmt::format("Node {} already gossiped", in.info.intrinsic_id)); } @@ -2339,19 +2336,14 @@ namespace ccf [this](auto& args, const nlohmann::json& params) { auto config = this->context.get_subsystem(); - if (!config) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - if (!config->get().node_config.recover.self_healing_open.has_value()) + if ( + config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, ccf::errors::InvalidNodeState, - "Self-healing-open addresses are not configured"); + "Unable to get self-healing-open configuration"); } auto in = params.get(); @@ -2362,10 +2354,11 @@ namespace ccf auto [code, message] = valid.value(); return make_error(code, ccf::errors::InvalidQuote, message); } - LOG_TRACE_FMT("Processing self-healing-open vote RPC"); + LOG_TRACE_FMT( + "Self-healing-open: recieve vote from 
{}", in.info.intrinsic_id); - auto votes = args.tx.rw(this->network.self_healing_open_votes); - votes->insert(in.info.intrinsic_id); + args.tx.rw(this->network.self_healing_open_votes) + ->insert(in.info.intrinsic_id); try { @@ -2399,19 +2392,14 @@ namespace ccf [this](auto& args, const nlohmann::json& params) { auto config = this->context.get_subsystem(); - if (!config) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - if (!config->get().node_config.recover.self_healing_open.has_value()) + if ( + config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) { return make_error( HTTP_STATUS_BAD_REQUEST, ccf::errors::InvalidNodeState, - "Self-healing-open addresses are not configured"); + "Unable to get self-healing-open configuration"); } auto in = params.get(); @@ -2423,14 +2411,10 @@ namespace ccf return make_error(code, ccf::errors::InvalidQuote, message); } - auto* sm_state = args.tx.rw(this->network.self_healing_open_sm_state); - sm_state->put(SelfHealingOpenSM::JOINING); - - LOG_INFO_FMT("Self-healing-open is JOINING {}", in.info.intrinsic_id); - - auto* chosen_replica = - args.tx.rw(this->network.self_healing_open_chosen_replica); - chosen_replica->put(in.info.intrinsic_id); + args.tx.rw(this->network.self_healing_open_sm_state) + ->put(SelfHealingOpenSM::JOINING); + args.tx.rw(this->network.self_healing_open_chosen_replica) + ->put(in.info.intrinsic_id); try { @@ -2464,19 +2448,14 @@ namespace ccf const nlohmann::json& params) { (void)params; auto config = this->context.get_subsystem(); - if (!config) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "NodeConfigurationSubsystem is not available"); - } - if (!config->get().node_config.recover.self_healing_open.has_value()) + if ( + config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) { return make_error( 
HTTP_STATUS_BAD_REQUEST, ccf::errors::InvalidNodeState, - "Self-healing-open addresses are not configured"); + "Unable to get self-healing-open configuration"); } LOG_TRACE_FMT("Self-healing-open timeout received"); From 2543e1dfe02ca477ed3d7e58f4e751c66e61979e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 17:04:42 +0100 Subject: [PATCH 159/197] fmt --- src/node/rpc/node_interface.h | 1 - src/node/rpc/node_operation_interface.h | 1 - src/node/self_healing_open_impl.cpp | 2 +- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index cc692e48d192..6ce2714a99e3 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -4,7 +4,6 @@ #include "ccf/crypto/pem.h" #include "ccf/ds/quote_info.h" -#include "ccf/node/startup_config.h" #include "ccf/node_startup_state.h" #include "ccf/service/acme_client_config.h" #include "ccf/service/node_info_network.h" diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index b3ace7f004cf..f92e77994559 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ -6,7 +6,6 @@ #include "ccf/ds/quote_info.h" #include "ccf/node/cose_signatures_config.h" #include "ccf/node/quote.h" -#include "ccf/node/startup_config.h" #include "ccf/node_startup_state.h" #include "ccf/node_subsystem_interface.h" #include "ccf/service/tables/code_id.h" diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index c02db521d7be..504c9a5cb904 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -37,7 +37,7 @@ namespace ccf LOG_INFO_FMT( "Not recovering or self-healing-open not configured, skipping " "self-healing-open"); - return; + return; } LOG_INFO_FMT("Starting self-healing-open"); From fbbf5f94c5e183e184766c0359b67ffe1bef783e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 
17:06:24 +0100 Subject: [PATCH 160/197] Remove old tla spec --- tla/disaster-recovery/selfhealingopen.cfg | 14 --- tla/disaster-recovery/selfhealingopen.tla | 139 ---------------------- 2 files changed, 153 deletions(-) delete mode 100644 tla/disaster-recovery/selfhealingopen.cfg delete mode 100644 tla/disaster-recovery/selfhealingopen.tla diff --git a/tla/disaster-recovery/selfhealingopen.cfg b/tla/disaster-recovery/selfhealingopen.cfg deleted file mode 100644 index 3d16b6ceaeb9..000000000000 --- a/tla/disaster-recovery/selfhealingopen.cfg +++ /dev/null @@ -1,14 +0,0 @@ -SPECIFICATION - Spec - -INVARIANTS - TypeOk - InvNoFork - InvUnanimousLiveVotesNoDeadlock - InvNonUnanimousOpen - -CONSTANTS - NID = {r1, r2, r3, r4, r4} - -SYMMETRY - Symmetry \ No newline at end of file diff --git a/tla/disaster-recovery/selfhealingopen.tla b/tla/disaster-recovery/selfhealingopen.tla deleted file mode 100644 index c68902e6e26c..000000000000 --- a/tla/disaster-recovery/selfhealingopen.tla +++ /dev/null @@ -1,139 +0,0 @@ ----- MODULE selfhealingopen ---- - -EXTENDS Integers, Sequences, FiniteSets, TLC - -CONSTANTS - NID - -MAJ_QUORUM_LIMIT == (Cardinality(NID)) \div 2 + 1 - -VARIABLES - next_step, - txids, - gossip_msgs, - recv_gossips, - vote_msgs, - open_msgs - -vars == <> - -TypeOk == - /\ next_step \in [NID -> {"gossip", "vote", "open/join", "open", "join"}] - /\ txids \in [NID -> Nat] - /\ gossip_msgs \subseteq [ - src : NID, - txid : Nat - ] - /\ recv_gossips \in [NID -> SUBSET gossip_msgs] - /\ vote_msgs \subseteq [ - src : NID, - vote : NID, - recv : SUBSET NID - ] - /\ open_msgs \subseteq [ - src : NID - ] - -TXID == - CHOOSE F \in [NID -> 1..Cardinality(NID)]: - \A k1, k2 \in DOMAIN F: F[k1] = F[k2] => k1 = k2 - -Init == - /\ next_step = [n \in NID |-> "gossip"] - /\ txids = [n \in NID|-> TXID[n]] - /\ gossip_msgs = {} - /\ recv_gossips = [n \in NID |-> {}] - /\ vote_msgs = {} - /\ open_msgs = {} - -ActionSendGossip(n) == - LET msg == [src |-> n, txid |-> txids[n]] IN - 
/\ next_step[n] = "gossip" - /\ next_step' = [next_step EXCEPT ![n] = "vote"] - /\ recv_gossips' = [recv_gossips EXCEPT ![n] = recv_gossips[n] \cup {msg}] - /\ gossip_msgs' = gossip_msgs \cup {msg} - /\ UNCHANGED << txids, vote_msgs, open_msgs >> - -ActionRecvGossip(n) == - \E m \in gossip_msgs: - /\ m \notin recv_gossips[n] - /\ recv_gossips' = [recv_gossips EXCEPT ![n] = recv_gossips[n] \cup {m}] - /\ UNCHANGED << next_step, txids, gossip_msgs, vote_msgs, open_msgs >> - -Vote(n) == - LET recv_nodes == {g.src : g \in recv_gossips[n]} - max_txid_gossip == - CHOOSE g \in recv_gossips[n]: - \A g1 \in recv_gossips[n]: g.txid >= g1.txid - vote == [src |-> n, vote |-> max_txid_gossip.src, recv |-> recv_nodes] - IN - /\ next_step[n] = "vote" - /\ next_step' = [next_step EXCEPT ![n] = "open/join"] - /\ vote_msgs' = vote_msgs \cup {vote} - /\ UNCHANGED << txids, gossip_msgs, recv_gossips, open_msgs >> - -ActionVoteQuorum(n) == - \* Non-Unanimous gossips can cause deadlocks - /\ {g.src : g \in recv_gossips[n]} = NID - /\ Vote(n) - -ActionVoteTimeout(n) == - /\ Cardinality({g.src : g \in recv_gossips[n]}) >= MAJ_QUORUM_LIMIT - /\ Vote(n) - -ActionOpen(n) == - \E Vs \in SUBSET {v \in vote_msgs: v.vote = n}: - /\ Cardinality(Vs) >= MAJ_QUORUM_LIMIT - /\ next_step[n] = "open/join" - /\ next_step' = [next_step EXCEPT ![n] = "open"] - /\ open_msgs' = open_msgs \cup {[src |-> n]} - /\ UNCHANGED << txids, gossip_msgs, recv_gossips, vote_msgs >> - -ActionJoin(n) == - \E o \in open_msgs: - /\ next_step[n] = "open/join" - /\ next_step' = [next_step EXCEPT ![n] = "join"] - /\ UNCHANGED << txids, gossip_msgs, recv_gossips, vote_msgs, open_msgs >> - - -Next == - \E n \in NID: - \/ ActionSendGossip(n) - \/ ActionRecvGossip(n) - \/ ActionVoteQuorum(n) - \/ ActionVoteTimeout(n) - \/ ActionOpen(n) - \/ ActionJoin(n) - -Spec == Init /\ [][Next]_vars - -InvNoFork == - (Cardinality({n \in NID: next_step[n] = "open"}) <= 1) - -InvCorrectState == ~\A n \in NID: next_step[n] \in {"open", "join"} - 
-\* We optimally should be unable to reach a deadlock state -\* where every node is blocked but it may be impossible due to timeouts -InvNoDeadlockStates == - (\A n \in NID: next_step[n] = "open/join") - => - ( - \E n \in NID: - \/ ENABLED ActionOpen(n) - \/ ENABLED ActionJoin(n) - ) - -InvUnanimousLiveVotesNoDeadlock == - LET live_nid == {n \in NID: next_step[n] /= "gossip"} IN - (\A m \in vote_msgs: m.recv = live_nid) => InvNoDeadlockStates - -InvNonUnanimousOpen == -LET live_nid == {n \in NID: next_step[n] /= "gossip"} IN - ~ /\ \E n \in NID: next_step[n] = "gossip" - /\ \E n \in NID: next_step[n] = "open" - /\ \A m \in vote_msgs: m.recv = live_nid - /\ \A n \in NID: next_step[n] \in {"gossip", "open", "join"} - -Symmetry == Permutations(NID) - -==== \ No newline at end of file From 49cce91417644130cef647a0bcac8440c5b145f4 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 17:12:13 +0100 Subject: [PATCH 161/197] Update network.py to coalesce ledger secrets --- tests/infra/network.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/infra/network.py b/tests/infra/network.py index e77c812ee523..6c7a6fc796d9 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -807,6 +807,9 @@ def start_in_self_healing_open( "read_only_ledger_dirs": committed_ledger_dirs[i] or [], "snapshots_dir": snapshot_dirs[i] or None, } + | {"previous_sealed_ledger_secret_location": sealed_ledger_secrets[i]} + if sealed_ledger_secrets and i < len(sealed_ledger_secrets) + else {} ) for i, d in self.per_node_args_override.items() } @@ -838,10 +841,6 @@ def start_in_self_healing_open( forwarded_args_with_overrides = forwarded_args.copy() forwarded_args_with_overrides.update(self.per_node_args_override.get(i, {})) - if sealed_ledger_secrets is not None and i < len(sealed_ledger_secrets): - forwarded_args_with_overrides[ - "previous_sealed_ledger_secret_location" - ] = sealed_ledger_secrets[i] try: node_kwargs = { "lib_name": args.package, 
From 22dca963bf74d61deedddb9b67b0ab0d5359ab44 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 22 Sep 2025 17:19:54 +0100 Subject: [PATCH 162/197] Fix network.py --- tests/infra/network.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/infra/network.py b/tests/infra/network.py index 6c7a6fc796d9..82522758e75d 100644 --- a/tests/infra/network.py +++ b/tests/infra/network.py @@ -807,9 +807,11 @@ def start_in_self_healing_open( "read_only_ledger_dirs": committed_ledger_dirs[i] or [], "snapshots_dir": snapshot_dirs[i] or None, } - | {"previous_sealed_ledger_secret_location": sealed_ledger_secrets[i]} - if sealed_ledger_secrets and i < len(sealed_ledger_secrets) - else {} + | ( + {"previous_sealed_ledger_secret_location": sealed_ledger_secrets[i]} + if sealed_ledger_secrets and i < len(sealed_ledger_secrets) + else {} + ) ) for i, d in self.per_node_args_override.items() } @@ -1367,6 +1369,8 @@ def wait_for_states(self, node, states, timeout=3, **client_kwargs): break except ConnectionRefusedError: pass + except CCFConnectionException: + pass time.sleep(0.1) else: raise TimeoutError( From afb4a1d2103da2c44e1fa33a074e42fa5d116bfe Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 10:56:06 +0100 Subject: [PATCH 163/197] Add docs --- doc/operations/recovery.rst | 81 +++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/doc/operations/recovery.rst b/doc/operations/recovery.rst index db257b724ebb..c8b01fe25b1c 100644 --- a/doc/operations/recovery.rst +++ b/doc/operations/recovery.rst @@ -150,6 +150,87 @@ Notes - Operators can track the number of times a given service has undergone the disaster recovery procedure via the :http:GET:`/node/network` endpoint (``recovery_count`` field). 
+Self-Healing-Open recovery +-------------------------- + +In environments with limited orchestration or external access, it is desirable to allow the network to recover from a disaster without operator intervention. +At a high level, Self-Healing-Open recovery allows recovering replicas to discover which replica has the most up-to-date ledger and automatically recover the network using that ledger. + +There are two paths: a standard path and a very-high-availability timeout path. +The standard path ensures that if all nodes restart, at most a minority of the ledgers get rolled back, and no timeouts trigger, then there will be only one recovered network, and all committed entries from the previous network will be preserved. +However, the standard path can become stuck, in which case the timeout path is designed to ensure progress. + +In the standard path, nodes first gossip with each other. +Once they have heard from every node they choose a node to vote for. +If a node receives votes from a majority of nodes, it invokes `transition-to-open` and the other nodes restart to subsequently join it. +This path is illustrated below, and is guaranteed to succeed if at most a minority of nodes have failed, all nodes can communicate and no timeouts trigger. + +.. mermaid:: + sequenceDiagram + participant N1 + participant N2 + participant N3 + + Note over N1, N3: Gossip + + N1 ->> N2: Gossip(Tx=1) + N1 ->> N3: Gossip(Tx=1) + N2 ->> N3: Gossip(Tx=2) + N3 ->> N2: Gossip(Tx=3) + + Note over N1, N3: Vote + N2 ->> N3: Vote + N3 ->> N3: Vote + + Note over N1, N3: Open/Join + N3 ->> N1: IAmOpen + N3 ->> N2: IAmOpen + + Note over N1, N2: Restart + + Note over N3: Transition-to-open + + Note over N3: Local unsealing + + Note over N3: Open + + N1 ->> N3: Join + N2 ->> N3: Join + +In the timeout path, each phase has a timeout to skip to the next phase if a failure has occurred.
+For example, the standard path requires all nodes to communicate to advance from the gossip phase to the vote phase. +However, if any node fails to recover, the standard path is stuck. +In this case, after a timeout, nodes will advance to the vote phase regardless of whether they have heard from all nodes, and vote for the best ledger they have heard of at that point. + +Unfortunately, this can lead to multiple forks of the service if different nodes cannot communicate with each other before the timeout. +Hence, we recommend setting the timeout substantially higher than the highest expected recovery time, to minimise the chance of this happening. +This case is illustrated below. + +.. mermaid:: + sequenceDiagram + participant N1 + participant N2 + participant N3 + + Note over N1, N3: Gossip + + N2 ->> N3: Gossip(Tx=2) + N3 ->> N2: Gossip(Tx=3) + + Note over N1: Timeout + Note over N3: Timeout + + Note over N1, N3: Vote + + N1 ->> N1: Vote + N3 ->> N3: Vote + N2 ->> N3: Vote + + Note over N1, N3: Open/Join + + Note over N1: Transition-to-open + Note over N3: Transition-to-open + .. rubric:: Footnotes .. [#crash] When using CFT as consensus algorithm, CCF tolerates up to `(N-1)/2` crashed nodes (where `N` is the number of trusted nodes constituting the network) before having to perform a recovery procedure. For example, in a 5-node network, no more than 2 nodes are allowed to fail for the service to be able to commit new transactions. 
From fd8750b9dec6f9a639f00203b1bf45795b754c1e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 11:14:10 +0100 Subject: [PATCH 164/197] Add flag for detecting whether a timeout has occurred during self-healing-open --- .../{self_heal_open.h => self_healing_open.h} | 3 + src/node/node_state.h | 2 +- src/node/rpc/node_frontend.h | 2 +- src/node/self_healing_open_impl.cpp | 84 ++++++++++--------- src/node/self_healing_open_impl.h | 2 +- src/service/network_tables.h | 4 +- 6 files changed, 52 insertions(+), 45 deletions(-) rename include/ccf/service/tables/{self_heal_open.h => self_healing_open.h} (92%) diff --git a/include/ccf/service/tables/self_heal_open.h b/include/ccf/service/tables/self_healing_open.h similarity index 92% rename from include/ccf/service/tables/self_heal_open.h rename to include/ccf/service/tables/self_healing_open.h index 7461b113f1ee..a121df801837 100644 --- a/include/ccf/service/tables/self_heal_open.h +++ b/include/ccf/service/tables/self_healing_open.h @@ -54,6 +54,7 @@ namespace ccf using SelfHealingOpenVotes = ServiceSet; using SelfHealingOpenSMState = ServiceValue; using SelfHealingOpenTimeoutSMState = ServiceValue; + using SelfHealingOpenTimeoutFlag = ServiceValue; namespace Tables { @@ -69,5 +70,7 @@ namespace ccf "public:ccf.gov.selfhealingopen.sm_state"; static constexpr auto SELF_HEALING_OPEN_TIMEOUT_SM_STATE = "public:ccf.gov.selfhealingopen.timeout_sm_state"; + static constexpr auto SELF_HEALING_OPEN_TIMEOUT_FLAG = + "public:ccf.gov.selfhealingopen.timeout_used_to_open"; } } diff --git a/src/node/node_state.h b/src/node/node_state.h index 44fbe0c0a50b..5a8be5a6347c 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -17,7 +17,7 @@ #include "ccf/service/node_info_network.h" #include "ccf/service/reconfiguration_type.h" #include "ccf/service/tables/acme_certificates.h" -#include "ccf/service/tables/self_heal_open.h" +#include "ccf/service/tables/self_healing_open.h" #include "ccf/service/tables/service.h" 
#include "ccf/tx.h" #include "ccf_acme_client.h" diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index ca1313b7a370..b3ec438bf3c6 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -12,7 +12,7 @@ #include "ccf/pal/attestation.h" #include "ccf/pal/mem.h" #include "ccf/service/reconfiguration_type.h" -#include "ccf/service/tables/self_heal_open.h" +#include "ccf/service/tables/self_healing_open.h" #include "ccf/version.h" #include "crypto/certs.h" #include "crypto/csr.h" diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 504c9a5cb904..c6dd4115bb8b 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -3,6 +3,7 @@ #include "node/self_healing_open_impl.h" +#include "ccf/service/tables/self_healing_open.h" #include "node_state.h" #include @@ -17,19 +18,13 @@ namespace ccf void SelfHealingOpenService::try_start(ccf::kv::Tx& tx, bool recovering) { // Clear any previous state - tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE) - ->clear(); - tx.rw( - Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE) - ->clear(); - tx.rw(Tables::SELF_HEALING_OPEN_NODES) - ->clear(); - tx.rw(Tables::SELF_HEALING_OPEN_GOSSIPS) - ->clear(); - tx.rw( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) - ->clear(); - tx.rw(Tables::SELF_HEALING_OPEN_VOTES)->clear(); + tx.rw(node_state->network.self_healing_open_sm_state)->clear(); + tx.rw(node_state->network.self_healing_open_timeout_sm_state)->clear(); + tx.rw(node_state->network.self_healing_open_node_info)->clear(); + tx.rw(node_state->network.self_healing_open_gossip)->clear(); + tx.rw(node_state->network.self_healing_open_chosen_replica)->clear(); + tx.rw(node_state->network.self_healing_open_votes)->clear(); + tx.rw(node_state->network.self_healing_open_timeout_flag)->clear(); auto& config = node_state->config.recover.self_healing_open; if (!recovering || !config.has_value()) @@ -42,10 +37,9 @@ namespace ccf 
LOG_INFO_FMT("Starting self-healing-open"); - tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE) + tx.rw(node_state->network.self_healing_open_sm_state) ->put(SelfHealingOpenSM::GOSSIPPING); - tx.rw( - Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE) + tx.rw(node_state->network.self_healing_open_timeout_sm_state) ->put(SelfHealingOpenSM::GOSSIPPING); start_message_retry_timers(); @@ -77,27 +71,6 @@ namespace ccf bool valid_timeout = timeout && sm_state == timeout_state; - // Advance timeout SM - if (timeout) - { - switch (timeout_state) - { - case SelfHealingOpenSM::GOSSIPPING: - LOG_TRACE_FMT("Advancing timeout SM to VOTING"); - timeout_state_handle->put(SelfHealingOpenSM::VOTING); - break; - case SelfHealingOpenSM::VOTING: - LOG_TRACE_FMT("Advancing timeout SM to OPENING"); - timeout_state_handle->put(SelfHealingOpenSM::OPENING); - break; - case SelfHealingOpenSM::OPENING: - case SelfHealingOpenSM::JOINING: - case SelfHealingOpenSM::OPEN: - default: - LOG_TRACE_FMT("Timeout SM complete"); - } - } - // Advance self-healing-open SM switch (sm_state) { @@ -134,13 +107,20 @@ namespace ccf chosen_replica->put(maximum->second); sm_state_handle->put(SelfHealingOpenSM::VOTING); } - return; + break; } case SelfHealingOpenSM::VOTING: { auto* votes = tx.rw(node_state->network.self_healing_open_votes); - if (votes->size() >= config->addresses.size() / 2 + 1 || valid_timeout) + auto sufficient_quorum = + votes->size() >= config->addresses.size() / 2 + 1; + if (sufficient_quorum || valid_timeout) { + if (valid_timeout && !sufficient_quorum) + { + tx.rw(node_state->network.self_healing_open_timeout_flag) + ->put(true); + } if (votes->size() == 0) { throw std::logic_error( @@ -166,7 +146,7 @@ namespace ccf node_state->transition_service_to_open(tx, identities); } - return; + break; } case SelfHealingOpenSM::JOINING: { @@ -200,16 +180,38 @@ namespace ccf { sm_state_handle->put(SelfHealingOpenSM::OPEN); } + break; } case SelfHealingOpenSM::OPEN: { // Nothing to do here, we are already opening 
or open or joining - return; + break; } default: throw std::logic_error(fmt::format( "Unknown self-healing-open state: {}", static_cast(sm_state))); } + + // Advance timeout SM + if (timeout) + { + switch (timeout_state) + { + case SelfHealingOpenSM::GOSSIPPING: + LOG_TRACE_FMT("Advancing timeout SM to VOTING"); + timeout_state_handle->put(SelfHealingOpenSM::VOTING); + break; + case SelfHealingOpenSM::VOTING: + LOG_TRACE_FMT("Advancing timeout SM to OPENING"); + timeout_state_handle->put(SelfHealingOpenSM::OPENING); + break; + case SelfHealingOpenSM::OPENING: + case SelfHealingOpenSM::JOINING: + case SelfHealingOpenSM::OPEN: + default: + LOG_TRACE_FMT("Timeout SM complete"); + } + } } void SelfHealingOpenService::start_message_retry_timers() diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 5ba52830a589..350ba7f9b648 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -3,7 +3,7 @@ #pragma once #include "ccf/node/startup_config.h" -#include "ccf/service/tables/self_heal_open.h" +#include "ccf/service/tables/self_healing_open.h" #include "ccf/tx.h" #include "self_healing_open_types.h" diff --git a/src/service/network_tables.h b/src/service/network_tables.h index b5ae614fa52f..bf9acc275c1b 100644 --- a/src/service/network_tables.h +++ b/src/service/network_tables.h @@ -16,7 +16,7 @@ #include "ccf/service/tables/modules.h" #include "ccf/service/tables/nodes.h" #include "ccf/service/tables/proposals.h" -#include "ccf/service/tables/self_heal_open.h" +#include "ccf/service/tables/self_healing_open.h" #include "ccf/service/tables/service.h" #include "ccf/service/tables/snp_measurements.h" #include "ccf/service/tables/tcb_verification.h" @@ -259,6 +259,8 @@ namespace ccf Tables::SELF_HEALING_OPEN_SM_STATE}; const SelfHealingOpenSMState self_healing_open_timeout_sm_state = { Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE}; + const SelfHealingOpenTimeoutFlag self_healing_open_timeout_flag = { + 
Tables::SELF_HEALING_OPEN_TIMEOUT_FLAG}; inline auto get_all_internal_tables() const { From 13e05b7bcc19b473e75a9b3f8ccf038ab717ae9d Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 11:19:31 +0100 Subject: [PATCH 165/197] Doc update --- doc/operations/recovery.rst | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/doc/operations/recovery.rst b/doc/operations/recovery.rst index c8b01fe25b1c..543bf3b222bb 100644 --- a/doc/operations/recovery.rst +++ b/doc/operations/recovery.rst @@ -145,19 +145,14 @@ Which of these two paths is taken is noted in the `public:ccf.internal.last_reco ... $ /opt/ccf/bin/js_generic --config /path/to/config/file -Notes ------ - -- Operators can track the number of times a given service has undergone the disaster recovery procedure via the :http:GET:`/node/network` endpoint (``recovery_count`` field). - Self-Healing-Open recovery -------------------------- -In environments with limited orchestration or external access, it is desirable to allow the network to recover from a disaster without operator intervention. +In environments with limited orchestration or it is difficult for operators to access, it is desirable to allow a limited disaster recover without operator intervention. At a high level, Self-Healing-Open recovery allows recovering replicas to discover which replica has the most up-to-date ledger and automatically recover the network using that ledger. There are two paths, a standard path, and a very-high-availablity timeout path. -The standard path ensures that if all nodes restart, at most a minority of the ledgers get rolled back, and no timeouts trigger, then there will be only one recovered network, and all committed entries from the previous network will be preserved. 
+The standard path ensures that if: all nodes restart, at most a minority of the ledgers get rolled back, and no timeouts trigger; then there will be only one recovered network, and all committed entries from the previous network will be preserved. However, the standard path can become stuck, in which case the timeout path is designed to ensure progress. In the standard path, nodes first gossip with each other. @@ -204,7 +199,9 @@ In this case, after a timeout, nodes will advance to the vote phase regardless o Unfortunately, this can lead to multiple forks of the service if different nodes cannot communicate with each other before the timeout. Hence, we recommend setting the timeout substantially higher than the highest expected recovery time, to minimise the chance of this happening. -This case is illustrated below. +To check if timeouts were used to open the service, the `public:ccf.gov.selfhealingopen.timeout_used_to_open` map tracks this. + +This timeout path is illustrated below. .. mermaid:: sequenceDiagram @@ -231,6 +228,14 @@ This case is illustrated below. Note over N1: Transition-to-open Note over N3: Transition-to-open + +If the network fails during reconfiguration, each node will use its latest known configuration to recover. Since reconfiguration requires votes from a majority of nodes, the latest configuration should recover using the standard path, however nodes in the previous configuration may recover using the timeout path. + +Notes +----- + +- Operators can track the number of times a given service has undergone the disaster recovery procedure via the :http:GET:`/node/network` endpoint (``recovery_count`` field). + .. rubric:: Footnotes .. [#crash] When using CFT as consensus algorithm, CCF tolerates up to `(N-1)/2` crashed nodes (where `N` is the number of trusted nodes constituting the network) before having to perform a recovery procedure. 
For example, in a 5-node network, no more than 2 nodes are allowed to fail for the service to be able to commit new transactions. From a9ab4375b092776a710d097486b64f453a1a4c26 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 11:20:07 +0100 Subject: [PATCH 166/197] typo --- doc/operations/recovery.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/operations/recovery.rst b/doc/operations/recovery.rst index 543bf3b222bb..a63d496919c6 100644 --- a/doc/operations/recovery.rst +++ b/doc/operations/recovery.rst @@ -199,7 +199,7 @@ In this case, after a timeout, nodes will advance to the vote phase regardless o Unfortunately, this can lead to multiple forks of the service if different nodes cannot communicate with each other before the timeout. Hence, we recommend setting the timeout substantially higher than the highest expected recovery time, to minimise the chance of this happening. -To check if timeouts were used to open the service, the `public:ccf.gov.selfhealingopen.timeout_used_to_open` map tracks this. +To audit if timeouts were used to open the service, the `public:ccf.gov.selfhealingopen.timeout_used_to_open` map tracks this. This timeout path is illustrated below. 
From 474b199eb51432da4a59a19184f925b18d27b893 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 11:44:34 +0100 Subject: [PATCH 167/197] Update path names --- doc/operations/recovery.rst | 30 +++++++++---------- .../ccf/service/tables/self_healing_open.h | 6 ++-- src/node/self_healing_open_impl.cpp | 4 +-- src/service/network_tables.h | 4 +-- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/doc/operations/recovery.rst b/doc/operations/recovery.rst index a63d496919c6..30c2bc2d8af1 100644 --- a/doc/operations/recovery.rst +++ b/doc/operations/recovery.rst @@ -148,17 +148,17 @@ Which of these two paths is taken is noted in the `public:ccf.internal.last_reco Self-Healing-Open recovery -------------------------- -In environments with limited orchestration or it is difficult for operators to access, it is desirable to allow a limited disaster recover without operator intervention. +In environments with limited orchestration or limited operator access, it is desirable to allow a limited disaster recovery without operator intervention. At a high level, Self-Healing-Open recovery allows recovering replicas to discover which replica has the most up-to-date ledger and automatically recover the network using that ledger. -There are two paths, a standard path, and a very-high-availablity timeout path. -The standard path ensures that if: all nodes restart, at most a minority of the ledgers get rolled back, and no timeouts trigger; then there will be only one recovered network, and all committed entries from the previous network will be preserved. -However, the standard path can become stuck, in which case the timeout path is designed to ensure progress. +There are two paths, a election path, and a very-high-availablity failover path. 
+The election path ensures that if: all nodes restart and have full network connectivity, a majority of nodes' on-disk ledger contains every committed transaction, and no timeouts trigger; then there will be only one recovered network, and all committed transactions will be persisted. +However, the election path can become stuck, in which case the failover path is designed to ensure progress. -In the standard path, nodes first gossip with each other. -Once they have heard from every node they choose a node to vote for. -If a node receives votes from a majority of nodes, it invokes `transition-to-open` and the other nodes restart to subsequently join it. -This path is illustrated below, and is guaranteed to succeed if at most a minority of nodes have failed, all nodes can communicate and no timeouts trigger. +In the election path, nodes first gossip with each other, learning of the ledgers of other nodes. +Once they have heard from every node they vote for the node with the best ledger. +If a node receives votes from a majority of nodes, it invokes `transition-to-open` and notifies the other nodes to restart and join it. +This path is illustrated below, and is guaranteed to succeed if all nodes can communicate and no timeouts trigger. .. mermaid:: sequenceDiagram @@ -192,16 +192,16 @@ This path is illustrated below, and is guaranteed to succeed if at most a minori N1 ->> N3: Join N2 ->> N3: Join -In the timeout path, each phase has a timeout to skip to the next phase if a failure has occurred. -For example, the standard path requires all nodes to communicate to advance from the gossip phase to the vote phase. -However, if any node fails to recover, the standard path is stuck. +In the failover path, each phase has a timeout to skip to the next phase if a failure has occurred. +For example, the election path requires all nodes to communicate to advance from the gossip phase to the vote phase. +However, if any node fails to recover, the election path is stuck. 
In this case, after a timeout, nodes will advance to the vote phase regardless of whether they have heard from all nodes, and vote for the best ledger they have heard of at that point. -Unfortunately, this can lead to multiple forks of the service if different nodes cannot communicate with each other before the timeout. +Unfortunately, this can lead to multiple forks of the service if different nodes cannot communicate with each other and time out. Hence, we recommend setting the timeout substantially higher than the highest expected recovery time, to minimise the chance of this happening. -To audit if timeouts were used to open the service, the `public:ccf.gov.selfhealingopen.timeout_used_to_open` map tracks this. +To audit if timeouts were used to open the service, the `public:ccf.gov.selfhealingopen.failover_open` table tracks this. -This timeout path is illustrated below. +This failover path is illustrated below. .. mermaid:: sequenceDiagram @@ -229,7 +229,7 @@ This timeout path is illustrated below. Note over N3: Transition-to-open -If the network fails during reconfiguration, each node will use its latest known configuration to recover. Since reconfiguration requires votes from a majority of nodes, the latest configuration should recover using the standard path, however nodes in the previous configuration may recover using the timeout path. +If the network fails during reconfiguration, each node will use its latest known configuration to recover. Since reconfiguration requires votes from a majority of nodes, the latest configuration should recover using the election path, however nodes in the previous configuration may recover using the failover path. 
Notes ----- diff --git a/include/ccf/service/tables/self_healing_open.h b/include/ccf/service/tables/self_healing_open.h index a121df801837..eea4c2f0112f 100644 --- a/include/ccf/service/tables/self_healing_open.h +++ b/include/ccf/service/tables/self_healing_open.h @@ -54,7 +54,7 @@ namespace ccf using SelfHealingOpenVotes = ServiceSet; using SelfHealingOpenSMState = ServiceValue; using SelfHealingOpenTimeoutSMState = ServiceValue; - using SelfHealingOpenTimeoutFlag = ServiceValue; + using SelfHealingOpenFailoverFlag = ServiceValue; namespace Tables { @@ -70,7 +70,7 @@ namespace ccf "public:ccf.gov.selfhealingopen.sm_state"; static constexpr auto SELF_HEALING_OPEN_TIMEOUT_SM_STATE = "public:ccf.gov.selfhealingopen.timeout_sm_state"; - static constexpr auto SELF_HEALING_OPEN_TIMEOUT_FLAG = - "public:ccf.gov.selfhealingopen.timeout_used_to_open"; + static constexpr auto SELF_HEALING_OPEN_FAILOVER_FLAG = + "public:ccf.gov.selfhealingopen.failover_open"; } } diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index c6dd4115bb8b..8ed9d5c5c6bd 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -24,7 +24,7 @@ namespace ccf tx.rw(node_state->network.self_healing_open_gossip)->clear(); tx.rw(node_state->network.self_healing_open_chosen_replica)->clear(); tx.rw(node_state->network.self_healing_open_votes)->clear(); - tx.rw(node_state->network.self_healing_open_timeout_flag)->clear(); + tx.rw(node_state->network.self_healing_open_failover_flag)->clear(); auto& config = node_state->config.recover.self_healing_open; if (!recovering || !config.has_value()) @@ -118,7 +118,7 @@ namespace ccf { if (valid_timeout && !sufficient_quorum) { - tx.rw(node_state->network.self_healing_open_timeout_flag) + tx.rw(node_state->network.self_healing_open_failover_flag) ->put(true); } if (votes->size() == 0) diff --git a/src/service/network_tables.h b/src/service/network_tables.h index bf9acc275c1b..79c4806c8a2f 100644 
--- a/src/service/network_tables.h +++ b/src/service/network_tables.h @@ -259,8 +259,8 @@ namespace ccf Tables::SELF_HEALING_OPEN_SM_STATE}; const SelfHealingOpenSMState self_healing_open_timeout_sm_state = { Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE}; - const SelfHealingOpenTimeoutFlag self_healing_open_timeout_flag = { - Tables::SELF_HEALING_OPEN_TIMEOUT_FLAG}; + const SelfHealingOpenFailoverFlag self_healing_open_failover_flag = { + Tables::SELF_HEALING_OPEN_FAILOVER_FLAG}; inline auto get_all_internal_tables() const { From c06895ccfe44cd6269c79d1dec0dd6fa5e015eda Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 11:52:25 +0100 Subject: [PATCH 168/197] Revert "Allow curl handles to fix themselves during shutdown." This reverts commit e9cb10dd889bc952b06288f0fa43cd2f7fd85d30. --- src/http/curl.h | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/src/http/curl.h b/src/http/curl.h index f3882425fa1b..25e9cdbb38a2 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -714,6 +714,12 @@ namespace ccf::curl static void libuv_socket_poll_callback( uv_poll_t* req, int status, int events) { + if (status < 0) + { + LOG_FAIL_FMT("Socket poll error: {}", uv_strerror(status)); + return; + } + auto* socket_context = static_cast(req->data); if (socket_context == nullptr) { @@ -730,27 +736,7 @@ namespace ccf::curl if (self->is_stopping) { - LOG_FAIL_FMT( - "libuv_socket_poll_callback called on {} while stopped", - socket_context->socket); - return; - } - - if (status < 0) - { - LOG_INFO_FMT( - "Socket poll error on {}: {}", - socket_context->socket, - uv_strerror(status)); - - // Notify curl of the error - CHECK_CURL_MULTI( - curl_multi_socket_action, - self->curl_request_curlm, - socket_context->socket, - CURL_CSELECT_ERR, - nullptr); - self->curl_request_curlm.perform(); + LOG_FAIL_FMT("libuv_socket_poll_callback called while stopping"); return; } @@ -793,17 +779,15 @@ namespace ccf::curl case 
CURL_POLL_OUT: case CURL_POLL_INOUT: { - LOG_INFO_FMT( - "Curl socket callback: listen on socket {}, {}", - static_cast(s), - static_cast(action)); - - // During shutdown ignore requests to add new sockets + // Possibly called during shutdown if (self->is_stopping) { + LOG_FAIL_FMT("curl_socket_callback called while stopping"); return 0; } + LOG_INFO_FMT( + "Curl socket callback: listen on socket {}", static_cast(s)); if (socket_context == nullptr) { auto socket_context_ptr = std::make_unique(); From 7e5af0df2e1e07c5285e695f3d7326fe7952c515 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 12:47:54 +0100 Subject: [PATCH 169/197] Update docs --- doc/operations/recovery.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/operations/recovery.rst b/doc/operations/recovery.rst index 30c2bc2d8af1..d588b95c6729 100644 --- a/doc/operations/recovery.rst +++ b/doc/operations/recovery.rst @@ -161,6 +161,7 @@ If a node receives votes from a majority of nodes, it invokes `transition-to-ope This path is illustrated below, and is guaranteed to succeed if all nodes can communicate and no timeouts trigger. .. mermaid:: + sequenceDiagram participant N1 participant N2 @@ -204,6 +205,7 @@ To audit if timeouts were used to open the service, the `public:ccf.gov.selfheal This failover path is illustrated below. .. 
mermaid:: + sequenceDiagram participant N1 participant N2 From 009a1c194080b2cf554a85c2c25eae0b7a656433 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 13:09:13 +0100 Subject: [PATCH 170/197] Make clang-tidy happy --- src/node/self_healing_open_impl.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 8ed9d5c5c6bd..36999a8d5024 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -225,12 +225,12 @@ namespace ccf { throw std::logic_error("Self-healing-open not configured"); } - auto& node_state = msg->data.self.node_state; - std::lock_guard guard(node_state->lock); + auto& node_state_ = msg->data.self.node_state; + std::lock_guard guard(node_state_->lock); - auto tx = node_state->network.tables->create_read_only_tx(); + auto tx = node_state_->network.tables->create_read_only_tx(); auto* sm_state_handle = - tx.ro(node_state->network.self_healing_open_sm_state); + tx.ro(node_state_->network.self_healing_open_sm_state); auto sm_state_opt = sm_state_handle->get(); if (!sm_state_opt.has_value()) @@ -256,9 +256,9 @@ namespace ccf case SelfHealingOpenSM::VOTING: { auto* node_info_handle = - tx.ro(node_state->network.self_healing_open_node_info); + tx.ro(node_state_->network.self_healing_open_node_info); auto* chosen_replica_handle = - tx.ro(node_state->network.self_healing_open_chosen_replica); + tx.ro(node_state_->network.self_healing_open_chosen_replica); if (!chosen_replica_handle->get().has_value()) { throw std::logic_error( @@ -316,15 +316,15 @@ namespace ccf { throw std::logic_error("Self-healing-open not configured"); } - auto* node_state = msg->data.self.node_state; - std::lock_guard guard(node_state->lock); + auto* node_state_ = msg->data.self.node_state; + std::lock_guard guard(node_state_->lock); LOG_TRACE_FMT( "Self-healing-open timeout, sending timeout to internal handlers"); // 
Stop the timer if the node has completed its self-healing-open - auto tx = node_state->network.tables->create_read_only_tx(); + auto tx = node_state_->network.tables->create_read_only_tx(); auto* sm_state_handle = - tx.ro(node_state->network.self_healing_open_sm_state); + tx.ro(node_state_->network.self_healing_open_sm_state); if (!sm_state_handle->get().has_value()) { throw std::logic_error( @@ -341,7 +341,7 @@ namespace ccf // Send a timeout to the internal handlers curl::UniqueCURL curl_handle; - auto cert = node_state->self_signed_node_cert; + auto cert = node_state_->self_signed_node_cert; curl_handle.set_opt(CURLOPT_SSL_VERIFYHOST, 0L); curl_handle.set_opt(CURLOPT_SSL_VERIFYPEER, 0L); curl_handle.set_opt(CURLOPT_SSL_VERIFYSTATUS, 0L); @@ -350,14 +350,14 @@ namespace ccf CURLOPT_SSLCERT_BLOB, cert.data(), cert.size()); curl_handle.set_opt(CURLOPT_SSLCERTTYPE, "PEM"); - auto privkey_pem = node_state->node_sign_kp->private_key_pem(); + auto privkey_pem = node_state_->node_sign_kp->private_key_pem(); curl_handle.set_blob_opt( CURLOPT_SSLKEY_BLOB, privkey_pem.data(), privkey_pem.size()); curl_handle.set_opt(CURLOPT_SSLKEYTYPE, "PEM"); auto url = fmt::format( "https://{}/{}/self_healing_open/timeout", - node_state->config.network.rpc_interfaces.at("primary_rpc_interface") + node_state_->config.network.rpc_interfaces.at("primary_rpc_interface") .published_address, get_actor_prefix(ActorsType::nodes)); From 9e7d6e0d533fb6551b46779942edb71cc757c03a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 14:36:33 +0100 Subject: [PATCH 171/197] Update doc/host_config_schema/cchost_config.json Co-authored-by: Eddy Ashton --- doc/host_config_schema/cchost_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index 3895c3c11466..3287d452638c 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -431,7 
+431,7 @@ "retry_timeout": { "type": "string", "default": "100ms", - "description": "Interval (time string) at which the node re-sends self-healing-open messages. This should be leass than 'timeout'" + "description": "Interval (time string) at which the node re-sends self-healing-open messages. This should be less than 'timeout'" }, "timeout": { "type": "string", From f01931a6c2ea6bc54782dbe15d448c40feb9888e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 14:36:49 +0100 Subject: [PATCH 172/197] Update doc/operations/recovery.rst Co-authored-by: Eddy Ashton --- doc/operations/recovery.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/operations/recovery.rst b/doc/operations/recovery.rst index d588b95c6729..edfebb0a727c 100644 --- a/doc/operations/recovery.rst +++ b/doc/operations/recovery.rst @@ -148,7 +148,7 @@ Which of these two paths is taken is noted in the `public:ccf.internal.last_reco Self-Healing-Open recovery -------------------------- -In environments with limited orchestration or limited operator access, it is desirable to allow a limited disaster recovery without operator intervention. +In environments with limited orchestration or limited operator access, it is desirable to allow an automated disaster recovery without operator intervention. At a high level, Self-Healing-Open recovery allows recovering replicas to discover which replica has the most up-to-date ledger and automatically recover the network using that ledger. There are two paths, a election path, and a very-high-availablity failover path. 
From 125a0fb991ec874412abe78b1c59592d81f50d38 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 14:37:02 +0100 Subject: [PATCH 173/197] Update src/common/configuration.h Co-authored-by: Eddy Ashton --- src/common/configuration.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/configuration.h b/src/common/configuration.h index 271f4183ecf8..27c44dd0222a 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -129,7 +129,7 @@ namespace ccf service_cert, follow_redirect); - DECLARE_JSON_TYPE(StartupConfig::Recover); + DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(StartupConfig::Recover); DECLARE_JSON_REQUIRED_FIELDS( StartupConfig::Recover, previous_service_identity); DECLARE_JSON_OPTIONAL_FIELDS( From be6edc992c7207f321b980151236feeb214b1d80 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 14:54:01 +0100 Subject: [PATCH 174/197] typoing --- include/ccf/service/tables/self_healing_open.h | 4 ++-- src/node/self_healing_open_impl.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/ccf/service/tables/self_healing_open.h b/include/ccf/service/tables/self_healing_open.h index eea4c2f0112f..f3c9f905c675 100644 --- a/include/ccf/service/tables/self_healing_open.h +++ b/include/ccf/service/tables/self_healing_open.h @@ -29,7 +29,7 @@ DECLARE_JSON_REQUIRED_FIELDS( enum class SelfHealingOpenSM { - GOSSIPPING = 0, + GOSSIPING = 0, VOTING, OPENING, // by chosen replica JOINING, // by all other replicas @@ -38,7 +38,7 @@ enum class SelfHealingOpenSM DECLARE_JSON_ENUM( SelfHealingOpenSM, - {{SelfHealingOpenSM::GOSSIPPING, "Gossipping"}, + {{SelfHealingOpenSM::GOSSIPING, "Gossiping"}, {SelfHealingOpenSM::VOTING, "Voting"}, {SelfHealingOpenSM::OPENING, "Opening"}, {SelfHealingOpenSM::JOINING, "Joining"}, diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 36999a8d5024..9256c4647201 100644 --- a/src/node/self_healing_open_impl.cpp +++ 
b/src/node/self_healing_open_impl.cpp @@ -38,9 +38,9 @@ namespace ccf LOG_INFO_FMT("Starting self-healing-open"); tx.rw(node_state->network.self_healing_open_sm_state) - ->put(SelfHealingOpenSM::GOSSIPPING); + ->put(SelfHealingOpenSM::GOSSIPING); tx.rw(node_state->network.self_healing_open_timeout_sm_state) - ->put(SelfHealingOpenSM::GOSSIPPING); + ->put(SelfHealingOpenSM::GOSSIPING); start_message_retry_timers(); start_failover_timers(); @@ -74,7 +74,7 @@ namespace ccf // Advance self-healing-open SM switch (sm_state) { - case SelfHealingOpenSM::GOSSIPPING: + case SelfHealingOpenSM::GOSSIPING: { auto* gossip_handle = tx.ro(node_state->network.self_healing_open_gossip); @@ -197,7 +197,7 @@ namespace ccf { switch (timeout_state) { - case SelfHealingOpenSM::GOSSIPPING: + case SelfHealingOpenSM::GOSSIPING: LOG_TRACE_FMT("Advancing timeout SM to VOTING"); timeout_state_handle->put(SelfHealingOpenSM::VOTING); break; @@ -250,7 +250,7 @@ namespace ccf switch (sm_state) { - case SelfHealingOpenSM::GOSSIPPING: + case SelfHealingOpenSM::GOSSIPING: msg->data.self.send_gossip_unsafe(); break; case SelfHealingOpenSM::VOTING: From 7ee3d5bd4f8c8ec85e2c4a90cae4446c5b4498bb Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 15:04:21 +0100 Subject: [PATCH 175/197] config snags --- doc/host_config_schema/cchost_config.json | 4 ++-- include/ccf/node/startup_config.h | 2 +- include/ccf/service/tables/self_healing_open.h | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/host_config_schema/cchost_config.json b/doc/host_config_schema/cchost_config.json index 3287d452638c..c00e33de3097 100644 --- a/doc/host_config_schema/cchost_config.json +++ b/doc/host_config_schema/cchost_config.json @@ -431,9 +431,9 @@ "retry_timeout": { "type": "string", "default": "100ms", - "description": "Interval (time string) at which the node re-sends self-healing-open messages. 
This should be less than 'timeout'" + "description": "Interval (time string) at which the node re-sends self-healing-open messages. This should be significantly less than 'failover_timeout'" }, - "timeout": { + "failover_timeout": { "type": "string", "default": "2000ms", "description": "Interval (time string) after which the node forcibly advances to the next phase of the self-healing-open protocol" diff --git a/include/ccf/node/startup_config.h b/include/ccf/node/startup_config.h index 3eed69867024..48e725b005fb 100644 --- a/include/ccf/node/startup_config.h +++ b/include/ccf/node/startup_config.h @@ -106,7 +106,7 @@ namespace ccf { std::vector addresses; ccf::ds::TimeString retry_timeout = {"100ms"}; - ccf::ds::TimeString timeout = {"2000ms"}; + ccf::ds::TimeString failover_timeout = {"2000ms"}; bool operator==(const SelfHealingOpenConfig&) const = default; }; diff --git a/include/ccf/service/tables/self_healing_open.h b/include/ccf/service/tables/self_healing_open.h index f3c9f905c675..fc00fc291606 100644 --- a/include/ccf/service/tables/self_healing_open.h +++ b/include/ccf/service/tables/self_healing_open.h @@ -9,7 +9,7 @@ using IntrinsicIdentifier = std::string; -struct SelfHealingOpenNodeInfo_t +struct SelfHealingOpenNodeInfo { ccf::QuoteInfo quote_info; std::string published_network_address; @@ -18,9 +18,9 @@ struct SelfHealingOpenNodeInfo_t IntrinsicIdentifier intrinsic_id; }; -DECLARE_JSON_TYPE(SelfHealingOpenNodeInfo_t); +DECLARE_JSON_TYPE(SelfHealingOpenNodeInfo); DECLARE_JSON_REQUIRED_FIELDS( - SelfHealingOpenNodeInfo_t, + SelfHealingOpenNodeInfo, quote_info, published_network_address, cert_der, @@ -46,8 +46,8 @@ DECLARE_JSON_ENUM( namespace ccf { - using SelfHealingOpenNodeInfo = - ServiceMap; + using SelfHealingOpenNodeInfoMap = + ServiceMap; using SelfHealingOpenGossips = ServiceMap; using SelfHealingOpenChosenReplica = ServiceValue; From ccb43b7f41064d52dbecb5c6e169d0240fb0a85b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 
15:04:31 +0100 Subject: [PATCH 176/197] inline restarter --- src/host/handle_ring_buffer.h | 5 +-- src/host/self_healing_open.h | 63 ----------------------------------- 2 files changed, 3 insertions(+), 65 deletions(-) delete mode 100644 src/host/self_healing_open.h diff --git a/src/host/handle_ring_buffer.h b/src/host/handle_ring_buffer.h index 8aee28cc1cba..bed88891b9aa 100644 --- a/src/host/handle_ring_buffer.h +++ b/src/host/handle_ring_buffer.h @@ -6,7 +6,6 @@ #include "../enclave/interface.h" #include "ds/internal_logger.h" #include "ds/non_blocking.h" -#include "self_healing_open.h" #include "timer.h" #include @@ -58,7 +57,9 @@ namespace asynchost DISPATCHER_SET_MESSAGE_HANDLER( bp, AdminMessage::restart, [&](const uint8_t*, size_t) { - ccf::SelfHealingOpenRBHandlerSingleton::instance()->trigger_restart(); + LOG_INFO_FMT("Received request to restart enclave, sending stops"); + auto to_enclave = nbwf.create_writer_to_inside(); + RINGBUFFER_WRITE_MESSAGE(AdminMessage::stop, to_enclave); }); } diff --git a/src/host/self_healing_open.h b/src/host/self_healing_open.h deleted file mode 100644 index 350cf511f311..000000000000 --- a/src/host/self_healing_open.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the Apache 2.0 License. 
-#pragma once - -#include "../enclave/interface.h" -#include "ds/ring_buffer_types.h" - -#include -#include -#include -#include -namespace ccf -{ - class SelfHealingOpenRBHandler - { - public: - ringbuffer::WriterPtr to_enclave; - - SelfHealingOpenRBHandler( - ringbuffer::AbstractWriterFactory& writer_factory) : - to_enclave(writer_factory.create_writer_to_inside()) - {} - - void trigger_restart() - { - LOG_INFO_FMT("Received request to restart enclave, sending stops"); - RINGBUFFER_WRITE_MESSAGE(AdminMessage::stop, to_enclave); - } - }; - - class SelfHealingOpenRBHandlerSingleton - { - private: - static std::unique_ptr& instance_unsafe() - { - static std::unique_ptr instance = nullptr; - return instance; - } - - public: - static std::unique_ptr& instance() - { - auto& instance = instance_unsafe(); - if (instance == nullptr) - { - throw std::logic_error( - "SelfHealingOpenSingleton instance not initialized"); - } - return instance; - } - - static void initialise(ringbuffer::AbstractWriterFactory& writer_factory) - { - auto& instance = instance_unsafe(); - if (instance != nullptr) - { - throw std::logic_error( - "SelfHealingOpenSingleton instance already initialized"); - } - instance = std::make_unique(writer_factory); - } - }; -} \ No newline at end of file From af6757f8770c0034f62bc981d8264b7bb7d8c5f2 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 15:17:35 +0100 Subject: [PATCH 177/197] Refactoring --- src/node/node_state.h | 6 +-- src/node/rpc/node_frontend.h | 1 - src/node/rpc/node_interface.h | 2 +- src/node/rpc/node_operation.h | 2 +- src/node/rpc/node_operation_interface.h | 2 +- src/node/rpc/test/node_stub.h | 2 +- src/node/self_healing_open_impl.cpp | 20 ++++----- src/node/self_healing_open_impl.h | 52 ++++++++++++++++++++--- src/node/self_healing_open_types.h | 56 ------------------------- 9 files changed, 62 insertions(+), 81 deletions(-) delete mode 100644 src/node/self_healing_open_types.h diff --git a/src/node/node_state.h 
b/src/node/node_state.h index 5a8be5a6347c..f29e11865661 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -88,7 +88,7 @@ namespace ccf class NodeState : public AbstractNodeState { - friend class SelfHealingOpenService; + friend class SelfHealingOpenSubSystem; private: // @@ -237,7 +237,7 @@ namespace ccf last_recovered_signed_idx = last_recovered_idx; } - SelfHealingOpenService self_healing_open_impl; + SelfHealingOpenSubSystem self_healing_open_impl; public: NodeState( @@ -3007,7 +3007,7 @@ namespace ccf return writer_factory; } - SelfHealingOpenService& self_healing_open() override + SelfHealingOpenSubSystem& self_healing_open() override { return self_healing_open_impl; } diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index b3ec438bf3c6..299b4240ca2f 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -25,7 +25,6 @@ #include "node/rpc/no_create_tx_claims_digest.cpp" #include "node/rpc/serialization.h" #include "node/self_healing_open_impl.h" -#include "node/self_healing_open_types.h" #include "node/session_metrics.h" #include "node_interface.h" #include "service/internal_tables_access.h" diff --git a/src/node/rpc/node_interface.h b/src/node/rpc/node_interface.h index 6ce2714a99e3..e56ea19fd2a4 100644 --- a/src/node/rpc/node_interface.h +++ b/src/node/rpc/node_interface.h @@ -65,7 +65,7 @@ namespace ccf virtual size_t get_jwt_attempts() = 0; virtual ccf::crypto::Pem get_self_signed_certificate() = 0; virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; - virtual SelfHealingOpenService& self_healing_open() = 0; + virtual SelfHealingOpenSubsystem& self_healing_open() = 0; virtual const ccf::StartupConfig& get_node_config() const = 0; virtual ccf::crypto::Pem get_network_cert() = 0; virtual void stop_notice() = 0; diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index b3d3ad495980..f11bef5f2616 100644 --- a/src/node/rpc/node_operation.h +++ 
b/src/node/rpc/node_operation.h @@ -111,7 +111,7 @@ namespace ccf return impl.get_cose_signatures_config(); } - SelfHealingOpenService& self_healing_open() override + SelfHealingOpenSubSystem& self_healing_open() override { return impl.self_healing_open(); } diff --git a/src/node/rpc/node_operation_interface.h b/src/node/rpc/node_operation_interface.h index f92e77994559..72bdd87b30d8 100644 --- a/src/node/rpc/node_operation_interface.h +++ b/src/node/rpc/node_operation_interface.h @@ -62,6 +62,6 @@ namespace ccf virtual const ccf::COSESignaturesConfig& get_cose_signatures_config() = 0; - virtual SelfHealingOpenService& self_healing_open() = 0; + virtual SelfHealingOpenSubsystem& self_healing_open() = 0; }; } \ No newline at end of file diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index fe89f21f4778..5e43226545c1 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -112,7 +112,7 @@ namespace ccf return cose_signatures_config; } - SelfHealingOpenService& self_healing_open() override + SelfHealingOpenSubSystem& self_healing_open() override { throw std::logic_error("Unimplemented"); } diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 9256c4647201..d182c0c6d1f0 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -11,7 +11,7 @@ namespace ccf { - SelfHealingOpenService::SelfHealingOpenService(NodeState* node_state_) : + SelfHealingOpenSubSystem::SelfHealingOpenSubSystem(NodeState* node_state_) : node_state(node_state_) {} @@ -29,9 +29,7 @@ namespace ccf auto& config = node_state->config.recover.self_healing_open; if (!recovering || !config.has_value()) { - LOG_INFO_FMT( - "Not recovering or self-healing-open not configured, skipping " - "self-healing-open"); + LOG_INFO_FMT("Skipping self-healing-open"); return; } @@ -46,7 +44,7 @@ namespace ccf start_failover_timers(); } - void SelfHealingOpenService::advance(ccf::kv::Tx& tx, bool 
timeout) + void SelfHealingOpenSubSystem::advance(ccf::kv::Tx& tx, bool timeout) { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -214,7 +212,7 @@ namespace ccf } } - void SelfHealingOpenService::start_message_retry_timers() + void SelfHealingOpenSubSystem::start_message_retry_timers() { LOG_TRACE_FMT("Self-healing-open: Setting up retry timers"); auto retry_timer_msg = std::make_unique<::threading::Tmsg>( @@ -298,7 +296,7 @@ namespace ccf threading::get_current_thread_id(), std::move(retry_timer_msg)); } - void SelfHealingOpenService::start_failover_timers() + void SelfHealingOpenSubSystem::start_failover_timers() { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -450,7 +448,7 @@ namespace ccf std::move(curl_request)); } - self_healing_open::RequestNodeInfo SelfHealingOpenService::make_node_info() + self_healing_open::RequestNodeInfo SelfHealingOpenSubSystem::make_node_info() { return { .quote_info = node_state->quote_info, @@ -464,7 +462,7 @@ namespace ccf }; } - void SelfHealingOpenService::send_gossip_unsafe() + void SelfHealingOpenSubSystem::send_gossip_unsafe() { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -491,7 +489,7 @@ namespace ccf } } - void SelfHealingOpenService::send_vote_unsafe( + void SelfHealingOpenSubSystem::send_vote_unsafe( const SelfHealingOpenNodeInfo_t& node_info) { auto& config = node_state->config.recover.self_healing_open; @@ -517,7 +515,7 @@ namespace ccf node_state->node_sign_kp->private_key_pem()); } - void SelfHealingOpenService::send_iamopen_unsafe() + void SelfHealingOpenSubSystem::send_iamopen_unsafe() { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 350ba7f9b648..ab27d53aff94 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -5,12 +5,52 @@ 
#include "ccf/node/startup_config.h" #include "ccf/service/tables/self_healing_open.h" #include "ccf/tx.h" -#include "self_healing_open_types.h" + +namespace ccf::self_healing_open +{ + struct RequestNodeInfo + { + QuoteInfo quote_info; + std::string published_network_address; + std::string intrinsic_id; + std::string service_identity; + }; + DECLARE_JSON_TYPE(RequestNodeInfo); + DECLARE_JSON_REQUIRED_FIELDS( + RequestNodeInfo, + quote_info, + published_network_address, + intrinsic_id, + service_identity); + + struct GossipRequest + { + RequestNodeInfo info; + ccf::kv::Version txid; + }; + DECLARE_JSON_TYPE(GossipRequest); + DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid, info); + + struct VoteRequest + { + RequestNodeInfo info; + }; + DECLARE_JSON_TYPE(VoteRequest); + DECLARE_JSON_REQUIRED_FIELDS(VoteRequest, info); + + struct IAmOpenRequest + { + RequestNodeInfo info; + }; + DECLARE_JSON_TYPE(IAmOpenRequest); + DECLARE_JSON_REQUIRED_FIELDS(IAmOpenRequest, info); + +} namespace ccf { class NodeState; - class SelfHealingOpenService + class SelfHealingOpenSubSystem { private: // SelfHealingOpenService is solely owned by NodeState, and all tasks should @@ -18,15 +58,15 @@ namespace ccf NodeState* node_state; public: - SelfHealingOpenService(NodeState* node_state); + SelfHealingOpenSubSystem(NodeState* node_state); void try_start(ccf::kv::Tx& tx, bool recovering); void advance(ccf::kv::Tx& tx, bool timeout); private: struct SHOMsg { - SHOMsg(SelfHealingOpenService& self_) : self(self_) {} - SelfHealingOpenService& self; + SHOMsg(SelfHealingOpenSubSystem& self_) : self(self_) {} + SelfHealingOpenSubSystem& self; }; // Start path @@ -36,7 +76,7 @@ namespace ccf // Steady state operations self_healing_open::RequestNodeInfo make_node_info(); void send_gossip_unsafe(); - void send_vote_unsafe(const SelfHealingOpenNodeInfo_t&); + void send_vote_unsafe(const SelfHealingOpenNodeInfo&); void send_iamopen_unsafe(); }; } \ No newline at end of file diff --git 
a/src/node/self_healing_open_types.h b/src/node/self_healing_open_types.h deleted file mode 100644 index 3bc99c225650..000000000000 --- a/src/node/self_healing_open_types.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the Apache 2.0 License. - -#pragma once - -#include "ccf/crypto/pem.h" -#include "ccf/ds/json.h" -#include "ccf/ds/quote_info.h" -#include "ccf/kv/version.h" -#include "ds/actors.h" -#include "http/curl.h" - -#include -#include -#include - -namespace ccf::self_healing_open -{ - struct RequestNodeInfo - { - QuoteInfo quote_info; - std::string published_network_address; - std::string intrinsic_id; - std::string service_identity; - }; - DECLARE_JSON_TYPE(RequestNodeInfo); - DECLARE_JSON_REQUIRED_FIELDS( - RequestNodeInfo, - quote_info, - published_network_address, - intrinsic_id, - service_identity); - - struct GossipRequest - { - RequestNodeInfo info; - ccf::kv::Version txid; - }; - DECLARE_JSON_TYPE(GossipRequest); - DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid, info); - - struct VoteRequest - { - RequestNodeInfo info; - }; - DECLARE_JSON_TYPE(VoteRequest); - DECLARE_JSON_REQUIRED_FIELDS(VoteRequest, info); - - struct IAmOpenRequest - { - RequestNodeInfo info; - }; - DECLARE_JSON_TYPE(IAmOpenRequest); - DECLARE_JSON_REQUIRED_FIELDS(IAmOpenRequest, info); - -} \ No newline at end of file From a83a44e94cc32e4e2a7b745cb9895378d21b8fa1 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 15:29:22 +0100 Subject: [PATCH 178/197] Don't use network.tables anymore --- src/node/rpc/node_frontend.h | 102 +++++++++++++++------------- src/node/self_healing_open_impl.cpp | 73 +++++++++++--------- src/service/network_tables.h | 16 ----- 3 files changed, 96 insertions(+), 95 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 299b4240ca2f..397820f41e6f 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -449,7 +449,8 @@ 
namespace ccf // Validating that we haven't heard from this node before, of if we have // that the cert hasn't changed - auto* node_info_handle = tx.rw(this->network.self_healing_open_node_info); + auto* node_info_handle = + tx.rw(Tables::SELF_HEALING_OPEN_NODES); auto existing_node_info = node_info_handle->get(in.intrinsic_id); if (existing_node_info.has_value()) @@ -2283,9 +2284,9 @@ namespace ccf LOG_TRACE_FMT( "Self-healing-open: recieve gossip from {}", in.info.intrinsic_id); - auto chosen_replica = - args.tx.ro(this->network.self_healing_open_chosen_replica); - // This freezes the gossips at the point where it votes + // Stop accepting gossips once a node has voted + auto chosen_replica = args.tx.template ro( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); if (chosen_replica->get().has_value()) { return make_error( @@ -2294,7 +2295,8 @@ namespace ccf "This replica has already voted"); } - auto gossip_handle = args.tx.rw(this->network.self_healing_open_gossip); + auto gossip_handle = args.tx.template rw( + Tables::SELF_HEALING_OPEN_GOSSIPS); if (gossip_handle->get(in.info.intrinsic_id).has_value()) { LOG_INFO_FMT( @@ -2356,7 +2358,8 @@ namespace ccf LOG_TRACE_FMT( "Self-healing-open: recieve vote from {}", in.info.intrinsic_id); - args.tx.rw(this->network.self_healing_open_votes) + args.tx + .template rw(Tables::SELF_HEALING_OPEN_VOTES) ->insert(in.info.intrinsic_id); try @@ -2387,52 +2390,55 @@ namespace ccf .set_openapi_hidden(true) .install(); - auto self_healing_open_iamopen = - [this](auto& args, const nlohmann::json& params) { - auto config = - this->context.get_subsystem(); - if ( - config == nullptr || - !config->get().node_config.recover.self_healing_open.has_value()) - { - return make_error( - HTTP_STATUS_BAD_REQUEST, - ccf::errors::InvalidNodeState, - "Unable to get self-healing-open configuration"); - } + auto self_healing_open_iamopen = [this]( + auto& args, + const nlohmann::json& params) { + auto config = this->context.get_subsystem(); + if ( + 
config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) + { + return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Unable to get self-healing-open configuration"); + } - auto in = params.get(); - auto valid = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (valid.has_value()) - { - auto [code, message] = valid.value(); - return make_error(code, ccf::errors::InvalidQuote, message); - } + auto in = params.get(); + auto valid = self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (valid.has_value()) + { + auto [code, message] = valid.value(); + return make_error(code, ccf::errors::InvalidQuote, message); + } - args.tx.rw(this->network.self_healing_open_sm_state) - ->put(SelfHealingOpenSM::JOINING); - args.tx.rw(this->network.self_healing_open_chosen_replica) - ->put(in.info.intrinsic_id); + args.tx + .template rw(Tables::SELF_HEALING_OPEN_SM_STATE) + ->put(SelfHealingOpenSM::JOINING); + args.tx + .template rw( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + ->put(in.info.intrinsic_id); - try - { - this->node_operation.self_healing_open().advance(args.tx, false); - } - catch (const std::logic_error& e) - { - LOG_FAIL_FMT( - "Self-healing-open gossip failed to advance state: {}", e.what()); - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - fmt::format( - "Failed to advance self-healing-open state: {}", e.what())); - } + try + { + this->node_operation.self_healing_open().advance(args.tx, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( + "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } - return make_success(fmt::format( - "Node {} is joining self-healing-open", in.info.intrinsic_id)); - }; + return 
make_success(fmt::format( + "Node {} is joining self-healing-open", in.info.intrinsic_id)); + }; make_endpoint( "/self_healing_open/iamopen", HTTP_PUT, diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index d182c0c6d1f0..555b800ff37f 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -15,16 +15,21 @@ namespace ccf node_state(node_state_) {} - void SelfHealingOpenService::try_start(ccf::kv::Tx& tx, bool recovering) + void SelfHealingOpenSubSystem::try_start(ccf::kv::Tx& tx, bool recovering) { // Clear any previous state - tx.rw(node_state->network.self_healing_open_sm_state)->clear(); - tx.rw(node_state->network.self_healing_open_timeout_sm_state)->clear(); - tx.rw(node_state->network.self_healing_open_node_info)->clear(); - tx.rw(node_state->network.self_healing_open_gossip)->clear(); - tx.rw(node_state->network.self_healing_open_chosen_replica)->clear(); - tx.rw(node_state->network.self_healing_open_votes)->clear(); - tx.rw(node_state->network.self_healing_open_failover_flag)->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE)->clear(); + tx.rw( + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE) + ->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_NODES)->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_GOSSIPS)->clear(); + tx.rw( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + ->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_VOTES)->clear(); + tx.rw(Tables::SELF_HEALING_OPEN_FAILOVER_FLAG) + ->clear(); auto& config = node_state->config.recover.self_healing_open; if (!recovering || !config.has_value()) @@ -35,9 +40,10 @@ namespace ccf LOG_INFO_FMT("Starting self-healing-open"); - tx.rw(node_state->network.self_healing_open_sm_state) + tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE) ->put(SelfHealingOpenSM::GOSSIPING); - tx.rw(node_state->network.self_healing_open_timeout_sm_state) + tx.rw( + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE) ->put(SelfHealingOpenSM::GOSSIPING); start_message_retry_timers(); @@ -53,9 
+59,9 @@ namespace ccf } auto* sm_state_handle = - tx.rw(node_state->network.self_healing_open_sm_state); - auto* timeout_state_handle = - tx.rw(node_state->network.self_healing_open_timeout_sm_state); + tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE); + auto* timeout_state_handle = tx.rw( + Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE); auto sm_state_opt = sm_state_handle->get(); auto timeout_state_opt = timeout_state_handle->get(); @@ -75,7 +81,7 @@ namespace ccf case SelfHealingOpenSM::GOSSIPING: { auto* gossip_handle = - tx.ro(node_state->network.self_healing_open_gossip); + tx.ro(Tables::SELF_HEALING_OPEN_GOSSIPS); auto quorum_size = config->addresses.size(); if (gossip_handle->size() >= quorum_size || valid_timeout) { @@ -100,23 +106,27 @@ namespace ccf { throw std::logic_error("No valid gossip addresses provided"); } - auto* chosen_replica = - tx.rw(node_state->network.self_healing_open_chosen_replica); - chosen_replica->put(maximum->second); + tx.rw( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + ->put(maximum->second); + sm_state_handle->put(SelfHealingOpenSM::VOTING); } break; } case SelfHealingOpenSM::VOTING: { - auto* votes = tx.rw(node_state->network.self_healing_open_votes); + auto* votes = + tx.rw(Tables::SELF_HEALING_OPEN_VOTES); + auto sufficient_quorum = votes->size() >= config->addresses.size() / 2 + 1; if (sufficient_quorum || valid_timeout) { if (valid_timeout && !sufficient_quorum) { - tx.rw(node_state->network.self_healing_open_failover_flag) + tx.rw( + Tables::SELF_HEALING_OPEN_FAILOVER_FLAG) ->put(true); } if (votes->size() == 0) @@ -148,15 +158,16 @@ namespace ccf } case SelfHealingOpenSM::JOINING: { - auto chosen_replica = - tx.ro(node_state->network.self_healing_open_chosen_replica)->get(); + auto chosen_replica = tx.ro( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + ->get(); if (!chosen_replica.has_value()) { throw std::logic_error( "Self-healing-open chosen node not set, cannot join"); } auto node_config = - 
tx.ro(node_state->network.self_healing_open_node_info) + tx.ro(Tables::SELF_HEALING_OPEN_NODES) ->get(chosen_replica.value()); if (!node_config.has_value()) { @@ -228,7 +239,7 @@ namespace ccf auto tx = node_state_->network.tables->create_read_only_tx(); auto* sm_state_handle = - tx.ro(node_state_->network.self_healing_open_sm_state); + tx.ro(Tables::SELF_HEALING_OPEN_SM_STATE); auto sm_state_opt = sm_state_handle->get(); if (!sm_state_opt.has_value()) @@ -253,10 +264,10 @@ namespace ccf break; case SelfHealingOpenSM::VOTING: { - auto* node_info_handle = - tx.ro(node_state_->network.self_healing_open_node_info); - auto* chosen_replica_handle = - tx.ro(node_state_->network.self_healing_open_chosen_replica); + auto* node_info_handle = tx.ro( + Tables::SELF_HEALING_OPEN_NODES); + auto* chosen_replica_handle = tx.ro( + Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); if (!chosen_replica_handle->get().has_value()) { throw std::logic_error( @@ -322,7 +333,7 @@ namespace ccf // Stop the timer if the node has completed its self-healing-open auto tx = node_state_->network.tables->create_read_only_tx(); auto* sm_state_handle = - tx.ro(node_state_->network.self_healing_open_sm_state); + tx.ro(Tables::SELF_HEALING_OPEN_SM_STATE); if (!sm_state_handle->get().has_value()) { throw std::logic_error( @@ -373,13 +384,13 @@ namespace ccf curl::CurlmLibuvContextSingleton::get_instance()->attach_request( std::move(curl_request)); - auto delay = config->timeout; + auto delay = config->failover_timeout; ::threading::ThreadMessaging::instance().add_task_after( std::move(msg), delay); }, *this); ::threading::ThreadMessaging::instance().add_task_after( - std::move(timeout_msg), config->timeout); + std::move(timeout_msg), config->failover_timeout); } void dispatch_authenticated_message( @@ -490,7 +501,7 @@ namespace ccf } void SelfHealingOpenSubSystem::send_vote_unsafe( - const SelfHealingOpenNodeInfo_t& node_info) + const SelfHealingOpenNodeInfo& node_info) { auto& config = 
node_state->config.recover.self_healing_open; if (!config.has_value()) diff --git a/src/service/network_tables.h b/src/service/network_tables.h index 79c4806c8a2f..2f94a2b21db1 100644 --- a/src/service/network_tables.h +++ b/src/service/network_tables.h @@ -246,22 +246,6 @@ namespace ccf return std::make_tuple(signatures, serialise_tree); } - // Self-healing open tables - const SelfHealingOpenNodeInfo self_healing_open_node_info = { - Tables::SELF_HEALING_OPEN_NODES}; - const SelfHealingOpenGossips self_healing_open_gossip = { - Tables::SELF_HEALING_OPEN_GOSSIPS}; - const SelfHealingOpenChosenReplica self_healing_open_chosen_replica = { - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA}; - const SelfHealingOpenVotes self_healing_open_votes = { - Tables::SELF_HEALING_OPEN_VOTES}; - const SelfHealingOpenSMState self_healing_open_sm_state = { - Tables::SELF_HEALING_OPEN_SM_STATE}; - const SelfHealingOpenSMState self_healing_open_timeout_sm_state = { - Tables::SELF_HEALING_OPEN_TIMEOUT_SM_STATE}; - const SelfHealingOpenFailoverFlag self_healing_open_failover_flag = { - Tables::SELF_HEALING_OPEN_FAILOVER_FLAG}; - inline auto get_all_internal_tables() const { return std::tuple_cat( From 56ebb3ef122252c55cf7dfbcd568316d8a6c749b Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 15:59:25 +0100 Subject: [PATCH 179/197] Refactor and document --- doc/audit/builtin_maps.rst | 49 ++++++++++++++++++- .../ccf/service/tables/self_healing_open.h | 6 +-- src/node/rpc/node_frontend.h | 8 +-- src/node/self_healing_open_impl.cpp | 16 +++--- 4 files changed, 63 insertions(+), 16 deletions(-) diff --git a/doc/audit/builtin_maps.rst b/doc/audit/builtin_maps.rst index 82bb7fdf3be7..202fb7d30e46 100644 --- a/doc/audit/builtin_maps.rst +++ b/doc/audit/builtin_maps.rst @@ -571,4 +571,51 @@ While the contents themselves are encrypted, the table is public so as to be acc **Value** The mechanism by which the ledger secret was recovered. .. 
doxygenenum:: ccf::RecoveryType - :project: CCF \ No newline at end of file + :project: CCF + +``self_healing_open.nodes`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Key** Intrinsic node ID: A string which is unique to a particular node role within a cluster. Currently its IP and port. + +**Value** + +.. doxygenstruct:: ccf::SelfHealingOpenNodeInfo + :project: CCF + :members: + +``self_healing_open.gossip`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Key** Intrinsic node ID of the source of the gossip message. + +**Value** + +.. doxygenstruct:: ccf::SelfHealingOpenGossip + :project: CCF + :members: + +``self_healing_open.chosen_node`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Value** The intrinsic node ID of the chosen replica node. This will either be the node this node voted for, or the node that is has received an `IAmOpen` message from. + +``self_healing_open.votes`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Key** Intrinsic node ID of the node which has voted for this node to be opened. + +``selfhealingopen.sm_state`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Value** State machine state of the self-healing open protocol. + +``selfhealingopen.timeout_sm_state`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Value** Timeout state machine state of the self-healing open protocol. Ticks based on `failover_timeout` and advances `selfhealingopen.sm_state` if it falls behind. + +``selfhealingopen.failover_open`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Value** Boolean flag indicating whether the latest self-healing-open recovery opened using a failover timeout. 
\ No newline at end of file diff --git a/include/ccf/service/tables/self_healing_open.h b/include/ccf/service/tables/self_healing_open.h index fc00fc291606..038ebed2de6b 100644 --- a/include/ccf/service/tables/self_healing_open.h +++ b/include/ccf/service/tables/self_healing_open.h @@ -50,7 +50,7 @@ namespace ccf ServiceMap; using SelfHealingOpenGossips = ServiceMap; - using SelfHealingOpenChosenReplica = ServiceValue; + using SelfHealingOpenChosenNode = ServiceValue; using SelfHealingOpenVotes = ServiceSet; using SelfHealingOpenSMState = ServiceValue; using SelfHealingOpenTimeoutSMState = ServiceValue; @@ -62,8 +62,8 @@ namespace ccf "public:ccf.gov.selfhealingopen.nodes"; static constexpr auto SELF_HEALING_OPEN_GOSSIPS = "public:ccf.gov.selfhealingopen.gossip"; - static constexpr auto SELF_HEALING_OPEN_CHOSEN_REPLICA = - "public:ccf.gov.selfhealingopen.chosen_replica"; + static constexpr auto SELF_HEALING_OPEN_CHOSEN_NODE = + "public:ccf.gov.selfhealingopen.chosen_node"; static constexpr auto SELF_HEALING_OPEN_VOTES = "public:ccf.gov.selfhealingopen.votes"; static constexpr auto SELF_HEALING_OPEN_SM_STATE = diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 397820f41e6f..16079684fd20 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -2285,8 +2285,8 @@ namespace ccf "Self-healing-open: recieve gossip from {}", in.info.intrinsic_id); // Stop accepting gossips once a node has voted - auto chosen_replica = args.tx.template ro( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); + auto chosen_replica = args.tx.template ro( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE); if (chosen_replica->get().has_value()) { return make_error( @@ -2417,8 +2417,8 @@ namespace ccf .template rw(Tables::SELF_HEALING_OPEN_SM_STATE) ->put(SelfHealingOpenSM::JOINING); args.tx - .template rw( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + .template rw( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE) ->put(in.info.intrinsic_id); try diff --git 
a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 555b800ff37f..681d64ad0711 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -24,8 +24,8 @@ namespace ccf ->clear(); tx.rw(Tables::SELF_HEALING_OPEN_NODES)->clear(); tx.rw(Tables::SELF_HEALING_OPEN_GOSSIPS)->clear(); - tx.rw( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + tx.rw( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE) ->clear(); tx.rw(Tables::SELF_HEALING_OPEN_VOTES)->clear(); tx.rw(Tables::SELF_HEALING_OPEN_FAILOVER_FLAG) @@ -106,8 +106,8 @@ namespace ccf { throw std::logic_error("No valid gossip addresses provided"); } - tx.rw( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + tx.rw( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE) ->put(maximum->second); sm_state_handle->put(SelfHealingOpenSM::VOTING); @@ -158,8 +158,8 @@ namespace ccf } case SelfHealingOpenSM::JOINING: { - auto chosen_replica = tx.ro( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA) + auto chosen_replica = tx.ro( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE) ->get(); if (!chosen_replica.has_value()) { @@ -266,8 +266,8 @@ namespace ccf { auto* node_info_handle = tx.ro( Tables::SELF_HEALING_OPEN_NODES); - auto* chosen_replica_handle = tx.ro( - Tables::SELF_HEALING_OPEN_CHOSEN_REPLICA); + auto* chosen_replica_handle = tx.ro( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE); if (!chosen_replica_handle->get().has_value()) { throw std::logic_error( From cb233434705eb5d68002d6f030ce4fc7dbee5273 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 15:59:31 +0100 Subject: [PATCH 180/197] rejig --- tests/infra/clients.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/infra/clients.py b/tests/infra/clients.py index d1a3a486e2b7..ccbf267c5f7b 100644 --- a/tests/infra/clients.py +++ b/tests/infra/clients.py @@ -605,11 +605,15 @@ def request( if rc.returncode != 0: if rc.returncode in [ + # COULDNT_CONNECT, 7, + # PEER_FAILED_VERIFICATION, 35, + # SEND_ERROR, 55, + # 
SSL_CONNECT_ERROR 60, - ]: # COULDNT_CONNECT, PEER_FAILED_VERIFICATION, SEND_ERROR, SSL_CONNECT_ERROR + ]: raise CCFConnectionException if rc.returncode == 28: # OPERATION_TIMEDOUT raise TimeoutError From e977b749ea446e8cbcc35b6b0dcb6da6d7311bdb Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 17:10:05 +0100 Subject: [PATCH 181/197] de-replica-ing --- doc/audit/builtin_maps.rst | 2 +- doc/operations/recovery.rst | 2 +- include/ccf/service/tables/self_healing_open.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/audit/builtin_maps.rst b/doc/audit/builtin_maps.rst index 202fb7d30e46..7e8c0af649d0 100644 --- a/doc/audit/builtin_maps.rst +++ b/doc/audit/builtin_maps.rst @@ -598,7 +598,7 @@ While the contents themselves are encrypted, the table is public so as to be acc ``self_healing_open.chosen_node`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -**Value** The intrinsic node ID of the chosen replica node. This will either be the node this node voted for, or the node that is has received an `IAmOpen` message from. +**Value** The intrinsic node ID of the chosen node. This will either be the node this node voted for, or the node that is has received an `IAmOpen` message from. ``self_healing_open.votes`` ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/operations/recovery.rst b/doc/operations/recovery.rst index edfebb0a727c..dacb54d72b98 100644 --- a/doc/operations/recovery.rst +++ b/doc/operations/recovery.rst @@ -149,7 +149,7 @@ Self-Healing-Open recovery -------------------------- In environments with limited orchestration or limited operator access, it is desirable to allow an automated disaster recovery without operator intervention. -At a high level, Self-Healing-Open recovery allows recovering replicas to discover which replica has the most up-to-date ledger and automatically recover the network using that ledger. 
+At a high level, Self-Healing-Open recovery allows recovering replicas to discover which node has the most up-to-date ledger and automatically recover the network using that ledger. There are two paths, a election path, and a very-high-availablity failover path. The election path ensures that if: all nodes restart and have full network connectivity, a majority of nodes' on-disk ledger contains every committed transaction, and no timeouts trigger; then there will be only one recovered network, then all committed transaction will be persisted. diff --git a/include/ccf/service/tables/self_healing_open.h b/include/ccf/service/tables/self_healing_open.h index 038ebed2de6b..e7e112af9ccd 100644 --- a/include/ccf/service/tables/self_healing_open.h +++ b/include/ccf/service/tables/self_healing_open.h @@ -31,7 +31,7 @@ enum class SelfHealingOpenSM { GOSSIPING = 0, VOTING, - OPENING, // by chosen replica + OPENING, // by chosen node JOINING, // by all other replicas OPEN, }; From c3a8a46136fe9a3abc9bac478dc872669a4e7129 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 17:10:20 +0100 Subject: [PATCH 182/197] improved error messages --- src/node/rpc/node_frontend.h | 233 +++++++++++++++++------------------ 1 file changed, 116 insertions(+), 117 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 16079684fd20..6e458169f6f5 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -420,7 +420,7 @@ namespace ccf } } - std::optional> + std::optional self_healing_open_validate_and_store_node_info( endpoints::EndpointContext& args, ccf::kv::Tx& tx, @@ -440,7 +440,7 @@ namespace ccf in.intrinsic_id, code, message); - return std::make_tuple(code, message); + return make_error(code, ccf::errors::InvalidQuote, message); } LOG_TRACE_FMT( @@ -458,19 +458,18 @@ namespace ccf // If we have seen this node before, check that the cert is the same if (existing_node_info->cert_der != cert_der) { - LOG_FAIL_FMT( + auto 
message = fmt::format( "Self-healing-open message from intrinsic id {} is invalid: " "certificate has changed", in.intrinsic_id); - return std::make_tuple( - HTTP_STATUS_BAD_REQUEST, - "Self-healing-open message from intrinsic id is invalid: " - "certificate has changed"); + LOG_FAIL_FMT("{}", message); + return make_error( + HTTP_STATUS_BAD_REQUEST, ccf::errors::NodeAlreadyExists, message); } } else { - SelfHealingOpenNodeInfo_t src_info{ + SelfHealingOpenNodeInfo src_info{ .quote_info = in.quote_info, .published_network_address = in.published_network_address, .cert_der = cert_der, @@ -2258,72 +2257,73 @@ namespace ccf .set_openapi_hidden(true) .install(); - auto self_healing_open_gossip = [this]( - auto& args, - const nlohmann::json& params) { - auto config = this->context.get_subsystem(); - if ( - config == nullptr || - !config->get().node_config.recover.self_healing_open.has_value()) - { - return make_error( - HTTP_STATUS_BAD_REQUEST, - ccf::errors::InvalidNodeState, - "Unable to get self-healing-open configuration"); - } + auto self_healing_open_gossip = + [this](auto& args, const nlohmann::json& params) { + auto config = + this->context.get_subsystem(); + if ( + config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) + { + return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Unable to get self-healing-open configuration"); + } - auto in = params.get(); - auto is_invalid = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (is_invalid.has_value()) - { - auto [code, message] = is_invalid.value(); - return make_error(code, ccf::errors::InvalidQuote, message); - } + auto in = params.get(); + auto validation_result = self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (validation_result.has_value()) + { + return validation_result.value(); + } - LOG_TRACE_FMT( - "Self-healing-open: recieve gossip from {}", in.info.intrinsic_id); + LOG_TRACE_FMT( + 
"Self-healing-open: recieve gossip from {}", in.info.intrinsic_id); - // Stop accepting gossips once a node has voted - auto chosen_replica = args.tx.template ro( - Tables::SELF_HEALING_OPEN_CHOSEN_NODE); - if (chosen_replica->get().has_value()) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - "This replica has already voted"); - } + // Stop accepting gossips once a node has voted + auto chosen_replica = args.tx.template ro( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE); + if (chosen_replica->get().has_value()) + { + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "This node has already voted for {}", + chosen_replica->get().value())); + } - auto gossip_handle = args.tx.template rw( - Tables::SELF_HEALING_OPEN_GOSSIPS); - if (gossip_handle->get(in.info.intrinsic_id).has_value()) - { - LOG_INFO_FMT( - "Node {} already gossiped, skipping", in.info.intrinsic_id); - return make_success( - fmt::format("Node {} already gossiped", in.info.intrinsic_id)); - } - gossip_handle->put(in.info.intrinsic_id, in.txid); + auto gossip_handle = args.tx.template rw( + Tables::SELF_HEALING_OPEN_GOSSIPS); + if (gossip_handle->get(in.info.intrinsic_id).has_value()) + { + LOG_INFO_FMT( + "Node {} already gossiped, skipping", in.info.intrinsic_id); + return make_success( + fmt::format("Node {} already gossiped", in.info.intrinsic_id)); + } + gossip_handle->put(in.info.intrinsic_id, in.txid); - try - { - this->node_operation.self_healing_open().advance(args.tx, false); - } - catch (const std::logic_error& e) - { - LOG_FAIL_FMT( - "Self-healing-open gossip failed to advance state: {}", e.what()); - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - fmt::format( - "Failed to advance self-healing-open state: {}", e.what())); - } + try + { + this->node_operation.self_healing_open().advance(args.tx, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( 
+ "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } - return make_success(fmt::format( - "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); - }; + return make_success(fmt::format( + "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); + }; make_endpoint( "/self_healing_open/gossip", HTTP_PUT, @@ -2348,12 +2348,11 @@ namespace ccf } auto in = params.get(); - auto valid = self_healing_open_validate_and_store_node_info( + auto validation_result = self_healing_open_validate_and_store_node_info( args, args.tx, in.info); - if (valid.has_value()) + if (validation_result.has_value()) { - auto [code, message] = valid.value(); - return make_error(code, ccf::errors::InvalidQuote, message); + return validation_result.value(); } LOG_TRACE_FMT( "Self-healing-open: recieve vote from {}", in.info.intrinsic_id); @@ -2390,55 +2389,55 @@ namespace ccf .set_openapi_hidden(true) .install(); - auto self_healing_open_iamopen = [this]( - auto& args, - const nlohmann::json& params) { - auto config = this->context.get_subsystem(); - if ( - config == nullptr || - !config->get().node_config.recover.self_healing_open.has_value()) - { - return make_error( - HTTP_STATUS_BAD_REQUEST, - ccf::errors::InvalidNodeState, - "Unable to get self-healing-open configuration"); - } + auto self_healing_open_iamopen = + [this](auto& args, const nlohmann::json& params) { + auto config = + this->context.get_subsystem(); + if ( + config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) + { + return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Unable to get self-healing-open configuration"); + } - auto in = params.get(); - auto valid = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (valid.has_value()) - { - 
auto [code, message] = valid.value(); - return make_error(code, ccf::errors::InvalidQuote, message); - } + auto in = params.get(); + auto validation_result = self_healing_open_validate_and_store_node_info( + args, args.tx, in.info); + if (validation_result.has_value()) + { + return validation_result.value(); + } - args.tx - .template rw(Tables::SELF_HEALING_OPEN_SM_STATE) - ->put(SelfHealingOpenSM::JOINING); - args.tx - .template rw( - Tables::SELF_HEALING_OPEN_CHOSEN_NODE) - ->put(in.info.intrinsic_id); + args.tx + .template rw( + Tables::SELF_HEALING_OPEN_SM_STATE) + ->put(SelfHealingOpenSM::JOINING); + args.tx + .template rw( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE) + ->put(in.info.intrinsic_id); - try - { - this->node_operation.self_healing_open().advance(args.tx, false); - } - catch (const std::logic_error& e) - { - LOG_FAIL_FMT( - "Self-healing-open gossip failed to advance state: {}", e.what()); - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - fmt::format( - "Failed to advance self-healing-open state: {}", e.what())); - } + try + { + this->node_operation.self_healing_open().advance(args.tx, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( + "Self-healing-open gossip failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } - return make_success(fmt::format( - "Node {} is joining self-healing-open", in.info.intrinsic_id)); - }; + return make_success(fmt::format( + "Node {} is joining self-healing-open", in.info.intrinsic_id)); + }; make_endpoint( "/self_healing_open/iamopen", HTTP_PUT, From d6827c91be7c15ac24e1a46561fcfebdc6595559 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 18:26:48 +0100 Subject: [PATCH 183/197] Refactor node_frontend --- src/common/configuration.h | 2 +- src/host/run.cpp | 3 - src/node/node_state.h | 6 +- 
src/node/rpc/node_frontend.h | 329 ++++++++++++++-------------- src/node/rpc/node_operation.h | 2 +- src/node/rpc/test/node_stub.h | 2 +- src/node/self_healing_open_impl.cpp | 29 ++- src/node/self_healing_open_impl.h | 33 ++- 8 files changed, 203 insertions(+), 203 deletions(-) diff --git a/src/common/configuration.h b/src/common/configuration.h index 27c44dd0222a..355d5d3ce5b6 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -115,7 +115,7 @@ namespace ccf DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(SelfHealingOpenConfig); DECLARE_JSON_REQUIRED_FIELDS(SelfHealingOpenConfig, addresses); - DECLARE_JSON_OPTIONAL_FIELDS(SelfHealingOpenConfig, retry_timeout, timeout); + DECLARE_JSON_OPTIONAL_FIELDS(SelfHealingOpenConfig, retry_timeout, failover_timeout); DECLARE_JSON_TYPE(StartupConfig::Start); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/host/run.cpp b/src/host/run.cpp index 6c11a268023d..00f7e6c3b3b5 100644 --- a/src/host/run.cpp +++ b/src/host/run.cpp @@ -30,7 +30,6 @@ #include "enclave/entry_points.h" #include "handle_ring_buffer.h" #include "host/env.h" -#include "host/self_healing_open.h" #include "http/curl.h" #include "json_schema.h" #include "lfs_file_handler.h" @@ -525,8 +524,6 @@ namespace ccf auto curl_libuv_context = curl::CurlmLibuvContextSingleton(uv_default_loop()); - ccf::SelfHealingOpenRBHandlerSingleton::initialise(writer_factory); - ResolvedAddresses resolved_rpc_addresses; for (auto& [name, interface] : config.network.rpc_interfaces) { diff --git a/src/node/node_state.h b/src/node/node_state.h index f29e11865661..c633d4ab439b 100644 --- a/src/node/node_state.h +++ b/src/node/node_state.h @@ -88,7 +88,7 @@ namespace ccf class NodeState : public AbstractNodeState { - friend class SelfHealingOpenSubSystem; + friend class SelfHealingOpenSubsystem; private: // @@ -237,7 +237,7 @@ namespace ccf last_recovered_signed_idx = last_recovered_idx; } - SelfHealingOpenSubSystem self_healing_open_impl; + SelfHealingOpenSubsystem 
self_healing_open_impl; public: NodeState( @@ -3007,7 +3007,7 @@ namespace ccf return writer_factory; } - SelfHealingOpenSubSystem& self_healing_open() override + SelfHealingOpenSubsystem& self_healing_open() override { return self_healing_open_impl; } diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 6e458169f6f5..9595489b7d12 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -4,6 +4,7 @@ #include "ccf/common_auth_policies.h" #include "ccf/common_endpoint_registry.h" +#include "ccf/endpoint_context.h" #include "ccf/http_query.h" #include "ccf/js/core/context.h" #include "ccf/json_handler.h" @@ -420,6 +421,116 @@ namespace ccf } } + template + using SelfHealingOpenHandler = std::function( + endpoints::EndpointContext& args, In& in)>; + + template + HandlerJsonParamsAndForward wrap_self_healing_open( + SelfHealingOpenHandler cb) + { + return [cb = std::move(cb), this]( + endpoints::EndpointContext& args, const nlohmann::json& params) { + auto config = this->context.get_subsystem(); + if ( + config == nullptr || + !config->get().node_config.recover.self_healing_open.has_value()) + { + return make_error( + HTTP_STATUS_BAD_REQUEST, + ccf::errors::InvalidNodeState, + "Unable to get self-healing-open configuration"); + } + + auto in = params.get(); + self_healing_open::RequestNodeInfo info = in.info; + + // ---- Validate the quote and store the node info ---- + + auto cert_der = ccf::crypto::public_key_der_from_cert( + args.rpc_ctx->get_session_context()->caller_cert); + + pal::PlatformAttestationMeasurement measurement; + QuoteVerificationResult verify_result = + this->node_operation.verify_quote( + args.tx, info.quote_info, cert_der, measurement); + if (verify_result != QuoteVerificationResult::Verified) + { + const auto [code, message] = quote_verification_error(verify_result); + LOG_FAIL_FMT( + "Self-healing-open message from intrinsic id {} is invalid: {} " + "({})", + info.intrinsic_id, + code, + message); 
+ return make_error(code, ccf::errors::InvalidQuote, message); + } + + LOG_TRACE_FMT( + "Self-healing-open message from intrinsic id {}'s quote is valid", + info.intrinsic_id); + + // Validating that we haven't heard from this node before, of if we have + // that the cert hasn't changed + auto* node_info_handle = args.tx.rw( + Tables::SELF_HEALING_OPEN_NODES); + auto existing_node_info = node_info_handle->get(info.intrinsic_id); + + if (existing_node_info.has_value()) + { + // If we have seen this node before, check that the cert is the same + if (existing_node_info->cert_der != cert_der) + { + auto message = fmt::format( + "Self-healing-open message from intrinsic id {} is invalid: " + "certificate has changed", + info.intrinsic_id); + LOG_FAIL_FMT("{}", message); + return make_error( + HTTP_STATUS_BAD_REQUEST, ccf::errors::NodeAlreadyExists, message); + } + } + else + { + SelfHealingOpenNodeInfo src_info{ + .quote_info = info.quote_info, + .published_network_address = info.published_network_address, + .cert_der = cert_der, + .service_identity = info.service_identity, + .intrinsic_id = info.intrinsic_id}; + node_info_handle->put(info.intrinsic_id, src_info); + } + + // ---- Run callback ---- + + auto ret = cb(args, in); + if (ret.has_value()) + { + jsonhandler::JsonAdapterResponse res = ret.value(); + return res; + } + + // ---- Advance state machine ---- + + try + { + this->node_operation.self_healing_open().advance(args.tx, false); + } + catch (const std::logic_error& e) + { + LOG_FAIL_FMT( + "Self-healing-open failed to advance state: {}", e.what()); + return make_error( + HTTP_STATUS_INTERNAL_SERVER_ERROR, + ccf::errors::InternalError, + fmt::format( + "Failed to advance self-healing-open state: {}", e.what())); + } + + return make_success(); + }; + } + std::optional self_healing_open_validate_and_store_node_info( endpoints::EndpointContext& args, @@ -2258,190 +2369,90 @@ namespace ccf .install(); auto self_healing_open_gossip = - [this](auto& args, const 
nlohmann::json& params) { - auto config = - this->context.get_subsystem(); - if ( - config == nullptr || - !config->get().node_config.recover.self_healing_open.has_value()) - { - return make_error( - HTTP_STATUS_BAD_REQUEST, - ccf::errors::InvalidNodeState, - "Unable to get self-healing-open configuration"); - } - - auto in = params.get(); - auto validation_result = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (validation_result.has_value()) - { - return validation_result.value(); - } - - LOG_TRACE_FMT( - "Self-healing-open: recieve gossip from {}", in.info.intrinsic_id); - - // Stop accepting gossips once a node has voted - auto chosen_replica = args.tx.template ro( - Tables::SELF_HEALING_OPEN_CHOSEN_NODE); - if (chosen_replica->get().has_value()) - { - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - fmt::format( - "This node has already voted for {}", - chosen_replica->get().value())); - } - - auto gossip_handle = args.tx.template rw( - Tables::SELF_HEALING_OPEN_GOSSIPS); - if (gossip_handle->get(in.info.intrinsic_id).has_value()) - { - LOG_INFO_FMT( - "Node {} already gossiped, skipping", in.info.intrinsic_id); - return make_success( - fmt::format("Node {} already gossiped", in.info.intrinsic_id)); - } - gossip_handle->put(in.info.intrinsic_id, in.txid); + [this]( + auto& args, + self_healing_open::GossipRequest in) -> std::optional { + LOG_TRACE_FMT( + "Self-healing-open: recieve gossip from {}", in.info.intrinsic_id); - try - { - this->node_operation.self_healing_open().advance(args.tx, false); - } - catch (const std::logic_error& e) - { - LOG_FAIL_FMT( - "Self-healing-open gossip failed to advance state: {}", e.what()); - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - fmt::format( - "Failed to advance self-healing-open state: {}", e.what())); - } + // Stop accepting gossips once a node has voted + auto chosen_replica = args.tx.template ro( + 
Tables::SELF_HEALING_OPEN_CHOSEN_NODE); + if (chosen_replica->get().has_value()) + { + return ErrorDetails{ + .status = HTTP_STATUS_INTERNAL_SERVER_ERROR, + .code = ccf::errors::InternalError, + .msg = fmt::format( + "This node has already voted for {}", + chosen_replica->get().value())}; + } - return make_success(fmt::format( - "Node {} gossiped for self-healing-open", in.info.intrinsic_id)); - }; + auto gossip_handle = args.tx.template rw( + Tables::SELF_HEALING_OPEN_GOSSIPS); + if (gossip_handle->get(in.info.intrinsic_id).has_value()) + { + LOG_INFO_FMT( + "Node {} already gossiped, skipping", in.info.intrinsic_id); + return std::nullopt; + } + gossip_handle->put(in.info.intrinsic_id, in.txid); + return std::nullopt; + }; make_endpoint( "/self_healing_open/gossip", HTTP_PUT, - json_adapter(self_healing_open_gossip), + json_adapter(wrap_self_healing_open( + self_healing_open_gossip)), no_auth_required) .set_forwarding_required(endpoints::ForwardingRequired::Never) .set_openapi_hidden(true) .install(); auto self_healing_open_vote = - [this](auto& args, const nlohmann::json& params) { - auto config = - this->context.get_subsystem(); - if ( - config == nullptr || - !config->get().node_config.recover.self_healing_open.has_value()) - { - return make_error( - HTTP_STATUS_BAD_REQUEST, - ccf::errors::InvalidNodeState, - "Unable to get self-healing-open configuration"); - } - - auto in = params.get(); - auto validation_result = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (validation_result.has_value()) - { - return validation_result.value(); - } - LOG_TRACE_FMT( - "Self-healing-open: recieve vote from {}", in.info.intrinsic_id); - - args.tx - .template rw(Tables::SELF_HEALING_OPEN_VOTES) - ->insert(in.info.intrinsic_id); + [this](auto& args, self_healing_open::TaggedWithNodeInfo in) + -> std::optional { + LOG_TRACE_FMT( + "Self-healing-open: recieve vote from {}", in.info.intrinsic_id); - try - { - 
this->node_operation.self_healing_open().advance(args.tx, false); - } - catch (const std::logic_error& e) - { - LOG_FAIL_FMT( - "Self-healing-open gossip failed to advance state: {}", e.what()); - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - ccf::errors::InternalError, - fmt::format( - "Failed to advance self-healing-open state: {}", e.what())); - } + args.tx + .template rw(Tables::SELF_HEALING_OPEN_VOTES) + ->insert(in.info.intrinsic_id); - // if sufficient votes, then we can open the network - return make_success(fmt::format( - "Node {} voted for self-healing-open", in.info.intrinsic_id)); - }; + return std::nullopt; + }; make_endpoint( "/self_healing_open/vote", HTTP_PUT, - json_adapter(self_healing_open_vote), + json_adapter( + wrap_self_healing_open( + self_healing_open_vote)), no_auth_required) .set_forwarding_required(endpoints::ForwardingRequired::Never) .set_openapi_hidden(true) .install(); auto self_healing_open_iamopen = - [this](auto& args, const nlohmann::json& params) { - auto config = - this->context.get_subsystem(); - if ( - config == nullptr || - !config->get().node_config.recover.self_healing_open.has_value()) - { - return make_error( - HTTP_STATUS_BAD_REQUEST, - ccf::errors::InvalidNodeState, - "Unable to get self-healing-open configuration"); - } - - auto in = params.get(); - auto validation_result = self_healing_open_validate_and_store_node_info( - args, args.tx, in.info); - if (validation_result.has_value()) - { - return validation_result.value(); - } - - args.tx - .template rw( - Tables::SELF_HEALING_OPEN_SM_STATE) - ->put(SelfHealingOpenSM::JOINING); - args.tx - .template rw( - Tables::SELF_HEALING_OPEN_CHOSEN_NODE) - ->put(in.info.intrinsic_id); - - try - { - this->node_operation.self_healing_open().advance(args.tx, false); - } - catch (const std::logic_error& e) - { - LOG_FAIL_FMT( - "Self-healing-open gossip failed to advance state: {}", e.what()); - return make_error( - HTTP_STATUS_INTERNAL_SERVER_ERROR, - 
ccf::errors::InternalError, - fmt::format( - "Failed to advance self-healing-open state: {}", e.what())); - } - - return make_success(fmt::format( - "Node {} is joining self-healing-open", in.info.intrinsic_id)); - }; + [this](auto& args, self_healing_open::TaggedWithNodeInfo in) + -> std::optional { + LOG_TRACE_FMT( + "Self-healing-open: recieve IAmOpen from {}", in.info.intrinsic_id); + args.tx + .template rw( + Tables::SELF_HEALING_OPEN_SM_STATE) + ->put(SelfHealingOpenSM::JOINING); + args.tx + .template rw( + Tables::SELF_HEALING_OPEN_CHOSEN_NODE) + ->put(in.info.intrinsic_id); + return std::nullopt; + }; make_endpoint( "/self_healing_open/iamopen", HTTP_PUT, - json_adapter(self_healing_open_iamopen), + json_adapter( + wrap_self_healing_open( + self_healing_open_iamopen)), no_auth_required) .set_forwarding_required(endpoints::ForwardingRequired::Never) .set_openapi_hidden(true) diff --git a/src/node/rpc/node_operation.h b/src/node/rpc/node_operation.h index f11bef5f2616..35993a52f4b3 100644 --- a/src/node/rpc/node_operation.h +++ b/src/node/rpc/node_operation.h @@ -111,7 +111,7 @@ namespace ccf return impl.get_cose_signatures_config(); } - SelfHealingOpenSubSystem& self_healing_open() override + SelfHealingOpenSubsystem& self_healing_open() override { return impl.self_healing_open(); } diff --git a/src/node/rpc/test/node_stub.h b/src/node/rpc/test/node_stub.h index 5e43226545c1..658ea06fe358 100644 --- a/src/node/rpc/test/node_stub.h +++ b/src/node/rpc/test/node_stub.h @@ -112,7 +112,7 @@ namespace ccf return cose_signatures_config; } - SelfHealingOpenSubSystem& self_healing_open() override + SelfHealingOpenSubsystem& self_healing_open() override { throw std::logic_error("Unimplemented"); } diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 681d64ad0711..8a325845723c 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -11,11 +11,11 @@ namespace ccf { - 
SelfHealingOpenSubSystem::SelfHealingOpenSubSystem(NodeState* node_state_) : + SelfHealingOpenSubsystem::SelfHealingOpenSubsystem(NodeState* node_state_) : node_state(node_state_) {} - void SelfHealingOpenSubSystem::try_start(ccf::kv::Tx& tx, bool recovering) + void SelfHealingOpenSubsystem::try_start(ccf::kv::Tx& tx, bool recovering) { // Clear any previous state tx.rw(Tables::SELF_HEALING_OPEN_SM_STATE)->clear(); @@ -50,7 +50,7 @@ namespace ccf start_failover_timers(); } - void SelfHealingOpenSubSystem::advance(ccf::kv::Tx& tx, bool timeout) + void SelfHealingOpenSubsystem::advance(ccf::kv::Tx& tx, bool timeout) { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -223,7 +223,7 @@ namespace ccf } } - void SelfHealingOpenSubSystem::start_message_retry_timers() + void SelfHealingOpenSubsystem::start_message_retry_timers() { LOG_TRACE_FMT("Self-healing-open: Setting up retry timers"); auto retry_timer_msg = std::make_unique<::threading::Tmsg>( @@ -307,7 +307,7 @@ namespace ccf threading::get_current_thread_id(), std::move(retry_timer_msg)); } - void SelfHealingOpenSubSystem::start_failover_timers() + void SelfHealingOpenSubsystem::start_failover_timers() { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -459,7 +459,7 @@ namespace ccf std::move(curl_request)); } - self_healing_open::RequestNodeInfo SelfHealingOpenSubSystem::make_node_info() + self_healing_open::RequestNodeInfo SelfHealingOpenSubsystem::make_node_info() { return { .quote_info = node_state->quote_info, @@ -473,7 +473,7 @@ namespace ccf }; } - void SelfHealingOpenSubSystem::send_gossip_unsafe() + void SelfHealingOpenSubsystem::send_gossip_unsafe() { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -483,10 +483,9 @@ namespace ccf LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); - self_healing_open::GossipRequest request{ - .info = make_node_info(), - .txid = 
node_state->last_recovered_signed_idx, - }; + self_healing_open::GossipRequest request; + request.info = make_node_info(); + request.txid = node_state->last_recovered_signed_idx; nlohmann::json request_json = request; for (auto& target_address : config->addresses) @@ -500,7 +499,7 @@ namespace ccf } } - void SelfHealingOpenSubSystem::send_vote_unsafe( + void SelfHealingOpenSubsystem::send_vote_unsafe( const SelfHealingOpenNodeInfo& node_info) { auto& config = node_state->config.recover.self_healing_open; @@ -514,7 +513,7 @@ namespace ccf node_info.intrinsic_id, node_info.published_network_address); - self_healing_open::VoteRequest request{.info = make_node_info()}; + self_healing_open::TaggedWithNodeInfo request{.info = make_node_info()}; nlohmann::json request_json = request; @@ -526,7 +525,7 @@ namespace ccf node_state->node_sign_kp->private_key_pem()); } - void SelfHealingOpenSubSystem::send_iamopen_unsafe() + void SelfHealingOpenSubsystem::send_iamopen_unsafe() { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -536,7 +535,7 @@ namespace ccf LOG_TRACE_FMT("Sending self-healing-open iamopen"); - self_healing_open::IAmOpenRequest request{.info = make_node_info()}; + self_healing_open::TaggedWithNodeInfo request{.info = make_node_info()}; nlohmann::json request_json = request; for (auto& target_address : config->addresses) diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index ab27d53aff94..3ccba7710746 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -2,6 +2,7 @@ // Licensed under the Apache 2.0 License. 
#pragma once +#include "ccf/ds/json.h" #include "ccf/node/startup_config.h" #include "ccf/service/tables/self_healing_open.h" #include "ccf/tx.h" @@ -23,34 +24,26 @@ namespace ccf::self_healing_open intrinsic_id, service_identity); - struct GossipRequest - { - RequestNodeInfo info; - ccf::kv::Version txid; - }; - DECLARE_JSON_TYPE(GossipRequest); - DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid, info); - - struct VoteRequest + struct TaggedWithNodeInfo { + public: RequestNodeInfo info; }; - DECLARE_JSON_TYPE(VoteRequest); - DECLARE_JSON_REQUIRED_FIELDS(VoteRequest, info); + DECLARE_JSON_TYPE(TaggedWithNodeInfo); + DECLARE_JSON_REQUIRED_FIELDS(TaggedWithNodeInfo, info); - struct IAmOpenRequest + struct GossipRequest : public TaggedWithNodeInfo { - RequestNodeInfo info; + ccf::kv::Version txid{}; }; - DECLARE_JSON_TYPE(IAmOpenRequest); - DECLARE_JSON_REQUIRED_FIELDS(IAmOpenRequest, info); - + DECLARE_JSON_TYPE(GossipRequest); + DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid); } namespace ccf { class NodeState; - class SelfHealingOpenSubSystem + class SelfHealingOpenSubsystem { private: // SelfHealingOpenService is solely owned by NodeState, and all tasks should @@ -58,15 +51,15 @@ namespace ccf NodeState* node_state; public: - SelfHealingOpenSubSystem(NodeState* node_state); + SelfHealingOpenSubsystem(NodeState* node_state); void try_start(ccf::kv::Tx& tx, bool recovering); void advance(ccf::kv::Tx& tx, bool timeout); private: struct SHOMsg { - SHOMsg(SelfHealingOpenSubSystem& self_) : self(self_) {} - SelfHealingOpenSubSystem& self; + SHOMsg(SelfHealingOpenSubsystem& self_) : self(self_) {} + SelfHealingOpenSubsystem& self; }; // Start path From eacbf6685d1ba4a12df51c229a3178ef2b937fd8 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 18:29:10 +0100 Subject: [PATCH 184/197] fmt --- src/common/configuration.h | 3 ++- src/node/self_healing_open_impl.cpp | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/configuration.h 
b/src/common/configuration.h index 355d5d3ce5b6..d303a3076fa0 100644 --- a/src/common/configuration.h +++ b/src/common/configuration.h @@ -115,7 +115,8 @@ namespace ccf DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(SelfHealingOpenConfig); DECLARE_JSON_REQUIRED_FIELDS(SelfHealingOpenConfig, addresses); - DECLARE_JSON_OPTIONAL_FIELDS(SelfHealingOpenConfig, retry_timeout, failover_timeout); + DECLARE_JSON_OPTIONAL_FIELDS( + SelfHealingOpenConfig, retry_timeout, failover_timeout); DECLARE_JSON_TYPE(StartupConfig::Start); DECLARE_JSON_REQUIRED_FIELDS( diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index 8a325845723c..ccdc8af80e3d 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -24,8 +24,7 @@ namespace ccf ->clear(); tx.rw(Tables::SELF_HEALING_OPEN_NODES)->clear(); tx.rw(Tables::SELF_HEALING_OPEN_GOSSIPS)->clear(); - tx.rw( - Tables::SELF_HEALING_OPEN_CHOSEN_NODE) + tx.rw(Tables::SELF_HEALING_OPEN_CHOSEN_NODE) ->clear(); tx.rw(Tables::SELF_HEALING_OPEN_VOTES)->clear(); tx.rw(Tables::SELF_HEALING_OPEN_FAILOVER_FLAG) From 0e25ca7ca340fcf496a48b4237dc66184218507a Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 23 Sep 2025 18:38:26 +0100 Subject: [PATCH 185/197] Add model checking --- .github/workflows/ci-verification.yml | 22 +++++++++++++++++++ tla/disaster-recovery/.gitignore | 4 +--- .../{stateright => }/Cargo.lock | 0 .../{stateright => }/Cargo.toml | 0 .../{stateright => }/Readme.md | 0 .../{stateright => }/src/main.rs | 0 .../{stateright => }/src/model.rs | 0 tla/disaster-recovery/stateright/.gitignore | 1 - 8 files changed, 23 insertions(+), 4 deletions(-) rename tla/disaster-recovery/{stateright => }/Cargo.lock (100%) rename tla/disaster-recovery/{stateright => }/Cargo.toml (100%) rename tla/disaster-recovery/{stateright => }/Readme.md (100%) rename tla/disaster-recovery/{stateright => }/src/main.rs (100%) rename tla/disaster-recovery/{stateright => }/src/model.rs (100%) delete mode 
100644 tla/disaster-recovery/stateright/.gitignore diff --git a/.github/workflows/ci-verification.yml b/.github/workflows/ci-verification.yml index 13836fbdbe08..1d46cfc2fd22 100644 --- a/.github/workflows/ci-verification.yml +++ b/.github/workflows/ci-verification.yml @@ -233,3 +233,25 @@ jobs: name: tlc-trace-validation-consensus path: | tla/traces/* + + model-checking-self-healing-open: + name: Model Checking - Self-Healing Open + runs-on: [self-hosted, 1ES.Pool=gha-vmss-d16av5-ci] + container: + image: mcr.microsoft.com/azurelinux/base/core:3.0 + options: --user root --publish-all --cap-add NET_ADMIN --cap-add NET_RAW --cap-add SYS_PTRACE + + steps: + - name: "Checkout dependencies" + shell: bash + run: | + gpg --import /etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY + tdnf -y update + tdnf -y install ca-certificates git + + - uses: actions/checkout@v5 + - name: Install Stateright dependencies + run: | + tdnf install -y cargo + + - run: cd tla/disaster-recovery && cargo run check \ No newline at end of file diff --git a/tla/disaster-recovery/.gitignore b/tla/disaster-recovery/.gitignore index f410e7eb4643..eb5a316cbd19 100644 --- a/tla/disaster-recovery/.gitignore +++ b/tla/disaster-recovery/.gitignore @@ -1,3 +1 @@ -.envrc -states - +target diff --git a/tla/disaster-recovery/stateright/Cargo.lock b/tla/disaster-recovery/Cargo.lock similarity index 100% rename from tla/disaster-recovery/stateright/Cargo.lock rename to tla/disaster-recovery/Cargo.lock diff --git a/tla/disaster-recovery/stateright/Cargo.toml b/tla/disaster-recovery/Cargo.toml similarity index 100% rename from tla/disaster-recovery/stateright/Cargo.toml rename to tla/disaster-recovery/Cargo.toml diff --git a/tla/disaster-recovery/stateright/Readme.md b/tla/disaster-recovery/Readme.md similarity index 100% rename from tla/disaster-recovery/stateright/Readme.md rename to tla/disaster-recovery/Readme.md diff --git a/tla/disaster-recovery/stateright/src/main.rs b/tla/disaster-recovery/src/main.rs similarity 
index 100% rename from tla/disaster-recovery/stateright/src/main.rs rename to tla/disaster-recovery/src/main.rs diff --git a/tla/disaster-recovery/stateright/src/model.rs b/tla/disaster-recovery/src/model.rs similarity index 100% rename from tla/disaster-recovery/stateright/src/model.rs rename to tla/disaster-recovery/src/model.rs diff --git a/tla/disaster-recovery/stateright/.gitignore b/tla/disaster-recovery/stateright/.gitignore deleted file mode 100644 index eb5a316cbd19..000000000000 --- a/tla/disaster-recovery/stateright/.gitignore +++ /dev/null @@ -1 +0,0 @@ -target From e9b5c4c45c3ed24dd22fb47c5fa1b9cc710d2076 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 24 Sep 2025 10:04:47 +0100 Subject: [PATCH 186/197] Setup rustfmt --- .github/workflows/ci.yml | 3 ++- scripts/ci-checks.sh | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 96fa556b933e..aac332d0d35c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,8 @@ jobs: run: | set -ex git config --global --add safe.directory /__w/CCF/CCF - tdnf -y install tar npm build-essential + tdnf -y install tar npm build-essential cargo + cargo install rustfmt ./scripts/setup-dev.sh ./scripts/ci-checks.sh shell: bash diff --git a/scripts/ci-checks.sh b/scripts/ci-checks.sh index afb31b33c340..09cd1166b1a7 100755 --- a/scripts/ci-checks.sh +++ b/scripts/ci-checks.sh @@ -162,6 +162,14 @@ group "Python types" git ls-files python/ | grep -e '\.py$' | xargs mypy || fail endgroup +group "Rust format" +if [ $FIX -ne 0 ]; then + rustfmt (find . --name '*.rs') || fail +else + rustfmt --check (find . 
--name '*.rs') || fail +fi +ungroup + group "Summary" if [[ -n "$FAIL" ]]; then echo "The following checks failed: ${FAIL//;/, }" From c3f2b7971deb20cb30e4e098ed399119d86d6069 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 24 Sep 2025 10:04:53 +0100 Subject: [PATCH 187/197] fmt --- tla/disaster-recovery/src/model.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tla/disaster-recovery/src/model.rs b/tla/disaster-recovery/src/model.rs index 497679b86ffe..d156ef314a12 100644 --- a/tla/disaster-recovery/src/model.rs +++ b/tla/disaster-recovery/src/model.rs @@ -93,8 +93,7 @@ where { } fn advance_several(&self, state: &mut State, o: &mut Out, id: Id, timeout: bool) { - while self.advance_step(state, o, id, timeout) { - } + while self.advance_step(state, o, id, timeout) {} } } From 750e2594be4de765bc07a7cece0fd264bc20436e Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 24 Sep 2025 13:31:00 +0100 Subject: [PATCH 188/197] Remove rustfmt for separate PR --- .github/workflows/ci.yml | 3 +-- scripts/ci-checks.sh | 8 -------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aac332d0d35c..96fa556b933e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,8 +40,7 @@ jobs: run: | set -ex git config --global --add safe.directory /__w/CCF/CCF - tdnf -y install tar npm build-essential cargo - cargo install rustfmt + tdnf -y install tar npm build-essential ./scripts/setup-dev.sh ./scripts/ci-checks.sh shell: bash diff --git a/scripts/ci-checks.sh b/scripts/ci-checks.sh index 09cd1166b1a7..afb31b33c340 100755 --- a/scripts/ci-checks.sh +++ b/scripts/ci-checks.sh @@ -162,14 +162,6 @@ group "Python types" git ls-files python/ | grep -e '\.py$' | xargs mypy || fail endgroup -group "Rust format" -if [ $FIX -ne 0 ]; then - rustfmt (find . --name '*.rs') || fail -else - rustfmt --check (find . 
--name '*.rs') || fail -fi -ungroup - group "Summary" if [[ -n "$FAIL" ]]; then echo "The following checks failed: ${FAIL//;/, }" From b8126eed98972bdfd866ad7c6def8a298eb204f6 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 24 Sep 2025 14:11:33 +0100 Subject: [PATCH 189/197] fmt --- .github/workflows/ci-verification.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-verification.yml b/.github/workflows/ci-verification.yml index 1d46cfc2fd22..392095a2ac8a 100644 --- a/.github/workflows/ci-verification.yml +++ b/.github/workflows/ci-verification.yml @@ -233,7 +233,7 @@ jobs: name: tlc-trace-validation-consensus path: | tla/traces/* - + model-checking-self-healing-open: name: Model Checking - Self-Healing Open runs-on: [self-hosted, 1ES.Pool=gha-vmss-d16av5-ci] @@ -254,4 +254,4 @@ jobs: run: | tdnf install -y cargo - - run: cd tla/disaster-recovery && cargo run check \ No newline at end of file + - run: cd tla/disaster-recovery && cargo run check From 6807ae01c56f1974a9d662280cc25ba75ca77471 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 29 Sep 2025 11:48:36 +0100 Subject: [PATCH 190/197] Ensure inheritance works as expected --- src/node/self_healing_open_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 3ccba7710746..190fff0d3e11 100644 --- a/src/node/self_healing_open_impl.h +++ b/src/node/self_healing_open_impl.h @@ -36,7 +36,7 @@ namespace ccf::self_healing_open { ccf::kv::Version txid{}; }; - DECLARE_JSON_TYPE(GossipRequest); + DECLARE_JSON_TYPE_WITH_BASE(GossipRequest, TaggedWithNodeInfo); DECLARE_JSON_REQUIRED_FIELDS(GossipRequest, txid); } From b69730d2bcf16410af0bfa4691330316b2e42fb5 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 29 Sep 2025 11:52:45 +0100 Subject: [PATCH 191/197] Fix docs --- doc/audit/builtin_maps.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/doc/audit/builtin_maps.rst b/doc/audit/builtin_maps.rst index 7e8c0af649d0..985380cfa87c 100644 --- a/doc/audit/builtin_maps.rst +++ b/doc/audit/builtin_maps.rst @@ -580,7 +580,7 @@ While the contents themselves are encrypted, the table is public so as to be acc **Value** -.. doxygenstruct:: ccf::SelfHealingOpenNodeInfo +.. doxygenstruct:: SelfHealingOpenNodeInfo :project: CCF :members: From 3c84f45539bc9ff86f28842aa80861bf18a601c7 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Mon, 29 Sep 2025 11:53:24 +0100 Subject: [PATCH 192/197] Fix doc --- doc/audit/builtin_maps.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/audit/builtin_maps.rst b/doc/audit/builtin_maps.rst index 985380cfa87c..0de6abf707cb 100644 --- a/doc/audit/builtin_maps.rst +++ b/doc/audit/builtin_maps.rst @@ -591,7 +591,7 @@ While the contents themselves are encrypted, the table is public so as to be acc **Value** -.. doxygenstruct:: ccf::SelfHealingOpenGossip +.. doxygenstruct:: ccf::self_healing_open::GossipRequest :project: CCF :members: From dba39194f3481cd9e8f0fd5143adf996c9c511d8 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 30 Sep 2025 13:18:42 +0100 Subject: [PATCH 193/197] Clean up old validation code --- src/node/rpc/node_frontend.h | 61 ------------------------------------ 1 file changed, 61 deletions(-) diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 9595489b7d12..237595a104aa 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -531,67 +531,6 @@ namespace ccf }; } - std::optional - self_healing_open_validate_and_store_node_info( - endpoints::EndpointContext& args, - ccf::kv::Tx& tx, - const self_healing_open::RequestNodeInfo& in) - { - auto cert_der = ccf::crypto::public_key_der_from_cert( - args.rpc_ctx->get_session_context()->caller_cert); - - pal::PlatformAttestationMeasurement measurement; - QuoteVerificationResult verify_result = this->node_operation.verify_quote( - args.tx, in.quote_info, 
cert_der, measurement); - if (verify_result != QuoteVerificationResult::Verified) - { - const auto [code, message] = quote_verification_error(verify_result); - LOG_FAIL_FMT( - "Self-healing-open message from intrinsic id {} is invalid: {} ({})", - in.intrinsic_id, - code, - message); - return make_error(code, ccf::errors::InvalidQuote, message); - } - - LOG_TRACE_FMT( - "Self-healing-open message from intrinsic id {}'s quote is valid", - in.intrinsic_id); - - // Validating that we haven't heard from this node before, of if we have - // that the cert hasn't changed - auto* node_info_handle = - tx.rw(Tables::SELF_HEALING_OPEN_NODES); - auto existing_node_info = node_info_handle->get(in.intrinsic_id); - - if (existing_node_info.has_value()) - { - // If we have seen this node before, check that the cert is the same - if (existing_node_info->cert_der != cert_der) - { - auto message = fmt::format( - "Self-healing-open message from intrinsic id {} is invalid: " - "certificate has changed", - in.intrinsic_id); - LOG_FAIL_FMT("{}", message); - return make_error( - HTTP_STATUS_BAD_REQUEST, ccf::errors::NodeAlreadyExists, message); - } - } - else - { - SelfHealingOpenNodeInfo src_info{ - .quote_info = in.quote_info, - .published_network_address = in.published_network_address, - .cert_der = cert_der, - .service_identity = in.service_identity, - .intrinsic_id = in.intrinsic_id}; - node_info_handle->put(in.intrinsic_id, src_info); - } - - return std::nullopt; - }; - public: NodeEndpoints(NetworkState& network_, ccf::AbstractNodeContext& context_) : CommonEndpointRegistry(get_actor_prefix(ActorsType::nodes), context_), From a78b8060cdbabb1c180885e3876f74ee1e6c8a66 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 30 Sep 2025 13:24:40 +0100 Subject: [PATCH 194/197] Fix race condition around quote endorsements, where they are wiped after opening, meaning iamopen messages are rejected. 
--- src/node/self_healing_open_impl.cpp | 34 +++++++++++++++++++---------- src/node/self_healing_open_impl.h | 8 +++---- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/node/self_healing_open_impl.cpp b/src/node/self_healing_open_impl.cpp index ccdc8af80e3d..900d740e5891 100644 --- a/src/node/self_healing_open_impl.cpp +++ b/src/node/self_healing_open_impl.cpp @@ -3,7 +3,9 @@ #include "node/self_healing_open_impl.h" +#include "ccf/service/tables/nodes.h" #include "ccf/service/tables/self_healing_open.h" +#include "ccf/tx.h" #include "node_state.h" #include @@ -259,7 +261,7 @@ namespace ccf switch (sm_state) { case SelfHealingOpenSM::GOSSIPING: - msg->data.self.send_gossip_unsafe(); + msg->data.self.send_gossip_unsafe(tx); break; case SelfHealingOpenSM::VOTING: { @@ -280,13 +282,13 @@ namespace ccf "Self-healing-open chosen node {} not found", chosen_replica_handle->get().value())); } - msg->data.self.send_vote_unsafe(chosen_node_info.value()); + msg->data.self.send_vote_unsafe(tx, chosen_node_info.value()); // keep gossiping to allow lagging nodes to eventually vote - msg->data.self.send_gossip_unsafe(); + msg->data.self.send_gossip_unsafe(tx); break; } case SelfHealingOpenSM::OPENING: - msg->data.self.send_iamopen_unsafe(); + msg->data.self.send_iamopen_unsafe(tx); break; case SelfHealingOpenSM::JOINING: return; @@ -458,10 +460,18 @@ namespace ccf std::move(curl_request)); } - self_healing_open::RequestNodeInfo SelfHealingOpenSubsystem::make_node_info() + self_healing_open::RequestNodeInfo SelfHealingOpenSubsystem::make_node_info( + kv::ReadOnlyTx& tx) { + auto* nodes_handle = tx.ro(Tables::NODES); + auto node_info_opt = nodes_handle->get(node_state->get_node_id()); + if (!node_info_opt.has_value()) + { + throw std::logic_error(fmt::format( + "Node {} not found in nodes table", node_state->get_node_id())); + } return { - .quote_info = node_state->quote_info, + .quote_info = node_info_opt->quote_info, .published_network_address = 
node_state->config.network.rpc_interfaces.at("primary_rpc_interface") .published_address, @@ -472,7 +482,7 @@ namespace ccf }; } - void SelfHealingOpenSubsystem::send_gossip_unsafe() + void SelfHealingOpenSubsystem::send_gossip_unsafe(kv::ReadOnlyTx& tx) { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -483,7 +493,7 @@ namespace ccf LOG_TRACE_FMT("Broadcasting self-healing-open gossip"); self_healing_open::GossipRequest request; - request.info = make_node_info(); + request.info = make_node_info(tx); request.txid = node_state->last_recovered_signed_idx; nlohmann::json request_json = request; @@ -499,7 +509,7 @@ namespace ccf } void SelfHealingOpenSubsystem::send_vote_unsafe( - const SelfHealingOpenNodeInfo& node_info) + kv::ReadOnlyTx& tx, const SelfHealingOpenNodeInfo& node_info) { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -512,7 +522,7 @@ namespace ccf node_info.intrinsic_id, node_info.published_network_address); - self_healing_open::TaggedWithNodeInfo request{.info = make_node_info()}; + self_healing_open::TaggedWithNodeInfo request{.info = make_node_info(tx)}; nlohmann::json request_json = request; @@ -524,7 +534,7 @@ namespace ccf node_state->node_sign_kp->private_key_pem()); } - void SelfHealingOpenSubsystem::send_iamopen_unsafe() + void SelfHealingOpenSubsystem::send_iamopen_unsafe(ccf::kv::ReadOnlyTx& tx) { auto& config = node_state->config.recover.self_healing_open; if (!config.has_value()) @@ -534,7 +544,7 @@ namespace ccf LOG_TRACE_FMT("Sending self-healing-open iamopen"); - self_healing_open::TaggedWithNodeInfo request{.info = make_node_info()}; + self_healing_open::TaggedWithNodeInfo request{.info = make_node_info(tx)}; nlohmann::json request_json = request; for (auto& target_address : config->addresses) diff --git a/src/node/self_healing_open_impl.h b/src/node/self_healing_open_impl.h index 190fff0d3e11..f7bf6c6f3666 100644 --- a/src/node/self_healing_open_impl.h 
+++ b/src/node/self_healing_open_impl.h @@ -67,9 +67,9 @@ namespace ccf void start_failover_timers(); // Steady state operations - self_healing_open::RequestNodeInfo make_node_info(); - void send_gossip_unsafe(); - void send_vote_unsafe(const SelfHealingOpenNodeInfo&); - void send_iamopen_unsafe(); + self_healing_open::RequestNodeInfo make_node_info(kv::ReadOnlyTx&); + void send_gossip_unsafe(kv::ReadOnlyTx&); + void send_vote_unsafe(kv::ReadOnlyTx&, const SelfHealingOpenNodeInfo&); + void send_iamopen_unsafe(kv::ReadOnlyTx&); }; } \ No newline at end of file From 98dd582fccae416b98a11e485f3b087c6e945737 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Tue, 30 Sep 2025 14:38:46 +0100 Subject: [PATCH 195/197] Allow difference between ledgers as they are different networks --- tests/e2e_operations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e_operations.py b/tests/e2e_operations.py index 5863d78c37ee..7760baf753ff 100644 --- a/tests/e2e_operations.py +++ b/tests/e2e_operations.py @@ -1534,7 +1534,7 @@ def run_self_healing_open(const_args): LOG.info("Completed self-healing open successfully") - recovered_network.stop_all_nodes() + recovered_network.stop_all_nodes(accept_ledger_diff=True) def run_self_healing_open_timeout_path(const_args): From 66b046085d4e216b12b365c44ac61d2a46bc5266 Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 1 Oct 2025 11:55:22 +0100 Subject: [PATCH 196/197] More logging for curl --- src/http/curl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/http/curl.h b/src/http/curl.h index eca31d587016..bc03eeb10062 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -765,6 +765,8 @@ namespace ccf::curl CURL_CSELECT_ERR, nullptr); self->curl_request_curlm.perform(); + LOG_TRACE_FMT( + "Finished handling error on socket {}", socket_context->socket); return; } From 9f2fc570545c17b36689049b55712d4957ebef4c Mon Sep 17 00:00:00 2001 From: cjen1-msft Date: Wed, 1 Oct 2025 15:50:09 +0100 Subject: [PATCH 197/197] 
Fix curl write to nullptr --- src/http/curl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/http/curl.h b/src/http/curl.h index bc03eeb10062..e5068cfe95d7 100644 --- a/src/http/curl.h +++ b/src/http/curl.h @@ -758,12 +758,13 @@ namespace ccf::curl } // Notify curl of the error + int running_handles = 0; CHECK_CURL_MULTI( curl_multi_socket_action, self->curl_request_curlm, socket_context->socket, CURL_CSELECT_ERR, - nullptr); + &running_handles); self->curl_request_curlm.perform(); LOG_TRACE_FMT( "Finished handling error on socket {}", socket_context->socket);