From a75e0c6f1b51b61aa1576bf671bd0d74bd45490f Mon Sep 17 00:00:00 2001 From: Hansie Odendaal <39146854+hansieodendaal@users.noreply.github.com> Date: Wed, 27 Nov 2024 13:41:59 +0200 Subject: [PATCH] feat: fix tari pulse running as fast as possible (#6704) Description --- ~- Solve openssl build issue for non-base nodes by moving the Tari pulse service to the base node application~ - Fixed Tari pulse not adhering to the tick intervals Motivation and Context --- ~- Openssl cause build issues for other platforms~ - Tari puls were running _as fast as possible_, instead of adhering to the tick intervals ``` 2024-11-27 10:31:41.834109900 [c::bn::tari_pulse] INFO Initializing Tari Pulse Service 2024-11-27 10:31:41.834119800 [c::bn::tari_pulse] INFO Tari Pulse Service initialized 2024-11-27 10:31:41.994249400 [c::bn::tari_pulse] INFO Tari Pulse Service initialized with DNS name: checkpoints-nextnet.tari.com 2024-11-27 10:31:41.995570900 [c::bn::tari_pulse] TRACE Interval tick: 1 2024-11-27 10:31:42.093937500 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) 2024-11-27 10:31:42.099942200 [c::bn::tari_pulse] TRACE Interval tick: 2 2024-11-27 10:31:42.151937500 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) 2024-11-27 10:31:42.162359500 [c::bn::tari_pulse] TRACE Interval tick: 3 2024-11-27 10:31:42.553671300 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) 2024-11-27 10:31:42.568889200 [c::bn::tari_pulse] TRACE Interval tick: 4 2024-11-27 10:31:42.622576100 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) 2024-11-27 10:31:42.627371600 [c::bn::tari_pulse] TRACE Interval tick: 5 2024-11-27 10:31:42.870493900 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) ``` - This PR: ``` 2024-11-27 11:51:08.123837900 [c::bn::tari_pulse] INFO Initializing Tari Pulse Service 2024-11-27 11:51:08.123852600 [c::bn::tari_pulse] INFO Tari Pulse Service initialized 2024-11-27 11:51:08.278346300 [c::bn::tari_pulse] INFO Tari Pulse Service initialized with DNS name: checkpoints-nextnet.tari.com 2024-11-27 11:51:08.279742100 [c::bn::tari_pulse] TRACE Interval tick: 1 2024-11-27 11:51:08.353759600 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) 2024-11-27 11:53:08.297138300 [c::bn::tari_pulse] TRACE Interval tick: 2 2024-11-27 11:53:08.700206500 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) 2024-11-27 11:55:08.303411400 [c::bn::tari_pulse] TRACE Interval tick: 3 2024-11-27 11:55:08.358827400 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) 2024-11-27 11:57:08.311930500 [c::bn::tari_pulse] TRACE Interval tick: 4 2024-11-27 11:57:08.416916200 [c::bn::tari_pulse] TRACE Passed checkpoints: true, DNS: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa), Local: (45168, 1e3c56468f464072d9879e29a70891e9916f2206116c8819637c266a8b1d2efa) ``` How Has This Been Tested? --- System-level testing. ~**Note:** Build tests for other platforms must still be performed.~ What process can a PR reviewer use to test or verify this change? --- Code review. System-level testing. Breaking Changes --- - [x] None - [ ] Requires data directory on base node to be deleted - [ ] Requires hard fork - [ ] Other - Please specify --- .../src/base_node/tari_pulse_service/mod.rs | 59 +++++++++++-------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/base_layer/core/src/base_node/tari_pulse_service/mod.rs b/base_layer/core/src/base_node/tari_pulse_service/mod.rs index 0d7492b686..3ae3f5bfee 100644 --- a/base_layer/core/src/base_node/tari_pulse_service/mod.rs +++ b/base_layer/core/src/base_node/tari_pulse_service/mod.rs @@ -20,7 +20,7 @@ // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE // USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -use std::{str::FromStr, time::Duration}; +use std::{cmp::min, str::FromStr, time::Duration}; use futures::future; use hickory_client::{ @@ -33,13 +33,13 @@ use hickory_client::{ rr::{DNSClass, Name, RData, Record, RecordType}, tcp::TcpClientStream, }; -use log::{error, info, warn}; +use log::{debug, error, info, trace, warn}; use serde::{Deserialize, Serialize}; use tari_p2p::Network; use tari_service_framework::{async_trait, ServiceInitializationError, ServiceInitializer, ServiceInitializerContext}; use tari_shutdown::ShutdownSignal; use tari_utilities::hex::Hex; -use tokio::{net::TcpStream as TokioTcpStream, sync::watch, time}; +use tokio::{net::TcpStream as TokioTcpStream, sync::watch, time, time::MissedTickBehavior}; use super::LocalNodeCommsInterface; use crate::base_node::comms_interface::CommsInterfaceError; @@ -121,31 +121,37 @@ impl TariPulseService { notify_passed_checkpoints: watch::Sender, ) { let mut interval = time::interval(self.config.check_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Delay); + tokio::pin!(interval); let mut shutdown_signal = self.shutdown_signal.clone(); + let mut count = 0u64; + let mut skip_ticks = 0; + let mut skipped_ticks = 0; loop { tokio::select! { _ = interval.tick() => { - let passed_checkpoints = match self.passed_checkpoints(&mut base_node_service).await { - Ok(passed) => { - interval = time::interval(self.config.check_interval); // reset interval if back to healthy - passed - }, - Err(err) => { - warn!(target: LOG_TARGET, "Failed to check if node has passed checkpoints: {:?}", err); - let old_interval = interval.period().as_secs(); - let new_interval = if old_interval > (60 * 30) { - warn!(target: LOG_TARGET, "Reached maximum retry interval of 30 minutes."); - old_interval - } else { - // increase interval if node repeatedly (up to 30 min) fails to fetch checkpoints - interval = time::interval(Duration::from_secs(old_interval * 2)); - interval.tick().await; - interval.period().as_secs() - }; - warn!(target: LOG_TARGET, "Retrying in {} seconds", new_interval); - continue; - }, + count += 1; + trace!(target: LOG_TARGET, "Interval tick: {}", count); + if skipped_ticks < skip_ticks { + skipped_ticks += 1; + debug!(target: LOG_TARGET, "Skipping {} of {} ticks", skipped_ticks, skip_ticks); + continue; + } + let passed_checkpoints = { + match self.passed_checkpoints(&mut base_node_service).await { + Ok(passed) => { + skip_ticks = 0; + skipped_ticks = 0; + passed + }, + Err(err) => { + warn!(target: LOG_TARGET, "Failed to check if node has passed checkpoints: {:?}", err); + skip_ticks = min(skip_ticks + 1, 30 * 60 / self.config.check_interval.as_secs()); + skipped_ticks = 0; + continue; + }, + } }; notify_passed_checkpoints @@ -174,7 +180,12 @@ impl TariPulseService { .max_by(|a, b| a.0.cmp(&b.0)) .ok_or(CommsInterfaceError::InternalError("No checkpoints found".to_string()))?; let local_checkpoints = self.get_node_block(base_node_service, max_height_block.0).await?; - Ok(local_checkpoints.1 == max_height_block.1) + let passed = local_checkpoints.1 == max_height_block.1; + trace!( + target: LOG_TARGET, "Passed checkpoints: {}, DNS: ({}, {}), Local: ({}, {})", + passed, max_height_block.0, max_height_block.1, local_checkpoints.0, local_checkpoints.1 + ); + Ok(passed) } async fn get_node_block(