diff --git a/iroh/src/magicsock/metrics.rs b/iroh/src/magicsock/metrics.rs index e6f53d95c7..13bfa1ce77 100644 --- a/iroh/src/magicsock/metrics.rs +++ b/iroh/src/magicsock/metrics.rs @@ -1,4 +1,4 @@ -use iroh_metrics::{Counter, Histogram, MetricsGroup}; +use iroh_metrics::{Counter, MetricsGroup}; use serde::{Deserialize, Serialize}; /// Enum of metrics for the module @@ -14,11 +14,8 @@ pub struct Metrics { pub send_ipv4: Counter, pub send_ipv6: Counter, pub send_relay: Counter, - pub send_relay_error: Counter, // Data packets (non-disco) - pub send_data: Counter, - pub send_data_network_down: Counter, pub recv_data_relay: Counter, pub recv_data_ipv4: Counter, pub recv_data_ipv6: Counter, @@ -50,15 +47,20 @@ pub struct Metrics { /* * Connection Metrics + * + * These all only count connections that completed the TLS handshake successfully. This means + * that short lived 0RTT connections are potentially not included in these counts. */ - /// The number of direct connections we have made to peers. - pub num_direct_conns_added: Counter, - /// The number of direct connections we have lost to peers. - pub num_direct_conns_removed: Counter, - /// The number of connections to peers we have added over relay. - pub num_relay_conns_added: Counter, - /// The number of connections to peers we have removed over relay. - pub num_relay_conns_removed: Counter, + /// Number of connections opened (only handshaked connections are counted). + pub num_conns_opened: Counter, + /// Number of connections closed (only handshaked connections are counted). + pub num_conns_closed: Counter, + /// Number of connections that had only relay paths over their lifetime. + pub num_conns_transport_relay_only: Counter, + /// Number of connections that had only IP paths over their lifetime. + pub num_conns_transport_ip_only: Counter, + /// Number of connections that had both IP and relay paths. + pub num_conns_transport_ip_and_relay: Counter, pub actor_tick_main: Counter, pub actor_tick_msg: Counter, @@ -67,36 +69,25 @@ pub struct Metrics { pub actor_tick_direct_addr_heartbeat: Counter, pub actor_link_change: Counter, pub actor_tick_other: Counter, - - /// Number of endpoints we have attempted to contact. - pub endpoints_contacted: Counter, - /// Number of endpoints we have managed to contact directly. - pub endpoints_contacted_directly: Counter, - - /// Number of connections with a successful handshake. - pub connection_handshake_success: Counter, - /// Number of connections with a successful handshake that became direct. - pub connection_became_direct: Counter, - /// Histogram of connection latency in milliseconds across all endpoint connections. - #[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))] - pub connection_latency_ms: Histogram, - - /* - * Path Congestion Metrics - */ - /// Number of times a path was marked as outdated due to consecutive ping failures. - pub path_marked_outdated: Counter, - /// Number of ping failures recorded across all paths. - pub path_ping_failures: Counter, - /// Number of consecutive failure resets (path recovered). - pub path_failure_resets: Counter, - /// Histogram of packet loss rates (0.0-1.0) observed on UDP paths. - #[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))] - pub path_packet_loss_rate: Histogram, - /// Histogram of RTT variance (in milliseconds) as a congestion indicator. - #[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))] - pub path_rtt_variance_ms: Histogram, - /// Histogram of path quality scores (0.0-1.0). - #[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))] - pub path_quality_score: Histogram, + // /// Histogram of connection latency in milliseconds across all endpoint connections. + // #[default(Histogram::new(vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0, 1000.0, f64::INFINITY]))] + // pub connection_latency_ms: Histogram, + // /* + // * Path Congestion Metrics + // */ + // /// Number of times a path was marked as outdated due to consecutive ping failures. + // pub path_marked_outdated: Counter, + // /// Number of ping failures recorded across all paths. + // pub path_ping_failures: Counter, + // /// Number of consecutive failure resets (path recovered). + // pub path_failure_resets: Counter, + // /// Histogram of packet loss rates (0.0-1.0) observed on UDP paths. + // #[default(Histogram::new(vec![0.0, 0.01, 0.05, 0.1, 0.2, 0.5, 1.0]))] + // pub path_packet_loss_rate: Histogram, + // /// Histogram of RTT variance (in milliseconds) as a congestion indicator. + // #[default(Histogram::new(vec![0.0, 1.0, 5.0, 10.0, 20.0, 50.0, 100.0, 200.0]))] + // pub path_rtt_variance_ms: Histogram, + // /// Histogram of path quality scores (0.0-1.0). + // #[default(Histogram::new(vec![0.0, 0.3, 0.5, 0.7, 0.85, 0.95, 1.0]))] + // pub path_quality_score: Histogram, } diff --git a/iroh/src/magicsock/remote_map/remote_state.rs b/iroh/src/magicsock/remote_map/remote_state.rs index 6fda36e3c9..d2af0944a1 100644 --- a/iroh/src/magicsock/remote_map/remote_state.rs +++ b/iroh/src/magicsock/remote_map/remote_state.rs @@ -270,11 +270,7 @@ impl RemoteStateActor { self.handle_path_event(id, evt); } Some(conn_id) = self.connections_close.next(), if !self.connections_close.is_empty() => { - self.connections.remove(&conn_id); - if self.connections.is_empty() { - trace!("last connection closed - clearing selected_path"); - self.selected_path.set(None).ok(); - } + self.handle_connection_close(conn_id); } res = self.local_direct_addrs.updated() => { if let Err(n0_watcher::Disconnected) = res { @@ -415,6 +411,7 @@ impl RemoteStateActor { ) { let pub_open_paths = Watchable::default(); if let Some(conn) = handle.upgrade() { + self.metrics.num_conns_opened.inc(); // Remove any conflicting stable_ids from the local state. let conn_id = ConnId(conn.stable_id()); self.connections.remove(&conn_id); @@ -433,6 +430,7 @@ impl RemoteStateActor { paths: Default::default(), open_paths: Default::default(), path_ids: Default::default(), + transport_summary: TransportSummary::default(), }) .into_mut(); @@ -589,6 +587,17 @@ impl RemoteStateActor { tx.send(rtt).ok(); } + fn handle_connection_close(&mut self, conn_id: ConnId) { + if let Some(state) = self.connections.remove(&conn_id) { + self.metrics.num_conns_closed.inc(); + state.transport_summary.record(&self.metrics); + } + if self.connections.is_empty() { + trace!("last connection closed - clearing selected_path"); + self.selected_path.set(None).ok(); + } + } + fn handle_discovery_item(&mut self, item: Option>) { match item { None => { @@ -1165,6 +1174,8 @@ struct ConnectionState { open_paths: FxHashMap, /// Reverse map of [`Self::paths]. path_ids: FxHashMap, + /// Summary over transports used in this connection, for metrics tracking. + transport_summary: TransportSummary, } impl ConnectionState { @@ -1176,10 +1187,10 @@ impl ConnectionState { /// Tracks an open path for the connection. fn add_open_path(&mut self, remote: transports::Addr, path_id: PathId) { + self.transport_summary.add_path(&remote); self.paths.insert(path_id, remote.clone()); self.open_paths.insert(path_id, remote.clone()); self.path_ids.insert(remote, path_id); - self.update_pub_path_info(); } @@ -1424,6 +1435,44 @@ impl Future for OnClosed { } } +/// Used for metrics tracking. +#[derive(Debug, Clone, Copy, Default)] +enum TransportSummary { + #[default] + None, + IpOnly, + RelayOnly, + IpAndRelay, +} + +impl TransportSummary { + fn add_path(&mut self, addr: &transports::Addr) { + use transports::Addr; + *self = match (*self, addr) { + (TransportSummary::None | TransportSummary::IpOnly, Addr::Ip(_)) => Self::IpOnly, + (TransportSummary::None | TransportSummary::RelayOnly, Addr::Relay(_, _)) => { + Self::RelayOnly + } + _ => Self::IpAndRelay, + } + } + + fn record(&self, metrics: &MagicsockMetrics) { + match self { + TransportSummary::IpOnly => { + metrics.num_conns_transport_ip_only.inc(); + } + TransportSummary::RelayOnly => { + metrics.num_conns_transport_relay_only.inc(); + } + TransportSummary::IpAndRelay => { + metrics.num_conns_transport_ip_and_relay.inc(); + } + TransportSummary::None => {} + } + } +} + /// Converts an iterator of [`TransportAddr'] into an iterator of [`transports::Addr`]. fn to_transports_addr( endpoint_id: EndpointId,