fix keepalives

This commit is contained in:
John Smith 2023-07-14 14:21:00 -04:00
parent 742b8e09a5
commit 41b9a22595
10 changed files with 160 additions and 105 deletions

View File

@ -814,8 +814,8 @@ impl Network {
} }
// commit routing table edits // commit routing table edits
editor_public_internet.commit().await; editor_public_internet.commit();
editor_local_network.commit().await; editor_local_network.commit();
info!("network started"); info!("network started");
self.inner.lock().network_started = true; self.inner.lock().network_started = true;
@ -868,12 +868,12 @@ impl Network {
let mut editor = routing_table.edit_routing_domain(RoutingDomain::PublicInternet); let mut editor = routing_table.edit_routing_domain(RoutingDomain::PublicInternet);
editor.clear_dial_info_details(); editor.clear_dial_info_details();
editor.set_network_class(None); editor.set_network_class(None);
editor.commit().await; editor.commit();
let mut editor = routing_table.edit_routing_domain(RoutingDomain::LocalNetwork); let mut editor = routing_table.edit_routing_domain(RoutingDomain::LocalNetwork);
editor.clear_dial_info_details(); editor.clear_dial_info_details();
editor.set_network_class(None); editor.set_network_class(None);
editor.commit().await; editor.commit();
// Reset state including network class // Reset state including network class
*self.inner.lock() = Self::new_inner(); *self.inner.lock() = Self::new_inner();

View File

@ -176,7 +176,7 @@ impl DiscoveryContext {
.routing_table .routing_table
.find_fast_public_nodes_filtered(node_count, filters); .find_fast_public_nodes_filtered(node_count, filters);
if peers.is_empty() { if peers.is_empty() {
log_net!( log_net!(debug
"no external address detection peers of type {:?}:{:?}", "no external address detection peers of type {:?}:{:?}",
protocol_type, protocol_type,
address_type address_type
@ -195,7 +195,7 @@ impl DiscoveryContext {
return Some((sa, peer)); return Some((sa, peer));
} }
} }
log_net!("no peers responded with an external address"); log_net!(debug "no peers responded with an external address");
None None
} }
@ -943,8 +943,8 @@ impl Network {
punish(); punish();
} }
} else { } else {
// Send updates to everyone // Commit updates
editor.commit().await; editor.commit();
} }
Ok(()) Ok(())

View File

@ -23,10 +23,6 @@ const UNRELIABLE_PING_SPAN_SECS: u32 = 60;
/// - Interval is the number of seconds between each ping /// - Interval is the number of seconds between each ping
const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5; const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5;
/// Keepalive pings are done occasionally to ensure holepunched public dialinfo
/// remains valid, as well as to make sure we remain in any relay node's routing table
const KEEPALIVE_PING_INTERVAL_SECS: u32 = 10;
/// How many times do we try to ping a never-reached node before we call it dead /// How many times do we try to ping a never-reached node before we call it dead
const NEVER_REACHED_PING_COUNT: u32 = 3; const NEVER_REACHED_PING_COUNT: u32 = 3;
@ -636,16 +632,10 @@ impl BucketEntryInner {
} }
// Check if this node needs a ping right now to validate it is still reachable // Check if this node needs a ping right now to validate it is still reachable
pub(super) fn needs_ping(&self, cur_ts: Timestamp, needs_keepalive: bool) -> bool { pub(super) fn needs_ping(&self, cur_ts: Timestamp) -> bool {
// See which ping pattern we are to use // See which ping pattern we are to use
let state = self.state(cur_ts); let state = self.state(cur_ts);
// If this entry needs a keepalive (like a relay node),
// then we should ping it regularly to keep our association alive
if needs_keepalive {
return self.needs_constant_ping(cur_ts, TimestampDuration::new(KEEPALIVE_PING_INTERVAL_SECS as u64 * 1000000u64));
}
// If we don't have node status for this node, then we should ping it to get some node status // If we don't have node status for this node, then we should ping it to get some node status
for routing_domain in RoutingDomainSet::all() { for routing_domain in RoutingDomainSet::all() {
if self.has_node_info(routing_domain.into()) { if self.has_node_info(routing_domain.into()) {

View File

@ -463,6 +463,10 @@ impl RoutingTable {
self.inner.read().relay_node(domain) self.inner.read().relay_node(domain)
} }
pub fn relay_node_last_keepalive(&self, domain: RoutingDomain) -> Option<Timestamp> {
self.inner.read().relay_node_last_keepalive(domain)
}
pub fn has_dial_info(&self, domain: RoutingDomain) -> bool { pub fn has_dial_info(&self, domain: RoutingDomain) -> bool {
self.inner.read().has_dial_info(domain) self.inner.read().has_dial_info(domain)
} }

View File

@ -6,6 +6,9 @@ enum RoutingDomainChange {
SetRelayNode { SetRelayNode {
relay_node: NodeRef, relay_node: NodeRef,
}, },
SetRelayNodeKeepalive {
ts: Option<Timestamp>,
},
AddDialInfoDetail { AddDialInfoDetail {
dial_info_detail: DialInfoDetail, dial_info_detail: DialInfoDetail,
}, },
@ -36,24 +39,33 @@ impl RoutingDomainEditor {
} }
#[instrument(level = "debug", skip(self))] #[instrument(level = "debug", skip(self))]
pub fn clear_dial_info_details(&mut self) { pub fn clear_dial_info_details(&mut self) -> &mut Self {
self.changes.push(RoutingDomainChange::ClearDialInfoDetails); self.changes.push(RoutingDomainChange::ClearDialInfoDetails);
self
} }
#[instrument(level = "debug", skip(self))] #[instrument(level = "debug", skip(self))]
pub fn clear_relay_node(&mut self) { pub fn clear_relay_node(&mut self) -> &mut Self {
self.changes.push(RoutingDomainChange::ClearRelayNode); self.changes.push(RoutingDomainChange::ClearRelayNode);
self
} }
#[instrument(level = "debug", skip(self))] #[instrument(level = "debug", skip(self))]
pub fn set_relay_node(&mut self, relay_node: NodeRef) { pub fn set_relay_node(&mut self, relay_node: NodeRef) -> &mut Self {
self.changes self.changes
.push(RoutingDomainChange::SetRelayNode { relay_node }) .push(RoutingDomainChange::SetRelayNode { relay_node });
self
} }
#[instrument(level = "debug", skip(self), err)] #[instrument(level = "debug", skip(self))]
pub fn set_relay_node_keepalive(&mut self, ts: Option<Timestamp>) -> &mut Self {
self.changes
.push(RoutingDomainChange::SetRelayNodeKeepalive { ts });
self
}
#[instrument(level = "debug", skip(self))]
pub fn register_dial_info( pub fn register_dial_info(
&mut self, &mut self,
dial_info: DialInfo, dial_info: DialInfo,
class: DialInfoClass, class: DialInfoClass,
) -> EyreResult<()> { ) -> EyreResult<&mut Self> {
if !self if !self
.routing_table .routing_table
.ensure_dial_info_is_valid(self.routing_domain, &dial_info) .ensure_dial_info_is_valid(self.routing_domain, &dial_info)
@ -72,7 +84,7 @@ impl RoutingDomainEditor {
}, },
}); });
Ok(()) Ok(self)
} }
#[instrument(level = "debug", skip(self))] #[instrument(level = "debug", skip(self))]
pub fn setup_network( pub fn setup_network(
@ -81,23 +93,25 @@ impl RoutingDomainEditor {
inbound_protocols: ProtocolTypeSet, inbound_protocols: ProtocolTypeSet,
address_types: AddressTypeSet, address_types: AddressTypeSet,
capabilities: Vec<Capability>, capabilities: Vec<Capability>,
) { ) -> &mut Self {
self.changes.push(RoutingDomainChange::SetupNetwork { self.changes.push(RoutingDomainChange::SetupNetwork {
outbound_protocols, outbound_protocols,
inbound_protocols, inbound_protocols,
address_types, address_types,
capabilities, capabilities,
}) });
self
} }
#[instrument(level = "debug", skip(self))] #[instrument(level = "debug", skip(self))]
pub fn set_network_class(&mut self, network_class: Option<NetworkClass>) { pub fn set_network_class(&mut self, network_class: Option<NetworkClass>) -> &mut Self {
self.changes self.changes
.push(RoutingDomainChange::SetNetworkClass { network_class }) .push(RoutingDomainChange::SetNetworkClass { network_class });
self
} }
#[instrument(level = "debug", skip(self))] #[instrument(level = "debug", skip(self))]
pub async fn commit(self) { pub fn commit(&mut self) {
// No locking if we have nothing to do // No locking if we have nothing to do
if self.changes.is_empty() { if self.changes.is_empty() {
return; return;
@ -109,7 +123,7 @@ impl RoutingDomainEditor {
let mut inner = self.routing_table.inner.write(); let mut inner = self.routing_table.inner.write();
inner.with_routing_domain_mut(self.routing_domain, |detail| { inner.with_routing_domain_mut(self.routing_domain, |detail| {
for change in self.changes { for change in self.changes.drain(..) {
match change { match change {
RoutingDomainChange::ClearDialInfoDetails => { RoutingDomainChange::ClearDialInfoDetails => {
debug!("[{:?}] cleared dial info details", self.routing_domain); debug!("[{:?}] cleared dial info details", self.routing_domain);
@ -123,7 +137,12 @@ impl RoutingDomainEditor {
} }
RoutingDomainChange::SetRelayNode { relay_node } => { RoutingDomainChange::SetRelayNode { relay_node } => {
debug!("[{:?}] set relay node: {}", self.routing_domain, relay_node); debug!("[{:?}] set relay node: {}", self.routing_domain, relay_node);
detail.common_mut().set_relay_node(Some(relay_node)); detail.common_mut().set_relay_node(Some(relay_node.clone()));
changed = true;
}
RoutingDomainChange::SetRelayNodeKeepalive { ts } => {
debug!("[{:?}] relay node keepalive: {:?}", self.routing_domain, ts);
detail.common_mut().set_relay_node_last_keepalive(ts);
changed = true; changed = true;
} }
RoutingDomainChange::AddDialInfoDetail { dial_info_detail } => { RoutingDomainChange::AddDialInfoDetail { dial_info_detail } => {
@ -155,7 +174,7 @@ impl RoutingDomainEditor {
let this_changed = old_outbound_protocols != outbound_protocols let this_changed = old_outbound_protocols != outbound_protocols
|| old_inbound_protocols != inbound_protocols || old_inbound_protocols != inbound_protocols
|| old_address_types != address_types || old_address_types != address_types
|| old_capabilities != capabilities; || old_capabilities != *capabilities;
debug!( debug!(
"[{:?}] setup network: {:?} {:?} {:?} {:?}", "[{:?}] setup network: {:?} {:?} {:?} {:?}",
@ -170,7 +189,7 @@ impl RoutingDomainEditor {
outbound_protocols, outbound_protocols,
inbound_protocols, inbound_protocols,
address_types, address_types,
capabilities, capabilities.clone(),
); );
if this_changed { if this_changed {
changed = true; changed = true;

View File

@ -27,6 +27,7 @@ pub struct RoutingDomainDetailCommon {
inbound_protocols: ProtocolTypeSet, inbound_protocols: ProtocolTypeSet,
address_types: AddressTypeSet, address_types: AddressTypeSet,
relay_node: Option<NodeRef>, relay_node: Option<NodeRef>,
relay_node_last_keepalive: Option<Timestamp>,
capabilities: Vec<Capability>, capabilities: Vec<Capability>,
dial_info_details: Vec<DialInfoDetail>, dial_info_details: Vec<DialInfoDetail>,
// caches // caches
@ -42,6 +43,7 @@ impl RoutingDomainDetailCommon {
inbound_protocols: Default::default(), inbound_protocols: Default::default(),
address_types: Default::default(), address_types: Default::default(),
relay_node: Default::default(), relay_node: Default::default(),
relay_node_last_keepalive: Default::default(),
capabilities: Default::default(), capabilities: Default::default(),
dial_info_details: Default::default(), dial_info_details: Default::default(),
cached_peer_info: Mutex::new(Default::default()), cached_peer_info: Mutex::new(Default::default()),
@ -85,12 +87,19 @@ impl RoutingDomainDetailCommon {
pub fn relay_node(&self) -> Option<NodeRef> { pub fn relay_node(&self) -> Option<NodeRef> {
self.relay_node.clone() self.relay_node.clone()
} }
pub fn relay_node_last_keepalive(&self) -> Option<Timestamp> {
self.relay_node_last_keepalive
}
pub(super) fn set_relay_node(&mut self, opt_relay_node: Option<NodeRef>) { pub(super) fn set_relay_node(&mut self, opt_relay_node: Option<NodeRef>) {
self.relay_node = opt_relay_node.map(|nr| { self.relay_node = opt_relay_node.map(|nr| {
nr.filtered_clone(NodeRefFilter::new().with_routing_domain(self.routing_domain)) nr.filtered_clone(NodeRefFilter::new().with_routing_domain(self.routing_domain))
}); });
self.relay_node_last_keepalive = None;
self.clear_cache(); self.clear_cache();
} }
pub(super) fn set_relay_node_last_keepalive(&mut self, ts: Option<Timestamp>) {
self.relay_node_last_keepalive = ts;
}
pub fn dial_info_details(&self) -> &Vec<DialInfoDetail> { pub fn dial_info_details(&self) -> &Vec<DialInfoDetail> {
&self.dial_info_details &self.dial_info_details
} }

View File

@ -98,6 +98,10 @@ impl RoutingTableInner {
self.with_routing_domain(domain, |rd| rd.common().relay_node()) self.with_routing_domain(domain, |rd| rd.common().relay_node())
} }
pub fn relay_node_last_keepalive(&self, domain: RoutingDomain) -> Option<Timestamp> {
self.with_routing_domain(domain, |rd| rd.common().relay_node_last_keepalive())
}
pub fn has_dial_info(&self, domain: RoutingDomain) -> bool { pub fn has_dial_info(&self, domain: RoutingDomain) -> bool {
self.with_routing_domain(domain, |rd| !rd.common().dial_info_details().is_empty()) self.with_routing_domain(domain, |rd| !rd.common().dial_info_details().is_empty())
} }
@ -547,8 +551,6 @@ impl RoutingTableInner {
routing_domain: RoutingDomain, routing_domain: RoutingDomain,
cur_ts: Timestamp, cur_ts: Timestamp,
) -> Vec<NodeRef> { ) -> Vec<NodeRef> {
// Collect relay nodes
let opt_relay = self.with_routing_domain(routing_domain, |rd| rd.common().relay_node());
let own_node_info_ts = self.get_own_node_info_ts(routing_domain); let own_node_info_ts = self.get_own_node_info_ts(routing_domain);
// Collect all entries that are 'needs_ping' and have some node info making them reachable somehow // Collect all entries that are 'needs_ping' and have some node info making them reachable somehow
@ -559,13 +561,8 @@ impl RoutingTableInner {
if !e.exists_in_routing_domain(rti, routing_domain) { if !e.exists_in_routing_domain(rti, routing_domain) {
return false; return false;
} }
// If we need a ping via the normal timing mechanism, then do it // If we need a ping then do it
// or if this node is our own relay, then we keep it alive if e.needs_ping(cur_ts) {
let is_our_relay = opt_relay
.as_ref()
.map(|nr| nr.same_bucket_entry(&entry))
.unwrap_or(false);
if e.needs_ping(cur_ts, is_our_relay) {
return true; return true;
} }
// If we need a ping because this node hasn't seen our latest node info, then do it // If we need a ping because this node hasn't seen our latest node info, then do it

View File

@ -1,10 +1,90 @@
use super::*; use super::*;
/// Keepalive pings are done occasionally to ensure holepunched public dialinfo
/// remains valid, as well as to make sure we remain in any relay node's routing table
const KEEPALIVE_PING_INTERVAL_SECS: u32 = 10;
use futures_util::stream::{FuturesUnordered, StreamExt}; use futures_util::stream::{FuturesUnordered, StreamExt};
use futures_util::FutureExt; use futures_util::FutureExt;
use stop_token::future::FutureExt as StopFutureExt; use stop_token::future::FutureExt as StopFutureExt;
impl RoutingTable { impl RoutingTable {
// Ping each node in the routing table if they need to be pinged
// to determine their reliability
#[instrument(level = "trace", skip(self), err)]
fn relay_keepalive_public_internet(
&self,
cur_ts: Timestamp,
relay_nr: NodeRef,
unord: &mut FuturesUnordered<
SendPinBoxFuture<Result<NetworkResult<Answer<Option<SenderInfo>>>, RPCError>>,
>,
) -> EyreResult<()> {
let rpc = self.rpc_processor();
// Get our publicinternet dial info
let dids = self.all_filtered_dial_info_details(
RoutingDomain::PublicInternet.into(),
&DialInfoFilter::all(),
);
let opt_relay_keepalive_ts = self.relay_node_last_keepalive(RoutingDomain::PublicInternet);
let relay_needs_keepalive = opt_relay_keepalive_ts
.map(|kts| {
cur_ts.saturating_sub(kts).as_u64()
>= (KEEPALIVE_PING_INTERVAL_SECS as u64 * 1_000_000u64)
})
.unwrap_or(true);
if !relay_needs_keepalive {
return Ok(());
}
// Say we're doing this keepalive now
self.edit_routing_domain(RoutingDomain::PublicInternet)
.set_relay_node_keepalive(Some(cur_ts))
.commit();
// Look up any NAT mappings we may need to try to preserve with keepalives
let mut mapped_port_info = self.get_low_level_port_info();
// Relay nodes get pinged over all protocols we have inbound dialinfo for
// This is so we can preserve the inbound NAT mappings at our router
for did in &dids {
// Do we need to do this ping?
// Check if we have already pinged over this low-level-protocol/address-type/port combo
// We want to ensure we do the bare minimum required here
let pt = did.dial_info.protocol_type();
let at = did.dial_info.address_type();
let needs_ping_for_protocol =
if let Some((llpt, port)) = mapped_port_info.protocol_to_port.get(&(pt, at)) {
mapped_port_info
.low_level_protocol_ports
.remove(&(*llpt, at, *port))
} else {
false
};
if !needs_ping_for_protocol {
continue;
}
let rpc = rpc.clone();
let dif = did.dial_info.make_filter();
let relay_nr_filtered =
relay_nr.filtered_clone(NodeRefFilter::new().with_dial_info_filter(dif));
#[cfg(feature = "network-result-extra")]
log_rtab!(debug "--> Keepalive ping to {:?}", relay_nr_filtered);
unord.push(
async move {
rpc.rpc_call_status(Destination::direct(relay_nr_filtered))
.await
}
.instrument(Span::current())
.boxed(),
);
}
Ok(())
}
// Ping each node in the routing table if they need to be pinged // Ping each node in the routing table if they need to be pinged
// to determine their reliability // to determine their reliability
#[instrument(level = "trace", skip(self), err)] #[instrument(level = "trace", skip(self), err)]
@ -20,70 +100,22 @@ impl RoutingTable {
// Get all nodes needing pings in the PublicInternet routing domain // Get all nodes needing pings in the PublicInternet routing domain
let node_refs = self.get_nodes_needing_ping(RoutingDomain::PublicInternet, cur_ts); let node_refs = self.get_nodes_needing_ping(RoutingDomain::PublicInternet, cur_ts);
// Look up any NAT mappings we may need to try to preserve with keepalives
let mut mapped_port_info = self.get_low_level_port_info();
// Get the PublicInternet relay if we are using one // Get the PublicInternet relay if we are using one
let opt_relay_nr = self.relay_node(RoutingDomain::PublicInternet); let opt_relay_nr = self.relay_node(RoutingDomain::PublicInternet);
// Get our publicinternet dial info // If this is our relay, let's check for NAT keepalives
let dids = self.all_filtered_dial_info_details( if let Some(relay_nr) = opt_relay_nr {
RoutingDomain::PublicInternet.into(), self.relay_keepalive_public_internet(cur_ts, relay_nr, unord)?;
&DialInfoFilter::all(), }
);
// For all nodes needing pings, figure out how many and over what protocols // Just do a single ping with the best protocol for all the other nodes to check for liveness
for nr in node_refs { for nr in node_refs {
// If this is our relay, let's check for NAT keepalives let rpc = rpc.clone();
let mut did_pings = false; unord.push(
if let Some(relay_nr) = &opt_relay_nr { async move { rpc.rpc_call_status(Destination::direct(nr)).await }
if nr.same_entry(relay_nr) { .instrument(Span::current())
// Relay nodes get pinged over all protocols we have inbound dialinfo for .boxed(),
// This is so we can preserve the inbound NAT mappings at our router );
for did in &dids {
// Do we need to do this ping?
// Check if we have already pinged over this low-level-protocol/address-type/port combo
// We want to ensure we do the bare minimum required here
let pt = did.dial_info.protocol_type();
let at = did.dial_info.address_type();
let needs_ping = if let Some((llpt, port)) =
mapped_port_info.protocol_to_port.get(&(pt, at))
{
mapped_port_info
.low_level_protocol_ports
.remove(&(*llpt, at, *port))
} else {
false
};
if needs_ping {
let rpc = rpc.clone();
let dif = did.dial_info.make_filter();
let nr_filtered =
nr.filtered_clone(NodeRefFilter::new().with_dial_info_filter(dif));
log_net!("--> Keepalive ping to {:?}", nr_filtered);
unord.push(
async move {
rpc.rpc_call_status(Destination::direct(nr_filtered)).await
}
.instrument(Span::current())
.boxed(),
);
did_pings = true;
}
}
}
}
// Just do a single ping with the best protocol for all the other nodes,
// ensuring that we at least ping a relay with -something- even if we didnt have
// any mapped ports to preserve
if !did_pings {
let rpc = rpc.clone();
unord.push(
async move { rpc.rpc_call_status(Destination::direct(nr)).await }
.instrument(Span::current())
.boxed(),
);
}
} }
Ok(()) Ok(())

View File

@ -69,7 +69,7 @@ impl RoutingTable {
false, false,
) { ) {
Ok(nr) => { Ok(nr) => {
log_rtab!("Outbound relay node selected: {}", nr); info!("Outbound relay node selected: {}", nr);
editor.set_relay_node(nr); editor.set_relay_node(nr);
got_outbound_relay = true; got_outbound_relay = true;
} }
@ -77,6 +77,8 @@ impl RoutingTable {
log_rtab!(error "failed to register node with peer info: {}", e); log_rtab!(error "failed to register node with peer info: {}", e);
} }
} }
} else {
info!("Outbound relay desired but not available");
} }
} }
if !got_outbound_relay { if !got_outbound_relay {
@ -89,7 +91,7 @@ impl RoutingTable {
} }
// Commit the changes // Commit the changes
editor.commit().await; editor.commit();
Ok(()) Ok(())
} }

View File

@ -1,3 +1,5 @@
#![allow(non_snake_case)]
use super::*; use super::*;
// Routing domain here is listed in order of preference, keep in order // Routing domain here is listed in order of preference, keep in order