From 6f6ec298cfd41d94e2cd17a7f9fb0a4489ef4a59 Mon Sep 17 00:00:00 2001
From: John Smith
Date: Fri, 22 Jul 2022 13:05:28 -0400
Subject: [PATCH] move tasks to network manager

---
 veilid-cli/src/command_processor.rs           |   5 +-
 veilid-core/src/attachment_manager.rs         |   7 +
 .../native/utils/network_interfaces/mod.rs    |  21 +-
 veilid-core/src/network_manager/mod.rs        | 294 +++++-----
 veilid-core/src/network_manager/native/mod.rs |   4 +-
 .../native/network_class_discovery.rs         |   4 +-
 veilid-core/src/network_manager/tasks.rs      | 500 ++++++++++++++++++
 veilid-core/src/routing_table/mod.rs          | 209 ++------
 veilid-core/src/routing_table/tasks.rs        | 371 -------------
 veilid-core/src/veilid_api/debug.rs           |  33 ++
 10 files changed, 761 insertions(+), 687 deletions(-)
 create mode 100644 veilid-core/src/network_manager/tasks.rs

diff --git a/veilid-cli/src/command_processor.rs b/veilid-cli/src/command_processor.rs
index 9227cdaa..81d0bb0b 100644
--- a/veilid-cli/src/command_processor.rs
+++ b/veilid-cli/src/command_processor.rs
@@ -183,10 +183,7 @@ change_log_level - change the log level for a tracing layer
         spawn_detached_local(async move {
             match capi.server_debug(rest.unwrap_or_default()).await {
                 Ok(output) => ui.display_string_dialog("Debug Output", output, callback),
-                Err(e) => {
-                    error!("Server command 'debug' failed: {}", e);
-                    ui.send_callback(callback);
-                }
+                Err(e) => ui.display_string_dialog("Debug Error", e.to_string(), callback),
             }
         });
         Ok(())
diff --git a/veilid-core/src/attachment_manager.rs b/veilid-core/src/attachment_manager.rs
index 4d9fa1ab..08176e37 100644
--- a/veilid-core/src/attachment_manager.rs
+++ b/veilid-core/src/attachment_manager.rs
@@ -248,6 +248,13 @@ impl AttachmentManager {
                     break;
                 }
 
+                // see if we need to restart the network
+                if netman.needs_restart() {
+                    info!("Restarting network");
+                    restart = true;
+                    break;
+                }
+
                 self.update_attachment().await;
 
                 // sleep should be at the end in case maintain_peers changes state
diff --git a/veilid-core/src/intf/native/utils/network_interfaces/mod.rs b/veilid-core/src/intf/native/utils/network_interfaces/mod.rs
index ed4633db..79b4cefd 100644
--- a/veilid-core/src/intf/native/utils/network_interfaces/mod.rs
+++ b/veilid-core/src/intf/native/utils/network_interfaces/mod.rs
@@ -354,16 +354,23 @@ impl NetworkInterfaces {
         core::mem::swap(&mut inner.interfaces, &mut last_interfaces);
         inner.valid = true;
 
-        let changed = last_interfaces != inner.interfaces;
-        if changed {
+        if last_interfaces != inner.interfaces {
+            // get last address cache
+            let old_best_addresses = inner.interface_address_cache.clone();
+
+            // redo the address cache
             Self::cache_best_addresses(&mut *inner);
-            trace!(
-                "NetworkInterfaces refreshed: {:#?}?",
-                inner.interface_address_cache
-            );
+            // See if our best addresses have changed
+            if old_best_addresses != inner.interface_address_cache {
+                trace!(
+                    "Network interface addresses changed: {:?}",
+                    inner.interface_address_cache
+                );
+                return Ok(true);
+            }
         }
-        Ok(changed)
+        Ok(false)
     }
     pub fn with_interfaces<F, R>(&self, f: F) -> R
     where
diff --git a/veilid-core/src/network_manager/mod.rs b/veilid-core/src/network_manager/mod.rs
index b9028b57..4941a8a2 100644
--- a/veilid-core/src/network_manager/mod.rs
+++ b/veilid-core/src/network_manager/mod.rs
@@ -10,6 +10,7 @@ mod connection_limits;
 mod connection_manager;
 mod connection_table;
 mod network_connection;
+mod tasks;
 
 pub mod tests;
 
@@ -22,6 +23,7 @@ use connection_handle::*;
 use connection_limits::*;
 use connection_manager::*;
 use dht::*;
+use futures_util::stream::{FuturesUnordered, StreamExt};
 use hashlink::LruCache;
 use intf::*;
 #[cfg(not(target_arch = "wasm32"))]
@@ -42,6 +44,16 @@ pub const IPADDR_MAX_INACTIVE_DURATION_US: u64 = 300_000_000u64; // 5 minutes
 pub const GLOBAL_ADDRESS_CHANGE_DETECTION_COUNT: usize = 3;
 pub const BOOT_MAGIC: &[u8; 4] = b"BOOT";
 
+pub const BOOTSTRAP_TXT_VERSION: u8 = 0;
+
+#[derive(Clone, Debug)]
+pub struct BootstrapRecord {
+    min_version: u8,
+    max_version: u8,
+    dial_info_details: Vec<DialInfoDetail>,
+}
+pub type BootstrapRecordMap = BTreeMap<DHTKey, BootstrapRecord>;
+
 #[derive(Copy, Clone, Debug, Default)]
 pub struct ProtocolConfig {
     pub outbound: ProtocolSet,
@@ -129,6 +141,10 @@ struct NetworkManagerUnlockedInner {
     // Background processes
     rolling_transfers_task: TickTask,
     relay_management_task: TickTask,
+    bootstrap_task: TickTask,
+    peer_minimum_refresh_task: TickTask,
+    ping_validator_task: TickTask,
+    node_info_update_single_future: MustJoinSingleFuture<()>,
 }
 
 #[derive(Clone)]
@@ -152,11 +168,15 @@ impl NetworkManager {
             public_address_check_cache: LruCache::new(8),
         }
     }
-    fn new_unlocked_inner(_config: VeilidConfig) -> NetworkManagerUnlockedInner {
-        //let c = config.get();
+    fn new_unlocked_inner(config: VeilidConfig) -> NetworkManagerUnlockedInner {
+        let c = config.get();
         NetworkManagerUnlockedInner {
             rolling_transfers_task: TickTask::new(ROLLING_TRANSFERS_INTERVAL_SECS),
             relay_management_task: TickTask::new(RELAY_MANAGEMENT_INTERVAL_SECS),
+            bootstrap_task: TickTask::new(1),
+            peer_minimum_refresh_task: TickTask::new_ms(c.network.dht.min_peer_refresh_time_ms),
+            ping_validator_task: TickTask::new(1),
+            node_info_update_single_future: MustJoinSingleFuture::new(),
         }
     }
 
@@ -186,6 +206,31 @@ impl NetworkManager {
                     Box::pin(this2.clone().relay_management_task_routine(s, l, t))
                 });
         }
+        // Set bootstrap tick task
+        {
+            let this2 = this.clone();
+            this.unlocked_inner
+                .bootstrap_task
+                .set_routine(move |s, _l, _t| Box::pin(this2.clone().bootstrap_task_routine(s)));
+        }
+        // Set peer minimum refresh tick task
+        {
+            let this2 = this.clone();
+            this.unlocked_inner
+                .peer_minimum_refresh_task
+                .set_routine(move |s, _l, _t| {
+                    Box::pin(this2.clone().peer_minimum_refresh_task_routine(s))
+                });
+        }
+        // Set ping validator tick task
+        {
+            let this2 = this.clone();
+            this.unlocked_inner
+                .ping_validator_task
+                .set_routine(move |s, l, t| {
+                    Box::pin(this2.clone().ping_validator_task_routine(s, l, t))
+                });
+        }
         this
     }
     pub fn config(&self) -> VeilidConfig {
@@ -298,6 +343,10 @@ impl NetworkManager {
             return Err(e);
         }
 
+        // Inform routing table entries that our dial info has changed
+        self.send_node_info_updates(true).await;
+
+        // Inform api clients that things have changed
         self.send_network_update();
 
         Ok(())
@@ -312,10 +361,32 @@
         if let Err(e) = self.unlocked_inner.rolling_transfers_task.stop().await {
             warn!("rolling_transfers_task not stopped: {}", e);
         }
-        debug!("stopping relay management task task");
+        debug!("stopping relay management task");
         if let Err(e) = self.unlocked_inner.relay_management_task.stop().await {
             warn!("relay_management_task not stopped: {}", e);
         }
+        debug!("stopping bootstrap task");
+        if let Err(e) = self.unlocked_inner.bootstrap_task.stop().await {
+            error!("bootstrap_task not stopped: {}", e);
+        }
+        debug!("stopping peer minimum refresh task");
+        if let Err(e) = self.unlocked_inner.peer_minimum_refresh_task.stop().await {
+            error!("peer_minimum_refresh_task not stopped: {}", e);
+        }
+        debug!("stopping ping_validator task");
+        if let Err(e) = self.unlocked_inner.ping_validator_task.stop().await {
+            error!("ping_validator_task not stopped: {}",
e); + } + debug!("stopping node info update singlefuture"); + if self + .unlocked_inner + .node_info_update_single_future + .join() + .await + .is_err() + { + error!("node_info_update_single_future not stopped"); + } // Shutdown network components if they started up debug!("shutting down network components"); @@ -386,13 +457,9 @@ impl NetworkManager { } } - #[instrument(level = "debug", skip_all, err)] - async fn restart_net(&self, net: Network) -> EyreResult<()> { - net.shutdown().await; - self.send_network_update(); - net.startup().await?; - self.send_network_update(); - Ok(()) + pub fn needs_restart(&self) -> bool { + let net = self.net(); + net.needs_restart() } pub async fn tick(&self) -> EyreResult<()> { @@ -406,18 +473,30 @@ impl NetworkManager { ) }; - // If the network needs to be reset, do it - // if things can't restart, then we fail out of the attachment manager - if net.needs_restart() { - self.restart_net(net.clone()).await?; - } - // Run the rolling transfers task self.unlocked_inner.rolling_transfers_task.tick().await?; // Run the relay management task self.unlocked_inner.relay_management_task.tick().await?; + // If routing table has no live entries, then add the bootstrap nodes to it + let live_entry_count = routing_table.get_entry_count(BucketEntryState::Unreliable); + if live_entry_count == 0 { + self.unlocked_inner.bootstrap_task.tick().await?; + } + + // If we still don't have enough peers, find nodes until we do + let min_peer_count = { + let c = self.config.get(); + c.network.dht.min_peer_count as usize + }; + if live_entry_count < min_peer_count { + self.unlocked_inner.peer_minimum_refresh_task.tick().await?; + } + + // Ping validate some nodes to groom the table + self.unlocked_inner.ping_validator_task.tick().await?; + // Run the routing table tick routing_table.tick().await?; @@ -1313,135 +1392,6 @@ impl NetworkManager { Ok(true) } - // Keep relays assigned and accessible - #[instrument(level = "trace", skip(self), err)] - async fn relay_management_task_routine( - self, - stop_token: StopToken, - _last_ts: u64, - cur_ts: u64, - ) -> EyreResult<()> { - // Get our node's current node info and network class and do the right thing - let routing_table = self.routing_table(); - let node_info = routing_table.get_own_node_info(); - let network_class = self.get_network_class(); - let mut node_info_changed = false; - - // Do we know our network class yet? - if let Some(network_class) = network_class { - // If we already have a relay, see if it is dead, or if we don't need it any more - let has_relay = { - let mut inner = self.inner.lock(); - if let Some(relay_node) = inner.relay_node.clone() { - let state = relay_node.operate(|e| e.state(cur_ts)); - // Relay node is dead or no longer needed - if matches!(state, BucketEntryState::Dead) { - info!("Relay node died, dropping relay {}", relay_node); - inner.relay_node = None; - node_info_changed = true; - false - } else if !node_info.requires_relay() { - info!( - "Relay node no longer required, dropping relay {}", - relay_node - ); - inner.relay_node = None; - node_info_changed = true; - false - } else { - true - } - } else { - false - } - }; - - // Do we need a relay? - if !has_relay && node_info.requires_relay() { - // Do we need an outbound relay? 
-            if network_class.outbound_wants_relay() {
-                // The outbound relay is the host of the PWA
-                if let Some(outbound_relay_peerinfo) = intf::get_outbound_relay_peer().await {
-                    let mut inner = self.inner.lock();
-
-                    // Register new outbound relay
-                    if let Some(nr) = routing_table.register_node_with_signed_node_info(
-                        outbound_relay_peerinfo.node_id.key,
-                        outbound_relay_peerinfo.signed_node_info,
-                    ) {
-                        info!("Outbound relay node selected: {}", nr);
-                        inner.relay_node = Some(nr);
-                        node_info_changed = true;
-                    }
-                }
-            // Otherwise we must need an inbound relay
-            } else {
-                // Find a node in our routing table that is an acceptable inbound relay
-                if let Some(nr) = routing_table.find_inbound_relay(cur_ts) {
-                    let mut inner = self.inner.lock();
-                    info!("Inbound relay node selected: {}", nr);
-                    inner.relay_node = Some(nr);
-                    node_info_changed = true;
-                }
-            }
-        }
-    }
-
-        // Re-send our node info if we selected a relay
-        if node_info_changed {
-            self.routing_table().send_node_info_updates(true).await;
-        }
-
-        Ok(())
-    }
-
-    // Compute transfer statistics for the low level network
-    #[instrument(level = "trace", skip(self), err)]
-    async fn rolling_transfers_task_routine(
-        self,
-        stop_token: StopToken,
-        last_ts: u64,
-        cur_ts: u64,
-    ) -> EyreResult<()> {
-        // log_net!("--- network manager rolling_transfers task");
-        {
-            let inner = &mut *self.inner.lock();
-
-            // Roll the low level network transfer stats for our address
-            inner
-                .stats
-                .self_stats
-                .transfer_stats_accounting
-                .roll_transfers(last_ts, cur_ts, &mut inner.stats.self_stats.transfer_stats);
-
-            // Roll all per-address transfers
-            let mut dead_addrs: HashSet<IpAddr> = HashSet::new();
-            for (addr, stats) in &mut inner.stats.per_address_stats {
-                stats.transfer_stats_accounting.roll_transfers(
-                    last_ts,
-                    cur_ts,
-                    &mut stats.transfer_stats,
-                );
-
-                // While we're here, lets see if this address has timed out
-                if cur_ts - stats.last_seen_ts >= IPADDR_MAX_INACTIVE_DURATION_US {
-                    // it's dead, put it in the dead list
-                    dead_addrs.insert(*addr);
-                }
-            }
-
-            // Remove the dead addresses from our tables
-            for da in &dead_addrs {
-                inner.stats.per_address_stats.remove(da);
-            }
-        }
-
-        // Send update
-        self.send_network_update();
-
-        Ok(())
-    }
-
     // Callbacks from low level network for statistics gathering
     pub fn stats_packet_sent(&self, addr: IpAddr, bytes: u64) {
         let inner = &mut *self.inner.lock();
@@ -1612,4 +1562,58 @@ impl NetworkManager {
             net.reset_network_class();
         }
     }
+
+    // Inform routing table entries that our dial info has changed
+    pub async fn send_node_info_updates(&self, all: bool) {
+        let this = self.clone();
+
+        // Run in background only once
+        let _ = self
+            .clone()
+            .unlocked_inner
+            .node_info_update_single_future
+            .single_spawn(async move {
+                // Only update if we actually have a valid network class
+                if matches!(
+                    this.get_network_class().unwrap_or(NetworkClass::Invalid),
+                    NetworkClass::Invalid
+                ) {
+                    trace!(
+                        "not sending node info update because our network class is not yet valid"
+                    );
+                    return;
+                }
+
+                // Get the list of refs to all nodes to update
+                let cur_ts = intf::get_timestamp();
+                let node_refs = this.routing_table().get_nodes_needing_updates(cur_ts, all);
+
+                // Send the updates
+                log_net!(debug "Sending node info updates to {} nodes", node_refs.len());
+                let mut unord = FuturesUnordered::new();
+                for nr in node_refs {
+                    let rpc = this.rpc_processor();
+                    unord.push(async move {
+                        // Update the node
+                        if let Err(e) = rpc
+                            .rpc_call_node_info_update(Destination::Direct(nr.clone()), None)
+                            .await
+                        {
+                            // Not fatal, but we should be able to see if this is happening
+                            trace!("failed to send node info update to {:?}: {}", nr, e);
+                            return;
+                        }
+
+                        // Mark the node as updated
+                        nr.set_seen_our_node_info();
+                    });
+                }
+
+                // Wait for futures to complete
+                while unord.next().await.is_some() {}
+
+                log_net!(debug "Finished sending node updates");
+            })
+            .await;
+    }
 }
diff --git a/veilid-core/src/network_manager/native/mod.rs b/veilid-core/src/network_manager/native/mod.rs
index 6c29d157..9b4605a9 100644
--- a/veilid-core/src/network_manager/native/mod.rs
+++ b/veilid-core/src/network_manager/native/mod.rs
@@ -281,6 +281,7 @@ impl Network {
         if !self.unlocked_inner.interfaces.refresh().await? {
             return Ok(false);
         }
+        self.inner.lock().network_needs_restart = true;
         Ok(true)
     }
 
@@ -613,9 +614,6 @@ impl Network {
         info!("network started");
         self.inner.lock().network_started = true;
 
-        // Inform routing table entries that our dial info has changed
-        self.routing_table().send_node_info_updates(true).await;
-
         Ok(())
     }
 
diff --git a/veilid-core/src/network_manager/native/network_class_discovery.rs b/veilid-core/src/network_manager/native/network_class_discovery.rs
index 0974c0df..18f96c2e 100644
--- a/veilid-core/src/network_manager/native/network_class_discovery.rs
+++ b/veilid-core/src/network_manager/native/network_class_discovery.rs
@@ -633,6 +633,8 @@ impl Network {
         if network_class.is_some() {
             // Update public dial info
             let routing_table = self.routing_table();
+            let network_manager = self.network_manager();
+
             for ctx in contexts {
                 let inner = ctx.inner.lock();
                 if let Some(pdi) = &inner.detected_public_dial_info {
@@ -650,7 +652,7 @@ impl Network {
             log_net!(debug "network class changed to {:?}", network_class);
 
             // Send updates to everyone
-            routing_table.send_node_info_updates(true).await;
+            network_manager.send_node_info_updates(true).await;
         }
 
         Ok(())
diff --git a/veilid-core/src/network_manager/tasks.rs b/veilid-core/src/network_manager/tasks.rs
new file mode 100644
index 00000000..a399b270
--- /dev/null
+++ b/veilid-core/src/network_manager/tasks.rs
@@ -0,0 +1,500 @@
+use super::*;
+
+use crate::dht::*;
+use crate::xx::*;
+use stop_token::future::FutureExt;
+
+impl NetworkManager {
+    // Bootstrap lookup process
+    #[instrument(level = "trace", skip(self), ret, err)]
+    pub(super) async fn resolve_bootstrap(
+        &self,
+        bootstrap: Vec<String>,
+    ) -> EyreResult<BootstrapRecordMap> {
+        // Resolve from bootstrap root to bootstrap hostnames
+        let mut bsnames = Vec::<String>::new();
+        for bh in bootstrap {
+            // Get TXT record for bootstrap (bootstrap.veilid.net, or similar)
+            let records = intf::txt_lookup(&bh).await?;
+            for record in records {
+                // Split the bootstrap name record by commas
+                for rec in record.split(',') {
+                    let rec = rec.trim();
+                    // If the name specified is fully qualified, go with it
+                    let bsname = if rec.ends_with('.') {
+                        rec.to_string()
+                    }
+                    // If the name is not fully qualified, prepend it to the bootstrap name
+                    else {
+                        format!("{}.{}", rec, bh)
+                    };
+
+                    // Add to the list of bootstrap names to look up
+                    bsnames.push(bsname);
+                }
+            }
+        }
+
+        // Get bootstrap nodes from hostnames concurrently
+        let mut unord = FuturesUnordered::new();
+        for bsname in bsnames {
+            unord.push(async move {
+                // look up bootstrap node txt records
+                let bsnirecords = match intf::txt_lookup(&bsname).await {
+                    Err(e) => {
+                        warn!("bootstrap node txt lookup failed for {}: {}", bsname, e);
+                        return None;
+                    }
+                    Ok(v) => v,
+                };
+                // for each record resolve into key/bootstraprecord pairs
+                let mut bootstrap_records: Vec<(DHTKey, BootstrapRecord)> = Vec::new();
+                for bsnirecord in bsnirecords {
+                    // Bootstrap TXT Record Format Version 0:
+                    // txt_version,min_version,max_version,nodeid,hostname,dialinfoshort*
+                    //
+                    // Split bootstrap node record by commas. Example:
+                    // 0,0,0,7lxDEabK_qgjbe38RtBa3IZLrud84P6NhGP-pRTZzdQ,bootstrap-dev-alpha.veilid.net,T5150,U5150,W5150/ws
+                    let records: Vec<String> = bsnirecord
+                        .trim()
+                        .split(',')
+                        .map(|x| x.trim().to_owned())
+                        .collect();
+                    if records.len() < 6 {
+                        warn!("invalid number of fields in bootstrap txt record");
+                        continue;
+                    }
+
+                    // Bootstrap TXT record version
+                    let txt_version: u8 = match records[0].parse::<u8>() {
+                        Ok(v) => v,
+                        Err(e) => {
+                            warn!(
+                                "invalid txt_version specified in bootstrap node txt record: {}",
+                                e
+                            );
+                            continue;
+                        }
+                    };
+                    if txt_version != BOOTSTRAP_TXT_VERSION {
+                        warn!("unsupported bootstrap txt record version");
+                        continue;
+                    }
+
+                    // Min/Max wire protocol version
+                    let min_version: u8 = match records[1].parse::<u8>() {
+                        Ok(v) => v,
+                        Err(e) => {
+                            warn!(
+                                "invalid min_version specified in bootstrap node txt record: {}",
+                                e
+                            );
+                            continue;
+                        }
+                    };
+                    let max_version: u8 = match records[2].parse::<u8>() {
+                        Ok(v) => v,
+                        Err(e) => {
+                            warn!(
+                                "invalid max_version specified in bootstrap node txt record: {}",
+                                e
+                            );
+                            continue;
+                        }
+                    };
+
+                    // Node Id
+                    let node_id_str = &records[3];
+                    let node_id_key = match DHTKey::try_decode(node_id_str) {
+                        Ok(v) => v,
+                        Err(e) => {
+                            warn!(
+                                "Invalid node id in bootstrap node record {}: {}",
+                                node_id_str, e
+                            );
+                            continue;
+                        }
+                    };
+
+                    // Hostname
+                    let hostname_str = &records[4];
+
+                    // If this is our own node id, then we skip it for bootstrap, in case we are a bootstrap node
+                    if self.routing_table().node_id() == node_id_key {
+                        continue;
+                    }
+
+                    // Resolve each record and store in node dial infos list
+                    let mut bootstrap_record = BootstrapRecord {
+                        min_version,
+                        max_version,
+                        dial_info_details: Vec::new(),
+                    };
+                    for rec in &records[5..] {
+                        let rec = rec.trim();
+                        let dial_infos = match DialInfo::try_vec_from_short(rec, hostname_str) {
+                            Ok(dis) => dis,
+                            Err(e) => {
+                                warn!("Couldn't resolve bootstrap node dial info {}: {}", rec, e);
+                                continue;
+                            }
+                        };
+
+                        for di in dial_infos {
+                            bootstrap_record.dial_info_details.push(DialInfoDetail {
+                                dial_info: di,
+                                class: DialInfoClass::Direct,
+                            });
+                        }
+                    }
+                    bootstrap_records.push((node_id_key, bootstrap_record));
+                }
+                Some(bootstrap_records)
+            });
+        }
+
+        let mut bsmap = BootstrapRecordMap::new();
+        while let Some(bootstrap_records) = unord.next().await {
+            if let Some(bootstrap_records) = bootstrap_records {
+                for (bskey, mut bsrec) in bootstrap_records {
+                    let rec = bsmap.entry(bskey).or_insert_with(|| BootstrapRecord {
+                        min_version: bsrec.min_version,
+                        max_version: bsrec.max_version,
+                        dial_info_details: Vec::new(),
+                    });
+                    rec.dial_info_details.append(&mut bsrec.dial_info_details);
+                }
+            }
+        }
+
+        Ok(bsmap)
+    }
+
+    // 'direct' bootstrap task routine for systems incapable of resolving TXT records, such as browser WASM
+    pub(super) async fn direct_bootstrap_task_routine(
+        self,
+        stop_token: StopToken,
+        bootstrap_dialinfos: Vec<DialInfo>,
+    ) -> EyreResult<()> {
+        let mut unord = FuturesUnordered::new();
+        let routing_table = self.routing_table();
+
+        for bootstrap_di in bootstrap_dialinfos {
+            let peer_info = self.boot_request(bootstrap_di).await?;
+
+            // Got peer info, let's add it to the routing table
+            for pi in peer_info {
+                let k = pi.node_id.key;
+                // Register the node
+                if let Some(nr) =
+                    routing_table.register_node_with_signed_node_info(k, pi.signed_node_info)
+                {
+                    // Add this to our futures to process in parallel
+                    let routing_table = routing_table.clone();
+                    unord.push(
+                        // let's ask bootstrap to find ourselves now
+                        async move { routing_table.reverse_find_node(nr, true).await },
+                    );
+                }
+            }
+        }
+
+        // Wait for all bootstrap operations to complete before we complete the singlefuture
+        while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {}
+
+        Ok(())
+    }
+
+    #[instrument(level = "trace", skip(self), err)]
+    pub(super) async fn bootstrap_task_routine(self, stop_token: StopToken) -> EyreResult<()> {
+        let (bootstrap, bootstrap_nodes) = {
+            let c = self.config.get();
+            (
+                c.network.bootstrap.clone(),
+                c.network.bootstrap_nodes.clone(),
+            )
+        };
+        let routing_table = self.routing_table();
+
+        log_net!(debug "--- bootstrap_task");
+
+        // See if we are specifying a direct dialinfo for bootstrap, if so use the direct mechanism
+        if !bootstrap.is_empty() && bootstrap_nodes.is_empty() {
+            let mut bootstrap_dialinfos = Vec::<DialInfo>::new();
+            for b in &bootstrap {
+                if let Ok(bootstrap_di_vec) = DialInfo::try_vec_from_url(&b) {
+                    for bootstrap_di in bootstrap_di_vec {
+                        bootstrap_dialinfos.push(bootstrap_di);
+                    }
+                }
+            }
+            if !bootstrap_dialinfos.is_empty() {
+                return self
+                    .direct_bootstrap_task_routine(stop_token, bootstrap_dialinfos)
+                    .await;
+            }
+        }
+
+        // If we aren't specifying a bootstrap node list explicitly, then pull from the bootstrap server(s)
+        let bsmap: BootstrapRecordMap = if !bootstrap_nodes.is_empty() {
+            let mut bsmap = BootstrapRecordMap::new();
+            let mut bootstrap_node_dial_infos = Vec::new();
+            for b in bootstrap_nodes {
+                let ndis = NodeDialInfo::from_str(b.as_str())
+                    .wrap_err("Invalid node dial info in bootstrap entry")?;
+                bootstrap_node_dial_infos.push(ndis);
+            }
+            for ndi in bootstrap_node_dial_infos {
+                let node_id = ndi.node_id.key;
+                bsmap
+                    .entry(node_id)
+                    .or_insert_with(|| BootstrapRecord {
+                        min_version: MIN_VERSION,
+                        max_version: MAX_VERSION,
+                        dial_info_details: Vec::new(),
+                    })
+                    .dial_info_details
+                    .push(DialInfoDetail {
+                        dial_info: ndi.dial_info,
+                        class: DialInfoClass::Direct, // Bootstraps are always directly reachable
+                    });
+            }
+            bsmap
+        } else {
+            // Resolve bootstrap servers and recurse their TXT entries
+            self.resolve_bootstrap(bootstrap).await?
+        };
+
+        // Map all bootstrap entries to a single key with multiple dialinfo
+
+        // Run all bootstrap operations concurrently
+        let mut unord = FuturesUnordered::new();
+        for (k, mut v) in bsmap {
+            // Sort dial info so we get the preferred order correct
+            v.dial_info_details.sort();
+
+            log_net!("--- bootstrapping {} with {:?}", k.encode(), &v);
+
+            // Make invalid signed node info (no signature)
+            if let Some(nr) = routing_table.register_node_with_signed_node_info(
+                k,
+                SignedNodeInfo::with_no_signature(NodeInfo {
+                    network_class: NetworkClass::InboundCapable, // Bootstraps are always inbound capable
+                    outbound_protocols: ProtocolSet::empty(), // Bootstraps do not participate in relaying and will not make outbound requests
+                    min_version: v.min_version, // Minimum protocol version specified in txt record
+                    max_version: v.max_version, // Maximum protocol version specified in txt record
+                    dial_info_detail_list: v.dial_info_details, // Dial info is as specified in the bootstrap list
+                    relay_peer_info: None, // Bootstraps never require a relay themselves
+                }),
+            ) {
+                // Add this to our futures to process in parallel
+                let routing_table = routing_table.clone();
+                unord.push(intf::spawn(async move {
+                    // Need VALID signed peer info, so ask bootstrap to find_node of itself
+                    // which will ensure it has the bootstrap's signed peer info as part of the response
+                    let _ = routing_table.find_target(nr.clone()).await;
+
+                    // Ensure we got the signed peer info
+                    if !nr.operate(|e| e.has_valid_signed_node_info()) {
+                        log_net!(warn
+                            "bootstrap at {:?} did not return valid signed node info",
+                            nr
+                        );
+                        // If this node info is invalid, it will time out after being unpingable
+                    } else {
+                        // otherwise this bootstrap is valid, let's ask it to find ourselves now
+                        routing_table.reverse_find_node(nr, true).await
+                    }
+                }));
+            }
+        }
+
+        // Wait for all bootstrap operations to complete before we complete the singlefuture
+        while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {}
+        Ok(())
+    }
+
+    // Ping each node in the routing table if they need to be pinged
+    // to determine their reliability
+    #[instrument(level = "trace", skip(self), err)]
+    pub(super) async fn ping_validator_task_routine(
+        self,
+        stop_token: StopToken,
+        _last_ts: u64,
+        cur_ts: u64,
+    ) -> EyreResult<()> {
+        let rpc = self.rpc_processor();
+        let routing_table = self.routing_table();
+
+        let relay_node_id = self.relay_node().map(|nr| nr.node_id());
+
+        let mut unord = FuturesUnordered::new();
+
+        let node_refs = routing_table.get_nodes_needing_ping(cur_ts, relay_node_id);
+        for nr in node_refs {
+            let rpc = rpc.clone();
+            unord.push(intf::spawn(async move { rpc.rpc_call_status(nr).await }));
+        }
+
+        // Wait for futures to complete
+        while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {}
+
+        Ok(())
+    }
+
+    // Ask our remaining peers to give us more peers before we go
+    // back to the bootstrap servers to keep us from bothering them too much
+    #[instrument(level = "trace", skip(self), err)]
+    pub(super) async fn peer_minimum_refresh_task_routine(
+        self,
+        stop_token: StopToken,
+    ) -> EyreResult<()> {
+        let routing_table = self.routing_table();
+        let cur_ts
= intf::get_timestamp(); + + // get list of all peers we know about, even the unreliable ones, and ask them to find nodes close to our node too + let noderefs = routing_table.get_all_nodes(cur_ts); + + // do peer minimum search concurrently + let mut unord = FuturesUnordered::new(); + for nr in noderefs { + log_net!("--- peer minimum search with {:?}", nr); + let routing_table = routing_table.clone(); + unord.push(intf::spawn(async move { + routing_table.reverse_find_node(nr, false).await + })); + } + while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {} + + Ok(()) + } + + // Keep relays assigned and accessible + #[instrument(level = "trace", skip(self), err)] + pub(super) async fn relay_management_task_routine( + self, + stop_token: StopToken, + _last_ts: u64, + cur_ts: u64, + ) -> EyreResult<()> { + // Get our node's current node info and network class and do the right thing + let routing_table = self.routing_table(); + let node_info = routing_table.get_own_node_info(); + let network_class = self.get_network_class(); + let mut node_info_changed = false; + + // Do we know our network class yet? + if let Some(network_class) = network_class { + // If we already have a relay, see if it is dead, or if we don't need it any more + let has_relay = { + let mut inner = self.inner.lock(); + if let Some(relay_node) = inner.relay_node.clone() { + let state = relay_node.operate(|e| e.state(cur_ts)); + // Relay node is dead or no longer needed + if matches!(state, BucketEntryState::Dead) { + info!("Relay node died, dropping relay {}", relay_node); + inner.relay_node = None; + node_info_changed = true; + false + } else if !node_info.requires_relay() { + info!( + "Relay node no longer required, dropping relay {}", + relay_node + ); + inner.relay_node = None; + node_info_changed = true; + false + } else { + true + } + } else { + false + } + }; + + // Do we need a relay? + if !has_relay && node_info.requires_relay() { + // Do we need an outbound relay? 
+            if network_class.outbound_wants_relay() {
+                // The outbound relay is the host of the PWA
+                if let Some(outbound_relay_peerinfo) = intf::get_outbound_relay_peer().await {
+                    let mut inner = self.inner.lock();
+
+                    // Register new outbound relay
+                    if let Some(nr) = routing_table.register_node_with_signed_node_info(
+                        outbound_relay_peerinfo.node_id.key,
+                        outbound_relay_peerinfo.signed_node_info,
+                    ) {
+                        info!("Outbound relay node selected: {}", nr);
+                        inner.relay_node = Some(nr);
+                        node_info_changed = true;
+                    }
+                }
+            // Otherwise we must need an inbound relay
+            } else {
+                // Find a node in our routing table that is an acceptable inbound relay
+                if let Some(nr) = routing_table.find_inbound_relay(cur_ts) {
+                    let mut inner = self.inner.lock();
+                    info!("Inbound relay node selected: {}", nr);
+                    inner.relay_node = Some(nr);
+                    node_info_changed = true;
+                }
+            }
+        }
+    }
+
+        // Re-send our node info if we selected a relay
+        if node_info_changed {
+            self.send_node_info_updates(true).await;
+        }
+
+        Ok(())
+    }
+
+    // Compute transfer statistics for the low level network
+    #[instrument(level = "trace", skip(self), err)]
+    pub(super) async fn rolling_transfers_task_routine(
+        self,
+        stop_token: StopToken,
+        last_ts: u64,
+        cur_ts: u64,
+    ) -> EyreResult<()> {
+        // log_net!("--- network manager rolling_transfers task");
+        {
+            let inner = &mut *self.inner.lock();
+
+            // Roll the low level network transfer stats for our address
+            inner
+                .stats
+                .self_stats
+                .transfer_stats_accounting
+                .roll_transfers(last_ts, cur_ts, &mut inner.stats.self_stats.transfer_stats);
+
+            // Roll all per-address transfers
+            let mut dead_addrs: HashSet<IpAddr> = HashSet::new();
+            for (addr, stats) in &mut inner.stats.per_address_stats {
+                stats.transfer_stats_accounting.roll_transfers(
+                    last_ts,
+                    cur_ts,
+                    &mut stats.transfer_stats,
+                );
+
+                // While we're here, let's see if this address has timed out
+                if cur_ts - stats.last_seen_ts >= IPADDR_MAX_INACTIVE_DURATION_US {
+                    // it's dead, put it in the dead list
+                    dead_addrs.insert(*addr);
+                }
+            }
+
+            // Remove the dead addresses from our tables
+            for da in &dead_addrs {
+                inner.stats.per_address_stats.remove(da);
+            }
+        }
+
+        // Send update
+        self.send_network_update();
+
+        Ok(())
+    }
+}
diff --git a/veilid-core/src/routing_table/mod.rs b/veilid-core/src/routing_table/mod.rs
index 85260b8b..b894affb 100644
--- a/veilid-core/src/routing_table/mod.rs
+++ b/veilid-core/src/routing_table/mod.rs
@@ -11,27 +11,15 @@ use crate::network_manager::*;
 use crate::rpc_processor::*;
 use crate::xx::*;
 use crate::*;
-use alloc::str::FromStr;
 use bucket::*;
 pub use bucket_entry::*;
 pub use debug::*;
 pub use find_nodes::*;
-use futures_util::stream::{FuturesUnordered, StreamExt};
 pub use node_ref::*;
 pub use stats_accounting::*;
 
 //////////////////////////////////////////////////////////////////////////
 
-pub const BOOTSTRAP_TXT_VERSION: u8 = 0;
-
-#[derive(Clone, Debug)]
-pub struct BootstrapRecord {
-    min_version: u8,
-    max_version: u8,
-    dial_info_details: Vec<DialInfoDetail>,
-}
-pub type BootstrapRecordMap = BTreeMap<DHTKey, BootstrapRecord>;
-
 #[derive(Debug, Copy, Clone, PartialEq, PartialOrd, Ord, Eq)]
 pub enum RoutingDomain {
     PublicInternet,
@@ -70,10 +58,6 @@ pub struct RoutingTableHealth {
 struct RoutingTableUnlockedInner {
     // Background processes
     rolling_transfers_task: TickTask,
-    bootstrap_task: TickTask,
-    peer_minimum_refresh_task: TickTask,
-    ping_validator_task: TickTask,
-    node_info_update_single_future: MustJoinSingleFuture<()>,
     kick_buckets_task: TickTask,
 }
 
@@ -100,14 +84,10 @@ impl RoutingTable {
             self_transfer_stats: TransferStatsDownUp::default(),
        }
    }
-    fn new_unlocked_inner(config: VeilidConfig) -> RoutingTableUnlockedInner {
-        let c = config.get();
+    fn new_unlocked_inner(_config: VeilidConfig) -> RoutingTableUnlockedInner {
+        //let c = config.get();
         RoutingTableUnlockedInner {
             rolling_transfers_task: TickTask::new(ROLLING_TRANSFERS_INTERVAL_SECS),
-            bootstrap_task: TickTask::new(1),
-            peer_minimum_refresh_task: TickTask::new_ms(c.network.dht.min_peer_refresh_time_ms),
-            ping_validator_task: TickTask::new(1),
-            node_info_update_single_future: MustJoinSingleFuture::new(),
             kick_buckets_task: TickTask::new(1),
         }
     }
@@ -127,31 +107,7 @@ impl RoutingTable {
                     Box::pin(this2.clone().rolling_transfers_task_routine(s, l, t))
                 });
         }
-        // Set bootstrap tick task
-        {
-            let this2 = this.clone();
-            this.unlocked_inner
-                .bootstrap_task
-                .set_routine(move |s, _l, _t| Box::pin(this2.clone().bootstrap_task_routine(s)));
-        }
-        // Set peer minimum refresh tick task
-        {
-            let this2 = this.clone();
-            this.unlocked_inner
-                .peer_minimum_refresh_task
-                .set_routine(move |s, _l, _t| {
-                    Box::pin(this2.clone().peer_minimum_refresh_task_routine(s))
-                });
-        }
-        // Set ping validator tick task
-        {
-            let this2 = this.clone();
-            this.unlocked_inner
-                .ping_validator_task
-                .set_routine(move |s, l, t| {
-                    Box::pin(this2.clone().ping_validator_task_routine(s, l, t))
-                });
-        }
+
         // Set kick buckets tick task
         {
             let this2 = this.clone();
@@ -402,27 +358,9 @@
         if let Err(e) = self.unlocked_inner.rolling_transfers_task.stop().await {
             error!("rolling_transfers_task not stopped: {}", e);
         }
-        debug!("stopping bootstrap task");
-        if let Err(e) = self.unlocked_inner.bootstrap_task.stop().await {
-            error!("bootstrap_task not stopped: {}", e);
-        }
-        debug!("stopping peer minimum refresh task");
-        if let Err(e) = self.unlocked_inner.peer_minimum_refresh_task.stop().await {
-            error!("peer_minimum_refresh_task not stopped: {}", e);
-        }
-        debug!("stopping ping_validator task");
-        if let Err(e) = self.unlocked_inner.ping_validator_task.stop().await {
-            error!("ping_validator_task not stopped: {}", e);
-        }
-        debug!("stopping node info update singlefuture");
-        if self
-            .unlocked_inner
-            .node_info_update_single_future
-            .join()
-            .await
-            .is_err()
-        {
-            error!("node_info_update_single_future not stopped");
+        debug!("stopping kick buckets task");
+        if let Err(e) = self.unlocked_inner.kick_buckets_task.stop().await {
+            error!("kick_buckets_task not stopped: {}", e);
         }
 
         *self.inner.write() = Self::new_inner(self.network_manager());
@@ -430,72 +368,6 @@
         debug!("finished routing table terminate");
     }
 
-    // Inform routing table entries that our dial info has changed
-    pub async fn send_node_info_updates(&self, all: bool) {
-        let this = self.clone();
-
-        // Run in background only once
-        let _ = self
-            .clone()
-            .unlocked_inner
-            .node_info_update_single_future
-            .single_spawn(async move {
-                // Only update if we actually have a valid network class
-                let netman = this.network_manager();
-                if matches!(
-                    netman.get_network_class().unwrap_or(NetworkClass::Invalid),
-                    NetworkClass::Invalid
-                ) {
-                    trace!(
-                        "not sending node info update because our network class is not yet valid"
-                    );
-                    return;
-                }
-
-                // Get the list of refs to all nodes to update
-                let node_refs = {
-                    let inner = this.inner.read();
-                    let mut node_refs = Vec::<NodeRef>::with_capacity(inner.bucket_entry_count);
-                    let cur_ts = intf::get_timestamp();
-                    Self::with_entries(&*inner, cur_ts, BucketEntryState::Unreliable, |k, v| {
-                        // Only update nodes that haven't seen our node info yet
-                        if all || !v.with(|e| e.has_seen_our_node_info()) {
-                            node_refs.push(NodeRef::new(this.clone(), k, v, None));
-                        }
-                        Option::<()>::None
-                    });
-                    node_refs
-                };
-
-                // Send the updates
-                log_rtab!(debug "Sending node info updates to {} nodes", node_refs.len());
-                let mut unord = FuturesUnordered::new();
-                for nr in node_refs {
-                    let rpc = this.rpc_processor();
-                    unord.push(async move {
-                        // Update the node
-                        if let Err(e) = rpc
-                            .rpc_call_node_info_update(Destination::Direct(nr.clone()), None)
-                            .await
-                        {
-                            // Not fatal, but we should be able to see if this is happening
-                            trace!("failed to send node info update to {:?}: {}", nr, e);
-                            return;
-                        }
-
-                        // Mark the node as updated
-                        nr.set_seen_our_node_info();
-                    });
-                }
-
-                // Wait for futures to complete
-                while unord.next().await.is_some() {}
-
-                log_rtab!(debug "Finished sending node updates");
-            })
-            .await;
-    }
-
     // Attempt to empty the routing table
     // should only be performed when there are no node_refs (detached)
     pub fn purge(&self) {
@@ -539,7 +411,12 @@ impl RoutingTable {
             .unwrap()
     }
 
-    fn get_entry_count(inner: &RoutingTableInner, min_state: BucketEntryState) -> usize {
+    pub fn get_entry_count(&self, min_state: BucketEntryState) -> usize {
+        let inner = self.inner.read();
+        Self::get_entry_count_inner(&*inner, min_state)
+    }
+
+    fn get_entry_count_inner(inner: &RoutingTableInner, min_state: BucketEntryState) -> usize {
         let mut count = 0usize;
         let cur_ts = intf::get_timestamp();
         Self::with_entries(inner, cur_ts, min_state, |_, _| {
@@ -567,6 +444,46 @@ impl RoutingTable {
         None
     }
 
+    pub fn get_nodes_needing_updates(&self, cur_ts: u64, all: bool) -> Vec<NodeRef> {
+        let inner = self.inner.read();
+        let mut node_refs = Vec::<NodeRef>::with_capacity(inner.bucket_entry_count);
+        Self::with_entries(&*inner, cur_ts, BucketEntryState::Unreliable, |k, v| {
+            // Only update nodes that haven't seen our node info yet
+            if all || !v.with(|e| e.has_seen_our_node_info()) {
+                node_refs.push(NodeRef::new(self.clone(), k, v, None));
+            }
+            Option::<()>::None
+        });
+        node_refs
+    }
+
+    pub fn get_nodes_needing_ping(
+        &self,
+        cur_ts: u64,
+        relay_node_id: Option<DHTKey>,
+    ) -> Vec<NodeRef> {
+        let inner = self.inner.read();
+        let mut node_refs = Vec::<NodeRef>::with_capacity(inner.bucket_entry_count);
+        Self::with_entries(&*inner, cur_ts, BucketEntryState::Unreliable, |k, v| {
+            // Only ping nodes that need to be pinged
+            if v.with(|e| e.needs_ping(&k, cur_ts, relay_node_id)) {
+                node_refs.push(NodeRef::new(self.clone(), k, v, None));
+            }
+            Option::<()>::None
+        });
+        node_refs
+    }
+
+    pub fn get_all_nodes(&self, cur_ts: u64) -> Vec<NodeRef> {
+        let inner = self.inner.read();
+        let mut node_refs = Vec::<NodeRef>::with_capacity(inner.bucket_entry_count);
+        Self::with_entries(&*inner, cur_ts, BucketEntryState::Unreliable, |k, v| {
+            node_refs.push(NodeRef::new(self.clone(), k, v, None));
+            Option::<()>::None
+        });
+        node_refs
+    }
+
     fn queue_bucket_kick(&self, node_id: DHTKey) {
         let mut inner = self.inner.write();
         let idx = Self::find_bucket_index(&*inner, node_id);
@@ -612,7 +529,7 @@ impl RoutingTable {
         // Kick the bucket
         inner.kick_queue.insert(idx);
 
-        log_rtab!(debug "Routing table now has {} nodes, {} live", cnt, Self::get_entry_count(&mut *inner, BucketEntryState::Unreliable));
+        log_rtab!(debug "Routing table now has {} nodes, {} live", cnt, Self::get_entry_count_inner(&mut *inner, BucketEntryState::Unreliable));
 
         nr
     }
@@ -684,26 +601,6 @@ impl RoutingTable {
         // Do rolling transfers every ROLLING_TRANSFERS_INTERVAL_SECS secs
         self.unlocked_inner.rolling_transfers_task.tick().await?;
 
-        // If routing table has no live entries, then add the bootstrap nodes to it
-        let live_entry_count =
-            Self::get_entry_count(&*self.inner.read(), BucketEntryState::Unreliable);
-
-        if live_entry_count == 0 {
-            self.unlocked_inner.bootstrap_task.tick().await?;
-        }
-
-        // If we still don't have enough peers, find nodes until we do
-        let min_peer_count = {
-            let c = self.config.get();
-            c.network.dht.min_peer_count as usize
-        };
-        if live_entry_count < min_peer_count {
-            self.unlocked_inner.peer_minimum_refresh_task.tick().await?;
-        }
-
-        // Ping validate some nodes to groom the table
-        self.unlocked_inner.ping_validator_task.tick().await?;
-
         // Kick buckets task
         let kick_bucket_queue_count = { self.inner.read().kick_queue.len() };
         if kick_bucket_queue_count > 0 {
diff --git a/veilid-core/src/routing_table/tasks.rs b/veilid-core/src/routing_table/tasks.rs
index bb8b9ebb..8284aa05 100644
--- a/veilid-core/src/routing_table/tasks.rs
+++ b/veilid-core/src/routing_table/tasks.rs
@@ -1,9 +1,5 @@
 use super::*;
-
-use crate::dht::*;
 use crate::xx::*;
-use crate::*;
-use stop_token::future::FutureExt;
 
 impl RoutingTable {
     // Compute transfer statistics to determine how 'fast' a node is
@@ -32,373 +28,6 @@ impl RoutingTable {
         Ok(())
     }
 
-    // Bootstrap lookup process
-    #[instrument(level = "trace", skip(self), ret, err)]
-    pub(super) async fn resolve_bootstrap(
-        &self,
-        bootstrap: Vec<String>,
-    ) -> EyreResult<BootstrapRecordMap> {
-        // Resolve from bootstrap root to bootstrap hostnames
-        let mut bsnames = Vec::<String>::new();
-        for bh in bootstrap {
-            // Get TXT record for bootstrap (bootstrap.veilid.net, or similar)
-            let records = intf::txt_lookup(&bh).await?;
-            for record in records {
-                // Split the bootstrap name record by commas
-                for rec in record.split(',') {
-                    let rec = rec.trim();
-                    // If the name specified is fully qualified, go with it
-                    let bsname = if rec.ends_with('.') {
-                        rec.to_string()
-                    }
-                    // If the name is not fully qualified, prepend it to the bootstrap name
-                    else {
-                        format!("{}.{}", rec, bh)
-                    };
-
-                    // Add to the list of bootstrap name to look up
-                    bsnames.push(bsname);
-                }
-            }
-        }
-
-        // Get bootstrap nodes from hostnames concurrently
-        let mut unord = FuturesUnordered::new();
-        for bsname in bsnames {
-            unord.push(async move {
-                // look up boostrap node txt records
-                let bsnirecords = match intf::txt_lookup(&bsname).await {
-                    Err(e) => {
-                        warn!("bootstrap node txt lookup failed for {}: {}", bsname, e);
-                        return None;
-                    }
-                    Ok(v) => v,
-                };
-                // for each record resolve into key/bootstraprecord pairs
-                let mut bootstrap_records: Vec<(DHTKey, BootstrapRecord)> = Vec::new();
-                for bsnirecord in bsnirecords {
-                    // Bootstrap TXT Record Format Version 0:
-                    // txt_version,min_version,max_version,nodeid,hostname,dialinfoshort*
-                    //
-                    // Split bootstrap node record by commas. Example:
-                    // 0,0,0,7lxDEabK_qgjbe38RtBa3IZLrud84P6NhGP-pRTZzdQ,bootstrap-dev-alpha.veilid.net,T5150,U5150,W5150/ws
-                    let records: Vec<String> = bsnirecord
-                        .trim()
-                        .split(',')
-                        .map(|x| x.trim().to_owned())
-                        .collect();
-                    if records.len() < 6 {
-                        warn!("invalid number of fields in bootstrap txt record");
-                        continue;
-                    }
-
-                    // Bootstrap TXT record version
-                    let txt_version: u8 = match records[0].parse::<u8>() {
-                        Ok(v) => v,
-                        Err(e) => {
-                            warn!(
-                                "invalid txt_version specified in bootstrap node txt record: {}",
-                                e
-                            );
-                            continue;
-                        }
-                    };
-                    if txt_version != BOOTSTRAP_TXT_VERSION {
-                        warn!("unsupported bootstrap txt record version");
-                        continue;
-                    }
-
-                    // Min/Max wire protocol version
-                    let min_version: u8 = match records[1].parse::<u8>() {
-                        Ok(v) => v,
-                        Err(e) => {
-                            warn!(
-                                "invalid min_version specified in bootstrap node txt record: {}",
-                                e
-                            );
-                            continue;
-                        }
-                    };
-                    let max_version: u8 = match records[2].parse::<u8>() {
-                        Ok(v) => v,
-                        Err(e) => {
-                            warn!(
-                                "invalid max_version specified in bootstrap node txt record: {}",
-                                e
-                            );
-                            continue;
-                        }
-                    };
-
-                    // Node Id
-                    let node_id_str = &records[3];
-                    let node_id_key = match DHTKey::try_decode(node_id_str) {
-                        Ok(v) => v,
-                        Err(e) => {
-                            warn!(
-                                "Invalid node id in bootstrap node record {}: {}",
-                                node_id_str, e
-                            );
-                            continue;
-                        }
-                    };
-
-                    // Hostname
-                    let hostname_str = &records[4];
-
-                    // If this is our own node id, then we skip it for bootstrap, in case we are a bootstrap node
-                    if self.node_id() == node_id_key {
-                        continue;
-                    }
-
-                    // Resolve each record and store in node dial infos list
-                    let mut bootstrap_record = BootstrapRecord {
-                        min_version,
-                        max_version,
-                        dial_info_details: Vec::new(),
-                    };
-                    for rec in &records[5..] {
-                        let rec = rec.trim();
-                        let dial_infos = match DialInfo::try_vec_from_short(rec, hostname_str) {
-                            Ok(dis) => dis,
-                            Err(e) => {
-                                warn!("Couldn't resolve bootstrap node dial info {}: {}", rec, e);
-                                continue;
-                            }
-                        };
-
-                        for di in dial_infos {
-                            bootstrap_record.dial_info_details.push(DialInfoDetail {
-                                dial_info: di,
-                                class: DialInfoClass::Direct,
-                            });
-                        }
-                    }
-                    bootstrap_records.push((node_id_key, bootstrap_record));
-                }
-                Some(bootstrap_records)
-            });
-        }
-
-        let mut bsmap = BootstrapRecordMap::new();
-        while let Some(bootstrap_records) = unord.next().await {
-            if let Some(bootstrap_records) = bootstrap_records {
-                for (bskey, mut bsrec) in bootstrap_records {
-                    let rec = bsmap.entry(bskey).or_insert_with(|| BootstrapRecord {
-                        min_version: bsrec.min_version,
-                        max_version: bsrec.max_version,
-                        dial_info_details: Vec::new(),
-                    });
-                    rec.dial_info_details.append(&mut bsrec.dial_info_details);
-                }
-            }
-        }
-
-        Ok(bsmap)
-    }
-
-    // 'direct' bootstrap task routine for systems incapable of resolving TXT records, such as browser WASM
-    async fn direct_bootstrap_task_routine(
-        self,
-        stop_token: StopToken,
-        bootstrap_dialinfos: Vec<DialInfo>,
-    ) -> EyreResult<()> {
-        let network_manager = self.network_manager();
-
-        let mut unord = FuturesUnordered::new();
-        for bootstrap_di in bootstrap_dialinfos {
-            let peer_info = network_manager.boot_request(bootstrap_di).await?;
-
-            // Got peer info, let's add it to the routing table
-            for pi in peer_info {
-                let k = pi.node_id.key;
-                // Register the node
-                if let Some(nr) = self.register_node_with_signed_node_info(k, pi.signed_node_info) {
-                    // Add this our futures to process in parallel
-                    unord.push(
-                        // lets ask bootstrap to find ourselves now
-                        self.reverse_find_node(nr, true),
-                    );
-                }
-            }
-        }
-
-        // Wait for all bootstrap operations to complete before we complete the singlefuture
-        while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {}
-
-        Ok(())
-    }
-
-    #[instrument(level = "trace", skip(self), err)]
-    pub(super) async fn bootstrap_task_routine(self, stop_token: StopToken) -> EyreResult<()> {
-        let (bootstrap, bootstrap_nodes) = {
-            let c = self.config.get();
-            (
-                c.network.bootstrap.clone(),
-                c.network.bootstrap_nodes.clone(),
-            )
-        };
-
-        log_rtab!(debug "--- bootstrap_task");
-
-        // See if we are specifying a direct dialinfo for bootstrap, if so use the direct mechanism
-        if !bootstrap.is_empty() && bootstrap_nodes.is_empty() {
-            let mut bootstrap_dialinfos = Vec::<DialInfo>::new();
-            for b in &bootstrap {
-                if let Ok(bootstrap_di_vec) = DialInfo::try_vec_from_url(&b) {
-                    for bootstrap_di in bootstrap_di_vec {
-                        bootstrap_dialinfos.push(bootstrap_di);
-                    }
-                }
-            }
-            if bootstrap_dialinfos.len() > 0 {
-                return self
-                    .direct_bootstrap_task_routine(stop_token, bootstrap_dialinfos)
-                    .await;
-            }
-        }
-
-        // If we aren't specifying a bootstrap node list explicitly, then pull from the bootstrap server(s)
-        let bsmap: BootstrapRecordMap = if !bootstrap_nodes.is_empty() {
-            let mut bsmap = BootstrapRecordMap::new();
-            let mut bootstrap_node_dial_infos = Vec::new();
-            for b in bootstrap_nodes {
-                let ndis = NodeDialInfo::from_str(b.as_str())
-                    .wrap_err("Invalid node dial info in bootstrap entry")?;
-                bootstrap_node_dial_infos.push(ndis);
-            }
-            for ndi in bootstrap_node_dial_infos {
-                let node_id = ndi.node_id.key;
-                bsmap
-                    .entry(node_id)
-                    .or_insert_with(|| BootstrapRecord {
-                        min_version: MIN_VERSION,
-                        max_version: MAX_VERSION,
-                        dial_info_details: Vec::new(),
-                    })
-                    .dial_info_details
-                    .push(DialInfoDetail {
-                        dial_info: ndi.dial_info,
-                        class: DialInfoClass::Direct, // Bootstraps are always directly reachable
-                    });
-            }
-            bsmap
-        } else {
-            // Resolve bootstrap servers and recurse their TXT entries
-            self.resolve_bootstrap(bootstrap).await?
-        };
-
-        // Map all bootstrap entries to a single key with multiple dialinfo
-
-        // Run all bootstrap operations concurrently
-        let mut unord = FuturesUnordered::new();
-        for (k, mut v) in bsmap {
-            // Sort dial info so we get the preferred order correct
-            v.dial_info_details.sort();
-
-            log_rtab!("--- bootstrapping {} with {:?}", k.encode(), &v);
-
-            // Make invalid signed node info (no signature)
-            if let Some(nr) = self.register_node_with_signed_node_info(
-                k,
-                SignedNodeInfo::with_no_signature(NodeInfo {
-                    network_class: NetworkClass::InboundCapable, // Bootstraps are always inbound capable
-                    outbound_protocols: ProtocolSet::empty(), // Bootstraps do not participate in relaying and will not make outbound requests
-                    min_version: v.min_version, // Minimum protocol version specified in txt record
-                    max_version: v.max_version, // Maximum protocol version specified in txt record
-                    dial_info_detail_list: v.dial_info_details, // Dial info is as specified in the bootstrap list
-                    relay_peer_info: None, // Bootstraps never require a relay themselves
-                }),
-            ) {
-                // Add this our futures to process in parallel
-                let this = self.clone();
-                unord.push(async move {
-                    // Need VALID signed peer info, so ask bootstrap to find_node of itself
-                    // which will ensure it has the bootstrap's signed peer info as part of the response
-                    let _ = this.find_target(nr.clone()).await;
-
-                    // Ensure we got the signed peer info
-                    if !nr.operate(|e| e.has_valid_signed_node_info()) {
-                        log_rtab!(warn
-                            "bootstrap at {:?} did not return valid signed node info",
-                            nr
-                        );
-                        // If this node info is invalid, it will time out after being unpingable
-                    } else {
-                        // otherwise this bootstrap is valid, lets ask it to find ourselves now
-                        this.reverse_find_node(nr, true).await
-                    }
-                });
-            }
-        }
-
-        // Wait for all bootstrap operations to complete before we complete the singlefuture
-        while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {}
-        Ok(())
-    }
-
-    // Ping each node in the routing table if they need to be pinged
-    // to determine their reliability
-    #[instrument(level = "trace", skip(self), err)]
-    pub(super) async fn ping_validator_task_routine(
-        self,
-        stop_token: StopToken,
-        _last_ts: u64,
-        cur_ts: u64,
-    ) -> EyreResult<()> {
-        let rpc = self.rpc_processor();
-        let netman = self.network_manager();
-        let relay_node_id = netman.relay_node().map(|nr| nr.node_id());
-
-        let mut unord = FuturesUnordered::new();
-        {
-            let inner = self.inner.read();
-
-            Self::with_entries(&*inner, cur_ts, BucketEntryState::Unreliable, |k, v| {
-                if v.with(|e| e.needs_ping(&k, cur_ts, relay_node_id)) {
-                    let nr = NodeRef::new(self.clone(), k, v, None);
-                    unord.push(intf::spawn(rpc.clone().rpc_call_status(nr)));
-                }
-                Option::<()>::None
-            });
-        }
-
-        // Wait for futures to complete
-        while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {}
-
-        Ok(())
-    }
-
-    // Ask our remaining peers to give us more peers before we go
-    // back to the bootstrap servers to keep us from bothering them too much
-    #[instrument(level = "trace", skip(self), err)]
-    pub(super) async fn peer_minimum_refresh_task_routine(
-        self,
-        stop_token: StopToken,
-    ) -> EyreResult<()> {
-        // get list of all peers we know about, even the unreliable ones, and ask them to find nodes close to our node too
-        let noderefs = {
-            let inner = self.inner.read();
-            let mut noderefs = Vec::<NodeRef>::with_capacity(inner.bucket_entry_count);
-            let cur_ts = intf::get_timestamp();
-            Self::with_entries(&*inner, cur_ts, BucketEntryState::Unreliable, |k, v| {
-                noderefs.push(NodeRef::new(self.clone(), k, v, None));
-                Option::<()>::None
-            });
-            noderefs
-        };
-
-        // do peer minimum search concurrently
-        let mut unord = FuturesUnordered::new();
-        for nr in noderefs {
-            log_rtab!("--- peer minimum search with {:?}", nr);
-            unord.push(self.reverse_find_node(nr, false));
-        }
-        while let Ok(Some(_)) = unord.next().timeout_at(stop_token.clone()).await {}
-
-        Ok(())
-    }
-
     // Kick the queued buckets in the routing table to free dead nodes if necessary
     // Attempts to keep the size of the routing table down to the bucket depth
     #[instrument(level = "trace", skip(self), err)]
diff --git a/veilid-core/src/veilid_api/debug.rs b/veilid-core/src/veilid_api/debug.rs
index 259f6947..d9b39dc8 100644
--- a/veilid-core/src/veilid_api/debug.rs
+++ b/veilid-core/src/veilid_api/debug.rs
@@ -160,6 +160,36 @@ impl VeilidAPI {
         Ok("Config value set".to_owned())
     }
 
+    async fn debug_restart(&self, args: String) -> Result<String, VeilidAPIError> {
+        let args = args.trim_start();
+        if args.is_empty() {
+            return Err(VeilidAPIError::missing_argument("debug_restart", "arg_0"));
+        }
+        let (arg, _rest) = args.split_once(' ').unwrap_or((args, ""));
+        // let rest = rest.trim_start().to_owned();
+
+        if arg == "network" {
+            // Must be attached
+            if matches!(
+                self.get_state().await?.attachment.state,
+                AttachmentState::Detached
+            ) {
+                apibail_internal!("Must be attached to restart network");
+            }
+
+            let netman = self.network_manager()?;
+            netman.net().restart_network();
+
+            Ok("Network restarted".to_owned())
+        } else {
+            Err(VeilidAPIError::invalid_argument(
+                "debug_restart",
+                "arg_1",
+                arg,
+            ))
+        }
+    }
+
     async fn debug_purge(&self, args: String) -> Result<String, VeilidAPIError> {
         let args: Vec<String> = args.split_whitespace().map(|s| s.to_owned()).collect();
         if !args.is_empty() {
@@ -226,6 +256,7 @@
         purge buckets
         attach
         detach
+        restart network
 "#
         .to_owned())
     }
@@ -261,6 +292,8 @@
             self.debug_detach(rest).await
         } else if arg == "config" {
             self.debug_config(rest).await
+        } else if arg == "restart" {
+            self.debug_restart(rest).await
         } else {
             Ok(">>> Unknown command\n".to_owned())
        }
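
For reference (not part of the patch): the Version 0 bootstrap TXT record that resolve_bootstrap() parses in the new network_manager/tasks.rs has the field layout txt_version,min_version,max_version,nodeid,hostname,dialinfoshort*. The sketch below is a minimal, standard-library-only walk through that layout; ParsedBootstrapRecord and parse_bootstrap_txt are illustrative stand-ins, not veilid-core APIs — the real code decodes the node id with DHTKey::try_decode and expands each short dial info against the hostname with DialInfo::try_vec_from_short.

// Illustrative stand-in for the Version 0 bootstrap TXT record parsing above.
#[derive(Debug)]
struct ParsedBootstrapRecord {
    min_version: u8,
    max_version: u8,
    node_id: String,               // left as the encoded string in this sketch
    hostname: String,              // base hostname used to resolve short dial info forms
    short_dial_infos: Vec<String>, // e.g. "T5150" (TCP), "U5150" (UDP), "W5150/ws" (WebSocket)
}

fn parse_bootstrap_txt(record: &str) -> Result<ParsedBootstrapRecord, String> {
    // Field layout: txt_version,min_version,max_version,nodeid,hostname,dialinfoshort*
    let fields: Vec<&str> = record.trim().split(',').map(str::trim).collect();
    if fields.len() < 6 {
        return Err("invalid number of fields in bootstrap txt record".to_string());
    }
    let txt_version: u8 = fields[0]
        .parse()
        .map_err(|e| format!("invalid txt_version: {}", e))?;
    if txt_version != 0 {
        return Err("unsupported bootstrap txt record version".to_string());
    }
    Ok(ParsedBootstrapRecord {
        min_version: fields[1]
            .parse()
            .map_err(|e| format!("invalid min_version: {}", e))?,
        max_version: fields[2]
            .parse()
            .map_err(|e| format!("invalid max_version: {}", e))?,
        node_id: fields[3].to_string(),
        hostname: fields[4].to_string(),
        short_dial_infos: fields[5..].iter().map(|s| s.to_string()).collect(),
    })
}

fn main() {
    // The example record quoted in the comments of resolve_bootstrap()
    let rec = "0,0,0,7lxDEabK_qgjbe38RtBa3IZLrud84P6NhGP-pRTZzdQ,bootstrap-dev-alpha.veilid.net,T5150,U5150,W5150/ws";
    println!("{:#?}", parse_bootstrap_txt(rec).unwrap());
}

A record like this is what each per-hostname TXT lookup returns; resolve_bootstrap() then merges records that share a node id into a single BootstrapRecord holding the union of their dial info details.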