many fixes for bootstrap and public internet connectivity

This commit is contained in:
John Smith
2022-05-24 17:13:52 -04:00
parent 9a54ee052c
commit 424ceedfd8
26 changed files with 606 additions and 320 deletions

View File

@@ -23,6 +23,9 @@ const UNRELIABLE_PING_INTERVAL_SECS: u32 = 5;
// remains valid, as well as to make sure we remain in any relay node's routing table
const KEEPALIVE_PING_INTERVAL_SECS: u32 = 20;
// How many times do we try to ping a never-reached node before we call it dead
const NEVER_REACHED_PING_COUNT: u32 = 3;
// Do not change order here, it will mess up other sorts
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum BucketEntryState {
@@ -58,7 +61,6 @@ impl BucketEntry {
transfer_stats_accounting: TransferStatsAccounting::new(),
peer_stats: PeerStats {
time_added: now,
last_seen: None,
rpc_stats: RPCStats::default(),
latency: None,
transfer: TransferStatsDownUp::default(),
@@ -129,7 +131,7 @@ impl BucketEntry {
pub fn has_valid_signed_node_info(&self) -> bool {
if let Some(sni) = &self.opt_signed_node_info {
sni.signature.valid
sni.is_valid()
} else {
false
}
@@ -213,8 +215,13 @@ impl BucketEntry {
///// state machine handling
pub(super) fn check_reliable(&self, cur_ts: u64) -> bool {
// if we have had consecutive ping replies for longer that UNRELIABLE_PING_SPAN_SECS
match self.peer_stats.rpc_stats.first_consecutive_answer_time {
// If we have had any failures to send, this is not reliable
if self.peer_stats.rpc_stats.failed_to_send > 0 {
return false;
}
// if we have seen the node consistently for longer that UNRELIABLE_PING_SPAN_SECS
match self.peer_stats.rpc_stats.first_consecutive_seen_ts {
None => false,
Some(ts) => {
cur_ts.saturating_sub(ts) >= (UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64)
@@ -222,10 +229,15 @@ impl BucketEntry {
}
}
pub(super) fn check_dead(&self, cur_ts: u64) -> bool {
// If we have failured to send NEVER_REACHED_PING_COUNT times in a row, the node is dead
if self.peer_stats.rpc_stats.failed_to_send >= NEVER_REACHED_PING_COUNT {
return true;
}
// if we have not heard from the node at all for the duration of the unreliable ping span
// a node is not dead if we haven't heard from it yet
match self.peer_stats.last_seen {
None => false,
// a node is not dead if we haven't heard from it yet,
// but we give it NEVER_REACHED_PING_COUNT chances to ping before we say it's dead
match self.peer_stats.rpc_stats.last_seen_ts {
None => self.peer_stats.rpc_stats.recent_lost_answers < NEVER_REACHED_PING_COUNT,
Some(ts) => {
cur_ts.saturating_sub(ts) >= (UNRELIABLE_PING_SPAN_SECS as u64 * 1000000u64)
}
@@ -233,9 +245,20 @@ impl BucketEntry {
}
fn needs_constant_ping(&self, cur_ts: u64, interval: u64) -> bool {
match self.peer_stats.last_seen {
// If we have not either seen the node, nor asked it a question in the last 'interval'
// then we should ping it
let latest_contact_time = self
.peer_stats
.rpc_stats
.last_seen_ts
.max(self.peer_stats.rpc_stats.last_question);
match latest_contact_time {
None => true,
Some(last_seen) => cur_ts.saturating_sub(last_seen) >= (interval * 1000000u64),
Some(latest_contact_time) => {
// If we haven't done anything with this node in 'interval' seconds
cur_ts.saturating_sub(latest_contact_time) >= (interval * 1000000u64)
}
}
}
@@ -259,19 +282,26 @@ impl BucketEntry {
match state {
BucketEntryState::Reliable => {
// If we are in a reliable state, we need a ping on an exponential scale
match self.peer_stats.last_seen {
None => true,
Some(last_seen) => {
let first_consecutive_answer_time = self
.peer_stats
.rpc_stats
.first_consecutive_answer_time
.unwrap();
let start_of_reliable_time = first_consecutive_answer_time
let latest_contact_time = self
.peer_stats
.rpc_stats
.last_seen_ts
.max(self.peer_stats.rpc_stats.last_question);
match latest_contact_time {
None => {
error!("Peer is reliable, but not seen!");
true
}
Some(latest_contact_time) => {
let first_consecutive_seen_ts =
self.peer_stats.rpc_stats.first_consecutive_seen_ts.unwrap();
let start_of_reliable_time = first_consecutive_seen_ts
+ ((UNRELIABLE_PING_SPAN_SECS - UNRELIABLE_PING_INTERVAL_SECS) as u64
* 1_000_000u64);
let reliable_cur = cur_ts.saturating_sub(start_of_reliable_time);
let reliable_last = last_seen.saturating_sub(start_of_reliable_time);
let reliable_last =
latest_contact_time.saturating_sub(start_of_reliable_time);
retry_falloff_log(
reliable_last,
@@ -292,37 +322,44 @@ impl BucketEntry {
}
pub(super) fn touch_last_seen(&mut self, ts: u64) {
// If we've heard from the node at all, we can always restart our lost ping count
self.peer_stats.rpc_stats.recent_lost_answers = 0;
// Mark the node as seen
self.peer_stats.last_seen = Some(ts);
if self
.peer_stats
.rpc_stats
.first_consecutive_seen_ts
.is_none()
{
self.peer_stats.rpc_stats.first_consecutive_seen_ts = Some(ts);
}
self.peer_stats.rpc_stats.last_seen_ts = Some(ts);
}
pub(super) fn state_debug_info(&self, cur_ts: u64) -> String {
let first_consecutive_answer_time = if let Some(first_consecutive_answer_time) =
self.peer_stats.rpc_stats.first_consecutive_answer_time
let first_consecutive_seen_ts = if let Some(first_consecutive_seen_ts) =
self.peer_stats.rpc_stats.first_consecutive_seen_ts
{
format!(
"{}s ago",
timestamp_to_secs(cur_ts.saturating_sub(first_consecutive_answer_time))
timestamp_to_secs(cur_ts.saturating_sub(first_consecutive_seen_ts))
)
} else {
"never".to_owned()
};
let last_seen = if let Some(last_seen) = self.peer_stats.last_seen {
let last_seen_ts_str = if let Some(last_seen_ts) = self.peer_stats.rpc_stats.last_seen_ts {
format!(
"{}s ago",
timestamp_to_secs(cur_ts.saturating_sub(last_seen))
timestamp_to_secs(cur_ts.saturating_sub(last_seen_ts))
)
} else {
"never".to_owned()
};
format!(
"state: {:?}, first_consecutive_answer_time: {}, last_seen: {}",
"state: {:?}, first_consecutive_seen_ts: {}, last_seen_ts: {}",
self.state(cur_ts),
first_consecutive_answer_time,
last_seen
first_consecutive_seen_ts,
last_seen_ts_str
)
}
@@ -332,11 +369,10 @@ impl BucketEntry {
pub(super) fn question_sent(&mut self, ts: u64, bytes: u64, expects_answer: bool) {
self.transfer_stats_accounting.add_up(bytes);
self.peer_stats.rpc_stats.messages_sent += 1;
self.peer_stats.rpc_stats.failed_to_send = 0;
if expects_answer {
self.peer_stats.rpc_stats.questions_in_flight += 1;
}
if self.peer_stats.last_seen.is_none() {
self.peer_stats.last_seen = Some(ts);
self.peer_stats.rpc_stats.last_question = Some(ts);
}
}
pub(super) fn question_rcvd(&mut self, ts: u64, bytes: u64) {
@@ -344,33 +380,40 @@ impl BucketEntry {
self.peer_stats.rpc_stats.messages_rcvd += 1;
self.touch_last_seen(ts);
}
pub(super) fn answer_sent(&mut self, _ts: u64, bytes: u64) {
pub(super) fn answer_sent(&mut self, bytes: u64) {
self.transfer_stats_accounting.add_up(bytes);
self.peer_stats.rpc_stats.messages_sent += 1;
self.peer_stats.rpc_stats.failed_to_send = 0;
}
pub(super) fn answer_rcvd(&mut self, send_ts: u64, recv_ts: u64, bytes: u64) {
self.transfer_stats_accounting.add_down(bytes);
self.peer_stats.rpc_stats.messages_rcvd += 1;
self.peer_stats.rpc_stats.questions_in_flight -= 1;
if self
.peer_stats
.rpc_stats
.first_consecutive_answer_time
.is_none()
{
self.peer_stats.rpc_stats.first_consecutive_answer_time = Some(recv_ts);
}
self.record_latency(recv_ts - send_ts);
self.touch_last_seen(recv_ts);
self.peer_stats.rpc_stats.recent_lost_answers = 0;
}
pub(super) fn question_lost(&mut self, _ts: u64) {
self.peer_stats.rpc_stats.first_consecutive_answer_time = None;
pub(super) fn question_lost(&mut self) {
self.peer_stats.rpc_stats.first_consecutive_seen_ts = None;
self.peer_stats.rpc_stats.questions_in_flight -= 1;
self.peer_stats.rpc_stats.recent_lost_answers += 1;
}
pub(super) fn failed_to_send(&mut self, ts: u64, expects_answer: bool) {
if expects_answer {
self.peer_stats.rpc_stats.last_question = Some(ts);
}
self.peer_stats.rpc_stats.failed_to_send += 1;
self.peer_stats.rpc_stats.first_consecutive_seen_ts = None;
}
}
impl Drop for BucketEntry {
fn drop(&mut self) {
assert_eq!(self.ref_count, 0);
if self.ref_count != 0 {
panic!(
"bucket entry dropped with non-zero refcount: {:#?}",
self.node_info()
)
}
}
}