various fixes, including node_ref last_connection sorting problem

This commit is contained in:
John Smith 2022-10-04 11:27:38 -04:00
parent 0a01c0d23e
commit 4b2164a546
16 changed files with 740 additions and 517 deletions

936
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -77,4 +77,5 @@ rustup target add aarch64-linux-android armv7-linux-androideabi i686-linux-andro
cargo install wasm-bindgen-cli
# Ensure packages are installed
sudo apt-get install libc6-dev-i386 libc6:i386 libncurses5:i386 libstdc++6:i386 lib32z1 libbz2-1.0:i386 openjdk-11-jdk llvm wabt capnproto
sudo apt-get install libc6-dev-i386 libc6:i386 libncurses5:i386 libstdc++6:i386 lib32z1 libbz2-1.0:i386 openjdk-11-jdk llvm wabt capnproto protobuf-compiler

View File

@ -114,5 +114,5 @@ if [ "$BREW_USER" == "" ]; then
BREW_USER=`whoami`
fi
fi
sudo -H -u $BREW_USER brew install capnp cmake wabt llvm
sudo -H -u $BREW_USER brew install capnp cmake wabt llvm protobuf

View File

@ -26,6 +26,7 @@ struct ConnectionManagerArc {
connection_initial_timeout_ms: u32,
connection_inactivity_timeout_ms: u32,
connection_table: ConnectionTable,
address_lock_table: AsyncTagLockTable<SocketAddr>,
inner: Mutex<Option<ConnectionManagerInner>>,
}
impl core::fmt::Debug for ConnectionManagerArc {
@ -69,6 +70,7 @@ impl ConnectionManager {
connection_initial_timeout_ms,
connection_inactivity_timeout_ms,
connection_table: ConnectionTable::new(config),
address_lock_table: AsyncTagLockTable::new(),
inner: Mutex::new(None),
}
}
@ -196,7 +198,7 @@ impl ConnectionManager {
}
// Returns a network connection if one already is established
#[instrument(level = "trace", skip(self), ret)]
//#[instrument(level = "trace", skip(self), ret)]
pub fn get_connection(&self, descriptor: ConnectionDescriptor) -> Option<ConnectionHandle> {
self.arc
.connection_table
@ -236,11 +238,6 @@ impl ConnectionManager {
did_kill
}
/// Lock remote address
// async fn lock_remote_address(&self, remote_addr: SocketAddr) -> {
// }
/// Called when we want to create a new connection or get the current one that already exists
/// This will kill off any connections that are in conflict with the new connection to be made
/// in order to make room for the new connection in the system's connection table
@ -251,18 +248,17 @@ impl ConnectionManager {
local_addr: Option<SocketAddr>,
dial_info: DialInfo,
) -> EyreResult<NetworkResult<ConnectionHandle>> {
warn!(
// Async lock on the remote address for atomicity per remote
let peer_address = dial_info.to_peer_address();
let remote_addr = peer_address.to_socket_addr();
let _lock_guard = self.arc.address_lock_table.lock_tag(remote_addr);
log_net!(
"== get_or_create_connection local_addr={:?} dial_info={:?}",
local_addr.green(),
dial_info.green()
);
// Make a connection descriptor for this dialinfo
let peer_address = dial_info.to_peer_address();
// Async lock on the remote address for atomicity
//let _lock_guard = self.lock_remote_address(peer_address.to_socket_addr());
// Kill off any possibly conflicting connections
let did_kill = self.kill_off_colliding_connections(&dial_info).await;
let mut retry_count = if did_kill { 2 } else { 0 };
@ -299,6 +295,22 @@ impl ConnectionManager {
}
Err(e) => {
if retry_count == 0 {
// Try one last time to return a connection from the table, in case
// an 'accept' happened at literally the same time as our connect
if let Some(conn) = self
.arc
.connection_table
.get_last_connection_by_remote(peer_address)
{
log_net!(
"== Returning existing connection in race local_addr={:?} peer_address={:?}",
local_addr.green(),
peer_address.green()
);
return Ok(NetworkResult::Value(conn));
}
return Err(e).wrap_err("failed to connect");
}
}

View File

@ -144,7 +144,7 @@ impl ConnectionTable {
let mut out_conn = None;
if inner.conn_by_id[protocol_index].len() > inner.max_connections[protocol_index] {
if let Some((lruk, lru_conn)) = inner.conn_by_id[protocol_index].remove_lru() {
debug!("connection lru out: {:?}", lru_conn);
log_net!(debug "connection lru out: {:?}", lru_conn);
out_conn = Some(lru_conn);
Self::remove_connection_records(&mut *inner, lruk);
}
@ -158,7 +158,8 @@ impl ConnectionTable {
Ok(out_conn)
}
#[instrument(level = "trace", skip(self), ret)]
//#[instrument(level = "trace", skip(self), ret)]
#[allow(dead_code)]
pub fn get_connection_by_id(&self, id: NetworkConnectionId) -> Option<ConnectionHandle> {
let mut inner = self.inner.lock();
let protocol_index = *inner.protocol_index_by_id.get(&id)?;
@ -166,7 +167,7 @@ impl ConnectionTable {
Some(out.get_handle())
}
#[instrument(level = "trace", skip(self), ret)]
//#[instrument(level = "trace", skip(self), ret)]
pub fn get_connection_by_descriptor(
&self,
descriptor: ConnectionDescriptor,
@ -179,7 +180,7 @@ impl ConnectionTable {
Some(out.get_handle())
}
#[instrument(level = "trace", skip(self), ret)]
//#[instrument(level = "trace", skip(self), ret)]
pub fn get_last_connection_by_remote(&self, remote: PeerAddress) -> Option<ConnectionHandle> {
let mut inner = self.inner.lock();
@ -189,7 +190,8 @@ impl ConnectionTable {
Some(out.get_handle())
}
#[instrument(level = "trace", skip(self), ret)]
//#[instrument(level = "trace", skip(self), ret)]
#[allow(dead_code)]
pub fn get_connection_ids_by_remote(&self, remote: PeerAddress) -> Vec<NetworkConnectionId> {
let inner = self.inner.lock();
inner

View File

@ -1949,47 +1949,53 @@ impl NetworkManager {
.clone()
.unlocked_inner
.node_info_update_single_future
.single_spawn(async move {
// Only update if we actually have valid signed node info for this routing domain
if !this.routing_table().has_valid_own_node_info(routing_domain) {
trace!(
.single_spawn(
async move {
// Only update if we actually have valid signed node info for this routing domain
if !this.routing_table().has_valid_own_node_info(routing_domain) {
trace!(
"not sending node info update because our network class is not yet valid"
);
return;
return;
}
// Get the list of refs to all nodes to update
let cur_ts = intf::get_timestamp();
let node_refs =
this.routing_table()
.get_nodes_needing_updates(routing_domain, cur_ts, all);
// Send the updates
log_net!(debug "Sending node info updates to {} nodes", node_refs.len());
let mut unord = FuturesUnordered::new();
for nr in node_refs {
let rpc = this.rpc_processor();
unord.push(
async move {
// Update the node
if let Err(e) = rpc
.rpc_call_node_info_update(nr.clone(), routing_domain)
.await
{
// Not fatal, but we should be able to see if this is happening
trace!("failed to send node info update to {:?}: {}", nr, e);
return;
}
// Mark the node as having seen our node info
nr.set_seen_our_node_info(routing_domain);
}
.instrument(Span::current()),
);
}
// Wait for futures to complete
while unord.next().await.is_some() {}
log_rtab!(debug "Finished sending node updates");
}
// Get the list of refs to all nodes to update
let cur_ts = intf::get_timestamp();
let node_refs =
this.routing_table()
.get_nodes_needing_updates(routing_domain, cur_ts, all);
// Send the updates
log_net!(debug "Sending node info updates to {} nodes", node_refs.len());
let mut unord = FuturesUnordered::new();
for nr in node_refs {
let rpc = this.rpc_processor();
unord.push(async move {
// Update the node
if let Err(e) = rpc
.rpc_call_node_info_update(nr.clone(), routing_domain)
.await
{
// Not fatal, but we should be able to see if this is happening
trace!("failed to send node info update to {:?}: {}", nr, e);
return;
}
// Mark the node as having seen our node info
nr.set_seen_our_node_info(routing_domain);
});
}
// Wait for futures to complete
while unord.next().await.is_some() {}
log_rtab!(debug "Finished sending node updates");
})
.instrument(Span::current()),
)
.await;
}
}

View File

@ -255,6 +255,13 @@ impl DiscoveryContext {
{
return Some(external_mapped_dial_info);
} else {
warn!("UPNP port mapping succeeded but port {}/{} is still unreachable.\nYou may need to add a local firewall allowed port on this machine.\n",
local_port, match llpt {
LowLevelProtocolType::UDP => "udp",
LowLevelProtocolType::TCP => "tcp",
}
);
// release the mapping if we're still unreachable
let _ = self
.net
@ -628,6 +635,7 @@ impl Network {
}
Some(vec![udpv4_context])
}
.instrument(trace_span!("do_public_dial_info_check UDPv4"))
.boxed(),
);
}
@ -647,6 +655,7 @@ impl Network {
}
Some(vec![udpv6_context])
}
.instrument(trace_span!("do_public_dial_info_check UDPv6"))
.boxed(),
);
}
@ -669,6 +678,7 @@ impl Network {
}
Some(vec![tcpv4_context])
}
.instrument(trace_span!("do_public_dial_info_check TCPv4"))
.boxed(),
);
}
@ -688,6 +698,7 @@ impl Network {
}
Some(vec![wsv4_context])
}
.instrument(trace_span!("do_public_dial_info_check WSv4"))
.boxed(),
);
}
@ -710,6 +721,7 @@ impl Network {
}
Some(vec![tcpv6_context])
}
.instrument(trace_span!("do_public_dial_info_check TCPv6"))
.boxed(),
);
}
@ -729,6 +741,7 @@ impl Network {
}
Some(vec![wsv6_context])
}
.instrument(trace_span!("do_public_dial_info_check WSv6"))
.boxed(),
);
}

View File

@ -109,11 +109,11 @@ impl Network {
};
// XXX
warn!(
"DEBUGACCEPT: local={} remote={}",
tcp_stream.local_addr().unwrap(),
tcp_stream.peer_addr().unwrap(),
);
// warn!(
// "DEBUGACCEPT: local={} remote={}",
// tcp_stream.local_addr().unwrap(),
// tcp_stream.peer_addr().unwrap(),
// );
let listener_state = listener_state.clone();
let connection_manager = connection_manager.clone();

View File

@ -84,7 +84,7 @@ impl Network {
}
}
}
};
}.instrument(Span::current());
protocol_handlers_unordered.push(ph_future);
}

View File

@ -185,7 +185,7 @@ pub async fn nonblocking_connect(
let socket2_addr = socket2::SockAddr::from(addr);
// XXX
let bind_local_addr = socket.local_addr().unwrap().as_socket().unwrap();
//let bind_local_addr = socket.local_addr().unwrap().as_socket().unwrap();
// Connect to the remote address
match socket.connect(&socket2_addr) {
@ -197,24 +197,24 @@ pub async fn nonblocking_connect(
}
.map_err(|e| {
// XXX
warn!(
"DEBUGCONNECT XXXFAILXXX: bind={} local={} remote={}\nbacktrace={:?}",
bind_local_addr,
socket.local_addr().unwrap().as_socket().unwrap(),
addr,
backtrace::Backtrace::new(),
);
// warn!(
// "DEBUGCONNECT XXXFAILXXX: bind={} local={} remote={}\nbacktrace={:?}",
// bind_local_addr,
// socket.local_addr().unwrap().as_socket().unwrap(),
// addr,
// backtrace::Backtrace::new(),
// );
e
})?;
// XXX
warn!(
"DEBUGCONNECT: bind={} local={} remote={}\nbacktrace={:?}",
bind_local_addr,
socket.local_addr().unwrap().as_socket().unwrap(),
addr,
backtrace::Backtrace::new(),
);
// warn!(
// "DEBUGCONNECT: bind={} local={} remote={}\nbacktrace={:?}",
// bind_local_addr,
// socket.local_addr().unwrap().as_socket().unwrap(),
// addr,
// backtrace::Backtrace::new(),
// );
let async_stream = Async::new(std::net::TcpStream::from(socket))?;

View File

@ -210,11 +210,13 @@ impl NetworkConnection {
Ok(NetworkResult::Value(out))
}
/// Returns a copy of this connection's current statistics.
///
/// The `stats` lock is held only for the duration of the clone, so the
/// returned value is an independent snapshot.
#[allow(dead_code)]
pub fn stats(&self) -> NetworkConnectionStats {
    self.stats.lock().clone()
}
/// Returns the value of this connection's `established_time` field.
// NOTE(review): presumably a timestamp recorded when the connection was set up;
// the unit is not visible from this accessor -- confirm against the constructor.
#[allow(dead_code)]
pub fn established_time(&self) -> u64 {
self.established_time
}
@ -260,10 +262,11 @@ impl NetworkConnection {
need_sender = false;
let sender_fut = receiver.recv_async().then(|res| async {
match res {
Ok((span_id, message)) => {
Ok((_span_id, message)) => {
let recv_span = span!(parent: None, Level::TRACE, "process_connection recv");
recv_span.follows_from(span_id);
let recv_span = span!(Level::TRACE, "process_connection recv");
// xxx: causes crash (Missing otel data span extensions)
// recv_span.follows_from(span_id);
// send the packet
if let Err(e) = Self::send_internal(

View File

@ -246,7 +246,7 @@ impl ReceiptManager {
if let Some(callback) =
Self::perform_callback(ReceiptEvent::Expired, &mut expired_record_mut)
{
callbacks.push(callback)
callbacks.push(callback.instrument(Span::current()))
}
}

View File

@ -264,32 +264,62 @@ impl BucketEntryInner {
self.last_connections.clear();
}
// Gets the best 'last connection' that matches a set of routing domain, protocol types and address types
pub(super) fn last_connection(
// Gets the 'last connection' that matches a specific connection key
// pub(super) fn last_connection(
// &self,
// protocol_type: ProtocolType,
// address_type: AddressType,
// ) -> Option<(ConnectionDescriptor, u64)> {
// let key = LastConnectionKey(protocol_type, address_type);
// self.last_connections.get(&key).cloned()
// }
// Gets all the 'last connections' that match a particular filter
pub(super) fn last_connections(
&self,
routing_table_inner: &RoutingTableInner,
node_ref_filter: Option<NodeRefFilter>,
) -> Option<(ConnectionDescriptor, u64)> {
// Iterate peer scopes and protocol types and address type in order to ensure we pick the preferred protocols if all else is the same
let nrf = node_ref_filter.unwrap_or_default();
for pt in nrf.dial_info_filter.protocol_type_set {
for at in nrf.dial_info_filter.address_type_set {
let key = LastConnectionKey(pt, at);
if let Some(v) = self.last_connections.get(&key) {
// Verify this connection could be in the filtered routing domain
let address = v.0.remote_address().address();
if let Some(rd) =
RoutingTable::routing_domain_for_address_inner(routing_table_inner, address)
{
if nrf.routing_domain_set.contains(rd) {
return Some(*v);
filter: Option<NodeRefFilter>,
) -> Vec<(ConnectionDescriptor, u64)> {
let mut out: Vec<(ConnectionDescriptor, u64)> = self
.last_connections
.iter()
.filter_map(|(k, v)| {
let include = if let Some(filter) = &filter {
let remote_address = v.0.remote_address().address();
if let Some(routing_domain) = RoutingTable::routing_domain_for_address_inner(
routing_table_inner,
remote_address,
) {
if filter.routing_domain_set.contains(routing_domain)
&& filter.dial_info_filter.protocol_type_set.contains(k.0)
&& filter.dial_info_filter.address_type_set.contains(k.1)
{
// matches filter
true
} else {
// does not match filter
false
}
} else {
// no valid routing domain
false
}
} else {
// no filter
true
};
if include {
Some(v.clone())
} else {
None
}
}
}
None
})
.collect();
// Sort with newest timestamps first
out.sort_by(|a, b| b.1.cmp(&a.1));
out
}
pub fn set_min_max_version(&mut self, min_max_version: (u8, u8)) {
self.min_max_version = Some(min_max_version);
}

View File

@ -318,24 +318,29 @@ impl NodeRef {
}
pub fn last_connection(&self) -> Option<ConnectionDescriptor> {
// Get the last connection and the last time we saw anything with this connection
let (last_connection, last_seen) =
self.operate(|rti, e| e.last_connection(rti, self.filter.clone()))?;
// Get the last connections and the last time we saw anything with this connection
// Filtered first and then sorted by most recent
let last_connections = self.operate(|rti, e| e.last_connections(rti, self.filter.clone()));
// Should we check the connection table?
if last_connection.protocol_type().is_connection_oriented() {
// Look the connection up in the connection manager and see if it's still there
let connection_manager = self.routing_table.network_manager().connection_manager();
connection_manager.get_connection(last_connection)?;
} else {
// If this is not connection oriented, then we check our last seen time
// to see if this mapping has expired (beyond our timeout)
let cur_ts = intf::get_timestamp();
if (last_seen + (CONNECTIONLESS_TIMEOUT_SECS as u64 * 1_000_000u64)) < cur_ts {
return None;
// Do some checks to ensure these are possibly still 'live'
for (last_connection, last_seen) in last_connections {
// Should we check the connection table?
if last_connection.protocol_type().is_connection_oriented() {
// Look the connection up in the connection manager and see if it's still there
let connection_manager = self.routing_table.network_manager().connection_manager();
if connection_manager.get_connection(last_connection).is_some() {
return Some(last_connection);
}
} else {
// If this is not connection oriented, then we check our last seen time
// to see if this mapping has expired (beyond our timeout)
let cur_ts = intf::get_timestamp();
if (last_seen + (CONNECTIONLESS_TIMEOUT_SECS as u64 * 1_000_000u64)) >= cur_ts {
return Some(last_connection);
}
}
}
Some(last_connection)
None
}
pub fn clear_last_connections(&self) {

View File

@ -941,13 +941,12 @@ impl RPCProcessor {
stop_token: StopToken,
receiver: flume::Receiver<(Option<Id>, RPCMessageEncoded)>,
) {
while let Ok(Ok((span_id, msg))) =
while let Ok(Ok((_span_id, msg))) =
receiver.recv_async().timeout_at(stop_token.clone()).await
{
let rpc_worker_span = span!(parent: None, Level::TRACE, "rpc_worker");
//let rpc_worker_span = span!(Level::TRACE, "rpc_worker");
// fixme: causes crashes? "Missing otel data span extensions"??
rpc_worker_span.follows_from(span_id);
let rpc_worker_span = span!(parent: None, Level::TRACE, "rpc_worker recv");
// xxx: causes crash (Missing otel data span extensions)
// rpc_worker_span.follows_from(span_id);
let _ = self
.process_rpc_message(msg)
.instrument(rpc_worker_span)

View File

@ -127,8 +127,8 @@ where
let (_span_id, ret) = res.take_value().unwrap();
let end_ts = intf::get_timestamp();
// fixme: causes crashes? "Missing otel data span extensions"??
//Span::current().follows_from(span_id);
//xxx: causes crash (Missing otel data span extensions)
// Span::current().follows_from(span_id);
(ret, end_ts - start_ts)
}))