Skip to content

Commit

Permalink
Merge branch 'KillConsistentlyBadNode' into rc-2024.10.3
Browse files Browse the repository at this point in the history
  • Loading branch information
jacderida committed Oct 21, 2024
2 parents a53dad4 + 90a8f26 commit 08f091b
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 12 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions sn_node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ sn_registers = { path = "../sn_registers", version = "0.4.0-rc.1" }
sn_transfers = { path = "../sn_transfers", version = "0.20.0-rc.1" }
sn_service_management = { path = "../sn_service_management", version = "0.4.0-rc.1" }
sn_evm = { path = "../sn_evm", version = "0.1.1-rc.1" }
sysinfo = { version = "0.30.8", default-features = false }
thiserror = "1.0.23"
tokio = { version = "1.32.0", features = [
"io-util",
Expand Down
78 changes: 69 additions & 9 deletions sn_node/src/bin/safenode/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use sn_logging::{Level, LogFormat, LogOutputDest, ReloadHandle};
use sn_node::{Marker, NodeBuilder, NodeEvent, NodeEventsReceiver};
use sn_peers_acquisition::PeersArgs;
use sn_protocol::{
node::get_safenode_root_dir, node_rpc::NodeCtrl, version::IDENTIFY_PROTOCOL_STR,
node::get_safenode_root_dir, node_rpc::{NodeCtrl, StopResult}, version::IDENTIFY_PROTOCOL_STR,
};
use std::{
env,
Expand All @@ -34,6 +34,7 @@ use std::{
process::Command,
time::Duration,
};
use sysinfo::{self, System};
use tokio::{
runtime::Runtime,
sync::{broadcast::error::RecvError, mpsc},
Expand Down Expand Up @@ -380,13 +381,65 @@ You can check your reward balance by running:
if let Err(err) = ctrl_tx_clone
.send(NodeCtrl::Stop {
delay: Duration::from_secs(1),
cause: eyre!("Ctrl-C received!"),
result: StopResult::Error(eyre!("Ctrl-C received!")),
})
.await
{
error!("Failed to send node control msg to safenode bin main thread: {err}");
}
});
let ctrl_tx_clone_cpu = ctrl_tx.clone();
// Monitor host CPU usage
tokio::spawn(async move {
use rand::{thread_rng, Rng};

const CPU_CHECK_INTERVAL: Duration = Duration::from_secs(60);
const CPU_USAGE_THRESHOLD: f32 = 50.0;
const HIGH_CPU_CONSECUTIVE_LIMIT: u8 = 5;
const NODE_STOP_DELAY: Duration = Duration::from_secs(1);
const INITIAL_DELAY_MIN_S: u64 = 10;
const INITIAL_DELAY_MAX_S: u64 =
HIGH_CPU_CONSECUTIVE_LIMIT as u64 * CPU_CHECK_INTERVAL.as_secs();
const JITTER_MIN_S: u64 = 1;
const JITTER_MAX_S: u64 = 15;

let mut sys = System::new_all();

let mut high_cpu_count: u8 = 0;

// Random initial delay between 1 and 5 minutes
let initial_delay =
Duration::from_secs(thread_rng().gen_range(INITIAL_DELAY_MIN_S..=INITIAL_DELAY_MAX_S));
tokio::time::sleep(initial_delay).await;

loop {
sys.refresh_cpu();
let cpu_usage = sys.global_cpu_info().cpu_usage();

if cpu_usage > CPU_USAGE_THRESHOLD {
high_cpu_count += 1;
} else {
high_cpu_count = 0;
}

if high_cpu_count >= HIGH_CPU_CONSECUTIVE_LIMIT {
if let Err(err) = ctrl_tx_clone_cpu
.send(NodeCtrl::Stop {
delay: NODE_STOP_DELAY,
result: StopResult::Success(format!("Excess host CPU %{CPU_USAGE_THRESHOLD} detected for {HIGH_CPU_CONSECUTIVE_LIMIT} consecutive minutes!")),
})
.await
{
error!("Failed to send node control msg to safenode bin main thread: {err}");
}
break;
}

// Add jitter to the interval
let jitter = Duration::from_secs(thread_rng().gen_range(JITTER_MIN_S..=JITTER_MAX_S));
tokio::time::sleep(CPU_CHECK_INTERVAL + jitter).await;
}
});

// Start up gRPC interface if enabled by user
if let Some(addr) = rpc {
Expand Down Expand Up @@ -422,12 +475,21 @@ You can check your reward balance by running:

break Ok(res);
}
Some(NodeCtrl::Stop { delay, cause }) => {
Some(NodeCtrl::Stop { delay, result }) => {
let msg = format!("Node is stopping in {delay:?}...");
info!("{msg}");
println!("{msg} Node log path: {log_output_dest}");
sleep(delay).await;
return Err(cause);
match result {
StopResult::Success(message) => {
info!("Node stopped successfully: {}", message);
return Ok(None);
}
StopResult::Error(cause) => {
error!("Node stopped with error: {}", cause);
return Err(cause);
}
}
}
Some(NodeCtrl::Update(_delay)) => {
// TODO: implement self-update once safenode app releases are published again
Expand All @@ -450,7 +512,7 @@ fn monitor_node_events(mut node_events_rx: NodeEventsReceiver, ctrl_tx: mpsc::Se
if let Err(err) = ctrl_tx
.send(NodeCtrl::Stop {
delay: Duration::from_secs(1),
cause: eyre!("Node events channel closed!"),
result: StopResult::Error(eyre!("Node events channel closed!")),
})
.await
{
Expand All @@ -464,13 +526,11 @@ fn monitor_node_events(mut node_events_rx: NodeEventsReceiver, ctrl_tx: mpsc::Se
if let Err(err) = ctrl_tx
.send(NodeCtrl::Stop {
delay: Duration::from_secs(1),
cause: eyre!("Node terminated due to: {reason:?}"),
result: StopResult::Error(eyre!("Node terminated due to: {reason:?}")),
})
.await
{
error!(
"Failed to send node control msg to safenode bin main thread: {err}"
);
error!("Failed to send node control msg to safenode bin main thread: {err}");
break;
}
}
Expand Down
4 changes: 2 additions & 2 deletions sn_node/src/bin/safenode/rpc_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
use eyre::{ErrReport, Result};
use sn_logging::ReloadHandle;
use sn_node::RunningNode;
use sn_protocol::node_rpc::NodeCtrl;
use sn_protocol::node_rpc::{NodeCtrl, StopResult};
use sn_protocol::safenode_proto::{
k_buckets_response,
safe_node_server::{SafeNode, SafeNodeServer},
Expand Down Expand Up @@ -202,7 +202,7 @@ impl SafeNode for SafeNodeRpcService {
};

let delay = Duration::from_millis(request.get_ref().delay_millis);
match self.ctrl_tx.send(NodeCtrl::Stop { delay, cause }).await {
match self.ctrl_tx.send(NodeCtrl::Stop { delay, result: StopResult::Success(cause.to_string()) }).await {
Ok(()) => Ok(Response::new(StopResponse {})),
Err(err) => Err(Status::new(
Code::Internal,
Expand Down
8 changes: 7 additions & 1 deletion sn_protocol/src/node_rpc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pub enum NodeCtrl {
/// Request to stop the execution of the safenode app, providing an error as a reason for it.
Stop {
delay: Duration,
cause: Error,
result: StopResult,
},
/// Request to restart the execution of the safenode app, retrying to join the network, after the requested delay.
/// Set `retain_peer_id` to `true` if you want to re-use the same root dir/secret keys/PeerId.
Expand All @@ -26,3 +26,9 @@ pub enum NodeCtrl {
// Request to update the safenode app, and restart it, after the requested delay.
Update(Duration),
}

#[derive(Debug)]
pub enum StopResult {
Success(String),
Error(Error),
}

0 comments on commit 08f091b

Please sign in to comment.