Skip to content

Commit

Permalink
Merge pull request #1783 from AntelopeIO/merge_resmon_fix_to_5_0
Browse files Browse the repository at this point in the history
[4.0 -> 5.0] hardening resource monitor manager plugin shutdown handling
  • Loading branch information
linh2931 authored Oct 17, 2023
2 parents a7090e0 + 88b97fa commit 92fa264
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 206 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,47 @@

#include <eosio/chain/application.hpp>
#include <eosio/chain/exceptions.hpp>

#include <eosio/chain/thread_utils.hpp>

namespace eosio::resource_monitor {
template<typename SpaceProvider>
class file_space_handler {
public:
file_space_handler(SpaceProvider&& space_provider, boost::asio::io_context& ctx)
:space_provider(std::move(space_provider)),
timer{ctx}
// Constructs the handler from a space provider; the provider is moved
// into the member of the same name (the parameter shadows the member
// inside the initializer, which is why std::move names the parameter).
file_space_handler(SpaceProvider&& space_provider)
:space_provider(std::move(space_provider))
{
}

void start(const std::vector<std::filesystem::path>& directories) {
for ( auto& dir: directories ) {
add_file_system( dir );

// A directory like "data" contains subdirectories like
// "block". Those subdirectories can mount on different
// file systems. Make sure they are taken care of.
for (std::filesystem::directory_iterator itr(dir); itr != std::filesystem::directory_iterator(); ++itr) {
if (std::filesystem::is_directory(itr->path())) {
add_file_system( itr->path() );
}
}
}

thread_pool.start(thread_pool_size,
[]( const fc::exception& e ) {
elog("Exception in resource monitor plugin thread pool, exiting: ${e}", ("e", e.to_detail_string()) );
appbase::app().quit(); },
[&]() { space_monitor_loop(); }
);
}

// Stops the resource monitor thread pool.
// Called on the main thread from plugin shutdown().
void stop() {
// Once the thread pool has stopped, the timer can no longer be accessed
// from within it. In addition, the timer's destructor will call cancel.
// Therefore there is no need to call cancel explicitly here.
thread_pool.stop();
}

// Sets the interval, in seconds, stored in sleep_time_in_secs; that value
// is what space_monitor_loop() uses when it re-arms the timer.
void set_sleep_time(uint32_t sleep_time) {
sleep_time_in_secs = sleep_time;
}
Expand Down Expand Up @@ -128,6 +157,7 @@ namespace eosio::resource_monitor {
("path_name", path_name.string())("shutdown_available", to_gib(shutdown_available)) ("capacity", to_gib(info.capacity))("threshold_desc", threshold_desc()) );
}

// on resmon thread
void space_monitor_loop() {
if ( is_threshold_exceeded() && shutdown_on_exceeded ) {
elog("Gracefully shutting down, exceeded file system configured threshold.");
Expand All @@ -137,9 +167,12 @@ namespace eosio::resource_monitor {
update_warning_interval_counter();

timer.expires_from_now( boost::posix_time::seconds( sleep_time_in_secs ));

timer.async_wait([this](const auto& ec) {
if ( ec ) {
// No need to check if ec is operation_aborted (cancelled),
// as the cancel callback will never make it here after thread_pool
// is stopped, even though cancel is called in the timer's
// destructor.
wlog("Exit due to error: ${ec}, message: ${message}",
("ec", ec.value())
("message", ec.message()));
Expand All @@ -154,7 +187,10 @@ namespace eosio::resource_monitor {
private:
SpaceProvider space_provider;

boost::asio::deadline_timer timer;
static constexpr size_t thread_pool_size = 1;
eosio::chain::named_thread_pool<struct resmon> thread_pool;

boost::asio::deadline_timer timer {thread_pool.get_executor()};

uint32_t sleep_time_in_secs {2};
uint32_t shutdown_threshold {90};
Expand Down
40 changes: 5 additions & 35 deletions plugins/resource_monitor_plugin/resource_monitor_plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ namespace eosio {
class resource_monitor_plugin_impl {
public:
resource_monitor_plugin_impl()
:space_handler(system_file_space_provider(), ctx)
:space_handler(system_file_space_provider())
{
}

Expand Down Expand Up @@ -109,42 +109,14 @@ class resource_monitor_plugin_impl {

// Start main thread
void plugin_startup() {
ilog("Creating and starting monitor thread");

// By now all plugins are initialized.
// Find out filesystems containing the directories requested
// so far.
for ( auto& dir: directories_registered ) {
space_handler.add_file_system( dir );

// A directory like "data" contains subdirectories like
// "block". Those subdirectories can mount on different
// file systems. Make sure they are taken care of.
for (std::filesystem::directory_iterator itr(dir); itr != std::filesystem::directory_iterator(); ++itr) {
if (std::filesystem::is_directory(itr->path())) {
space_handler.add_file_system( itr->path() );
}
}
}

monitor_thread = std::thread( [this] {
fc::set_thread_name( "resmon" ); // console_appender uses 9 chars for thread name reporting.
space_handler.space_monitor_loop();

ctx.run();
} );
space_handler.start(directories_registered);
}

// System is shutting down.
void plugin_shutdown() {
ilog("shutdown...");

ctx.stop();

// Wait for the thread to end
monitor_thread.join();

ilog("exit shutdown");
ilog("entered shutdown...");
space_handler.stop();
ilog("exiting shutdown");
}

void monitor_directory(const std::filesystem::path& path) {
Expand All @@ -169,8 +141,6 @@ class resource_monitor_plugin_impl {
static constexpr uint32_t warning_interval_min = 1;
static constexpr uint32_t warning_interval_max = 450; // e.g. if the monitor interval is 2 sec, the warning interval is at most 15 minutes

boost::asio::io_context ctx;

using file_space_handler_t = file_space_handler<system_file_space_provider>;
file_space_handler_t space_handler;
};
Expand Down
2 changes: 1 addition & 1 deletion plugins/resource_monitor_plugin/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
add_executable( test_resmon_plugin test_resmon_plugin.cpp test_add_file_system.cpp test_monitor_loop.cpp test_threshold.cpp )
add_executable( test_resmon_plugin test_resmon_plugin.cpp test_add_file_system.cpp test_threshold.cpp )
target_link_libraries( test_resmon_plugin resource_monitor_plugin )
target_include_directories( test_resmon_plugin PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" )
add_test(NAME test_resmon_plugin COMMAND plugins/resource_monitor_plugin/test/test_resmon_plugin WORKING_DIRECTORY ${CMAKE_BINARY_DIR})
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,9 @@ struct add_file_system_fixture {
add_file_system_fixture& fixture;
};

boost::asio::io_context ctx;

using file_space_handler_t = file_space_handler<mock_space_provider>;
add_file_system_fixture()
: space_handler(mock_space_provider(*this), ctx)
: space_handler(mock_space_provider(*this))
{
}

Expand Down
151 changes: 0 additions & 151 deletions plugins/resource_monitor_plugin/test/test_monitor_loop.cpp

This file was deleted.

6 changes: 2 additions & 4 deletions plugins/resource_monitor_plugin/test/test_threshold.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,9 @@ struct threshold_fixture {
threshold_fixture& fixture;
};

boost::asio::io_context ctx;

using file_space_handler_t = file_space_handler<mock_space_provider>;
threshold_fixture()
: space_handler(std::make_unique<file_space_handler_t>(mock_space_provider(*this), ctx))
: space_handler(std::make_unique<file_space_handler_t>(mock_space_provider(*this)))
{
}

Expand All @@ -49,7 +47,7 @@ struct threshold_fixture {

bool test_threshold_common(std::map<std::filesystem::path, uintmax_t>& available, std::map<std::filesystem::path, int>& dev, uint32_t warning_threshold=75) {
bool first = test_threshold_common_(available, dev, warning_threshold);
space_handler = std::make_unique<file_space_handler_t>(mock_space_provider(*this), ctx);
space_handler = std::make_unique<file_space_handler_t>(mock_space_provider(*this));

test_absolute = true;
bool second = test_threshold_common_(available, dev, warning_threshold);
Expand Down
13 changes: 7 additions & 6 deletions tests/resource_monitor_plugin_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
stderrFile=dataDir + "/stderr.txt"

testNum=0
max_start_time_secs=10 # time nodeos takes to start

# We need debug level to get more information about nodeos process
logging="""{
Expand Down Expand Up @@ -105,7 +106,7 @@ def testCommon(title, extraNodeosArgs, expectedMsgs):

prepareDirectories()

timeout=120 # Leave sufficient time such nodeos can start up fully in any platforms
timeout=max_start_time_secs # Leave sufficient time such nodeos can start up fully in any platforms
runNodeos(extraNodeosArgs, timeout)

for msg in expectedMsgs:
Expand Down Expand Up @@ -156,7 +157,7 @@ def testInterval(title, extraNodeosArgs, interval, expectedMsgs, warningThreshol
prepareDirectories()
fillFS(dataDir, warningThreshold)

timeout = 120 + interval * 2 # Leave sufficient time so nodeos can start up fully in any platforms, and at least two warnings can be output
timeout = max_start_time_secs + interval * 2 # Leave sufficient time so nodeos can start up fully in any platforms, and at least two warnings can be output
if timeout > testIntervalMaxTimeout:
errorExit ("Max timeout for testInterval is %d sec" % (testIntervalMaxTimeout))
runNodeos(extraNodeosArgs, timeout)
Expand All @@ -169,15 +170,15 @@ def testInterval(title, extraNodeosArgs, interval, expectedMsgs, warningThreshol
errorExit ("Log containing \"%s\" should be output every %d seconds" % (msg, interval))

def testAll():
testCommon("Resmon enabled: all arguments", "--plugin eosio::resource_monitor_plugin --resource-monitor-space-threshold=85 --resource-monitor-interval-seconds=5 --resource-monitor-not-shutdown-on-threshold-exceeded", ["threshold set to 85", "interval set to 5", "Shutdown flag when threshold exceeded set to false", "Creating and starting monitor thread"])
testCommon("Resmon enabled: all arguments", "--plugin eosio::resource_monitor_plugin --resource-monitor-space-threshold=85 --resource-monitor-interval-seconds=5 --resource-monitor-not-shutdown-on-threshold-exceeded", ["threshold set to 85", "interval set to 5", "Shutdown flag when threshold exceeded set to false"])

# default arguments and default directories to be monitored
testCommon("Resmon not enabled: no arguments", "", ["interval set to 2", "threshold set to 90", "Shutdown flag when threshold exceeded set to true", "Creating and starting monitor thread", "snapshots's file system to be monitored", "blocks's file system to be monitored", "state's file system to be monitored"])
testCommon("Resmon not enabled: no arguments", "", ["interval set to 2", "threshold set to 90", "Shutdown flag when threshold exceeded set to true", "snapshots's file system to be monitored", "blocks's file system to be monitored", "state's file system to be monitored"])

# default arguments with registered directories
testCommon("Resmon not enabled: Producer, Chain, State History and Trace Api", "--plugin eosio::state_history_plugin --state-history-dir=/tmp/state-history --plugin eosio::trace_api_plugin --trace-dir=/tmp/trace --trace-no-abis", ["interval set to 2", "threshold set to 90", "Shutdown flag when threshold exceeded set to true", "snapshots's file system to be monitored", "blocks's file system to be monitored", "state's file system to be monitored", "state-history's file system to be monitored", "trace's file system to be monitored", "Creating and starting monitor thread"])
testCommon("Resmon not enabled: Producer, Chain, State History and Trace Api", "--plugin eosio::state_history_plugin --state-history-dir=/tmp/state-history --disable-replay-opts --plugin eosio::trace_api_plugin --trace-dir=/tmp/trace --trace-no-abis", ["interval set to 2", "threshold set to 90", "Shutdown flag when threshold exceeded set to true", "snapshots's file system to be monitored", "blocks's file system to be monitored", "state's file system to be monitored", "state-history's file system to be monitored", "trace's file system to be monitored"])

testCommon("Resmon enabled: Producer, Chain, State History and Trace Api", "--plugin eosio::resource_monitor_plugin --plugin eosio::state_history_plugin --state-history-dir=/tmp/state-history --plugin eosio::trace_api_plugin --trace-dir=/tmp/trace --trace-no-abis --resource-monitor-space-threshold=80 --resource-monitor-interval-seconds=3", ["snapshots's file system to be monitored", "blocks's file system to be monitored", "state's file system to be monitored", "state-history's file system to be monitored", "trace's file system to be monitored", "Creating and starting monitor thread", "threshold set to 80", "interval set to 3", "Shutdown flag when threshold exceeded set to true"])
testCommon("Resmon enabled: Producer, Chain, State History and Trace Api", "--plugin eosio::resource_monitor_plugin --plugin eosio::state_history_plugin --state-history-dir=/tmp/state-history --disable-replay-opts --plugin eosio::trace_api_plugin --trace-dir=/tmp/trace --trace-no-abis --resource-monitor-space-threshold=80 --resource-monitor-interval-seconds=3", ["snapshots's file system to be monitored", "blocks's file system to be monitored", "state's file system to be monitored", "state-history's file system to be monitored", "trace's file system to be monitored", "threshold set to 80", "interval set to 3", "Shutdown flag when threshold exceeded set to true"])

# Only test minimum warning threshold (i.e. 6) to trigger warning as much as possible
testInterval("Resmon enabled: set warning interval",
Expand Down

0 comments on commit 92fa264

Please sign in to comment.