From 2f4e8fe355fdf564bc3fded054b5397455345e1d Mon Sep 17 00:00:00 2001 From: Yadu Date: Fri, 5 Apr 2024 00:28:45 +0800 Subject: [PATCH 1/3] Retry router connection (#135) * Retry router connection Signed-off-by: Yadunund * Configure behavior via ZENOH_ROUTER_CHECK_ATTEMPTS Signed-off-by: Yadunund * Address feedback Signed-off-by: Yadunund --------- Signed-off-by: Yadunund --- README.md | 33 +++++++++++++----- rmw_zenoh_cpp/src/detail/zenoh_config.cpp | 34 +++++++++++++++++++ rmw_zenoh_cpp/src/detail/zenoh_config.hpp | 12 +++++++ .../src/detail/zenoh_router_check.cpp | 16 ++++++--- rmw_zenoh_cpp/src/rmw_init.cpp | 23 ++++++++++--- rmw_zenoh_cpp/src/rmw_zenoh.cpp | 2 +- 6 files changed, 101 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 278ebc26..621aa7ac 100644 --- a/README.md +++ b/README.md @@ -68,11 +68,26 @@ ros2 run demo_nodes_cpp listener The listener node should start receiving messages over the `/chatter` topic. ## Configuration -`rmw_zenoh` relies on separate configurations files to configure the Zenoh `router` and `session` respectively. -To understand more about `routers` and `sessions`, see [Zenoh documentation](https://zenoh.io/docs/getting-started/deployment/). + +By default, `Zenoh sessions` created by `rmw_zenoh` will attempt to connect to a Zenoh router to receive discovery information. +To understand more about `Zenoh routers` and `Zenoh sessions`, see [Zenoh documentation](https://zenoh.io/docs/getting-started/deployment/). + +### Checking for a Zenoh router. +The `ZENOH_ROUTER_CHECK_ATTEMPTS` environment variable can be used to configure if and how a `Zenoh session` checks for the presence of a `Zenoh router`. +The behavior is explained in the table below. + + +| ZENOH_ROUTER_CHECK_ATTEMPTS | Session behavior | +|:---------------------------:|:----------------------------------------------------------------------------------------------------------------:| +| unset or 0 | Indefinitely waits for connection to a Zenoh router. | +| < 0 | Skips Zenoh router check. | +| > 0 | Attempts to connect to a Zenoh router in `ZENOH_ROUTER_CHECK_ATTEMPTS` attempts with 1 second wait between checks. | + +### Session and Router configs +`rmw_zenoh` relies on separate configurations files to configure the `Zenoh router` and `Zenoh session` respectively. For more information on the topology of Zenoh adopted in `rmw_zenoh`, please see [Design](#design). Default configuration files are used by `rmw_zenoh` however certain environment variables may be set to provide absolute paths to custom configuration files. -The table below summarizes the default files and the environment variables for the `router` and `session`. +The table below summarizes the default files and the environment variables for the `Zenoh router` and `Zenoh session`. For a complete list of configurable parameters, see [zenoh/DEFAULT_CONFIG.json5](https://github.com/eclipse-zenoh/zenoh/blob/main/DEFAULT_CONFIG.json5). | | Default config | Envar for custom config | @@ -80,16 +95,16 @@ For a complete list of configurable parameters, see [zenoh/DEFAULT_CONFIG.json5] | Router | [DEFAULT_RMW_ZENOH_ROUTER_CONFIG.json5](rmw_zenoh_cpp/config/DEFAULT_RMW_ZENOH_ROUTER_CONFIG.json5) | `ZENOH_ROUTER_CONFIG_URI` | | Session | [DEFAULT_RMW_ZENOH_SESSION_CONFIG.json5](rmw_zenoh_cpp/config/DEFAULT_RMW_ZENOH_SESSION_CONFIG.json5) | `ZENOH_SESSION_CONFIG_URI` | -For example, to set the path to a custom `router` configuration file, +For example, to set the path to a custom `Zenoh router` configuration file, ```bash export ZENOH_ROUTER_CONFIG_URI=$HOME/MY_ZENOH_ROUTER_CONFIG.json5 ``` ### Connecting multiple hosts -By default, all discovery traffic is local per host, where the host is the PC running a Zenoh `router`. -To bridge communications across two hosts, the `router` configuration for one the hosts must be updated to connect to the other `router` at startup. -This is done by specifying an endpoint in host's `router` configuration file to as seen below. -In this example, the `router` will connect to the `router` running on a second host with IP address `192.168.1.1` and port `7447`. +By default, all discovery traffic is local per host, where the host is the PC running a `Zenoh router`. +To bridge communications across two hosts, the `Zenoh router` configuration for one the hosts must be updated to connect to the other `Zenoh router` at startup. +This is done by specifying an endpoint in host's `Zenoh router` configuration file to as seen below. +In this example, the `Zenoh router` will connect to the `Zenoh router` running on a second host with IP address `192.168.1.1` and port `7447`. ```json { @@ -99,4 +114,4 @@ In this example, the `router` will connect to the `router` running on a second h } ``` -> Note: To connect multiple hosts, include the endpoints of all routers in the network. +> Note: To connect multiple hosts, include the endpoints of all `Zenoh routers` in the network. diff --git a/rmw_zenoh_cpp/src/detail/zenoh_config.cpp b/rmw_zenoh_cpp/src/detail/zenoh_config.cpp index 2db5eb43..de0b5a0c 100644 --- a/rmw_zenoh_cpp/src/detail/zenoh_config.cpp +++ b/rmw_zenoh_cpp/src/detail/zenoh_config.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -35,6 +36,8 @@ static const std::unordered_mapsecond.first, default_config_path.c_str(), config); } + +///============================================================================== +std::optional zenoh_router_check_attempts() +{ + const char * envar_value; + // The default value is to check indefinitely. + uint64_t default_value = std::numeric_limits::max(); + + if (NULL != rcutils_get_env(router_check_attempts_envar, &envar_value)) { + // NULL is returned if everything is ok. + RCUTILS_LOG_ERROR_NAMED( + "rmw_zenoh_cpp", "Envar %s cannot be read. Report this bug.", + router_check_attempts_envar); + return default_value; + } + // If the environment variable contains a value, handle it accordingly. + if (envar_value[0] != '\0') { + const auto read_value = std::strtol(envar_value, nullptr, 10); + if (read_value > 0) { + return read_value; + } else if (read_value < 0) { + // If less than 0, we skip the check. + return std::nullopt; + } + // If the value is 0, check indefinitely. + return default_value; + } + + // If unset, check indefinitely. + return default_value; +} diff --git a/rmw_zenoh_cpp/src/detail/zenoh_config.hpp b/rmw_zenoh_cpp/src/detail/zenoh_config.hpp index 1fff73cd..1a2c6a59 100644 --- a/rmw_zenoh_cpp/src/detail/zenoh_config.hpp +++ b/rmw_zenoh_cpp/src/detail/zenoh_config.hpp @@ -17,6 +17,7 @@ #include +#include #include #include @@ -43,4 +44,15 @@ enum class ConfigurableEntity : uint8_t [[nodiscard]] rmw_ret_t get_z_config(const ConfigurableEntity & entity, z_owned_config_t * config); +///============================================================================== +/// Get the number of times rmw_init should try to connect to a zenoh router +/// based on the environment variable ZENOH_ROUTER_CHECK_ATTEMPTS. +/// @details The behavior is as follows: +/// - If not set or 0, the max value is returned. +/// - If less than 0, std::nullopt is returned. +/// - Else value of environemnt variable is returned. +/// @return The number of times to try connecting to a zenoh router and +/// std::nullopt if establishing a connection to a router is not required. +std::optional zenoh_router_check_attempts(); + #endif // DETAIL__ZENOH_CONFIG_HPP_ diff --git a/rmw_zenoh_cpp/src/detail/zenoh_router_check.cpp b/rmw_zenoh_cpp/src/detail/zenoh_router_check.cpp index 5e674089..5f12e89d 100644 --- a/rmw_zenoh_cpp/src/detail/zenoh_router_check.cpp +++ b/rmw_zenoh_cpp/src/detail/zenoh_router_check.cpp @@ -21,7 +21,9 @@ #include #include +#include "liveliness_utils.hpp" +///============================================================================= rmw_ret_t zenoh_router_check(z_session_t session) { // Initialize context for callback @@ -29,9 +31,12 @@ rmw_ret_t zenoh_router_check(z_session_t session) // Define callback auto callback = [](const struct z_id_t * id, void * ctx) { + const std::string id_str = liveliness::zid_to_str(*id); + RCUTILS_LOG_INFO_NAMED( + "rmw_zenoh_cpp", + "Successfully connected to a Zenoh router with id %s.", id_str.c_str()); // Note: Callback is guaranteed to never be called // concurrently according to z_info_routers_zid docstring - static_cast(id); (*(static_cast(ctx)))++; }; @@ -39,14 +44,15 @@ rmw_ret_t zenoh_router_check(z_session_t session) z_owned_closure_zid_t router_callback = z_closure(callback, nullptr /* drop */, &context); if (z_info_routers_zid(session, z_move(router_callback))) { RCUTILS_LOG_ERROR_NAMED( - "ZenohRouterCheck", - "Failed to evaluate if Zenoh routers are connected to the session"); + "rmw_zenoh_cpp", + "Failed to evaluate if Zenoh routers are connected to the session."); ret = RMW_RET_ERROR; } else { if (context == 0) { RCUTILS_LOG_ERROR_NAMED( - "ZenohRouterCheck", - "No Zenoh router connected to the session"); + "rmw_zenoh_cpp", + "Unable to connect to a Zenoh router. " + "Have you started a router with `ros2 run rmw_zenoh_cpp rmw_zenohd`?"); ret = RMW_RET_ERROR; } } diff --git a/rmw_zenoh_cpp/src/rmw_init.cpp b/rmw_zenoh_cpp/src/rmw_init.cpp index b90cdf61..b302a882 100644 --- a/rmw_zenoh_cpp/src/rmw_init.cpp +++ b/rmw_zenoh_cpp/src/rmw_init.cpp @@ -16,6 +16,7 @@ #include #include +#include #include "detail/guard_condition.hpp" #include "detail/identifier.hpp" @@ -175,10 +176,24 @@ rmw_init(const rmw_init_options_t * options, rmw_context_t * context) z_id_t zid = z_info_zid(z_loan(context->impl->session)); context->impl->graph_cache = std::make_unique(zid); - // Verify if the zenoh router is running. - if ((ret = zenoh_router_check(z_loan(context->impl->session))) != RMW_RET_OK) { - RMW_SET_ERROR_MSG("Error while checking for Zenoh router"); - return ret; + // Verify if the zenoh router is running if configured. + const std::optional configured_connection_attempts = zenoh_router_check_attempts(); + if (configured_connection_attempts.has_value()) { + ret = RMW_RET_ERROR; + uint64_t connection_attempts = 0; + // Retry until the connection is successful. + while (ret != RMW_RET_OK && connection_attempts < configured_connection_attempts.value()) { + if ((ret = zenoh_router_check(z_loan(context->impl->session))) != RMW_RET_OK) { + ++connection_attempts; + } + std::this_thread::sleep_for(std::chrono::seconds(1)); + } + if (ret != RMW_RET_OK) { + RMW_SET_ERROR_MSG_WITH_FORMAT_STRING( + "Unable to connect to a Zenoh router after %zu retries.", + configured_connection_attempts.value()); + return ret; + } } // Initialize the shm manager if shared_memory is enabled in the config. diff --git a/rmw_zenoh_cpp/src/rmw_zenoh.cpp b/rmw_zenoh_cpp/src/rmw_zenoh.cpp index 802a1269..08c90b87 100644 --- a/rmw_zenoh_cpp/src/rmw_zenoh.cpp +++ b/rmw_zenoh_cpp/src/rmw_zenoh.cpp @@ -2277,7 +2277,7 @@ rmw_send_request( opts.target = Z_QUERY_TARGET_ALL_COMPLETE; // The default timeout for a z_get query is 10 seconds and if a response is not received within // this window, the queryable will return an invalid reply. However, it is common for actions, - // which are implemented using services, to take an extended duration to complete.Hence, we set + // which are implemented using services, to take an extended duration to complete. Hence, we set // the timeout_ms to the largest supported value to account for most realistic scenarios. opts.timeout_ms = std::numeric_limits::max(); // Latest consolidation guarantees unicity of replies for the same key expression, From cd778fa07347e4bfcb0915e4b156255e49058416 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Thu, 4 Apr 2024 21:43:47 -0400 Subject: [PATCH 2/3] Add rmw_zenoh_cpp to the implementations group. (#149) * Add rmw_zenoh_cpp to the implementations group. Otherwise, the correct ament index resource files won't be generated, and rmw_zenoh_cpp won't be picked up as a viable RMW during automatic detection. Signed-off-by: Chris Lalancette * Also make sure to build_export_depend zenoh_c_vendor. Otherwise, we won't properly find the library when using it in downstream tests. Signed-off-by: Chris Lalancette --------- Signed-off-by: Chris Lalancette --- rmw_zenoh_cpp/package.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rmw_zenoh_cpp/package.xml b/rmw_zenoh_cpp/package.xml index 67b488be..6304ab32 100644 --- a/rmw_zenoh_cpp/package.xml +++ b/rmw_zenoh_cpp/package.xml @@ -10,6 +10,7 @@ ament_cmake zenoh_c_vendor + zenoh_c_vendor ament_index_cpp fastcdr @@ -23,6 +24,8 @@ ament_lint_auto ament_lint_common + rmw_implementation_packages + ament_cmake From 87f20f87730eb2a36d65fc4db666f43ec75fcd64 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Thu, 4 Apr 2024 21:45:52 -0400 Subject: [PATCH 3/3] Move graph guard condition destruction to rmw_context_fini. (#150) That's because during rmw_shutdown(), the graph guard condition may still be in use in rwm_wait() and friends. Fix this by moving the destruction of it later, into rmw_context_fini. Signed-off-by: Chris Lalancette --- rmw_zenoh_cpp/src/rmw_init.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/rmw_zenoh_cpp/src/rmw_init.cpp b/rmw_zenoh_cpp/src/rmw_init.cpp index b302a882..edd95455 100644 --- a/rmw_zenoh_cpp/src/rmw_init.cpp +++ b/rmw_zenoh_cpp/src/rmw_init.cpp @@ -377,15 +377,6 @@ rmw_shutdown(rmw_context_t * context) return RMW_RET_ERROR; } - const rcutils_allocator_t * allocator = &context->options.allocator; - - RMW_TRY_DESTRUCTOR( - static_cast(context->impl->graph_guard_condition->data)->~GuardCondition(), - GuardCondition, ); - allocator->deallocate(context->impl->graph_guard_condition->data, allocator->state); - - allocator->deallocate(context->impl->graph_guard_condition, allocator->state); - context->impl->is_shutdown = true; return RMW_RET_OK; @@ -411,10 +402,18 @@ rmw_context_fini(rmw_context_t * context) return RMW_RET_INVALID_ARGUMENT; } - RMW_TRY_DESTRUCTOR(context->impl->~rmw_context_impl_t(), rmw_context_impl_t, ); - const rcutils_allocator_t * allocator = &context->options.allocator; + RMW_TRY_DESTRUCTOR( + static_cast(context->impl->graph_guard_condition->data)->~GuardCondition(), + GuardCondition, ); + allocator->deallocate(context->impl->graph_guard_condition->data, allocator->state); + + allocator->deallocate(context->impl->graph_guard_condition, allocator->state); + context->impl->graph_guard_condition = nullptr; + + RMW_TRY_DESTRUCTOR(context->impl->~rmw_context_impl_t(), rmw_context_impl_t, ); + allocator->deallocate(context->impl, allocator->state); rmw_ret_t ret = rmw_init_options_fini(&context->options);