From bc0f5ee7f7174eab056c8e75343b37dc0a9c1467 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damian=20Kr=C3=B3lik?= <66667989+Damian-Nordic@users.noreply.github.com> Date: Tue, 7 May 2024 11:44:16 +0200 Subject: [PATCH] [mrp] Increase default retry interval for Thread (#33314) * [mrp] Increase default retry interval for Thread The current 800ms is not enough in real setups, where Thread routers must serve as intermediate hops for many parallel conversations. Bump this to 2s. Signed-off-by: Damian Krolik * [mrp] Make additional MRP backoff time dynamic for all Make the additional MRP backoff time (aka CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST) dynamic for all build configurations to remove the need for adjusting timeouts in unit tests whenever this parameter changes. In messaging tests, by default, set this parameter to 0, except for tests that explicitly verify its meaning. By the way, fix tests increasing the MRP backoff time by the slow-polling instead of fast-polling interval. --------- Signed-off-by: Damian Krolik --- .../CHIP/MTRDeviceControllerFactory.mm | 2 +- src/messaging/ReliableMessageMgr.cpp | 15 ++-- src/messaging/ReliableMessageMgr.h | 11 +-- src/messaging/ReliableMessageProtocolConfig.h | 6 +- src/messaging/tests/MessagingContext.cpp | 7 ++ .../tests/TestAbortExchangesForFabric.cpp | 8 +-- .../tests/TestReliableMessageProtocol.cpp | 69 +++++++++++-------- src/platform/nrfconnect/CHIPPlatformConfig.h | 13 ++-- .../secure_channel/tests/TestPASESession.cpp | 11 +-- 9 files changed, 72 insertions(+), 70 deletions(-) diff --git a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm index c7d69d2c8772db..5886876923a89d 100644 --- a/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm +++ b/src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm @@ -1330,7 +1330,7 @@ void MTRSetMessageReliabilityParameters(NSNumber * _Nullable idleRetransmitMs, resetAdvertising = ReliableMessageProtocolConfig::SetLocalMRPConfig(NullOptional); } else { if (additionalRetransmitDelayMs != nil) { - System::Clock::Milliseconds64 additionalBackoff(additionalRetransmitDelayMs.unsignedLongLongValue); + System::Clock::Timeout additionalBackoff(additionalRetransmitDelayMs.unsignedLongValue); Messaging::ReliableMessageMgr::SetAdditionalMRPBackoffTime(MakeOptional(additionalBackoff)); } diff --git a/src/messaging/ReliableMessageMgr.cpp b/src/messaging/ReliableMessageMgr.cpp index 17049069b71430..4bc196c45510ea 100644 --- a/src/messaging/ReliableMessageMgr.cpp +++ b/src/messaging/ReliableMessageMgr.cpp @@ -47,9 +47,7 @@ using namespace chip::System::Clock::Literals; namespace chip { namespace Messaging { -#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG -Optional ReliableMessageMgr::sAdditionalMRPBackoffTime; -#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG +System::Clock::Timeout ReliableMessageMgr::sAdditionalMRPBackoffTime = CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST; ReliableMessageMgr::RetransTableEntry::RetransTableEntry(ReliableMessageContext * rc) : ec(*rc->GetExchangeContext()), nextRetransTime(0), sendCount(0) @@ -269,11 +267,7 @@ System::Clock::Timeout ReliableMessageMgr::GetBackoff(System::Clock::Timeout bas mrpBackoffTime += ICDConfigurationData::GetInstance().GetFastPollingInterval(); #endif -#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG - mrpBackoffTime += sAdditionalMRPBackoffTime.ValueOr(CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST); -#else - mrpBackoffTime += CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST; -#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG + mrpBackoffTime += sAdditionalMRPBackoffTime; return std::chrono::duration_cast(mrpBackoffTime); } @@ -463,6 +457,11 @@ CHIP_ERROR ReliableMessageMgr::MapSendError(CHIP_ERROR error, uint16_t exchangeI return error; } +void ReliableMessageMgr::SetAdditionalMRPBackoffTime(const Optional & additionalTime) +{ + sAdditionalMRPBackoffTime = additionalTime.ValueOr(CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST); +} + void ReliableMessageMgr::CalculateNextRetransTime(RetransTableEntry & entry) { System::Clock::Timeout baseTimeout = System::Clock::Timeout(0); diff --git a/src/messaging/ReliableMessageMgr.h b/src/messaging/ReliableMessageMgr.h index 953de1db0aea40..5036b832108443 100644 --- a/src/messaging/ReliableMessageMgr.h +++ b/src/messaging/ReliableMessageMgr.h @@ -206,7 +206,6 @@ class ReliableMessageMgr } #endif // CHIP_CONFIG_TEST -#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG /** * Set the value to add to the MRP backoff time we compute. This is meant to * account for high network latency on the sending side (us) that can't be @@ -220,11 +219,7 @@ class ReliableMessageMgr * set this before actually bringing up the stack and having access to a * ReliableMessageMgr. */ - static void SetAdditionalMRPBackoffTime(const Optional & additionalTime) - { - sAdditionalMRPBackoffTime = additionalTime; - } -#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG + static void SetAdditionalMRPBackoffTime(const Optional & additionalTime); private: /** @@ -255,9 +250,7 @@ class ReliableMessageMgr SessionUpdateDelegate * mSessionUpdateDelegate = nullptr; -#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG - static Optional sAdditionalMRPBackoffTime; -#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG + static System::Clock::Timeout sAdditionalMRPBackoffTime; }; } // namespace Messaging diff --git a/src/messaging/ReliableMessageProtocolConfig.h b/src/messaging/ReliableMessageProtocolConfig.h index e3255a780b7b70..69135aa378de41 100644 --- a/src/messaging/ReliableMessageProtocolConfig.h +++ b/src/messaging/ReliableMessageProtocolConfig.h @@ -56,7 +56,7 @@ namespace chip { */ #ifndef CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL #if CHIP_ENABLE_OPENTHREAD && !CHIP_DEVICE_LAYER_TARGET_LINUX -#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (800_ms32) +#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (2000_ms32) #else #define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (300_ms32) #endif @@ -82,7 +82,7 @@ namespace chip { */ #ifndef CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL #if CHIP_ENABLE_OPENTHREAD && !CHIP_DEVICE_LAYER_TARGET_LINUX -#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (800_ms32) +#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (2000_ms32) #else #define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (500_ms32) #endif @@ -174,7 +174,7 @@ namespace chip { */ #ifndef CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST #if CHIP_ENABLE_OPENTHREAD && !CHIP_DEVICE_LAYER_TARGET_LINUX -#define CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST (500_ms) +#define CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST (1500_ms) #else #define CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST (0_ms) #endif diff --git a/src/messaging/tests/MessagingContext.cpp b/src/messaging/tests/MessagingContext.cpp index dc41292dc0f313..72ab1650da552c 100644 --- a/src/messaging/tests/MessagingContext.cpp +++ b/src/messaging/tests/MessagingContext.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace chip { @@ -70,6 +71,9 @@ CHIP_ERROR MessagingContext::Init(TransportMgrBase * transport, IOContext * ioCo ReturnErrorOnFailure(CreatePASESessionDavidToCharlie()); } + // Set the additional MRP backoff to zero so that it does not affect the test execution time. + Messaging::ReliableMessageMgr::SetAdditionalMRPBackoffTime(MakeOptional(System::Clock::kZero)); + return CHIP_NO_ERROR; } @@ -85,6 +89,9 @@ void MessagingContext::Shutdown() mFabricTable.Shutdown(); mOpCertStore.Finish(); mOpKeyStore.Finish(); + + // Reset the default additional MRP backoff. + Messaging::ReliableMessageMgr::SetAdditionalMRPBackoffTime(NullOptional); } CHIP_ERROR MessagingContext::InitFromExisting(const MessagingContext & existing) diff --git a/src/messaging/tests/TestAbortExchangesForFabric.cpp b/src/messaging/tests/TestAbortExchangesForFabric.cpp index 5993a771e76bd4..1540b91725a3aa 100644 --- a/src/messaging/tests/TestAbortExchangesForFabric.cpp +++ b/src/messaging/tests/TestAbortExchangesForFabric.cpp @@ -226,15 +226,11 @@ void CommonCheckAbortAllButOneExchange(nlTestSuite * inSuite, TestContext & ctx, // auto waitTimeout = System::Clock::Milliseconds32(1000); -#if CHIP_CONFIG_ENABLE_ICD_SERVER == 1 +#if CHIP_CONFIG_ENABLE_ICD_SERVER // If running as an ICD, increase waitTimeout to account for the polling interval - waitTimeout += ICDConfigurationData::GetInstance().GetSlowPollingInterval(); + waitTimeout += ICDConfigurationData::GetInstance().GetFastPollingInterval(); #endif - // Account for the retry delay booster, so that we do not timeout our IO processing before the - // retransmission failure is triggered. - waitTimeout += CHIP_CONFIG_RMP_DEFAULT_MAX_RETRANS * CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST; - ctx.GetIOContext().DriveIOUntil(waitTimeout, [&]() { return false; }); } else diff --git a/src/messaging/tests/TestReliableMessageProtocol.cpp b/src/messaging/tests/TestReliableMessageProtocol.cpp index dc70e013a55bc5..057aa599726df3 100644 --- a/src/messaging/tests/TestReliableMessageProtocol.cpp +++ b/src/messaging/tests/TestReliableMessageProtocol.cpp @@ -65,14 +65,6 @@ using namespace chip::System::Clock::Literals; const char PAYLOAD[] = "Hello!"; -// The CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST can be set to non-zero value -// to boost the retransmission timeout for a high latency network like Thread to -// avoid spurious retransmits. -// -// This adds extra I/O time to account for this. See the documentation for -// CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST for more details. -constexpr auto retryBoosterTimeout = CHIP_CONFIG_RMP_DEFAULT_MAX_RETRANS * CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST; - class TestContext : public chip::Test::LoopbackMessagingContext { public: @@ -324,6 +316,34 @@ struct BackoffComplianceTestVector theBackoffComplianceTestVector[] = { { .backoffMax = System::Clock::Timeout(20'286'001), } }; +void CheckGetBackoffImpl(nlTestSuite * inSuite, System::Clock::Timeout additionalMRPBackoffTime) +{ + ReliableMessageMgr::SetAdditionalMRPBackoffTime(MakeOptional(additionalMRPBackoffTime)); + + // Run 3x iterations to thoroughly test random jitter always results in backoff within bounds. + for (uint32_t j = 0; j < 3; j++) + { + for (const auto & test : theBackoffComplianceTestVector) + { + System::Clock::Timeout backoff = ReliableMessageMgr::GetBackoff(test.backoffBase, test.sendCount); + System::Clock::Timeout extraBackoff = additionalMRPBackoffTime; + +#if CHIP_CONFIG_ENABLE_ICD_SERVER + // If running as an ICD, increase maxBackoff to account for the polling interval + extraBackoff += ICDConfigurationData::GetInstance().GetFastPollingInterval(); +#endif + + ChipLogProgress(Test, "Backoff base %" PRIu32 " extra %" PRIu32 " # %d: %" PRIu32, test.backoffBase.count(), + extraBackoff.count(), test.sendCount, backoff.count()); + + NL_TEST_ASSERT(inSuite, backoff >= test.backoffMin + extraBackoff); + NL_TEST_ASSERT(inSuite, backoff <= test.backoffMax + extraBackoff); + } + } + + ReliableMessageMgr::SetAdditionalMRPBackoffTime(NullOptional); +} + } // namespace class TestReliableMessageProtocol @@ -348,6 +368,7 @@ class TestReliableMessageProtocol static void CheckLostStandaloneAck(nlTestSuite * inSuite, void * inContext); static void CheckIsPeerActiveNotInitiator(nlTestSuite * inSuite, void * inContext); static void CheckGetBackoff(nlTestSuite * inSuite, void * inContext); + static void CheckGetBackoffAdditionalTime(nlTestSuite * inSuite, void * inContext); static void CheckApplicationResponseDelayed(nlTestSuite * inSuite, void * inContext); static void CheckApplicationResponseNeverComes(nlTestSuite * inSuite, void * inContext); }; @@ -449,7 +470,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1); // Wait for the initial message to fail (should take 330-413ms) - ctx.GetIOContext().DriveIOUntil(1000_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 2; }); + ctx.GetIOContext().DriveIOUntil(1000_ms32, [&] { return loopback.mSentMessageCount >= 2; }); now = System::SystemClock().GetMonotonicTimestamp(); timeoutTime = now - startTime; ChipLogProgress(Test, "Attempt #1 Timeout : %" PRIu32 "ms", timeoutTime.count()); @@ -466,7 +487,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1); // Wait for the 1st retry to fail (should take 330-413ms) - ctx.GetIOContext().DriveIOUntil(1000_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 3; }); + ctx.GetIOContext().DriveIOUntil(1000_ms32, [&] { return loopback.mSentMessageCount >= 3; }); now = System::SystemClock().GetMonotonicTimestamp(); timeoutTime = now - startTime; ChipLogProgress(Test, "Attempt #2 Timeout : %" PRIu32 "ms", timeoutTime.count()); @@ -483,7 +504,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1); // Wait for the 2nd retry to fail (should take 528-660ms) - ctx.GetIOContext().DriveIOUntil(1000_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 4; }); + ctx.GetIOContext().DriveIOUntil(1000_ms32, [&] { return loopback.mSentMessageCount >= 4; }); now = System::SystemClock().GetMonotonicTimestamp(); timeoutTime = now - startTime; ChipLogProgress(Test, "Attempt #3 Timeout : %" PRIu32 "ms", timeoutTime.count()); @@ -500,7 +521,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1); // Wait for the 3rd retry to fail (should take 845-1056ms) - ctx.GetIOContext().DriveIOUntil(1500_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 5; }); + ctx.GetIOContext().DriveIOUntil(1500_ms32, [&] { return loopback.mSentMessageCount >= 5; }); now = System::SystemClock().GetMonotonicTimestamp(); timeoutTime = now - startTime; ChipLogProgress(Test, "Attempt #4 Timeout : %" PRIu32 "ms", timeoutTime.count()); @@ -1845,25 +1866,12 @@ void TestReliableMessageProtocol::CheckLostStandaloneAck(nlTestSuite * inSuite, void TestReliableMessageProtocol::CheckGetBackoff(nlTestSuite * inSuite, void * inContext) { - // Run 3x iterations to thoroughly test random jitter always results in backoff within bounds. - for (uint32_t j = 0; j < 3; j++) - { - for (const auto & test : theBackoffComplianceTestVector) - { - System::Clock::Timeout backoff = ReliableMessageMgr::GetBackoff(test.backoffBase, test.sendCount); - ChipLogProgress(Test, "Backoff base %" PRIu32 " # %d: %" PRIu32, test.backoffBase.count(), test.sendCount, - backoff.count()); - - NL_TEST_ASSERT(inSuite, backoff >= test.backoffMin); + CheckGetBackoffImpl(inSuite, System::Clock::kZero); +} - auto maxBackoff = test.backoffMax + retryBoosterTimeout; -#if CHIP_CONFIG_ENABLE_ICD_SERVER == 1 - // If running as an ICD, increase maxBackoff to account for the polling interval - maxBackoff += ICDConfigurationData::GetInstance().GetSlowPollingInterval(); -#endif - NL_TEST_ASSERT(inSuite, backoff <= maxBackoff); - } - } +void TestReliableMessageProtocol::CheckGetBackoffAdditionalTime(nlTestSuite * inSuite, void * inContext) +{ + CheckGetBackoffImpl(inSuite, System::Clock::Seconds32(1)); } void TestReliableMessageProtocol::CheckApplicationResponseDelayed(nlTestSuite * inSuite, void * inContext) @@ -2207,6 +2215,7 @@ const nlTest sTests[] = { TestReliableMessageProtocol::CheckLostStandaloneAck), NL_TEST_DEF("Test Is Peer Active Retry logic", TestReliableMessageProtocol::CheckIsPeerActiveNotInitiator), NL_TEST_DEF("Test MRP backoff algorithm", TestReliableMessageProtocol::CheckGetBackoff), + NL_TEST_DEF("Test MRP backoff algorithm with additional time", TestReliableMessageProtocol::CheckGetBackoffAdditionalTime), // TODO: Re-enable this test, after changing test to use Mock clock / DriveIO rather than DriveIOUntil. // Issue: https://github.com/project-chip/connectedhomeip/issues/32440 // NL_TEST_DEF("Test an application response that comes after MRP retransmits run out", diff --git a/src/platform/nrfconnect/CHIPPlatformConfig.h b/src/platform/nrfconnect/CHIPPlatformConfig.h index 3ece933d377996..ee4609e05c5e5c 100644 --- a/src/platform/nrfconnect/CHIPPlatformConfig.h +++ b/src/platform/nrfconnect/CHIPPlatformConfig.h @@ -106,21 +106,16 @@ #define CHIP_CONFIG_LOG_MODULE_Support_PROGRESS 0 #endif -// Set MRP retry intervals for Thread and Wi-Fi to test-proven values. #ifndef CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL -#if CHIP_ENABLE_OPENTHREAD -#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (800_ms32) -#else +#ifndef CONFIG_NET_L2_OPENTHREAD #define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (1000_ms32) -#endif // CHIP_ENABLE_OPENTHREAD +#endif // CONFIG_NET_L2_OPENTHREAD #endif // CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL #ifndef CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL -#if CHIP_ENABLE_OPENTHREAD -#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (800_ms32) -#else +#ifndef CONFIG_NET_L2_OPENTHREAD #define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (1000_ms32) -#endif // CHIP_ENABLE_OPENTHREAD +#endif // CONFIG_NET_L2_OPENTHREAD #endif // CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL #ifndef CHIP_CONFIG_ICD_IDLE_MODE_DURATION_SEC diff --git a/src/protocols/secure_channel/tests/TestPASESession.cpp b/src/protocols/secure_channel/tests/TestPASESession.cpp index b25252f15b7385..368811a273ff66 100644 --- a/src/protocols/secure_channel/tests/TestPASESession.cpp +++ b/src/protocols/secure_channel/tests/TestPASESession.cpp @@ -305,11 +305,14 @@ void SecurePairingHandshakeTestCommon(nlTestSuite * inSuite, void * inContext, S while (delegate.mMessageDropped) { - auto waitTimeout = 100_ms + CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST; + auto waitTimeout = 100_ms; -#if CHIP_CONFIG_ENABLE_ICD_SERVER == 1 - // If running as an ICD, increase waitTimeout to account for the polling interval - waitTimeout += ICDConfigurationData::GetInstance().GetSlowPollingInterval(); +#if CHIP_CONFIG_ENABLE_ICD_SERVER + // If running as an ICD, increase waitTimeout to account for: + // - longer MRP intervals, configured above to 1s/1s, + // - the fast-polling interval that is added to the MRP backoff time. + waitTimeout += 2000_ms32; + waitTimeout += ICDConfigurationData::GetInstance().GetFastPollingInterval(); #endif // Wait some time so the dropped message will be retransmitted when we drain the IO.