Skip to content

Commit

Permalink
[mrp] Increase default retry interval for Thread (#33314)
Browse files Browse the repository at this point in the history
* [mrp] Increase default retry interval for Thread

The current 800ms is not enough in real setups, where Thread
routers must serve as intermediate hops for many parallel
conversations. Bump this to 2s.

Signed-off-by: Damian Krolik <[email protected]>

* [mrp] Make additional MRP backoff time dynamic for all

Make the additional MRP backoff time (aka
CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST) dynamic for
all build configurations to remove the need for adjusting
timeouts in unit tests whenever this parameter changes.

In messaging tests, by default, set this parameter to 0,
except for tests that explicitly verify its meaning.

By the way, fix tests increasing the MRP backoff time
by the slow-polling instead of fast-polling interval.

---------

Signed-off-by: Damian Krolik <[email protected]>
(cherry picked from commit bc0f5ee)
  • Loading branch information
Damian-Nordic committed May 13, 2024
1 parent cb6604f commit 5c908a1
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 70 deletions.
2 changes: 1 addition & 1 deletion src/darwin/Framework/CHIP/MTRDeviceControllerFactory.mm
Original file line number Diff line number Diff line change
Expand Up @@ -1489,7 +1489,7 @@ void MTRSetMessageReliabilityParameters(NSNumber * _Nullable idleRetransmitMs,
resetAdvertising = ReliableMessageProtocolConfig::SetLocalMRPConfig(NullOptional);
} else {
if (additionalRetransmitDelayMs != nil) {
System::Clock::Milliseconds64 additionalBackoff(additionalRetransmitDelayMs.unsignedLongLongValue);
System::Clock::Timeout additionalBackoff(additionalRetransmitDelayMs.unsignedLongValue);
Messaging::ReliableMessageMgr::SetAdditionalMRPBackoffTime(MakeOptional(additionalBackoff));
}

Expand Down
15 changes: 7 additions & 8 deletions src/messaging/ReliableMessageMgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ using namespace chip::System::Clock::Literals;
namespace chip {
namespace Messaging {

#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
Optional<System::Clock::Milliseconds64> ReliableMessageMgr::sAdditionalMRPBackoffTime;
#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
System::Clock::Timeout ReliableMessageMgr::sAdditionalMRPBackoffTime = CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST;

ReliableMessageMgr::RetransTableEntry::RetransTableEntry(ReliableMessageContext * rc) :
ec(*rc->GetExchangeContext()), nextRetransTime(0), sendCount(0)
Expand Down Expand Up @@ -267,11 +265,7 @@ System::Clock::Timestamp ReliableMessageMgr::GetBackoff(System::Clock::Timestamp
mrpBackoffTime += ICDConfigurationData::GetInstance().GetFastPollingInterval();
#endif

#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
mrpBackoffTime += sAdditionalMRPBackoffTime.ValueOr(CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST);
#else
mrpBackoffTime += CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST;
#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
mrpBackoffTime += sAdditionalMRPBackoffTime;

return mrpBackoffTime;
}
Expand Down Expand Up @@ -461,6 +455,11 @@ CHIP_ERROR ReliableMessageMgr::MapSendError(CHIP_ERROR error, uint16_t exchangeI
return error;
}

void ReliableMessageMgr::SetAdditionalMRPBackoffTime(const Optional<System::Clock::Timeout> & additionalTime)
{
sAdditionalMRPBackoffTime = additionalTime.ValueOr(CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST);
}

void ReliableMessageMgr::CalculateNextRetransTime(RetransTableEntry & entry)
{
System::Clock::Timestamp baseTimeout = System::Clock::Milliseconds64(0);
Expand Down
11 changes: 2 additions & 9 deletions src/messaging/ReliableMessageMgr.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,6 @@ class ReliableMessageMgr
}
#endif // CHIP_CONFIG_TEST

#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
/**
* Set the value to add to the MRP backoff time we compute. This is meant to
* account for high network latency on the sending side (us) that can't be
Expand All @@ -220,11 +219,7 @@ class ReliableMessageMgr
* set this before actually bringing up the stack and having access to a
* ReliableMessageMgr.
*/
static void SetAdditionalMRPBackoffTime(const Optional<System::Clock::Milliseconds64> & additionalTime)
{
sAdditionalMRPBackoffTime = additionalTime;
}
#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
static void SetAdditionalMRPBackoffTime(const Optional<System::Clock::Timeout> & additionalTime);

private:
/**
Expand Down Expand Up @@ -255,9 +250,7 @@ class ReliableMessageMgr

SessionUpdateDelegate * mSessionUpdateDelegate = nullptr;

#if CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
static Optional<System::Clock::Milliseconds64> sAdditionalMRPBackoffTime;
#endif // CHIP_DEVICE_CONFIG_ENABLE_DYNAMIC_MRP_CONFIG
static System::Clock::Timeout sAdditionalMRPBackoffTime;
};

} // namespace Messaging
Expand Down
6 changes: 3 additions & 3 deletions src/messaging/ReliableMessageProtocolConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ namespace chip {
*/
#ifndef CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL
#if CHIP_ENABLE_OPENTHREAD && !CHIP_DEVICE_LAYER_TARGET_LINUX
#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (800_ms32)
#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (2000_ms32)
#else
#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (300_ms32)
#endif
Expand All @@ -82,7 +82,7 @@ namespace chip {
*/
#ifndef CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL
#if CHIP_ENABLE_OPENTHREAD && !CHIP_DEVICE_LAYER_TARGET_LINUX
#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (800_ms32)
#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (2000_ms32)
#else
#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (500_ms32)
#endif
Expand Down Expand Up @@ -174,7 +174,7 @@ namespace chip {
*/
#ifndef CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST
#if CHIP_ENABLE_OPENTHREAD && !CHIP_DEVICE_LAYER_TARGET_LINUX
#define CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST (500_ms)
#define CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST (1500_ms)
#else
#define CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST (0_ms)
#endif
Expand Down
7 changes: 7 additions & 0 deletions src/messaging/tests/MessagingContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <credentials/tests/CHIPCert_unit_test_vectors.h>
#include <lib/core/ErrorStr.h>
#include <lib/support/CodeUtils.h>
#include <messaging/ReliableMessageMgr.h>
#include <protocols/secure_channel/Constants.h>

namespace chip {
Expand Down Expand Up @@ -70,6 +71,9 @@ CHIP_ERROR MessagingContext::Init(TransportMgrBase * transport, IOContext * ioCo
ReturnErrorOnFailure(CreatePASESessionDavidToCharlie());
}

// Set the additional MRP backoff to zero so that it does not affect the test execution time.
Messaging::ReliableMessageMgr::SetAdditionalMRPBackoffTime(MakeOptional(System::Clock::kZero));

return CHIP_NO_ERROR;
}

Expand All @@ -85,6 +89,9 @@ void MessagingContext::Shutdown()
mFabricTable.Shutdown();
mOpCertStore.Finish();
mOpKeyStore.Finish();

// Reset the default additional MRP backoff.
Messaging::ReliableMessageMgr::SetAdditionalMRPBackoffTime(NullOptional);
}

CHIP_ERROR MessagingContext::InitFromExisting(const MessagingContext & existing)
Expand Down
8 changes: 2 additions & 6 deletions src/messaging/tests/TestAbortExchangesForFabric.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,15 +212,11 @@ void CommonCheckAbortAllButOneExchange(nlTestSuite * inSuite, TestContext & ctx,
//
auto waitTimeout = System::Clock::Milliseconds32(1000);

#if CHIP_CONFIG_ENABLE_ICD_SERVER == 1
#if CHIP_CONFIG_ENABLE_ICD_SERVER
// If running as an ICD, increase waitTimeout to account for the polling interval
waitTimeout += ICDConfigurationData::GetInstance().GetSlowPollingInterval();
waitTimeout += ICDConfigurationData::GetInstance().GetFastPollingInterval();
#endif

// Account for the retry delay booster, so that we do not timeout our IO processing before the
// retransmission failure is triggered.
waitTimeout += CHIP_CONFIG_RMP_DEFAULT_MAX_RETRANS * CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST;

ctx.GetIOContext().DriveIOUntil(waitTimeout, [&]() { return false; });
}
else
Expand Down
69 changes: 39 additions & 30 deletions src/messaging/tests/TestReliableMessageProtocol.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,6 @@ using namespace chip::System::Clock::Literals;

const char PAYLOAD[] = "Hello!";

// The CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST can be set to non-zero value
// to boost the retransmission timeout for a high latency network like Thread to
// avoid spurious retransmits.
//
// This adds extra I/O time to account for this. See the documentation for
// CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST for more details.
constexpr auto retryBoosterTimeout = CHIP_CONFIG_RMP_DEFAULT_MAX_RETRANS * CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST;

class TestContext : public chip::Test::LoopbackMessagingContext
{
public:
Expand Down Expand Up @@ -312,6 +304,34 @@ struct BackoffComplianceTestVector theBackoffComplianceTestVector[] = {
},
};

void CheckGetBackoffImpl(nlTestSuite * inSuite, System::Clock::Timeout additionalMRPBackoffTime)
{
ReliableMessageMgr::SetAdditionalMRPBackoffTime(MakeOptional(additionalMRPBackoffTime));

// Run 3x iterations to thoroughly test random jitter always results in backoff within bounds.
for (uint32_t j = 0; j < 3; j++)
{
for (const auto & test : theBackoffComplianceTestVector)
{
System::Clock::Timeout backoff = ReliableMessageMgr::GetBackoff(test.backoffBase, test.sendCount);
System::Clock::Timeout extraBackoff = additionalMRPBackoffTime;

#if CHIP_CONFIG_ENABLE_ICD_SERVER
// If running as an ICD, increase maxBackoff to account for the polling interval
extraBackoff += ICDConfigurationData::GetInstance().GetFastPollingInterval();
#endif

ChipLogProgress(Test, "Backoff base %" PRIu32 " extra %" PRIu32 " # %d: %" PRIu32, test.backoffBase.count(),
extraBackoff.count(), test.sendCount, backoff.count());

NL_TEST_ASSERT(inSuite, backoff >= test.backoffMin + extraBackoff);
NL_TEST_ASSERT(inSuite, backoff <= test.backoffMax + extraBackoff);
}
}

ReliableMessageMgr::SetAdditionalMRPBackoffTime(NullOptional);
}

} // namespace

class TestReliableMessageProtocol
Expand All @@ -336,6 +356,7 @@ class TestReliableMessageProtocol
static void CheckLostStandaloneAck(nlTestSuite * inSuite, void * inContext);
static void CheckIsPeerActiveNotInitiator(nlTestSuite * inSuite, void * inContext);
static void CheckGetBackoff(nlTestSuite * inSuite, void * inContext);
static void CheckGetBackoffAdditionalTime(nlTestSuite * inSuite, void * inContext);
static void CheckApplicationResponseDelayed(nlTestSuite * inSuite, void * inContext);
static void CheckApplicationResponseNeverComes(nlTestSuite * inSuite, void * inContext);
};
Expand Down Expand Up @@ -437,7 +458,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1);

// Wait for the initial message to fail (should take 330-413ms)
ctx.GetIOContext().DriveIOUntil(1000_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 2; });
ctx.GetIOContext().DriveIOUntil(1000_ms32, [&] { return loopback.mSentMessageCount >= 2; });
now = System::SystemClock().GetMonotonicTimestamp();
timeoutTime = now - startTime;
ChipLogProgress(Test, "Attempt #1 Timeout : %" PRIu32 "ms", timeoutTime.count());
Expand All @@ -454,7 +475,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1);

// Wait for the 1st retry to fail (should take 330-413ms)
ctx.GetIOContext().DriveIOUntil(1000_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 3; });
ctx.GetIOContext().DriveIOUntil(1000_ms32, [&] { return loopback.mSentMessageCount >= 3; });
now = System::SystemClock().GetMonotonicTimestamp();
timeoutTime = now - startTime;
ChipLogProgress(Test, "Attempt #2 Timeout : %" PRIu32 "ms", timeoutTime.count());
Expand All @@ -471,7 +492,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1);

// Wait for the 2nd retry to fail (should take 528-660ms)
ctx.GetIOContext().DriveIOUntil(1000_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 4; });
ctx.GetIOContext().DriveIOUntil(1000_ms32, [&] { return loopback.mSentMessageCount >= 4; });
now = System::SystemClock().GetMonotonicTimestamp();
timeoutTime = now - startTime;
ChipLogProgress(Test, "Attempt #3 Timeout : %" PRIu32 "ms", timeoutTime.count());
Expand All @@ -488,7 +509,7 @@ void TestReliableMessageProtocol::CheckResendApplicationMessage(nlTestSuite * in
NL_TEST_ASSERT(inSuite, rm->TestGetCountRetransTable() == 1);

// Wait for the 3rd retry to fail (should take 845-1056ms)
ctx.GetIOContext().DriveIOUntil(1500_ms32 + retryBoosterTimeout, [&] { return loopback.mSentMessageCount >= 5; });
ctx.GetIOContext().DriveIOUntil(1500_ms32, [&] { return loopback.mSentMessageCount >= 5; });
now = System::SystemClock().GetMonotonicTimestamp();
timeoutTime = now - startTime;
ChipLogProgress(Test, "Attempt #4 Timeout : %" PRIu32 "ms", timeoutTime.count());
Expand Down Expand Up @@ -1833,25 +1854,12 @@ void TestReliableMessageProtocol::CheckLostStandaloneAck(nlTestSuite * inSuite,

void TestReliableMessageProtocol::CheckGetBackoff(nlTestSuite * inSuite, void * inContext)
{
// Run 3x iterations to thoroughly test random jitter always results in backoff within bounds.
for (uint32_t j = 0; j < 3; j++)
{
for (const auto & test : theBackoffComplianceTestVector)
{
System::Clock::Timeout backoff = ReliableMessageMgr::GetBackoff(test.backoffBase, test.sendCount);
ChipLogProgress(Test, "Backoff base %" PRIu32 " # %d: %" PRIu32, test.backoffBase.count(), test.sendCount,
backoff.count());

NL_TEST_ASSERT(inSuite, backoff >= test.backoffMin);
CheckGetBackoffImpl(inSuite, System::Clock::kZero);
}

auto maxBackoff = test.backoffMax + retryBoosterTimeout;
#if CHIP_CONFIG_ENABLE_ICD_SERVER == 1
// If running as an ICD, increase maxBackoff to account for the polling interval
maxBackoff += ICDConfigurationData::GetInstance().GetSlowPollingInterval();
#endif
NL_TEST_ASSERT(inSuite, backoff <= maxBackoff);
}
}
void TestReliableMessageProtocol::CheckGetBackoffAdditionalTime(nlTestSuite * inSuite, void * inContext)
{
CheckGetBackoffImpl(inSuite, System::Clock::Seconds32(1));
}

void TestReliableMessageProtocol::CheckApplicationResponseDelayed(nlTestSuite * inSuite, void * inContext)
Expand Down Expand Up @@ -2195,6 +2203,7 @@ const nlTest sTests[] = {
TestReliableMessageProtocol::CheckLostStandaloneAck),
NL_TEST_DEF("Test Is Peer Active Retry logic", TestReliableMessageProtocol::CheckIsPeerActiveNotInitiator),
NL_TEST_DEF("Test MRP backoff algorithm", TestReliableMessageProtocol::CheckGetBackoff),
NL_TEST_DEF("Test MRP backoff algorithm with additional time", TestReliableMessageProtocol::CheckGetBackoffAdditionalTime),
// TODO: Re-enable this test, after changing test to use Mock clock / DriveIO rather than DriveIOUntil.
// Issue: https://github.com/project-chip/connectedhomeip/issues/32440
// NL_TEST_DEF("Test an application response that comes after MRP retransmits run out",
Expand Down
13 changes: 4 additions & 9 deletions src/platform/nrfconnect/CHIPPlatformConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,16 @@
#define CHIP_CONFIG_LOG_MODULE_Support_PROGRESS 0
#endif

// Set MRP retry intervals for Thread and Wi-Fi to test-proven values.
#ifndef CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL
#if CHIP_ENABLE_OPENTHREAD
#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (800_ms32)
#else
#ifndef CONFIG_NET_L2_OPENTHREAD
#define CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL (1000_ms32)
#endif // CHIP_ENABLE_OPENTHREAD
#endif // CONFIG_NET_L2_OPENTHREAD
#endif // CHIP_CONFIG_MRP_LOCAL_ACTIVE_RETRY_INTERVAL

#ifndef CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL
#if CHIP_ENABLE_OPENTHREAD
#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (800_ms32)
#else
#ifndef CONFIG_NET_L2_OPENTHREAD
#define CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL (1000_ms32)
#endif // CHIP_ENABLE_OPENTHREAD
#endif // CONFIG_NET_L2_OPENTHREAD
#endif // CHIP_CONFIG_MRP_LOCAL_IDLE_RETRY_INTERVAL

#ifndef CHIP_CONFIG_ICD_IDLE_MODE_DURATION_SEC
Expand Down
11 changes: 7 additions & 4 deletions src/protocols/secure_channel/tests/TestPASESession.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,11 +304,14 @@ void SecurePairingHandshakeTestCommon(nlTestSuite * inSuite, void * inContext, S

while (delegate.mMessageDropped)
{
auto waitTimeout = 100_ms + CHIP_CONFIG_MRP_RETRY_INTERVAL_SENDER_BOOST;
auto waitTimeout = 100_ms;

#if CHIP_CONFIG_ENABLE_ICD_SERVER == 1
// If running as an ICD, increase waitTimeout to account for the polling interval
waitTimeout += ICDConfigurationData::GetInstance().GetSlowPollingInterval();
#if CHIP_CONFIG_ENABLE_ICD_SERVER
// If running as an ICD, increase waitTimeout to account for:
// - longer MRP intervals, configured above to 1s/1s,
// - the fast-polling interval that is added to the MRP backoff time.
waitTimeout += 2000_ms32;
waitTimeout += ICDConfigurationData::GetInstance().GetFastPollingInterval();
#endif

// Wait some time so the dropped message will be retransmitted when we drain the IO.
Expand Down

0 comments on commit 5c908a1

Please sign in to comment.