From 8806e63e98063f901646e0c3739d9e5dfc736d1c Mon Sep 17 00:00:00 2001 From: Andrea Date: Wed, 15 Jan 2025 15:49:37 +0100 Subject: [PATCH] fix(reshading): cancelling the resharding task should not cancel the entire resharding (#12739) This PR fixes a rather old misconception in flat storage resharding. When the slit parent task gets cancelled (because the node gets stopped) the resharding status of parent and children should not be reverted back to what it was previously. In this way resharding can resume when the node starts again. --- chain/chain/src/flat_storage_resharder.rs | 28 +++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/chain/chain/src/flat_storage_resharder.rs b/chain/chain/src/flat_storage_resharder.rs index f13957dcd4c..852c4df9170 100644 --- a/chain/chain/src/flat_storage_resharder.rs +++ b/chain/chain/src/flat_storage_resharder.rs @@ -287,7 +287,7 @@ impl FlatStorageResharder { .send(FlatStorageSplitShardRequest { resharder }); } - /// Cleans up children shards flat storage's content (status is excluded). + /// Cleans up children shards flat storage's content (status and deltas are excluded). #[tracing::instrument( level = "info", target = "resharding", @@ -300,7 +300,6 @@ impl FlatStorageResharder { info!(target: "resharding", ?left_child_shard, ?right_child_shard, "cleaning up children shards flat storage's content"); let mut store_update = self.runtime.store().flat_store().store_update(); for child in [left_child_shard, right_child_shard] { - store_update.remove_all_deltas(*child); store_update.remove_all_values(*child); } store_update.commit()?; @@ -564,19 +563,24 @@ impl FlatStorageResharder { ); } } - FlatStorageReshardingTaskResult::Failed - | FlatStorageReshardingTaskResult::Cancelled => { - // We got an error or a cancellation request. + FlatStorageReshardingTaskResult::Failed => { // Reset parent. store_update.set_flat_storage_status( parent_shard, FlatStorageStatus::Ready(FlatStorageReadyStatus { flat_head }), ); - // Remove children shards leftovers. + // Remove children shards entirely. for child_shard in [left_child_shard, right_child_shard] { store_update.remove_flat_storage(child_shard); } } + FlatStorageReshardingTaskResult::Cancelled => { + // Remove children shards leftovers, but keep intact their current status and deltas + // plus the current status of the parent, so resharding can resume later. + for child_shard in [left_child_shard, right_child_shard] { + store_update.remove_all_values(child_shard); + } + } FlatStorageReshardingTaskResult::Postponed => { panic!("can't finalize processing of a postponed split task!"); } @@ -1839,8 +1843,7 @@ mod tests { // Perform resharding. assert!(resharder.start_resharding(resharding_event_type, &new_shard_layout).is_ok()); let (parent_shard, split_params) = resharder.get_parent_shard_and_split_params().unwrap(); - let ParentSplitParameters { left_child_shard, right_child_shard, flat_head, .. } = - split_params; + let ParentSplitParameters { left_child_shard, right_child_shard, .. } = split_params; // Cancel the task before it starts. resharder.controller.handle.stop(); @@ -1848,16 +1851,17 @@ mod tests { // Run the task. sender.call_split_shard_task(); - // Check that resharding was effectively cancelled. + // Check that the resharding task was effectively cancelled. + // Note that resharding as a whole is not cancelled: it should resume if the node restarts. let flat_store = resharder.runtime.store().flat_store(); - assert_eq!( + assert_matches!( flat_store.get_flat_storage_status(parent_shard), - Ok(FlatStorageStatus::Ready(FlatStorageReadyStatus { flat_head })) + Ok(FlatStorageStatus::Resharding(FlatStorageReshardingStatus::SplittingParent(_))) ); for child_shard in [left_child_shard, right_child_shard] { assert_eq!( flat_store.get_flat_storage_status(child_shard), - Ok(FlatStorageStatus::Empty) + Ok(FlatStorageStatus::Resharding(FlatStorageReshardingStatus::CreatingChild)) ); assert_eq!(flat_store.iter(child_shard).count(), 0); }