Skip to content

Commit

Permalink
fix(reshading): cancelling the resharding task should not cancel the …
Browse files Browse the repository at this point in the history
…entire resharding (#12739)

This PR fixes a rather old misconception in flat storage resharding.
When the slit parent task gets cancelled (because the node gets stopped)
the resharding status of parent and children should not be reverted back
to what it was previously.
In this way resharding can resume when the node starts again.
  • Loading branch information
Trisfald authored Jan 15, 2025
1 parent 1bde566 commit 8806e63
Showing 1 changed file with 16 additions and 12 deletions.
28 changes: 16 additions & 12 deletions chain/chain/src/flat_storage_resharder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ impl FlatStorageResharder {
.send(FlatStorageSplitShardRequest { resharder });
}

/// Cleans up children shards flat storage's content (status is excluded).
/// Cleans up children shards flat storage's content (status and deltas are excluded).
#[tracing::instrument(
level = "info",
target = "resharding",
Expand All @@ -300,7 +300,6 @@ impl FlatStorageResharder {
info!(target: "resharding", ?left_child_shard, ?right_child_shard, "cleaning up children shards flat storage's content");
let mut store_update = self.runtime.store().flat_store().store_update();
for child in [left_child_shard, right_child_shard] {
store_update.remove_all_deltas(*child);
store_update.remove_all_values(*child);
}
store_update.commit()?;
Expand Down Expand Up @@ -564,19 +563,24 @@ impl FlatStorageResharder {
);
}
}
FlatStorageReshardingTaskResult::Failed
| FlatStorageReshardingTaskResult::Cancelled => {
// We got an error or a cancellation request.
FlatStorageReshardingTaskResult::Failed => {
// Reset parent.
store_update.set_flat_storage_status(
parent_shard,
FlatStorageStatus::Ready(FlatStorageReadyStatus { flat_head }),
);
// Remove children shards leftovers.
// Remove children shards entirely.
for child_shard in [left_child_shard, right_child_shard] {
store_update.remove_flat_storage(child_shard);
}
}
FlatStorageReshardingTaskResult::Cancelled => {
// Remove children shards leftovers, but keep intact their current status and deltas
// plus the current status of the parent, so resharding can resume later.
for child_shard in [left_child_shard, right_child_shard] {
store_update.remove_all_values(child_shard);
}
}
FlatStorageReshardingTaskResult::Postponed => {
panic!("can't finalize processing of a postponed split task!");
}
Expand Down Expand Up @@ -1839,25 +1843,25 @@ mod tests {
// Perform resharding.
assert!(resharder.start_resharding(resharding_event_type, &new_shard_layout).is_ok());
let (parent_shard, split_params) = resharder.get_parent_shard_and_split_params().unwrap();
let ParentSplitParameters { left_child_shard, right_child_shard, flat_head, .. } =
split_params;
let ParentSplitParameters { left_child_shard, right_child_shard, .. } = split_params;

// Cancel the task before it starts.
resharder.controller.handle.stop();

// Run the task.
sender.call_split_shard_task();

// Check that resharding was effectively cancelled.
// Check that the resharding task was effectively cancelled.
// Note that resharding as a whole is not cancelled: it should resume if the node restarts.
let flat_store = resharder.runtime.store().flat_store();
assert_eq!(
assert_matches!(
flat_store.get_flat_storage_status(parent_shard),
Ok(FlatStorageStatus::Ready(FlatStorageReadyStatus { flat_head }))
Ok(FlatStorageStatus::Resharding(FlatStorageReshardingStatus::SplittingParent(_)))
);
for child_shard in [left_child_shard, right_child_shard] {
assert_eq!(
flat_store.get_flat_storage_status(child_shard),
Ok(FlatStorageStatus::Empty)
Ok(FlatStorageStatus::Resharding(FlatStorageReshardingStatus::CreatingChild))
);
assert_eq!(flat_store.iter(child_shard).count(), 0);
}
Expand Down

0 comments on commit 8806e63

Please sign in to comment.