Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

elastic scaling: rework core selector handling #6939

Open
wants to merge 46 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
4f898fc
add zombienet test for rfc 103
alindima Nov 12, 2024
89d8e97
Merge remote-tracking branch 'origin/master' into alindima/rfc-103-test
alindima Nov 12, 2024
a2028bf
override col_image
alindima Nov 13, 2024
0fcdcbb
add zombienet test for mixed receipt versions
alindima Nov 13, 2024
c81783f
fix yml
alindima Nov 13, 2024
d7fba75
try fixing col_image
alindima Nov 14, 2024
2e83300
remove resource reqs
alindima Nov 14, 2024
3a6b09e
fix
alindima Nov 14, 2024
037a632
relax
alindima Nov 20, 2024
dd3ede2
Merge remote-tracking branch 'origin/master' into alindima/rfc-103-test
alindima Nov 20, 2024
478fbb7
Merge remote-tracking branch 'origin/master' into alindima/rfc-103-test
alindima Nov 29, 2024
6f4dbb4
rewrite first test using zombienet-sdk
alindima Dec 2, 2024
610f29d
fixes
alindima Dec 2, 2024
7e1f0d4
add second test
alindima Dec 2, 2024
78eaf5e
do not re-init the logger
alindima Dec 2, 2024
184f206
prdoc
alindima Dec 2, 2024
01fbf6b
oops
alindima Dec 2, 2024
12c9f10
oops again
alindima Dec 2, 2024
069b7ba
fix prdoc
alindima Dec 2, 2024
385a139
Merge branch 'master' into alindima/rfc-103-test
alindima Dec 2, 2024
0a5f5ed
feedback
alindima Dec 4, 2024
b221b31
Merge branch 'master' into alindima/rfc-103-test
alindima Dec 4, 2024
e47d0f1
Merge branch 'master' into alindima/rfc-103-test
alindima Dec 10, 2024
7b36433
relax assertion
alindima Dec 10, 2024
f25672b
bump zombienet version
pepoviola Dec 11, 2024
145dfb5
update lock file
pepoviola Dec 11, 2024
3293a62
update zombienet
pepoviola Dec 12, 2024
467ada4
Merge remote-tracking branch 'origin/master' into alindima/rfc-103-test
alindima Dec 12, 2024
5b899f0
Merge branch 'master' into alindima/rfc-103-test
pepoviola Dec 12, 2024
994fb2a
update lock
pepoviola Dec 12, 2024
7988e65
WIP
alindima Dec 17, 2024
1db88ce
Merge remote-tracking branch 'origin/alindima/rfc-103-test' into alin…
alindima Dec 17, 2024
be786cb
some updates
alindima Dec 17, 2024
5c27201
Merge remote-tracking branch 'origin/master' into alindima/remove-exp…
alindima Dec 18, 2024
52bdd24
fixes
alindima Dec 18, 2024
90f7b5c
add candidate-validation tests
alindima Dec 18, 2024
e2d788e
clippy
alindima Dec 18, 2024
bb9a399
remove leftover feature flag
alindima Dec 18, 2024
137d9a4
add tests to parachain-system
alindima Dec 18, 2024
6dc74f6
remove dumb code
alindima Dec 23, 2024
796fd14
Merge remote-tracking branch 'origin/master' into alindima/remove-exp…
alindima Dec 23, 2024
4ef8830
fix warns
alindima Dec 23, 2024
81905bd
Merge branch 'master' into alindima/remove-experimental-ump-signals-f…
alindima Jan 6, 2025
b7b8031
Merge remote-tracking branch 'origin/master' into alindima/remove-exp…
alindima Jan 8, 2025
eddec7e
Merge remote-tracking branch 'origin/master' into alindima/remove-exp…
alindima Jan 13, 2025
61df047
fix zn test, do some renaming and add comments
alindima Jan 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitlab/pipeline/zombienet/polkadot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,7 @@ zombienet-polkadot-elastic-scaling-doesnt-break-parachains:
before_script:
- !reference [ ".zombienet-polkadot-common", "before_script" ]
- export POLKADOT_IMAGE="${ZOMBIENET_INTEGRATION_TEST_IMAGE}"
- export CUMULUS_IMAGE="${COL_IMAGE}"
- export X_INFRA_INSTANCE=spot # use spot by default
variables:
KUBERNETES_CPU_REQUEST: "1"
Expand Down
1 change: 1 addition & 0 deletions cumulus/client/collator/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ where
)
})
.ok()?;

let horizontal_messages = collation_info
.horizontal_messages
.try_into()
Expand Down
25 changes: 16 additions & 9 deletions cumulus/client/consensus/aura/src/collators/slot_based/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,13 +219,20 @@ where
{
let runtime_api = para_client.runtime_api();

if runtime_api.has_api::<dyn GetCoreSelectorApi<Block>>(parent_hash)? {
Ok(runtime_api.core_selector(parent_hash)?)
} else {
let next_block_number: U256 = (parent_number + One::one()).into();

// If the runtime API does not support the core selector API, fallback to some default
// values.
Ok((CoreSelector(next_block_number.byte(0)), ClaimQueueOffset(DEFAULT_CLAIM_QUEUE_OFFSET)))
}
let next_block_number: U256 = (parent_number + One::one()).into();
// If the runtime API does not support the core selector API, fallback to some default
// values.
let fallback_core_selector =
(CoreSelector(next_block_number.byte(0)), ClaimQueueOffset(DEFAULT_CLAIM_QUEUE_OFFSET));

let maybe_api_version =
runtime_api.api_version::<dyn GetCoreSelectorApi<Block>>(parent_hash)?;

Ok(match maybe_api_version {
Some(api_version) if api_version >= 2 =>
runtime_api.core_selector(parent_hash)?.unwrap_or(fallback_core_selector),
alindima marked this conversation as resolved.
Show resolved Hide resolved
#[allow(deprecated)]
Some(_) => runtime_api.core_selector_before_version_2(parent_hash)?,
None => fallback_core_selector,
})
}
2 changes: 0 additions & 2 deletions cumulus/pallets/parachain-system/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -120,5 +120,3 @@ try-runtime = [
"polkadot-runtime-parachains/try-runtime",
"sp-runtime/try-runtime",
]

experimental-ump-signals = []
82 changes: 52 additions & 30 deletions cumulus/pallets/parachain-system/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ use core::{cmp, marker::PhantomData};
use cumulus_primitives_core::{
relay_chain::{
self,
vstaging::{ClaimQueueOffset, CoreSelector, DEFAULT_CLAIM_QUEUE_OFFSET},
vstaging::{
ClaimQueueOffset, CoreSelector, UMPSignal, DEFAULT_CLAIM_QUEUE_OFFSET, UMP_SEPARATOR,
},
},
AbridgedHostConfiguration, ChannelInfo, ChannelStatus, CollationInfo, GetChannelInfo,
InboundDownwardMessage, InboundHrmpMessage, ListChannelInfos, MessageSendError,
Expand Down Expand Up @@ -192,44 +194,67 @@ pub mod ump_constants {
}

/// Trait for selecting the next core to build the candidate for.
/// If methods return `None`,
/// WARN: Only use an implementation that returns `Some(..)` if you're sure that all collator nodes
/// have been updated to the latest version and are using the slot-based collator.
pub trait SelectCore {
/// Core selector information for the current block.
fn selected_core() -> (CoreSelector, ClaimQueueOffset);
fn selected_core() -> Option<(CoreSelector, ClaimQueueOffset)>;
/// Core selector information for the next block.
fn select_next_core() -> (CoreSelector, ClaimQueueOffset);
fn select_next_core() -> Option<(CoreSelector, ClaimQueueOffset)>;
}

/// The default core selection policy.
pub struct DefaultCoreSelector<T>(PhantomData<T>);
impl SelectCore for () {
fn selected_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
None
}
fn select_next_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
None
}
}

/// Default core selection policy, no-op. Leaves the core selection up to the node side.
/// Backwards compatible with all node versions. Use this unless you are sure that all collator
/// nodes have been updated to latest version and are using the slot-based collator.
pub type DefaultCoreSelector = ();

/// Core selection policy which does a round-robin selection on the assigned cores at claim queue
/// depth 0.
/// WARN: Only use if you're sure that all collator nodes have been updated to the latest version
/// and are using the slot-based collator.
pub struct RoundRobinCoreSelector<T>(PhantomData<T>);

impl<T: frame_system::Config> SelectCore for DefaultCoreSelector<T> {
fn selected_core() -> (CoreSelector, ClaimQueueOffset) {
impl<T: frame_system::Config> SelectCore for RoundRobinCoreSelector<T> {
fn selected_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
let core_selector: U256 = frame_system::Pallet::<T>::block_number().into();

(CoreSelector(core_selector.byte(0)), ClaimQueueOffset(DEFAULT_CLAIM_QUEUE_OFFSET))
Some((CoreSelector(core_selector.byte(0)), ClaimQueueOffset(DEFAULT_CLAIM_QUEUE_OFFSET)))
}

fn select_next_core() -> (CoreSelector, ClaimQueueOffset) {
fn select_next_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
let core_selector: U256 = (frame_system::Pallet::<T>::block_number() + One::one()).into();

(CoreSelector(core_selector.byte(0)), ClaimQueueOffset(DEFAULT_CLAIM_QUEUE_OFFSET))
Some((CoreSelector(core_selector.byte(0)), ClaimQueueOffset(DEFAULT_CLAIM_QUEUE_OFFSET)))
}
}

/// Core selection policy that builds on claim queue offset 1.
pub struct LookaheadCoreSelector<T>(PhantomData<T>);
/// Core selection policy which does a round-robin selection on the assigned cores at claim queue
/// depth 1.
/// WARN: Only use if you're sure that all collator nodes have been updated to the latest version
/// and are using the slot-based collator.
pub struct LookaheadRoundRobinCoreSelector<T>(PhantomData<T>);

impl<T: frame_system::Config> SelectCore for LookaheadCoreSelector<T> {
fn selected_core() -> (CoreSelector, ClaimQueueOffset) {
impl<T: frame_system::Config> SelectCore for LookaheadRoundRobinCoreSelector<T> {
fn selected_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
let core_selector: U256 = frame_system::Pallet::<T>::block_number().into();

(CoreSelector(core_selector.byte(0)), ClaimQueueOffset(1))
Some((CoreSelector(core_selector.byte(0)), ClaimQueueOffset(1)))
}

fn select_next_core() -> (CoreSelector, ClaimQueueOffset) {
fn select_next_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
let core_selector: U256 = (frame_system::Pallet::<T>::block_number() + One::one()).into();

(CoreSelector(core_selector.byte(0)), ClaimQueueOffset(1))
Some((CoreSelector(core_selector.byte(0)), ClaimQueueOffset(1)))
}
}

Expand Down Expand Up @@ -391,9 +416,7 @@ pub mod pallet {
UpwardMessages::<T>::put(&up[..num as usize]);
*up = up.split_off(num as usize);

// Send the core selector UMP signal. This is experimental until relay chain
// validators are upgraded to handle ump signals.
#[cfg(feature = "experimental-ump-signals")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't we also used this to guard until enabling RFC103 on the relay chain?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's important that they know at least to ignore the signals if the feature is disabled (which I'm pretty sure was included in a node release already, will check)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm, I think you're right. the feature needs to be enabled on the relay chain, this was another reason I chose a compile time feature back then..

So I'll leave the core selector config for later and cherry-pick the changes to only refuse candidates that output UMP signals but use v1 descriptor into its own PR that can be merged and also backported

// Send the core selector UMP signal.
Self::send_ump_signal();

// If the total size of the pending messages is less than the threshold,
Expand Down Expand Up @@ -1429,7 +1452,7 @@ impl<T: Config> Pallet<T> {
}

/// Returns the core selector for the next block.
pub fn core_selector() -> (CoreSelector, ClaimQueueOffset) {
pub fn core_selector() -> Option<(CoreSelector, ClaimQueueOffset)> {
T::SelectCore::select_next_core()
}

Expand All @@ -1450,17 +1473,16 @@ impl<T: Config> Pallet<T> {
}

/// Send the ump signals
#[cfg(feature = "experimental-ump-signals")]
fn send_ump_signal() {
use cumulus_primitives_core::relay_chain::vstaging::{UMPSignal, UMP_SEPARATOR};
// If the runtime is configured with a core selection policy, send the core selector signal.
let maybe_core_selector = T::SelectCore::selected_core();

UpwardMessages::<T>::mutate(|up| {
up.push(UMP_SEPARATOR);

// Send the core selector signal.
let core_selector = T::SelectCore::selected_core();
up.push(UMPSignal::SelectCore(core_selector.0, core_selector.1).encode());
});
if let Some(core_selector) = maybe_core_selector {
UpwardMessages::<T>::mutate(|up| {
up.push(UMP_SEPARATOR);
up.push(UMPSignal::SelectCore(core_selector.0, core_selector.1).encode());
});
}
}

/// Open HRMP channel for using it in benchmarks or tests.
Expand Down
28 changes: 26 additions & 2 deletions cumulus/pallets/parachain-system/src/mock.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use super::*;

use alloc::collections::vec_deque::VecDeque;
use codec::Encode;
use core::num::NonZeroU32;
use core::{cell::Cell, num::NonZeroU32};
use cumulus_primitives_core::{
relay_chain::BlockNumber as RelayBlockNumber, AggregateMessageOrigin, InboundDownwardMessage,
InboundHrmpMessage, PersistedValidationData,
Expand Down Expand Up @@ -94,7 +94,31 @@ impl Config for Test {
type CheckAssociatedRelayNumber = AnyRelayNumber;
type ConsensusHook = TestConsensusHook;
type WeightInfo = ();
type SelectCore = DefaultCoreSelector<Test>;
type SelectCore = TestCoreSelector<RoundRobinCoreSelector<Test>>;
}

std::thread_local! {
pub static USE_CORE_SELECTOR: Cell<bool> = Cell::new(true);
}

pub struct TestCoreSelector<Selector>(PhantomData<Selector>);

impl<Selector: SelectCore> SelectCore for TestCoreSelector<Selector> {
fn selected_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
if USE_CORE_SELECTOR.get() {
Selector::selected_core()
} else {
None
}
}

fn select_next_core() -> Option<(CoreSelector, ClaimQueueOffset)> {
if USE_CORE_SELECTOR.get() {
Selector::select_next_core()
} else {
None
}
}
}

std::thread_local! {
Expand Down
Loading
Loading