Skip to content

Commit

Permalink
Add cluster provision functionality
Browse files Browse the repository at this point in the history
After starting the metadata store service and the gRPC server, the node will
try to initialize itself by joining an existing cluster. Additionally each node
exposes a provision cluster gRPC call with which it is possible to provision
a cluster (writing the initial NodesConfiguration, PartitionTable and Logs).
Nodes can only join after the cluster is provisioned.

This fixes restatedev#2409.
  • Loading branch information
tillrohrmann committed Dec 24, 2024
1 parent 1ac1f70 commit 46215ab
Show file tree
Hide file tree
Showing 23 changed files with 825 additions and 268 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 10 additions & 25 deletions crates/admin/src/cluster_controller/logs_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,8 @@ use tracing::{debug, error, trace, trace_span, Instrument};
use xxhash_rust::xxh3::Xxh3Builder;

use restate_bifrost::{Bifrost, BifrostAdmin, Error as BifrostError};
use restate_core::metadata_store::{
retry_on_network_error, MetadataStoreClient, Precondition, ReadWriteError, WriteError,
};
use restate_core::metadata_store::{MetadataStoreClient, Precondition, ReadWriteError, WriteError};
use restate_core::{Metadata, MetadataWriter, ShutdownError, TaskCenterFutureExt};
use restate_types::config::Configuration;
use restate_types::errors::GenericError;
use restate_types::identifiers::PartitionId;
use restate_types::live::Pinned;
Expand Down Expand Up @@ -639,9 +636,9 @@ struct LogsControllerInner {
}

impl LogsControllerInner {
fn new(configuration: LogsConfiguration, retry_policy: RetryPolicy) -> Self {
fn new(current_logs: Arc<Logs>, retry_policy: RetryPolicy) -> Self {
Self {
current_logs: Arc::new(Logs::with_logs_configuration(configuration)),
current_logs,
logs_state: HashMap::with_hasher(Xxh3Builder::default()),
logs_write_in_progress: None,
retry_policy,
Expand Down Expand Up @@ -925,26 +922,11 @@ pub struct LogsController {
}

impl LogsController {
pub async fn init(
configuration: &Configuration,
pub fn new(
bifrost: Bifrost,
metadata_store_client: MetadataStoreClient,
metadata_writer: MetadataWriter,
) -> Result<Self> {
// obtain the latest logs or init it with an empty logs variant
let logs = retry_on_network_error(
configuration.common.network_error_retry_policy.clone(),
|| {
metadata_store_client.get_or_insert(BIFROST_CONFIG_KEY.clone(), || {
Logs::from_configuration(configuration)
})
},
)
.await?;

let logs_configuration = logs.configuration().clone();
metadata_writer.update(Arc::new(logs)).await?;

) -> Self {
//todo(azmy): make configurable
let retry_policy = RetryPolicy::exponential(
Duration::from_millis(10),
Expand All @@ -955,7 +937,10 @@ impl LogsController {

let mut this = Self {
effects: Some(Vec::new()),
inner: LogsControllerInner::new(logs_configuration, retry_policy),
inner: LogsControllerInner::new(
Metadata::with_current(|m| m.logs_snapshot()),
retry_policy,
),
bifrost,
metadata_store_client,
metadata_writer,
Expand All @@ -964,7 +949,7 @@ impl LogsController {
};

this.find_logs_tail();
Ok(this)
this
}

pub fn find_logs_tail(&mut self) {
Expand Down
32 changes: 1 addition & 31 deletions crates/admin/src/cluster_controller/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ use restate_types::partition_table::{
use restate_types::replicated_loglet::ReplicatedLogletParams;

use restate_bifrost::{Bifrost, BifrostAdmin, SealedSegment};
use restate_core::metadata_store::{retry_on_network_error, MetadataStoreClient};
use restate_core::metadata_store::MetadataStoreClient;
use restate_core::network::rpc_router::RpcRouter;
use restate_core::network::tonic_service_filter::{TonicServiceFilter, WaitForReady};
use restate_core::network::{
Expand Down Expand Up @@ -296,8 +296,6 @@ impl<T: TransportConnect> Service<T> {
}

pub async fn run(mut self) -> anyhow::Result<()> {
self.init_partition_table().await?;

let mut config_watcher = Configuration::watcher();
let mut cluster_state_watcher = self.cluster_state_refresher.cluster_state_watcher();

Expand Down Expand Up @@ -353,34 +351,6 @@ impl<T: TransportConnect> Service<T> {
}
}

/// creates partition table iff it does not exist
async fn init_partition_table(&mut self) -> anyhow::Result<()> {
let configuration = self.configuration.live_load();

let partition_table = retry_on_network_error(
configuration.common.network_error_retry_policy.clone(),
|| {
self.metadata_store_client
.get_or_insert(PARTITION_TABLE_KEY.clone(), || {
let partition_table = PartitionTable::with_equally_sized_partitions(
Version::MIN,
configuration.common.bootstrap_num_partitions.get(),
);

debug!("Initializing the partition table with '{partition_table:?}'");

partition_table
})
},
)
.await?;

self.metadata_writer
.update(Arc::new(partition_table))
.await?;

Ok(())
}
/// Triggers a snapshot creation for the given partition by issuing an RPC
/// to the node hosting the active leader.
async fn create_partition_snapshot(
Expand Down
6 changes: 2 additions & 4 deletions crates/admin/src/cluster_controller/service/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,11 @@ where
)
.await?;

let logs_controller = LogsController::init(
&configuration,
let logs_controller = LogsController::new(
service.bifrost.clone(),
service.metadata_store_client.clone(),
service.metadata_writer.clone(),
)
.await?;
);

let (log_trim_interval, log_trim_threshold) =
create_log_trim_interval(&configuration.admin);
Expand Down
8 changes: 4 additions & 4 deletions crates/bifrost/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@ impl BifrostService {
pub fn handle(&self) -> Bifrost {
self.bifrost.clone()
}
/// Runs initialization phase, then returns a handle to join on shutdown.
/// In this phase the system should wait until this is completed before
/// continuing. For instance, a worker mark itself as `STARTING_UP` and not
/// accept any requests until this is completed.

/// Runs initialization phase. In this phase the system should wait until this is completed
/// before continuing. For instance, a worker marks itself as `STARTING_UP` and does not accept
/// any requests until this is completed.
///
/// This requires to run within a task_center context.
pub async fn start(self) -> anyhow::Result<()> {
Expand Down
1 change: 1 addition & 0 deletions crates/core/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.protoc_arg("--experimental_allow_proto3_optional")
.extern_path(".restate.node", "::restate_types::protobuf::node")
.extern_path(".restate.common", "::restate_types::protobuf::common")
.extern_path(".restate.cluster", "::restate_types::protobuf::cluster")
.compile_protos(
&["./protobuf/node_ctl_svc.proto"],
&["protobuf", "../types/protobuf"],
Expand Down
25 changes: 25 additions & 0 deletions crates/core/protobuf/node_ctl_svc.proto
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
syntax = "proto3";

import "google/protobuf/empty.proto";
import "restate/cluster.proto";
import "restate/common.proto";
import "restate/node.proto";

Expand All @@ -20,6 +21,30 @@ service NodeCtlSvc {
rpc GetIdent(google.protobuf.Empty) returns (IdentResponse);

rpc GetMetadata(GetMetadataRequest) returns (GetMetadataResponse);

// Provision the Restate cluster on this node.
rpc ProvisionCluster(ProvisionClusterRequest) returns (ProvisionClusterResponse);
}

message ProvisionClusterRequest {
  // If true, the provisioning is presumably only computed/validated and
  // reported back without writing anything — TODO confirm exact dry-run
  // semantics against the server implementation.
  bool dry_run = 1;
  // Number of partitions to provision the cluster with. NOTE(review): a
  // server-side default is presumably applied when absent — confirm.
  optional uint32 num_partitions = 2;
  // Replication strategy to use for partition placement.
  optional restate.cluster.ReplicationStrategy placement_strategy = 3;
  // Default log provider to configure for the cluster's logs.
  optional restate.cluster.DefaultProvider log_provider = 4;
}

// Outcome kind for a ProvisionCluster call.
//
// Values follow the proto3 style guide: SCREAMING_SNAKE_CASE prefixed with the
// enum name. Proto3 enum values live in the scope enclosing the enum (C-style
// scoping), so unprefixed names like `ERROR` or `SUCCESS` can collide with
// other enums or platform macros in generated code. Field numbers are
// unchanged, so this rename is wire-compatible.
enum ProvisionClusterResponseKind {
  // Default/unset value, required to be 0 in proto3.
  PROVISION_CLUSTER_RESPONSE_KIND_UNKNOWN = 0;
  PROVISION_CLUSTER_RESPONSE_KIND_ERROR = 1;
  PROVISION_CLUSTER_RESPONSE_KIND_SUCCESS = 2;
  PROVISION_CLUSTER_RESPONSE_KIND_DRY_RUN = 3;
}

message ProvisionClusterResponse {
  // Outcome of the provision attempt (error, success, or dry run).
  ProvisionClusterResponseKind kind = 1;
  // If there is an error, this field will be set. All other fields will be empty.
  optional string error = 2;
  // The effective cluster configuration — presumably set for the success and
  // dry-run outcomes (empty on error, per the comment above) — confirm.
  optional restate.cluster.ClusterConfiguration cluster_configuration = 3;
}

message IdentResponse {
Expand Down
141 changes: 129 additions & 12 deletions crates/core/src/metadata_store.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@ use async_trait::async_trait;
use bytes::{Bytes, BytesMut};
use bytestring::ByteString;
use restate_types::errors::GenericError;
use restate_types::metadata_store::keys::NODES_CONFIG_KEY;
use restate_types::nodes_config::NodesConfiguration;
use restate_types::retries::RetryPolicy;
use restate_types::storage::{StorageCodec, StorageDecode, StorageEncode};
use restate_types::storage::{StorageCodec, StorageDecode, StorageEncode, StorageEncodeError};
use restate_types::{flexbuffers_storage_encode_decode, Version, Versioned};
use std::future::Future;
use std::sync::Arc;
Expand Down Expand Up @@ -51,6 +53,29 @@ pub enum WriteError {
Store(GenericError),
}

/// Errors that can occur when provisioning the metadata store.
#[derive(Debug, thiserror::Error)]
pub enum ProvisionError {
    /// Transport-level failure; the only variant reported as a network error
    /// by the `MetadataStoreClientError` implementation below.
    #[error("network error: {0}")]
    Network(GenericError),
    /// Unexpected server-side/internal failure.
    #[error("internal error: {0}")]
    Internal(String),
    /// Failure to encode/decode the metadata being provisioned.
    #[error("codec error: {0}")]
    Codec(GenericError),
    /// Failure reported by the underlying store.
    #[error("store error: {0}")]
    Store(GenericError),
}

impl MetadataStoreClientError for ProvisionError {
    /// Only [`ProvisionError::Network`] counts as a network error; every other
    /// variant is a non-network, local/store failure.
    fn is_network_error(&self) -> bool {
        matches!(self, ProvisionError::Network(_))
    }
}

#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct VersionedValue {
pub version: Version,
Expand Down Expand Up @@ -100,6 +125,89 @@ pub trait MetadataStore {
/// Deletes the key-value pair for the given key following the provided precondition. If the
/// precondition is not met, then the operation returns a [`WriteError::PreconditionViolation`].
async fn delete(&self, key: ByteString, precondition: Precondition) -> Result<(), WriteError>;

/// Tries to provision the metadata store with the provided [`NodesConfiguration`]. Returns
/// `true` if the metadata store was newly provisioned. Returns `false` if the metadata store
/// is already provisioned.
async fn provision(
&self,
nodes_configuration: &NodesConfiguration,
) -> Result<bool, ProvisionError>;
}

/// A provisioned metadata store does not need to be explicitly provisioned. Therefore, a provision
/// call is translated into a put command.
///
/// Implementors automatically obtain a [`MetadataStore`] implementation through the blanket
/// impl in this module.
#[async_trait]
pub trait ProvisionedMetadataStore {
    /// Gets the value and its current version for the given key. If key-value pair is not present,
    /// then return [`None`].
    async fn get(&self, key: ByteString) -> Result<Option<VersionedValue>, ReadError>;

    /// Gets the current version for the given key. If key-value pair is not present, then return
    /// [`None`].
    async fn get_version(&self, key: ByteString) -> Result<Option<Version>, ReadError>;

    /// Puts the versioned value under the given key following the provided precondition. If the
    /// precondition is not met, then the operation returns a [`WriteError::PreconditionViolation`].
    async fn put(
        &self,
        key: ByteString,
        value: VersionedValue,
        precondition: Precondition,
    ) -> Result<(), WriteError>;

    /// Deletes the key-value pair for the given key following the provided precondition. If the
    /// precondition is not met, then the operation returns a [`WriteError::PreconditionViolation`].
    async fn delete(&self, key: ByteString, precondition: Precondition) -> Result<(), WriteError>;
}

/// Blanket implementation turning every [`ProvisionedMetadataStore`] into a [`MetadataStore`]
/// by delegating reads/writes and emulating `provision` with a conditional put.
///
/// NOTE(review): the `self.get(..)`/`self.put(..)` calls below are intended to dispatch to the
/// `ProvisionedMetadataStore` methods, not recursively to `MetadataStore` — confirm method
/// resolution is unambiguous in the full crate context.
#[async_trait]
impl<T: ProvisionedMetadataStore + Sync> MetadataStore for T {
    async fn get(&self, key: ByteString) -> Result<Option<VersionedValue>, ReadError> {
        self.get(key).await
    }

    async fn get_version(&self, key: ByteString) -> Result<Option<Version>, ReadError> {
        self.get_version(key).await
    }

    async fn put(
        &self,
        key: ByteString,
        value: VersionedValue,
        precondition: Precondition,
    ) -> Result<(), WriteError> {
        self.put(key, value, precondition).await
    }

    async fn delete(&self, key: ByteString, precondition: Precondition) -> Result<(), WriteError> {
        self.delete(key, precondition).await
    }

    /// Provisioning is emulated by writing the initial [`NodesConfiguration`] under
    /// [`NODES_CONFIG_KEY`] iff no value exists there yet.
    async fn provision(
        &self,
        nodes_configuration: &NodesConfiguration,
    ) -> Result<bool, ProvisionError> {
        let versioned_value = serialize_value(nodes_configuration)
            .map_err(|err| ProvisionError::Codec(err.into()))?;
        match self
            .put(
                NODES_CONFIG_KEY.clone(),
                versioned_value,
                // DoesNotExist makes this a provision-once operation.
                Precondition::DoesNotExist,
            )
            .await
        {
            // The key was absent: this call newly provisioned the store.
            Ok(()) => Ok(true),
            Err(err) => match err {
                // Key already present: some earlier call provisioned the store.
                WriteError::FailedPrecondition(_) => Ok(false),
                WriteError::Network(err) => Err(ProvisionError::Network(err)),
                WriteError::Internal(err) => Err(ProvisionError::Internal(err)),
                WriteError::Codec(err) => Err(ProvisionError::Codec(err)),
                WriteError::Store(err) => Err(ProvisionError::Store(err)),
            },
        }
    }
}

/// Metadata store client which allows storing [`Versioned`] values into a [`MetadataStore`].
Expand Down Expand Up @@ -170,18 +278,10 @@ impl MetadataStoreClient {
where
T: Versioned + StorageEncode,
{
let version = value.version();
let versioned_value =
serialize_value(value).map_err(|err| WriteError::Codec(err.into()))?;

let mut buf = BytesMut::default();
StorageCodec::encode(value, &mut buf).map_err(|err| WriteError::Codec(err.into()))?;

self.inner
.put(
key,
VersionedValue::new(version, buf.freeze()),
precondition,
)
.await
self.inner.put(key, versioned_value, precondition).await
}

/// Deletes the key-value pair for the given key following the provided precondition. If the
Expand Down Expand Up @@ -285,6 +385,23 @@ impl MetadataStoreClient {
}
}
}

/// Tries to provision the metadata store with the given [`NodesConfiguration`].
///
/// Returns `Ok(true)` if this call newly provisioned the store and `Ok(false)` if the store
/// was already provisioned; see [`MetadataStore::provision`].
pub async fn provision(
    &self,
    nodes_configuration: &NodesConfiguration,
) -> Result<bool, ProvisionError> {
    self.inner.provision(nodes_configuration).await
}
}

/// Serializes a [`Versioned`] value with [`StorageCodec`] and pairs the encoded
/// bytes with the value's version into a [`VersionedValue`].
pub fn serialize_value<T: Versioned + StorageEncode>(
    value: &T,
) -> Result<VersionedValue, StorageEncodeError> {
    // Encode into a fresh buffer, then freeze it into immutable bytes.
    let mut encoded = BytesMut::default();
    StorageCodec::encode(value, &mut encoded)?;
    Ok(VersionedValue::new(value.version(), encoded.freeze()))
}

#[derive(Debug, thiserror::Error)]
Expand Down
Loading

0 comments on commit 46215ab

Please sign in to comment.