diff --git a/.config/nextest.toml b/.config/nextest.toml index fa2933367..0e29314f7 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -3,3 +3,4 @@ retries = 0 slow-timeout = { period = "10s", terminate-after = 3 } status-level = "all" final-status-level = "slow" +fail-fast = true diff --git a/Cargo.lock b/Cargo.lock index cab788fcd..b817d71f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -344,6 +344,7 @@ dependencies = [ "clap", "clippy-utilities", "etcd-client", + "futures", "indicatif", "rand", "thiserror", @@ -3723,6 +3724,7 @@ dependencies = [ "engine", "etcd-client", "event-listener", + "flume", "futures", "hyper", "itertools 0.13.0", diff --git a/Cargo.toml b/Cargo.toml index 8afd49d17..1d04beb35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,3 +27,4 @@ madsim = { git = "https://github.com/LucienY01/madsim.git", branch = "bz/tonic-0 madsim-tonic = { git = "https://github.com/LucienY01/madsim.git", branch = "bz/tonic-0-12" } madsim-tonic-build = { git = "https://github.com/LucienY01/madsim.git", branch = "bz/tonic-0-12" } madsim-tokio = { git = "https://github.com/LucienY01/madsim.git", branch = "bz/tonic-0-12" } + diff --git a/crates/benchmark/Cargo.toml b/crates/benchmark/Cargo.toml index 652e49932..c0443fade 100644 --- a/crates/benchmark/Cargo.toml +++ b/crates/benchmark/Cargo.toml @@ -17,6 +17,7 @@ anyhow = "1.0.83" clap = { version = "4", features = ["derive"] } clippy-utilities = "0.2.0" etcd-client = { version = "0.14.0", features = ["tls"] } +futures = "0.3.30" indicatif = "0.17.8" rand = "0.8.5" thiserror = "1.0.61" diff --git a/crates/benchmark/src/runner.rs b/crates/benchmark/src/runner.rs index f53063d59..fb167716f 100644 --- a/crates/benchmark/src/runner.rs +++ b/crates/benchmark/src/runner.rs @@ -9,6 +9,7 @@ use std::{ use anyhow::Result; use clippy_utilities::{NumericCast, OverflowArithmetic}; +use futures::future::join_all; use indicatif::ProgressBar; use rand::RngCore; use tokio::{ @@ -158,7 +159,6 @@ impl CommandRunner { /// 
Create clients async fn create_clients(&self) -> Result> { - let mut clients = Vec::with_capacity(self.args.clients); let client_options = ClientOptions::default().with_client_config(ClientConfig::new( Duration::from_secs(10), Duration::from_secs(5), @@ -180,11 +180,15 @@ impl CommandRunner { } }) .collect::>(); - for _ in 0..self.args.clients { - let client = - BenchClient::new(addrs.clone(), self.args.use_curp, client_options.clone()).await?; - clients.push(client); - } + let clients_futs = std::iter::repeat_with(|| { + BenchClient::new(addrs.clone(), self.args.use_curp, client_options.clone()) + }) + .take(self.args.clients); + let clients = join_all(clients_futs) + .await + .into_iter() + .collect::>()?; + Ok(clients) } diff --git a/crates/curp-external-api/src/cmd.rs b/crates/curp-external-api/src/cmd.rs index e0a634f0a..5b282b8bd 100644 --- a/crates/curp-external-api/src/cmd.rs +++ b/crates/curp-external-api/src/cmd.rs @@ -28,7 +28,7 @@ impl pri::Serializable for T where T: pri::ThreadSafe + Clone + Serialize + D #[async_trait] pub trait Command: pri::Serializable + ConflictCheck + PbCodec { /// Error type - type Error: pri::Serializable + PbCodec + std::error::Error; + type Error: pri::Serializable + PbCodec + std::error::Error + Clone; /// K (key) is used to tell confliction /// @@ -50,48 +50,17 @@ pub trait Command: pri::Serializable + ConflictCheck + PbCodec { /// Returns `true` if the command is read-only fn is_read_only(&self) -> bool; - /// Prepare the command - /// - /// # Errors - /// - /// Return `Self::Error` when `CommandExecutor::prepare` goes wrong - #[inline] - fn prepare(&self, e: &E) -> Result - where - E: CommandExecutor + Send + Sync, - { - >::prepare(e, self) - } - /// Execute the command according to the executor /// /// # Errors /// /// Return `Self::Error` when `CommandExecutor::execute` goes wrong #[inline] - async fn execute(&self, e: &E) -> Result - where - E: CommandExecutor + Send + Sync, - { - >::execute(e, self).await - } - - /// 
Execute the command after_sync callback - /// - /// # Errors - /// - /// Return `Self::Error` when `CommandExecutor::after_sync` goes wrong - #[inline] - async fn after_sync( - &self, - e: &E, - index: LogIndex, - prepare_res: Self::PR, - ) -> Result + fn execute(&self, e: &E) -> Result where E: CommandExecutor + Send + Sync, { - >::after_sync(e, self, index, prepare_res).await + >::execute(e, self) } } @@ -127,40 +96,42 @@ pub trait CommandExecutor: pri::ThreadSafe where C: Command, { - /// Prepare the command + /// Execute the command /// /// # Errors /// - /// This function may return an error if there is a problem preparing the command. - fn prepare(&self, cmd: &C) -> Result; + /// This function may return an error if there is a problem executing the + /// command. + fn execute(&self, cmd: &C) -> Result; - /// Execute the command + /// Execute the read-only command /// /// # Errors /// - /// This function may return an error if there is a problem executing the command. - async fn execute(&self, cmd: &C) -> Result; + /// This function may return an error if there is a problem executing the + /// command. + fn execute_ro(&self, cmd: &C) -> Result<(C::ER, C::ASR), C::Error>; - /// Execute the after_sync callback - /// - /// # Errors + /// Batch execute the after_sync callback /// - /// This function may return an error if there is a problem executing the after_sync callback. - async fn after_sync( + /// This `highest_index` means the last log index of the `cmds` + fn after_sync( &self, - cmd: &C, - index: LogIndex, - prepare_res: C::PR, - ) -> Result; + cmds: Vec>, + // might be `None` if it's a speculative execution + highest_index: Option, + ) -> Vec, C::Error>>; - /// Set the index of the last log entry that has been successfully applied to the command executor + /// Set the index of the last log entry that has been successfully applied + /// to the command executor /// /// # Errors /// /// Returns an error if setting the last applied log entry fails. 
fn set_last_applied(&self, index: LogIndex) -> Result<(), C::Error>; - /// Get the index of the last log entry that has been successfully applied to the command executor + /// Get the index of the last log entry that has been successfully applied + /// to the command executor /// /// # Errors /// @@ -171,17 +142,21 @@ where /// /// # Errors /// - /// This function may return an error if there is a problem taking a snapshot. + /// This function may return an error if there is a problem taking a + /// snapshot. async fn snapshot(&self) -> Result; - /// Reset the command executor using the snapshot or to the initial state if None + /// Reset the command executor using the snapshot or to the initial state if + /// None /// /// # Errors /// - /// This function may return an error if there is a problem resetting the command executor. + /// This function may return an error if there is a problem resetting the + /// command executor. async fn reset(&self, snapshot: Option<(Snapshot, LogIndex)>) -> Result<(), C::Error>; - /// Trigger the barrier of the given trigger id (based on propose id) and log index. + /// Trigger the barrier of the given trigger id (based on propose id) and + /// log index. 
fn trigger(&self, id: InflightId); } @@ -215,3 +190,59 @@ impl From for PbSerializeError { PbSerializeError::RpcDecode(err) } } + +#[allow(clippy::module_name_repetitions)] +/// After sync command type +#[derive(Debug)] +pub struct AfterSyncCmd<'a, C> { + /// The command + cmd: &'a C, + /// Whether the command needs to be executed in after sync stage + to_execute: bool, +} + +impl<'a, C> AfterSyncCmd<'a, C> { + /// Creates a new `AfterSyncCmd` + #[inline] + pub fn new(cmd: &'a C, to_execute: bool) -> Self { + Self { cmd, to_execute } + } + + /// Gets the command + #[inline] + #[must_use] + pub fn cmd(&self) -> &'a C { + self.cmd + } + + /// Convert self into parts + #[inline] + #[must_use] + pub fn into_parts(&'a self) -> (&'a C, bool) { + (self.cmd, self.to_execute) + } +} + +/// Ok type of the after sync result +#[derive(Debug)] +pub struct AfterSyncOk { + /// After Sync Result + asr: C::ASR, + /// Optional Execution Result + er_opt: Option, +} + +impl AfterSyncOk { + /// Creates a new [`AfterSyncOk`]. + #[inline] + pub fn new(asr: C::ASR, er_opt: Option) -> Self { + Self { asr, er_opt } + } + + /// Decomposes `AfterSyncOk` into its constituent parts. 
+ #[inline] + pub fn into_parts(self) -> (C::ASR, Option) { + let Self { asr, er_opt } = self; + (asr, er_opt) + } +} diff --git a/crates/curp-test-utils/src/test_cmd.rs b/crates/curp-test-utils/src/test_cmd.rs index fec6aef60..c3fa23895 100644 --- a/crates/curp-test-utils/src/test_cmd.rs +++ b/crates/curp-test-utils/src/test_cmd.rs @@ -9,7 +9,7 @@ use std::{ use async_trait::async_trait; use curp_external_api::{ - cmd::{Command, CommandExecutor, ConflictCheck, PbCodec}, + cmd::{AfterSyncCmd, AfterSyncOk, Command, CommandExecutor, ConflictCheck, PbCodec}, InflightId, LogIndex, }; use engine::{ @@ -18,7 +18,7 @@ use engine::{ use itertools::Itertools; use serde::{Deserialize, Serialize}; use thiserror::Error; -use tokio::{sync::mpsc, time::sleep}; +use tokio::sync::mpsc; use tracing::debug; use utils::config::EngineConfig; @@ -239,32 +239,11 @@ pub struct TestCE { #[async_trait] impl CommandExecutor for TestCE { - fn prepare( - &self, - cmd: &TestCommand, - ) -> Result<::PR, ::Error> { - let rev = if let TestCommandType::Put(_) = cmd.cmd_type { - let rev = self.revision.fetch_add(1, Ordering::Relaxed); - let wr_ops = vec![WriteOperation::new_put( - META_TABLE, - LAST_REVISION_KEY.into(), - rev.to_le_bytes().to_vec(), - )]; - self.store - .write_multi(wr_ops, true) - .map_err(|e| ExecuteError(e.to_string()))?; - rev - } else { - -1 - }; - Ok(rev) - } - - async fn execute( + fn execute( &self, cmd: &TestCommand, ) -> Result<::ER, ::Error> { - sleep(cmd.exe_dur).await; + std::thread::sleep(cmd.exe_dur); if cmd.exe_should_fail { return Err(ExecuteError("fail".to_owned())); } @@ -305,53 +284,101 @@ impl CommandExecutor for TestCE { Ok(result) } - async fn after_sync( + fn execute_ro( &self, cmd: &TestCommand, - index: LogIndex, - revision: ::PR, - ) -> Result<::ASR, ::Error> { - sleep(cmd.as_dur).await; - if cmd.as_should_fail { - return Err(ExecuteError("fail".to_owned())); + ) -> Result< + (::ER, ::ASR), + ::Error, + > { + self.execute(cmd).map(|er| (er, 
LogIndexResult(0))) + } + + fn after_sync( + &self, + cmds: Vec>, + highest_index: Option, + ) -> Vec, ::Error>> { + let as_duration = cmds + .iter() + .fold(Duration::default(), |acc, c| acc + c.cmd().as_dur); + std::thread::sleep(as_duration); + let total = cmds.len(); + let mut wr_ops = Vec::new(); + + if let Some(index) = highest_index { + for (i, cmd) in cmds.iter().enumerate() { + // Calculate the log index of the current cmd + let index = index - (total - i - 1) as u64; + self.after_sync_sender + .send((cmd.cmd().clone(), index)) + .expect("failed to send after sync msg"); + } + wr_ops.push(WriteOperation::new_put( + META_TABLE, + APPLIED_INDEX_KEY.into(), + index.to_le_bytes().to_vec(), + )); } - self.after_sync_sender - .send((cmd.clone(), index)) - .expect("failed to send after sync msg"); - let mut wr_ops = vec![WriteOperation::new_put( - META_TABLE, - APPLIED_INDEX_KEY.into(), - index.to_le_bytes().to_vec(), - )]; - if let TestCommandType::Put(v) = cmd.cmd_type { - debug!("cmd {:?}-{:?} revision is {}", cmd.cmd_type, cmd, revision); - let value = v.to_le_bytes().to_vec(); - let keys = cmd - .keys - .iter() - .map(|k| k.to_le_bytes().to_vec()) - .collect_vec(); - wr_ops.extend( - keys.clone() - .into_iter() - .map(|key| WriteOperation::new_put(TEST_TABLE, key, value.clone())) - .chain(keys.into_iter().map(|key| { - WriteOperation::new_put( - REVISION_TABLE, - key, - revision.to_le_bytes().to_vec(), - ) - })), + let mut asrs = Vec::new(); + for (i, (cmd, to_execute)) in cmds.iter().map(AfterSyncCmd::into_parts).enumerate() { + let index = highest_index + .map(|index| index - (total - i - 1) as u64) + .unwrap_or(0); + if cmd.as_should_fail { + asrs.push(Err(ExecuteError("fail".to_owned()))); + continue; + } + if let TestCommandType::Put(v) = cmd.cmd_type { + let revision = match self.next_revision(cmd) { + Ok(rev) => rev, + Err(e) => { + asrs.push(Err(e)); + continue; + } + }; + + debug!("cmd {:?}-{:?} revision is {}", cmd.cmd_type, cmd, revision); + let 
value = v.to_le_bytes().to_vec(); + let keys = cmd + .keys + .iter() + .map(|k| k.to_le_bytes().to_vec()) + .collect_vec(); + wr_ops.extend( + keys.clone() + .into_iter() + .map(|key| WriteOperation::new_put(TEST_TABLE, key, value.clone())) + .chain(keys.into_iter().map(|key| { + WriteOperation::new_put( + REVISION_TABLE, + key, + revision.to_le_bytes().to_vec(), + ) + })), + ); + } + match to_execute.then(|| self.execute(cmd)).transpose() { + Ok(er) => { + asrs.push(Ok(AfterSyncOk::new(LogIndexResult(index), er))); + } + Err(e) => asrs.push(Err(e)), + } + debug!( + "{} after sync cmd({:?} - {:?}), index: {index}", + self.server_name, cmd.cmd_type, cmd ); - self.store - .write_multi(wr_ops, true) - .map_err(|e| ExecuteError(e.to_string()))?; } - debug!( - "{} after sync cmd({:?} - {:?}), index: {index}", - self.server_name, cmd.cmd_type, cmd - ); - Ok(index.into()) + + if let Err(e) = self + .store + .write_multi(wr_ops, true) + .map_err(|e| ExecuteError(e.to_string())) + { + return std::iter::repeat(e).map(Err).take(cmds.len()).collect(); + } + + asrs } fn set_last_applied(&self, index: LogIndex) -> Result<(), ::Error> { @@ -444,4 +471,22 @@ impl TestCE { after_sync_sender, } } + + fn next_revision(&self, cmd: &TestCommand) -> Result::Error> { + let rev = if let TestCommandType::Put(_) = cmd.cmd_type { + let rev = self.revision.fetch_add(1, Ordering::Relaxed); + let wr_ops = vec![WriteOperation::new_put( + META_TABLE, + LAST_REVISION_KEY.into(), + rev.to_le_bytes().to_vec(), + )]; + self.store + .write_multi(wr_ops, true) + .map_err(|e| ExecuteError(e.to_string()))?; + rev + } else { + -1 + }; + Ok(rev) + } } diff --git a/crates/curp/proto/common b/crates/curp/proto/common index 2d81c8f0b..19cfc8d48 160000 --- a/crates/curp/proto/common +++ b/crates/curp/proto/common @@ -1 +1 @@ -Subproject commit 2d81c8f0b167ad962eeb8c4c55e5ee2a14eb98e6 +Subproject commit 19cfc8d48da30c190e240a477802b2b7f2a14633 diff --git a/crates/curp/src/client/mod.rs 
b/crates/curp/src/client/mod.rs index 740509c56..475c5c500 100644 --- a/crates/curp/src/client/mod.rs +++ b/crates/curp/src/client/mod.rs @@ -21,11 +21,14 @@ mod state; #[cfg(test)] mod tests; -use std::{collections::HashMap, fmt::Debug, sync::Arc}; +#[cfg(madsim)] +use std::sync::atomic::AtomicU64; +use std::{collections::HashMap, fmt::Debug, ops::Deref, sync::Arc, time::Duration}; use async_trait::async_trait; use curp_external_api::cmd::Command; use futures::{stream::FuturesUnordered, StreamExt}; +use parking_lot::RwLock; use tokio::task::JoinHandle; #[cfg(not(madsim))] use tonic::transport::ClientTlsConfig; @@ -45,6 +48,7 @@ use crate::{ protocol_client::ProtocolClient, ConfChange, FetchClusterRequest, FetchClusterResponse, Member, ProposeId, Protocol, ReadState, }, + tracker::Tracker, }; /// The response of propose command, deserialized from [`crate::rpc::ProposeResponse`] or @@ -123,11 +127,43 @@ pub trait ClientApi { } } +/// Propose id guard, used to ensure the sequence of propose id is recorded. +struct ProposeIdGuard<'a> { + /// The propose id + propose_id: ProposeId, + /// The tracker + tracker: &'a RwLock, +} + +impl Deref for ProposeIdGuard<'_> { + type Target = ProposeId; + + fn deref(&self) -> &Self::Target { + &self.propose_id + } +} + +impl<'a> ProposeIdGuard<'a> { + /// Create a new propose id guard + fn new(tracker: &'a RwLock, propose_id: ProposeId) -> Self { + Self { + propose_id, + tracker, + } + } +} + +impl Drop for ProposeIdGuard<'_> { + fn drop(&mut self) { + let _ig = self.tracker.write().record(self.propose_id.1); + } +} + /// This trait override some unrepeatable methods in ClientApi, and a client with this trait will be able to retry. #[async_trait] trait RepeatableClientApi: ClientApi { /// Generate a unique propose id during the retry process. 
- fn gen_propose_id(&self) -> Result; + fn gen_propose_id(&self) -> Result, Self::Error>; /// Send propose to the whole cluster, `use_fast_path` set to `false` to fallback into ordered /// requests (event the requests are commutative). @@ -356,6 +392,29 @@ impl ClientBuilder { }) } + /// Wait for client id + async fn wait_for_client_id(&self, state: Arc) -> Result<(), tonic::Status> { + /// Max retry count for waiting for a client ID + /// + /// TODO: This retry count is set relatively high to avoid test cluster startup timeouts. + /// We should consider setting this to a more reasonable value. + const RETRY_COUNT: usize = 30; + /// The interval for each retry + const RETRY_INTERVAL: Duration = Duration::from_secs(1); + + for _ in 0..RETRY_COUNT { + if state.client_id() != 0 { + return Ok(()); + } + debug!("waiting for client_id"); + tokio::time::sleep(RETRY_INTERVAL).await; + } + + Err(tonic::Status::deadline_exceeded( + "timeout waiting for client id", + )) + } + /// Build the client /// /// # Errors @@ -364,17 +423,55 @@ impl ClientBuilder { #[inline] pub async fn build( &self, + ) -> Result + Send + Sync + 'static, tonic::Status> + { + let state = Arc::new( + self.init_state_builder() + .build() + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?, + ); + let client = Retry::new( + Unary::new(Arc::clone(&state), self.init_unary_config()), + self.init_retry_config(), + Some(self.spawn_bg_tasks(Arc::clone(&state))), + ); + self.wait_for_client_id(state).await?; + Ok(client) + } + + #[cfg(madsim)] + /// Build the client, also returns the current client id + /// + /// # Errors + /// + /// Return `tonic::transport::Error` for connection failure. 
+ #[inline] + pub async fn build_with_client_id( + &self, ) -> Result< - impl ClientApi + Send + Sync + 'static, - tonic::transport::Error, + ( + impl ClientApi + Send + Sync + 'static, + Arc, + ), + tonic::Status, > { - let state = Arc::new(self.init_state_builder().build().await?); + let state = Arc::new( + self.init_state_builder() + .build() + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?, + ); + let client = Retry::new( Unary::new(Arc::clone(&state), self.init_unary_config()), self.init_retry_config(), - Some(self.spawn_bg_tasks(state)), + Some(self.spawn_bg_tasks(Arc::clone(&state))), ); - Ok(client) + let client_id = state.clone_client_id(); + self.wait_for_client_id(state).await?; + + Ok((client, client_id)) } } @@ -387,18 +484,20 @@ impl ClientBuilderWithBypass

{ #[inline] pub async fn build( self, - ) -> Result, tonic::transport::Error> { + ) -> Result, tonic::Status> { let state = self .inner .init_state_builder() .build_bypassed::

(self.local_server_id, self.local_server) - .await?; + .await + .map_err(|e| tonic::Status::internal(e.to_string()))?; let state = Arc::new(state); let client = Retry::new( Unary::new(Arc::clone(&state), self.inner.init_unary_config()), self.inner.init_retry_config(), - Some(self.inner.spawn_bg_tasks(state)), + Some(self.inner.spawn_bg_tasks(Arc::clone(&state))), ); + self.inner.wait_for_client_id(state).await?; Ok(client) } } diff --git a/crates/curp/src/client/retry.rs b/crates/curp/src/client/retry.rs index 80e5d515a..06e670a89 100644 --- a/crates/curp/src/client/retry.rs +++ b/crates/curp/src/client/retry.rs @@ -3,7 +3,7 @@ use std::{ops::SubAssign, time::Duration}; use async_trait::async_trait; use futures::Future; use tokio::task::JoinHandle; -use tracing::warn; +use tracing::{info, warn}; use super::{ClientApi, LeaderStateUpdate, ProposeResponse, RepeatableClientApi}; use crate::{ @@ -110,6 +110,7 @@ pub(super) struct Retry { impl Drop for Retry { fn drop(&mut self) { if let Some(handle) = self.bg_handle.as_ref() { + info!("stopping background task"); handle.abort(); } } @@ -183,6 +184,13 @@ where .update_leader(leader_id.as_ref().map(Into::into), term) .await; } + + // update the cluster state if got Zombie + CurpError::Zombie(()) => { + if let Err(e) = self.inner.fetch_cluster(true).await { + warn!("fetch cluster failed, error {e:?}"); + } + } } #[cfg(feature = "client-metrics")] @@ -222,9 +230,9 @@ where token: Option<&String>, use_fast_path: bool, ) -> Result, tonic::Status> { - let propose_id = self.inner.gen_propose_id()?; - self.retry::<_, _>(|client| { - RepeatableClientApi::propose(client, propose_id, cmd, token, use_fast_path) + self.retry::<_, _>(|client| async move { + let propose_id = self.inner.gen_propose_id()?; + RepeatableClientApi::propose(client, *propose_id, cmd, token, use_fast_path).await }) .await } @@ -234,19 +242,23 @@ where &self, changes: Vec, ) -> Result, tonic::Status> { - let propose_id = self.inner.gen_propose_id()?; 
self.retry::<_, _>(|client| { let changes_c = changes.clone(); - RepeatableClientApi::propose_conf_change(client, propose_id, changes_c) + async move { + let propose_id = self.inner.gen_propose_id()?; + RepeatableClientApi::propose_conf_change(client, *propose_id, changes_c).await + } }) .await } /// Send propose to shutdown cluster async fn propose_shutdown(&self) -> Result<(), tonic::Status> { - let propose_id = self.inner.gen_propose_id()?; - self.retry::<_, _>(|client| RepeatableClientApi::propose_shutdown(client, propose_id)) - .await + self.retry::<_, _>(|client| async move { + let propose_id = self.inner.gen_propose_id()?; + RepeatableClientApi::propose_shutdown(client, *propose_id).await + }) + .await } /// Send propose to publish a node id and name @@ -256,17 +268,20 @@ where node_name: String, node_client_urls: Vec, ) -> Result<(), Self::Error> { - let propose_id = self.inner.gen_propose_id()?; self.retry::<_, _>(|client| { let name_c = node_name.clone(); let node_client_urls_c = node_client_urls.clone(); - RepeatableClientApi::propose_publish( - client, - propose_id, - node_id, - name_c, - node_client_urls_c, - ) + async move { + let propose_id = self.inner.gen_propose_id()?; + RepeatableClientApi::propose_publish( + client, + *propose_id, + node_id, + name_c, + node_client_urls_c, + ) + .await + } }) .await } diff --git a/crates/curp/src/client/state.rs b/crates/curp/src/client/state.rs index 074550145..8476e46b8 100644 --- a/crates/curp/src/client/state.rs +++ b/crates/curp/src/client/state.rs @@ -2,10 +2,12 @@ use std::{ cmp::Ordering, collections::{hash_map::Entry, HashMap, HashSet}, sync::{atomic::AtomicU64, Arc}, + time::Duration, }; use event_listener::Event; use futures::{stream::FuturesUnordered, Future}; +use rand::seq::IteratorRandom; use tokio::sync::RwLock; #[cfg(not(madsim))] use tonic::transport::ClientTlsConfig; @@ -18,7 +20,7 @@ use crate::{ rpc::{ self, connect::{BypassedConnect, ConnectApi}, - CurpError, FetchClusterResponse, Protocol, 
+ CurpError, FetchClusterRequest, FetchClusterResponse, Protocol, }, }; @@ -127,6 +129,28 @@ impl State { } } + /// Choose a random server to try to refresh the state + /// Use when the current leader is missing. + pub(crate) async fn try_refresh_state(&self) -> Result<(), CurpError> { + /// The timeout for refreshing the state + const REFRESH_TIMEOUT: Duration = Duration::from_secs(1); + + let rand_conn = { + let state = self.mutable.read().await; + state + .connects + .values() + .choose(&mut rand::thread_rng()) + .map(Arc::clone) + .ok_or_else(CurpError::wrong_cluster_version)? + }; + let resp = rand_conn + .fetch_cluster(FetchClusterRequest::default(), REFRESH_TIMEOUT) + .await?; + self.check_and_update(&resp.into_inner()).await?; + Ok(()) + } + /// Get the local server connection pub(super) async fn local_connect(&self) -> Option> { let id = self.immutable.local_server?; @@ -148,6 +172,11 @@ impl State { self.mutable.read().await.leader } + /// Get term of the cluster + pub(super) async fn term(&self) -> u64 { + self.mutable.read().await.term + } + /// Take an async function and map to the dedicated server, return `Err(CurpError:WrongClusterVersion(()))` /// if the server can not found in local state pub(super) async fn map_server>>( @@ -170,6 +199,11 @@ impl State { f(conn).await } + /// Returns the number of members in the cluster + pub(super) async fn connects_len(&self) -> usize { + self.mutable.read().await.connects.len() + } + /// Take an async function and map to all server, returning `FuturesUnordered` pub(super) async fn for_each_server>( &self, @@ -185,6 +219,22 @@ impl State { .collect() } + /// Take an async function and map to all server, returning `FuturesUnordered` + pub(super) async fn for_each_follower>( + &self, + leader_id: u64, + f: impl FnMut(Arc) -> F, + ) -> FuturesUnordered { + let mutable_r = self.mutable.read().await; + mutable_r + .connects + .iter() + .filter_map(|(id, conn)| (*id != leader_id).then_some(conn)) + .map(Arc::clone) + 
.map(f) + .collect() + } + /// Inner check and update leader fn check_and_update_leader_inner( &self, diff --git a/crates/curp/src/client/stream.rs b/crates/curp/src/client/stream.rs index a15c7b3c6..9ebeb1599 100644 --- a/crates/curp/src/client/stream.rs +++ b/crates/curp/src/client/stream.rs @@ -1,7 +1,7 @@ use std::{sync::Arc, time::Duration}; use futures::Future; -use tracing::{debug, warn}; +use tracing::{debug, info, warn}; use super::state::State; use crate::rpc::{connect::ConnectApi, CurpError, Redirect}; @@ -29,6 +29,9 @@ pub(super) struct Streaming { config: StreamingConfig, } +/// Prevent lock contention when leader crashed or some unknown errors +const RETRY_DELAY: Duration = Duration::from_millis(100); + impl Streaming { /// Create a stream client pub(super) fn new(state: Arc, config: StreamingConfig) -> Self { @@ -43,8 +46,9 @@ impl Streaming { ) -> Result { loop { let Some(leader_id) = self.state.leader_id().await else { - debug!("cannot find the leader id in state, wait for leadership update"); - self.state.leader_notifier().listen().await; + warn!("cannot find leader_id, refreshing state..."); + let _ig = self.state.try_refresh_state().await; + tokio::time::sleep(RETRY_DELAY).await; continue; }; if let Some(local_id) = self.state.local_server_id() { @@ -61,8 +65,6 @@ impl Streaming { /// Keep heartbeat pub(super) async fn keep_heartbeat(&self) { - /// Prevent lock contention when leader crashed or some unknown errors - const RETRY_DELAY: Duration = Duration::from_millis(100); #[allow(clippy::ignored_unit_patterns)] // tokio select internal triggered loop { let heartbeat = self.map_remote_leader::<(), _>(|conn| async move { @@ -87,9 +89,16 @@ impl Streaming { ); self.state.leader_notifier().listen().await; } + CurpError::RpcTransport(()) => { + warn!( + "got rpc transport error when keep heartbeat, refreshing state..." 
+ ); + let _ig = self.state.try_refresh_state().await; + tokio::time::sleep(RETRY_DELAY).await; + } CurpError::ShuttingDown(()) => { - debug!("shutting down stream client background task"); - break Err(err); + info!("cluster is shutting down, exiting heartbeat task"); + return Ok(()); } _ => { warn!("got unexpected error {err:?} when keep heartbeat, retrying..."); diff --git a/crates/curp/src/client/tests.rs b/crates/curp/src/client/tests.rs index c61125d11..f8c0649f3 100644 --- a/crates/curp/src/client/tests.rs +++ b/crates/curp/src/client/tests.rs @@ -1,35 +1,36 @@ use std::{ collections::HashMap, - ops::AddAssign, - sync::{ - atomic::{AtomicBool, AtomicU64}, - Arc, Mutex, - }, - time::Duration, + sync::{atomic::AtomicU64, Arc, Mutex}, + time::{Duration, Instant}, }; -use curp_external_api::LogIndex; use curp_test_utils::test_cmd::{LogIndexResult, TestCommand, TestCommandResult}; -use futures::future::BoxFuture; -use tokio::time::Instant; +use futures::{future::BoxFuture, Stream}; #[cfg(not(madsim))] use tonic::transport::ClientTlsConfig; +use tonic::Status; use tracing_test::traced_test; #[cfg(madsim)] use utils::ClientTlsConfig; use super::{ - retry::{Retry, RetryConfig}, state::State, stream::{Streaming, StreamingConfig}, unary::{Unary, UnaryConfig}, }; use crate::{ - client::ClientApi, + client::{ + retry::{Retry, RetryConfig}, + ClientApi, + }, members::ServerId, rpc::{ connect::{ConnectApi, MockConnectApi}, - *, + CurpError, FetchClusterRequest, FetchClusterResponse, FetchReadStateRequest, + FetchReadStateResponse, Member, MoveLeaderRequest, MoveLeaderResponse, OpResponse, + ProposeConfChangeRequest, ProposeConfChangeResponse, ProposeRequest, ProposeResponse, + PublishRequest, PublishResponse, ReadIndexResponse, RecordRequest, RecordResponse, + ResponseOp, ShutdownRequest, ShutdownResponse, SyncedResponse, }, }; @@ -259,231 +260,56 @@ async fn test_unary_fetch_clusters_linearizable_failed() { }); let unary = init_unary_client(connects, None, None, 0, 0, 
None); let res = unary.fetch_cluster(true).await.unwrap_err(); - // only server(0, 1)'s responses are valid, less than majority quorum(3), got a mocked RpcTransport to retry + // only server(0, 1)'s responses are valid, less than majority quorum(3), got a + // mocked RpcTransport to retry assert_eq!(res, CurpError::RpcTransport(())); } -#[traced_test] -#[tokio::test] -async fn test_unary_fast_round_works() { - let connects = init_mocked_connects(5, |id, conn| { - conn.expect_propose() - .return_once(move |_req, _token, _timeout| { - let resp = match id { - 0 => ProposeResponse::new_result::(&Ok( - TestCommandResult::default(), - )), - 1 | 2 | 3 => ProposeResponse::new_empty(), - 4 => return Err(CurpError::key_conflict()), - _ => unreachable!("there are only 5 nodes"), - }; - Ok(tonic::Response::new(resp)) - }); - }); - let unary = init_unary_client(connects, None, None, 0, 0, None); - let res = unary - .fast_round(ProposeId(0, 0), &TestCommand::default(), None) - .await - .unwrap() - .unwrap(); - assert_eq!(res, TestCommandResult::default()); +fn build_propose_response(conflict: bool) -> OpResponse { + let resp = ResponseOp::Propose(ProposeResponse::new_result::( + &Ok(TestCommandResult::default()), + conflict, + )); + OpResponse { op: Some(resp) } } -#[traced_test] -#[tokio::test] -async fn test_unary_fast_round_return_early_err() { - for early_err in [ - CurpError::duplicated(), - CurpError::shutting_down(), - CurpError::invalid_config(), - CurpError::node_already_exists(), - CurpError::node_not_exist(), - CurpError::learner_not_catch_up(), - CurpError::expired_client_id(), - CurpError::redirect(Some(1), 0), - ] { - assert!(early_err.should_abort_fast_round()); - // record how many times `handle_propose` was invoked. 
- let counter = Arc::new(Mutex::new(0)); - let connects = init_mocked_connects(3, |_id, conn| { - let counter_c = Arc::clone(&counter); - let err = early_err.clone(); - conn.expect_propose() - .return_once(move |_req, _token, _timeout| { - counter_c.lock().unwrap().add_assign(1); - Err(err) - }); - }); - let unary = init_unary_client(connects, None, None, 0, 0, None); - let err = unary - .fast_round(ProposeId(0, 0), &TestCommand::default(), None) - .await - .unwrap_err(); - assert_eq!(err, early_err); - assert_eq!(*counter.lock().unwrap(), 1); - } +fn build_synced_response() -> OpResponse { + let resp = ResponseOp::Synced(SyncedResponse::new_result::(&Ok(1.into()))); + OpResponse { op: Some(resp) } } -#[traced_test] -#[tokio::test] -async fn test_unary_fast_round_less_quorum() { - let connects = init_mocked_connects(5, |id, conn| { - conn.expect_propose() - .return_once(move |_req, _token, _timeout| { - let resp = match id { - 0 => ProposeResponse::new_result::(&Ok( - TestCommandResult::default(), - )), - 1 | 2 => ProposeResponse::new_empty(), - 3 | 4 => return Err(CurpError::key_conflict()), - _ => unreachable!("there are only 5 nodes"), - }; - Ok(tonic::Response::new(resp)) - }); - }); - let unary = init_unary_client(connects, None, None, 0, 0, None); - let err = unary - .fast_round(ProposeId(0, 0), &TestCommand::default(), None) - .await - .unwrap_err(); - assert_eq!(err, CurpError::KeyConflict(())); -} - -/// FIXME: two leader -/// TODO: fix in subsequence PR -#[traced_test] -#[tokio::test] -#[should_panic(expected = "should not set exe result twice")] -async fn test_unary_fast_round_with_two_leader() { - let connects = init_mocked_connects(5, |id, conn| { - conn.expect_propose() - .return_once(move |_req, _token, _timeout| { - let resp = - match id { - // The execution result has been returned, indicating that server(0) has also recorded the command. 
- 0 => ProposeResponse::new_result::(&Ok( - TestCommandResult::new(vec![1], vec![1]), - )), - // imagine that server(1) is the new leader - 1 => ProposeResponse::new_result::(&Ok( - TestCommandResult::new(vec![2], vec![2]), - )), - 2 | 3 => ProposeResponse::new_empty(), - 4 => return Err(CurpError::key_conflict()), - _ => unreachable!("there are only 5 nodes"), - }; - Ok(tonic::Response::new(resp)) - }); - }); - // old local leader(0), term 1 - let unary = init_unary_client(connects, None, Some(0), 1, 0, None); - let res = unary - .fast_round(ProposeId(0, 0), &TestCommand::default(), None) - .await - .unwrap() - .unwrap(); - // quorum: server(0, 1, 2, 3) - assert_eq!(res, TestCommandResult::new(vec![2], vec![2])); -} - -// We may encounter this scenario during leader election -#[traced_test] -#[tokio::test] -async fn test_unary_fast_round_without_leader() { - let connects = init_mocked_connects(5, |id, conn| { - conn.expect_propose() - .return_once(move |_req, _token, _timeout| { - let resp = match id { - 0 | 1 | 2 | 3 | 4 => ProposeResponse::new_empty(), - _ => unreachable!("there are only 5 nodes"), - }; - Ok(tonic::Response::new(resp)) - }); - }); - // old local leader(0), term 1 - let unary = init_unary_client(connects, None, Some(0), 1, 0, None); - let res = unary - .fast_round(ProposeId(0, 0), &TestCommand::default(), None) - .await - .unwrap_err(); - // quorum: server(0, 1, 2, 3) - assert_eq!(res, CurpError::WrongClusterVersion(())); -} - -#[traced_test] -#[tokio::test] -async fn test_unary_slow_round_fetch_leader_first() { - let flag = Arc::new(AtomicBool::new(false)); - let connects = init_mocked_connects(3, |id, conn| { - let flag_c = Arc::clone(&flag); - conn.expect_fetch_cluster() - .return_once(move |_req, _timeout| { - flag_c.store(true, std::sync::atomic::Ordering::Relaxed); - Ok(tonic::Response::new(FetchClusterResponse { - leader_id: Some(0.into()), - term: 1, - cluster_id: 123, - members: vec![ - Member::new(0, "S0", vec!["A0".to_owned()], [], 
false), - Member::new(1, "S1", vec!["A1".to_owned()], [], false), - Member::new(2, "S2", vec!["A2".to_owned()], [], false), - ], - cluster_version: 1, - })) - }); - let flag_c = Arc::clone(&flag); - conn.expect_wait_synced() - .return_once(move |_req, _timeout| { - assert!(id == 0, "wait synced should send to leader"); - assert!( - flag_c.load(std::sync::atomic::Ordering::Relaxed), - "fetch_leader should invoke first" - ); - Ok(tonic::Response::new(WaitSyncedResponse::new_from_result::< - TestCommand, - >( - Ok(TestCommandResult::default()), - Some(Ok(1.into())), - ))) - }); - }); - let unary = init_unary_client(connects, None, None, 0, 0, None); - let res = unary.slow_round(ProposeId(0, 0)).await.unwrap().unwrap(); - assert_eq!(LogIndex::from(res.0), 1); - assert_eq!(res.1, TestCommandResult::default()); +// TODO: rewrite this tests +#[cfg(ignore)] +fn build_empty_response() -> OpResponse { + OpResponse { op: None } } #[traced_test] #[tokio::test] async fn test_unary_propose_fast_path_works() { let connects = init_mocked_connects(5, |id, conn| { - conn.expect_propose() + conn.expect_propose_stream() .return_once(move |_req, _token, _timeout| { - let resp = match id { - 0 => ProposeResponse::new_result::(&Ok( - TestCommandResult::default(), - )), - 1 | 2 | 3 => ProposeResponse::new_empty(), - 4 => return Err(CurpError::key_conflict()), - _ => unreachable!("there are only 5 nodes"), + assert_eq!(id, 0, "followers should not receive propose"); + let resp = async_stream::stream! 
{ + yield Ok(build_propose_response(false)); + yield Ok(build_synced_response()); }; - Ok(tonic::Response::new(resp)) - }); - conn.expect_wait_synced() - .return_once(move |_req, _timeout| { - assert!(id == 0, "wait synced should send to leader"); - std::thread::sleep(Duration::from_millis(100)); - Ok(tonic::Response::new(WaitSyncedResponse::new_from_result::< - TestCommand, - >( - Ok(TestCommandResult::default()), - Some(Ok(1.into())), - ))) + Ok(tonic::Response::new(Box::new(resp))) }); + conn.expect_record().return_once(move |_req, _timeout| { + let resp = match id { + 0 => unreachable!("leader should not receive record request"), + 1 | 2 | 3 => RecordResponse { conflict: false }, + 4 => RecordResponse { conflict: true }, + _ => unreachable!("there are only 5 nodes"), + }; + Ok(tonic::Response::new(resp)) + }); }); let unary = init_unary_client(connects, None, Some(0), 1, 0, None); let res = unary - .propose(&TestCommand::default(), None, true) + .propose(&TestCommand::new_put(vec![1], 1), None, true) .await .unwrap() .unwrap(); @@ -494,34 +320,31 @@ async fn test_unary_propose_fast_path_works() { #[tokio::test] async fn test_unary_propose_slow_path_works() { let connects = init_mocked_connects(5, |id, conn| { - conn.expect_propose() + conn.expect_propose_stream() .return_once(move |_req, _token, _timeout| { - let resp = match id { - 0 => ProposeResponse::new_result::(&Ok( - TestCommandResult::default(), - )), - 1 | 2 | 3 => ProposeResponse::new_empty(), - 4 => return Err(CurpError::key_conflict()), - _ => unreachable!("there are only 5 nodes"), + assert_eq!(id, 0, "followers should not receive propose"); + let resp = async_stream::stream! 
{ + yield Ok(build_propose_response(false)); + tokio::time::sleep(Duration::from_millis(100)).await; + yield Ok(build_synced_response()); }; - Ok(tonic::Response::new(resp)) - }); - conn.expect_wait_synced() - .return_once(move |_req, _timeout| { - assert!(id == 0, "wait synced should send to leader"); - std::thread::sleep(Duration::from_millis(100)); - Ok(tonic::Response::new(WaitSyncedResponse::new_from_result::< - TestCommand, - >( - Ok(TestCommandResult::default()), - Some(Ok(1.into())), - ))) + Ok(tonic::Response::new(Box::new(resp))) }); + conn.expect_record().return_once(move |_req, _timeout| { + let resp = match id { + 0 => unreachable!("leader should not receive record request"), + 1 | 2 | 3 => RecordResponse { conflict: false }, + 4 => RecordResponse { conflict: true }, + _ => unreachable!("there are only 5 nodes"), + }; + Ok(tonic::Response::new(resp)) + }); }); + let unary = init_unary_client(connects, None, Some(0), 1, 0, None); let start_at = Instant::now(); let res = unary - .propose(&TestCommand::default(), None, false) + .propose(&TestCommand::new_put(vec![1], 1), None, false) .await .unwrap() .unwrap(); @@ -538,36 +361,33 @@ async fn test_unary_propose_slow_path_works() { #[traced_test] #[tokio::test] async fn test_unary_propose_fast_path_fallback_slow_path() { + // record how many times `handle_propose` was invoked. let connects = init_mocked_connects(5, |id, conn| { - conn.expect_propose() + conn.expect_propose_stream() .return_once(move |_req, _token, _timeout| { - // insufficient quorum to force slow path. - let resp = match id { - 0 => ProposeResponse::new_result::(&Ok( - TestCommandResult::default(), - )), - 1 | 2 => ProposeResponse::new_empty(), - 3 | 4 => return Err(CurpError::key_conflict()), - _ => unreachable!("there are only 5 nodes"), + assert_eq!(id, 0, "followers should not receive propose"); + let resp = async_stream::stream! 
{ + yield Ok(build_propose_response(false)); + tokio::time::sleep(Duration::from_millis(100)).await; + yield Ok(build_synced_response()); }; - Ok(tonic::Response::new(resp)) - }); - conn.expect_wait_synced() - .return_once(move |_req, _timeout| { - assert!(id == 0, "wait synced should send to leader"); - std::thread::sleep(Duration::from_millis(100)); - Ok(tonic::Response::new(WaitSyncedResponse::new_from_result::< - TestCommand, - >( - Ok(TestCommandResult::default()), - Some(Ok(1.into())), - ))) + Ok(tonic::Response::new(Box::new(resp))) }); + // insufficient quorum + conn.expect_record().return_once(move |_req, _timeout| { + let resp = match id { + 0 => unreachable!("leader should not receive record request"), + 1 | 2 => RecordResponse { conflict: false }, + 3 | 4 => RecordResponse { conflict: true }, + _ => unreachable!("there are only 5 nodes"), + }; + Ok(tonic::Response::new(resp)) + }); }); let unary = init_unary_client(connects, None, Some(0), 1, 0, None); let start_at = Instant::now(); let res = unary - .propose(&TestCommand::default(), None, true) + .propose(&TestCommand::new_put(vec![1], 1), None, true) .await .unwrap() .unwrap(); @@ -575,6 +395,7 @@ async fn test_unary_propose_fast_path_fallback_slow_path() { start_at.elapsed() > Duration::from_millis(100), "slow round takes at least 100ms" ); + // indicate that we actually run out of fast round assert_eq!( res, (TestCommandResult::default(), Some(LogIndexResult::from(1))) @@ -596,26 +417,22 @@ async fn test_unary_propose_return_early_err() { assert!(early_err.should_abort_fast_round()); // record how many times rpc was invoked. 
let counter = Arc::new(Mutex::new(0)); - let connects = init_mocked_connects(5, |id, conn| { + let connects = init_mocked_connects(5, |_id, conn| { let err = early_err.clone(); let counter_c = Arc::clone(&counter); - conn.expect_propose() + conn.expect_propose_stream() .return_once(move |_req, _token, _timeout| { - counter_c.lock().unwrap().add_assign(1); + *counter_c.lock().unwrap() += 1; Err(err) }); + let err = early_err.clone(); - let counter_c = Arc::clone(&counter); - conn.expect_wait_synced() - .return_once(move |_req, _timeout| { - assert!(id == 0, "wait synced should send to leader"); - counter_c.lock().unwrap().add_assign(1); - Err(err) - }); + conn.expect_record() + .return_once(move |_req, _timeout| Err(err)); }); let unary = init_unary_client(connects, None, Some(0), 1, 0, None); let err = unary - .propose(&TestCommand::default(), None, true) + .propose(&TestCommand::new_put(vec![1], 1), None, true) .await .unwrap_err(); assert_eq!(err, early_err); @@ -637,22 +454,18 @@ async fn test_retry_propose_return_no_retry_error() { ] { // record how many times rpc was invoked. 
let counter = Arc::new(Mutex::new(0)); - let connects = init_mocked_connects(5, |id, conn| { + let connects = init_mocked_connects(5, |_id, conn| { let err = early_err.clone(); let counter_c = Arc::clone(&counter); - conn.expect_propose() + conn.expect_propose_stream() .return_once(move |_req, _token, _timeout| { - counter_c.lock().unwrap().add_assign(1); + *counter_c.lock().unwrap() += 1; Err(err) }); + let err = early_err.clone(); - let counter_c = Arc::clone(&counter); - conn.expect_wait_synced() - .return_once(move |_req, _timeout| { - assert!(id == 0, "wait synced should send to leader"); - counter_c.lock().unwrap().add_assign(1); - Err(err) - }); + conn.expect_record() + .return_once(move |_req, _timeout| Err(err)); }); let unary = init_unary_client(connects, None, Some(0), 1, 0, None); let retry = Retry::new( @@ -661,12 +474,11 @@ async fn test_retry_propose_return_no_retry_error() { None, ); let err = retry - .propose(&TestCommand::default(), None, false) + .propose(&TestCommand::new_put(vec![1], 1), None, false) .await .unwrap_err(); assert_eq!(err.message(), tonic::Status::from(early_err).message()); - // fast path + slow path = 2 - assert_eq!(*counter.lock().unwrap(), 2); + assert_eq!(*counter.lock().unwrap(), 1); } } @@ -674,13 +486,10 @@ async fn test_retry_propose_return_no_retry_error() { #[tokio::test] async fn test_retry_propose_return_retry_error() { for early_err in [ - CurpError::expired_client_id(), - CurpError::key_conflict(), CurpError::RpcTransport(()), CurpError::internal("No reason"), ] { let connects = init_mocked_connects(5, |id, conn| { - let err = early_err.clone(); conn.expect_fetch_cluster() .returning(move |_req, _timeout| { Ok(tonic::Response::new(FetchClusterResponse { @@ -697,14 +506,16 @@ async fn test_retry_propose_return_retry_error() { cluster_version: 1, })) }); - conn.expect_propose() - .returning(move |_req, _token, _timeout| Err(err.clone())); if id == 0 { let err = early_err.clone(); - conn.expect_wait_synced() - 
.times(5) // wait synced should be retried in 5 times on leader - .returning(move |_req, _timeout| Err(err.clone())); + conn.expect_propose_stream() + .times(5) // propose should be retried in 5 times on leader + .returning(move |_req, _token, _timeout| Err(err.clone())); } + + let err = early_err.clone(); + conn.expect_record() + .returning(move |_req, _timeout| Err(err.clone())); }); let unary = init_unary_client(connects, None, Some(0), 1, 0, None); let retry = Retry::new( @@ -713,13 +524,75 @@ async fn test_retry_propose_return_retry_error() { None, ); let err = retry - .propose(&TestCommand::default(), None, false) + .propose(&TestCommand::new_put(vec![1], 1), None, false) .await .unwrap_err(); assert!(err.message().contains("request timeout")); } } +#[traced_test] +#[tokio::test] +async fn test_read_index_success() { + let connects = init_mocked_connects(5, |id, conn| { + conn.expect_propose_stream() + .return_once(move |_req, _token, _timeout| { + assert_eq!(id, 0, "followers should not receive propose"); + let resp = async_stream::stream! 
{ + yield Ok(build_propose_response(false)); + yield Ok(build_synced_response()); + }; + Ok(tonic::Response::new(Box::new(resp))) + }); + conn.expect_read_index().return_once(move |_timeout| { + let resp = match id { + 0 => unreachable!("read index should not send to leader"), + 1 | 2 => ReadIndexResponse { term: 1 }, + 3 | 4 => ReadIndexResponse { term: 2 }, + _ => unreachable!("there are only 5 nodes"), + }; + + Ok(tonic::Response::new(resp)) + }); + }); + let unary = init_unary_client(connects, None, Some(0), 1, 0, None); + let res = unary + .propose(&TestCommand::default(), None, true) + .await + .unwrap() + .unwrap(); + assert_eq!(res, (TestCommandResult::default(), None)); +} + +#[traced_test] +#[tokio::test] +async fn test_read_index_fail() { + let connects = init_mocked_connects(5, |id, conn| { + conn.expect_propose_stream() + .return_once(move |_req, _token, _timeout| { + assert_eq!(id, 0, "followers should not receive propose"); + let resp = async_stream::stream! { + yield Ok(build_propose_response(false)); + yield Ok(build_synced_response()); + }; + Ok(tonic::Response::new(Box::new(resp))) + }); + conn.expect_read_index().return_once(move |_timeout| { + let resp = match id { + 0 => unreachable!("read index should not send to leader"), + 1 => ReadIndexResponse { term: 1 }, + 2 | 3 | 4 => ReadIndexResponse { term: 2 }, + _ => unreachable!("there are only 5 nodes"), + }; + + Ok(tonic::Response::new(resp)) + }); + }); + let unary = init_unary_client(connects, None, Some(0), 1, 0, None); + let res = unary.propose(&TestCommand::default(), None, true).await; + assert!(res.is_err()); +} + // Tests for stream client struct MockedStreamConnectApi { @@ -741,12 +614,30 @@ impl ConnectApi for MockedStreamConnectApi { } /// Send `ProposeRequest` - async fn propose( + async fn propose_stream( &self, _request: ProposeRequest, _token: Option, _timeout: Duration, - ) -> Result, CurpError> { + ) -> Result> + Send>>, CurpError> + { + unreachable!("please use 
MockedConnectApi") + } + + /// Send `RecordRequest` + async fn record( + &self, + _request: RecordRequest, + _timeout: Duration, + ) -> Result, CurpError> { + unreachable!("please use MockedConnectApi") + } + + /// Send `ReadIndexRequest` + async fn read_index( + &self, + _timeout: Duration, + ) -> Result, CurpError> { unreachable!("please use MockedConnectApi") } @@ -768,15 +659,6 @@ impl ConnectApi for MockedStreamConnectApi { unreachable!("please use MockedConnectApi") } - /// Send `WaitSyncedRequest` - async fn wait_synced( - &self, - _request: WaitSyncedRequest, - _timeout: Duration, - ) -> Result, CurpError> { - unreachable!("please use MockedConnectApi") - } - /// Send `ShutdownRequest` async fn shutdown( &self, diff --git a/crates/curp/src/client/unary.rs b/crates/curp/src/client/unary.rs index e13e5284d..2acf6658a 100644 --- a/crates/curp/src/client/unary.rs +++ b/crates/curp/src/client/unary.rs @@ -1,21 +1,33 @@ -use std::{cmp::Ordering, marker::PhantomData, ops::AddAssign, sync::Arc, time::Duration}; +use std::{ + cmp::Ordering, + marker::PhantomData, + sync::{atomic::AtomicU64, Arc}, + time::Duration, +}; use async_trait::async_trait; use curp_external_api::cmd::Command; -use futures::{Future, StreamExt}; -use tonic::Response; +use futures::{future, stream::FuturesUnordered, Future, Stream, StreamExt}; +use parking_lot::RwLock; +use tonic::{Response, Status}; use tracing::{debug, warn}; -use super::{state::State, ClientApi, LeaderStateUpdate, ProposeResponse, RepeatableClientApi}; +use super::{ + state::State, ClientApi, LeaderStateUpdate, ProposeIdGuard, ProposeResponse, + RepeatableClientApi, +}; use crate::{ members::ServerId, - quorum, recover_quorum, + quorum, + response::ResponseReceiver, rpc::{ connect::ConnectApi, ConfChange, CurpError, FetchClusterRequest, FetchClusterResponse, - FetchReadStateRequest, Member, MoveLeaderRequest, ProposeConfChangeRequest, ProposeId, - ProposeRequest, PublishRequest, ReadState, ShutdownRequest, WaitSyncedRequest, 
+ FetchReadStateRequest, Member, MoveLeaderRequest, OpResponse, ProposeConfChangeRequest, + ProposeId, ProposeRequest, PublishRequest, ReadIndexResponse, ReadState, RecordRequest, + RecordResponse, ShutdownRequest, }, super_quorum, + tracker::Tracker, }; /// The unary client config @@ -46,6 +58,10 @@ pub(super) struct Unary { state: Arc, /// Unary config config: UnaryConfig, + /// Request tracker + tracker: RwLock, + /// Last sent sequence number + last_sent_seq: AtomicU64, /// marker phantom: PhantomData, } @@ -56,6 +72,8 @@ impl Unary { Self { state, config, + tracker: RwLock::new(Tracker::default()), + last_sent_seq: AtomicU64::new(0), phantom: PhantomData, } } @@ -83,128 +101,86 @@ impl Unary { self.state.map_server(leader_id, f).await } - /// Send proposal to all servers - pub(super) async fn fast_round( - &self, - propose_id: ProposeId, - cmd: &C, - token: Option<&String>, - ) -> Result, CurpError> { - let req = ProposeRequest::new(propose_id, cmd, self.state.cluster_version().await); - let timeout = self.config.propose_timeout; - - let mut responses = self - .state - .for_each_server(|conn| { - let req_c = req.clone(); - let token_c = token.cloned(); - async move { (conn.id(), conn.propose(req_c, token_c, timeout).await) } - }) - .await; - let super_quorum = super_quorum(responses.len()); - let recover_quorum = recover_quorum(responses.len()); - - let mut err: Option = None; - let mut execute_result: Option = None; - let (mut ok_cnt, mut key_conflict_cnt) = (0, 0); - - while let Some((id, resp)) = responses.next().await { - if key_conflict_cnt >= recover_quorum { - return Err(CurpError::KeyConflict(())); - } - - let resp = match resp { - Ok(resp) => resp.into_inner(), - Err(e) => { - warn!("propose cmd({propose_id}) to server({id}) error: {e:?}"); - if e.should_abort_fast_round() { - return Err(e); - } - if matches!(e, CurpError::KeyConflict(())) { - key_conflict_cnt.add_assign(1); - } - if let Some(old_err) = err.as_ref() { - if old_err.priority() <= 
e.priority() { - err = Some(e); - } - } else { - err = Some(e); - } - continue; - } - }; - let deserialize_res = resp.map_result::>(|res| { - let er = match res { - Ok(er) => er, - Err(cmd_err) => return Err(cmd_err), - }; - if let Some(er) = er { - assert!(execute_result.is_none(), "should not set exe result twice"); - execute_result = Some(er); - } - ok_cnt.add_assign(1); - Ok(()) - }); - let dr = match deserialize_res { - Ok(dr) => dr, - Err(ser_err) => { - warn!("serialize error: {ser_err}"); - // We blame this error to the server, although it may be a local error. - // We need to retry as same as a server error. - err = Some(CurpError::from(ser_err)); - continue; - } - }; - if let Err(cmd_err) = dr { - // got a command execution error early, abort the next requests and return the cmd error - return Ok(Err(cmd_err)); - } - // if the propose meets the super quorum and we got the execute result, - // that means we can safely abort the next requests - if ok_cnt >= super_quorum { - if let Some(er) = execute_result { - debug!("fast round for cmd({}) succeed", propose_id); - return Ok(Ok(er)); - } - } - } - - if let Some(err) = err { - return Err(err); + /// Gets the leader id + async fn leader_id(&self) -> Result { + let cached_leader = self.state.leader_id().await; + match cached_leader { + Some(id) => Ok(id), + None => as ClientApi>::fetch_leader_id(self, false).await, } - - // We will at least send the request to the leader if no `WrongClusterVersion` returned. - // If no errors occur, the leader should return the ER - // If it is because the super quorum has not been reached, an error will definitely occur. - // Otherwise, there is no leader in the cluster state currently, return wrong cluster version - // and attempt to retrieve the cluster state again. 
- Err(CurpError::wrong_cluster_version()) - } - - /// Wait synced result from server - pub(super) async fn slow_round( - &self, - propose_id: ProposeId, - ) -> Result, CurpError> { - let timeout = self.config.wait_synced_timeout; - let req = WaitSyncedRequest::new(propose_id, self.state.cluster_version().await); - let resp = self - .map_leader(|conn| async move { conn.wait_synced(req, timeout).await }) - .await? - .into_inner(); - let synced_res = resp.map_result::(|res| res).map_err(|ser_err| { - warn!("serialize error: {ser_err}"); - // Same as fast round, we blame the server for the serializing error. - CurpError::from(ser_err) - })?; - debug!("slow round for cmd({}) succeed", propose_id); - Ok(synced_res) } /// New a seq num and record it #[allow(clippy::unused_self)] // TODO: implement request tracker fn new_seq_num(&self) -> u64 { - rand::random() + self.last_sent_seq + .fetch_add(1, std::sync::atomic::Ordering::Relaxed) + } +} + +impl Unary { + /// Propose for read only commands + /// + /// For read-only commands, we only need to send propose to leader + async fn propose_read_only( + propose_fut: PF, + use_fast_path: bool, + read_index_futs: FuturesUnordered, + term: u64, + quorum: usize, + ) -> Result, CurpError> + where + PF: Future< + Output = Result< + Response> + Send>>, + CurpError, + >, + >, + RIF: Future, CurpError>>, + { + let term_count_fut = read_index_futs + .filter_map(|res| future::ready(res.ok())) + .filter(|resp| future::ready(resp.get_ref().term == term)) + .take(quorum.wrapping_sub(1)) + .count(); + let (propose_res, num_valid) = tokio::join!(propose_fut, term_count_fut); + if num_valid < quorum.wrapping_sub(1) { + return Err(CurpError::WrongClusterVersion(())); + } + let resp_stream = propose_res?.into_inner(); + let mut response_rx = ResponseReceiver::new(resp_stream); + response_rx.recv::(!use_fast_path).await + } + + /// Propose for mutative commands + async fn propose_mutative( + propose_fut: PF, + record_futs: FuturesUnordered, + 
use_fast_path: bool, + superquorum: usize, + ) -> Result, CurpError> + where + PF: Future< + Output = Result< + Response> + Send>>, + CurpError, + >, + >, + RF: Future, CurpError>>, + { + let record_futs_filtered = record_futs + .filter_map(|res| future::ready(res.ok())) + .filter(|resp| future::ready(!resp.get_ref().conflict)) + .take(superquorum.wrapping_sub(1)) + .collect::>(); + let (propose_res, record_resps) = tokio::join!(propose_fut, record_futs_filtered); + + let resp_stream = propose_res?.into_inner(); + let mut response_rx = ResponseReceiver::new(resp_stream); + let fast_path_failed = record_resps.len() < superquorum.wrapping_sub(1); + response_rx + .recv::(fast_path_failed || !use_fast_path) + .await } } @@ -225,7 +201,7 @@ impl ClientApi for Unary { use_fast_path: bool, ) -> Result, CurpError> { let propose_id = self.gen_propose_id()?; - RepeatableClientApi::propose(self, propose_id, cmd, token, use_fast_path).await + RepeatableClientApi::propose(self, *propose_id, cmd, token, use_fast_path).await } /// Send propose configuration changes to the cluster @@ -234,13 +210,13 @@ impl ClientApi for Unary { changes: Vec, ) -> Result, CurpError> { let propose_id = self.gen_propose_id()?; - RepeatableClientApi::propose_conf_change(self, propose_id, changes).await + RepeatableClientApi::propose_conf_change(self, *propose_id, changes).await } /// Send propose to shutdown cluster async fn propose_shutdown(&self) -> Result<(), CurpError> { let propose_id = self.gen_propose_id()?; - RepeatableClientApi::propose_shutdown(self, propose_id).await + RepeatableClientApi::propose_shutdown(self, *propose_id).await } /// Send propose to publish a node id and name @@ -251,8 +227,14 @@ impl ClientApi for Unary { node_client_urls: Vec, ) -> Result<(), Self::Error> { let propose_id = self.gen_propose_id()?; - RepeatableClientApi::propose_publish(self, propose_id, node_id, node_name, node_client_urls) - .await + RepeatableClientApi::propose_publish( + self, + *propose_id, + 
node_id, + node_name, + node_client_urls, + ) + .await } /// Send move leader request @@ -287,7 +269,7 @@ impl ClientApi for Unary { /// Send fetch cluster requests to all servers /// Note: The fetched cluster may still be outdated if `linearizable` is false - async fn fetch_cluster(&self, linearizable: bool) -> Result { + async fn fetch_cluster(&self, linearizable: bool) -> Result { let timeout = self.config.wait_synced_timeout; if !linearizable { // firstly, try to fetch the local server @@ -297,12 +279,7 @@ impl ClientApi for Unary { let resp = connect .fetch_cluster(FetchClusterRequest::default(), FETCH_LOCAL_TIMEOUT) - .await - .unwrap_or_else(|e| { - unreachable!( - "fetch cluster from local connect should never failed, err {e:?}" - ) - }) + .await? .into_inner(); debug!("fetch local cluster {resp:?}"); @@ -395,10 +372,13 @@ impl ClientApi for Unary { #[async_trait] impl RepeatableClientApi for Unary { /// Generate a unique propose id during the retry process. - fn gen_propose_id(&self) -> Result { + fn gen_propose_id(&self) -> Result, Self::Error> { let client_id = self.state.client_id(); let seq_num = self.new_seq_num(); - Ok(ProposeId(client_id, seq_num)) + Ok(ProposeIdGuard::new( + &self.tracker, + ProposeId(client_id, seq_num), + )) } /// Send propose to the whole cluster, `use_fast_path` set to `false` to fallback into ordered @@ -410,93 +390,47 @@ impl RepeatableClientApi for Unary { token: Option<&String>, use_fast_path: bool, ) -> Result, Self::Error> { - tokio::pin! 
{ - let fast_round = self.fast_round(propose_id, cmd, token); - let slow_round = self.slow_round(propose_id); - } + let cmd_arc = Arc::new(cmd); + let term = self.state.term().await; + let propose_req = ProposeRequest::new::( + propose_id, + cmd_arc.as_ref(), + self.state.cluster_version().await, + term, + !use_fast_path, + self.tracker.read().first_incomplete(), + ); + let record_req = RecordRequest::new::(propose_id, cmd_arc.as_ref()); + let connects_len = self.state.connects_len().await; + let quorum = quorum(connects_len); + let superquorum = super_quorum(connects_len); + let leader_id = self.leader_id().await?; + let timeout = self.config.propose_timeout; - let res: ProposeResponse = if use_fast_path { - match futures::future::select(fast_round, slow_round).await { - futures::future::Either::Left((fast_result, slow_round)) => match fast_result { - Ok(er) => er.map(|e| { - #[cfg(feature = "client-metrics")] - super::metrics::get().client_fast_path_count.add(1, &[]); - - (e, None) - }), - Err(fast_err) => { - if fast_err.should_abort_slow_round() { - return Err(fast_err); - } - // fallback to slow round if fast round failed - let sr = match slow_round.await { - Ok(sr) => sr, - Err(slow_err) => { - return Err(std::cmp::max_by_key(fast_err, slow_err, |err| { - err.priority() - })) - } - }; - sr.map(|(asr, er)| { - #[cfg(feature = "client-metrics")] - { - super::metrics::get().client_slow_path_count.add(1, &[]); - super::metrics::get() - .client_fast_path_fallback_slow_path_count - .add(1, &[]); - } - - (er, Some(asr)) - }) - } - }, - futures::future::Either::Right((slow_result, fast_round)) => match slow_result { - Ok(er) => er.map(|(asr, e)| { - #[cfg(feature = "client-metrics")] - super::metrics::get().client_slow_path_count.add(1, &[]); - - (e, Some(asr)) - }), - Err(slow_err) => { - if slow_err.should_abort_fast_round() { - return Err(slow_err); - } - // try to poll fast round - let fr = match fast_round.await { - Ok(fr) => fr, - Err(fast_err) => { - return 
Err(std::cmp::max_by_key(fast_err, slow_err, |err| { - err.priority() - })) - } - }; - fr.map(|er| { - #[cfg(feature = "client-metrics")] - super::metrics::get().client_fast_path_count.add(1, &[]); - - (er, None) - }) - } - }, - } - } else { - match futures::future::join(fast_round, slow_round).await { - (_, Ok(sr)) => sr.map(|(asr, er)| { - #[cfg(feature = "client-metrics")] - super::metrics::get().client_slow_path_count.add(1, &[]); - - (er, Some(asr)) - }), - (Ok(_), Err(err)) => return Err(err), - (Err(fast_err), Err(slow_err)) => { - return Err(std::cmp::max_by_key(fast_err, slow_err, |err| { - err.priority() - })) - } - } - }; + let propose_fut = self.state.map_server(leader_id, |conn| async move { + conn.propose_stream(propose_req, token.cloned(), timeout) + .await + }); + let record_futs = self + .state + .for_each_follower(leader_id, |conn| { + let record_req_c = record_req.clone(); + async move { conn.record(record_req_c, timeout).await } + }) + .await; + let read_index_futs = self + .state + .for_each_follower( + leader_id, + |conn| async move { conn.read_index(timeout).await }, + ) + .await; - Ok(res) + if cmd.is_read_only() { + Self::propose_read_only(propose_fut, use_fast_path, read_index_futs, term, quorum).await + } else { + Self::propose_mutative(propose_fut, record_futs, use_fast_path, superquorum).await + } } /// Send propose configuration changes to the cluster diff --git a/crates/curp/src/lib.rs b/crates/curp/src/lib.rs index a6a337218..e5e5111b6 100644 --- a/crates/curp/src/lib.rs +++ b/crates/curp/src/lib.rs @@ -203,6 +203,9 @@ pub mod rpc; /// Snapshot mod snapshot; +/// Propose response sender +mod response; + /// Calculate the super quorum #[inline] #[must_use] diff --git a/crates/curp/src/log_entry.rs b/crates/curp/src/log_entry.rs index f2b19c14b..96ba66d8d 100644 --- a/crates/curp/src/log_entry.rs +++ b/crates/curp/src/log_entry.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use crate::{ members::ServerId, - rpc::{ConfChange, 
PoolEntryInner, ProposeId, PublishRequest}, + rpc::{ConfChange, ProposeId, PublishRequest}, }; /// Log entry @@ -53,15 +53,6 @@ impl From> for EntryData { } } -impl From> for EntryData { - fn from(value: PoolEntryInner) -> Self { - match value { - PoolEntryInner::Command(cmd) => EntryData::Command(cmd), - PoolEntryInner::ConfChange(conf_change) => EntryData::ConfChange(conf_change), - } - } -} - impl From for EntryData { fn from(value: PublishRequest) -> Self { EntryData::SetNodeState(value.node_id, value.name, value.client_urls) @@ -93,6 +84,12 @@ where } } +impl AsRef> for LogEntry { + fn as_ref(&self) -> &LogEntry { + self + } +} + /// Propose id to inflight id pub(super) fn propose_id_to_inflight_id(id: ProposeId) -> InflightId { let mut hasher = std::collections::hash_map::DefaultHasher::new(); diff --git a/crates/curp/src/response.rs b/crates/curp/src/response.rs new file mode 100644 index 000000000..e6c5ca7e6 --- /dev/null +++ b/crates/curp/src/response.rs @@ -0,0 +1,134 @@ +use std::{ + pin::Pin, + sync::atomic::{AtomicBool, Ordering}, +}; + +use curp_external_api::cmd::Command; +use futures::Stream; +use tokio_stream::StreamExt; +use tonic::Status; + +use crate::rpc::{CurpError, OpResponse, ProposeResponse, ResponseOp, SyncedResponse}; + +/// The response sender +#[derive(Debug)] +pub(super) struct ResponseSender { + /// The stream sender + tx: flume::Sender>, + /// Whether the command will be speculatively executed + conflict: AtomicBool, +} + +impl ResponseSender { + /// Creates a new `ResponseSender` + pub(super) fn new(tx: flume::Sender>) -> ResponseSender { + ResponseSender { + tx, + conflict: AtomicBool::new(false), + } + } + + /// Gets whether the command associated with this sender will be + /// speculatively executed + pub(super) fn is_conflict(&self) -> bool { + self.conflict.load(Ordering::SeqCst) + } + + /// Sets the the command associated with this sender will be + /// speculatively executed + pub(super) fn set_conflict(&self, conflict: bool) 
{ + let _ignore = self.conflict.fetch_or(conflict, Ordering::SeqCst); + } + + /// Sends propose result + pub(super) fn send_propose(&self, resp: ProposeResponse) { + let resp = OpResponse { + op: Some(ResponseOp::Propose(resp)), + }; + // Ignore the result because the client might close the receiving stream + let _ignore = self.tx.try_send(Ok(resp)); + } + + /// Sends after sync result + pub(super) fn send_synced(&self, resp: SyncedResponse) { + let resp = OpResponse { + op: Some(ResponseOp::Synced(resp)), + }; + // Ignore the result because the client might close the receiving stream + let _ignore = self.tx.try_send(Ok(resp)); + } +} + +/// Receiver for obtaining execution or after sync results +pub(crate) struct ResponseReceiver { + /// The response stream + resp_stream: Pin> + Send>>, +} + +impl ResponseReceiver { + /// Creates a new [`ResponseReceiver`]. + pub(crate) fn new( + resp_stream: Box> + Send>, + ) -> Self { + Self { + resp_stream: Box::into_pin(resp_stream), + } + } + + /// Receives the results + pub(crate) async fn recv( + &mut self, + both: bool, + ) -> Result), C::Error>, CurpError> { + let fst = self.recv_resp().await?; + + match fst { + ResponseOp::Propose(propose_resp) => { + let conflict = propose_resp.conflict; + let er_result = propose_resp.map_result::(|res| { + res.map(|er| er.unwrap_or_else(|| unreachable!())) + })?; + if let Err(e) = er_result { + return Ok(Err(e)); + } + if conflict || both { + let snd = self.recv_resp().await?; + let ResponseOp::Synced(synced_resp) = snd else { + unreachable!() + }; + let asr_result = synced_resp + .map_result::(|res| res.unwrap_or_else(|| unreachable!()))?; + return Ok(er_result.and_then(|er| asr_result.map(|asr| (er, Some(asr))))); + } + Ok(er_result.map(|er| (er, None))) + } + ResponseOp::Synced(synced_resp) => { + let asr_result = synced_resp + .map_result::(|res| res.unwrap_or_else(|| unreachable!()))?; + if let Err(e) = asr_result { + return Ok(Err(e)); + } + let snd = self.recv_resp().await?; + 
let ResponseOp::Propose(propose_resp) = snd else { + unreachable!("op: {snd:?}") + }; + let er_result = propose_resp.map_result::(|res| { + res.map(|er| er.unwrap_or_else(|| unreachable!())) + })?; + Ok(er_result.and_then(|er| asr_result.map(|asr| (er, Some(asr))))) + } + } + } + + /// Receives a single response from stream + async fn recv_resp(&mut self) -> Result { + let resp = self + .resp_stream + .next() + .await + .ok_or(CurpError::internal("stream reaches on an end".to_owned()))??; + Ok(resp + .op + .unwrap_or_else(|| unreachable!("op should always exist"))) + } +} diff --git a/crates/curp/src/rpc/connect.rs b/crates/curp/src/rpc/connect.rs index 0975e3687..d438b6c28 100644 --- a/crates/curp/src/rpc/connect.rs +++ b/crates/curp/src/rpc/connect.rs @@ -34,13 +34,17 @@ use crate::{ FetchClusterResponse, FetchReadStateRequest, FetchReadStateResponse, InstallSnapshotRequest, InstallSnapshotResponse, LeaseKeepAliveMsg, MoveLeaderRequest, MoveLeaderResponse, ProposeConfChangeRequest, ProposeConfChangeResponse, ProposeRequest, - ProposeResponse, Protocol, PublishRequest, PublishResponse, ShutdownRequest, - ShutdownResponse, TriggerShutdownRequest, TryBecomeLeaderNowRequest, VoteRequest, - VoteResponse, WaitSyncedRequest, WaitSyncedResponse, + Protocol, PublishRequest, PublishResponse, ShutdownRequest, ShutdownResponse, + TriggerShutdownRequest, TryBecomeLeaderNowRequest, VoteRequest, VoteResponse, }, snapshot::Snapshot, }; +use super::{ + proto::commandpb::{ReadIndexRequest, ReadIndexResponse}, + OpResponse, RecordRequest, RecordResponse, +}; + /// Install snapshot chunk size: 64KB const SNAPSHOT_CHUNK_SIZE: u64 = 64 * 1024; @@ -158,12 +162,28 @@ pub(crate) trait ConnectApi: Send + Sync + 'static { async fn update_addrs(&self, addrs: Vec) -> Result<(), tonic::transport::Error>; /// Send `ProposeRequest` - async fn propose( + async fn propose_stream( &self, request: ProposeRequest, token: Option, timeout: Duration, - ) -> Result, CurpError>; + ) -> Result< + 
tonic::Response> + Send>>, + CurpError, + >; + + /// Send `RecordRequest` + async fn record( + &self, + request: RecordRequest, + timeout: Duration, + ) -> Result, CurpError>; + + /// Send `ReadIndexRequest` + async fn read_index( + &self, + timeout: Duration, + ) -> Result, CurpError>; /// Send `ProposeRequest` async fn propose_conf_change( @@ -179,13 +199,6 @@ pub(crate) trait ConnectApi: Send + Sync + 'static { timeout: Duration, ) -> Result, CurpError>; - /// Send `WaitSyncedRequest` - async fn wait_synced( - &self, - request: WaitSyncedRequest, - timeout: Duration, - ) -> Result, CurpError>; - /// Send `ShutdownRequest` async fn shutdown( &self, @@ -369,6 +382,15 @@ impl Connect { } } +/// Sets timeout for a client connection +macro_rules! with_timeout { + ($timeout:expr, $client_op:expr) => { + tokio::time::timeout($timeout, $client_op) + .await + .map_err(|_| tonic::Status::deadline_exceeded("timeout"))? + }; +} + #[async_trait] impl ConnectApi for Connect> { /// Get server id @@ -382,21 +404,46 @@ impl ConnectApi for Connect> { } /// Send `ProposeRequest` - #[instrument(skip(self), name = "client propose")] - async fn propose( + async fn propose_stream( &self, request: ProposeRequest, token: Option, timeout: Duration, - ) -> Result, CurpError> { + ) -> Result< + tonic::Response> + Send>>, + CurpError, + > { let mut client = self.rpc_connect.clone(); let mut req = tonic::Request::new(request); - req.set_timeout(timeout); - req.metadata_mut().inject_current(); if let Some(token) = token { _ = req.metadata_mut().insert("token", token.parse()?); } - client.propose(req).await.map_err(Into::into) + let resp = with_timeout!(timeout, client.propose_stream(req))?.into_inner(); + Ok(tonic::Response::new(Box::new(resp))) + + // let resp = client.propose_stream(req).await?.map(Box::new); + // Ok(resp) + } + + /// Send `RecordRequest` + async fn record( + &self, + request: RecordRequest, + timeout: Duration, + ) -> Result, CurpError> { + let mut client = 
self.rpc_connect.clone(); + let req = tonic::Request::new(request); + with_timeout!(timeout, client.record(req)).map_err(Into::into) + } + + /// Send `ReadIndexRequest` + async fn read_index( + &self, + timeout: Duration, + ) -> Result, CurpError> { + let mut client = self.rpc_connect.clone(); + let req = tonic::Request::new(ReadIndexRequest {}); + with_timeout!(timeout, client.read_index(req)).map_err(Into::into) } /// Send `ShutdownRequest` @@ -408,9 +455,8 @@ impl ConnectApi for Connect> { ) -> Result, CurpError> { let mut client = self.rpc_connect.clone(); let mut req = tonic::Request::new(request); - req.set_timeout(timeout); req.metadata_mut().inject_current(); - client.shutdown(req).await.map_err(Into::into) + with_timeout!(timeout, client.shutdown(req)).map_err(Into::into) } /// Send `ProposeRequest` @@ -422,9 +468,8 @@ impl ConnectApi for Connect> { ) -> Result, CurpError> { let mut client = self.rpc_connect.clone(); let mut req = tonic::Request::new(request); - req.set_timeout(timeout); req.metadata_mut().inject_current(); - client.propose_conf_change(req).await.map_err(Into::into) + with_timeout!(timeout, client.propose_conf_change(req)).map_err(Into::into) } /// Send `PublishRequest` @@ -436,23 +481,8 @@ impl ConnectApi for Connect> { ) -> Result, CurpError> { let mut client = self.rpc_connect.clone(); let mut req = tonic::Request::new(request); - req.set_timeout(timeout); - req.metadata_mut().inject_current(); - client.publish(req).await.map_err(Into::into) - } - - /// Send `WaitSyncedRequest` - #[instrument(skip(self), name = "client propose")] - async fn wait_synced( - &self, - request: WaitSyncedRequest, - timeout: Duration, - ) -> Result, CurpError> { - let mut client = self.rpc_connect.clone(); - let mut req = tonic::Request::new(request); - req.set_timeout(timeout); req.metadata_mut().inject_current(); - client.wait_synced(req).await.map_err(Into::into) + with_timeout!(timeout, client.publish(req)).map_err(Into::into) } /// Send 
`FetchClusterRequest` @@ -462,9 +492,8 @@ impl ConnectApi for Connect> { timeout: Duration, ) -> Result, CurpError> { let mut client = self.rpc_connect.clone(); - let mut req = tonic::Request::new(request); - req.set_timeout(timeout); - client.fetch_cluster(req).await.map_err(Into::into) + let req = tonic::Request::new(request); + with_timeout!(timeout, client.fetch_cluster(req)).map_err(Into::into) } /// Send `FetchReadStateRequest` @@ -474,9 +503,8 @@ impl ConnectApi for Connect> { timeout: Duration, ) -> Result, CurpError> { let mut client = self.rpc_connect.clone(); - let mut req = tonic::Request::new(request); - req.set_timeout(timeout); - client.fetch_read_state(req).await.map_err(Into::into) + let req = tonic::Request::new(request); + with_timeout!(timeout, client.fetch_read_state(req)).map_err(Into::into) } /// Send `MoveLeaderRequest` @@ -486,9 +514,8 @@ impl ConnectApi for Connect> { timeout: Duration, ) -> Result, CurpError> { let mut client = self.rpc_connect.clone(); - let mut req = tonic::Request::new(request); - req.set_timeout(timeout); - client.move_leader(req).await.map_err(Into::into) + let req = tonic::Request::new(request); + with_timeout!(timeout, client.move_leader(req)).map_err(Into::into) } /// Keep send lease keep alive to server and mutate the client id @@ -533,9 +560,8 @@ impl InnerConnectApi for Connect> { let start_at = self.before_rpc::(); let mut client = self.rpc_connect.clone(); - let mut req = tonic::Request::new(request); - req.set_timeout(timeout); - let result = client.append_entries(req).await; + let req = tonic::Request::new(request); + let result = with_timeout!(timeout, client.append_entries(req)); #[cfg(feature = "client-metrics")] self.after_rpc(start_at, &result); @@ -553,9 +579,8 @@ impl InnerConnectApi for Connect> { let start_at = self.before_rpc::(); let mut client = self.rpc_connect.clone(); - let mut req = tonic::Request::new(request); - req.set_timeout(timeout); - let result = client.vote(req).await; + let req = 
tonic::Request::new(request); + let result = with_timeout!(timeout, client.vote(req)); #[cfg(feature = "client-metrics")] self.after_rpc(start_at, &result); @@ -601,9 +626,8 @@ impl InnerConnectApi for Connect> { let start_at = self.before_rpc::(); let mut client = self.rpc_connect.clone(); - let mut req = tonic::Request::new(TryBecomeLeaderNowRequest::default()); - req.set_timeout(timeout); - let result = client.try_become_leader_now(req).await; + let req = tonic::Request::new(TryBecomeLeaderNowRequest::default()); + let result = with_timeout!(timeout, client.try_become_leader_now(req)); #[cfg(feature = "client-metrics")] self.after_rpc(start_at, &result); @@ -675,19 +699,47 @@ where } /// Send `ProposeRequest` - async fn propose( + #[instrument(skip(self), name = "client propose stream")] + async fn propose_stream( &self, request: ProposeRequest, token: Option, _timeout: Duration, - ) -> Result, CurpError> { + ) -> Result< + tonic::Response> + Send>>, + CurpError, + > { let mut req = tonic::Request::new(request); req.metadata_mut().inject_bypassed(); req.metadata_mut().inject_current(); if let Some(token) = token { _ = req.metadata_mut().insert("token", token.parse()?); } - self.server.propose(req).await.map_err(Into::into) + let resp = self.server.propose_stream(req).await?.into_inner(); + Ok(tonic::Response::new(Box::new(resp))) + } + + /// Send `RecordRequest` + #[instrument(skip(self), name = "client record")] + async fn record( + &self, + request: RecordRequest, + _timeout: Duration, + ) -> Result, CurpError> { + let mut req = tonic::Request::new(request); + req.metadata_mut().inject_bypassed(); + req.metadata_mut().inject_current(); + self.server.record(req).await.map_err(Into::into) + } + + async fn read_index( + &self, + _timeout: Duration, + ) -> Result, CurpError> { + let mut req = tonic::Request::new(ReadIndexRequest {}); + req.metadata_mut().inject_bypassed(); + req.metadata_mut().inject_current(); + 
self.server.read_index(req).await.map_err(Into::into) } /// Send `PublishRequest` @@ -717,18 +769,6 @@ where .map_err(Into::into) } - /// Send `WaitSyncedRequest` - async fn wait_synced( - &self, - request: WaitSyncedRequest, - _timeout: Duration, - ) -> Result, CurpError> { - let mut req = tonic::Request::new(request); - req.metadata_mut().inject_bypassed(); - req.metadata_mut().inject_current(); - self.server.wait_synced(req).await.map_err(Into::into) - } - /// Send `ShutdownRequest` async fn shutdown( &self, diff --git a/crates/curp/src/rpc/mod.rs b/crates/curp/src/rpc/mod.rs index 560401961..c064c3bb0 100644 --- a/crates/curp/src/rpc/mod.rs +++ b/crates/curp/src/rpc/mod.rs @@ -2,6 +2,7 @@ use std::{collections::HashMap, sync::Arc}; use curp_external_api::{ cmd::{ConflictCheck, PbCodec, PbSerializeError}, + conflict::EntryId, InflightId, }; use prost::Message; @@ -22,6 +23,7 @@ pub use self::proto::{ curp_error::Err as CurpError, // easy for match curp_error::Redirect, fetch_read_state_response::{IdSet, ReadState}, + op_response::Op as ResponseOp, propose_conf_change_request::{ConfChange, ConfChangeType}, protocol_client, protocol_server::{Protocol, ProtocolServer}, @@ -34,6 +36,7 @@ pub use self::proto::{ Member, MoveLeaderRequest, MoveLeaderResponse, + OpResponse, OptionalU64, ProposeConfChangeRequest, ProposeConfChangeResponse, @@ -42,8 +45,13 @@ pub use self::proto::{ ProposeResponse, PublishRequest, PublishResponse, + ReadIndexRequest, + ReadIndexResponse, + RecordRequest, + RecordResponse, ShutdownRequest, ShutdownResponse, + SyncedResponse, WaitSyncedRequest, WaitSyncedResponse, }, @@ -160,11 +168,21 @@ impl FetchClusterResponse { impl ProposeRequest { /// Create a new `Propose` request #[inline] - pub fn new(propose_id: ProposeId, cmd: &C, cluster_version: u64) -> Self { + pub fn new( + propose_id: ProposeId, + cmd: &C, + cluster_version: u64, + term: u64, + slow_path: bool, + first_incomplete: u64, + ) -> Self { Self { propose_id: 
Some(propose_id.into()), command: cmd.encode(), cluster_version, + term, + slow_path, + first_incomplete, } } @@ -190,7 +208,7 @@ impl ProposeRequest { impl ProposeResponse { /// Create an ok propose response - pub(crate) fn new_result(result: &Result) -> Self { + pub(crate) fn new_result(result: &Result, conflict: bool) -> Self { let result = match *result { Ok(ref er) => Some(CmdResult { result: Some(CmdResultInner::Ok(er.encode())), @@ -199,12 +217,16 @@ impl ProposeResponse { result: Some(CmdResultInner::Error(e.encode())), }), }; - Self { result } + Self { result, conflict } } /// Create an empty propose response + #[allow(unused)] pub(crate) fn new_empty() -> Self { - Self { result: None } + Self { + result: None, + conflict: false, + } } /// Deserialize result in response and take a map function @@ -223,16 +245,16 @@ impl ProposeResponse { } } -impl WaitSyncedRequest { - /// Create a `WaitSynced` request - pub(crate) fn new(id: ProposeId, cluster_version: u64) -> Self { - Self { - propose_id: Some(id.into()), - cluster_version, +impl RecordRequest { + /// Creates a new `RecordRequest` + pub(crate) fn new(propose_id: ProposeId, command: &C) -> Self { + RecordRequest { + propose_id: Some(propose_id.into()), + command: command.encode(), } } - /// Get the `propose_id` reference + /// Get the propose id pub(crate) fn propose_id(&self) -> ProposeId { self.propose_id .unwrap_or_else(|| { @@ -240,101 +262,43 @@ impl WaitSyncedRequest { }) .into() } -} - -impl WaitSyncedResponse { - /// Create a success response - fn new_success(asr: &C::ASR, er: &C::ER) -> Self { - Self { - after_sync_result: Some(CmdResult { - result: Some(CmdResultInner::Ok(asr.encode())), - }), - exe_result: Some(CmdResult { - result: Some(CmdResultInner::Ok(er.encode())), - }), - } - } - - /// Create an error response which includes an execution error - fn new_er_error(er: &C::Error) -> Self { - Self { - after_sync_result: None, - exe_result: Some(CmdResult { - result: 
Some(CmdResultInner::Error(er.encode())), - }), - } - } - /// Create an error response which includes an `after_sync` error - fn new_asr_error(er: &C::ER, asr_err: &C::Error) -> Self { - Self { - after_sync_result: Some(CmdResult { - result: Some(CmdResultInner::Error(asr_err.encode())), - }), - exe_result: Some(CmdResult { - result: Some(CmdResultInner::Ok(er.encode())), - }), - } + /// Get command + pub(crate) fn cmd(&self) -> Result { + C::decode(&self.command) } +} - /// Create a new response from execution result and `after_sync` result - pub(crate) fn new_from_result( - er: Result, - asr: Option>, - ) -> Self { - match (er, asr) { - (Ok(ref er), Some(Err(ref asr_err))) => { - WaitSyncedResponse::new_asr_error::(er, asr_err) - } - (Ok(ref er), Some(Ok(ref asr))) => WaitSyncedResponse::new_success::(asr, er), - (Ok(ref _er), None) => unreachable!("can't get after sync result"), - (Err(ref err), _) => WaitSyncedResponse::new_er_error::(err), +impl SyncedResponse { + /// Create a new response from `after_sync` result + pub(crate) fn new_result(result: &Result) -> Self { + match *result { + Ok(ref asr) => SyncedResponse { + after_sync_result: Some(CmdResult { + result: Some(CmdResultInner::Ok(asr.encode())), + }), + }, + Err(ref e) => SyncedResponse { + after_sync_result: Some(CmdResult { + result: Some(CmdResultInner::Error(e.encode())), + }), + }, } } - /// Similar to `ProposeResponse::map_result` + /// Deserialize result in response and take a map function pub(crate) fn map_result(self, f: F) -> Result where - F: FnOnce(Result<(C::ASR, C::ER), C::Error>) -> R, + F: FnOnce(Option>) -> R, { - // according to the above methods, we can only get the following response union - // ER: Some(OK), ASR: Some(OK) <- WaitSyncedResponse::new_success - // ER: Some(Err), ASR: None <- WaitSyncedResponse::new_er_error - // ER: Some(OK), ASR: Some(Err) <- WaitSyncedResponse::new_asr_error - let res = match (self.exe_result, self.after_sync_result) { - ( - Some(CmdResult { - 
result: Some(CmdResultInner::Ok(ref er)), - }), - Some(CmdResult { - result: Some(CmdResultInner::Ok(ref asr)), - }), - ) => { - let er = ::ER::decode(er)?; - let asr = ::ASR::decode(asr)?; - Ok((asr, er)) - } - ( - Some(CmdResult { - result: Some(CmdResultInner::Error(ref buf)), - }), - None, - ) - | ( - Some(CmdResult { - result: Some(CmdResultInner::Ok(_)), - }), - Some(CmdResult { - result: Some(CmdResultInner::Error(ref buf)), - }), - ) => { - let er = ::Error::decode(buf.as_slice())?; - Err(er) - } - _ => unreachable!("got unexpected WaitSyncedResponse"), + let Some(res) = self.after_sync_result.and_then(|res| res.result) else { + return Ok(f(None)); }; - - Ok(f(res)) + let res = match res { + CmdResultInner::Ok(ref buf) => Ok(::ASR::decode(buf)?), + CmdResultInner::Error(ref buf) => Err(::Error::decode(buf)?), + }; + Ok(f(Some(res))) } } @@ -639,16 +603,13 @@ impl PublishRequest { /// NOTICE: /// -/// Please check test case `test_unary_fast_round_return_early_err` `test_unary_propose_return_early_err` -/// `test_retry_propose_return_no_retry_error` `test_retry_propose_return_retry_error` if you added some -/// new [`CurpError`] +/// Please check test case `test_unary_fast_round_return_early_err` +/// `test_unary_propose_return_early_err` +/// `test_retry_propose_return_no_retry_error` +/// `test_retry_propose_return_retry_error` if you added some new [`CurpError`] impl CurpError { - /// `KeyConflict` error - pub(crate) fn key_conflict() -> Self { - Self::KeyConflict(()) - } - /// `Duplicated` error + #[allow(unused)] pub(crate) fn duplicated() -> Self { Self::Duplicated(()) } @@ -718,6 +679,7 @@ impl CurpError { } /// Whether to abort slow round early + #[allow(unused)] pub(crate) fn should_abort_slow_round(&self) -> bool { matches!( *self, @@ -743,7 +705,8 @@ impl CurpError { | CurpError::LearnerNotCatchUp(()) | CurpError::ExpiredClientId(()) | CurpError::Redirect(_) - | CurpError::WrongClusterVersion(()) => CurpErrorPriority::High, + | 
CurpError::WrongClusterVersion(()) + | CurpError::Zombie(()) => CurpErrorPriority::High, CurpError::RpcTransport(()) | CurpError::Internal(_) | CurpError::KeyConflict(()) @@ -846,6 +809,10 @@ impl From for tonic::Status { tonic::Code::FailedPrecondition, "Leader transfer error: A leader transfer error occurred.", ), + CurpError::Zombie(()) => ( + tonic::Code::FailedPrecondition, + "Zombie leader error: The leader is a zombie with outdated term.", + ), }; let details = CurpErrorWrapper { err: Some(err) }.encode_to_vec(); @@ -857,32 +824,19 @@ impl From for tonic::Status { // User defined types /// Entry of speculative pool -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] -pub(crate) struct PoolEntry { +#[derive(Debug, Serialize, Deserialize)] +pub struct PoolEntry { /// Propose id pub(crate) id: ProposeId, /// Inner entry - pub(crate) inner: PoolEntryInner, -} - -/// Inner entry of speculative pool -#[derive(Debug, Clone, Serialize, Deserialize)] -#[cfg_attr(test, derive(PartialEq))] -pub(crate) enum PoolEntryInner { - /// Command entry - Command(Arc), - /// ConfChange entry - ConfChange(Vec), + pub(crate) cmd: Arc, } impl PoolEntry { /// Create a new pool entry - pub(crate) fn new(id: ProposeId, inner: impl Into>) -> Self { - Self { - id, - inner: inner.into(), - } + #[inline] + pub fn new(id: ProposeId, inner: Arc) -> Self { + Self { id, cmd: inner } } } @@ -890,26 +844,74 @@ impl ConflictCheck for PoolEntry where C: ConflictCheck, { + #[inline] fn is_conflict(&self, other: &Self) -> bool { - let PoolEntryInner::Command(ref cmd1) = self.inner else { - return true; - }; - let PoolEntryInner::Command(ref cmd2) = other.inner else { - return true; - }; - cmd1.is_conflict(cmd2) + self.cmd.is_conflict(&other.cmd) + } +} + +impl Clone for PoolEntry { + #[inline] + fn clone(&self) -> Self { + Self { + id: self.id, + cmd: Arc::clone(&self.cmd), + } + } +} + +impl std::ops::Deref for PoolEntry { + type Target = C; + + #[inline] + fn 
deref(&self) -> &Self::Target { + &self.cmd + } +} + +impl AsRef for PoolEntry { + #[inline] + fn as_ref(&self) -> &C { + self.cmd.as_ref() + } +} + +impl std::hash::Hash for PoolEntry { + #[inline] + fn hash(&self, state: &mut H) { + self.id.hash(state); } } -impl From> for PoolEntryInner { - fn from(value: Arc) -> Self { - Self::Command(value) +impl PartialEq for PoolEntry { + #[inline] + fn eq(&self, other: &Self) -> bool { + self.id.eq(&other.id) } } -impl From> for PoolEntryInner { - fn from(value: Vec) -> Self { - Self::ConfChange(value) +impl Eq for PoolEntry {} + +impl PartialOrd for PoolEntry { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.id.cmp(&other.id)) + } +} + +impl Ord for PoolEntry { + #[inline] + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + self.id.cmp(&other.id) + } +} + +impl EntryId for PoolEntry { + type Id = ProposeId; + + #[inline] + fn id(&self) -> Self::Id { + self.id } } diff --git a/crates/curp/src/server/cmd_board.rs b/crates/curp/src/server/cmd_board.rs index c35c64bef..64169323a 100644 --- a/crates/curp/src/server/cmd_board.rs +++ b/crates/curp/src/server/cmd_board.rs @@ -1,3 +1,5 @@ +#![allow(unused)] // TODO remove + use std::{collections::HashMap, sync::Arc}; use event_listener::{Event, EventListener}; @@ -5,7 +7,7 @@ use indexmap::{IndexMap, IndexSet}; use parking_lot::RwLock; use utils::parking_lot_lock::RwLockMap; -use crate::{cmd::Command, rpc::ProposeId}; +use crate::{cmd::Command, rpc::ProposeId, tracker::Tracker}; /// Ref to the cmd board pub(super) type CmdBoardRef = Arc>>; @@ -21,10 +23,10 @@ pub(super) struct CommandBoard { shutdown_notifier: Event, /// Store all notifiers for conf change results conf_notifier: HashMap, + /// The result trackers track all cmd, this is used for dedup + pub(super) trackers: HashMap, /// Store all conf change propose ids pub(super) conf_buffer: IndexSet, - /// The cmd has been received before, this is used for dedup - pub(super) sync: IndexSet, /// Store 
all execution results pub(super) er_buffer: IndexMap>, /// Store all after sync results @@ -38,7 +40,7 @@ impl CommandBoard { er_notifiers: HashMap::new(), asr_notifiers: HashMap::new(), shutdown_notifier: Event::new(), - sync: IndexSet::new(), + trackers: HashMap::new(), er_buffer: IndexMap::new(), asr_buffer: IndexMap::new(), conf_notifier: HashMap::new(), @@ -46,6 +48,16 @@ impl CommandBoard { } } + /// Get the tracker for a client id + pub(super) fn tracker(&mut self, client_id: u64) -> &mut Tracker { + self.trackers.entry(client_id).or_default() + } + + /// Remove client result tracker from trackers if it is expired + pub(super) fn client_expired(&mut self, client_id: u64) { + let _ig = self.trackers.remove(&client_id); + } + /// Release notifiers pub(super) fn release_notifiers(&mut self) { self.er_notifiers.drain().for_each(|(_, event)| { @@ -56,10 +68,11 @@ impl CommandBoard { }); } - /// Clear + /// Clear, called when leader retires pub(super) fn clear(&mut self) { self.er_buffer.clear(); self.asr_buffer.clear(); + self.trackers.clear(); self.release_notifiers(); } @@ -181,14 +194,12 @@ impl CommandBoard { pub(super) async fn wait_for_er_asr( cb: &CmdBoardRef, id: ProposeId, - ) -> (Result, Option>) { + ) -> (Result, Result) { loop { { let cb_r = cb.read(); - match (cb_r.er_buffer.get(&id), cb_r.asr_buffer.get(&id)) { - (Some(er), None) if er.is_err() => return (er.clone(), None), - (Some(er), Some(asr)) => return (er.clone(), Some(asr.clone())), - _ => {} + if let (Some(er), Some(asr)) = (cb_r.er_buffer.get(&id), cb_r.asr_buffer.get(&id)) { + return (er.clone(), asr.clone()); } } let listener = cb.write().asr_listener(id); diff --git a/crates/curp/src/server/cmd_worker/conflict_checked_mpmc.rs b/crates/curp/src/server/cmd_worker/conflict_checked_mpmc.rs deleted file mode 100644 index d30c41a9b..000000000 --- a/crates/curp/src/server/cmd_worker/conflict_checked_mpmc.rs +++ /dev/null @@ -1,600 +0,0 @@ -#![allow( - clippy::wildcard_enum_match_arm, - 
clippy::match_wildcard_for_single_variants -)] // wildcard actually is more clear in this module -#![allow(clippy::arithmetic_side_effects)] // u64 is large enough - -use std::{ - collections::{HashMap, HashSet}, - sync::Arc, -}; - -use tokio::sync::oneshot; -use tracing::{debug, error}; -use utils::task_manager::{tasks::TaskName, Listener, State, TaskManager}; - -use self::cart::Cart; -use super::{CEEvent, CEEventTx}; -use crate::{ - cmd::{Command, CommandExecutor}, - log_entry::{EntryData, LogEntry}, - rpc::ProposeId, - snapshot::{Snapshot, SnapshotMeta}, -}; - -/// Cart -mod cart { - /// Cart is a utility that acts as a temporary container. - /// - /// It is usually filled by the provider and consumed by the customer. - /// - /// This is useful when we are sure that the provider will fill the cart and the cart will be consumed by the customer - /// so that we don't need to check whether there is something in the `Option`. - #[derive(Debug)] - pub(super) struct Cart(Option); - - impl Cart { - /// New cart with object - pub(super) fn new(object: T) -> Self { - Self(Some(object)) - } - /// Take the object. Panic if its inner has already been taken. 
- pub(super) fn take(&mut self) -> T { - #[allow(clippy::expect_used)] - self.0.take().expect("the cart is empty") - } - /// Check whether the object is taken - pub(super) fn is_taken(&self) -> bool { - self.0.is_none() - } - } -} - -/// CE task -pub(in crate::server) struct Task { - /// Corresponding vertex id - vid: u64, - /// Task type - inner: Cart>, -} - -/// Task Type -pub(super) enum TaskType { - /// Execute a cmd - SpecExe(Arc>, Option), - /// After sync a cmd - AS(Arc>, Option), - /// Reset the CE - Reset(Option, oneshot::Sender<()>), - /// Snapshot - Snapshot(SnapshotMeta, oneshot::Sender), -} - -impl Task { - /// Get inner task - pub(super) fn take(&mut self) -> TaskType { - self.inner.take() - } -} - -/// Vertex -#[derive(Debug)] -struct Vertex { - /// Successor cmds that arrive later with keys that conflict this cmd - successors: HashSet, - /// Number of predecessor cmds that arrive earlier with keys that conflict this cmd - predecessor_cnt: u64, - /// Vertex inner - inner: VertexInner, -} - -impl Vertex { - /// Whether two vertex conflict each other - fn is_conflict(&self, other: &Vertex) -> bool { - #[allow(clippy::pattern_type_mismatch)] - // it seems it's impossible to get away with this lint - match (&self.inner, &other.inner) { - ( - VertexInner::Entry { entry: entry1, .. }, - VertexInner::Entry { entry: entry2, .. 
}, - ) => { - let EntryData::Command(ref cmd1) = entry1.entry_data else { - return true; - }; - let EntryData::Command(ref cmd2) = entry2.entry_data else { - return true; - }; - cmd1.is_conflict(cmd2) - } - _ => true, - } - } -} - -/// Vertex inner -#[derive(Debug)] -enum VertexInner { - /// A entry vertex - Entry { - /// Entry - entry: Arc>, - /// Execution state - exe_st: ExeState, - /// After sync state - as_st: AsState, - }, - /// A reset vertex - Reset { - /// The snapshot and finish notifier - inner: Cart<(Box>, oneshot::Sender<()>)>, // use `Box` to avoid enum members with large size - /// Reset state - st: OnceState, - }, - /// A snapshot vertex - Snapshot { - /// The sender - inner: Cart<(SnapshotMeta, oneshot::Sender)>, - /// Snapshot state - st: OnceState, - }, -} - -/// Execute state of a cmd -#[derive(Debug, Clone, Copy)] -enum ExeState { - /// Is ready to execute - ExecuteReady, - /// Executing - Executing, - /// Has been executed, and the result - Executed(bool), -} - -/// After sync state of a cmd -#[derive(Debug, Clone)] -enum AsState { - /// Not Synced yet - NotSynced(Option), - /// Is ready to do after sync - AfterSyncReady(Option), - /// Is doing after syncing - AfterSyncing, - /// Has been after synced - AfterSynced, -} - -impl AsState { - /// set the prepare result into the `AsState` - #[inline] - fn set_prepare_result(&mut self, res: C::PR) { - match *self { - Self::NotSynced(ref mut pre_res) | Self::AfterSyncReady(ref mut pre_res) => { - *pre_res = Some(res); - } - Self::AfterSyncing | Self::AfterSynced => { - unreachable!("Pre-execute result cannot be set in the {:?} stage", *self) - } - } - } -} - -/// State of a vertex that only has one task -#[derive(Debug, PartialEq, Eq)] -enum OnceState { - /// Reset ready - Ready, - /// Resetting - Doing, - /// Completed - Completed, -} - -/// The filter will block any msg if its predecessors(msgs that arrive earlier and conflict with it) haven't finished process -/// -/// Internally it maintains a 
dependency graph of conflicting cmds - -struct Filter { - /// Index from `ProposeId` to `vertex` - cmd_vid: HashMap, - /// Conflict graph - vs: HashMap>, - /// Next vertex id - next_id: u64, - /// Send task to users - filter_tx: flume::Sender>, - /// Command Executor - cmd_executor: Arc, -} - -impl> Filter { - /// Create a new filter that checks conflict in between msgs - fn new(filter_tx: flume::Sender>, ce: Arc) -> Self { - Self { - cmd_vid: HashMap::new(), - vs: HashMap::new(), - next_id: 0, - filter_tx, - cmd_executor: ce, - } - } - - /// Next vertex id - fn next_vertex_id(&mut self) -> u64 { - let new_vid = self.next_id; - self.next_id = self.next_id.wrapping_add(1); - new_vid - } - - /// Insert a new vertex to inner graph - fn insert_new_vertex(&mut self, new_vid: u64, mut new_v: Vertex) { - for v in self.vs.values_mut() { - if v.is_conflict(&new_v) { - assert!(v.successors.insert(new_vid), "cannot insert a vertex twice"); - new_v.predecessor_cnt += 1; - } - } - assert!( - self.vs.insert(new_vid, new_v).is_none(), - "cannot insert a vertex twice" - ); - } - - /// Progress a vertex - fn progress(&mut self, vid: u64, succeeded: bool) { - let v = self.get_vertex_mut(vid); - match v.inner { - VertexInner::Entry { - ref mut exe_st, - ref mut as_st, - .. 
- } => { - if matches!(*exe_st, ExeState::Executing) - && !matches!(*as_st, AsState::AfterSyncing) - { - *exe_st = ExeState::Executed(succeeded); - } else if matches!(*as_st, AsState::AfterSyncing) { - *as_st = AsState::AfterSynced; - } else { - unreachable!("cmd is neither being executed nor being after synced, exe_st: {exe_st:?}, as_st: {as_st:?}") - } - } - VertexInner::Reset { - ref inner, - ref mut st, - } => { - if *st == OnceState::Doing { - debug_assert!(inner.is_taken(), "snapshot and tx is not taken by the user"); - *st = OnceState::Completed; - } else { - unreachable!("reset is not ongoing when it is marked done, reset state: {st:?}") - } - } - VertexInner::Snapshot { - ref inner, - ref mut st, - } => { - if *st == OnceState::Doing { - debug_assert!( - inner.is_taken(), - "snapshot meta and tx is not taken by the user" - ); - *st = OnceState::Completed; - } else { - unreachable!( - "snapshot is not ongoing when it is marked done, reset state: {st:?}" - ) - } - } - } - self.update_graph(vid); - } - - /// Update a graph after a vertex has been updated - fn update_graph(&mut self, vid: u64) { - let vertex_finished = self.update_vertex(vid); - if vertex_finished { - #[allow(clippy::expect_used)] - let v = self - .vs - .remove(&vid) - .expect("no such vertex in conflict graph"); - if let VertexInner::Entry { ref entry, .. 
} = v.inner { - assert!( - self.cmd_vid.remove(&entry.propose_id).is_some(), - "no such cmd" - ); - } - self.update_successors(&v); - } - } - - /// Update a vertex's successors - fn update_successors(&mut self, v: &Vertex) { - for successor_id in v.successors.iter().copied() { - let successor = self.get_vertex_mut(successor_id); - successor.predecessor_cnt -= 1; - assert!( - !self.update_vertex(successor_id), - "successor can't have finished before predecessor" - ); - } - } - - /// Update the vertex, see if it can progress - /// - /// Return true if it can be removed - #[allow(clippy::expect_used, clippy::too_many_lines)] // TODO: split this function - fn update_vertex(&mut self, vid: u64) -> bool { - let v = self - .vs - .get_mut(&vid) - .expect("no such vertex in conflict graph"); - - if v.predecessor_cnt != 0 { - return false; - } - match v.inner { - VertexInner::Entry { - ref entry, - ref mut exe_st, - ref mut as_st, - } => match (*exe_st, as_st.clone()) { - ( - ExeState::ExecuteReady, - AsState::NotSynced(prepare) | AsState::AfterSyncReady(prepare), - ) => { - assert!(prepare.is_none(), "The prepare result of a given cmd can only be calculated when exe_state change from ExecuteReady to Executing"); - let prepare_err = match entry.entry_data { - EntryData::Command(ref cmd) => { - match self.cmd_executor.prepare(cmd.as_ref()) { - Ok(pre_res) => { - as_st.set_prepare_result(pre_res); - None - } - Err(err) => { - self.cmd_executor.trigger(entry.inflight_id()); - Some(err) - } - } - } - EntryData::ConfChange(_) - | EntryData::Shutdown - | EntryData::Empty - | EntryData::SetNodeState(_, _, _) => None, - }; - *exe_st = ExeState::Executing; - let task = Task { - vid, - inner: Cart::new(TaskType::SpecExe(Arc::clone(entry), prepare_err)), - }; - if let Err(e) = self.filter_tx.send(task) { - error!("failed to send task through filter, {e}"); - } - false - } - (ExeState::Executed(true), AsState::AfterSyncReady(prepare)) => { - *as_st = AsState::AfterSyncing; - let task = 
Task { - vid, - inner: Cart::new(TaskType::AS(Arc::clone(entry), prepare)), - }; - if let Err(e) = self.filter_tx.send(task) { - error!("failed to send task through filter, {e}"); - } - false - } - (ExeState::Executed(false), AsState::AfterSyncReady(_)) - | (ExeState::Executed(_), AsState::AfterSynced) => true, - (ExeState::Executing | ExeState::Executed(_), AsState::NotSynced(_)) - | (ExeState::Executing, AsState::AfterSyncReady(_) | AsState::AfterSyncing) - | (ExeState::Executed(true), AsState::AfterSyncing) => false, - (exe_st, as_st) => { - unreachable!("no such exe and as state can be reached: {exe_st:?}, {as_st:?}") - } - }, - VertexInner::Reset { - ref mut inner, - ref mut st, - } => match *st { - OnceState::Ready => { - let (snapshot, tx) = inner.take(); - let task = Task { - vid, - inner: Cart::new(TaskType::Reset(*snapshot, tx)), - }; - *st = OnceState::Doing; - if let Err(e) = self.filter_tx.send(task) { - error!("failed to send task through filter, {e}"); - } - false - } - OnceState::Doing => false, - OnceState::Completed => true, - }, - VertexInner::Snapshot { - ref mut inner, - ref mut st, - } => match *st { - OnceState::Ready => { - let (meta, tx) = inner.take(); - let task = Task { - vid, - inner: Cart::new(TaskType::Snapshot(meta, tx)), - }; - *st = OnceState::Doing; - if let Err(e) = self.filter_tx.send(task) { - error!("failed to send task through filter, {e}"); - } - false - } - OnceState::Doing => false, - OnceState::Completed => true, - }, - } - } - - /// Get vertex from id - fn get_vertex_mut(&mut self, vid: u64) -> &mut Vertex { - #[allow(clippy::expect_used)] - self.vs - .get_mut(&vid) - .expect("no such vertex in conflict graph") - } - - /// Handle event - fn handle_event(&mut self, event: CEEvent) { - debug!("new ce event: {event:?}"); - let vid = match event { - CEEvent::SpecExeReady(entry) => { - let new_vid = self.next_vertex_id(); - assert!( - self.cmd_vid.insert(entry.propose_id, new_vid).is_none(), - "cannot insert a cmd twice" - ); 
- let new_v = Vertex { - successors: HashSet::new(), - predecessor_cnt: 0, - inner: VertexInner::Entry { - exe_st: ExeState::ExecuteReady, - as_st: AsState::NotSynced(None), - entry, - }, - }; - self.insert_new_vertex(new_vid, new_v); - new_vid - } - CEEvent::ASReady(entry) => { - if let Some(vid) = self.cmd_vid.get(&entry.propose_id).copied() { - let v = self.get_vertex_mut(vid); - match v.inner { - VertexInner::Entry { ref mut as_st, .. } => { - let AsState::NotSynced(ref mut prepare) = *as_st else { - unreachable!("after sync state should be AsState::NotSynced but found {as_st:?}"); - }; - *as_st = AsState::AfterSyncReady(prepare.take()); - } - _ => unreachable!("impossible vertex type"), - } - vid - } else { - let new_vid = self.next_vertex_id(); - assert!( - self.cmd_vid.insert(entry.propose_id, new_vid).is_none(), - "cannot insert a cmd twice" - ); - let new_v = Vertex { - successors: HashSet::new(), - predecessor_cnt: 0, - inner: VertexInner::Entry { - exe_st: ExeState::ExecuteReady, - as_st: AsState::AfterSyncReady(None), - entry, - }, - }; - self.insert_new_vertex(new_vid, new_v); - new_vid - } - } - CEEvent::Reset(snapshot, finish_tx) => { - // since a reset is needed, all other vertices doesn't matter anymore, so delete them all - self.cmd_vid.clear(); - self.vs.clear(); - - let new_vid = self.next_vertex_id(); - let new_v = Vertex { - successors: HashSet::new(), - predecessor_cnt: 0, - inner: VertexInner::Reset { - inner: Cart::new((Box::new(snapshot), finish_tx)), - st: OnceState::Ready, - }, - }; - self.insert_new_vertex(new_vid, new_v); - new_vid - } - CEEvent::Snapshot(meta, tx) => { - let new_vid = self.next_vertex_id(); - let new_v = Vertex { - successors: HashSet::new(), - predecessor_cnt: 0, - inner: VertexInner::Snapshot { - inner: Cart::new((meta, tx)), - st: OnceState::Ready, - }, - }; - self.insert_new_vertex(new_vid, new_v); - new_vid - } - }; - self.update_graph(vid); - } -} - -/// Create conflict checked channel. 
The channel guarantees there will be no conflicted msgs received by multiple receivers at the same time. -/// The user should use the `CEEventTx` to send events for command executor. -/// The events will be automatically processed and corresponding ce tasks will be generated and sent through the task receiver. -/// After the task is finished, the user should notify the channel by the done notifier. -// Message flow: -// send_tx -> filter_rx -> filter -> filter_tx -> recv_rx -> done_tx -> done_rx -#[allow(clippy::type_complexity)] // it's clear -pub(in crate::server) fn channel>( - ce: Arc, - task_manager: Arc, -) -> ( - CEEventTx, - flume::Receiver>, - flume::Sender<(Task, bool)>, -) { - // recv from user, insert it into filter - let (send_tx, filter_rx) = flume::unbounded(); - // recv from filter, pass the msg to user - let (filter_tx, recv_rx) = flume::unbounded(); - // recv from user to mark a msg done - let (done_tx, done_rx) = flume::unbounded::<(Task, bool)>(); - task_manager.spawn(TaskName::ConflictCheckedMpmc, |n| { - conflict_checked_mpmc_task(filter_tx, filter_rx, ce, done_rx, n) - }); - let ce_event_tx = CEEventTx(send_tx, task_manager); - (ce_event_tx, recv_rx, done_tx) -} - -/// Conflict checked mpmc task -async fn conflict_checked_mpmc_task>( - filter_tx: flume::Sender>, - filter_rx: flume::Receiver>, - ce: Arc, - done_rx: flume::Receiver<(Task, bool)>, - shutdown_listener: Listener, -) { - let mut filter = Filter::new(filter_tx, ce); - let mut is_shutdown_state = false; - // tokio internal triggers - #[allow(clippy::arithmetic_side_effects, clippy::pattern_type_mismatch)] - loop { - tokio::select! 
{ - biased; // cleanup filter first so that the buffer in filter can be kept as small as possible - state = shutdown_listener.wait_state(), if !is_shutdown_state => { - match state { - State::Running => unreachable!("wait state should not return Run"), - State::Shutdown => return, - State::ClusterShutdown => is_shutdown_state = true, - } - }, - Ok((task, succeeded)) = done_rx.recv_async() => { - filter.progress(task.vid, succeeded); - }, - Ok(event) = filter_rx.recv_async() => { - filter.handle_event(event); - }, - else => { - error!("mpmc channel stopped unexpectedly"); - return; - } - } - - if is_shutdown_state && filter.vs.is_empty() { - shutdown_listener.mark_mpmc_channel_shutdown(); - return; - } - } -} diff --git a/crates/curp/src/server/cmd_worker/mod.rs b/crates/curp/src/server/cmd_worker/mod.rs index bb73e6a0a..d70cc20e7 100644 --- a/crates/curp/src/server/cmd_worker/mod.rs +++ b/crates/curp/src/server/cmd_worker/mod.rs @@ -1,257 +1,254 @@ //! `exe` stands for execution //! `as` stands for after sync -use std::{fmt::Debug, iter, sync::Arc}; +use std::sync::Arc; -use async_trait::async_trait; -use clippy_utilities::NumericCast; -#[cfg(test)] -use mockall::automock; +use curp_external_api::cmd::{AfterSyncCmd, AfterSyncOk}; use tokio::sync::oneshot; use tracing::{debug, error, info, warn}; -use utils::task_manager::{tasks::TaskName, Listener, TaskManager}; -use self::conflict_checked_mpmc::Task; -use super::raw_curp::RawCurp; +use super::{curp_node::AfterSyncEntry, raw_curp::RawCurp}; use crate::{ cmd::{Command, CommandExecutor}, log_entry::{EntryData, LogEntry}, + response::ResponseSender, role_change::RoleChange, - rpc::{ConfChangeType, PoolEntry}, - server::cmd_worker::conflict_checked_mpmc::TaskType, + rpc::{ConfChangeType, PoolEntry, ProposeId, ProposeResponse, SyncedResponse}, snapshot::{Snapshot, SnapshotMeta}, }; -/// The special conflict checked mpmc -pub(super) mod conflict_checked_mpmc; - -/// Event for command executor -pub(super) enum CEEvent { - 
/// The cmd is ready for speculative execution - SpecExeReady(Arc>), - /// The cmd is ready for after sync - ASReady(Arc>), - /// Reset the command executor, send(()) when finishes - Reset(Option, oneshot::Sender<()>), - /// Take a snapshot - Snapshot(SnapshotMeta, oneshot::Sender), -} - -impl Debug for CEEvent { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match *self { - Self::SpecExeReady(ref entry) => f.debug_tuple("SpecExeReady").field(entry).finish(), - Self::ASReady(ref entry) => f.debug_tuple("ASReady").field(entry).finish(), - Self::Reset(ref ss, _) => { - if ss.is_none() { - write!(f, "Reset(None)") - } else { - write!(f, "Reset(Some(_))") - } - } - Self::Snapshot(meta, _) => f.debug_tuple("Snapshot").field(&meta).finish(), - } +/// Removes an entry from sp and ucp +fn remove_from_sp_ucp(curp: &RawCurp, entries: I) +where + C: Command, + RC: RoleChange, + E: AsRef>, + I: IntoIterator, +{ + let (mut sp, mut ucp) = (curp.spec_pool().lock(), curp.uncommitted_pool().lock()); + for entry in entries { + let entry = entry.as_ref(); + if let EntryData::Command(ref c) = entry.entry_data { + let pool_entry = PoolEntry::new(entry.propose_id, Arc::clone(c)); + sp.remove(&pool_entry); + ucp.remove(&pool_entry); + }; } } -/// Worker that execute commands -async fn cmd_worker, RC: RoleChange>( - dispatch_rx: impl TaskRxApi, - done_tx: flume::Sender<(Task, bool)>, - curp: Arc>, - ce: Arc, - shutdown_listener: Listener, -) { - #[allow(clippy::arithmetic_side_effects, clippy::ignored_unit_patterns)] - // introduced by tokio select - loop { - tokio::select! 
{ - task = dispatch_rx.recv() => { - let Ok(task) = task else { - return; - }; - handle_task(task, &done_tx, ce.as_ref(), curp.as_ref()).await; - } - _ = shutdown_listener.wait() => break, - } - } - while let Ok(task) = dispatch_rx.try_recv() { - handle_task(task, &done_tx, ce.as_ref(), curp.as_ref()).await; - } - debug!("cmd worker exits"); -} +/// ER and ASR +type ErAsr = (::ER, Option<::ASR>); -/// Handle task -async fn handle_task, RC: RoleChange>( - mut task: Task, - done_tx: &flume::Sender<(Task, bool)>, +/// Cmd worker execute handler +pub(super) fn execute, RC: RoleChange>( + entry: &LogEntry, ce: &CE, curp: &RawCurp, -) { - let succeeded = match task.take() { - TaskType::SpecExe(entry, pre_err) => worker_exe(entry, pre_err, ce, curp).await, - TaskType::AS(entry, prepare) => worker_as(entry, prepare, ce, curp).await, - TaskType::Reset(snapshot, finish_tx) => worker_reset(snapshot, finish_tx, ce, curp).await, - TaskType::Snapshot(meta, tx) => worker_snapshot(meta, tx, ce, curp).await, +) -> Result, ::Error> { + let cb = curp.cmd_board(); + let id = curp.id(); + let EntryData::Command(ref cmd) = entry.entry_data else { + unreachable!("should not speculative execute {:?}", entry.entry_data); }; - if let Err(e) = done_tx.send((task, succeeded)) { - if !curp.is_shutdown() { - error!("can't mark a task done, the channel could be closed, {e}"); - } + if cmd.is_read_only() { + ce.execute_ro(cmd).map(|(er, asr)| (er, Some(asr))) + } else { + let er = ce.execute(cmd); + let mut cb_w = cb.write(); + cb_w.insert_er(entry.propose_id, er.clone()); + debug!( + "{id} cmd({}) is speculatively executed, exe status: {}", + entry.propose_id, + er.is_ok(), + ); + er.map(|e| (e, None)) } } -/// Cmd worker execute handler -async fn worker_exe, RC: RoleChange>( - entry: Arc>, - pre_err: Option, +/// After sync cmd entries +#[allow(clippy::pattern_type_mismatch)] // Can't be fixed +fn after_sync_cmds, RC: RoleChange>( + cmd_entries: &[AfterSyncEntry], ce: &CE, curp: &RawCurp, -) -> 
bool { - let (cb, sp, ucp) = (curp.cmd_board(), curp.spec_pool(), curp.uncommitted_pool()); - let id = curp.id(); - let success = match entry.entry_data { - EntryData::Command(ref cmd) => { - let er = if let Some(err_msg) = pre_err { - Err(err_msg) - } else { - ce.execute(cmd).await +) { + if cmd_entries.is_empty() { + return; + } + info!("after sync: {cmd_entries:?}"); + let resp_txs = cmd_entries + .iter() + .map(|(_, tx)| tx.as_ref().map(AsRef::as_ref)); + let highest_index = cmd_entries + .last() + .map_or_else(|| unreachable!(), |(entry, _)| entry.index); + let cmds: Vec<_> = cmd_entries + .iter() + .map(|(entry, tx)| { + let EntryData::Command(ref cmd) = entry.entry_data else { + unreachable!("only allows command entry"); }; - let er_ok = er.is_ok(); - cb.write().insert_er(entry.propose_id, er); - if !er_ok { - sp.lock() - .remove(PoolEntry::new(entry.propose_id, Arc::clone(cmd))); - if curp.is_leader() { - ucp.lock() - .remove(PoolEntry::new(entry.propose_id, Arc::clone(cmd))); - } + AfterSyncCmd::new( + cmd.as_ref(), + // If the response sender is absent, it indicates that a new leader + // has been elected, and the entry has been recovered from the log + // or the speculative pool. In such cases, these entries needs to + // be re-executed. 
+ tx.as_ref().map_or(true, |t| t.is_conflict()), + ) + }) + .collect(); + let propose_ids = cmd_entries.iter().map(|(e, _)| e.propose_id); + + let results = ce.after_sync(cmds, Some(highest_index)); + + send_results(curp, results.into_iter(), resp_txs, propose_ids); + + for (entry, _) in cmd_entries { + curp.trigger(&entry.propose_id); + ce.trigger(entry.inflight_id()); + } + remove_from_sp_ucp(curp, cmd_entries.iter().map(|(e, _)| e)); +} + +/// Send cmd results to clients +fn send_results<'a, C, RC, R, S, P>(curp: &RawCurp, results: R, txs: S, propose_ids: P) +where + C: Command, + RC: RoleChange, + R: Iterator, C::Error>>, + S: Iterator>, + P: Iterator, +{ + let cb = curp.cmd_board(); + let mut cb_w = cb.write(); + + for ((result, tx_opt), id) in results.zip(txs).zip(propose_ids) { + match result { + Ok(r) => { + let (asr, er_opt) = r.into_parts(); + let _ignore_er = tx_opt.as_ref().zip(er_opt.as_ref()).map(|(tx, er)| { + tx.send_propose(ProposeResponse::new_result::(&Ok(er.clone()), true)); + }); + let _ignore = er_opt.map(|er| cb_w.insert_er(id, Ok(er))); + let _ignore_asr = tx_opt + .as_ref() + .map(|tx| tx.send_synced(SyncedResponse::new_result::(&Ok(asr.clone())))); + cb_w.insert_asr(id, Ok(asr)); + } + Err(e) => { + let _ignore = tx_opt + .as_ref() + .map(|tx| tx.send_synced(SyncedResponse::new_result::(&Err(e.clone())))); + cb_w.insert_asr(id, Err(e.clone())); } - debug!( - "{id} cmd({}) is speculatively executed, exe status: {er_ok}", - entry.propose_id - ); - er_ok } - EntryData::ConfChange(_) - | EntryData::Shutdown - | EntryData::Empty - | EntryData::SetNodeState(_, _, _) => true, - }; - if !success { - ce.trigger(entry.inflight_id()); } - success } -/// Cmd worker after sync handler -async fn worker_as, RC: RoleChange>( - entry: Arc>, - prepare: Option, +/// After sync entries other than cmd +async fn after_sync_others, RC: RoleChange>( + others: Vec>, ce: &CE, curp: &RawCurp, -) -> bool { - let (cb, sp, ucp) = (curp.cmd_board(), curp.spec_pool(), 
curp.uncommitted_pool()); +) { let id = curp.id(); - let success = match entry.entry_data { - EntryData::Command(ref cmd) => { - let Some(prepare) = prepare else { - unreachable!("prepare should always be Some(_) when entry is a command"); - }; - let asr = ce.after_sync(cmd.as_ref(), entry.index, prepare).await; - let asr_ok = asr.is_ok(); - cb.write().insert_asr(entry.propose_id, asr); - sp.lock() - .remove(PoolEntry::new(entry.propose_id, Arc::clone(cmd))); - if curp.is_leader() { - ucp.lock() - .remove(PoolEntry::new(entry.propose_id, Arc::clone(cmd))); - } - debug!("{id} cmd({}) after sync is called", entry.propose_id); - asr_ok - } - EntryData::Shutdown => { - curp.task_manager().cluster_shutdown(); - if curp.is_leader() { - curp.task_manager().mark_leader_notified(); - } - if let Err(e) = ce.set_last_applied(entry.index) { - error!("failed to set last_applied, {e}"); - } - cb.write().notify_shutdown(); - true - } - EntryData::ConfChange(ref conf_change) => { - if let Err(e) = ce.set_last_applied(entry.index) { - error!("failed to set last_applied, {e}"); - return false; - } - let change = conf_change.first().unwrap_or_else(|| { - unreachable!("conf change should always have at least one change") - }); - let shutdown_self = - change.change_type() == ConfChangeType::Remove && change.node_id == id; - cb.write().insert_conf(entry.propose_id); - sp.lock() - .remove(PoolEntry::new(entry.propose_id, conf_change.clone())); - if curp.is_leader() { - ucp.lock() - .remove(PoolEntry::new(entry.propose_id, conf_change.clone())); + let cb = curp.cmd_board(); + #[allow(clippy::pattern_type_mismatch)] // Can't be fixed + for (entry, resp_tx) in others { + match (&entry.entry_data, resp_tx) { + (EntryData::Shutdown, _) => { + curp.task_manager().cluster_shutdown(); + if curp.is_leader() { + curp.task_manager().mark_leader_notified(); + } + if let Err(e) = ce.set_last_applied(entry.index) { + error!("failed to set last_applied, {e}"); + } + cb.write().notify_shutdown(); } - if 
shutdown_self { - if let Some(maybe_new_leader) = curp.pick_new_leader() { - info!( - "the old leader {} will shutdown, try to move leadership to {}", - id, maybe_new_leader - ); - if curp - .handle_move_leader(maybe_new_leader) - .unwrap_or_default() - { - if let Err(e) = curp - .connects() - .get(&maybe_new_leader) - .unwrap_or_else(|| { - unreachable!("connect to {} should exist", maybe_new_leader) - }) - .try_become_leader_now(curp.cfg().wait_synced_timeout) - .await + (EntryData::ConfChange(ref conf_change), _) => { + if let Err(e) = ce.set_last_applied(entry.index) { + error!("failed to set last_applied, {e}"); + return; + } + let change = conf_change.first().unwrap_or_else(|| { + unreachable!("conf change should always have at least one change") + }); + let shutdown_self = + change.change_type() == ConfChangeType::Remove && change.node_id == id; + cb.write().insert_conf(entry.propose_id); + remove_from_sp_ucp(curp, Some(&entry)); + if shutdown_self { + if let Some(maybe_new_leader) = curp.pick_new_leader() { + info!( + "the old leader {} will shutdown, try to move leadership to {}", + id, maybe_new_leader + ); + if curp + .handle_move_leader(maybe_new_leader) + .unwrap_or_default() { - warn!( - "{} send try become leader now to {} failed: {:?}", - curp.id(), - maybe_new_leader, - e - ); - }; - } - } else { - info!( + if let Err(e) = curp + .connects() + .get(&maybe_new_leader) + .unwrap_or_else(|| { + unreachable!("connect to {} should exist", maybe_new_leader) + }) + .try_become_leader_now(curp.cfg().wait_synced_timeout) + .await + { + warn!( + "{} send try become leader now to {} failed: {:?}", + curp.id(), + maybe_new_leader, + e + ); + }; + } + } else { + info!( "the old leader {} will shutdown, but no other node can be the leader now", id ); + } + curp.task_manager().shutdown(false).await; } - curp.task_manager().shutdown(false).await; } - true - } - EntryData::SetNodeState(node_id, ref name, ref client_urls) => { - if let Err(e) = 
ce.set_last_applied(entry.index) { - error!("failed to set last_applied, {e}"); - return false; + (EntryData::SetNodeState(node_id, ref name, ref client_urls), _) => { + info!("setting node state: {node_id}, urls: {:?}", client_urls); + if let Err(e) = ce.set_last_applied(entry.index) { + error!("failed to set last_applied, {e}"); + return; + } + curp.cluster() + .set_node_state(*node_id, name.clone(), client_urls.clone()); } - curp.cluster() - .set_node_state(node_id, name.clone(), client_urls.clone()); - true + // The no-op command has been applied to state machine + (EntryData::Empty, _) => curp.set_no_op_applied(), + _ => unreachable!(), } - EntryData::Empty => true, - }; - ce.trigger(entry.inflight_id()); - success + ce.trigger(entry.inflight_id()); + debug!("{id} cmd({}) after sync is called", entry.propose_id); + } +} + +/// Cmd worker after sync handler +pub(super) async fn after_sync, RC: RoleChange>( + entries: Vec>, + ce: &CE, + curp: &RawCurp, +) { + #[allow(clippy::pattern_type_mismatch)] // Can't be fixed + let (cmd_entries, others): (Vec<_>, Vec<_>) = entries + .into_iter() + .partition(|(entry, _)| matches!(entry.entry_data, EntryData::Command(_))); + after_sync_cmds(&cmd_entries, ce, curp); + after_sync_others(others, ce, curp).await; } /// Cmd worker reset handler -async fn worker_reset, RC: RoleChange>( +pub(super) async fn worker_reset, RC: RoleChange>( snapshot: Option, finish_tx: oneshot::Sender<()>, ce: &CE, @@ -287,7 +284,7 @@ async fn worker_reset, RC: RoleChange>( } /// Cmd worker snapshot handler -async fn worker_snapshot, RC: RoleChange>( +pub(super) async fn worker_snapshot, RC: RoleChange>( meta: SnapshotMeta, tx: oneshot::Sender, ce: &CE, @@ -313,574 +310,3 @@ async fn worker_snapshot, RC: RoleChange>( } } } - -/// Send event to background command executor workers -#[derive(Debug, Clone)] -pub(super) struct CEEventTx(flume::Sender>, Arc); - -/// Recv cmds that need to be executed -#[derive(Clone)] -struct TaskRx(flume::Receiver>); - 
-/// Send cmd to background execution worker -#[cfg_attr(test, automock)] -pub(crate) trait CEEventTxApi: Send + Sync + 'static { - /// Send cmd to background cmd worker for speculative execution - fn send_sp_exe(&self, entry: Arc>); - - /// Send after sync event to the background cmd worker so that after sync can be called - fn send_after_sync(&self, entry: Arc>); - - /// Send reset - fn send_reset(&self, snapshot: Option) -> oneshot::Receiver<()>; - - /// Send snapshot - fn send_snapshot(&self, meta: SnapshotMeta) -> oneshot::Receiver; -} - -impl CEEventTx { - /// Send ce event - fn send_event(&self, event: CEEvent) { - if let Err(e) = self.0.send(event) { - if self.1.is_shutdown() { - info!("send event after current node shutdown"); - return; - } - error!("failed to send cmd exe event to background cmd worker, {e}"); - } - } -} - -impl CEEventTxApi for CEEventTx { - fn send_sp_exe(&self, entry: Arc>) { - let event = CEEvent::SpecExeReady(Arc::clone(&entry)); - self.send_event(event); - } - - fn send_after_sync(&self, entry: Arc>) { - let event = CEEvent::ASReady(Arc::clone(&entry)); - self.send_event(event); - } - - fn send_reset(&self, snapshot: Option) -> oneshot::Receiver<()> { - let (tx, rx) = oneshot::channel(); - let event = CEEvent::Reset(snapshot, tx); - self.send_event(event); - rx - } - - fn send_snapshot(&self, meta: SnapshotMeta) -> oneshot::Receiver { - let (tx, rx) = oneshot::channel(); - let event = CEEvent::Snapshot(meta, tx); - self.send_event(event); - rx - } -} - -/// Cmd exe recv interface -#[cfg_attr(test, automock)] -#[async_trait] -trait TaskRxApi { - /// Recv execute msg and done notifier - async fn recv(&self) -> Result, flume::RecvError>; - /// Try recv execute msg and done notifier - fn try_recv(&self) -> Result, flume::TryRecvError>; -} - -#[async_trait] -impl TaskRxApi for TaskRx { - async fn recv(&self) -> Result, flume::RecvError> { - self.0.recv_async().await - } - - fn try_recv(&self) -> Result, flume::TryRecvError> { - 
self.0.try_recv() - } -} - -/// Run cmd execute workers. Each cmd execute worker will continually fetch task to perform from `task_rx`. -pub(super) fn start_cmd_workers, RC: RoleChange>( - cmd_executor: Arc, - curp: Arc>, - task_rx: flume::Receiver>, - done_tx: flume::Sender<(Task, bool)>, -) { - let n_workers: usize = curp.cfg().cmd_workers.numeric_cast(); - let task_manager = curp.task_manager(); - #[allow(clippy::shadow_unrelated)] // false positive - iter::repeat((task_rx, done_tx, curp, cmd_executor)) - .take(n_workers) - .for_each(|(task_rx, done_tx, curp, ce)| { - task_manager.spawn(TaskName::CmdWorker, |n| { - cmd_worker(TaskRx(task_rx), done_tx, curp, ce, n) - }); - }); -} - -#[cfg(test)] -mod tests { - use std::time::Duration; - - use curp_test_utils::{ - mock_role_change, sleep_millis, sleep_secs, - test_cmd::{TestCE, TestCommand}, - }; - use test_macros::abort_on_panic; - use tokio::{sync::mpsc, time::Instant}; - use tracing_test::traced_test; - use utils::config::EngineConfig; - - use super::*; - use crate::{log_entry::LogEntry, rpc::ProposeId}; - - // This should happen in fast path in most cases - #[traced_test] - #[tokio::test] - #[abort_on_panic] - async fn fast_path_normal() { - let (er_tx, mut er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut as_rx) = mpsc::unbounded_channel(); - let ce = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let task_manager = Arc::new(TaskManager::new()); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce), Arc::clone(&task_manager)); - start_cmd_workers( - Arc::clone(&ce), - Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager), - )), - task_rx, - done_tx, - ); - - let entry = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new(TestCommand::default()), - )); - - ce_event_tx.send_sp_exe(Arc::clone(&entry)); - assert_eq!(er_rx.recv().await.unwrap().1.values, Vec::::new()); 
- - ce_event_tx.send_after_sync(entry); - assert_eq!(as_rx.recv().await.unwrap().1, 1); - task_manager.shutdown(true).await; - } - - // When the execution takes more time than sync, `as` should be called after exe has finished - #[traced_test] - #[tokio::test] - #[abort_on_panic] - async fn fast_path_cond1() { - let (er_tx, _er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut as_rx) = mpsc::unbounded_channel(); - let ce = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let task_manager = Arc::new(TaskManager::new()); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce), Arc::clone(&task_manager)); - start_cmd_workers( - Arc::clone(&ce), - Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager), - )), - task_rx, - done_tx, - ); - - let begin = Instant::now(); - let entry = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new(TestCommand::default().set_exe_dur(Duration::from_secs(1))), - )); - - ce_event_tx.send_sp_exe(Arc::clone(&entry)); - - // at 500ms, sync has completed, call after sync, then needs_as will be updated - sleep_millis(500).await; - ce_event_tx.send_after_sync(entry); - - assert_eq!(as_rx.recv().await.unwrap().1, 1); - - assert!((Instant::now() - begin) >= Duration::from_secs(1)); - task_manager.shutdown(true).await; - } - - // When the execution takes more time than sync and fails, after sync should not be called - #[traced_test] - #[tokio::test] - #[abort_on_panic] - async fn fast_path_cond2() { - let (er_tx, mut er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut as_rx) = mpsc::unbounded_channel(); - let ce = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let task_manager = Arc::new(TaskManager::new()); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce), Arc::clone(&task_manager)); - start_cmd_workers( - Arc::clone(&ce), - 
Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager), - )), - task_rx, - done_tx, - ); - - let entry = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new( - TestCommand::default() - .set_exe_dur(Duration::from_secs(1)) - .set_exe_should_fail(), - ), - )); - - ce_event_tx.send_sp_exe(Arc::clone(&entry)); - - // at 500ms, sync has completed - sleep_millis(500).await; - ce_event_tx.send_after_sync(entry); - - // at 1500ms, as should not be called - sleep_secs(1).await; - assert!(er_rx.try_recv().is_err()); - assert!(as_rx.try_recv().is_err()); - task_manager.shutdown(true).await; - } - - // This should happen in slow path in most cases - #[traced_test] - #[tokio::test] - #[abort_on_panic] - async fn slow_path_normal() { - let (er_tx, mut er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut as_rx) = mpsc::unbounded_channel(); - let ce = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let task_manager = Arc::new(TaskManager::new()); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce), Arc::clone(&task_manager)); - start_cmd_workers( - Arc::clone(&ce), - Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager), - )), - task_rx, - done_tx, - ); - - let entry = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new(TestCommand::default()), - )); - - ce_event_tx.send_after_sync(entry); - - assert_eq!(er_rx.recv().await.unwrap().1.revisions, Vec::::new()); - assert_eq!(as_rx.recv().await.unwrap().1, 1); - task_manager.shutdown(true).await; - } - - // When exe fails - #[traced_test] - #[tokio::test] - #[abort_on_panic] - async fn slow_path_exe_fails() { - let (er_tx, mut er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut as_rx) = mpsc::unbounded_channel(); - let ce = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let 
task_manager = Arc::new(TaskManager::new()); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce), Arc::clone(&task_manager)); - start_cmd_workers( - Arc::clone(&ce), - Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager), - )), - task_rx, - done_tx, - ); - - let entry = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new(TestCommand::default().set_exe_should_fail()), - )); - - ce_event_tx.send_after_sync(entry); - - sleep_millis(100).await; - let er = er_rx.try_recv(); - assert!(er.is_err(), "The execute command result is {er:?}"); - let asr = as_rx.try_recv(); - assert!(asr.is_err(), "The after sync result is {asr:?}"); - task_manager.shutdown(true).await; - } - - // If cmd1 and cmd2 conflict, order will be (cmd1 exe) -> (cmd1 as) -> (cmd2 exe) -> (cmd2 as) - #[traced_test] - #[tokio::test] - #[abort_on_panic] - async fn conflict_cmd_order() { - let (er_tx, mut er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut as_rx) = mpsc::unbounded_channel(); - let ce = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let task_manager = Arc::new(TaskManager::new()); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce), Arc::clone(&task_manager)); - start_cmd_workers( - Arc::clone(&ce), - Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager), - )), - task_rx, - done_tx, - ); - - let entry1 = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new(TestCommand::new_put(vec![1], 1)), - )); - let entry2 = Arc::new(LogEntry::new( - 2, - 1, - ProposeId(0, 1), - Arc::new(TestCommand::new_get(vec![1])), - )); - - ce_event_tx.send_sp_exe(Arc::clone(&entry1)); - ce_event_tx.send_sp_exe(Arc::clone(&entry2)); - - // cmd1 exe done - assert_eq!(er_rx.recv().await.unwrap().1.revisions, Vec::::new()); - - sleep_millis(100).await; - - // cmd2 will not be 
executed - assert!(er_rx.try_recv().is_err()); - assert!(as_rx.try_recv().is_err()); - - // cmd1 and cmd2 after sync - ce_event_tx.send_after_sync(entry1); - ce_event_tx.send_after_sync(entry2); - - assert_eq!(er_rx.recv().await.unwrap().1.revisions, vec![1]); - assert_eq!(as_rx.recv().await.unwrap().1, 1); - assert_eq!(as_rx.recv().await.unwrap().1, 2); - task_manager.shutdown(true).await; - } - - #[traced_test] - #[tokio::test] - #[abort_on_panic] - async fn reset_will_wipe_all_states_and_outdated_cmds() { - let (er_tx, mut er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut as_rx) = mpsc::unbounded_channel(); - let ce = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let task_manager = Arc::new(TaskManager::new()); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce), Arc::clone(&task_manager)); - start_cmd_workers( - Arc::clone(&ce), - Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager), - )), - task_rx, - done_tx, - ); - - let entry1 = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new(TestCommand::new_put(vec![1], 1).set_as_dur(Duration::from_millis(50))), - )); - let entry2 = Arc::new(LogEntry::new( - 2, - 1, - ProposeId(0, 1), - Arc::new(TestCommand::new_get(vec![1])), - )); - ce_event_tx.send_sp_exe(Arc::clone(&entry1)); - ce_event_tx.send_sp_exe(Arc::clone(&entry2)); - - assert_eq!(er_rx.recv().await.unwrap().1.revisions, Vec::::new()); - - ce_event_tx.send_reset(None); - - let entry3 = Arc::new(LogEntry::new( - 3, - 1, - ProposeId(0, 2), - Arc::new(TestCommand::new_get(vec![1])), - )); - - ce_event_tx.send_after_sync(entry3); - - assert_eq!(er_rx.recv().await.unwrap().1.revisions, Vec::::new()); - - // there will be only one after sync results - assert!(as_rx.recv().await.is_some()); - assert!(as_rx.try_recv().is_err()); - task_manager.shutdown(true).await; - } - - #[traced_test] - #[tokio::test] - 
#[abort_on_panic] - async fn test_snapshot() { - let task_manager1 = Arc::new(TaskManager::new()); - let task_manager2 = Arc::new(TaskManager::new()); - - // ce1 - let (er_tx, mut _er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut _as_rx) = mpsc::unbounded_channel(); - let ce1 = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce1), Arc::clone(&task_manager1)); - let curp = RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager1), - ); - let s2_id = curp.cluster().get_id_by_name("S2").unwrap(); - curp.handle_append_entries( - 1, - s2_id, - 0, - 0, - vec![LogEntry::new( - 1, - 1, - ProposeId(0, 0), - Arc::new(TestCommand::default()), - )], - 0, - ) - .unwrap(); - start_cmd_workers(Arc::clone(&ce1), Arc::new(curp), task_rx, done_tx); - - let entry = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 1), - Arc::new(TestCommand::new_put(vec![1], 1).set_exe_dur(Duration::from_millis(50))), - )); - - ce_event_tx.send_after_sync(entry); - - let snapshot = ce_event_tx - .send_snapshot(SnapshotMeta { - last_included_index: 1, - last_included_term: 0, - }) - .await - .unwrap(); - - // ce2 - let (er_tx, mut er_rx) = mpsc::unbounded_channel(); - let (as_tx, mut _as_rx) = mpsc::unbounded_channel(); - let ce2 = Arc::new(TestCE::new( - "S1".to_owned(), - er_tx, - as_tx, - EngineConfig::Memory, - )); - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&ce2), Arc::clone(&task_manager2)); - start_cmd_workers( - Arc::clone(&ce2), - Arc::new(RawCurp::new_test( - 3, - ce_event_tx.clone(), - mock_role_change(), - Arc::clone(&task_manager2), - )), - task_rx, - done_tx, - ); - - ce_event_tx.send_reset(Some(snapshot)).await.unwrap(); - - let entry = Arc::new(LogEntry::new( - 1, - 1, - ProposeId(0, 2), - Arc::new(TestCommand::new_get(vec![1])), - )); - ce_event_tx.send_after_sync(entry); - 
assert_eq!(er_rx.recv().await.unwrap().1.revisions, vec![1]); - task_manager1.shutdown(true).await; - task_manager2.shutdown(true).await; - } -} diff --git a/crates/curp/src/server/conflict/mod.rs b/crates/curp/src/server/conflict/mod.rs index 4a13f9dad..08fb96d65 100644 --- a/crates/curp/src/server/conflict/mod.rs +++ b/crates/curp/src/server/conflict/mod.rs @@ -10,150 +10,3 @@ mod tests; /// Conflict pool used in tests #[doc(hidden)] pub mod test_pools; - -use std::{ops::Deref, sync::Arc}; - -use curp_external_api::conflict::EntryId; - -use crate::rpc::{ConfChange, PoolEntry, PoolEntryInner, ProposeId}; - -// TODO: relpace `PoolEntry` with this -/// Entry stored in conflict pools -pub(super) enum ConflictPoolEntry { - /// A command entry - Command(CommandEntry), - /// A conf change entry - ConfChange(ConfChangeEntry), -} - -impl From> for ConflictPoolEntry { - fn from(entry: PoolEntry) -> Self { - match entry.inner { - PoolEntryInner::Command(c) => ConflictPoolEntry::Command(CommandEntry { - id: entry.id, - cmd: c, - }), - PoolEntryInner::ConfChange(c) => ConflictPoolEntry::ConfChange(ConfChangeEntry { - id: entry.id, - conf_change: c, - }), - } - } -} - -/// Command entry type -#[derive(Debug)] -pub struct CommandEntry { - /// The propose id - id: ProposeId, - /// The command - cmd: Arc, -} - -impl CommandEntry { - /// Creates a new `CommandEntry` - #[inline] - pub fn new(id: ProposeId, cmd: Arc) -> Self { - Self { id, cmd } - } -} - -impl EntryId for CommandEntry { - type Id = ProposeId; - - #[inline] - fn id(&self) -> Self::Id { - self.id - } -} - -impl Clone for CommandEntry { - #[inline] - fn clone(&self) -> Self { - Self { - id: self.id, - cmd: Arc::clone(&self.cmd), - } - } -} - -impl Deref for CommandEntry { - type Target = C; - - #[inline] - fn deref(&self) -> &Self::Target { - &self.cmd - } -} - -impl AsRef for CommandEntry { - #[inline] - fn as_ref(&self) -> &C { - self.cmd.as_ref() - } -} - -impl std::hash::Hash for CommandEntry { - #[inline] - fn 
hash(&self, state: &mut H) { - self.id.hash(state); - } -} - -impl PartialEq for CommandEntry { - #[inline] - fn eq(&self, other: &Self) -> bool { - self.id.eq(&other.id) - } -} - -impl Eq for CommandEntry {} - -impl PartialOrd for CommandEntry { - #[inline] - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for CommandEntry { - #[inline] - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.id.cmp(&other.id) - } -} - -impl From> for PoolEntry { - fn from(entry: CommandEntry) -> Self { - PoolEntry { - id: entry.id, - inner: PoolEntryInner::Command(entry.cmd), - } - } -} - -/// Conf change entry type -#[derive(Clone, PartialEq)] -pub(super) struct ConfChangeEntry { - /// The propose id - id: ProposeId, - /// The conf change entry - conf_change: Vec, -} - -impl EntryId for ConfChangeEntry { - type Id = ProposeId; - - fn id(&self) -> Self::Id { - self.id - } -} - -impl From for PoolEntry { - fn from(entry: ConfChangeEntry) -> Self { - PoolEntry { - id: entry.id, - inner: PoolEntryInner::ConfChange(entry.conf_change), - } - } -} diff --git a/crates/curp/src/server/conflict/spec_pool_new.rs b/crates/curp/src/server/conflict/spec_pool_new.rs index c17acf9fe..97cded6f3 100644 --- a/crates/curp/src/server/conflict/spec_pool_new.rs +++ b/crates/curp/src/server/conflict/spec_pool_new.rs @@ -1,17 +1,22 @@ -use curp_external_api::conflict::{ConflictPoolOp, SpeculativePoolOp}; +use std::{collections::HashMap, sync::Arc}; -use super::{CommandEntry, ConfChangeEntry, ConflictPoolEntry}; -use crate::rpc::PoolEntry; +use curp_external_api::conflict::SpeculativePoolOp; +use parking_lot::Mutex; + +use crate::rpc::{PoolEntry, ProposeId}; + +/// Ref to `SpeculativePool` +pub(crate) type SpeculativePoolRef = Arc>>; /// A speculative pool object -pub type SpObject = Box> + Send + 'static>; +pub type SpObject = Box> + Send + 'static>; /// Union type of `SpeculativePool` objects pub(crate) struct SpeculativePool { /// Command speculative 
pools command_sps: Vec>, - /// Conf change speculative pool - conf_change_sp: ConfChangeSp, + /// propose id to entry mapping + entries: HashMap>, } impl SpeculativePool { @@ -19,51 +24,38 @@ impl SpeculativePool { pub(crate) fn new(command_sps: Vec>) -> Self { Self { command_sps, - conf_change_sp: ConfChangeSp::default(), + entries: HashMap::new(), } } /// Inserts an entry into the pool + #[allow(clippy::needless_pass_by_value)] // we need to consume the entry pub(crate) fn insert(&mut self, entry: PoolEntry) -> Option> { - if !self.conf_change_sp.is_empty() { - return Some(entry); - } - - match ConflictPoolEntry::from(entry) { - ConflictPoolEntry::Command(c) => { - for csp in &mut self.command_sps { - if let Some(e) = csp.insert_if_not_conflict(c.clone()) { - return Some(e.into()); - } - } - } - ConflictPoolEntry::ConfChange(c) => { - if !self - .command_sps - .iter() - .map(AsRef::as_ref) - .all(ConflictPoolOp::is_empty) - { - return Some(c.into()); - } - let _ignore = self.conf_change_sp.insert_if_not_conflict(c); + for csp in &mut self.command_sps { + if let Some(e) = csp.insert_if_not_conflict(entry.clone()) { + return Some(e); } } + let _ignore = self.entries.insert(entry.id, entry); + None } - // TODO: Use reference instead of clone /// Removes an entry from the pool - pub(crate) fn remove(&mut self, entry: PoolEntry) { - match ConflictPoolEntry::from(entry) { - ConflictPoolEntry::Command(c) => { - for csp in &mut self.command_sps { - csp.remove(&c); - } - } - ConflictPoolEntry::ConfChange(c) => { - self.conf_change_sp.remove(&c); + pub(crate) fn remove(&mut self, entry: &PoolEntry) { + for csp in &mut self.command_sps { + csp.remove(entry); + } + + let _ignore = self.entries.remove(&entry.id); + } + + /// Removes an entry from the pool by it's propose id + pub(crate) fn remove_by_id(&mut self, id: &ProposeId) { + if let Some(entry) = self.entries.remove(id) { + for csp in &mut self.command_sps { + csp.remove(&entry); } } } @@ -74,7 +66,6 @@ impl 
SpeculativePool { for csp in &self.command_sps { entries.extend(csp.all().into_iter().map(Into::into)); } - entries.extend(self.conf_change_sp.all().into_iter().map(Into::into)); entries } @@ -84,47 +75,5 @@ impl SpeculativePool { self.command_sps .iter() .fold(0, |sum, pool| sum + pool.len()) - + self.conf_change_sp.len() - } -} - -/// Speculative pool for conf change entries -#[derive(Default)] -struct ConfChangeSp { - /// Store current conf change - change: Option, -} - -impl ConflictPoolOp for ConfChangeSp { - type Entry = ConfChangeEntry; - - fn is_empty(&self) -> bool { - self.change.is_none() - } - - fn remove(&mut self, _entry: &Self::Entry) { - self.change = None; - } - - fn all(&self) -> Vec { - self.change.clone().into_iter().collect() - } - - fn clear(&mut self) { - self.change = None; - } - - fn len(&self) -> usize { - self.change.iter().count() - } -} - -impl SpeculativePoolOp for ConfChangeSp { - fn insert_if_not_conflict(&mut self, entry: Self::Entry) -> Option { - if self.change.is_some() { - return Some(entry); - } - self.change = Some(entry); - None } } diff --git a/crates/curp/src/server/conflict/test_pools.rs b/crates/curp/src/server/conflict/test_pools.rs index 05fbfc21e..1147dff81 100644 --- a/crates/curp/src/server/conflict/test_pools.rs +++ b/crates/curp/src/server/conflict/test_pools.rs @@ -4,15 +4,15 @@ use curp_external_api::{ }; use curp_test_utils::test_cmd::TestCommand; -use super::CommandEntry; +use crate::rpc::PoolEntry; #[derive(Debug, Default)] pub struct TestSpecPool { - cmds: Vec>, + cmds: Vec>, } impl ConflictPoolOp for TestSpecPool { - type Entry = CommandEntry; + type Entry = PoolEntry; #[inline] fn len(&self) -> usize { @@ -55,11 +55,11 @@ impl SpeculativePoolOp for TestSpecPool { #[derive(Debug, Default)] pub struct TestUncomPool { - cmds: Vec>, + cmds: Vec>, } impl ConflictPoolOp for TestUncomPool { - type Entry = CommandEntry; + type Entry = PoolEntry; #[inline] fn all(&self) -> Vec { diff --git 
a/crates/curp/src/server/conflict/tests.rs b/crates/curp/src/server/conflict/tests.rs index cf6a51123..bc9f1d6d1 100644 --- a/crates/curp/src/server/conflict/tests.rs +++ b/crates/curp/src/server/conflict/tests.rs @@ -1,20 +1,20 @@ -use std::{cmp::Ordering, sync::Arc}; +use std::sync::Arc; use curp_external_api::conflict::{ConflictPoolOp, SpeculativePoolOp, UncommittedPoolOp}; -use super::{spec_pool_new::SpeculativePool, CommandEntry}; +use super::spec_pool_new::SpeculativePool; use crate::{ - rpc::{ConfChange, PoolEntry, PoolEntryInner, ProposeId}, + rpc::{PoolEntry, ProposeId}, server::conflict::uncommitted_pool::UncommittedPool, }; #[derive(Debug, Default)] struct TestSp { - entries: Vec>, + entries: Vec>, } impl ConflictPoolOp for TestSp { - type Entry = CommandEntry; + type Entry = PoolEntry; fn len(&self) -> usize { self.entries.len() @@ -55,11 +55,11 @@ impl SpeculativePoolOp for TestSp { #[derive(Debug, Default)] struct TestUcp { - entries: Vec>, + entries: Vec>, } impl ConflictPoolOp for TestUcp { - type Entry = CommandEntry; + type Entry = PoolEntry; fn all(&self) -> Vec { self.entries.clone() @@ -103,41 +103,6 @@ impl UncommittedPoolOp for TestUcp { } } -impl Eq for PoolEntry {} - -impl PartialOrd for PoolEntry { - fn partial_cmp(&self, other: &Self) -> Option { - #[allow(clippy::pattern_type_mismatch)] - match (&self.inner, &other.inner) { - (PoolEntryInner::Command(a), PoolEntryInner::Command(b)) => a.partial_cmp(&b), - (PoolEntryInner::Command(_), PoolEntryInner::ConfChange(_)) => Some(Ordering::Less), - (PoolEntryInner::ConfChange(_), PoolEntryInner::Command(_)) => Some(Ordering::Greater), - (PoolEntryInner::ConfChange(a), PoolEntryInner::ConfChange(b)) => { - for (ae, be) in a.iter().zip(b.iter()) { - let ord = ae.change_type.cmp(&be.change_type).then( - ae.node_id - .cmp(&be.node_id) - .then(ae.address.cmp(&be.address)), - ); - if !matches!(ord, Ordering::Equal) { - return Some(ord); - } - } - if a.len() > b.len() { - return 
Some(Ordering::Greater); - } - return Some(Ordering::Less); - } - } - } -} - -impl Ord for PoolEntry { - fn cmp(&self, other: &Self) -> Ordering { - self.partial_cmp(other).unwrap() - } -} - #[test] fn conflict_should_be_detected_in_sp() { let mut sp = SpeculativePool::new(vec![Box::new(TestSp::default())]); @@ -146,31 +111,8 @@ fn conflict_should_be_detected_in_sp() { assert!(sp.insert(entry1.clone()).is_none()); assert!(sp.insert(entry2).is_none()); assert!(sp.insert(entry1.clone()).is_some()); - sp.remove(entry1.clone()); - assert!(sp.insert(entry1).is_none()); -} - -#[test] -fn conf_change_should_conflict_with_all_entries_in_sp() { - let mut sp = SpeculativePool::new(vec![Box::new(TestSp::default())]); - let entry1 = PoolEntry::new(ProposeId::default(), Arc::new(0)); - let entry2 = PoolEntry::new(ProposeId::default(), Arc::new(1)); - let entry3 = PoolEntry::::new(ProposeId::default(), vec![ConfChange::default()]); - let entry4 = PoolEntry::::new( - ProposeId::default(), - vec![ConfChange { - change_type: 0, - node_id: 1, - address: vec![], - }], - ); - assert!(sp.insert(entry3.clone()).is_none()); - assert!(sp.insert(entry1.clone()).is_some()); - assert!(sp.insert(entry2.clone()).is_some()); - assert!(sp.insert(entry4).is_some()); - sp.remove(entry3.clone()); + sp.remove(&entry1); assert!(sp.insert(entry1).is_none()); - assert!(sp.insert(entry3).is_some()); } #[test] @@ -196,39 +138,16 @@ fn conflict_should_be_detected_in_ucp() { let mut ucp = UncommittedPool::new(vec![Box::new(TestUcp::default())]); let entry1 = PoolEntry::new(ProposeId::default(), Arc::new(0)); let entry2 = PoolEntry::new(ProposeId::default(), Arc::new(1)); - assert!(!ucp.insert(entry1.clone())); - assert!(!ucp.insert(entry2)); - assert!(ucp.insert(entry1.clone())); - ucp.remove(entry1.clone()); + assert!(!ucp.insert(&entry1)); + assert!(!ucp.insert(&entry2)); + assert!(ucp.insert(&entry1)); + ucp.remove(&entry1); // Ucp allows conflict cmds to co-exist in the same pool. 
// Therefore, we should still get `conflict=true` - assert!(ucp.insert(entry1.clone())); - ucp.remove(entry1.clone()); - ucp.remove(entry1.clone()); - assert!(!ucp.insert(entry1)); -} - -#[test] -fn conf_change_should_conflict_with_all_entries_in_ucp() { - let mut ucp = UncommittedPool::new(vec![Box::new(TestUcp::default())]); - let entry1 = PoolEntry::new(ProposeId::default(), Arc::new(0)); - let entry2 = PoolEntry::new(ProposeId::default(), Arc::new(1)); - let entry3 = PoolEntry::::new(ProposeId::default(), vec![ConfChange::default()]); - let entry4 = PoolEntry::::new( - ProposeId::default(), - vec![ConfChange { - change_type: 0, - node_id: 1, - address: vec![], - }], - ); - assert!(!ucp.insert(entry3.clone())); - assert!(ucp.insert(entry1.clone())); - assert!(ucp.insert(entry4.clone())); - ucp.remove(entry3.clone()); - ucp.remove(entry4.clone()); - assert!(!ucp.insert(entry2)); - assert!(ucp.insert(entry3)); + assert!(ucp.insert(&entry1)); + ucp.remove(&entry1); + ucp.remove(&entry1); + assert!(!ucp.insert(&entry1)); } #[test] @@ -237,11 +156,11 @@ fn ucp_should_returns_all_entries() { let entries: Vec<_> = (0..10) .map(|i| PoolEntry::new(ProposeId::default(), Arc::new(i))) .collect(); - for e in entries.clone() { + for e in &entries { ucp.insert(e); } - for e in entries.clone() { - assert!(ucp.insert(e)); + for e in &entries { + assert!(ucp.insert(&e)); } let results = ucp.all(); @@ -256,14 +175,12 @@ fn ucp_should_returns_all_conflict_entries() { .map(|i| PoolEntry::new(ProposeId::default(), Arc::new(i))) .collect(); for e in &entries { - ucp.insert(e.clone()); - ucp.insert(e.clone()); + ucp.insert(e); + ucp.insert(e); } - let conf_change = PoolEntry::::new(ProposeId::default(), vec![ConfChange::default()]); - ucp.insert(conf_change.clone()); for e in entries { - let mut all = ucp.all_conflict(e.clone()); + let mut all = ucp.all_conflict(&e); all.sort(); - assert_eq!(all, vec![e.clone(), e.clone(), conf_change.clone()]); + assert_eq!(all, vec![e.clone(), 
e.clone()]); } } diff --git a/crates/curp/src/server/conflict/uncommitted_pool.rs b/crates/curp/src/server/conflict/uncommitted_pool.rs index c8bb86ceb..432d72a1d 100644 --- a/crates/curp/src/server/conflict/uncommitted_pool.rs +++ b/crates/curp/src/server/conflict/uncommitted_pool.rs @@ -1,98 +1,46 @@ -use curp_external_api::conflict::{ConflictPoolOp, UncommittedPoolOp}; +use curp_external_api::conflict::UncommittedPoolOp; -use super::{CommandEntry, ConfChangeEntry, ConflictPoolEntry}; use crate::rpc::PoolEntry; /// An uncommitted pool object -pub type UcpObject = Box> + Send + 'static>; +pub type UcpObject = Box> + Send + 'static>; /// Union type of `UncommittedPool` objects pub(crate) struct UncommittedPool { /// Command uncommitted pools command_ucps: Vec>, - /// Conf change uncommitted pools - conf_change_ucp: ConfChangeUcp, } impl UncommittedPool { /// Creates a new `UncomPool` pub(crate) fn new(command_ucps: Vec>) -> Self { - Self { - command_ucps, - conf_change_ucp: ConfChangeUcp::default(), - } + Self { command_ucps } } /// Insert an entry into the pool - pub(crate) fn insert(&mut self, entry: PoolEntry) -> bool { + pub(crate) fn insert(&mut self, entry: &PoolEntry) -> bool { let mut conflict = false; - conflict |= !self.conf_change_ucp.is_empty(); - - match ConflictPoolEntry::from(entry) { - ConflictPoolEntry::Command(c) => { - for cucp in &mut self.command_ucps { - conflict |= cucp.insert(c.clone()); - } - } - ConflictPoolEntry::ConfChange(c) => { - let _ignore = self.conf_change_ucp.insert(c); - conflict |= !self - .command_ucps - .iter() - .map(AsRef::as_ref) - .all(ConflictPoolOp::is_empty); - } + for cucp in &mut self.command_ucps { + conflict |= cucp.insert(entry.clone()); } conflict } /// Removes an entry from the pool - pub(crate) fn remove(&mut self, entry: PoolEntry) { - match ConflictPoolEntry::from(entry) { - ConflictPoolEntry::Command(c) => { - for cucp in &mut self.command_ucps { - cucp.remove(&c); - } - } - ConflictPoolEntry::ConfChange(c) 
=> { - self.conf_change_ucp.remove(&c); - } + pub(crate) fn remove(&mut self, entry: &PoolEntry) { + for cucp in &mut self.command_ucps { + cucp.remove(entry); } } /// Returns all entries in the pool that conflict with the given entry - pub(crate) fn all_conflict(&self, entry: PoolEntry) -> Vec> { - match ConflictPoolEntry::from(entry) { - // A command entry conflict with other conflict entries plus all conf change entries - ConflictPoolEntry::Command(ref c) => self - .conf_change_ucp - .all() - .into_iter() - .map(Into::into) - .chain( - self.command_ucps - .iter() - .flat_map(|p| p.all_conflict(c)) - .map(Into::into), - ) - .collect(), - // A conf change entry conflict with all other entries - ConflictPoolEntry::ConfChange(_) => self - .conf_change_ucp - .all() - .into_iter() - .map(Into::into) - .chain( - self.command_ucps - .iter() - .map(AsRef::as_ref) - .flat_map(ConflictPoolOp::all) - .map(Into::into), - ) - .collect(), - } + pub(crate) fn all_conflict(&self, entry: &PoolEntry) -> Vec> { + self.command_ucps + .iter() + .flat_map(|p| p.all_conflict(entry)) + .collect() } #[cfg(test)] @@ -100,16 +48,15 @@ impl UncommittedPool { pub(crate) fn all(&self) -> Vec> { let mut entries = Vec::new(); for csp in &self.command_ucps { - entries.extend(csp.all().into_iter().map(Into::into)); + entries.extend(csp.all().into_iter()); } - entries.extend(self.conf_change_ucp.all().into_iter().map(Into::into)); entries } #[cfg(test)] /// Returns `true` if the pool is empty pub(crate) fn is_empty(&self) -> bool { - self.command_ucps.iter().all(|ucp| ucp.is_empty()) && self.conf_change_ucp.is_empty() + self.command_ucps.iter().all(|ucp| ucp.is_empty()) } /// Clears all entries in the pool @@ -117,51 +64,5 @@ impl UncommittedPool { for ucp in &mut self.command_ucps { ucp.clear(); } - self.conf_change_ucp.clear(); - } -} - -/// Conf change uncommitted pool -#[derive(Default)] -struct ConfChangeUcp { - /// entry count - conf_changes: Vec, -} - -impl ConflictPoolOp for ConfChangeUcp 
{ - type Entry = ConfChangeEntry; - - fn is_empty(&self) -> bool { - self.conf_changes.is_empty() - } - - fn remove(&mut self, entry: &Self::Entry) { - if let Some(pos) = self.conf_changes.iter().position(|x| x == entry) { - let _ignore = self.conf_changes.remove(pos); - } - } - - fn all(&self) -> Vec { - self.conf_changes.clone() - } - - fn clear(&mut self) { - self.conf_changes.clear(); - } - - fn len(&self) -> usize { - self.conf_changes.len() - } -} - -impl UncommittedPoolOp for ConfChangeUcp { - fn insert(&mut self, entry: Self::Entry) -> bool { - let conflict = !self.conf_changes.is_empty(); - self.conf_changes.push(entry); - conflict - } - - fn all_conflict(&self, _entry: &Self::Entry) -> Vec { - self.conf_changes.clone() } } diff --git a/crates/curp/src/server/curp_node.rs b/crates/curp/src/server/curp_node.rs index 760e6e23e..1b1b94cc9 100644 --- a/crates/curp/src/server/curp_node.rs +++ b/crates/curp/src/server/curp_node.rs @@ -10,9 +10,10 @@ use engine::{SnapshotAllocator, SnapshotApi}; use event_listener::Event; use futures::{pin_mut, stream::FuturesUnordered, Stream, StreamExt}; use madsim::rand::{thread_rng, Rng}; +use opentelemetry::KeyValue; use parking_lot::{Mutex, RwLock}; use tokio::{ - sync::{broadcast, mpsc}, + sync::{broadcast, oneshot}, time::MissedTickBehavior, }; #[cfg(not(madsim))] @@ -21,18 +22,17 @@ use tracing::{debug, error, info, trace, warn}; #[cfg(madsim)] use utils::ClientTlsConfig; use utils::{ + barrier::IdBarrier, config::CurpConfig, task_manager::{tasks::TaskName, Listener, State, TaskManager}, }; use super::{ cmd_board::{CmdBoardRef, CommandBoard}, - cmd_worker::{conflict_checked_mpmc, start_cmd_workers}, - conflict::{ - spec_pool_new::{SpObject, SpeculativePool}, - uncommitted_pool::{UcpObject, UncommittedPool}, - }, - gc::gc_cmd_board, + cmd_worker::execute, + conflict::spec_pool_new::{SpObject, SpeculativePool}, + conflict::uncommitted_pool::{UcpObject, UncommittedPool}, + gc::gc_client_lease, lease_manager::LeaseManager, 
raw_curp::{AppendEntries, RawCurp, Vote}, storage::StorageApi, @@ -41,6 +41,7 @@ use crate::{ cmd::{Command, CommandExecutor}, log_entry::{EntryData, LogEntry}, members::{ClusterInfo, ServerId}, + response::ResponseSender, role_change::RoleChange, rpc::{ self, @@ -48,58 +49,295 @@ use crate::{ AppendEntriesRequest, AppendEntriesResponse, ConfChange, ConfChangeType, CurpError, FetchClusterRequest, FetchClusterResponse, FetchReadStateRequest, FetchReadStateResponse, InstallSnapshotRequest, InstallSnapshotResponse, LeaseKeepAliveMsg, MoveLeaderRequest, - MoveLeaderResponse, ProposeConfChangeRequest, ProposeConfChangeResponse, ProposeRequest, - ProposeResponse, PublishRequest, PublishResponse, ShutdownRequest, ShutdownResponse, - TriggerShutdownRequest, TriggerShutdownResponse, TryBecomeLeaderNowRequest, - TryBecomeLeaderNowResponse, VoteRequest, VoteResponse, WaitSyncedRequest, - WaitSyncedResponse, + MoveLeaderResponse, PoolEntry, ProposeConfChangeRequest, ProposeConfChangeResponse, + ProposeId, ProposeRequest, ProposeResponse, PublishRequest, PublishResponse, + ReadIndexResponse, RecordRequest, RecordResponse, ShutdownRequest, ShutdownResponse, + SyncedResponse, TriggerShutdownRequest, TriggerShutdownResponse, TryBecomeLeaderNowRequest, + TryBecomeLeaderNowResponse, VoteRequest, VoteResponse, + }, + server::{ + cmd_worker::{after_sync, worker_reset, worker_snapshot}, + metrics, + raw_curp::SyncAction, + storage::db::DB, }, - server::{cmd_worker::CEEventTxApi, metrics, raw_curp::SyncAction, storage::db::DB}, snapshot::{Snapshot, SnapshotMeta}, }; +/// After sync entry, composed of a log entry and response sender +pub(crate) type AfterSyncEntry = (Arc>, Option>); + +/// The after sync task type +#[derive(Debug)] +pub(super) enum TaskType { + /// After sync an entry + Entries(Vec>), + /// Reset the CE + Reset(Option, oneshot::Sender<()>), + /// Snapshot + Snapshot(SnapshotMeta, oneshot::Sender), +} + +/// A propose type +pub(super) struct Propose { + /// The command of 
the propose + pub(super) cmd: Arc, + /// Propose id + pub(super) id: ProposeId, + /// Term the client proposed + /// NOTE: this term should be equal to the cluster's latest term + /// for the propose to be accepted. + pub(super) term: u64, + /// Tx used for sending the streaming response back to client + pub(super) resp_tx: Arc, +} + +impl Propose +where + C: Command, +{ + /// Attempts to create a new `Propose` from request + fn try_new(req: &ProposeRequest, resp_tx: Arc) -> Result { + let cmd: Arc = Arc::new(req.cmd()?); + Ok(Self { + cmd, + id: req.propose_id(), + term: req.term, + resp_tx, + }) + } + + /// Returns `true` if the proposed command is read-only + fn is_read_only(&self) -> bool { + self.cmd.is_read_only() + } + + /// Gets response sender + fn response_tx(&self) -> Arc { + Arc::clone(&self.resp_tx) + } + + /// Convert self into parts + fn into_parts(self) -> (Arc, ProposeId, u64, Arc) { + let Self { + cmd, + id, + term, + resp_tx, + } = self; + (cmd, id, term, resp_tx) + } +} + +/// Entry to execute +type ExecutorEntry = (Arc>, Arc); + /// `CurpNode` represents a single node of curp cluster -pub(super) struct CurpNode { +pub(super) struct CurpNode, RC: RoleChange> { /// `RawCurp` state machine curp: Arc>, /// Cmd watch board for tracking the cmd sync results cmd_board: CmdBoardRef, - /// CE event tx, - ce_event_tx: Arc>, /// Storage storage: Arc>, /// Snapshot allocator snapshot_allocator: Box, + /// Command Executor + #[allow(unused)] + cmd_executor: Arc, + /// Tx to send entries to after_sync + as_tx: flume::Sender>, + /// Tx to send to propose task + propose_tx: flume::Sender>, } /// Handlers for clients -impl CurpNode { - /// Handle `Propose` requests - pub(super) async fn propose(&self, req: ProposeRequest) -> Result { +impl, RC: RoleChange> CurpNode { + /// Handle `ProposeStream` requests + pub(super) async fn propose_stream( + &self, + req: &ProposeRequest, + resp_tx: Arc, + bypassed: bool, + ) -> Result<(), CurpError> { if 
self.curp.is_shutdown() { return Err(CurpError::shutting_down()); } - let id = req.propose_id(); + self.curp.check_leader_transfer()?; self.check_cluster_version(req.cluster_version)?; + self.curp.check_term(req.term)?; + + if req.slow_path { + resp_tx.set_conflict(true); + } else { + info!("not using slow path for: {req:?}"); + } + + if bypassed { + self.curp.mark_client_id_bypassed(req.propose_id().0); + } + + match self + .curp + .deduplicate(req.propose_id(), Some(req.first_incomplete)) + { + // If the propose is duplicated, return the result directly + Err(CurpError::Duplicated(())) => { + let (er, asr) = + CommandBoard::wait_for_er_asr(&self.cmd_board, req.propose_id()).await; + resp_tx.send_propose(ProposeResponse::new_result::(&er, true)); + resp_tx.send_synced(SyncedResponse::new_result::(&asr)); + } + Err(CurpError::ExpiredClientId(())) => { + metrics::get() + .proposals_failed + .add(1, &[KeyValue::new("reason", "duplicated proposal")]); + return Err(CurpError::expired_client_id()); + } + Err(_) => unreachable!("deduplicate won't return other type of errors"), + Ok(()) => {} + } + + let propose = Propose::try_new(req, resp_tx)?; + let _ignore = self.propose_tx.send(propose); + + Ok(()) + } + + /// Handle `Record` requests + pub(super) fn record(&self, req: &RecordRequest) -> Result { + if self.curp.is_shutdown() { + return Err(CurpError::shutting_down()); + } + let id = req.propose_id(); let cmd: Arc = Arc::new(req.cmd()?); - // handle proposal - let sp_exec = self.curp.handle_propose(id, Arc::clone(&cmd))?; + let conflict = self.curp.follower_record(id, &cmd); + + Ok(RecordResponse { conflict }) + } - // if speculatively executed, wait for the result and return - if sp_exec { - let er_res = CommandBoard::wait_for_er(&self.cmd_board, id).await; - return Ok(ProposeResponse::new_result::(&er_res)); + /// Handle `Record` requests + pub(super) fn read_index(&self) -> Result { + if self.curp.is_shutdown() { + return Err(CurpError::shutting_down()); } + 
Ok(ReadIndexResponse { + term: self.curp.term(), + }) + } + + /// Handle propose task + async fn handle_propose_task( + ce: Arc, + curp: Arc>, + rx: flume::Receiver>, + ) { + /// Max number of propose in a batch + const MAX_BATCH_SIZE: usize = 1024; - Ok(ProposeResponse::new_empty()) + let cmd_executor = Self::build_executor(ce, Arc::clone(&curp)); + loop { + let Ok(first) = rx.recv_async().await else { + info!("handle propose task exit"); + break; + }; + let mut addition: Vec<_> = std::iter::repeat_with(|| rx.try_recv()) + .take(MAX_BATCH_SIZE) + .flatten() + .collect(); + addition.push(first); + let (read_onlys, mutatives): (Vec<_>, Vec<_>) = + addition.into_iter().partition(Propose::is_read_only); + + Self::handle_read_onlys(cmd_executor.clone(), &curp, read_onlys); + Self::handle_mutatives(cmd_executor.clone(), &curp, mutatives); + } + } + + /// Handle read-only proposes + fn handle_read_onlys( + cmd_executor: Executor, + curp: &RawCurp, + proposes: Vec>, + ) where + Executor: Fn(ExecutorEntry) + Clone + Send + 'static, + { + for propose in proposes { + info!("handle read only cmd: {:?}", propose.cmd); + // TODO: Disable dedup if the command is read only or commute + let Propose { + cmd, resp_tx, id, .. 
+ } = propose; + // Use default value for the entry as we don't need to put it into curp log + let entry = Arc::new(LogEntry::new(0, 0, id, Arc::clone(&cmd))); + let wait_conflict = curp.wait_conflicts_synced(cmd); + let wait_no_op = curp.wait_no_op_applied(); + let cmd_executor_c = cmd_executor.clone(); + let _ignore = tokio::spawn(async move { + tokio::join!(wait_conflict, wait_no_op); + cmd_executor_c((entry, resp_tx)); + }); + } + } + + /// Handle read-only proposes + fn handle_mutatives( + cmd_executor: Executor, + curp: &RawCurp, + proposes: Vec>, + ) where + Executor: Fn(ExecutorEntry), + { + if proposes.is_empty() { + return; + } + let pool_entries = proposes + .iter() + .map(|p| PoolEntry::new(p.id, Arc::clone(&p.cmd))); + let conflicts = curp.leader_record(pool_entries); + for (p, conflict) in proposes.iter().zip(conflicts) { + info!("handle mutative cmd: {:?}, conflict: {conflict}", p.cmd); + p.resp_tx.set_conflict(conflict); + } + let resp_txs: Vec<_> = proposes.iter().map(Propose::response_tx).collect(); + let logs: Vec<_> = proposes.into_iter().map(Propose::into_parts).collect(); + let entries = curp.push_logs(logs); + #[allow(clippy::pattern_type_mismatch)] // Can't be fixed + entries + .into_iter() + .zip(resp_txs) + .filter(|(_, tx)| !tx.is_conflict()) + .for_each(cmd_executor); + } + + /// Speculatively execute a command + fn build_executor(ce: Arc, curp: Arc>) -> impl Fn(ExecutorEntry) + Clone { + move |(entry, resp_tx): (_, Arc)| { + info!("spec execute entry: {entry:?}"); + let result = execute(&entry, ce.as_ref(), curp.as_ref()); + match result { + Ok((er, Some(asr))) => { + resp_tx.send_propose(ProposeResponse::new_result::(&Ok(er), false)); + resp_tx.send_synced(SyncedResponse::new_result::(&Ok(asr))); + } + Ok((er, None)) => { + resp_tx.send_propose(ProposeResponse::new_result::(&Ok(er), false)); + } + Err(e) => resp_tx.send_synced(SyncedResponse::new_result::(&Err(e))), + } + } } /// Handle `Shutdown` requests pub(super) async fn shutdown( 
&self, req: ShutdownRequest, + bypassed: bool, ) -> Result { self.check_cluster_version(req.cluster_version)?; + if bypassed { + self.curp.mark_client_id_bypassed(req.propose_id().0); + } self.curp.handle_shutdown(req.propose_id())?; CommandBoard::wait_for_shutdown_synced(&self.cmd_board).await; Ok(ShutdownResponse::default()) @@ -109,9 +347,13 @@ impl CurpNode { pub(super) async fn propose_conf_change( &self, req: ProposeConfChangeRequest, + bypassed: bool, ) -> Result { self.check_cluster_version(req.cluster_version)?; let id = req.propose_id(); + if bypassed { + self.curp.mark_client_id_bypassed(id.0); + } self.curp.handle_propose_conf_change(id, req.changes)?; CommandBoard::wait_for_conf(&self.cmd_board, id).await; let members = self.curp.cluster().all_members_vec(); @@ -119,7 +361,14 @@ impl CurpNode { } /// Handle `Publish` requests - pub(super) fn publish(&self, req: PublishRequest) -> Result { + pub(super) fn publish( + &self, + req: PublishRequest, + bypassed: bool, + ) -> Result { + if bypassed { + self.curp.mark_client_id_bypassed(req.propose_id().0); + } self.curp.handle_publish(req)?; Ok(PublishResponse::default()) } @@ -131,6 +380,9 @@ impl CurpNode { ) -> Result { pin_mut!(req_stream); while let Some(req) = req_stream.next().await { + // NOTE: The leader may shutdown itself in configuration change. + // We must first check this situation. 
+ self.curp.check_leader_transfer()?; if self.curp.is_shutdown() { return Err(CurpError::shutting_down()); } @@ -151,7 +403,7 @@ impl CurpNode { } /// Handlers for peers -impl CurpNode { +impl, RC: RoleChange> CurpNode { /// Handle `AppendEntries` requests pub(super) fn append_entries( &self, @@ -168,7 +420,11 @@ impl CurpNode { req.leader_commit, ); let resp = match result { - Ok(term) => AppendEntriesResponse::new_accept(term), + Ok((term, to_persist)) => { + self.storage + .put_log_entries(&to_persist.iter().map(Arc::as_ref).collect::>())?; + AppendEntriesResponse::new_accept(term) + } Err((term, hint)) => AppendEntriesResponse::new_reject(term, hint), }; @@ -176,7 +432,7 @@ impl CurpNode { } /// Handle `Vote` requests - pub(super) async fn vote(&self, req: VoteRequest) -> Result { + pub(super) fn vote(&self, req: &VoteRequest) -> Result { let result = if req.is_pre_vote { self.curp.handle_pre_vote( req.term, @@ -196,7 +452,7 @@ impl CurpNode { let resp = match result { Ok((term, sp)) => { if !req.is_pre_vote { - self.storage.flush_voted_for(term, req.candidate_id).await?; + self.storage.flush_voted_for(term, req.candidate_id)?; } VoteResponse::new_accept(term, sp)? 
} @@ -213,25 +469,6 @@ impl CurpNode { TriggerShutdownResponse::default() } - /// handle `WaitSynced` requests - pub(super) async fn wait_synced( - &self, - req: WaitSyncedRequest, - ) -> Result { - if self.curp.is_shutdown() { - return Err(CurpError::shutting_down()); - } - self.check_cluster_version(req.cluster_version)?; - let id = req.propose_id(); - debug!("{} get wait synced request for cmd({id})", self.curp.id()); - if self.curp.get_transferee().is_some() { - return Err(CurpError::leader_transfer("leader transferring")); - } - let (er, asr) = CommandBoard::wait_for_er_asr(&self.cmd_board, id).await; - debug!("{} wait synced for cmd({id}) finishes", self.curp.id()); - Ok(WaitSyncedResponse::new_from_result::(er, asr)) - } - /// Handle `FetchCluster` requests #[allow(clippy::unnecessary_wraps, clippy::needless_pass_by_value)] // To keep type consistent with other request handlers pub(super) fn fetch_cluster( @@ -305,15 +542,14 @@ impl CurpNode { "{} successfully received a snapshot, {snapshot:?}", self.curp.id(), ); - self.ce_event_tx - .send_reset(Some(snapshot)) - .await - .map_err(|err| { - error!("failed to reset the command executor by snapshot, {err}"); - CurpError::internal(format!( - "failed to reset the command executor by snapshot, {err}" - )) - })?; + let (tx, rx) = oneshot::channel(); + self.as_tx.send(TaskType::Reset(Some(snapshot), tx))?; + rx.await.map_err(|err| { + error!("failed to reset the command executor by snapshot, {err}"); + CurpError::internal(format!( + "failed to reset the command executor by snapshot, {err}" + )) + })?; metrics::get().apply_snapshot_in_progress.add(-1, &[]); metrics::get() .snapshot_install_total_duration_seconds @@ -392,7 +628,7 @@ impl CurpNode { } /// Spawned tasks -impl CurpNode { +impl, RC: RoleChange> CurpNode { /// Tick periodically #[allow(clippy::arithmetic_side_effects, clippy::ignored_unit_patterns)] async fn election_task(curp: Arc>, shutdown_listener: Listener) { @@ -569,42 +805,41 @@ impl CurpNode { 
debug!("{} to {} sync follower task exits", curp.id(), connect.id()); } - /// Log persist task - pub(super) async fn log_persist_task( - mut log_rx: mpsc::UnboundedReceiver>>, - storage: Arc>, - shutdown_listener: Listener, + /// After sync task + async fn after_sync_task( + curp: Arc>, + cmd_executor: Arc, + as_rx: flume::Receiver>, ) { - #[allow(clippy::arithmetic_side_effects, clippy::ignored_unit_patterns)] - // introduced by tokio select - loop { - tokio::select! { - e = log_rx.recv() => { - let Some(e) = e else { - return; - }; - if let Err(err) = storage.put_log_entry(e.as_ref()).await { - error!("storage error, {err}"); - } - } - _ = shutdown_listener.wait() => break, - } + while let Ok(task) = as_rx.recv_async().await { + Self::handle_as_task(&curp, &cmd_executor, task).await; } - while let Ok(e) = log_rx.try_recv() { - if let Err(err) = storage.put_log_entry(e.as_ref()).await { - error!("storage error, {err}"); + debug!("after sync task exits"); + } + + /// Handles an after sync task + async fn handle_as_task(curp: &RawCurp, cmd_executor: &CE, task: TaskType) { + debug!("after sync: {task:?}"); + match task { + TaskType::Entries(entries) => { + after_sync(entries, cmd_executor, curp).await; + } + TaskType::Reset(snap, tx) => { + let _ignore = worker_reset(snap, tx, cmd_executor, curp).await; + } + TaskType::Snapshot(meta, tx) => { + let _ignore = worker_snapshot(meta, tx, cmd_executor, curp).await; } } - debug!("log persist task exits"); } } // utils -impl CurpNode { +impl, RC: RoleChange> CurpNode { /// Create a new server instance #[inline] #[allow(clippy::too_many_arguments)] // TODO: refactor this use builder pattern - pub(super) async fn new>( + pub(super) async fn new( cluster_info: Arc, is_leader: bool, cmd_executor: Arc, @@ -626,28 +861,25 @@ impl CurpNode { .await .map_err(|e| CurpError::internal(format!("parse peers addresses failed, err {e:?}")))?
.collect(); - let (log_tx, log_rx) = mpsc::unbounded_channel(); let cmd_board = Arc::new(RwLock::new(CommandBoard::new())); let lease_manager = Arc::new(RwLock::new(LeaseManager::new())); let last_applied = cmd_executor .last_applied() .map_err(|e| CurpError::internal(format!("get applied index error, {e}")))?; - let (ce_event_tx, task_rx, done_tx) = - conflict_checked_mpmc::channel(Arc::clone(&cmd_executor), Arc::clone(&task_manager)); - let ce_event_tx: Arc> = Arc::new(ce_event_tx); - + let (as_tx, as_rx) = flume::unbounded(); + let (propose_tx, propose_rx) = flume::bounded(4096); + let sp = Arc::new(Mutex::new(SpeculativePool::new(sps))); + let ucp = Arc::new(Mutex::new(UncommittedPool::new(ucps))); // create curp state machine - let (voted_for, entries) = storage.recover().await?; + let (voted_for, entries) = storage.recover()?; let curp = Arc::new( RawCurp::builder() .cluster_info(Arc::clone(&cluster_info)) .is_leader(is_leader) .cmd_board(Arc::clone(&cmd_board)) - .lease_manager(lease_manager) + .lease_manager(Arc::clone(&lease_manager)) .cfg(Arc::clone(&curp_cfg)) - .cmd_tx(Arc::clone(&ce_event_tx)) .sync_events(sync_events) - .log_tx(log_tx) .role_change(role_change) .task_manager(Arc::clone(&task_manager)) .connects(connects) @@ -656,36 +888,51 @@ impl CurpNode { .entries(entries) .curp_storage(Arc::clone(&storage)) .client_tls_config(client_tls_config) - .spec_pool(Arc::new(Mutex::new(SpeculativePool::new(sps)))) - .uncommitted_pool(Arc::new(Mutex::new(UncommittedPool::new(ucps)))) + .spec_pool(Arc::clone(&sp)) + .uncommitted_pool(ucp) + .as_tx(as_tx.clone()) + .resp_txs(Arc::new(Mutex::default())) + .id_barrier(Arc::new(IdBarrier::new())) .build_raw_curp() .map_err(|e| CurpError::internal(format!("build raw curp failed, {e}")))?, ); metrics::Metrics::register_callback(Arc::clone(&curp))?; - start_cmd_workers(cmd_executor, Arc::clone(&curp), task_rx, done_tx); - - task_manager.spawn(TaskName::GcCmdBoard, |n| { - gc_cmd_board(Arc::clone(&cmd_board), 
curp_cfg.gc_interval, n) + task_manager.spawn(TaskName::GcClientLease, |n| { + gc_client_lease( + lease_manager, + Arc::clone(&cmd_board), + sp, + curp_cfg.gc_interval, + n, + ) }); - Self::run_bg_tasks(Arc::clone(&curp), Arc::clone(&storage), log_rx); + Self::run_bg_tasks( + Arc::clone(&curp), + Arc::clone(&cmd_executor), + propose_rx, + as_rx, + ); Ok(Self { curp, cmd_board, - ce_event_tx, storage, snapshot_allocator, + cmd_executor, + as_tx, + propose_tx, }) } /// Run background tasks for Curp server fn run_bg_tasks( curp: Arc>, - storage: Arc + 'static>, - log_rx: mpsc::UnboundedReceiver>>, + cmd_executor: Arc, + propose_rx: flume::Receiver>, + as_rx: flume::Receiver>, ) { let task_manager = curp.task_manager(); @@ -711,10 +958,13 @@ impl CurpNode { } task_manager.spawn(TaskName::ConfChange, |n| { - Self::conf_change_handler(curp, remove_events, n) + Self::conf_change_handler(Arc::clone(&curp), remove_events, n) }); - task_manager.spawn(TaskName::LogPersist, |n| { - Self::log_persist_task(log_rx, storage, n) + task_manager.spawn(TaskName::HandlePropose, |_n| { + Self::handle_propose_task(Arc::clone(&cmd_executor), Arc::clone(&curp), propose_rx) + }); + task_manager.spawn(TaskName::AfterSync, |_n| { + Self::after_sync_task(curp, cmd_executor, as_rx) }); } @@ -972,7 +1222,7 @@ impl CurpNode { } } -impl Debug for CurpNode { +impl, RC: RoleChange> Debug for CurpNode { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("CurpNode") .field("raw_curp", &self.curp) @@ -983,14 +1233,14 @@ impl Debug for CurpNode { #[cfg(test)] mod tests { - use curp_test_utils::{mock_role_change, sleep_secs, test_cmd::TestCommand}; + use curp_test_utils::{ + mock_role_change, sleep_secs, + test_cmd::{TestCE, TestCommand}, + }; use tracing_test::traced_test; use super::*; - use crate::{ - rpc::{connect::MockInnerConnectApi, ConfChange}, - server::cmd_worker::MockCEEventTxApi, - }; + use crate::rpc::{connect::MockInnerConnectApi, ConfChange}; #[traced_test] 
#[tokio::test] @@ -998,7 +1248,6 @@ mod tests { let task_manager = Arc::new(TaskManager::new()); let curp = Arc::new(RawCurp::new_test( 3, - MockCEEventTxApi::::default(), mock_role_change(), Arc::clone(&task_manager), )); @@ -1011,7 +1260,7 @@ mod tests { mock_connect1.expect_id().return_const(s1_id); let remove_event = Arc::new(Event::new()); task_manager.spawn(TaskName::SyncFollower, |n| { - CurpNode::sync_follower_task( + CurpNode::<_, TestCE, _>::sync_follower_task( Arc::clone(&curp), InnerConnectApiWrapper::new_from_arc(Arc::new(mock_connect1)), Arc::new(Event::new()), @@ -1028,10 +1277,8 @@ mod tests { async fn tick_task_will_bcast_votes() { let task_manager = Arc::new(TaskManager::new()); let curp = { - let exe_tx = MockCEEventTxApi::::default(); Arc::new(RawCurp::new_test( 3, - exe_tx, mock_role_change(), Arc::clone(&task_manager), )) @@ -1066,7 +1313,7 @@ mod tests { InnerConnectApiWrapper::new_from_arc(Arc::new(mock_connect2)), ); task_manager.spawn(TaskName::Election, |n| { - CurpNode::election_task(Arc::clone(&curp), n) + CurpNode::<_, TestCE, _>::election_task(Arc::clone(&curp), n) }); sleep_secs(3).await; assert!(curp.is_leader()); @@ -1078,10 +1325,8 @@ mod tests { async fn vote_will_not_send_to_learner_during_election() { let task_manager = Arc::new(TaskManager::new()); let curp = { - let exe_tx = MockCEEventTxApi::::default(); Arc::new(RawCurp::new_test( 3, - exe_tx, mock_role_change(), Arc::clone(&task_manager), )) @@ -1132,7 +1377,7 @@ mod tests { InnerConnectApiWrapper::new_from_arc(Arc::new(mock_connect_learner)), ); task_manager.spawn(TaskName::Election, |n| { - CurpNode::election_task(Arc::clone(&curp), n) + CurpNode::<_, TestCE, _>::election_task(Arc::clone(&curp), n) }); sleep_secs(3).await; assert!(curp.is_leader()); diff --git a/crates/curp/src/server/gc.rs b/crates/curp/src/server/gc.rs index e1e8c7360..92af3aeb7 100644 --- a/crates/curp/src/server/gc.rs +++ b/crates/curp/src/server/gc.rs @@ -2,20 +2,18 @@ use std::time::Duration; use 
utils::task_manager::Listener; -use crate::{cmd::Command, server::cmd_board::CmdBoardRef}; +use crate::{cmd::Command, rpc::ProposeId, server::cmd_board::CmdBoardRef}; -// TODO: Speculative pool GC +use super::{conflict::spec_pool_new::SpeculativePoolRef, lease_manager::LeaseManagerRef}; -/// Cleanup cmd board -pub(super) async fn gc_cmd_board( +/// Garbage collects relevant objects when the client lease expires +pub(super) async fn gc_client_lease( + lease_mamanger: LeaseManagerRef, cmd_board: CmdBoardRef, + sp: SpeculativePoolRef, interval: Duration, shutdown_listener: Listener, ) { - let mut last_check_len_er = 0; - let mut last_check_len_asr = 0; - let mut last_check_len_sync = 0; - let mut last_check_len_conf = 0; #[allow(clippy::arithmetic_side_effects, clippy::ignored_unit_patterns)] // introduced by tokio select loop { @@ -23,32 +21,24 @@ pub(super) async fn gc_cmd_board( _ = tokio::time::sleep(interval) => {} _ = shutdown_listener.wait() => break, } - let mut board = cmd_board.write(); - - // last_check_len_xxx should always be smaller than board.xxx_.len(), the check is just for precaution - - if last_check_len_er <= board.er_buffer.len() { - let new_er_buffer = board.er_buffer.split_off(last_check_len_er); - board.er_buffer = new_er_buffer; - last_check_len_er = board.er_buffer.len(); - } - if last_check_len_asr <= board.asr_buffer.len() { - let new_asr_buffer = board.asr_buffer.split_off(last_check_len_asr); - board.asr_buffer = new_asr_buffer; - last_check_len_asr = board.asr_buffer.len(); - } - - if last_check_len_sync <= board.sync.len() { - let new_sync = board.sync.split_off(last_check_len_sync); - board.sync = new_sync; - last_check_len_sync = board.sync.len(); + let mut lm_w = lease_mamanger.write(); + let mut board = cmd_board.write(); + let mut sp_l = sp.lock(); + let expired_ids = lm_w.gc_expired(); + + let mut expired_propose_ids = Vec::new(); + for id in expired_ids { + if let Some(tracker) = board.trackers.get(&id) { + let incompleted_nums = 
tracker.all_incompleted(); + expired_propose_ids + .extend(incompleted_nums.into_iter().map(|num| ProposeId(id, num))); + } } - - if last_check_len_conf <= board.conf_buffer.len() { - let new_conf = board.conf_buffer.split_off(last_check_len_conf); - board.conf_buffer = new_conf; - last_check_len_conf = board.conf_buffer.len(); + for id in &expired_propose_ids { + let _ignore_er = board.er_buffer.swap_remove(id); + let _ignore_asr = board.asr_buffer.swap_remove(id); + sp_l.remove_by_id(id); } } } @@ -58,15 +48,17 @@ mod tests { use std::{sync::Arc, time::Duration}; use curp_test_utils::test_cmd::{TestCommand, TestCommandResult}; - use parking_lot::RwLock; + use parking_lot::{Mutex, RwLock}; use test_macros::abort_on_panic; use utils::task_manager::{tasks::TaskName, TaskManager}; use crate::{ - rpc::ProposeId, + rpc::{PoolEntry, ProposeId}, server::{ cmd_board::{CmdBoardRef, CommandBoard}, - gc::gc_cmd_board, + conflict::{spec_pool_new::SpeculativePool, test_pools::TestSpecPool}, + gc::gc_client_lease, + lease_manager::LeaseManager, }, }; @@ -75,48 +67,130 @@ mod tests { async fn cmd_board_gc_test() { let task_manager = TaskManager::new(); let board: CmdBoardRef = Arc::new(RwLock::new(CommandBoard::new())); - task_manager.spawn(TaskName::GcCmdBoard, |n| { - gc_cmd_board(Arc::clone(&board), Duration::from_millis(500), n) + let lease_manager = Arc::new(RwLock::new(LeaseManager::new())); + let lease_manager_c = Arc::clone(&lease_manager); + let sp = Arc::new(Mutex::new(SpeculativePool::new(vec![]))); + let sp_c = Arc::clone(&sp); + task_manager.spawn(TaskName::GcClientLease, |n| { + gc_client_lease( + lease_manager_c, + Arc::clone(&board), + sp_c, + Duration::from_millis(500), + n, + ) }); tokio::time::sleep(Duration::from_millis(100)).await; + let id1 = lease_manager + .write() + .grant(Some(Duration::from_millis(900))); + let id2 = lease_manager + .write() + .grant(Some(Duration::from_millis(900))); + let _ignore = board.write().tracker(id1).only_record(1); + let 
_ignore = board.write().tracker(id2).only_record(2); + sp.lock().insert(PoolEntry::new( + ProposeId(id1, 1), + Arc::new(TestCommand::default()), + )); + sp.lock().insert(PoolEntry::new( + ProposeId(id2, 2), + Arc::new(TestCommand::default()), + )); board .write() .er_buffer - .insert(ProposeId(1, 1), Ok(TestCommandResult::default())); + .insert(ProposeId(id1, 1), Ok(TestCommandResult::default())); tokio::time::sleep(Duration::from_millis(100)).await; board .write() .er_buffer - .insert(ProposeId(2, 2), Ok(TestCommandResult::default())); + .insert(ProposeId(id2, 2), Ok(TestCommandResult::default())); board .write() .asr_buffer - .insert(ProposeId(1, 1), Ok(0.into())); + .insert(ProposeId(id1, 1), Ok(0.into())); tokio::time::sleep(Duration::from_millis(100)).await; board .write() .asr_buffer - .insert(ProposeId(2, 2), Ok(0.into())); + .insert(ProposeId(id2, 2), Ok(0.into())); // at 600ms tokio::time::sleep(Duration::from_millis(400)).await; + let id3 = lease_manager + .write() + .grant(Some(Duration::from_millis(500))); board .write() .er_buffer - .insert(ProposeId(3, 3), Ok(TestCommandResult::default())); + .insert(ProposeId(id3, 3), Ok(TestCommandResult::default())); board .write() .asr_buffer - .insert(ProposeId(3, 3), Ok(0.into())); + .insert(ProposeId(id3, 3), Ok(0.into())); // at 1100ms, the first two kv should be removed tokio::time::sleep(Duration::from_millis(500)).await; let board = board.write(); assert_eq!(board.er_buffer.len(), 1); - assert_eq!(*board.er_buffer.get_index(0).unwrap().0, ProposeId(3, 3)); + assert_eq!(*board.er_buffer.get_index(0).unwrap().0, ProposeId(id3, 3)); assert_eq!(board.asr_buffer.len(), 1); - assert_eq!(*board.asr_buffer.get_index(0).unwrap().0, ProposeId(3, 3)); + assert_eq!(*board.asr_buffer.get_index(0).unwrap().0, ProposeId(id3, 3)); + task_manager.shutdown(true).await; + } + + #[tokio::test] + #[abort_on_panic] + async fn spec_gc_test() { + let task_manager = TaskManager::new(); + let board: CmdBoardRef = 
Arc::new(RwLock::new(CommandBoard::new())); + let lease_manager = Arc::new(RwLock::new(LeaseManager::new())); + let lease_manager_c = Arc::clone(&lease_manager); + let sp = Arc::new(Mutex::new(SpeculativePool::new(vec![Box::new( + TestSpecPool::default(), + )]))); + let sp_cloned = Arc::clone(&sp); + task_manager.spawn(TaskName::GcClientLease, |n| { + gc_client_lease( + lease_manager_c, + Arc::clone(&board), + sp_cloned, + Duration::from_millis(500), + n, + ) + }); + + tokio::time::sleep(Duration::from_millis(100)).await; + + let id1 = lease_manager + .write() + .grant(Some(Duration::from_millis(900))); + let id2 = lease_manager + .write() + .grant(Some(Duration::from_millis(2000))); + let _ignore = board.write().tracker(id1).only_record(1); + let cmd1 = Arc::new(TestCommand::new_put(vec![1], 1)); + sp.lock().insert(PoolEntry::new(ProposeId(id1, 1), cmd1)); + + tokio::time::sleep(Duration::from_millis(100)).await; + let _ignore = board.write().tracker(id1).only_record(2); + let cmd2 = Arc::new(TestCommand::new_put(vec![2], 1)); + sp.lock().insert(PoolEntry::new(ProposeId(id1, 2), cmd2)); + + // at 600ms + tokio::time::sleep(Duration::from_millis(400)).await; + let _ignore = board.write().tracker(id2).only_record(1); + let cmd3 = Arc::new(TestCommand::new_put(vec![3], 1)); + sp.lock() + .insert(PoolEntry::new(ProposeId(id2, 1), Arc::clone(&cmd3))); + + // at 1100ms, the first two kv should be removed + tokio::time::sleep(Duration::from_millis(500)).await; + let spec = sp.lock(); + assert_eq!(spec.len(), 1); + assert_eq!(spec.all(), vec![PoolEntry::new(ProposeId(id2, 1), cmd3)]); task_manager.shutdown(true).await; } } diff --git a/crates/curp/src/server/lease_manager.rs b/crates/curp/src/server/lease_manager.rs index 840e2fb07..2ac1b6fdc 100644 --- a/crates/curp/src/server/lease_manager.rs +++ b/crates/curp/src/server/lease_manager.rs @@ -1,8 +1,9 @@ -use std::{cmp::Reverse, ops::Add, sync::Arc, time::Duration}; +use std::{cmp::Reverse, collections::HashSet, 
ops::Add, sync::Arc, time::Duration}; use parking_lot::RwLock; use priority_queue::PriorityQueue; use tokio::time::Instant; +use tracing::info; /// Ref to lease manager pub(crate) type LeaseManagerRef = Arc>; @@ -15,7 +16,9 @@ pub(crate) struct LeaseManager { /// client_id => expired_at /// /// expiry queue to check the smallest expired_at - pub(super) expiry_queue: PriorityQueue>, + expiry_queue: PriorityQueue>, + /// Bypassed client ids + bypassed: HashSet, } impl LeaseManager { @@ -23,11 +26,15 @@ impl LeaseManager { pub(crate) fn new() -> Self { Self { expiry_queue: PriorityQueue::new(), + bypassed: HashSet::from([12345]), } } /// Check if the client is alive pub(crate) fn check_alive(&self, client_id: u64) -> bool { + if self.bypassed.contains(&client_id) { + return true; + } if let Some(expired_at) = self.expiry_queue.get(&client_id).map(|(_, v)| v.0) { expired_at > Instant::now() } else { @@ -36,44 +43,67 @@ impl LeaseManager { } /// Generate a new client id and grant a lease - pub(crate) fn grant(&mut self) -> u64 { + pub(crate) fn grant(&mut self, ttl: Option) -> u64 { let mut client_id: u64 = rand::random(); while self.expiry_queue.get(&client_id).is_some() { client_id = rand::random(); } - let expiry = Instant::now().add(DEFAULT_LEASE_TTL); - let _ig = self.expiry_queue.push(client_id, Reverse(expiry)); - // gc all expired client id while granting a new client id - self.gc_expired(); + let expiry = Instant::now().add(ttl.unwrap_or(DEFAULT_LEASE_TTL)); + _ = self.expiry_queue.push(client_id, Reverse(expiry)); client_id } /// GC the expired client ids - pub(crate) fn gc_expired(&mut self) { + pub(crate) fn gc_expired(&mut self) -> Vec { + let mut expired = Vec::new(); while let Some(expiry) = self.expiry_queue.peek().map(|(_, v)| v.0) { if expiry > Instant::now() { - return; + break; } - let _ig = self.expiry_queue.pop(); + let (id, _) = self + .expiry_queue + .pop() + .unwrap_or_else(|| unreachable!("Expiry queue should not be empty")); + 
expired.push(id); } + expired } /// Renew a client id - pub(crate) fn renew(&mut self, client_id: u64) { - let expiry = Instant::now().add(DEFAULT_LEASE_TTL); - let _ig = self + pub(crate) fn renew(&mut self, client_id: u64, ttl: Option) { + if self.bypassed.contains(&client_id) { + return; + } + let expiry = Instant::now().add(ttl.unwrap_or(DEFAULT_LEASE_TTL)); + _ = self .expiry_queue .change_priority(&client_id, Reverse(expiry)); } + /// Bypass a client id, which means the client is on the server + pub(crate) fn bypass(&mut self, client_id: u64) { + if self.bypassed.insert(client_id) { + info!("bypassed client_id: {}", client_id); + } + _ = self.expiry_queue.remove(&client_id); + } + /// Clear, called when leader retires pub(crate) fn clear(&mut self) { self.expiry_queue.clear(); + self.bypassed.clear(); + } + + /// Get the online clients count (excluding bypassed clients) + pub(crate) fn online_clients(&self) -> usize { + self.expiry_queue.len() } /// Revoke a lease pub(crate) fn revoke(&mut self, client_id: u64) { - let _ig = self.expiry_queue.remove(&client_id); + _ = self.expiry_queue.remove(&client_id); + _ = self.bypassed.remove(&client_id); + info!("revoked client_id: {}", client_id); } } @@ -85,7 +115,12 @@ mod test { fn test_basic_lease_manager() { let mut lm = LeaseManager::new(); - let client_id = lm.grant(); + let client_id = lm.grant(None); + assert!(lm.check_alive(client_id)); + lm.revoke(client_id); + assert!(!lm.check_alive(client_id)); + + lm.bypass(client_id); assert!(lm.check_alive(client_id)); lm.revoke(client_id); assert!(!lm.check_alive(client_id)); @@ -95,7 +130,7 @@ mod test { async fn test_lease_expire() { let mut lm = LeaseManager::new(); - let client_id = lm.grant(); + let client_id = lm.grant(None); assert!(lm.check_alive(client_id)); tokio::time::sleep(DEFAULT_LEASE_TTL).await; assert!(!lm.check_alive(client_id)); @@ -105,10 +140,10 @@ mod test { async fn test_renew_lease() { let mut lm = LeaseManager::new(); - let client_id =
lm.grant(); + let client_id = lm.grant(None); assert!(lm.check_alive(client_id)); tokio::time::sleep(DEFAULT_LEASE_TTL / 2).await; - lm.renew(client_id); + lm.renew(client_id, None); tokio::time::sleep(DEFAULT_LEASE_TTL / 2).await; assert!(lm.check_alive(client_id)); } diff --git a/crates/curp/src/server/metrics.rs b/crates/curp/src/server/metrics.rs index bcc9ba658..e0a9e31c1 100644 --- a/crates/curp/src/server/metrics.rs +++ b/crates/curp/src/server/metrics.rs @@ -120,8 +120,8 @@ impl Metrics { let sp_size = curp.spec_pool().lock().len(); observer.observe_u64(&sp_cnt, sp_size.numeric_cast(), &[]); - let client_ids = curp.lease_manager().read().expiry_queue.len(); - observer.observe_u64(&online_clients, client_ids.numeric_cast(), &[]); + let client_count = curp.lease_manager().read().online_clients(); + observer.observe_u64(&online_clients, client_count.numeric_cast(), &[]); let commit_index = curp.commit_index(); let last_log_index = curp.last_log_index(); diff --git a/crates/curp/src/server/mod.rs b/crates/curp/src/server/mod.rs index 33bf76b80..8ed11971f 100644 --- a/crates/curp/src/server/mod.rs +++ b/crates/curp/src/server/mod.rs @@ -1,6 +1,7 @@ use std::{fmt::Debug, sync::Arc}; use engine::SnapshotAllocator; +use flume::r#async::RecvStream; use tokio::sync::broadcast; #[cfg(not(madsim))] use tonic::transport::ClientTlsConfig; @@ -14,20 +15,25 @@ pub use self::{ conflict::{spec_pool_new::SpObject, uncommitted_pool::UcpObject}, raw_curp::RawCurp, }; +use crate::rpc::{OpResponse, RecordRequest, RecordResponse}; use crate::{ cmd::{Command, CommandExecutor}, members::{ClusterInfo, ServerId}, role_change::RoleChange, rpc::{ - AppendEntriesRequest, AppendEntriesResponse, FetchClusterRequest, FetchClusterResponse, - FetchReadStateRequest, FetchReadStateResponse, InstallSnapshotRequest, - InstallSnapshotResponse, LeaseKeepAliveMsg, MoveLeaderRequest, MoveLeaderResponse, - ProposeConfChangeRequest, ProposeConfChangeResponse, ProposeRequest, ProposeResponse, + 
connect::Bypass, AppendEntriesRequest, AppendEntriesResponse, FetchClusterRequest, + FetchClusterResponse, FetchReadStateRequest, FetchReadStateResponse, + InstallSnapshotRequest, InstallSnapshotResponse, LeaseKeepAliveMsg, MoveLeaderRequest, + MoveLeaderResponse, ProposeConfChangeRequest, ProposeConfChangeResponse, ProposeRequest, PublishRequest, PublishResponse, ShutdownRequest, ShutdownResponse, TriggerShutdownRequest, TriggerShutdownResponse, TryBecomeLeaderNowRequest, TryBecomeLeaderNowResponse, - VoteRequest, VoteResponse, WaitSyncedRequest, WaitSyncedResponse, + VoteRequest, VoteResponse, }, }; +use crate::{ + response::ResponseSender, + rpc::{ReadIndexRequest, ReadIndexResponse}, +}; /// Command worker to do execution and after sync mod cmd_worker; @@ -62,12 +68,12 @@ pub use storage::{db::DB, StorageApi, StorageError}; /// /// This Wrapper is introduced due to the `MadSim` rpc lib #[derive(Debug)] -pub struct Rpc { +pub struct Rpc, RC: RoleChange> { /// The inner server is wrapped in an Arc so that its state can be shared while cloning the rpc wrapper - inner: Arc>, + inner: Arc>, } -impl Clone for Rpc { +impl, RC: RoleChange> Clone for Rpc { #[inline] fn clone(&self) -> Self { Self { @@ -77,26 +83,51 @@ impl Clone for Rpc { } #[tonic::async_trait] -impl crate::rpc::Protocol for Rpc { - #[instrument(skip_all, name = "curp_propose")] - async fn propose( +impl, RC: RoleChange> crate::rpc::Protocol for Rpc { + type ProposeStreamStream = RecvStream<'static, Result>; + + #[instrument(skip_all, name = "propose_stream")] + async fn propose_stream( &self, request: tonic::Request, - ) -> Result, tonic::Status> { - request.metadata().extract_span(); + ) -> Result, tonic::Status> { + let bypassed = request.metadata().is_bypassed(); + let (tx, rx) = flume::bounded(2); + let resp_tx = Arc::new(ResponseSender::new(tx)); + self.inner + .propose_stream(&request.into_inner(), resp_tx, bypassed) + .await?; + + Ok(tonic::Response::new(rx.into_stream())) + } + + 
#[instrument(skip_all, name = "record")] + async fn record( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { Ok(tonic::Response::new( - self.inner.propose(request.into_inner()).await?, + self.inner.record(&request.into_inner())?, )) } + #[instrument(skip_all, name = "read_index")] + async fn read_index( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> { + Ok(tonic::Response::new(self.inner.read_index()?)) + } + #[instrument(skip_all, name = "curp_shutdown")] async fn shutdown( &self, request: tonic::Request, ) -> Result, tonic::Status> { + let bypassed = request.metadata().is_bypassed(); request.metadata().extract_span(); Ok(tonic::Response::new( - self.inner.shutdown(request.into_inner()).await?, + self.inner.shutdown(request.into_inner(), bypassed).await?, )) } @@ -105,9 +136,12 @@ impl crate::rpc::Protocol for Rpc { &self, request: tonic::Request, ) -> Result, tonic::Status> { + let bypassed = request.metadata().is_bypassed(); request.metadata().extract_span(); Ok(tonic::Response::new( - self.inner.propose_conf_change(request.into_inner()).await?, + self.inner + .propose_conf_change(request.into_inner(), bypassed) + .await?, )) } @@ -116,20 +150,10 @@ impl crate::rpc::Protocol for Rpc { &self, request: tonic::Request, ) -> Result, tonic::Status> { + let bypassed = request.metadata().is_bypassed(); request.metadata().extract_span(); Ok(tonic::Response::new( - self.inner.publish(request.into_inner())?, - )) - } - - #[instrument(skip_all, name = "curp_wait_synced")] - async fn wait_synced( - &self, - request: tonic::Request, - ) -> Result, tonic::Status> { - request.metadata().extract_span(); - Ok(tonic::Response::new( - self.inner.wait_synced(request.into_inner()).await?, + self.inner.publish(request.into_inner(), bypassed)?, )) } @@ -177,7 +201,9 @@ impl crate::rpc::Protocol for Rpc { } #[tonic::async_trait] -impl crate::rpc::InnerProtocol for Rpc { +impl, RC: RoleChange> crate::rpc::InnerProtocol + for Rpc +{ 
#[instrument(skip_all, name = "curp_append_entries")] async fn append_entries( &self, @@ -194,7 +220,7 @@ impl crate::rpc::InnerProtocol for Rpc { request: tonic::Request, ) -> Result, tonic::Status> { Ok(tonic::Response::new( - self.inner.vote(request.into_inner()).await?, + self.inner.vote(&request.into_inner())?, )) } @@ -230,7 +256,7 @@ impl crate::rpc::InnerProtocol for Rpc { } } -impl Rpc { +impl, RC: RoleChange> Rpc { /// New `Rpc` /// /// # Panics @@ -238,7 +264,7 @@ impl Rpc { /// Panic if storage creation failed #[inline] #[allow(clippy::too_many_arguments)] // TODO: refactor this use builder pattern - pub async fn new>( + pub async fn new( cluster_info: Arc, is_leader: bool, executor: Arc, @@ -287,7 +313,7 @@ impl Rpc { #[cfg(madsim)] #[allow(clippy::too_many_arguments)] #[inline] - pub async fn run_from_addr( + pub async fn run_from_addr( cluster_info: Arc, is_leader: bool, addr: std::net::SocketAddr, @@ -300,10 +326,7 @@ impl Rpc { client_tls_config: Option, sps: Vec>, ucps: Vec>, - ) -> Result<(), crate::error::ServerError> - where - CE: CommandExecutor, - { + ) -> Result<(), crate::error::ServerError> { use utils::task_manager::tasks::TaskName; use crate::rpc::{InnerProtocolServer, ProtocolServer}; diff --git a/crates/curp/src/server/raw_curp/log.rs b/crates/curp/src/server/raw_curp/log.rs index 062f432cf..5d25e3f3b 100644 --- a/crates/curp/src/server/raw_curp/log.rs +++ b/crates/curp/src/server/raw_curp/log.rs @@ -11,8 +11,7 @@ use std::{ use clippy_utilities::NumericCast; use itertools::Itertools; -use tokio::sync::mpsc; -use tracing::{error, warn}; +use tracing::warn; use crate::{ cmd::Command, @@ -119,8 +118,6 @@ pub(super) struct Log { pub(super) last_exe: LogIndex, /// Contexts of fallback log entries pub(super) fallback_contexts: HashMap>, - /// Tx to send log entries to persist task - log_tx: mpsc::UnboundedSender>>, /// Entries to keep in memory entries_cap: usize, } @@ -318,13 +315,12 @@ type ConfChangeEntries = Vec>>; /// Fallback indexes 
type type FallbackIndexes = HashSet; +/// Type returned when append success +type AppendSuccess = (Vec>>, ConfChangeEntries, FallbackIndexes); + impl Log { /// Create a new log - pub(super) fn new( - log_tx: mpsc::UnboundedSender>>, - batch_limit: u64, - entries_cap: usize, - ) -> Self { + pub(super) fn new(batch_limit: u64, entries_cap: usize) -> Self { Self { entries: VecDeque::with_capacity(entries_cap), batch_end: VecDeque::with_capacity(entries_cap), @@ -336,7 +332,6 @@ impl Log { base_term: 0, last_as: 0, last_exe: 0, - log_tx, fallback_contexts: HashMap::new(), entries_cap, } @@ -381,7 +376,8 @@ impl Log { entries: Vec>, prev_log_index: LogIndex, prev_log_term: u64, - ) -> Result<(ConfChangeEntries, FallbackIndexes), Vec>> { + ) -> Result, Vec>> { + let mut to_persist = Vec::with_capacity(entries.len()); let mut conf_changes = vec![]; let mut need_fallback_indexes = HashSet::new(); // check if entries can be appended @@ -427,17 +423,10 @@ impl Log { bincode::serialized_size(&entry).expect("log entry {entry:?} cannot be serialized"), ); - self.send_persist(entry); + to_persist.push(entry); } - Ok((conf_changes, need_fallback_indexes)) - } - - /// Send log entries to persist task - pub(super) fn send_persist(&self, entry: Arc>) { - if let Err(err) = self.log_tx.send(entry) { - error!("failed to send log to persist, {err}"); - } + Ok((to_persist, conf_changes, need_fallback_indexes)) } /// Check if the candidate's log is up-to-date @@ -452,18 +441,20 @@ impl Log { } /// Push a log entry into the end of log + // FIXME: persistent other log entries + // TODO: Avoid allocation during locking pub(super) fn push( &mut self, term: u64, propose_id: ProposeId, entry: impl Into>, - ) -> Result>, bincode::Error> { + ) -> Arc> { let index = self.last_log_index() + 1; let entry = Arc::new(LogEntry::new(index, term, propose_id, entry)); - let size = bincode::serialized_size(&entry)?; + let size = bincode::serialized_size(&entry) + .unwrap_or_else(|_| unreachable!("bindcode 
serialization should always succeed")); self.push_back(Arc::clone(&entry), size); - self.send_persist(Arc::clone(&entry)); - Ok(entry) + entry } /// check whether the log entry range [li,..) exceeds the batch limit or not @@ -619,9 +610,7 @@ mod tests { #[test] fn test_log_up_to_date() { - let (log_tx, _log_rx) = mpsc::unbounded_channel(); - let mut log = - Log::::new(log_tx, default_batch_max_size(), default_log_entries_cap()); + let mut log = Log::::new(default_batch_max_size(), default_log_entries_cap()); let result = log.try_append_entries( vec![ LogEntry::new(1, 1, ProposeId(0, 0), Arc::new(TestCommand::default())), @@ -641,9 +630,7 @@ mod tests { #[test] fn try_append_entries_will_remove_inconsistencies() { - let (log_tx, _log_rx) = mpsc::unbounded_channel(); - let mut log = - Log::::new(log_tx, default_batch_max_size(), default_log_entries_cap()); + let mut log = Log::::new(default_batch_max_size(), default_log_entries_cap()); let result = log.try_append_entries( vec![ LogEntry::new(1, 1, ProposeId(0, 1), Arc::new(TestCommand::default())), @@ -670,9 +657,7 @@ mod tests { #[test] fn try_append_entries_will_not_append() { - let (log_tx, _log_rx) = mpsc::unbounded_channel(); - let mut log = - Log::::new(log_tx, default_batch_max_size(), default_log_entries_cap()); + let mut log = Log::::new(default_batch_max_size(), default_log_entries_cap()); let result = log.try_append_entries( vec![LogEntry::new( 1, @@ -708,16 +693,14 @@ mod tests { #[test] fn get_from_should_success() { - let (tx, _rx) = mpsc::unbounded_channel(); - let mut log = - Log::::new(tx, default_batch_max_size(), default_log_entries_cap()); + let mut log = Log::::new(default_batch_max_size(), default_log_entries_cap()); // Note: this test must use the same test command to ensure the size of the entry is fixed let test_cmd = Arc::new(TestCommand::default()); let _res = repeat(Arc::clone(&test_cmd)) .take(10) .enumerate() - .map(|(idx, cmd)| log.push(1, ProposeId(0, idx.numeric_cast()), 
cmd).unwrap()) + .map(|(idx, cmd)| log.push(1, ProposeId(0, idx.numeric_cast()), cmd)) .collect::>(); let log_entry_size = log.entries[0].size; @@ -802,9 +785,7 @@ mod tests { ) }) .collect::>>(); - let (tx, _rx) = mpsc::unbounded_channel(); - let mut log = - Log::::new(tx, default_batch_max_size(), default_log_entries_cap()); + let mut log = Log::::new(default_batch_max_size(), default_log_entries_cap()); log.restore_entries(entries).unwrap(); assert_eq!(log.entries.len(), 10); @@ -813,12 +794,10 @@ mod tests { #[test] fn compact_test() { - let (log_tx, _log_rx) = mpsc::unbounded_channel(); - let mut log = Log::::new(log_tx, default_batch_max_size(), 10); + let mut log = Log::::new(default_batch_max_size(), 10); for i in 0..30 { - log.push(0, ProposeId(0, i), Arc::new(TestCommand::default())) - .unwrap(); + log.push(0, ProposeId(0, i), Arc::new(TestCommand::default())); } log.last_as = 22; log.last_exe = 22; @@ -831,11 +810,9 @@ mod tests { #[test] fn get_from_should_success_after_compact() { - let (log_tx, _log_rx) = mpsc::unbounded_channel(); - let mut log = Log::::new(log_tx, default_batch_max_size(), 10); + let mut log = Log::::new(default_batch_max_size(), 10); for i in 0..30 { - log.push(0, ProposeId(0, i), Arc::new(TestCommand::default())) - .unwrap(); + log.push(0, ProposeId(0, i), Arc::new(TestCommand::default())); } let log_entry_size = log.entries[0].size; log.set_batch_limit(2 * log_entry_size); @@ -871,8 +848,7 @@ mod tests { #[test] fn batch_info_should_update_correctly_after_truncated() { - let (log_tx, _log_rx) = mpsc::unbounded_channel(); - let mut log = Log::::new(log_tx, 11, 10); + let mut log = Log::::new(11, 10); let mock_entries_sizes = vec![1, 5, 6, 2, 3, 4, 5]; let test_cmd = Arc::new(TestCommand::default()); diff --git a/crates/curp/src/server/raw_curp/mod.rs b/crates/curp/src/server/raw_curp/mod.rs index e3f24d22b..fd367400f 100644 --- a/crates/curp/src/server/raw_curp/mod.rs +++ b/crates/curp/src/server/raw_curp/mod.rs @@ -10,7 +10,7 @@ 
#![allow(clippy::arithmetic_side_effects)] // u64 is large enough and won't overflow use std::{ - cmp::min, + cmp::{self, min}, collections::{HashMap, HashSet}, fmt::Debug, sync::{ @@ -23,10 +23,11 @@ use clippy_utilities::{NumericCast, OverflowArithmetic}; use dashmap::DashMap; use derive_builder::Builder; use event_listener::Event; +use futures::Future; use itertools::Itertools; use opentelemetry::KeyValue; use parking_lot::{Mutex, RwLock, RwLockUpgradableReadGuard, RwLockWriteGuard}; -use tokio::sync::{broadcast, mpsc, oneshot}; +use tokio::sync::{broadcast, oneshot}; #[cfg(not(madsim))] use tonic::transport::ClientTlsConfig; use tracing::{ @@ -37,6 +38,7 @@ use tracing::{ #[cfg(madsim)] use utils::ClientTlsConfig; use utils::{ + barrier::IdBarrier, config::CurpConfig, parking_lot_lock::{MutexMap, RwLockMap}, task_manager::TaskManager, @@ -47,8 +49,9 @@ use self::{ state::{CandidateState, LeaderState, State}, }; use super::{ - cmd_worker::CEEventTxApi, + cmd_board::CommandBoard, conflict::{spec_pool_new::SpeculativePool, uncommitted_pool::UncommittedPool}, + curp_node::TaskType, lease_manager::LeaseManagerRef, storage::StorageApi, DB, @@ -58,11 +61,12 @@ use crate::{ log_entry::{EntryData, LogEntry}, members::{ClusterInfo, ServerId}, quorum, recover_quorum, + response::ResponseSender, role_change::RoleChange, rpc::{ connect::{InnerConnectApi, InnerConnectApiWrapper}, ConfChange, ConfChangeType, CurpError, IdSet, Member, PoolEntry, ProposeId, PublishRequest, - ReadState, + ReadState, Redirect, }, server::{ cmd_board::CmdBoardRef, @@ -119,10 +123,6 @@ pub(super) struct RawCurpArgs { lease_manager: LeaseManagerRef, /// Config cfg: Arc, - /// Tx to send cmds to execute and do after sync - cmd_tx: Arc>, - /// Tx to send log entries - log_tx: mpsc::UnboundedSender>>, /// Role change callback role_change: RC, /// Task manager @@ -149,6 +149,12 @@ pub(super) struct RawCurpArgs { spec_pool: Arc>>, /// Uncommitted pool uncommitted_pool: Arc>>, + /// Tx to send entries to 
after_sync + as_tx: flume::Sender>, + /// Response Senders + resp_txs: Arc>>>, + /// Barrier for waiting unsynced commands + id_barrier: Arc>, } impl RawCurpBuilder { @@ -162,18 +168,13 @@ impl RawCurpBuilder { )); let lst = LeaderState::new(&args.cluster_info.peers_ids()); let cst = Mutex::new(CandidateState::new(args.cluster_info.all_ids().into_iter())); - let log = RwLock::new(Log::new( - args.log_tx, - args.cfg.batch_max_size, - args.cfg.log_entries_cap, - )); + let log = RwLock::new(Log::new(args.cfg.batch_max_size, args.cfg.log_entries_cap)); let ctx = Context::builder() .cluster_info(args.cluster_info) .cb(args.cmd_board) .lm(args.lease_manager) .cfg(args.cfg) - .cmd_tx(args.cmd_tx) .sync_events(args.sync_events) .role_change(args.role_change) .connects(args.connects) @@ -181,6 +182,9 @@ impl RawCurpBuilder { .client_tls_config(args.client_tls_config) .spec_pool(args.spec_pool) .uncommitted_pool(args.uncommitted_pool) + .as_tx(args.as_tx) + .resp_txs(args.resp_txs) + .id_barrier(args.id_barrier) .build() .map_err(|e| match e { ContextBuilderError::UninitializedField(s) => { @@ -294,6 +298,10 @@ enum Role { } /// Relevant context for Curp +/// +/// WARN: To avoid deadlock, the lock order should be: +/// 1. `spec_pool` +/// 2. 
`uncommitted_pool` #[derive(Builder)] #[builder(build_fn(skip))] struct Context { @@ -313,8 +321,6 @@ struct Context { /// Election tick #[builder(setter(skip))] election_tick: AtomicU8, - /// Tx to send cmds to execute and do after sync - cmd_tx: Arc>, /// Followers sync event trigger sync_events: DashMap>, /// Become leader event @@ -339,6 +345,13 @@ struct Context { spec_pool: Arc>>, /// Uncommitted pool uncommitted_pool: Arc>>, + /// Tx to send entries to after_sync + as_tx: flume::Sender>, + /// Response Senders + // TODO: this could be replaced by a queue + resp_txs: Arc>>>, + /// Barrier for waiting unsynced commands + id_barrier: Arc>, } impl Context { @@ -371,10 +384,6 @@ impl ContextBuilder { }, leader_tx: broadcast::channel(1).0, election_tick: AtomicU8::new(0), - cmd_tx: match self.cmd_tx.take() { - Some(value) => value, - None => return Err(ContextBuilderError::UninitializedField("cmd_tx")), - }, sync_events: match self.sync_events.take() { Some(value) => value, None => return Err(ContextBuilderError::UninitializedField("sync_events")), @@ -407,6 +416,18 @@ impl ContextBuilder { Some(value) => value, None => return Err(ContextBuilderError::UninitializedField("uncommitted_pool")), }, + as_tx: match self.as_tx.take() { + Some(value) => value, + None => return Err(ContextBuilderError::UninitializedField("as_tx")), + }, + resp_txs: match self.resp_txs.take() { + Some(value) => value, + None => return Err(ContextBuilderError::UninitializedField("resp_txs")), + }, + id_barrier: match self.id_barrier.take() { + Some(value) => value, + None => return Err(ContextBuilderError::UninitializedField("id_barrier")), + }, }) } } @@ -455,72 +476,158 @@ impl RawCurp { } } +/// Term, entries +type AppendEntriesSuccess = (u64, Vec>>); +/// Term, index +type AppendEntriesFailure = (u64, LogIndex); + // Curp handlers +// TODO: Tidy up the handlers +// Possible improvements: +// * split metrics collection from CurpError into a separate function +// * split the handlers into 
separate modules impl RawCurp { - /// Handle `propose` request - /// Return `true` if the leader speculatively executed the command - pub(super) fn handle_propose( - &self, - propose_id: ProposeId, - cmd: Arc, - ) -> Result { - debug!("{} gets proposal for cmd({})", self.id(), propose_id); - let mut conflict = self - .ctx - .spec_pool - .map_lock(|mut sp_l| sp_l.insert(PoolEntry::new(propose_id, Arc::clone(&cmd)))) - .is_some(); - + /// Checks the if term are up-to-date + pub(super) fn check_term(&self, term: u64) -> Result<(), CurpError> { let st_r = self.st.read(); - // Non-leader doesn't need to sync or execute - if st_r.role != Role::Leader { - if conflict { - metrics::get() - .proposals_failed - .add(1, &[KeyValue::new("reason", "follower key conflict")]); - return Err(CurpError::key_conflict()); - } - return Ok(false); - } - if self.lst.get_transferee().is_some() { - return Err(CurpError::LeaderTransfer("leader transferring".to_owned())); + + // Rejects the request + // When `st_r.term > term`, the client is using an outdated leader + // When `st_r.term < term`, the current node is a zombie + match st_r.term.cmp(&term) { + // Current node is a zombie + cmp::Ordering::Less => Err(CurpError::Zombie(())), + cmp::Ordering::Greater => Err(CurpError::Redirect(Redirect { + leader_id: st_r.leader_id.map(Into::into), + term: st_r.term, + })), + cmp::Ordering::Equal => Ok(()), } - if !self + } + + /// Handles record + pub(super) fn follower_record(&self, propose_id: ProposeId, cmd: &Arc) -> bool { + let conflict = self .ctx - .cb - .map_write(|mut cb_w| cb_w.sync.insert(propose_id)) - { + .spec_pool + .lock() + .insert(PoolEntry::new(propose_id, Arc::clone(cmd))) + .is_some(); + if conflict { metrics::get() .proposals_failed - .add(1, &[KeyValue::new("reason", "duplicated proposal")]); - return Err(CurpError::duplicated()); + .add(1, &[KeyValue::new("reason", "follower key conflict")]); } + conflict + } - // leader also needs to check if the cmd conflicts un-synced 
commands - conflict |= self - .ctx - .uncommitted_pool - .map_lock(|mut ucp_l| ucp_l.insert(PoolEntry::new(propose_id, Arc::clone(&cmd)))); + /// Handles record + pub(super) fn leader_record(&self, entries: impl Iterator>) -> Vec { + let mut sp_l = self.ctx.spec_pool.lock(); + let mut ucp_l = self.ctx.uncommitted_pool.lock(); + let mut conflicts = Vec::new(); + for entry in entries { + let mut conflict = sp_l.insert(entry.clone()).is_some(); + conflict |= ucp_l.insert(&entry); + conflicts.push(conflict); + } + metrics::get().proposals_failed.add( + conflicts.iter().filter(|c| **c).count().numeric_cast(), + &[KeyValue::new("reason", "leader key conflict")], + ); + conflicts + } + /// Handles leader propose + pub(super) fn push_logs( + &self, + proposes: Vec<(Arc, ProposeId, u64, Arc)>, + ) -> Vec>> { + let term = proposes + .first() + .unwrap_or_else(|| unreachable!("no propose in proposes")) + .2; + let mut log_entries = Vec::with_capacity(proposes.len()); + let mut to_process = Vec::with_capacity(proposes.len()); let mut log_w = self.log.write(); - let entry = log_w.push(st_r.term, propose_id, cmd).map_err(|e| { - metrics::get() - .proposals_failed - .add(1, &[KeyValue::new("reason", "log serialize failed")]); - e - })?; - debug!("{} gets new log[{}]", self.id(), entry.index); + self.ctx.resp_txs.map_lock(|mut tx_map| { + for propose in proposes { + let (cmd, id, _term, resp_tx) = propose; + let entry = log_w.push(term, id, cmd); + let index = entry.index; + let conflict = resp_tx.is_conflict(); + to_process.push((index, conflict)); + log_entries.push(entry); + assert!( + tx_map.insert(index, Arc::clone(&resp_tx)).is_none(), + "Should not insert resp_tx twice" + ); + } + }); + self.entry_process_multi(&mut log_w, &to_process, term); - self.entry_process(&mut log_w, entry, conflict, st_r.term); + let log_r = RwLockWriteGuard::downgrade(log_w); + self.persistent_log_entries( + &log_entries.iter().map(Arc::as_ref).collect::>(), + &log_r, + ); - if conflict { - 
metrics::get() - .proposals_failed - .add(1, &[KeyValue::new("reason", "leader key conflict")]); - return Err(CurpError::key_conflict()); + log_entries + } + + /// Persistent log entries + /// + /// NOTE: A `&Log` is required because we do not want the `Log` structure gets mutated + /// during the persistent + #[allow(clippy::panic)] + #[allow(dropping_references)] + fn persistent_log_entries(&self, entries: &[&LogEntry], _log: &Log) { + // We panic when the log persistence fails because it likely indicates an unrecoverable error. + // Our WAL implementation does not support rollback on failure, as a file write syscall is not + // guaranteed to be atomic. + if let Err(e) = self.ctx.curp_storage.put_log_entries(entries) { + panic!("log persistent failed: {e}"); } + } - Ok(true) + /// Wait synced for all conflict commands + pub(super) fn wait_conflicts_synced(&self, cmd: Arc) -> impl Future + Send { + let conflict_cmds: Vec<_> = self + .ctx + .uncommitted_pool + .lock() + .all_conflict(&PoolEntry::new(ProposeId::default(), cmd)) + .into_iter() + .map(|e| e.id) + .collect(); + self.ctx.id_barrier.wait_all(conflict_cmds) + } + + /// Wait all logs in previous term have been applied to state machine + pub(super) fn wait_no_op_applied(&self) -> Box + Send + Unpin> { + // if the leader is at term 1, it won't commit a no-op log + if self.term() == 1 { + return Box::new(futures::future::ready(())); + } + Box::new(self.lst.wait_no_op_applied()) + } + + /// Sets the no-op log as applied + pub(super) fn set_no_op_applied(&self) { + self.lst.set_no_op_applied(); + } + + /// Trigger the barrier of the given inflight id. 
+ pub(super) fn trigger(&self, propose_id: &ProposeId) { + self.ctx.id_barrier.trigger(propose_id); + } + + /// Returns `CurpError::LeaderTransfer` if the leadership is transferring + pub(super) fn check_leader_transfer(&self) -> Result<(), CurpError> { + if self.lst.get_transferee().is_some() { + return Err(CurpError::LeaderTransfer("leader transferring".to_owned())); + } + Ok(()) } /// Handle `shutdown` request @@ -532,17 +639,15 @@ impl RawCurp { if self.lst.get_transferee().is_some() { return Err(CurpError::LeaderTransfer("leader transferring".to_owned())); } + self.deduplicate(propose_id, None)?; let mut log_w = self.log.write(); - let entry = log_w - .push(st_r.term, propose_id, EntryData::Shutdown) - .map_err(|e| { - metrics::get() - .proposals_failed - .add(1, &[KeyValue::new("reason", "log serialize failed")]); - e - })?; + let entry = log_w.push(st_r.term, propose_id, EntryData::Shutdown); debug!("{} gets new log[{}]", self.id(), entry.index); - self.entry_process(&mut log_w, entry, true, st_r.term); + self.entry_process_single(&mut log_w, entry.as_ref(), true, st_r.term); + + let log_r = RwLockWriteGuard::downgrade(log_w); + self.persistent_log_entries(&[entry.as_ref()], &log_r); + Ok(()) } @@ -568,37 +673,25 @@ impl RawCurp { } self.check_new_config(&conf_changes)?; - let mut conflict = self - .ctx - .spec_pool - .lock() - .insert(PoolEntry::new(propose_id, conf_changes.clone())) - .is_some(); - conflict |= self - .ctx - .uncommitted_pool - .lock() - .insert(PoolEntry::new(propose_id, conf_changes.clone())); - + self.deduplicate(propose_id, None)?; let mut log_w = self.log.write(); - let entry = log_w - .push(st_r.term, propose_id, conf_changes.clone()) - .map_err(|e| { - metrics::get() - .proposals_failed - .add(1, &[KeyValue::new("reason", "log serialize failed")]); - e - })?; + let entry = log_w.push(st_r.term, propose_id, conf_changes.clone()); debug!("{} gets new log[{}]", self.id(), entry.index); - let (addrs, name, is_learner) = 
self.apply_conf_change(conf_changes); + let apply_opt = self.apply_conf_change(conf_changes); self.ctx .last_conf_change_idx .store(entry.index, Ordering::Release); - let _ig = log_w.fallback_contexts.insert( - entry.index, - FallbackContext::new(Arc::clone(&entry), addrs, name, is_learner), - ); - self.entry_process(&mut log_w, entry, conflict, st_r.term); + if let Some((addrs, name, is_learner)) = apply_opt { + let _ig = log_w.fallback_contexts.insert( + entry.index, + FallbackContext::new(Arc::clone(&entry), addrs, name, is_learner), + ); + } + self.entry_process_single(&mut log_w, &entry, false, st_r.term); + + let log_r = RwLockWriteGuard::downgrade(log_w); + self.persistent_log_entries(&[entry.as_ref()], &log_r); + Ok(()) } @@ -616,15 +709,17 @@ impl RawCurp { if self.lst.get_transferee().is_some() { return Err(CurpError::leader_transfer("leader transferring")); } + + self.deduplicate(req.propose_id(), None)?; + let mut log_w = self.log.write(); - let entry = log_w.push(st_r.term, req.propose_id(), req).map_err(|e| { - metrics::get() - .proposals_failed - .add(1, &[KeyValue::new("reason", "log serialize failed")]); - e - })?; + let entry = log_w.push(st_r.term, req.propose_id(), req); debug!("{} gets new log[{}]", self.id(), entry.index); - self.entry_process(&mut log_w, entry, false, st_r.term); + self.entry_process_single(&mut log_w, entry.as_ref(), false, st_r.term); + + let log_r = RwLockWriteGuard::downgrade(log_w); + self.persistent_log_entries(&[entry.as_ref()], &log_r); + Ok(()) } @@ -632,20 +727,20 @@ impl RawCurp { pub(super) fn handle_lease_keep_alive(&self, client_id: u64) -> Option { let mut lm_w = self.ctx.lm.write(); if client_id == 0 { - return Some(lm_w.grant()); + return Some(lm_w.grant(None)); } if lm_w.check_alive(client_id) { - lm_w.renew(client_id); + lm_w.renew(client_id, None); None } else { metrics::get().client_id_revokes.add(1, &[]); lm_w.revoke(client_id); - Some(lm_w.grant()) + Some(lm_w.grant(None)) } } /// Handle 
`append_entries` - /// Return `Ok(term)` if succeeds + /// Return `Ok(term, entries)` if succeeds /// Return `Err(term, hint_index)` if fails pub(super) fn handle_append_entries( &self, @@ -655,7 +750,7 @@ impl RawCurp { prev_log_term: u64, entries: Vec>, leader_commit: LogIndex, - ) -> Result { + ) -> Result, AppendEntriesFailure> { if entries.is_empty() { trace!( "{} received heartbeat from {}: term({}), commit({}), prev_log_index({}), prev_log_term({})", @@ -692,7 +787,7 @@ impl RawCurp { // append log entries let mut log_w = self.log.write(); - let (cc_entries, fallback_indexes) = log_w + let (to_persist, cc_entries, fallback_indexes) = log_w .try_append_entries(entries, prev_log_index, prev_log_term) .map_err(|_ig| (term, log_w.commit_index + 1))?; // fallback overwritten conf change entries @@ -711,7 +806,9 @@ impl RawCurp { let EntryData::ConfChange(ref cc) = e.entry_data else { unreachable!("cc_entry should be conf change entry"); }; - let (addrs, name, is_learner) = self.apply_conf_change(cc.clone()); + let Some((addrs, name, is_learner)) = self.apply_conf_change(cc.clone()) else { + continue; + }; let _ig = log_w.fallback_contexts.insert( e.index, FallbackContext::new(Arc::clone(&e), addrs, name, is_learner), @@ -723,7 +820,7 @@ impl RawCurp { if prev_commit_index < log_w.commit_index { self.apply(&mut *log_w); } - Ok(term) + Ok((term, to_persist)) } /// Handle `append_entries` response @@ -952,7 +1049,8 @@ impl RawCurp { let prev_last_log_index = log_w.last_log_index(); // TODO: Generate client id in the same way as client let propose_id = ProposeId(rand::random(), 0); - let _ignore = log_w.push(st_w.term, propose_id, EntryData::Empty); + let entry = log_w.push(st_w.term, propose_id, EntryData::Empty); + self.persistent_log_entries(&[&entry], &log_w); self.recover_from_spec_pools(&st_w, &mut log_w, spec_pools); self.recover_ucp_from_log(&log_w); let last_log_index = log_w.last_log_index(); @@ -1059,7 +1157,7 @@ impl RawCurp { let ids: Vec<_> = self .ctx 
.uncommitted_pool - .map_lock(|ucp| ucp.all_conflict(PoolEntry::new(ProposeId::default(), cmd))) + .map_lock(|ucp| ucp.all_conflict(&PoolEntry::new(ProposeId::default(), cmd))) .into_iter() .map(|entry| entry.id) .collect(); @@ -1195,12 +1293,15 @@ impl RawCurp { // the leader will take a snapshot itself every time `sync` is called in effort to // calibrate it. Since taking a snapshot will block the leader's execute workers, we should // not take snapshot so often. A better solution would be to keep a snapshot cache. - Some(SyncAction::Snapshot(self.ctx.cmd_tx.send_snapshot( - SnapshotMeta { - last_included_index: entry.index, - last_included_term: entry.term, - }, - ))) + let meta = SnapshotMeta { + last_included_index: entry.index, + last_included_term: entry.term, + }; + let (tx, rx) = oneshot::channel(); + if let Err(e) = self.ctx.as_tx.send(TaskType::Snapshot(meta, tx)) { + error!("failed to send task to after sync: {e}"); + } + Some(SyncAction::Snapshot(rx)) } else { let (prev_log_index, prev_log_term) = log_r.get_prev_entry_info(next_index); let entries = log_r.get_from(next_index); @@ -1253,13 +1354,13 @@ impl RawCurp { } /// Get a reference to spec pool - pub(super) fn spec_pool(&self) -> Arc>> { - Arc::clone(&self.ctx.spec_pool) + pub(super) fn spec_pool(&self) -> &Mutex> { + &self.ctx.spec_pool } /// Get a reference to uncommitted pool - pub(super) fn uncommitted_pool(&self) -> Arc>> { - Arc::clone(&self.ctx.uncommitted_pool) + pub(super) fn uncommitted_pool(&self) -> &Mutex> { + &self.ctx.uncommitted_pool } /// Get sync event @@ -1361,7 +1462,7 @@ impl RawCurp { pub(super) fn apply_conf_change( &self, changes: Vec, - ) -> (Vec, String, bool) { + ) -> Option<(Vec, String, bool)> { assert_eq!(changes.len(), 1, "Joint consensus is not supported yet"); let Some(conf_change) = changes.into_iter().next() else { unreachable!("conf change is empty"); @@ -1392,6 +1493,7 @@ impl RawCurp { unreachable!("conf change is empty"); }; let node_id = conf_change.node_id; 
+ #[allow(clippy::explicit_auto_deref)] // Avoid compiler complaint about `Dashmap::Ref` type let fallback_change = match conf_change.change_type() { ConfChangeType::Add | ConfChangeType::AddLearner => { self.cst @@ -1422,7 +1524,7 @@ impl RawCurp { let m = self.ctx.cluster_info.get(&node_id).unwrap_or_else(|| { unreachable!("node {} should exist in cluster info", node_id) }); - let _ig = self.ctx.curp_storage.put_member(&m); + let _ig = self.ctx.curp_storage.put_member(&*m); Some(ConfChange::update(node_id, old_addrs)) } ConfChangeType::Promote => { @@ -1435,7 +1537,7 @@ impl RawCurp { let m = self.ctx.cluster_info.get(&node_id).unwrap_or_else(|| { unreachable!("node {} should exist in cluster info", node_id) }); - let _ig = self.ctx.curp_storage.put_member(&m); + let _ig = self.ctx.curp_storage.put_member(&*m); None } }; @@ -1517,6 +1619,12 @@ impl RawCurp { None } + /// Mark a client id as bypassed + pub(super) fn mark_client_id_bypassed(&self, client_id: u64) { + let mut lm_w = self.ctx.lm.write(); + lm_w.bypass(client_id); + } + /// Get client tls config pub(super) fn client_tls_config(&self) -> Option<&ClientTlsConfig> { self.ctx.client_tls_config.as_ref() @@ -1725,24 +1833,24 @@ impl RawCurp { }) .collect_vec(); - let mut cb_w = self.ctx.cb.write(); let mut sp_l = self.ctx.spec_pool.lock(); let term = st.term; + let mut entries = vec![]; for entry in recovered_cmds { - let _ig_sync = cb_w.sync.insert(entry.id); // may have been inserted before let _ig_spec = sp_l.insert(entry.clone()); // may have been inserted before #[allow(clippy::expect_used)] - let entry = log - .push(term, entry.id, entry.inner) - .expect("cmd {cmd:?} cannot be serialized"); + let entry = log.push(term, entry.id, entry.cmd); debug!( "{} recovers speculatively executed cmd({}) in log[{}]", self.id(), entry.propose_id, entry.index, ); + entries.push(entry); } + + self.persistent_log_entries(&entries.iter().map(Arc::as_ref).collect::>(), log); } /// Recover the ucp from uncommitted log 
entries @@ -1756,18 +1864,20 @@ impl RawCurp { let propose_id = entry.propose_id; match entry.entry_data { EntryData::Command(ref cmd) => { - let _ignore = ucp_l.insert(PoolEntry::new(propose_id, Arc::clone(cmd))); + let _ignore = ucp_l.insert(&PoolEntry::new(propose_id, Arc::clone(cmd))); } - EntryData::ConfChange(ref conf_change) => { - let _ignore = ucp_l.insert(PoolEntry::new(propose_id, conf_change.clone())); - } - EntryData::Shutdown | EntryData::Empty | EntryData::SetNodeState(_, _, _) => {} + EntryData::ConfChange(_) + | EntryData::Shutdown + | EntryData::Empty + | EntryData::SetNodeState(_, _, _) => {} } } } /// Apply new logs fn apply(&self, log: &mut Log) { + let mut entries = Vec::new(); + let mut resp_txs_l = self.ctx.resp_txs.lock(); for i in (log.last_as + 1)..=log.commit_index { let entry = log.get(i).unwrap_or_else(|| { unreachable!( @@ -1775,7 +1885,8 @@ impl RawCurp { log.last_log_index() ) }); - self.ctx.cmd_tx.send_after_sync(Arc::clone(entry)); + let tx = resp_txs_l.remove(&i); + entries.push((Arc::clone(entry), tx)); log.last_as = i; if log.last_exe < log.last_as { log.last_exe = log.last_as; @@ -1787,6 +1898,8 @@ impl RawCurp { i ); } + debug!("sending {} entries to after sync task", entries.len()); + let _ignore = self.ctx.as_tx.send(TaskType::Entries(entries)); log.compact(); } @@ -1796,12 +1909,18 @@ impl RawCurp { self.ctx.cb.write().clear(); self.ctx.lm.write().clear(); self.ctx.uncommitted_pool.lock().clear(); + self.lst.reset_no_op_state(); } /// Switch to a new config and return old member infos for fallback - fn switch_config(&self, conf_change: ConfChange) -> (Vec, String, bool) { + /// + /// FIXME: The state of `ctx.cluster_info` might be inconsistent with the log. 
A potential + /// fix would be to include the entire cluster info in the conf change log entry and + /// overwrite `ctx.cluster_info` when switching + fn switch_config(&self, conf_change: ConfChange) -> Option<(Vec, String, bool)> { let node_id = conf_change.node_id; let mut cst_l = self.cst.lock(); + #[allow(clippy::explicit_auto_deref)] // Avoid compiler complaint about `Dashmap::Ref` type let (modified, fallback_info) = match conf_change.change_type() { ConfChangeType::Add | ConfChangeType::AddLearner => { let is_learner = matches!(conf_change.change_type(), ConfChangeType::AddLearner); @@ -1811,7 +1930,7 @@ impl RawCurp { _ = self.ctx.sync_events.insert(node_id, Arc::new(Event::new())); let _ig = self.ctx.curp_storage.put_member(&member); let m = self.ctx.cluster_info.insert(member); - (m.is_none(), (vec![], String::new(), is_learner)) + (m.is_none(), Some((vec![], String::new(), is_learner))) } ConfChangeType::Remove => { _ = cst_l.config.remove(node_id); @@ -1819,16 +1938,15 @@ impl RawCurp { _ = self.ctx.sync_events.remove(&node_id); _ = self.ctx.connects.remove(&node_id); let _ig = self.ctx.curp_storage.remove_member(node_id); - let m = self.ctx.cluster_info.remove(&node_id); - let removed_member = - m.unwrap_or_else(|| unreachable!("the member should exist before remove")); + // The member may not exist because the node could be restarted + // and has fetched the newest cluster info + // + // TODO: Review all the usages of `ctx.cluster_info` to ensure all + // the assertions are correct. 
+ let member_opt = self.ctx.cluster_info.remove(&node_id); ( true, - ( - removed_member.peer_urls, - removed_member.name, - removed_member.is_learner, - ), + member_opt.map(|m| (m.peer_urls, m.name, m.is_learner)), ) } ConfChangeType::Update => { @@ -1839,10 +1957,10 @@ impl RawCurp { let m = self.ctx.cluster_info.get(&node_id).unwrap_or_else(|| { unreachable!("the member should exist after update"); }); - let _ig = self.ctx.curp_storage.put_member(&m); + let _ig = self.ctx.curp_storage.put_member(&*m); ( old_addrs != conf_change.address, - (old_addrs, String::new(), false), + Some((old_addrs, String::new(), false)), ) } ConfChangeType::Promote => { @@ -1853,55 +1971,118 @@ impl RawCurp { let m = self.ctx.cluster_info.get(&node_id).unwrap_or_else(|| { unreachable!("the member should exist after promote"); }); - let _ig = self.ctx.curp_storage.put_member(&m); - (modified, (vec![], String::new(), false)) + let _ig = self.ctx.curp_storage.put_member(&*m); + (modified, Some((vec![], String::new(), false))) } }; if modified { self.ctx.cluster_info.cluster_version_update(); } - if self.is_leader() { - self.ctx - .change_tx - .send(conf_change) - .unwrap_or_else(|_e| unreachable!("change_rx should not be dropped")); - if self + self.ctx + .change_tx + .send(conf_change) + .unwrap_or_else(|_e| unreachable!("change_rx should not be dropped")); + // TODO: We could wrap lst inside a role checking to prevent accidental lst mutation + if self.is_leader() + && self .lst .get_transferee() .is_some_and(|transferee| !cst_l.config.voters().contains(&transferee)) - { - self.lst.reset_transferee(); - } + { + self.lst.reset_transferee(); } fallback_info } + /// Notify sync events + fn notify_sync_events(&self, log: &Log) { + self.ctx.sync_events.iter().for_each(|e| { + if let Some(next) = self.lst.get_next_index(*e.key()) { + if next > log.base_index && log.has_next_batch(next) { + let _ignore = e.notify(1); + } + } + }); + } + + /// Update index in single node cluster + fn 
update_index_single_node(&self, log: &mut Log, index: u64, term: u64) { + // check if commit_index needs to be updated + if self.can_update_commit_index_to(log, index, term) && index > log.commit_index { + log.commit_to(index); + debug!("{} updates commit index to {index}", self.id()); + self.apply(&mut *log); + } + } + /// Entry process shared by `handle_xxx` - fn entry_process( + #[allow(clippy::pattern_type_mismatch)] // Can't be fixed + fn entry_process_multi(&self, log: &mut Log, entries: &[(u64, bool)], term: u64) { + if let Some(last_no_conflict) = entries + .iter() + .rev() + .find(|(_, conflict)| *conflict) + .map(|(index, _)| *index) + { + log.last_exe = last_no_conflict; + } + let highest_index = entries + .last() + .unwrap_or_else(|| unreachable!("no log in entries")) + .0; + self.notify_sync_events(log); + self.update_index_single_node(log, highest_index, term); + } + + /// Entry process shared by `handle_xxx` + fn entry_process_single( &self, log_w: &mut RwLockWriteGuard<'_, Log>, - entry: Arc>, + entry: &LogEntry, conflict: bool, term: u64, ) { let index = entry.index; if !conflict { log_w.last_exe = index; - self.ctx.cmd_tx.send_sp_exe(entry); } - self.ctx.sync_events.iter().for_each(|e| { - if let Some(next) = self.lst.get_next_index(*e.key()) { - if next > log_w.base_index && log_w.has_next_batch(next) { - let _ignore = e.notify(1); + self.notify_sync_events(log_w); + self.update_index_single_node(log_w, index, term); + } + + /// Process deduplication and acknowledge the `first_incomplete` for this client id + pub(crate) fn deduplicate( + &self, + ProposeId(client_id, seq_num): ProposeId, + first_incomplete: Option, + ) -> Result<(), CurpError> { + // deduplication + if self.ctx.lm.read().check_alive(client_id) { + let mut cb_w = self.ctx.cb.write(); + let tracker = cb_w.tracker(client_id); + if tracker.only_record(seq_num) { + // TODO: obtain the previous ER from cmd_board and packed into CurpError::Duplicated as an entry. 
+ return Err(CurpError::duplicated()); + } + if let Some(first_incomplete) = first_incomplete { + let before = tracker.first_incomplete(); + if tracker.must_advance_to(first_incomplete) { + for seq_num_ack in before..first_incomplete { + Self::ack(ProposeId(client_id, seq_num_ack), &mut cb_w); + } } } - }); - - // check if commit_index needs to be updated - if self.can_update_commit_index_to(log_w, index, term) && index > log_w.commit_index { - log_w.commit_to(index); - debug!("{} updates commit index to {index}", self.id()); - self.apply(&mut *log_w); + } else { + self.ctx.cb.write().client_expired(client_id); + return Err(CurpError::expired_client_id()); } + Ok(()) + } + + /// Acknowledge the propose id and GC it's cmd board result + fn ack(id: ProposeId, cb: &mut CommandBoard) { + let _ignore_er = cb.er_buffer.swap_remove(&id); + let _ignore_asr = cb.asr_buffer.swap_remove(&id); + let _ignore_conf = cb.conf_buffer.swap_remove(&id); } } diff --git a/crates/curp/src/server/raw_curp/state.rs b/crates/curp/src/server/raw_curp/state.rs index d202c6a7a..f1504888c 100644 --- a/crates/curp/src/server/raw_curp/state.rs +++ b/crates/curp/src/server/raw_curp/state.rs @@ -1,6 +1,7 @@ use std::{ collections::{HashMap, HashSet}, - sync::atomic::{AtomicU64, Ordering}, + pin::Pin, + sync::atomic::{AtomicBool, AtomicU64, Ordering}, }; use dashmap::{ @@ -10,6 +11,8 @@ use dashmap::{ }, DashMap, }; +use event_listener::Event; +use futures::{future, Future}; use madsim::rand::{thread_rng, Rng}; use tracing::{debug, warn}; @@ -92,6 +95,38 @@ pub(super) struct LeaderState { statuses: DashMap, /// Leader Transferee leader_transferee: AtomicU64, + /// Event of the application of the no-op log, used for readIndex + no_op_state: NoOpState, +} + +/// The state of the no-op log entry application +#[derive(Debug, Default)] +struct NoOpState { + /// The event that triggers after application + event: Event, + /// Whether the no-op entry has been applied + applied: AtomicBool, +} + +impl 
NoOpState { + /// Sets the no-op entry as applied + fn set_applied(&self) { + self.applied.store(true, Ordering::Release); + let _ignore = self.event.notify(usize::MAX); + } + + /// Resets the no-op application state + fn reset(&self) { + self.applied.store(false, Ordering::Release); + } + + /// Waits for the no-op log to be applied + fn wait(&self) -> Pin + Send>> { + if self.applied.load(Ordering::Acquire) { + return Box::pin(future::ready(())); + } + Box::pin(self.event.listen()) + } } impl State { @@ -130,6 +165,7 @@ impl LeaderState { .map(|o| (*o, FollowerStatus::default())) .collect(), leader_transferee: AtomicU64::new(0), + no_op_state: NoOpState::default(), } } @@ -231,6 +267,21 @@ impl LeaderState { let val = self.leader_transferee.swap(node_id, Ordering::SeqCst); (val != 0).then_some(val) } + + /// Sets the no-op log as applied + pub(super) fn set_no_op_applied(&self) { + self.no_op_state.set_applied(); + } + + /// Resets the no-op application state + pub(super) fn reset_no_op_state(&self) { + self.no_op_state.reset(); + } + + /// Waits for the no-op log to be applied + pub(super) fn wait_no_op_applied(&self) -> impl Future + Send { + self.no_op_state.wait() + } } impl CandidateState { diff --git a/crates/curp/src/server/raw_curp/tests.rs b/crates/curp/src/server/raw_curp/tests.rs index 5e3896c37..d2eda551a 100644 --- a/crates/curp/src/server/raw_curp/tests.rs +++ b/crates/curp/src/server/raw_curp/tests.rs @@ -1,11 +1,6 @@ -use std::{cmp::Reverse, ops::Add, time::Duration}; - use curp_test_utils::{mock_role_change, test_cmd::TestCommand, TestRoleChange, TEST_CLIENT_ID}; use test_macros::abort_on_panic; -use tokio::{ - sync::oneshot, - time::{sleep, Instant}, -}; +use tokio::time::{sleep, Instant}; use tracing_test::traced_test; use utils::config::{ default_candidate_timeout_ticks, default_follower_timeout_ticks, default_heartbeat_interval, @@ -17,10 +12,10 @@ use crate::{ rpc::{connect::MockInnerConnectApi, Redirect}, server::{ cmd_board::CommandBoard, - 
cmd_worker::{CEEventTxApi, MockCEEventTxApi}, conflict::test_pools::{TestSpecPool, TestUncomPool}, lease_manager::LeaseManager, }, + tracker::Tracker, LogIndex, }; @@ -38,9 +33,8 @@ impl RawCurp { } #[allow(clippy::mem_forget)] // we should prevent the channel from being dropped - pub(crate) fn new_test>( + pub(crate) fn new_test( n: u64, - exe_tx: Tx, role_change: TestRoleChange, task_manager: Arc, ) -> Self { @@ -50,9 +44,6 @@ impl RawCurp { let cluster_info = Arc::new(ClusterInfo::from_members_map(all_members, [], "S0")); let cmd_board = Arc::new(RwLock::new(CommandBoard::new())); let lease_manager = Arc::new(RwLock::new(LeaseManager::new())); - let (log_tx, log_rx) = mpsc::unbounded_channel(); - // prevent the channel from being closed - std::mem::forget(log_rx); let sync_events = cluster_info .peers_ids() .into_iter() @@ -73,12 +64,10 @@ impl RawCurp { .build() .unwrap(); let curp_storage = Arc::new(DB::open(&curp_config.engine_cfg).unwrap()); + let _ignore = curp_storage.recover().unwrap(); - // grant a infinity expiry lease for test client id - lease_manager.write().expiry_queue.push( - TEST_CLIENT_ID, - Reverse(Instant::now().add(Duration::from_nanos(u64::MAX))), - ); + // bypass test client id + lease_manager.write().bypass(TEST_CLIENT_ID); let sp = Arc::new(Mutex::new(SpeculativePool::new(vec![Box::new( TestSpecPool::default(), @@ -86,6 +75,10 @@ impl RawCurp { let ucp = Arc::new(Mutex::new(UncommittedPool::new(vec![Box::new( TestUncomPool::default(), )]))); + let (as_tx, as_rx) = flume::unbounded(); + std::mem::forget(as_rx); + let resp_txs = Arc::new(Mutex::default()); + let id_barrier = Arc::new(IdBarrier::new()); Self::builder() .cluster_info(cluster_info) @@ -93,15 +86,16 @@ impl RawCurp { .cmd_board(cmd_board) .lease_manager(lease_manager) .cfg(Arc::new(curp_config)) - .cmd_tx(Arc::new(exe_tx)) .sync_events(sync_events) - .log_tx(log_tx) .role_change(role_change) .task_manager(task_manager) .connects(connects) .curp_storage(curp_storage) 
.spec_pool(sp) .uncommitted_pool(ucp) + .as_tx(as_tx) + .resp_txs(resp_txs) + .id_barrier(id_barrier) .build_raw_curp() .unwrap() } @@ -111,11 +105,21 @@ impl RawCurp { self.ctx.connects.entry(id).and_modify(|c| *c = connect); } + pub(crate) fn tracker(&self, client_id: u64) -> Tracker { + self.ctx + .cb + .read() + .trackers + .get(&client_id) + .cloned() + .unwrap_or_else(|| unreachable!("cannot find {client_id} in result trackers")) + } + /// Add a new cmd to the log, will return log entry index pub(crate) fn push_cmd(&self, propose_id: ProposeId, cmd: Arc) -> LogIndex { let st_r = self.st.read(); let mut log_w = self.log.write(); - log_w.push(st_r.term, propose_id, cmd).unwrap().index + log_w.push(st_r.term, propose_id, cmd).index } pub(crate) fn check_learner(&self, node_id: ServerId, is_learner: bool) -> bool { @@ -136,112 +140,88 @@ impl RawCurp { } /*************** tests for propose **************/ +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[traced_test] #[test] fn leader_handle_propose_will_succeed() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx.expect_send_sp_exe().returning(|_| {}); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; let cmd = Arc::new(TestCommand::default()); assert!(curp - .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd) + .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd, 0) .unwrap()); } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[traced_test] #[test] fn leader_handle_propose_will_reject_conflicted() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx.expect_send_sp_exe().returning(|_| {}); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; let cmd1 = 
Arc::new(TestCommand::new_put(vec![1], 0)); assert!(curp - .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd1) + .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd1, 0) .unwrap()); let cmd2 = Arc::new(TestCommand::new_put(vec![1, 2], 1)); - let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 1), cmd2); + let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 1), cmd2, 1); assert!(matches!(res, Err(CurpError::KeyConflict(())))); // leader will also reject cmds that conflict un-synced cmds let cmd3 = Arc::new(TestCommand::new_put(vec![2], 1)); - let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 2), cmd3); + let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 2), cmd3, 2); assert!(matches!(res, Err(CurpError::KeyConflict(())))); } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[traced_test] #[test] fn leader_handle_propose_will_reject_duplicated() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx.expect_send_sp_exe().returning(|_| {}); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; let cmd = Arc::new(TestCommand::default()); assert!(curp - .handle_propose(ProposeId(TEST_CLIENT_ID, 0), Arc::clone(&cmd)) + .handle_propose(ProposeId(TEST_CLIENT_ID, 0), Arc::clone(&cmd), 0) .unwrap()); - let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd); + let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd, 0); assert!(matches!(res, Err(CurpError::Duplicated(())))); } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[traced_test] #[test] fn follower_handle_propose_will_succeed() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - 
)) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let cmd = Arc::new(TestCommand::new_get(vec![1])); assert!(!curp - .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd) + .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd, 0) .unwrap()); } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[traced_test] #[test] fn follower_handle_propose_will_reject_conflicted() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let cmd1 = Arc::new(TestCommand::new_get(vec![1])); assert!(!curp - .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd1) + .handle_propose(ProposeId(TEST_CLIENT_ID, 0), cmd1, 0) .unwrap()); let cmd2 = Arc::new(TestCommand::new_get(vec![1])); - let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 1), cmd2); + let res = curp.handle_propose(ProposeId(TEST_CLIENT_ID, 1), cmd2, 1); assert!(matches!(res, Err(CurpError::KeyConflict(())))); } @@ -251,13 +231,7 @@ fn follower_handle_propose_will_reject_conflicted() { #[test] fn heartbeat_will_calibrate_term() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; let s1_id = curp.cluster().get_id_by_name("S1").unwrap(); let result = curp.handle_append_entries_resp(s1_id, None, 2, false, 1); @@ -272,12 +246,7 @@ fn heartbeat_will_calibrate_term() { 
#[test] fn heartbeat_will_calibrate_next_index() { let task_manager = Arc::new(TaskManager::new()); - let curp = RawCurp::new_test( - 3, - MockCEEventTxApi::::default(), - mock_role_change(), - task_manager, - ); + let curp = RawCurp::new_test(3, mock_role_change(), task_manager); let s1_id = curp.cluster().get_id_by_name("S1").unwrap(); let result = curp.handle_append_entries_resp(s1_id, None, 0, false, 1); @@ -292,18 +261,7 @@ fn heartbeat_will_calibrate_next_index() { #[test] fn handle_ae_will_calibrate_term() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let s2_id = curp.cluster().get_id_by_name("S2").unwrap(); @@ -320,18 +278,7 @@ fn handle_ae_will_calibrate_term() { #[test] fn handle_ae_will_set_leader_id() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let s2_id = curp.cluster().get_id_by_name("S2").unwrap(); @@ -348,18 +295,7 @@ fn handle_ae_will_set_leader_id() { #[test] fn handle_ae_will_reject_wrong_term() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { 
Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let s2_id = curp.cluster().get_id_by_name("S2").unwrap(); @@ -372,18 +308,7 @@ fn handle_ae_will_reject_wrong_term() { #[test] fn handle_ae_will_reject_wrong_log() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let s2_id = curp.cluster().get_id_by_name("S2").unwrap(); @@ -410,18 +335,7 @@ fn handle_ae_will_reject_wrong_log() { #[abort_on_panic] async fn follower_will_not_start_election_when_heartbeats_are_received() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let curp_c = Arc::clone(&curp); @@ -447,18 +361,7 @@ async fn follower_will_not_start_election_when_heartbeats_are_received() { #[abort_on_panic] async fn follower_or_pre_candidate_will_start_election_if_timeout() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut 
*curp.st.write(), 1); let start = Instant::now(); @@ -496,18 +399,7 @@ async fn follower_or_pre_candidate_will_start_election_if_timeout() { #[test] fn handle_vote_will_calibrate_term() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.st.write().leader_id = None; let s1_id = curp.cluster().get_id_by_name("S1").unwrap(); @@ -522,18 +414,7 @@ fn handle_vote_will_calibrate_term() { #[test] fn handle_vote_will_reject_smaller_term() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 2); let s1_id = curp.cluster().get_id_by_name("S1").unwrap(); @@ -545,18 +426,7 @@ fn handle_vote_will_reject_smaller_term() { #[test] fn handle_vote_will_reject_outdated_candidate() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; let s2_id = curp.cluster().get_id_by_name("S2").unwrap(); let result = curp.handle_append_entries( 2, @@ -582,18 +452,7 @@ fn handle_vote_will_reject_outdated_candidate() { #[test] fn pre_candidate_will_become_candidate_then_become_leader_after_election_succeeds() { let 
task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); // tick till election starts @@ -624,18 +483,7 @@ fn pre_candidate_will_become_candidate_then_become_leader_after_election_succeed #[test] fn vote_will_calibrate_pre_candidate_term() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); // tick till election starts @@ -658,18 +506,7 @@ fn vote_will_calibrate_pre_candidate_term() { #[test] fn recover_from_spec_pools_will_pick_the_correct_cmds() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - Arc::new(RawCurp::new_test( - 5, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(5, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); // cmd1 has already been committed @@ -732,18 +569,7 @@ fn recover_from_spec_pools_will_pick_the_correct_cmds() { #[test] fn recover_ucp_from_logs_will_pick_the_correct_cmds() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx - .expect_send_reset() - .returning(|_| oneshot::channel().1); - 
Arc::new(RawCurp::new_test( - 5, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(5, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let cmd0 = Arc::new(TestCommand::new_put(vec![1], 1)); @@ -772,14 +598,11 @@ fn recover_ucp_from_logs_will_pick_the_correct_cmds() { #[test] fn leader_retires_after_log_compact_will_succeed() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; let mut log_w = curp.log.write(); for i in 1..=20 { let cmd = Arc::new(TestCommand::default()); - log_w.push(0, ProposeId(TEST_CLIENT_ID, i), cmd).unwrap(); + log_w.push(0, ProposeId(TEST_CLIENT_ID, i), cmd); } log_w.last_as = 20; log_w.last_exe = 20; @@ -790,23 +613,23 @@ fn leader_retires_after_log_compact_will_succeed() { curp.leader_retires(); } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[traced_test] #[test] fn leader_retires_should_cleanup() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx.expect_send_sp_exe().returning(|_| {}); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; let _ignore = curp.handle_propose( ProposeId(TEST_CLIENT_ID, 0), Arc::new(TestCommand::new_put(vec![1], 0)), + 0, ); let _ignore = curp.handle_propose( ProposeId(TEST_CLIENT_ID, 1), Arc::new(TestCommand::new_get(vec![1])), + 0, ); curp.leader_retires(); @@ -824,10 +647,7 @@ fn leader_retires_should_cleanup() { #[tokio::test] async fn leader_handle_shutdown_will_succeed() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - RawCurp::new_test(3, exe_tx, 
mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; assert!(curp.handle_shutdown(ProposeId(TEST_CLIENT_ID, 0)).is_ok()); } @@ -835,11 +655,7 @@ async fn leader_handle_shutdown_will_succeed() { #[test] fn follower_handle_shutdown_will_reject() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx.expect_send_sp_exe().returning(|_| {}); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 1); let res = curp.handle_shutdown(ProposeId(TEST_CLIENT_ID, 0)); assert!(matches!( @@ -855,10 +671,7 @@ fn follower_handle_shutdown_will_reject() { #[test] fn is_synced_should_return_true_when_followers_caught_up_with_leader() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - RawCurp::new_test(3, exe_tx, mock_role_change(), task_manager) - }; + let curp = { RawCurp::new_test(3, mock_role_change(), task_manager) }; let s1_id = curp.cluster().get_id_by_name("S1").unwrap(); let s2_id = curp.cluster().get_id_by_name("S2").unwrap(); @@ -876,19 +689,11 @@ fn is_synced_should_return_true_when_followers_caught_up_with_leader() { #[test] fn add_node_should_add_new_node_to_curp() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; let old_cluster = curp.cluster().clone(); let changes = vec![ConfChange::add(1, vec!["http://127.0.0.1:4567".to_owned()])]; assert!(curp.check_new_config(&changes).is_ok()); - let infos = curp.apply_conf_change(changes.clone()); + let infos = curp.apply_conf_change(changes.clone()).unwrap(); 
assert!(curp.contains(1)); curp.fallback_conf_change(changes, infos.0, infos.1, infos.2); let cluster_after_fallback = curp.cluster(); @@ -911,15 +716,7 @@ fn add_node_should_add_new_node_to_curp() { #[test] fn add_learner_node_and_promote_should_success() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; let changes = vec![ConfChange::add_learner( 1, vec!["http://127.0.0.1:4567".to_owned()], @@ -930,7 +727,7 @@ fn add_learner_node_and_promote_should_success() { let changes = vec![ConfChange::promote(1)]; assert!(curp.check_new_config(&changes).is_ok()); - let infos = curp.apply_conf_change(changes.clone()); + let infos = curp.apply_conf_change(changes.clone()).unwrap(); assert!(curp.check_learner(1, false)); curp.fallback_conf_change(changes, infos.0, infos.1, infos.2); assert!(curp.check_learner(1, true)); @@ -940,15 +737,7 @@ fn add_learner_node_and_promote_should_success() { #[test] fn add_exists_node_should_return_node_already_exists_error() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; let exists_node_id = curp.cluster().get_id_by_name("S1").unwrap(); let changes = vec![ConfChange::add( exists_node_id, @@ -963,20 +752,12 @@ fn add_exists_node_should_return_node_already_exists_error() { #[test] fn remove_node_should_remove_node_from_curp() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 5, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(5, 
mock_role_change(), task_manager)) }; let old_cluster = curp.cluster().clone(); let follower_id = curp.cluster().get_id_by_name("S1").unwrap(); let changes = vec![ConfChange::remove(follower_id)]; assert!(curp.check_new_config(&changes).is_ok()); - let infos = curp.apply_conf_change(changes.clone()); + let infos = curp.apply_conf_change(changes.clone()).unwrap(); assert_eq!(infos, (vec!["S1".to_owned()], "S1".to_owned(), false)); assert!(!curp.contains(follower_id)); curp.fallback_conf_change(changes, infos.0, infos.1, infos.2); @@ -996,15 +777,7 @@ fn remove_node_should_remove_node_from_curp() { #[test] fn remove_non_exists_node_should_return_node_not_exists_error() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 5, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(5, mock_role_change(), task_manager)) }; let changes = vec![ConfChange::remove(1)]; let resp = curp.check_new_config(&changes); assert!(matches!(resp, Err(CurpError::NodeNotExists(())))); @@ -1014,15 +787,7 @@ fn remove_non_exists_node_should_return_node_not_exists_error() { #[test] fn update_node_should_update_the_address_of_node() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; let old_cluster = curp.cluster().clone(); let follower_id = curp.cluster().get_id_by_name("S1").unwrap(); let mut mock_connect = MockInnerConnectApi::new(); @@ -1040,7 +805,7 @@ fn update_node_should_update_the_address_of_node() { vec!["http://127.0.0.1:4567".to_owned()], )]; assert!(curp.check_new_config(&changes).is_ok()); - let infos = curp.apply_conf_change(changes.clone()); + let infos = curp.apply_conf_change(changes.clone()).unwrap(); 
assert_eq!(infos, (vec!["S1".to_owned()], String::new(), false)); assert_eq!( curp.cluster().peer_urls(follower_id), @@ -1063,16 +828,7 @@ fn update_node_should_update_the_address_of_node() { #[test] fn leader_handle_propose_conf_change() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let mut exe_tx = MockCEEventTxApi::::default(); - exe_tx.expect_send_sp_exe().returning(|_| {}); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; let follower_id = curp.cluster().get_id_by_name("S1").unwrap(); assert_eq!( curp.cluster().peer_urls(follower_id), @@ -1090,15 +846,7 @@ fn leader_handle_propose_conf_change() { #[test] fn follower_handle_propose_conf_change() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 2); let follower_id = curp.cluster().get_id_by_name("S1").unwrap(); @@ -1124,15 +872,7 @@ fn follower_handle_propose_conf_change() { #[test] fn leader_handle_move_leader() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.switch_config(ConfChange::add_learner(1234, vec!["address".to_owned()])); let res = curp.handle_move_leader(1234); @@ -1155,15 +895,7 @@ fn leader_handle_move_leader() { #[test] fn follower_handle_move_leader() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 3, - exe_tx, - 
mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(3, mock_role_change(), task_manager)) }; curp.update_to_term_and_become_follower(&mut *curp.st.write(), 2); let target_id = curp.cluster().get_id_by_name("S1").unwrap(); @@ -1175,15 +907,7 @@ fn follower_handle_move_leader() { #[test] fn leader_will_reset_transferee_after_remove_node() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 5, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(5, mock_role_change(), task_manager)) }; let target_id = curp.cluster().get_id_by_name("S1").unwrap(); let res = curp.handle_move_leader(target_id); @@ -1194,19 +918,13 @@ fn leader_will_reset_transferee_after_remove_node() { assert!(curp.get_transferee().is_none()); } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[traced_test] #[test] fn leader_will_reject_propose_when_transferring() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 5, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(5, mock_role_change(), task_manager)) }; let target_id = curp.cluster().get_id_by_name("S1").unwrap(); let res = curp.handle_move_leader(target_id); @@ -1214,7 +932,7 @@ fn leader_will_reject_propose_when_transferring() { let propose_id = ProposeId(0, 0); let cmd = Arc::new(TestCommand::new_put(vec![1], 1)); - let res = curp.handle_propose(propose_id, cmd); + let res = curp.handle_propose(propose_id, cmd, 0); assert!(res.is_err()); } @@ -1222,15 +940,7 @@ fn leader_will_reject_propose_when_transferring() { #[test] fn leader_will_reset_transferee_after_it_become_follower() { let task_manager = Arc::new(TaskManager::new()); - let curp = { - let exe_tx = MockCEEventTxApi::::default(); - Arc::new(RawCurp::new_test( - 
5, - exe_tx, - mock_role_change(), - task_manager, - )) - }; + let curp = { Arc::new(RawCurp::new_test(5, mock_role_change(), task_manager)) }; let target_id = curp.cluster().get_id_by_name("S1").unwrap(); let res = curp.handle_move_leader(target_id); diff --git a/crates/curp/src/server/storage/db.rs b/crates/curp/src/server/storage/db.rs index 00df60e6a..6d8963508 100644 --- a/crates/curp/src/server/storage/db.rs +++ b/crates/curp/src/server/storage/db.rs @@ -1,11 +1,14 @@ -use std::marker::PhantomData; +use std::ops::Deref; -use async_trait::async_trait; use engine::{Engine, EngineType, StorageEngine, StorageOps, WriteOperation}; +use parking_lot::Mutex; use prost::Message; use utils::config::EngineConfig; -use super::{StorageApi, StorageError}; +use super::{ + wal::{codec::DataFrame, config::WALConfig, WALStorage, WALStorageOps}, + RecoverData, StorageApi, StorageError, +}; use crate::{ cmd::Command, log_entry::LogEntry, @@ -22,27 +25,30 @@ const MEMBER_ID: &[u8] = b"MemberId"; /// Column family name for curp storage const CF: &str = "curp"; -/// Column family name for logs -const LOGS_CF: &str = "logs"; /// Column family name for members const MEMBERS_CF: &str = "members"; +/// The sub dir for `RocksDB` files +const ROCKSDB_SUB_DIR: &str = "rocksdb"; + +/// The sub dir for WAL files +const WAL_SUB_DIR: &str = "wal"; + /// `DB` storage implementation #[derive(Debug)] pub struct DB { + /// The WAL storage + wal: Mutex>, /// DB handle db: Engine, - /// Phantom - phantom: PhantomData, } -#[async_trait] impl StorageApi for DB { /// Command type Command = C; #[inline] - async fn flush_voted_for(&self, term: u64, voted_for: ServerId) -> Result<(), StorageError> { + fn flush_voted_for(&self, term: u64, voted_for: ServerId) -> Result<(), StorageError> { let bytes = bincode::serialize(&(term, voted_for))?; let op = WriteOperation::new_put(CF, VOTE_FOR.to_vec(), bytes); self.db.write_multi(vec![op], true)?; @@ -51,12 +57,17 @@ impl StorageApi for DB { } #[inline] - async 
fn put_log_entry(&self, entry: &LogEntry) -> Result<(), StorageError> { - let bytes = bincode::serialize(entry)?; - let op = WriteOperation::new_put(LOGS_CF, entry.index.to_le_bytes().to_vec(), bytes); - self.db.write_multi(vec![op], false)?; - - Ok(()) + fn put_log_entries(&self, entry: &[&LogEntry]) -> Result<(), StorageError> { + self.wal + .lock() + .send_sync( + entry + .iter() + .map(Deref::deref) + .map(DataFrame::Entry) + .collect(), + ) + .map_err(Into::into) } #[inline] @@ -135,47 +146,47 @@ impl StorageApi for DB { } #[inline] - async fn recover( - &self, - ) -> Result<(Option<(u64, ServerId)>, Vec>), StorageError> { + fn recover(&self) -> Result, StorageError> { + let entries = self.wal.lock().recover()?; let voted_for = self .db .get(CF, VOTE_FOR)? .map(|bytes| bincode::deserialize::<(u64, ServerId)>(&bytes)) .transpose()?; - - let mut entries = vec![]; - let mut prev_index = 0; - for (_k, v) in self.db.get_all(LOGS_CF)? { - let entry: LogEntry = bincode::deserialize(&v)?; - #[allow(clippy::arithmetic_side_effects)] // won't overflow - if entry.index != prev_index + 1 { - // break when logs are no longer consistent - break; - } - prev_index = entry.index; - entries.push(entry); - } - Ok((voted_for, entries)) } } impl DB { /// Create a new CURP `DB` + /// + /// WARN: The `recover` method must be called before any call to `put_log_entries`. 
+ /// /// # Errors /// Will return `StorageError` if failed to open the storage #[inline] pub fn open(config: &EngineConfig) -> Result { - let engine_type = match *config { - EngineConfig::Memory => EngineType::Memory, - EngineConfig::RocksDB(ref path) => EngineType::Rocks(path.clone()), + let (engine_type, wal_config) = match *config { + EngineConfig::Memory => (EngineType::Memory, WALConfig::Memory), + EngineConfig::RocksDB(ref path) => { + let mut rocksdb_dir = path.clone(); + rocksdb_dir.push(ROCKSDB_SUB_DIR); + let mut wal_dir = path.clone(); + wal_dir.push(WAL_SUB_DIR); + ( + EngineType::Rocks(rocksdb_dir.clone()), + WALConfig::new(wal_dir), + ) + } _ => unreachable!("Not supported storage type"), }; - let db = Engine::new(engine_type, &[CF, LOGS_CF, MEMBERS_CF])?; + + let db = Engine::new(engine_type, &[CF, MEMBERS_CF])?; + let wal = WALStorage::new(wal_config)?; + Ok(Self { + wal: Mutex::new(wal), db, - phantom: PhantomData, }) } } @@ -198,20 +209,23 @@ mod tests { let storage_cfg = EngineConfig::RocksDB(db_dir.clone()); { let s = DB::::open(&storage_cfg)?; - s.flush_voted_for(1, 222).await?; - s.flush_voted_for(3, 111).await?; + let (voted_for, entries) = s.recover()?; + assert!(voted_for.is_none()); + assert!(entries.is_empty()); + s.flush_voted_for(1, 222)?; + s.flush_voted_for(3, 111)?; let entry0 = LogEntry::new(1, 3, ProposeId(1, 1), Arc::new(TestCommand::default())); let entry1 = LogEntry::new(2, 3, ProposeId(1, 2), Arc::new(TestCommand::default())); let entry2 = LogEntry::new(3, 3, ProposeId(1, 3), Arc::new(TestCommand::default())); - s.put_log_entry(&entry0).await?; - s.put_log_entry(&entry1).await?; - s.put_log_entry(&entry2).await?; + s.put_log_entries(&[&entry0])?; + s.put_log_entries(&[&entry1])?; + s.put_log_entries(&[&entry2])?; sleep_secs(2).await; } { let s = DB::::open(&storage_cfg)?; - let (voted_for, entries) = s.recover().await?; + let (voted_for, entries) = s.recover()?; assert_eq!(voted_for, Some((3, 111))); 
assert_eq!(entries[0].index, 1); assert_eq!(entries[1].index, 2); diff --git a/crates/curp/src/server/storage/mod.rs b/crates/curp/src/server/storage/mod.rs index 029a09415..f07ecc543 100644 --- a/crates/curp/src/server/storage/mod.rs +++ b/crates/curp/src/server/storage/mod.rs @@ -1,4 +1,3 @@ -use async_trait::async_trait; use engine::EngineError; use thiserror::Error; @@ -18,8 +17,11 @@ pub enum StorageError { #[error("codec error, {0}")] Codec(String), /// Rocksdb error - #[error("internal error, {0}")] - Internal(#[from] EngineError), + #[error("rocksdb error, {0}")] + RocksDB(#[from] EngineError), + /// WAL error + #[error("wal error, {0}")] + WAL(#[from] std::io::Error), } impl From for StorageError { @@ -36,8 +38,12 @@ impl From for StorageError { } } +/// Vote info +pub(crate) type VoteInfo = (u64, ServerId); +/// Recovered data +pub(crate) type RecoverData = (Option, Vec>); + /// Curp storage api -#[async_trait] #[allow(clippy::module_name_repetitions)] pub trait StorageApi: Send + Sync { /// Command @@ -47,7 +53,7 @@ pub trait StorageApi: Send + Sync { /// /// # Errors /// Return `StorageError` when it failed to store the `voted_for` info to underlying database. - async fn flush_voted_for(&self, term: u64, voted_for: ServerId) -> Result<(), StorageError>; + fn flush_voted_for(&self, term: u64, voted_for: ServerId) -> Result<(), StorageError>; /// Put `Member` into storage /// @@ -76,16 +82,15 @@ pub trait StorageApi: Send + Sync { /// Put log entries in storage /// /// # Errors - /// Return `StorageError` when it failed to store the given log entry info to underlying database. - async fn put_log_entry(&self, entry: &LogEntry) -> Result<(), StorageError>; + /// Return `StorageError` when it failed to store the log entries to underlying database. 
+ fn put_log_entries(&self, entry: &[&LogEntry]) -> Result<(), StorageError>; /// Recover from persisted storage + /// Return `voted_for` and all log entries /// /// # Errors - /// Return `StorageError` when it failed to recover from underlying database. Otherwise, return recovered `voted_for` and all log entries - async fn recover( - &self, - ) -> Result<(Option<(u64, ServerId)>, Vec>), StorageError>; + /// Return `StorageError` when it failed to recover the log entries and vote info from underlying database. + fn recover(&self) -> Result, StorageError>; } /// CURP `DB` storage implementation diff --git a/crates/curp/src/server/storage/wal/codec.rs b/crates/curp/src/server/storage/wal/codec.rs index fc93801c3..33c7f4226 100644 --- a/crates/curp/src/server/storage/wal/codec.rs +++ b/crates/curp/src/server/storage/wal/codec.rs @@ -295,7 +295,10 @@ impl FrameEncoder for DataFrame<'_, C> where C: Serialize, { - #[allow(clippy::arithmetic_side_effects)] // The integer shift is safe + #[allow( + clippy::arithmetic_side_effects, // The integer shift is safe + clippy::indexing_slicing // The slicing is checked + )] fn encode(&self) -> Vec { match *self { DataFrame::Entry(ref entry) => { diff --git a/crates/curp/src/server/storage/wal/config.rs b/crates/curp/src/server/storage/wal/config.rs index c6e2627b3..70157ce0f 100644 --- a/crates/curp/src/server/storage/wal/config.rs +++ b/crates/curp/src/server/storage/wal/config.rs @@ -5,7 +5,16 @@ const DEFAULT_SEGMENT_SIZE: u64 = 64 * 1024 * 1024; /// The config for WAL #[derive(Debug, Clone)] -pub(crate) struct WALConfig { +pub(crate) enum WALConfig { + /// Persistent implementation + Persistent(PersistentConfig), + /// Mock memory implementation + Memory, +} + +/// The config for persistent WAL +#[derive(Debug, Clone)] +pub(crate) struct PersistentConfig { /// The path of this config pub(super) dir: PathBuf, /// The maximum size of this segment @@ -17,17 +26,28 @@ pub(crate) struct WALConfig { impl WALConfig { /// Creates a 
new `WALConfig` pub(crate) fn new(dir: impl AsRef) -> Self { - Self { + Self::Persistent(PersistentConfig { dir: dir.as_ref().into(), max_segment_size: DEFAULT_SEGMENT_SIZE, - } + }) + } + + /// Creates a new memory `WALConfig` + pub(crate) fn new_memory() -> Self { + Self::Memory } /// Sets the `max_segment_size` pub(crate) fn with_max_segment_size(self, size: u64) -> Self { - Self { - dir: self.dir, - max_segment_size: size, + match self { + Self::Persistent(PersistentConfig { + dir, + max_segment_size, + }) => Self::Persistent(PersistentConfig { + dir, + max_segment_size: size, + }), + Self::Memory => Self::Memory, } } } diff --git a/crates/curp/src/server/storage/wal/mock/mod.rs b/crates/curp/src/server/storage/wal/mock/mod.rs new file mode 100644 index 000000000..a6f230d50 --- /dev/null +++ b/crates/curp/src/server/storage/wal/mock/mod.rs @@ -0,0 +1,61 @@ +use std::{collections::VecDeque, io, marker::PhantomData}; + +use curp_external_api::LogIndex; +use serde::{de::DeserializeOwned, Serialize}; + +use crate::log_entry::LogEntry; + +use super::{codec::DataFrame, config::WALConfig, WALStorageOps}; + +/// The mock WAL storage +#[derive(Debug)] +pub(crate) struct WALStorage { + /// Storage + entries: VecDeque>, +} + +impl WALStorage { + /// Creates a new mock `WALStorage` + pub(super) fn new() -> WALStorage { + Self { + entries: VecDeque::new(), + } + } +} + +impl WALStorageOps for WALStorage +where + C: Clone, +{ + fn recover(&mut self) -> io::Result>> { + Ok(self.entries.clone().into_iter().collect()) + } + + fn send_sync(&mut self, item: Vec>) -> io::Result<()> { + for frame in item { + if let DataFrame::Entry(entry) = frame { + self.entries.push_back(entry.clone()); + } + } + + Ok(()) + } + + fn truncate_head(&mut self, compact_index: LogIndex) -> io::Result<()> { + while self + .entries + .front() + .is_some_and(|e| e.index <= compact_index) + { + let _ignore = self.entries.pop_front(); + } + Ok(()) + } + + fn truncate_tail(&mut self, max_index: LogIndex) -> 
io::Result<()> { + while self.entries.back().is_some_and(|e| e.index > max_index) { + let _ignore = self.entries.pop_back(); + } + Ok(()) + } +} diff --git a/crates/curp/src/server/storage/wal/mod.rs b/crates/curp/src/server/storage/wal/mod.rs index fb86b4410..d204aca9e 100644 --- a/crates/curp/src/server/storage/wal/mod.rs +++ b/crates/curp/src/server/storage/wal/mod.rs @@ -32,269 +32,89 @@ mod util; /// Framed mod framed; -use std::{io, marker::PhantomData, ops::Mul}; +/// Mock WAL storage +mod mock; -use clippy_utilities::OverflowArithmetic; +/// WAL storage +mod storage; + +use std::io; + +use codec::DataFrame; +use config::WALConfig; use curp_external_api::LogIndex; -use futures::{future::join_all, Future, SinkExt, StreamExt}; -use itertools::Itertools; use serde::{de::DeserializeOwned, Serialize}; -use tokio_util::codec::Framed; -use tracing::{debug, error, info, warn}; use crate::log_entry::LogEntry; -use self::{ - codec::{DataFrame, DataFrameOwned, WAL}, - config::WALConfig, - error::{CorruptType, WALError}, - pipeline::FilePipeline, - remover::SegmentRemover, - segment::WALSegment, - util::LockedFile, -}; +/// The wal file extension +const WAL_FILE_EXT: &str = ".wal"; -/// The magic of the WAL file -const WAL_MAGIC: u32 = 0xd86e_0be2; +/// Operations of a WAL storage +pub(crate) trait WALStorageOps { + /// Recover from the given directory if there's any segments + fn recover(&mut self) -> io::Result>>; -/// The current WAL version -const WAL_VERSION: u8 = 0x00; + /// Send frames with fsync + fn send_sync(&mut self, item: Vec>) -> io::Result<()>; -/// The wal file extension -const WAL_FILE_EXT: &str = ".wal"; + /// Tuncate all the logs whose index is less than or equal to `compact_index` + /// + /// `compact_index` should be the smallest index required in CURP + fn truncate_head(&mut self, compact_index: LogIndex) -> io::Result<()>; + + /// Tuncate all the logs whose index is greater than `max_index` + fn truncate_tail(&mut self, max_index: LogIndex) -> 
io::Result<()>; +} -/// The WAL storage +/// The WAL Storage #[derive(Debug)] -pub(super) struct WALStorage { - /// The config of wal files - config: WALConfig, - /// The pipeline that pre-allocates files - pipeline: FilePipeline, - /// WAL segments - segments: Vec, - /// The next segment id - next_segment_id: u64, - /// The next log index - next_log_index: LogIndex, - /// The phantom data - _phantom: PhantomData, +pub(crate) enum WALStorage { + /// Persistent storage + Persistent(storage::WALStorage), + /// Mock memory storage + Memory(mock::WALStorage), } impl WALStorage { - /// Creates a new `LogStorage` - pub(super) fn new(config: WALConfig) -> io::Result> { - if !config.dir.try_exists()? { - std::fs::create_dir_all(&config.dir); - } - let mut pipeline = FilePipeline::new(config.dir.clone(), config.max_segment_size); - Ok(Self { - config, - pipeline, - segments: vec![], - next_segment_id: 0, - next_log_index: 0, - _phantom: PhantomData, + /// Creates a new `WALStorage` + pub(crate) fn new(config: WALConfig) -> io::Result { + Ok(match config { + WALConfig::Persistent(conf) => Self::Persistent(storage::WALStorage::new(conf)?), + WALConfig::Memory => Self::Memory(mock::WALStorage::new()), }) } } -impl WALStorage +impl WALStorageOps for WALStorage where - C: Serialize + DeserializeOwned + Unpin + 'static + std::fmt::Debug, + C: Serialize + DeserializeOwned + std::fmt::Debug + Clone, { - /// Recover from the given directory if there's any segments - pub(super) fn recover(&mut self) -> io::Result>> { - /// Number of lines printed around the missing log in debug information - const NUM_LINES_DEBUG: usize = 3; - // We try to recover the removal first - SegmentRemover::recover(&self.config.dir)?; - - let file_paths = util::get_file_paths_with_ext(&self.config.dir, WAL_FILE_EXT)?; - let lfiles: Vec<_> = file_paths - .into_iter() - .map(LockedFile::open_rw) - .collect::>()?; - - let segment_opening = lfiles - .into_iter() - .map(|f| WALSegment::open(f, 
self.config.max_segment_size)); - - let mut segments = Self::take_until_io_error(segment_opening)?; - segments.sort_unstable(); - debug!("Recovered segments: {:?}", segments); - - let logs_iter = segments.iter_mut().map(WALSegment::recover_segment_logs); - - let logs_batches = Self::take_until_io_error(logs_iter)?; - let mut logs: Vec<_> = logs_batches.into_iter().flatten().collect(); - - let pos = Self::highest_valid_pos(&logs[..]); - if pos != logs.len() { - let debug_logs: Vec<_> = logs - .iter() - .skip(pos.overflow_sub(pos.min(NUM_LINES_DEBUG))) - .take(NUM_LINES_DEBUG.mul(2)) - .collect(); - error!( - "WAL corrupted: {}, truncated at position: {pos}, logs around this position: {debug_logs:?}", - CorruptType::LogNotContinue - ); - logs.truncate(pos); - } - - let next_segment_id = segments.last().map_or(0, |s| s.id().overflow_add(1)); - let next_log_index = logs.last().map_or(1, |l| l.index.overflow_add(1)); - self.next_segment_id = next_segment_id; - self.next_log_index = next_log_index; - self.segments = segments; - - self.open_new_segment()?; - info!("WAL successfully recovered"); - - Ok(logs) - } - - /// Send frames with fsync - #[allow(clippy::pattern_type_mismatch)] // Cannot satisfy both clippy - pub(super) fn send_sync(&mut self, item: Vec>) -> io::Result<()> { - let last_segment = self - .segments - .last_mut() - .unwrap_or_else(|| unreachable!("there should be at least on segment")); - if let Some(DataFrame::Entry(entry)) = item.last() { - self.next_log_index = entry.index.overflow_add(1); - } - last_segment.write_sync(item, WAL::new())?; - - if last_segment.is_full() { - self.open_new_segment()?; + fn recover(&mut self) -> io::Result>> { + match *self { + WALStorage::Persistent(ref mut s) => s.recover(), + WALStorage::Memory(ref mut s) => s.recover(), } - - Ok(()) } - /// Truncate all the logs whose index is less than or equal to `compact_index` - /// - /// `compact_index` should be the smallest index required in CURP - pub(super) fn 
truncate_head(&mut self, compact_index: LogIndex) -> io::Result<()> { - if compact_index >= self.next_log_index { - warn!( - "head truncation: compact index too large, compact index: {}, storage next index: {}", - compact_index, self.next_log_index - ); - return Ok(()); - } - - debug!("performing head truncation on index: {compact_index}"); - - let mut to_remove_num = self - .segments - .iter() - .take_while(|s| s.base_index() <= compact_index) - .count() - .saturating_sub(1); - - if to_remove_num == 0 { - return Ok(()); + fn send_sync(&mut self, item: Vec>) -> io::Result<()> { + match *self { + WALStorage::Persistent(ref mut s) => s.send_sync(item), + WALStorage::Memory(ref mut s) => s.send_sync(item), } - - // The last segment does not need to be removed - let to_remove: Vec<_> = self.segments.drain(0..to_remove_num).collect(); - SegmentRemover::new_removal(&self.config.dir, to_remove.iter())?; - - Ok(()) } - /// Truncate all the logs whose index is greater than `max_index` - pub(super) fn truncate_tail(&mut self, max_index: LogIndex) -> io::Result<()> { - // segments to truncate - let segments: Vec<_> = self - .segments - .iter_mut() - .rev() - .take_while_inclusive::<_>(|s| s.base_index() > max_index) - .collect(); - - for segment in segments { - segment.seal::(max_index)?; + fn truncate_head(&mut self, compact_index: LogIndex) -> io::Result<()> { + match *self { + WALStorage::Persistent(ref mut s) => s.truncate_head(compact_index), + WALStorage::Memory(ref mut s) => s.truncate_head(compact_index), } - - let to_remove = self.update_segments(); - SegmentRemover::new_removal(&self.config.dir, to_remove.iter())?; - - self.next_log_index = max_index.overflow_add(1); - self.open_new_segment()?; - - Ok(()) } - /// Opens a new WAL segment - fn open_new_segment(&mut self) -> io::Result<()> { - let lfile = self - .pipeline - .next() - .ok_or(io::Error::from(io::ErrorKind::BrokenPipe))??; - - let segment = WALSegment::create( - lfile, - self.next_log_index, - 
self.next_segment_id, - self.config.max_segment_size, - )?; - - self.segments.push(segment); - self.next_segment_id = self.next_segment_id.overflow_add(1); - - Ok(()) - } - - /// Removes segments that are no longer needed - #[allow(clippy::pattern_type_mismatch)] // Cannot satisfy both clippy - fn update_segments(&mut self) -> Vec { - let flags: Vec<_> = self.segments.iter().map(WALSegment::is_redundant).collect(); - let (to_remove, remaining): (Vec<_>, Vec<_>) = - self.segments.drain(..).zip(flags).partition(|(_, f)| *f); - - self.segments = remaining.into_iter().map(|(s, _)| s).collect(); - - to_remove.into_iter().map(|(s, _)| s).collect() - } - - /// Returns the highest valid position of the log entries, - /// the logs are continuous before this position - #[allow(clippy::pattern_type_mismatch)] // can't fix - fn highest_valid_pos(entries: &[LogEntry]) -> usize { - let iter = entries.iter(); - iter.clone() - .zip(iter.skip(1)) - .enumerate() - .find(|(_, (x, y))| x.index.overflow_add(1) != y.index) - .map_or(entries.len(), |(i, _)| i) - } - - /// Iterates until an `io::Error` occurs. 
- fn take_until_io_error(opening: I) -> io::Result> - where - I: IntoIterator>, - { - let mut ts = vec![]; - - for result in opening { - match result { - Ok(t) => ts.push(t), - Err(e) => { - let e = e.io_or_corrupt()?; - error!("WAL corrupted: {e}"); - } - } + fn truncate_tail(&mut self, max_index: LogIndex) -> io::Result<()> { + match *self { + WALStorage::Persistent(ref mut s) => s.truncate_tail(max_index), + WALStorage::Memory(ref mut s) => s.truncate_tail(max_index), } - - Ok(ts) - } -} - -impl Drop for WALStorage { - fn drop(&mut self) { - self.pipeline.stop(); } } diff --git a/crates/curp/src/server/storage/wal/segment.rs b/crates/curp/src/server/storage/wal/segment.rs index c50ab6573..d0eb2c0cb 100644 --- a/crates/curp/src/server/storage/wal/segment.rs +++ b/crates/curp/src/server/storage/wal/segment.rs @@ -22,10 +22,16 @@ use super::{ error::{CorruptType, WALError}, framed::{Decoder, Encoder}, util::{get_checksum, parse_u64, validate_data, LockedFile}, - WAL_FILE_EXT, WAL_MAGIC, WAL_VERSION, + WAL_FILE_EXT, }; use crate::log_entry::LogEntry; +/// The magic of the WAL file +const WAL_MAGIC: u32 = 0xd86e_0be2; + +/// The current WAL version +const WAL_VERSION: u8 = 0x00; + /// The size of wal file header in bytes pub(super) const WAL_HEADER_SIZE: usize = 56; @@ -96,7 +102,7 @@ impl WALSegment { &mut self, ) -> Result>, WALError> where - C: Serialize + DeserializeOwned + 'static + std::fmt::Debug, + C: Serialize + DeserializeOwned + std::fmt::Debug, { let frame_batches = self.read_all(WAL::::new())?; let frame_batches_filtered: Vec<_> = frame_batches diff --git a/crates/curp/src/server/storage/wal/storage.rs b/crates/curp/src/server/storage/wal/storage.rs new file mode 100644 index 000000000..44bbfcf5d --- /dev/null +++ b/crates/curp/src/server/storage/wal/storage.rs @@ -0,0 +1,263 @@ +use std::{io, marker::PhantomData, ops::Mul}; + +use clippy_utilities::OverflowArithmetic; +use curp_external_api::LogIndex; +use futures::{future::join_all, Future, SinkExt, 
StreamExt}; +use itertools::Itertools; +use serde::{de::DeserializeOwned, Serialize}; +use tokio_util::codec::Framed; +use tracing::{debug, error, info, warn}; + +use crate::log_entry::LogEntry; + +use super::{ + codec::{DataFrame, DataFrameOwned, WAL}, + config::PersistentConfig, + error::{CorruptType, WALError}, + pipeline::FilePipeline, + remover::SegmentRemover, + segment::WALSegment, + util::{self, LockedFile}, + WALStorageOps, WAL_FILE_EXT, +}; + +/// The WAL storage +#[derive(Debug)] +pub(crate) struct WALStorage { + /// The config of wal files + config: PersistentConfig, + /// The pipeline that pre-allocates files + pipeline: FilePipeline, + /// WAL segments + segments: Vec, + /// The next segment id + next_segment_id: u64, + /// The next log index + next_log_index: LogIndex, + /// The phantom data + _phantom: PhantomData, +} + +impl WALStorage { + /// Creates a new `LogStorage` + pub(super) fn new(config: PersistentConfig) -> io::Result> { + if !config.dir.try_exists()? { + std::fs::create_dir_all(&config.dir); + } + let mut pipeline = FilePipeline::new(config.dir.clone(), config.max_segment_size); + Ok(Self { + config, + pipeline, + segments: vec![], + next_segment_id: 0, + next_log_index: 0, + _phantom: PhantomData, + }) + } +} + +impl WALStorageOps for WALStorage +where + C: Serialize + DeserializeOwned + std::fmt::Debug, +{ + /// Recover from the given directory if there's any segments + fn recover(&mut self) -> io::Result>> { + /// Number of lines printed around the missing log in debug information + const NUM_LINES_DEBUG: usize = 3; + // We try to recover the removal first + SegmentRemover::recover(&self.config.dir)?; + + let file_paths = util::get_file_paths_with_ext(&self.config.dir, WAL_FILE_EXT)?; + let lfiles: Vec<_> = file_paths + .into_iter() + .map(LockedFile::open_rw) + .collect::>()?; + + let segment_opening = lfiles + .into_iter() + .map(|f| WALSegment::open(f, self.config.max_segment_size)); + + let mut segments = 
Self::take_until_io_error(segment_opening)?; + segments.sort_unstable(); + debug!("Recovered segments: {:?}", segments); + + let logs_iter = segments.iter_mut().map(WALSegment::recover_segment_logs); + + let logs_batches = Self::take_until_io_error(logs_iter)?; + let mut logs: Vec<_> = logs_batches.into_iter().flatten().collect(); + + let pos = Self::highest_valid_pos(&logs[..]); + if pos != logs.len() { + let debug_logs: Vec<_> = logs + .iter() + .skip(pos.overflow_sub(pos.min(NUM_LINES_DEBUG))) + .take(NUM_LINES_DEBUG.mul(2)) + .collect(); + error!( + "WAL corrupted: {}, truncated at position: {pos}, logs around this position: {debug_logs:?}", + CorruptType::LogNotContinue + ); + logs.truncate(pos); + } + + let next_segment_id = segments.last().map_or(0, |s| s.id().overflow_add(1)); + let next_log_index = logs.last().map_or(1, |l| l.index.overflow_add(1)); + self.next_segment_id = next_segment_id; + self.next_log_index = next_log_index; + self.segments = segments; + + self.open_new_segment()?; + info!("WAL successfully recovered"); + + Ok(logs) + } + + #[allow(clippy::pattern_type_mismatch)] // Cannot satisfy both clippy + fn send_sync(&mut self, item: Vec>) -> io::Result<()> { + let last_segment = self + .segments + .last_mut() + .unwrap_or_else(|| unreachable!("there should be at least on segment")); + if let Some(DataFrame::Entry(entry)) = item.last() { + self.next_log_index = entry.index.overflow_add(1); + } + last_segment.write_sync(item, WAL::new())?; + + if last_segment.is_full() { + self.open_new_segment()?; + } + + Ok(()) + } + + /// Truncate all the logs whose index is less than or equal to + /// `compact_index` + /// + /// `compact_index` should be the smallest index required in CURP + fn truncate_head(&mut self, compact_index: LogIndex) -> io::Result<()> { + if compact_index >= self.next_log_index { + warn!( + "head truncation: compact index too large, compact index: {}, storage next index: {}", + compact_index, self.next_log_index + ); + return 
Ok(()); + } + + debug!("performing head truncation on index: {compact_index}"); + + let mut to_remove_num = self + .segments + .iter() + .take_while(|s| s.base_index() <= compact_index) + .count() + .saturating_sub(1); + + if to_remove_num == 0 { + return Ok(()); + } + + // The last segment does not need to be removed + let to_remove: Vec<_> = self.segments.drain(0..to_remove_num).collect(); + SegmentRemover::new_removal(&self.config.dir, to_remove.iter())?; + + Ok(()) + } + + /// Truncate all the logs whose index is greater than `max_index` + fn truncate_tail(&mut self, max_index: LogIndex) -> io::Result<()> { + // segments to truncate + let segments: Vec<_> = self + .segments + .iter_mut() + .rev() + .take_while_inclusive::<_>(|s| s.base_index() > max_index) + .collect(); + + for segment in segments { + segment.seal::(max_index)?; + } + + let to_remove = self.update_segments(); + SegmentRemover::new_removal(&self.config.dir, to_remove.iter())?; + + self.next_log_index = max_index.overflow_add(1); + self.open_new_segment()?; + + Ok(()) + } +} + +impl WALStorage +where + C: Serialize + DeserializeOwned + std::fmt::Debug, +{ + /// Opens a new WAL segment + fn open_new_segment(&mut self) -> io::Result<()> { + let lfile = self + .pipeline + .next() + .ok_or(io::Error::from(io::ErrorKind::BrokenPipe))??; + + let segment = WALSegment::create( + lfile, + self.next_log_index, + self.next_segment_id, + self.config.max_segment_size, + )?; + + self.segments.push(segment); + self.next_segment_id = self.next_segment_id.overflow_add(1); + + Ok(()) + } + + /// Removes segments that are no longer needed + #[allow(clippy::pattern_type_mismatch)] // Cannot satisfy both clippy + fn update_segments(&mut self) -> Vec { + let flags: Vec<_> = self.segments.iter().map(WALSegment::is_redundant).collect(); + let (to_remove, remaining): (Vec<_>, Vec<_>) = + self.segments.drain(..).zip(flags).partition(|(_, f)| *f); + + self.segments = remaining.into_iter().map(|(s, _)| s).collect(); + + 
to_remove.into_iter().map(|(s, _)| s).collect() + } + + /// Returns the highest valid position of the log entries, + /// the logs are continuous before this position + #[allow(clippy::pattern_type_mismatch)] // can't fix + fn highest_valid_pos(entries: &[LogEntry]) -> usize { + let iter = entries.iter(); + iter.clone() + .zip(iter.skip(1)) + .enumerate() + .find(|(_, (x, y))| x.index.overflow_add(1) != y.index) + .map_or(entries.len(), |(i, _)| i) + } + + /// Iterates until an `io::Error` occurs. + fn take_until_io_error(opening: I) -> io::Result> + where + I: IntoIterator>, + { + let mut ts = vec![]; + + for result in opening { + match result { + Ok(t) => ts.push(t), + Err(e) => { + let e = e.io_or_corrupt()?; + error!("WAL corrupted: {e}"); + } + } + } + + Ok(ts) + } +} + +impl Drop for WALStorage { + fn drop(&mut self) { + self.pipeline.stop(); + } +} diff --git a/crates/curp/src/tracker.rs b/crates/curp/src/tracker.rs index d76edb4dc..240a7c672 100644 --- a/crates/curp/src/tracker.rs +++ b/crates/curp/src/tracker.rs @@ -266,6 +266,17 @@ impl Tracker { pub(crate) fn first_incomplete(&self) -> u64 { self.first_incomplete } + + /// Gets all uncompleted seq number + pub(crate) fn all_incompleted(&self) -> Vec { + let mut result = Vec::new(); + for i in 0..self.inflight.len() { + if self.inflight.get(i).unwrap_or(false) { + result.push(self.first_incomplete.wrapping_add(i.numeric_cast())); + } + } + result + } } #[cfg(test)] diff --git a/crates/curp/tests/it/common/curp_group.rs b/crates/curp/tests/it/common/curp_group.rs index 4b2400d79..fbdab5951 100644 --- a/crates/curp/tests/it/common/curp_group.rs +++ b/crates/curp/tests/it/common/curp_group.rs @@ -55,11 +55,7 @@ pub use commandpb::{ /// `BOTTOM_TASKS` are tasks which not dependent on other tasks in the task group. /// `CurpGroup` uses `BOTTOM_TASKS` to detect whether the curp group is closed or not. 
-const BOTTOM_TASKS: [TaskName; 3] = [ - TaskName::WatchTask, - TaskName::ConfChange, - TaskName::LogPersist, -]; +const BOTTOM_TASKS: [TaskName; 2] = [TaskName::WatchTask, TaskName::ConfChange]; /// The default shutdown timeout used in `wait_for_targets_shutdown` pub(crate) const DEFAULT_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(7); @@ -217,7 +213,7 @@ impl CurpGroup { } async fn run( - server: Arc>, + server: Arc>, listener: TcpListener, shutdown_listener: Listener, ) -> Result<(), tonic::transport::Error> { @@ -322,6 +318,10 @@ impl CurpGroup { &self.nodes[id] } + pub fn get_node_mut(&mut self, id: &ServerId) -> &mut CurpNode { + self.nodes.get_mut(id).unwrap() + } + pub async fn new_client(&self) -> impl ClientApi { let addrs = self.all_addrs().cloned().collect(); ClientBuilder::new(ClientConfig::default(), true) diff --git a/crates/curp/tests/it/main.rs b/crates/curp/tests/it/main.rs index b8174b639..9ce91b3b7 100644 --- a/crates/curp/tests/it/main.rs +++ b/crates/curp/tests/it/main.rs @@ -1,5 +1,3 @@ mod common; -mod read_state; - mod server; diff --git a/crates/curp/tests/it/read_state.rs b/crates/curp/tests/it/read_state.rs deleted file mode 100644 index f47dd303a..000000000 --- a/crates/curp/tests/it/read_state.rs +++ /dev/null @@ -1,59 +0,0 @@ -use std::time::Duration; - -use curp::{client::ClientApi, rpc::ReadState}; -use curp_test_utils::{ - init_logger, sleep_millis, - test_cmd::{TestCommand, TestCommandResult}, -}; -use test_macros::abort_on_panic; - -use crate::common::curp_group::CurpGroup; - -#[tokio::test(flavor = "multi_thread")] -#[abort_on_panic] -async fn read_state() { - init_logger(); - let group = CurpGroup::new(3).await; - let put_client = group.new_client().await; - let put_cmd = TestCommand::new_put(vec![0], 0).set_exe_dur(Duration::from_millis(100)); - tokio::spawn(async move { - assert_eq!( - put_client - .propose(&put_cmd, None, true) - .await - .unwrap() - .unwrap() - .0, - TestCommandResult::default(), - ); - }); - 
sleep_millis(10).await; - let get_client = group.new_client().await; - let res = get_client - .fetch_read_state(&TestCommand::new_get(vec![0])) - .await - .unwrap(); - if let ReadState::Ids(v) = res { - assert_eq!(v.inflight_ids.len(), 1); - } else { - unreachable!( - "expected result should be ReadState::Ids(v) where len(v) = 1, but received {:?}", - res - ); - } - - sleep_millis(500).await; - - let res = get_client - .fetch_read_state(&TestCommand::new_get(vec![0])) - .await - .unwrap(); - if let ReadState::CommitIndex(index) = res { - assert_eq!(index, 1); - } else { - unreachable!( - "expected result should be ReadState::CommitIndex({:?}), but received {:?}", - 1, res - ); - } -} diff --git a/crates/curp/tests/it/server.rs b/crates/curp/tests/it/server.rs index 3726772f0..9eeb5878a 100644 --- a/crates/curp/tests/it/server.rs +++ b/crates/curp/tests/it/server.rs @@ -12,15 +12,14 @@ use curp_test_utils::{ init_logger, sleep_millis, sleep_secs, test_cmd::{TestCommand, TestCommandResult, TestCommandType}, }; +use futures::stream::FuturesUnordered; use madsim::rand::{thread_rng, Rng}; use test_macros::abort_on_panic; use tokio::net::TcpListener; +use tokio_stream::StreamExt; use utils::{config::ClientConfig, timestamp}; -use crate::common::curp_group::{ - commandpb::ProposeId, CurpGroup, FetchClusterRequest, ProposeRequest, ProposeResponse, - DEFAULT_SHUTDOWN_TIMEOUT, -}; +use crate::common::curp_group::{CurpGroup, FetchClusterRequest, DEFAULT_SHUTDOWN_TIMEOUT}; #[tokio::test(flavor = "multi_thread")] #[abort_on_panic] @@ -58,17 +57,22 @@ async fn synced_propose() { let mut group = CurpGroup::new(5).await; let client = group.new_client().await; - let cmd = TestCommand::new_get(vec![0]); + let cmd = TestCommand::new_put(vec![0], 0); let (er, index) = client.propose(&cmd, None, false).await.unwrap().unwrap(); assert_eq!(er, TestCommandResult::new(vec![], vec![])); assert_eq!(index.unwrap(), 1.into()); // log[0] is a fake one - for exe_rx in group.exe_rxs() { - let 
(cmd1, er) = exe_rx.recv().await.unwrap(); + { + let mut exe_futs = group + .exe_rxs() + .map(|rx| rx.recv()) + .collect::>(); + let (cmd1, er) = exe_futs.next().await.unwrap().unwrap(); assert_eq!(cmd1, cmd); assert_eq!(er, TestCommandResult::new(vec![], vec![])); } + for as_rx in group.as_rxs() { let (cmd1, index) = as_rx.recv().await.unwrap(); assert_eq!(cmd1, cmd); @@ -76,23 +80,25 @@ async fn synced_propose() { } } -// Each command should be executed once and only once on each node +// Each command should be executed once and only once on leader #[tokio::test(flavor = "multi_thread")] #[abort_on_panic] -async fn exe_exact_n_times() { +async fn exe_exactly_once_on_leader() { init_logger(); let mut group = CurpGroup::new(3).await; let client = group.new_client().await; - let cmd = TestCommand::new_get(vec![0]); + let cmd = TestCommand::new_put(vec![0], 0); let er = client.propose(&cmd, None, true).await.unwrap().unwrap().0; assert_eq!(er, TestCommandResult::new(vec![], vec![])); - for exe_rx in group.exe_rxs() { - let (cmd1, er) = exe_rx.recv().await.unwrap(); + let leader = group.get_leader().await.0; + { + let exec_rx = &mut group.get_node_mut(&leader).exe_rx; + let (cmd1, er) = exec_rx.recv().await.unwrap(); assert!( - tokio::time::timeout(Duration::from_millis(100), exe_rx.recv()) + tokio::time::timeout(Duration::from_millis(100), exec_rx.recv()) .await .is_err() ); @@ -112,6 +118,8 @@ async fn exe_exact_n_times() { } } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] // To verify PR #86 is fixed #[tokio::test(flavor = "multi_thread")] #[abort_on_panic] @@ -128,11 +136,13 @@ async fn fast_round_is_slower_than_slow_round() { leader_connect .propose(tonic::Request::new(ProposeRequest { propose_id: Some(ProposeId { - client_id: 0, + client_id: TEST_CLIENT_ID, seq_num: 0, }), command: bincode::serialize(&cmd).unwrap(), cluster_version: 0, + term: 0, + first_incomplete: 0, })) .await .unwrap(); @@ -149,11 +159,13 @@ async fn 
fast_round_is_slower_than_slow_round() { let resp: ProposeResponse = follower_connect .propose(tonic::Request::new(ProposeRequest { propose_id: Some(ProposeId { - client_id: 0, + client_id: TEST_CLIENT_ID, seq_num: 0, }), command: bincode::serialize(&cmd).unwrap(), cluster_version: 0, + term: 0, + first_incomplete: 0, })) .await .unwrap() @@ -161,6 +173,8 @@ async fn fast_round_is_slower_than_slow_round() { assert!(resp.result.is_none()); } +// TODO: rewrite this test for propose_stream +#[cfg(ignore)] #[tokio::test(flavor = "multi_thread")] #[abort_on_panic] async fn concurrent_cmd_order() { @@ -178,11 +192,13 @@ async fn concurrent_cmd_order() { tokio::spawn(async move { c.propose(ProposeRequest { propose_id: Some(ProposeId { - client_id: 0, + client_id: TEST_CLIENT_ID, seq_num: 0, }), command: bincode::serialize(&cmd0).unwrap(), cluster_version: 0, + term: 0, + first_incomplete: 0, }) .await .expect("propose failed"); @@ -192,22 +208,26 @@ async fn concurrent_cmd_order() { let response = leader_connect .propose(ProposeRequest { propose_id: Some(ProposeId { - client_id: 0, + client_id: TEST_CLIENT_ID, seq_num: 1, }), command: bincode::serialize(&cmd1).unwrap(), cluster_version: 0, + term: 0, + first_incomplete: 0, }) .await; assert!(response.is_err()); let response = leader_connect .propose(ProposeRequest { propose_id: Some(ProposeId { - client_id: 0, + client_id: TEST_CLIENT_ID, seq_num: 2, }), command: bincode::serialize(&cmd2).unwrap(), cluster_version: 0, + term: 0, + first_incomplete: 0, }) .await; assert!(response.is_err()); @@ -240,7 +260,7 @@ async fn concurrent_cmd_order_should_have_correct_revision() { let sample_range = 1..=100; for i in sample_range.clone() { - let rand_dur = Duration::from_millis(thread_rng().gen_range(0..500).numeric_cast()); + let rand_dur = Duration::from_millis(thread_rng().gen_range(0..50).numeric_cast()); let _er = client .propose( &TestCommand::new_put(vec![i], i).set_as_dur(rand_dur), @@ -498,9 +518,9 @@ async fn 
check_new_node(is_learner: bool) { .iter() .any(|m| m.id == node_id && m.name == "new_node" && is_learner == m.is_learner)); - // 4. check if the new node executes the command from old cluster + // 4. check if the new node syncs the command from old cluster let new_node = group.nodes.get_mut(&node_id).unwrap(); - let (cmd, res) = new_node.exe_rx.recv().await.unwrap(); + let (cmd, _) = new_node.as_rx.recv().await.unwrap(); assert_eq!( cmd, TestCommand { @@ -509,7 +529,6 @@ async fn check_new_node(is_learner: bool) { ..Default::default() } ); - assert!(res.values.is_empty()); // 5. check if the old client can propose to the new cluster client diff --git a/crates/simulation/src/curp_group.rs b/crates/simulation/src/curp_group.rs index ce970e31f..e9d3aebe0 100644 --- a/crates/simulation/src/curp_group.rs +++ b/crates/simulation/src/curp_group.rs @@ -1,16 +1,23 @@ -use std::{collections::HashMap, error::Error, path::PathBuf, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + error::Error, + path::PathBuf, + sync::{atomic::AtomicU64, Arc}, + time::Duration, +}; use async_trait::async_trait; pub use curp::rpc::{ - protocol_client::ProtocolClient, PbProposeId, ProposeRequest, ProposeResponse, + protocol_client::ProtocolClient, PbProposeId, ProposeRequest, ProposeResponse, RecordRequest, + RecordResponse, }; use curp::{ client::{ClientApi, ClientBuilder}, cmd::Command, members::{ClusterInfo, ServerId}, rpc::{ - ConfChange, FetchClusterRequest, FetchClusterResponse, Member, ProposeConfChangeRequest, - ProposeConfChangeResponse, ReadState, + ConfChange, FetchClusterRequest, FetchClusterResponse, Member, OpResponse, + ProposeConfChangeRequest, ProposeConfChangeResponse, ReadState, }, server::{ conflict::test_pools::{TestSpecPool, TestUncomPool}, @@ -182,14 +189,20 @@ impl CurpGroup { .iter() .map(|(id, node)| (*id, vec![node.addr.clone()])) .collect(); - SimClient { - inner: Arc::new( + let (client, client_id) = self + .client_node + .spawn(async move { 
ClientBuilder::new(config, true) .all_members(all_members) - .build() + .build_with_client_id() .await - .unwrap(), - ), + }) + .await + .unwrap() + .unwrap(); + SimClient { + inner: Arc::new(client), + client_id, handle: self.client_node.clone(), } } @@ -400,15 +413,30 @@ pub struct SimProtocolClient { impl SimProtocolClient { #[inline] - pub async fn propose( + pub async fn propose_stream( &mut self, cmd: impl tonic::IntoRequest + 'static + Send, - ) -> Result, tonic::Status> { + ) -> Result>, tonic::Status> { let addr = self.addr.clone(); self.handle .spawn(async move { let mut client = ProtocolClient::connect(addr).await.unwrap(); - client.propose(cmd).await + client.propose_stream(cmd).await + }) + .await + .unwrap() + } + + #[inline] + pub async fn record( + &mut self, + cmd: impl tonic::IntoRequest + 'static + Send, + ) -> Result, tonic::Status> { + let addr = self.addr.clone(); + self.handle + .spawn(async move { + let mut client = ProtocolClient::connect(addr).await.unwrap(); + client.record(cmd).await }) .await .unwrap() @@ -450,6 +478,7 @@ impl SimProtocolClient { pub struct SimClient { inner: Arc>, + client_id: Arc, handle: NodeHandle, } @@ -497,6 +526,11 @@ impl SimClient { .await .unwrap() } + + #[inline] + pub fn client_id(&self) -> u64 { + self.client_id.load(std::sync::atomic::Ordering::Relaxed) + } } impl Drop for CurpGroup { diff --git a/crates/simulation/src/xline_group.rs b/crates/simulation/src/xline_group.rs index e0229f6f5..d3a0c41ae 100644 --- a/crates/simulation/src/xline_group.rs +++ b/crates/simulation/src/xline_group.rs @@ -54,7 +54,7 @@ impl XlineGroup { vec!["0.0.0.0:2379".to_owned()], vec![format!("192.168.1.{}:2379", i + 1)], all.clone(), - false, + i == 0, CurpConfig::default(), ClientConfig::default(), ServerTimeout::default(), diff --git a/crates/simulation/tests/it/curp/server_recovery.rs b/crates/simulation/tests/it/curp/server_recovery.rs index 46a3c26cf..7e8a88ccf 100644 --- 
a/crates/simulation/tests/it/curp/server_recovery.rs +++ b/crates/simulation/tests/it/curp/server_recovery.rs @@ -2,7 +2,7 @@ use std::{sync::Arc, time::Duration, vec}; -use curp::rpc::{ConfChange, ProposeConfChangeRequest}; +use curp::rpc::{ConfChange, ProposeConfChangeRequest, RecordRequest}; use curp_test_utils::{init_logger, sleep_secs, test_cmd::TestCommand, TEST_TABLE}; use engine::{StorageEngine, StorageOps}; use itertools::Itertools; @@ -51,17 +51,18 @@ async fn leader_crash_and_recovery() { let old_leader = group.nodes.get_mut(&leader).unwrap(); // new leader will push an empty log to commit previous logs, the empty log does - // not call ce.execute and ce.after_sync, therefore, the index of the first item - // received by as_rx is 2 - let (_cmd, er) = old_leader.exe_rx.recv().await.unwrap(); - assert_eq!(er.values, Vec::::new()); + // not call ce.after_sync, therefore, the index of the first item received by + // as_rx is 2 let asr = old_leader.as_rx.recv().await.unwrap(); assert_eq!(asr.1, 3); // log index 1 and 2 is the empty log - let (_cmd, er) = old_leader.exe_rx.recv().await.unwrap(); + let new_leader = group.nodes.get_mut(&leader2).unwrap(); + let (_cmd, er) = new_leader.exe_rx.recv().await.unwrap(); + assert_eq!(er.values, Vec::::new()); + let (_cmd, er) = new_leader.exe_rx.recv().await.unwrap(); assert_eq!(er.values, vec![0]); - let asr = old_leader.as_rx.recv().await.unwrap(); - assert_eq!(asr.1, 4); // log index 1 and 2 is the empty log + let asr = new_leader.as_rx.recv().await.unwrap(); + assert_eq!(asr.1, 3); // log index 1 and 2 is the empty log } #[madsim::test] @@ -100,15 +101,8 @@ async fn follower_crash_and_recovery() { group.restart(follower).await; let follower = group.nodes.get_mut(&follower).unwrap(); - let (_cmd, er) = follower.exe_rx.recv().await.unwrap(); - assert_eq!(er.values, Vec::::new(),); let asr = follower.as_rx.recv().await.unwrap(); - assert_eq!(asr.1, 2); // log index 1 is the empty log - - let (_cmd, er) = 
follower.exe_rx.recv().await.unwrap(); - assert_eq!(er.values, vec![0]); - let asr = follower.as_rx.recv().await.unwrap(); - assert_eq!(asr.1, 3); + assert_eq!(asr.1, 2); } #[madsim::test] @@ -122,9 +116,15 @@ async fn leader_and_follower_both_crash_and_recovery() { let follower = *group.nodes.keys().find(|&id| id != &leader).unwrap(); group.crash(follower).await; + let _wait_up = client + .propose(TestCommand::new_get(vec![0]), true) + .await + .unwrap() + .unwrap(); + assert_eq!( client - .propose(TestCommand::new_put(vec![0], 0), true) + .propose(TestCommand::new_put(vec![0], 0), false) .await .unwrap() .unwrap() @@ -132,16 +132,6 @@ async fn leader_and_follower_both_crash_and_recovery() { .values, Vec::::new(), ); - assert_eq!( - client - .propose(TestCommand::new_get(vec![0]), true) - .await - .unwrap() - .unwrap() - .0 - .values, - vec![0] - ); group.crash(leader).await; @@ -150,29 +140,15 @@ async fn leader_and_follower_both_crash_and_recovery() { let old_leader = group.nodes.get_mut(&leader).unwrap(); - let (_cmd, er) = old_leader.exe_rx.recv().await.unwrap(); - assert_eq!(er.values, Vec::::new(),); let asr = old_leader.as_rx.recv().await.unwrap(); assert_eq!(asr.1, 2); // log index 1 is the empty log - let (_cmd, er) = old_leader.exe_rx.recv().await.unwrap(); - assert_eq!(er.values, vec![0]); - let asr = old_leader.as_rx.recv().await.unwrap(); - assert_eq!(asr.1, 3); - // restart follower group.restart(follower).await; let follower = group.nodes.get_mut(&follower).unwrap(); - let (_cmd, er) = follower.exe_rx.recv().await.unwrap(); - assert_eq!(er.values, Vec::::new(),); let asr = follower.as_rx.recv().await.unwrap(); assert_eq!(asr.1, 2); // log index 1 is the empty log - - let (_cmd, er) = follower.exe_rx.recv().await.unwrap(); - assert_eq!(er.values, vec![0]); - let asr = follower.as_rx.recv().await.unwrap(); - assert_eq!(asr.1, 3); } #[madsim::test] @@ -186,13 +162,13 @@ async fn new_leader_will_recover_spec_cmds_cond1() { // 1: send cmd1 to all others 
except the leader let cmd1 = Arc::new(TestCommand::new_put(vec![0], 0)); - let req1 = ProposeRequest { - propose_id: Some(PbProposeId { - client_id: 0, - seq_num: 0, - }), + let propose_id = PbProposeId { + client_id: client.client_id(), + seq_num: 0, + }; + let req1_rec = RecordRequest { + propose_id: Some(propose_id), command: bincode::serialize(&cmd1).unwrap(), - cluster_version: 0, }; for id in group .all_members @@ -201,7 +177,7 @@ async fn new_leader_will_recover_spec_cmds_cond1() { .take(4) { let mut connect = group.get_connect(id).await; - connect.propose(req1.clone()).await.unwrap(); + connect.record(req1_rec.clone()).await.unwrap(); } // 2: disable leader1 and wait election @@ -223,14 +199,14 @@ async fn new_leader_will_recover_spec_cmds_cond1() { // old leader should recover from the new leader group.enable_node(leader1); - // every cmd should be executed and after synced on every node - for rx in group.exe_rxs() { - rx.recv().await; - rx.recv().await; - } + // every cmd should be executed on leader + let leader2 = group.get_leader().await.0; + let new_leader = group.nodes.get_mut(&leader2).unwrap(); + new_leader.exe_rx.recv().await; + + // every cmd should be after synced on every node for rx in group.as_rxs() { rx.recv().await; - rx.recv().await; } } @@ -299,14 +275,17 @@ async fn old_leader_will_keep_original_states() { let cmd1 = Arc::new(TestCommand::new_put(vec![0], 1)); let req1 = ProposeRequest { propose_id: Some(PbProposeId { - client_id: 0, - seq_num: 0, + client_id: client.client_id(), + seq_num: 1, }), command: bincode::serialize(&cmd1).unwrap(), cluster_version: 0, + term: 1, + slow_path: false, + first_incomplete: 0, }; let mut leader1_connect = group.get_connect(&leader1).await; - leader1_connect.propose(req1).await.unwrap(); + leader1_connect.propose_stream(req1).await.unwrap(); // 3: recover all others and disable leader, a new leader will be elected group.disable_node(leader1); @@ -489,11 +468,12 @@ async fn 
overwritten_config_should_fallback() { let node_id = 123; let address = vec!["127.0.0.1:4567".to_owned()]; let changes = vec![ConfChange::add(node_id, address)]; + let client = group.new_client().await; let res = leader_conn .propose_conf_change( ProposeConfChangeRequest { propose_id: Some(PbProposeId { - client_id: 0, + client_id: client.client_id(), seq_num: 0, }), changes, diff --git a/crates/utils/src/barrier.rs b/crates/utils/src/barrier.rs index dd306d05a..5798af042 100644 --- a/crates/utils/src/barrier.rs +++ b/crates/utils/src/barrier.rs @@ -36,7 +36,7 @@ where /// Wait for a collection of ids. #[inline] - pub fn wait_all(&self, ids: Vec) -> impl Future { + pub fn wait_all(&self, ids: Vec) -> impl Future + Send { let mut barriers_l = self.barriers.lock(); let listeners: FuturesOrdered<_> = ids .into_iter() diff --git a/crates/utils/src/config.rs b/crates/utils/src/config.rs index af947fe08..0f59dc853 100644 --- a/crates/utils/src/config.rs +++ b/crates/utils/src/config.rs @@ -372,6 +372,8 @@ pub const fn default_server_wait_synced_timeout() -> Duration { } /// default initial retry timeout +/// FIXME: etcd client has it's own retry mechanism, which may lead to nested retry timeouts. +/// Consider bypassing for proxied etcd client requests. #[must_use] #[inline] pub const fn default_initial_retry_timeout() -> Duration { diff --git a/crates/utils/src/task_manager/mod.rs b/crates/utils/src/task_manager/mod.rs index 894b70170..8f177b8ee 100644 --- a/crates/utils/src/task_manager/mod.rs +++ b/crates/utils/src/task_manager/mod.rs @@ -10,7 +10,7 @@ use std::{ use clippy_utilities::OverflowArithmetic; use dashmap::DashMap; use tokio::{sync::Notify, task::JoinHandle}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use self::tasks::{TaskName, ALL_EDGES}; @@ -33,8 +33,6 @@ pub struct TaskManager { pub struct ClusterShutdownTracker { /// Cluster shutdown notify notify: Notify, - /// State of mpsc channel. 
- mpmc_channel_shutdown: AtomicBool, /// Count of sync follower tasks. sync_follower_task_count: AtomicU8, /// Shutdown Applied @@ -48,20 +46,11 @@ impl ClusterShutdownTracker { pub fn new() -> Self { Self { notify: Notify::new(), - mpmc_channel_shutdown: AtomicBool::new(false), sync_follower_task_count: AtomicU8::new(0), leader_notified: AtomicBool::new(false), } } - /// Mark mpmc channel shutdown - #[inline] - pub fn mark_mpmc_channel_shutdown(&self) { - self.mpmc_channel_shutdown.store(true, Ordering::Relaxed); - self.notify.notify_one(); - debug!("mark mpmc channel shutdown"); - } - /// Sync follower task count inc #[inline] pub fn sync_follower_task_count_inc(&self) { @@ -93,10 +82,9 @@ impl ClusterShutdownTracker { /// Check if the cluster shutdown condition is met fn check(&self) -> bool { - let mpmc_channel_shutdown = self.mpmc_channel_shutdown.load(Ordering::Relaxed); let sync_follower_task_count = self.sync_follower_task_count.load(Ordering::Relaxed); let leader_notified = self.leader_notified.load(Ordering::Relaxed); - mpmc_channel_shutdown && sync_follower_task_count == 0 && leader_notified + sync_follower_task_count == 0 && leader_notified } } @@ -189,9 +177,17 @@ impl TaskManager { }; task.notifier.notify_waiters(); for handle in task.handle.drain(..) { - handle - .await - .unwrap_or_else(|e| unreachable!("background task should not panic: {e}")); + // Directly abort the task if it's cancel safe + if task.name.cancel_safe() { + handle.abort(); + if let Err(e) = handle.await { + assert!(e.is_cancelled(), "background task should not panic: {e}"); + } + } else { + handle + .await + .unwrap_or_else(|e| unreachable!("background task should not panic: {e}")); + } } for child in task.depend_by.drain(..) 
{ let Some(mut child_task) = tasks.get_mut(&child) else { @@ -227,9 +223,9 @@ impl TaskManager { let _ig = tokio::spawn(async move { info!("cluster shutdown start"); state.store(2, Ordering::Release); - for name in [TaskName::SyncFollower, TaskName::ConflictCheckedMpmc] { - _ = tasks.get(&name).map(|n| n.notifier.notify_waiters()); - } + _ = tasks + .get(&TaskName::SyncFollower) + .map(|n| n.notifier.notify_waiters()); loop { if tracker.check() { break; @@ -254,6 +250,7 @@ impl TaskManager { for t in self.tasks.iter() { for h in &t.handle { if !h.is_finished() { + warn!("task: {:?} not finished", t.name); return false; } } @@ -374,6 +371,14 @@ impl Listener { self.state() } + /// Checks whether self has shutdown. + #[inline] + #[must_use] + pub fn is_shutdown(&self) -> bool { + let state = self.state(); + matches!(state, State::Shutdown) + } + /// Get a sync follower guard #[must_use] #[inline] @@ -383,12 +388,6 @@ impl Listener { tracker: Arc::clone(&self.cluster_shutdown_tracker), } } - - /// Mark mpmc channel shutdown - #[inline] - pub fn mark_mpmc_channel_shutdown(&self) { - self.cluster_shutdown_tracker.mark_mpmc_channel_shutdown(); - } } /// Sync follower guard, used to track sync follower task count @@ -421,8 +420,13 @@ mod test { for name in TaskName::iter() { let record_tx = record_tx.clone(); tm.spawn(name, move |listener| async move { - listener.wait().await; - record_tx.send(name).unwrap(); + if name.cancel_safe() { + record_tx.send(name).unwrap(); + listener.wait().await; + } else { + listener.wait().await; + record_tx.send(name).unwrap(); + } }); } drop(record_tx); diff --git a/crates/utils/src/task_manager/tasks.rs b/crates/utils/src/task_manager/tasks.rs index b4e29f2ec..e32606b00 100644 --- a/crates/utils/src/task_manager/tasks.rs +++ b/crates/utils/src/task_manager/tasks.rs @@ -1,12 +1,13 @@ -// CONFLICT_CHECKED_MPMC -// | -// CMD_WORKER LEASE_KEEP_ALIVE -// / \ | -// COMPACT_BG KV_UPDATES TONIC_SERVER ELECTION -// \ / | \ / -// WATCH_TASK 
CONF_CHANGE LOG_PERSIST +// AFTER_SYNC LEASE_KEEP_ALIVE +// | | +// KV_UPDATES TONIC_SERVER +// \ / | +// WATCH_TASK CONF_CHANGE +// +// Other tasks like `CompactBg`, `GcSpecPool`, `GcCmdBoard`, `RevokeExpiredLeases`, `SyncVictims`, +// `Election`, and `AutoCompactor` do not have dependent tasks. -// NOTE: In integration tests, we use bottom tasks, like `WatchTask`, `ConfChange`, and `LogPersist`, +// NOTE: In integration tests, we use bottom tasks, like `WatchTask` and `ConfChange`, // which are not dependent on other tasks to detect the curp group is closed or not. If you want // to refactor the task group, don't forget to modify the `BOTTOM_TASKS` in `crates/curp/tests/it/common/curp_group.rs` // to prevent the integration tests from failing. @@ -35,33 +36,48 @@ macro_rules! enum_with_iter { } } enum_with_iter! { - ConflictCheckedMpmc, - CmdWorker, CompactBg, KvUpdates, WatchTask, LeaseKeepAlive, TonicServer, - LogPersist, Election, SyncFollower, ConfChange, - GcSpecPool, - GcCmdBoard, + GcClientLease, RevokeExpiredLeases, SyncVictims, AutoCompactor, + AfterSync, + HandlePropose, +} + +impl TaskName { + /// Returns `true` if the task is cancel safe + pub(super) fn cancel_safe(self) -> bool { + match self { + TaskName::HandlePropose | TaskName::AfterSync => true, + TaskName::CompactBg + | TaskName::KvUpdates + | TaskName::WatchTask + | TaskName::LeaseKeepAlive + | TaskName::TonicServer + | TaskName::Election + | TaskName::SyncFollower + | TaskName::ConfChange + | TaskName::GcClientLease + | TaskName::RevokeExpiredLeases + | TaskName::SyncVictims + | TaskName::AutoCompactor => false, + } + } } /// All edges of task graph, the first item in each pair must be shut down before the second item -pub const ALL_EDGES: [(TaskName, TaskName); 9] = [ - (TaskName::ConflictCheckedMpmc, TaskName::CmdWorker), - (TaskName::CmdWorker, TaskName::CompactBg), - (TaskName::CmdWorker, TaskName::KvUpdates), +pub const ALL_EDGES: [(TaskName, TaskName); 5] = [ + (TaskName::AfterSync, 
TaskName::KvUpdates), (TaskName::KvUpdates, TaskName::WatchTask), (TaskName::LeaseKeepAlive, TaskName::TonicServer), (TaskName::TonicServer, TaskName::WatchTask), (TaskName::TonicServer, TaskName::ConfChange), - (TaskName::TonicServer, TaskName::LogPersist), - (TaskName::Election, TaskName::LogPersist), ]; diff --git a/crates/xline/Cargo.toml b/crates/xline/Cargo.toml index de32c7c41..d79fb39c1 100644 --- a/crates/xline/Cargo.toml +++ b/crates/xline/Cargo.toml @@ -26,6 +26,7 @@ curp-external-api = { path = "../curp-external-api" } dashmap = "6.0.1" engine = { path = "../engine" } event-listener = "5.3.1" +flume = "0.11.0" futures = "0.3.25" hyper = "1.0.0" itertools = "0.13" diff --git a/crates/xline/src/conflict/mod.rs b/crates/xline/src/conflict/mod.rs index ae16fe66b..279c2f90e 100644 --- a/crates/xline/src/conflict/mod.rs +++ b/crates/xline/src/conflict/mod.rs @@ -2,7 +2,8 @@ use std::sync::Arc; use curp::{ cmd::Command as CurpCommand, - server::{conflict::CommandEntry, SpObject, UcpObject}, + rpc::PoolEntry, + server::{SpObject, UcpObject}, }; use utils::interval_map::Interval; use xlineapi::{ @@ -94,10 +95,7 @@ fn is_exclusive_cmd(cmd: &Command) -> bool { /// Gets all lease id /// * lease ids in the requests field /// * lease ids associated with the keys -pub(super) fn all_leases( - lease_collection: &LeaseCollection, - req: &CommandEntry, -) -> Vec { +pub(super) fn all_leases(lease_collection: &LeaseCollection, req: &PoolEntry) -> Vec { req.leases() .into_iter() .chain(lookup_lease(lease_collection, req)) @@ -109,7 +107,7 @@ pub(super) fn all_leases( /// We also needs to handle `PutRequest` and `DeleteRangeRequest` in /// lease conflict pools, as they may conflict with a `LeaseRevokeRequest`. /// Therefore, we should lookup the lease ids from lease collection. 
-fn lookup_lease(lease_collection: &LeaseCollection, req: &CommandEntry) -> Vec { +fn lookup_lease(lease_collection: &LeaseCollection, req: &PoolEntry) -> Vec { req.request() .keys() .into_iter() diff --git a/crates/xline/src/conflict/spec_pool.rs b/crates/xline/src/conflict/spec_pool.rs index 8bcfb41ec..82f1c84c1 100644 --- a/crates/xline/src/conflict/spec_pool.rs +++ b/crates/xline/src/conflict/spec_pool.rs @@ -1,10 +1,11 @@ -//! A speculative pool(witness) is used to store commands that are speculatively executed. -//! CURP requires that a witness only accepts and saves an operation if it is commutative -//! with every other operation currently stored by that witness +//! A speculative pool(witness) is used to store commands that are speculatively +//! executed. CURP requires that a witness only accepts and saves an operation +//! if it is commutative with every other operation currently stored by that +//! witness use std::{collections::HashMap, sync::Arc}; -use curp::server::conflict::CommandEntry; +use curp::rpc::PoolEntry; use curp_external_api::conflict::{ConflictPoolOp, EntryId, SpeculativePoolOp}; use utils::interval_map::{Interval, IntervalMap}; use xlineapi::{command::Command, interval::BytesAffine}; @@ -18,14 +19,14 @@ use super::{all_leases, intervals, is_exclusive_cmd}; #[cfg_attr(test, derive(Default))] pub(crate) struct KvSpecPool { /// Interval map for keys overlap detection - map: IntervalMap>, + map: IntervalMap>, /// Lease collection lease_collection: Arc, /// Id to intervals map /// - /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we store - /// The lookup results from `LeaseCollection` during entry insert and use - /// these result in entry remove. + /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we + /// store The lookup results from `LeaseCollection` during entry insert + /// and use these result in entry remove. 
intervals: HashMap<<::Entry as EntryId>::Id, Vec>>, } @@ -42,7 +43,7 @@ impl KvSpecPool { } impl ConflictPoolOp for KvSpecPool { - type Entry = CommandEntry; + type Entry = PoolEntry; fn remove(&mut self, entry: &Self::Entry) { for interval in self.intervals.remove(&entry.id()).into_iter().flatten() { @@ -91,14 +92,14 @@ impl SpeculativePoolOp for KvSpecPool { #[cfg_attr(test, derive(Default))] pub(crate) struct LeaseSpecPool { /// Stores leases in the pool - leases: HashMap>, + leases: HashMap>, /// Lease collection lease_collection: Arc, /// Id to lease ids map /// - /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we store - /// The lookup results from `LeaseCollection` during entry insert and use - /// these result in entry remove. + /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we + /// store The lookup results from `LeaseCollection` during entry insert + /// and use these result in entry remove. ids: HashMap<<::Entry as EntryId>::Id, Vec>, } @@ -114,7 +115,7 @@ impl LeaseSpecPool { } impl ConflictPoolOp for LeaseSpecPool { - type Entry = CommandEntry; + type Entry = PoolEntry; fn is_empty(&self) -> bool { self.leases.is_empty() @@ -162,11 +163,11 @@ impl SpeculativePoolOp for LeaseSpecPool { #[derive(Debug, Default)] pub(crate) struct ExclusiveSpecPool { /// Stores the command - conflict: Option>, + conflict: Option>, } impl ConflictPoolOp for ExclusiveSpecPool { - type Entry = CommandEntry; + type Entry = PoolEntry; fn is_empty(&self) -> bool { self.conflict.is_none() diff --git a/crates/xline/src/conflict/tests.rs b/crates/xline/src/conflict/tests.rs index 36f368005..44954f24f 100644 --- a/crates/xline/src/conflict/tests.rs +++ b/crates/xline/src/conflict/tests.rs @@ -1,6 +1,6 @@ use std::sync::Arc; -use curp::{rpc::ProposeId, server::conflict::CommandEntry}; +use curp::rpc::{PoolEntry, ProposeId}; use curp_external_api::conflict::{ConflictPoolOp, SpeculativePoolOp, UncommittedPoolOp}; use xlineapi::{ 
command::Command, AuthEnableRequest, AuthRoleAddRequest, DeleteRangeRequest, LeaseGrantRequest, @@ -410,7 +410,7 @@ fn lease_ucp_mutation_no_side_effect() { assert!(ucp.all_conflict(&lease_revoke).is_empty()); } -fn compare_commands(mut a: Vec>, mut b: Vec>) { +fn compare_commands(mut a: Vec>, mut b: Vec>) { a.sort_unstable(); b.sort_unstable(); assert_eq!(a, b); @@ -422,14 +422,14 @@ struct EntryGenerator { } impl EntryGenerator { - fn gen_put(&mut self, key: &str) -> CommandEntry { + fn gen_put(&mut self, key: &str) -> PoolEntry { self.gen_entry(RequestWrapper::PutRequest(PutRequest { key: key.as_bytes().to_vec(), ..Default::default() })) } - fn gen_delete_range(&mut self, key: &str, range_end: &str) -> CommandEntry { + fn gen_delete_range(&mut self, key: &str, range_end: &str) -> PoolEntry { self.gen_entry(RequestWrapper::DeleteRangeRequest(DeleteRangeRequest { key: key.as_bytes().to_vec(), range_end: range_end.as_bytes().to_vec(), @@ -437,32 +437,32 @@ impl EntryGenerator { })) } - fn gen_lease_grant(&mut self, id: i64) -> CommandEntry { + fn gen_lease_grant(&mut self, id: i64) -> PoolEntry { self.gen_entry(RequestWrapper::LeaseGrantRequest(LeaseGrantRequest { id, ..Default::default() })) } - fn gen_lease_revoke(&mut self, id: i64) -> CommandEntry { + fn gen_lease_revoke(&mut self, id: i64) -> PoolEntry { self.gen_entry(RequestWrapper::LeaseRevokeRequest(LeaseRevokeRequest { id, })) } - fn gen_auth_enable(&mut self) -> CommandEntry { + fn gen_auth_enable(&mut self) -> PoolEntry { self.gen_entry(RequestWrapper::AuthEnableRequest(AuthEnableRequest {})) } - fn gen_role_add(&mut self) -> CommandEntry { + fn gen_role_add(&mut self) -> PoolEntry { self.gen_entry(RequestWrapper::AuthRoleAddRequest( AuthRoleAddRequest::default(), )) } - fn gen_entry(&mut self, req: RequestWrapper) -> CommandEntry { + fn gen_entry(&mut self, req: RequestWrapper) -> PoolEntry { self.id += 1; let cmd = Command::new(req); - CommandEntry::new(ProposeId(0, self.id), Arc::new(cmd)) + 
PoolEntry::new(ProposeId(0, self.id), Arc::new(cmd)) } } diff --git a/crates/xline/src/conflict/uncommitted_pool.rs b/crates/xline/src/conflict/uncommitted_pool.rs index ba02ed5ca..6bfd5c693 100644 --- a/crates/xline/src/conflict/uncommitted_pool.rs +++ b/crates/xline/src/conflict/uncommitted_pool.rs @@ -1,13 +1,14 @@ //! An uncommitted pool is used to store unsynced commands. -//! CURP requires that a master will only execute client operations speculatively, -//! if that operation is commutative with every other unsynced operation. +//! CURP requires that a master will only execute client operations +//! speculatively, if that operation is commutative with every other unsynced +//! operation. use std::{ collections::{hash_map, HashMap}, sync::Arc, }; -use curp::server::conflict::CommandEntry; +use curp::rpc::PoolEntry; use curp_external_api::conflict::{ConflictPoolOp, EntryId, UncommittedPoolOp}; use itertools::Itertools; use utils::interval_map::{Interval, IntervalMap}; @@ -27,9 +28,9 @@ pub(crate) struct KvUncomPool { lease_collection: Arc, /// Id to intervals map /// - /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we store - /// The lookup results from `LeaseCollection` during entry insert and use - /// these result in entry remove. + /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we + /// store The lookup results from `LeaseCollection` during entry insert + /// and use these result in entry remove. 
intervals: HashMap<<::Entry as EntryId>::Id, Vec>>, } @@ -46,7 +47,7 @@ impl KvUncomPool { } impl ConflictPoolOp for KvUncomPool { - type Entry = CommandEntry; + type Entry = PoolEntry; fn remove(&mut self, entry: &Self::Entry) { for interval in self.intervals.remove(&entry.id()).into_iter().flatten() { @@ -114,9 +115,9 @@ pub(crate) struct LeaseUncomPool { lease_collection: Arc, /// Id to lease ids map /// - /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we store - /// The lookup results from `LeaseCollection` during entry insert and use - /// these result in entry remove. + /// NOTE: To avoid potential side-effects from the `LeaseCollection`, we + /// store The lookup results from `LeaseCollection` during entry insert + /// and use these result in entry remove. ids: HashMap<<::Entry as EntryId>::Id, Vec>, } @@ -132,7 +133,7 @@ impl LeaseUncomPool { } impl ConflictPoolOp for LeaseUncomPool { - type Entry = CommandEntry; + type Entry = PoolEntry; fn remove(&mut self, entry: &Self::Entry) { for id in self.ids.remove(&entry.id()).into_iter().flatten() { @@ -205,7 +206,7 @@ pub(crate) struct ExclusiveUncomPool { } impl ConflictPoolOp for ExclusiveUncomPool { - type Entry = CommandEntry; + type Entry = PoolEntry; fn all(&self) -> Vec { self.conflicts.all() @@ -253,19 +254,19 @@ struct Commands { /// /// As we may need to insert multiple commands with the same /// set of keys, we store a vector of commands as the value. 
- cmds: Vec>, + cmds: Vec>, } impl Commands { /// Appends a cmd to the value - fn push_cmd(&mut self, cmd: CommandEntry) { + fn push_cmd(&mut self, cmd: PoolEntry) { self.cmds.push(cmd); } /// Removes a cmd from the value /// /// Returns `true` if the value is empty - fn remove_cmd(&mut self, cmd: &CommandEntry) -> bool { + fn remove_cmd(&mut self, cmd: &PoolEntry) -> bool { let Some(idx) = self.cmds.iter().position(|c| c == cmd) else { return self.is_empty(); }; @@ -279,7 +280,7 @@ impl Commands { } /// Gets all commands - fn all(&self) -> Vec> { + fn all(&self) -> Vec> { self.cmds.clone() } diff --git a/crates/xline/src/revision_number.rs b/crates/xline/src/revision_number.rs index bf6043cd8..fb5e4287f 100644 --- a/crates/xline/src/revision_number.rs +++ b/crates/xline/src/revision_number.rs @@ -2,27 +2,35 @@ use std::sync::atomic::{AtomicI64, Ordering}; /// Revision number #[derive(Debug)] -pub(crate) struct RevisionNumberGenerator(AtomicI64); +pub(crate) struct RevisionNumberGenerator { + /// The current revision number + current: AtomicI64, +} impl RevisionNumberGenerator { /// Create a new revision pub(crate) fn new(rev: i64) -> Self { - Self(AtomicI64::new(rev)) + Self { + current: AtomicI64::new(rev), + } } - /// Get the revision number + /// Get the current revision number pub(crate) fn get(&self) -> i64 { - self.0.load(Ordering::Relaxed) - } - - /// Get the next revision number - pub(crate) fn next(&self) -> i64 { - self.0.fetch_add(1, Ordering::Relaxed).wrapping_add(1) + self.current.load(Ordering::Relaxed) } /// Set the revision number pub(crate) fn set(&self, rev: i64) { - self.0.store(rev, Ordering::Relaxed); + self.current.store(rev, Ordering::Relaxed); + } + + /// Gets a temporary state + pub(crate) fn state(&self) -> RevisionNumberGeneratorState { + RevisionNumberGeneratorState { + current: &self.current, + next: AtomicI64::new(self.get()), + } } } @@ -32,3 +40,29 @@ impl Default for RevisionNumberGenerator { RevisionNumberGenerator::new(1) } } + 
+/// Revision generator with temporary state +pub(crate) struct RevisionNumberGeneratorState<'a> { + /// The current revision number + current: &'a AtomicI64, + /// Next revision number + next: AtomicI64, +} + +impl RevisionNumberGeneratorState<'_> { + /// Get the current revision number + pub(crate) fn get(&self) -> i64 { + self.next.load(Ordering::Relaxed) + } + + /// Increases the next revision number + pub(crate) fn next(&self) -> i64 { + self.next.fetch_add(1, Ordering::Relaxed).wrapping_add(1) + } + + /// Commit the revision number + pub(crate) fn commit(&self) { + self.current + .store(self.next.load(Ordering::Relaxed), Ordering::Relaxed); + } +} diff --git a/crates/xline/src/server/auth_server.rs b/crates/xline/src/server/auth_server.rs index 33a0949ef..bd285d926 100644 --- a/crates/xline/src/server/auth_server.rs +++ b/crates/xline/src/server/auth_server.rs @@ -51,7 +51,6 @@ impl AuthServer { async fn propose( &self, request: tonic::Request, - use_fast_path: bool, ) -> Result<(CommandResponse, Option), tonic::Status> where T: Into, @@ -59,7 +58,7 @@ impl AuthServer { let auth_info = self.auth_store.try_get_auth_info_from_request(&request)?; let request = request.into_inner().into(); let cmd = Command::new_with_auth_info(request, auth_info); - let res = self.client.propose(&cmd, None, use_fast_path).await??; + let res = self.client.propose(&cmd, None, false).await??; Ok(res) } @@ -67,13 +66,12 @@ impl AuthServer { async fn handle_req( &self, request: tonic::Request, - use_fast_path: bool, ) -> Result, tonic::Status> where Req: Into, Res: From, { - let (cmd_res, sync_res) = self.propose(request, use_fast_path).await?; + let (cmd_res, sync_res) = self.propose(request).await?; let mut res_wrapper = cmd_res.into_inner(); if let Some(sync_res) = sync_res { res_wrapper.update_revision(sync_res.revision()); @@ -89,7 +87,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthEnableRequest {:?}", request); - 
self.handle_req(request, false).await + self.handle_req(request).await } async fn auth_disable( @@ -97,7 +95,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthDisableRequest {:?}", request); - self.handle_req(request, false).await + self.handle_req(request).await } async fn auth_status( @@ -105,8 +103,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthStatusRequest {:?}", request); - let is_fast_path = true; - self.handle_req(request, is_fast_path).await + self.handle_req(request).await } async fn authenticate( @@ -114,7 +111,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthenticateRequest {:?}", request); - self.handle_req(request, false).await + self.handle_req(request).await } async fn user_add( @@ -128,7 +125,7 @@ impl Auth for AuthServer { .map_err(|err| tonic::Status::internal(format!("Failed to hash password: {err}")))?; user_add_req.hashed_password = hashed_password; user_add_req.password = String::new(); - self.handle_req(request, false).await + self.handle_req(request).await } async fn user_get( @@ -136,8 +133,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthUserGetRequest {:?}", request); - let is_fast_path = true; - self.handle_req(request, is_fast_path).await + self.handle_req(request).await } async fn user_list( @@ -145,8 +141,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthUserListRequest {:?}", request); - let is_fast_path = true; - self.handle_req(request, is_fast_path).await + self.handle_req(request).await } async fn user_delete( @@ -154,7 +149,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthUserDeleteRequest {:?}", request); - self.handle_req(request, false).await + self.handle_req(request).await } async 
fn user_change_password( @@ -167,7 +162,7 @@ impl Auth for AuthServer { .map_err(|err| tonic::Status::internal(format!("Failed to hash password: {err}")))?; user_change_password_req.hashed_password = hashed_password; user_change_password_req.password = String::new(); - self.handle_req(request, false).await + self.handle_req(request).await } async fn user_grant_role( @@ -175,7 +170,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthUserGrantRoleRequest {:?}", request); - self.handle_req(request, false).await + self.handle_req(request).await } async fn user_revoke_role( @@ -183,7 +178,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthUserRevokeRoleRequest {:?}", request); - self.handle_req(request, false).await + self.handle_req(request).await } async fn role_add( @@ -192,7 +187,7 @@ impl Auth for AuthServer { ) -> Result, tonic::Status> { debug!("Receive AuthRoleAddRequest {:?}", request); request.get_ref().validation()?; - self.handle_req(request, false).await + self.handle_req(request).await } async fn role_get( @@ -200,8 +195,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthRoleGetRequest {:?}", request); - let is_fast_path = true; - self.handle_req(request, is_fast_path).await + self.handle_req(request).await } async fn role_list( @@ -209,8 +203,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthRoleListRequest {:?}", request); - let is_fast_path = true; - self.handle_req(request, is_fast_path).await + self.handle_req(request).await } async fn role_delete( @@ -218,7 +211,7 @@ impl Auth for AuthServer { request: tonic::Request, ) -> Result, tonic::Status> { debug!("Receive AuthRoleDeleteRequest {:?}", request); - self.handle_req(request, false).await + self.handle_req(request).await } async fn role_grant_permission( @@ -230,7 +223,7 @@ impl 
Auth for AuthServer { request.get_ref() ); request.get_ref().validation()?; - self.handle_req(request, false).await + self.handle_req(request).await } async fn role_revoke_permission( @@ -241,6 +234,6 @@ impl Auth for AuthServer { "Receive AuthRoleRevokePermissionRequest {}", request.get_ref() ); - self.handle_req(request, false).await + self.handle_req(request).await } } diff --git a/crates/xline/src/server/auth_wrapper.rs b/crates/xline/src/server/auth_wrapper.rs index 509d57b16..1df9d65d0 100644 --- a/crates/xline/src/server/auth_wrapper.rs +++ b/crates/xline/src/server/auth_wrapper.rs @@ -4,11 +4,13 @@ use curp::{ cmd::PbCodec, rpc::{ FetchClusterRequest, FetchClusterResponse, FetchReadStateRequest, FetchReadStateResponse, - LeaseKeepAliveMsg, MoveLeaderRequest, MoveLeaderResponse, ProposeConfChangeRequest, - ProposeConfChangeResponse, ProposeRequest, ProposeResponse, Protocol, PublishRequest, - PublishResponse, ShutdownRequest, ShutdownResponse, WaitSyncedRequest, WaitSyncedResponse, + LeaseKeepAliveMsg, MoveLeaderRequest, MoveLeaderResponse, OpResponse, + ProposeConfChangeRequest, ProposeConfChangeResponse, ProposeRequest, Protocol, + PublishRequest, PublishResponse, ReadIndexRequest, ReadIndexResponse, RecordRequest, + RecordResponse, ShutdownRequest, ShutdownResponse, }, }; +use flume::r#async::RecvStream; use tracing::debug; use xlineapi::command::Command; @@ -35,10 +37,12 @@ impl AuthWrapper { #[tonic::async_trait] impl Protocol for AuthWrapper { - async fn propose( + type ProposeStreamStream = RecvStream<'static, Result>; + + async fn propose_stream( &self, mut request: tonic::Request, - ) -> Result, tonic::Status> { + ) -> Result, tonic::Status> { debug!( "AuthWrapper received propose request: {}", request.get_ref().propose_id() @@ -51,7 +55,21 @@ impl Protocol for AuthWrapper { command.set_auth_info(auth_info); request.get_mut().command = command.encode(); }; - self.curp_server.propose(request).await + self.curp_server.propose_stream(request).await + } 
+ + async fn record( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.curp_server.record(request).await + } + + async fn read_index( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.curp_server.read_index(request).await } async fn shutdown( @@ -75,13 +93,6 @@ impl Protocol for AuthWrapper { self.curp_server.publish(request).await } - async fn wait_synced( - &self, - request: tonic::Request, - ) -> Result, tonic::Status> { - self.curp_server.wait_synced(request).await - } - async fn fetch_cluster( &self, request: tonic::Request, diff --git a/crates/xline/src/server/command.rs b/crates/xline/src/server/command.rs index 1fb8fee57..cd564729d 100644 --- a/crates/xline/src/server/command.rs +++ b/crates/xline/src/server/command.rs @@ -1,28 +1,31 @@ -use std::{fmt::Debug, sync::Arc}; +use std::{fmt::Debug, iter, sync::Arc}; use clippy_utilities::OverflowArithmetic; use curp::{ - cmd::{Command as CurpCommand, CommandExecutor as CurpCommandExecutor}, + cmd::{ + AfterSyncCmd, AfterSyncOk, Command as CurpCommand, CommandExecutor as CurpCommandExecutor, + }, members::ServerId, InflightId, LogIndex, }; use dashmap::DashMap; -use engine::Snapshot; +use engine::{Snapshot, TransactionApi}; use event_listener::Event; use parking_lot::RwLock; use tracing::warn; use utils::{barrier::IdBarrier, table_names::META_TABLE}; use xlineapi::{ - command::{Command, CurpClient}, + command::{Command, CurpClient, SyncResponse}, execute_error::ExecuteError, AlarmAction, AlarmRequest, AlarmType, }; use crate::{ - revision_number::RevisionNumberGenerator, + revision_number::RevisionNumberGeneratorState, rpc::{RequestBackend, RequestWrapper}, storage::{ db::{WriteOp, DB}, + index::IndexOperate, storage_api::XlineStorageOps, AlarmStore, AuthStore, KvStore, LeaseStore, }, @@ -75,10 +78,6 @@ pub(crate) struct CommandExecutor { db: Arc, /// Barrier for propose id id_barrier: Arc>, - /// Revision Number generator for KV request and Lease request - 
general_rev: Arc, - /// Revision Number generator for Auth request - auth_rev: Arc, /// Compact events compact_events: Arc>>, /// Quota checker @@ -224,8 +223,6 @@ impl CommandExecutor { alarm_storage: Arc, db: Arc, id_barrier: Arc>, - general_rev: Arc, - auth_rev: Arc, compact_events: Arc>>, quota: u64, ) -> Self { @@ -238,8 +235,6 @@ impl CommandExecutor { alarm_storage, db, id_barrier, - general_rev, - auth_rev, compact_events, quota_checker, alarmer, @@ -274,83 +269,260 @@ impl CommandExecutor { _ => Ok(()), } } -} -#[async_trait::async_trait] -impl CurpCommandExecutor for CommandExecutor { - fn prepare( + /// After sync KV commands + fn after_sync_kv( &self, - cmd: &Command, - ) -> Result<::PR, ::Error> { - self.check_alarm(cmd)?; - let wrapper = cmd.request(); - let auth_info = cmd.auth_info(); - self.auth_storage.check_permission(wrapper, auth_info)?; - let revision = match wrapper.backend() { - RequestBackend::Auth => { - if wrapper.skip_auth_revision() { - self.auth_rev.get() - } else { - self.auth_rev.next() - } - } - RequestBackend::Kv | RequestBackend::Lease => { - if wrapper.skip_general_revision() { - self.general_rev.get() - } else { - self.general_rev.next() - } + wrapper: &RequestWrapper, + txn_db: &T, + index: &(dyn IndexOperate + Send + Sync), + revision_gen: &RevisionNumberGeneratorState<'_>, + to_execute: bool, + ) -> Result< + ( + ::ASR, + Option<::ER>, + ), + ExecuteError, + > + where + T: XlineStorageOps + TransactionApi, + { + let (asr, er) = + self.kv_storage + .after_sync(wrapper, txn_db, index, revision_gen, to_execute)?; + Ok((asr, er)) + } + + /// After sync other type of commands + fn after_sync_others( + &self, + wrapper: &RequestWrapper, + txn_db: &T, + index: &I, + general_revision: &RevisionNumberGeneratorState<'_>, + auth_revision: &RevisionNumberGeneratorState<'_>, + to_execute: bool, + ) -> Result< + ( + ::ASR, + Option<::ER>, + ), + ExecuteError, + > + where + T: XlineStorageOps + TransactionApi, + I: IndexOperate, + { + let 
er = to_execute + .then(|| match wrapper.backend() { + RequestBackend::Auth => self.auth_storage.execute(wrapper), + RequestBackend::Lease => self.lease_storage.execute(wrapper), + RequestBackend::Alarm => Ok(self.alarm_storage.execute(wrapper)), + RequestBackend::Kv => unreachable!("Should not execute kv commands"), + }) + .transpose()?; + + let (asr, wr_ops) = match wrapper.backend() { + RequestBackend::Auth => self.auth_storage.after_sync(wrapper, auth_revision)?, + RequestBackend::Lease => { + self.lease_storage + .after_sync(wrapper, general_revision, txn_db, index)? } - RequestBackend::Alarm => -1, + RequestBackend::Alarm => self.alarm_storage.after_sync(wrapper, general_revision), + RequestBackend::Kv => unreachable!("Should not sync kv commands"), }; - Ok(revision) + + txn_db.write_ops(wr_ops)?; + + Ok((asr, er)) + } +} + +/// After Sync Result +type AfterSyncResult = Result, ::Error>; + +/// Collection of after sync results +struct ASResults<'a> { + /// After sync cmds and there execution results + cmd_results: Vec<(AfterSyncCmd<'a, Command>, Option)>, +} + +impl<'a> ASResults<'a> { + /// Creates a new [`ASResultStates`]. + fn new(cmds: Vec>) -> Self { + Self { + // Initially all commands have no results + cmd_results: cmds.into_iter().map(|cmd| (cmd, None)).collect(), + } + } + + #[allow(clippy::pattern_type_mismatch)] // can't be fixed + /// Updates the results of commands that have errors by applying a given + /// operation. + fn update_err(&mut self, op: F) + where + F: Fn(&AfterSyncCmd<'_, Command>) -> Result<(), ExecuteError>, + { + self.for_each_none_result(|(cmd, result_opt)| { + if let Err(e) = op(cmd) { + let _ignore = result_opt.replace(Err(e)); + } + }); + } + + /// Updates the results of commands by applying a given operation. 
+ #[allow(clippy::pattern_type_mismatch)] // can't be fixed + fn update_result(&mut self, op: F) + where + F: Fn(&AfterSyncCmd<'_, Command>) -> AfterSyncResult, + { + self.for_each_none_result(|(cmd, result_opt)| { + let _ignore = result_opt.replace(op(cmd)); + }); } - async fn execute( + /// Applies the provided operation to each command-result pair in `cmd_results` where the result is `None`. + #[allow(clippy::pattern_type_mismatch)] // can't be fixed + fn for_each_none_result(&mut self, op: F) + where + F: FnMut(&mut (AfterSyncCmd<'_, Command>, Option)), + { + self.cmd_results + .iter_mut() + .filter(|(_cmd, res)| res.is_none()) + .for_each(op); + } + + /// Converts into errors. + fn into_errors(self, err: ::Error) -> Vec { + iter::repeat(err) + .map(Err) + .take(self.cmd_results.len()) + .collect() + } + + /// Converts into results. + fn into_results(self) -> Vec { + self.cmd_results + .into_iter() + .filter_map(|(_cmd, res)| res) + .collect() + } +} + +#[async_trait::async_trait] +impl CurpCommandExecutor for CommandExecutor { + fn execute( &self, cmd: &Command, ) -> Result<::ER, ::Error> { + self.check_alarm(cmd)?; + let auth_info = cmd.auth_info(); let wrapper = cmd.request(); + self.auth_storage.check_permission(wrapper, auth_info)?; match wrapper.backend() { - RequestBackend::Kv => self.kv_storage.execute(wrapper), + RequestBackend::Kv => self.kv_storage.execute(wrapper, None), RequestBackend::Auth => self.auth_storage.execute(wrapper), RequestBackend::Lease => self.lease_storage.execute(wrapper), RequestBackend::Alarm => Ok(self.alarm_storage.execute(wrapper)), } } - async fn after_sync( + fn execute_ro( &self, cmd: &Command, - index: LogIndex, - revision: i64, - ) -> Result<::ASR, ::Error> { - let quota_enough = self.quota_checker.check(cmd); - let mut ops = vec![WriteOp::PutAppliedIndex(index)]; + ) -> Result< + (::ER, ::ASR), + ::Error, + > { + let er = self.execute(cmd)?; let wrapper = cmd.request(); - let (res, mut wr_ops) = match wrapper.backend() { 
- RequestBackend::Kv => self.kv_storage.after_sync(wrapper, revision).await?, - RequestBackend::Auth => self.auth_storage.after_sync(wrapper, revision)?, - RequestBackend::Lease => self.lease_storage.after_sync(wrapper, revision).await?, - RequestBackend::Alarm => self.alarm_storage.after_sync(wrapper, revision), - }; - if let RequestWrapper::CompactionRequest(ref compact_req) = *wrapper { - if compact_req.physical { - if let Some(n) = self.compact_events.get(&cmd.compact_id()) { - let _ignore = n.notify(usize::MAX); - } + let rev = match wrapper.backend() { + RequestBackend::Kv | RequestBackend::Lease | RequestBackend::Alarm => { + self.kv_storage.revision_gen().get() } + RequestBackend::Auth => self.auth_storage.revision_gen().get(), }; - if let RequestWrapper::CompactionRequest(ref compact_req) = *wrapper { - if compact_req.physical { - if let Some(n) = self.compact_events.get(&cmd.compact_id()) { - let _ignore = n.notify(usize::MAX); - } + Ok((er, SyncResponse::new(rev))) + } + + fn after_sync( + &self, + cmds: Vec>, + highest_index: Option, + ) -> Vec { + if cmds.is_empty() { + return Vec::new(); + } + let quota_enough = cmds + .iter() + .map(AfterSyncCmd::cmd) + .all(|c| self.quota_checker.check(c)); + + let mut states = ASResults::new(cmds); + states.update_err(|c| self.check_alarm(c.cmd())); + states.update_err(|c| { + self.auth_storage + .check_permission(c.cmd().request(), c.cmd().auth_info()) + }); + + let index = self.kv_storage.index(); + let index_state = index.state(); + let general_revision_gen = self.kv_storage.revision_gen(); + let auth_revision_gen = self.auth_storage.revision_gen(); + let general_revision_state = general_revision_gen.state(); + let auth_revision_state = auth_revision_gen.state(); + + let txn_db = self.db.transaction(); + if let Some(i) = highest_index { + if let Err(e) = txn_db.write_op(WriteOp::PutAppliedIndex(i)) { + return states.into_errors(e); } - }; - ops.append(&mut wr_ops); - self.db.write_ops(ops)?; - 
self.lease_storage.mark_lease_synced(wrapper); + } + + states.update_result(|c| { + let (cmd, to_execute) = c.into_parts(); + let wrapper = cmd.request(); + let (asr, er) = match wrapper.backend() { + RequestBackend::Kv => self.after_sync_kv( + wrapper, + &txn_db, + &index_state, + &general_revision_state, + to_execute, + ), + RequestBackend::Auth | RequestBackend::Lease | RequestBackend::Alarm => self + .after_sync_others( + wrapper, + &txn_db, + &index_state, + &general_revision_state, + &auth_revision_state, + to_execute, + ), + }?; + + if let RequestWrapper::CompactionRequest(ref compact_req) = *wrapper { + if compact_req.physical { + if let Some(n) = self.compact_events.get(&cmd.compact_id()) { + let _ignore = n.notify(usize::MAX); + } + } + }; + + self.lease_storage.mark_lease_synced(wrapper); + + Ok(AfterSyncOk::new(asr, er)) + }); + + if let Err(e) = txn_db.commit() { + return states.into_errors(ExecuteError::DbError(e.to_string())); + } + index_state.commit(); + general_revision_state.commit(); + auth_revision_state.commit(); + if !quota_enough { if let Some(alarmer) = self.alarmer.read().clone() { let _ig = tokio::spawn(async move { @@ -363,7 +535,8 @@ impl CurpCommandExecutor for CommandExecutor { }); } } - Ok(res) + + states.into_results() } async fn reset( @@ -376,7 +549,8 @@ impl CurpCommandExecutor for CommandExecutor { } else { None }; - self.db.reset(s).await + self.db.reset(s).await?; + self.kv_storage.recover().await } async fn snapshot(&self) -> Result::Error> { diff --git a/crates/xline/src/server/kv_server.rs b/crates/xline/src/server/kv_server.rs index d730b4b14..7e87064f3 100644 --- a/crates/xline/src/server/kv_server.rs +++ b/crates/xline/src/server/kv_server.rs @@ -76,7 +76,7 @@ impl KvServer { fn do_serializable(&self, command: &Command) -> Result { self.auth_storage .check_permission(command.request(), command.auth_info())?; - let cmd_res = self.kv_storage.execute(command.request())?; + let cmd_res = 
self.kv_storage.execute(command.request(), None)?; Ok(Self::parse_response_op(cmd_res.into_inner().into())) } @@ -232,9 +232,9 @@ impl Kv for KvServer { } } - /// Compact compacts the event history in the etcd key-value store. The key-value - /// store should be periodically compacted or the event history will continue to grow - /// indefinitely. + /// Compact compacts the event history in the etcd key-value store. The + /// key-value store should be periodically compacted or the event + /// history will continue to grow indefinitely. #[instrument(skip_all)] async fn compact( &self, @@ -258,7 +258,7 @@ impl Kv for KvServer { } else { Either::Right(async {}) }; - let (cmd_res, _sync_res) = self.client.propose(&cmd, None, !physical).await??; + let (cmd_res, _sync_res) = self.client.propose(&cmd, None, false).await??; let resp = cmd_res.into_inner(); if timeout(self.compact_timeout, compact_physical_fut) .await diff --git a/crates/xline/src/server/lease_server.rs b/crates/xline/src/server/lease_server.rs index 931abb015..d528c1c8d 100644 --- a/crates/xline/src/server/lease_server.rs +++ b/crates/xline/src/server/lease_server.rs @@ -119,7 +119,6 @@ impl LeaseServer { async fn propose( &self, request: tonic::Request, - use_fast_path: bool, ) -> Result<(CommandResponse, Option), tonic::Status> where T: Into, @@ -127,7 +126,7 @@ impl LeaseServer { let auth_info = self.auth_storage.try_get_auth_info_from_request(&request)?; let request = request.into_inner().into(); let cmd = Command::new_with_auth_info(request, auth_info); - let res = self.client.propose(&cmd, None, use_fast_path).await??; + let res = self.client.propose(&cmd, None, false).await??; Ok(res) } @@ -255,8 +254,7 @@ impl Lease for LeaseServer { lease_grant_req.id = self.id_gen.next(); } - let is_fast_path = true; - let (res, sync_res) = self.propose(request, is_fast_path).await?; + let (res, sync_res) = self.propose(request).await?; let mut res: LeaseGrantResponse = res.into_inner().into(); if let 
Some(sync_res) = sync_res { @@ -276,8 +274,7 @@ impl Lease for LeaseServer { ) -> Result, tonic::Status> { debug!("Receive LeaseRevokeRequest {:?}", request); - let is_fast_path = true; - let (res, sync_res) = self.propose(request, is_fast_path).await?; + let (res, sync_res) = self.propose(request).await?; let mut res: LeaseRevokeResponse = res.into_inner().into(); if let Some(sync_res) = sync_res { @@ -378,8 +375,7 @@ impl Lease for LeaseServer { ) -> Result, tonic::Status> { debug!("Receive LeaseLeasesRequest {:?}", request); - let is_fast_path = true; - let (res, sync_res) = self.propose(request, is_fast_path).await?; + let (res, sync_res) = self.propose(request).await?; let mut res: LeaseLeasesResponse = res.into_inner().into(); if let Some(sync_res) = sync_res { diff --git a/crates/xline/src/server/lock_server.rs b/crates/xline/src/server/lock_server.rs index 578b03e1e..ac0b39aa2 100644 --- a/crates/xline/src/server/lock_server.rs +++ b/crates/xline/src/server/lock_server.rs @@ -71,14 +71,13 @@ impl LockServer { &self, request: T, auth_info: Option, - use_fast_path: bool, ) -> Result<(CommandResponse, Option), tonic::Status> where T: Into, { let request = request.into(); let cmd = Command::new_with_auth_info(request, auth_info); - let res = self.client.propose(&cmd, None, use_fast_path).await??; + let res = self.client.propose(&cmd, None, false).await??; Ok(res) } @@ -148,7 +147,7 @@ impl LockServer { max_create_revision: rev, ..Default::default() }; - let (cmd_res, _sync_res) = self.propose(get_req, auth_info.cloned(), false).await?; + let (cmd_res, _sync_res) = self.propose(get_req, auth_info.cloned()).await?; let response = Into::::into(cmd_res.into_inner()); let last_key = match response.kvs.first() { Some(kv) => kv.key.clone(), @@ -186,7 +185,7 @@ impl LockServer { key: key.into(), ..Default::default() }; - let (cmd_res, _) = self.propose(del_req, auth_info, true).await?; + let (cmd_res, _) = self.propose(del_req, auth_info).await?; let res = 
Into::::into(cmd_res.into_inner()); Ok(res.header) } @@ -198,7 +197,7 @@ impl LockServer { ttl: DEFAULT_SESSION_TTL, id: lease_id, }; - let (cmd_res, _) = self.propose(lease_grant_req, auth_info, true).await?; + let (cmd_res, _) = self.propose(lease_grant_req, auth_info).await?; let res = Into::::into(cmd_res.into_inner()); Ok(res.id) } @@ -229,7 +228,7 @@ impl Lock for LockServer { let key = format!("{prefix}{lease_id:x}"); let txn = Self::create_acquire_txn(&prefix, lease_id); - let (cmd_res, sync_res) = self.propose(txn, auth_info.clone(), false).await?; + let (cmd_res, sync_res) = self.propose(txn, auth_info.clone()).await?; let mut txn_res = Into::::into(cmd_res.into_inner()); #[allow(clippy::unwrap_used)] // sync_res always has value when use slow path let my_rev = sync_res.unwrap().revision(); @@ -261,7 +260,7 @@ impl Lock for LockServer { key: key.as_bytes().to_vec(), ..Default::default() }; - let result = self.propose(range_req, auth_info.clone(), true).await; + let result = self.propose(range_req, auth_info.clone()).await; match result { Ok(res) => { let res = Into::::into(res.0.into_inner()); diff --git a/crates/xline/src/server/maintenance.rs b/crates/xline/src/server/maintenance.rs index 2ecd5943e..9ecf80209 100644 --- a/crates/xline/src/server/maintenance.rs +++ b/crates/xline/src/server/maintenance.rs @@ -84,7 +84,6 @@ impl MaintenanceServer { async fn propose( &self, request: tonic::Request, - use_fast_path: bool, ) -> Result<(CommandResponse, Option), tonic::Status> where T: Into + Debug, @@ -92,7 +91,7 @@ impl MaintenanceServer { let auth_info = self.auth_store.try_get_auth_info_from_request(&request)?; let request = request.into_inner().into(); let cmd = Command::new_with_auth_info(request, auth_info); - let res = self.client.propose(&cmd, None, use_fast_path).await??; + let res = self.client.propose(&cmd, None, false).await??; Ok(res) } } @@ -103,8 +102,7 @@ impl Maintenance for MaintenanceServer { &self, request: tonic::Request, ) -> Result, 
tonic::Status> { - let is_fast_path = true; - let (res, sync_res) = self.propose(request, is_fast_path).await?; + let (res, sync_res) = self.propose(request).await?; let mut res: AlarmResponse = res.into_inner().into(); if let Some(sync_res) = sync_res { let revision = sync_res.revision(); diff --git a/crates/xline/src/server/watch_server.rs b/crates/xline/src/server/watch_server.rs index 8476a5a38..d7cb68f60 100644 --- a/crates/xline/src/server/watch_server.rs +++ b/crates/xline/src/server/watch_server.rs @@ -417,6 +417,7 @@ mod test { time::Duration, }; + use engine::TransactionApi; use parking_lot::Mutex; use test_macros::abort_on_panic; use tokio::{ @@ -431,8 +432,7 @@ mod test { rpc::{PutRequest, WatchProgressRequest}, storage::{ compact::COMPACT_CHANNEL_SIZE, db::DB, index::Index, kv_store::KvStoreInner, - kvwatcher::MockKvWatcherOps, lease_store::LeaseCollection, - storage_api::XlineStorageOps, KvStore, + kvwatcher::MockKvWatcherOps, lease_store::LeaseCollection, KvStore, }, }; @@ -444,20 +444,24 @@ mod test { && wr.header.as_ref().map_or(false, |h| h.revision != 0) } - async fn put( - store: &KvStore, - db: &DB, - key: impl Into>, - value: impl Into>, - revision: i64, - ) { + fn put(store: &KvStore, key: impl Into>, value: impl Into>) { let req = RequestWrapper::from(PutRequest { key: key.into(), value: value.into(), ..Default::default() }); - let (_sync_res, ops) = store.after_sync(&req, revision).await.unwrap(); - db.write_ops(ops).unwrap(); + + let rev_gen = store.revision_gen(); + let index = store.index(); + let txn = store.db().transaction(); + let rev_state = rev_gen.state(); + let index_state = index.state(); + store + .after_sync(&req, &txn, &index_state, &rev_state, false) + .unwrap(); + txn.commit().unwrap(); + index_state.commit(); + rev_state.commit(); } #[tokio::test] @@ -581,13 +585,13 @@ mod test { #[abort_on_panic] async fn test_watch_prev_kv() { let task_manager = Arc::new(TaskManager::new()); - let (compact_tx, _compact_rx) = 
mpsc::channel(COMPACT_CHANNEL_SIZE); + let (compact_tx, _compact_rx) = flume::bounded(COMPACT_CHANNEL_SIZE); let index = Arc::new(Index::new()); let db = DB::open(&EngineConfig::Memory).unwrap(); let header_gen = Arc::new(HeaderGenerator::new(0, 0)); let lease_collection = Arc::new(LeaseCollection::new(0)); let next_id_gen = Arc::new(WatchIdGenerator::new(1)); - let (kv_update_tx, kv_update_rx) = mpsc::channel(CHANNEL_SIZE); + let (kv_update_tx, kv_update_rx) = flume::bounded(CHANNEL_SIZE); let kv_store_inner = Arc::new(KvStoreInner::new(index, Arc::clone(&db))); let kv_store = Arc::new(KvStore::new( Arc::clone(&kv_store_inner), @@ -602,8 +606,8 @@ mod test { Duration::from_millis(10), &task_manager, ); - put(&kv_store, &db, "foo", "old_bar", 2).await; - put(&kv_store, &db, "foo", "bar", 3).await; + put(&kv_store, "foo", "old_bar"); + put(&kv_store, "foo", "bar"); let (req_tx, req_rx) = mpsc::channel(CHANNEL_SIZE); let req_stream = ReceiverStream::new(req_rx); @@ -767,13 +771,13 @@ mod test { #[tokio::test] async fn watch_compacted_revision_should_fail() { let task_manager = Arc::new(TaskManager::new()); - let (compact_tx, _compact_rx) = mpsc::channel(COMPACT_CHANNEL_SIZE); + let (compact_tx, _compact_rx) = flume::bounded(COMPACT_CHANNEL_SIZE); let index = Arc::new(Index::new()); let db = DB::open(&EngineConfig::Memory).unwrap(); let header_gen = Arc::new(HeaderGenerator::new(0, 0)); let lease_collection = Arc::new(LeaseCollection::new(0)); let next_id_gen = Arc::new(WatchIdGenerator::new(1)); - let (kv_update_tx, kv_update_rx) = mpsc::channel(CHANNEL_SIZE); + let (kv_update_tx, kv_update_rx) = flume::bounded(CHANNEL_SIZE); let kv_store_inner = Arc::new(KvStoreInner::new(index, Arc::clone(&db))); let kv_store = Arc::new(KvStore::new( Arc::clone(&kv_store_inner), @@ -788,9 +792,9 @@ mod test { Duration::from_millis(10), &task_manager, ); - put(&kv_store, &db, "foo", "old_bar", 2).await; - put(&kv_store, &db, "foo", "bar", 3).await; - put(&kv_store, &db, "foo", 
"new_bar", 4).await; + put(&kv_store, "foo", "old_bar"); + put(&kv_store, "foo", "bar"); + put(&kv_store, "foo", "new_bar"); kv_store.update_compacted_revision(3); diff --git a/crates/xline/src/server/xline_server.rs b/crates/xline/src/server/xline_server.rs index ed4978148..d4e718a73 100644 --- a/crates/xline/src/server/xline_server.rs +++ b/crates/xline/src/server/xline_server.rs @@ -13,9 +13,9 @@ use engine::{MemorySnapshotAllocator, RocksSnapshotAllocator, SnapshotAllocator} #[cfg(not(madsim))] use futures::Stream; use jsonwebtoken::{DecodingKey, EncodingKey}; +use tokio::fs; #[cfg(not(madsim))] use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::{fs, sync::mpsc::channel}; #[cfg(not(madsim))] use tonic::transport::{ server::Connected, Certificate, ClientTlsConfig, Identity, ServerTlsConfig, @@ -68,7 +68,7 @@ use crate::{ }; /// Rpc Server of curp protocol -pub(crate) type CurpServer = Rpc>>; +pub(crate) type CurpServer = Rpc>>; /// Xline server #[derive(Debug)] @@ -197,7 +197,8 @@ impl XlineServer { Arc::new(LeaseCollection::new(min_ttl_secs.numeric_cast())) } - /// Construct underlying storages, including `KvStore`, `LeaseStore`, `AuthStore` + /// Construct underlying storages, including `KvStore`, `LeaseStore`, + /// `AuthStore` #[allow(clippy::type_complexity)] // it is easy to read #[inline] async fn construct_underlying_storages( @@ -213,9 +214,9 @@ impl XlineServer { Arc, Arc, )> { - let (compact_task_tx, compact_task_rx) = channel(COMPACT_CHANNEL_SIZE); + let (compact_task_tx, compact_task_rx) = flume::bounded(COMPACT_CHANNEL_SIZE); let index = Arc::new(Index::new()); - let (kv_update_tx, kv_update_rx) = channel(CHANNEL_SIZE); + let (kv_update_tx, kv_update_rx) = flume::bounded(CHANNEL_SIZE); let kv_store_inner = Arc::new(KvStoreInner::new(Arc::clone(&index), Arc::clone(&db))); let kv_storage = Arc::new(KvStore::new( Arc::clone(&kv_store_inner), @@ -426,8 +427,8 @@ impl XlineServer { self.start_inner(xline_incoming, curp_incoming).await } - /// Init 
`KvServer`, `LockServer`, `LeaseServer`, `WatchServer` and `CurpServer` - /// for the Xline Server. + /// Init `KvServer`, `LockServer`, `LeaseServer`, `WatchServer` and + /// `CurpServer` for the Xline Server. #[allow( clippy::type_complexity, // it is easy to read clippy::too_many_lines, // TODO: split this into multiple functions @@ -474,8 +475,6 @@ impl XlineServer { Arc::clone(&alarm_storage), Arc::clone(&db), Arc::clone(&id_barrier), - header_gen.general_revision_arc(), - header_gen.auth_revision_arc(), Arc::clone(&compact_events), self.storage_config.quota, )); diff --git a/crates/xline/src/storage/alarm_store.rs b/crates/xline/src/storage/alarm_store.rs index 95a0e567b..7483f249b 100644 --- a/crates/xline/src/storage/alarm_store.rs +++ b/crates/xline/src/storage/alarm_store.rs @@ -19,7 +19,7 @@ use xlineapi::{ }; use super::db::{WriteOp, DB}; -use crate::header_gen::HeaderGenerator; +use crate::{header_gen::HeaderGenerator, revision_number::RevisionNumberGeneratorState}; /// Alarm store #[derive(Debug)] @@ -64,7 +64,7 @@ impl AlarmStore { pub(crate) fn after_sync( &self, request: &RequestWrapper, - revision: i64, + revision_gen: &RevisionNumberGeneratorState<'_>, ) -> (SyncResponse, Vec) { #[allow(clippy::wildcard_enum_match_arm)] let ops = match *request { @@ -77,7 +77,7 @@ impl AlarmStore { unreachable!("Other request should not be sent to this store"); } }; - (SyncResponse::new(revision), ops) + (SyncResponse::new(revision_gen.get()), ops) } /// Recover data form persistent storage diff --git a/crates/xline/src/storage/auth_store/store.rs b/crates/xline/src/storage/auth_store/store.rs index b0f6c0e5e..66fd776ce 100644 --- a/crates/xline/src/storage/auth_store/store.rs +++ b/crates/xline/src/storage/auth_store/store.rs @@ -29,7 +29,7 @@ use super::{ }; use crate::{ header_gen::HeaderGenerator, - revision_number::RevisionNumberGenerator, + revision_number::{RevisionNumberGenerator, RevisionNumberGeneratorState}, rpc::{ AuthDisableRequest, 
AuthDisableResponse, AuthEnableRequest, AuthEnableResponse, AuthRoleAddRequest, AuthRoleAddResponse, AuthRoleDeleteRequest, AuthRoleDeleteResponse, @@ -527,8 +527,13 @@ impl AuthStore { pub(crate) fn after_sync<'a>( &self, request: &'a RequestWrapper, - revision: i64, + revision_gen: &RevisionNumberGeneratorState, ) -> Result<(SyncResponse, Vec>), ExecuteError> { + let revision = if request.skip_auth_revision() { + revision_gen.get() + } else { + revision_gen.next() + }; #[allow(clippy::wildcard_enum_match_arm)] let ops = match *request { RequestWrapper::AuthEnableRequest(ref req) => { @@ -1151,6 +1156,11 @@ impl AuthStore { self.create_permission_cache()?; Ok(()) } + + /// Gets the auth revision generator + pub(crate) fn revision_gen(&self) -> Arc { + Arc::clone(&self.revision) + } } /// Get common name from tonic request @@ -1205,7 +1215,7 @@ mod test { range_end: "foz".into(), }), }); - assert!(exe_and_sync(&store, &req, 6).is_ok()); + assert!(exe_and_sync(&store, &req).is_ok()); assert_eq!( store.permission_cache(), PermissionCache { @@ -1234,7 +1244,7 @@ mod test { key: "foo".into(), range_end: "".into(), }); - assert!(exe_and_sync(&store, &req, 6).is_ok()); + assert!(exe_and_sync(&store, &req).is_ok()); assert_eq!( store.permission_cache(), PermissionCache { @@ -1252,7 +1262,7 @@ mod test { let req = RequestWrapper::from(AuthRoleDeleteRequest { role: "r".to_owned(), }); - assert!(exe_and_sync(&store, &req, 6).is_ok()); + assert!(exe_and_sync(&store, &req).is_ok()); assert_eq!( store.permission_cache(), PermissionCache { @@ -1270,7 +1280,7 @@ mod test { let req = RequestWrapper::from(AuthUserDeleteRequest { name: "u".to_owned(), }); - assert!(exe_and_sync(&store, &req, 6).is_ok()); + assert!(exe_and_sync(&store, &req).is_ok()); assert_eq!( store.permission_cache(), PermissionCache { @@ -1286,39 +1296,39 @@ mod test { let db = DB::open(&EngineConfig::Memory).unwrap(); let store = init_auth_store(db); let revision = store.revision(); - let rev_gen = 
Arc::clone(&store.revision); assert!(!store.is_enabled()); let enable_req = RequestWrapper::from(AuthEnableRequest {}); // AuthEnableRequest won't increase the auth revision, but AuthDisableRequest will - assert!(exe_and_sync(&store, &enable_req, store.revision()).is_err()); + assert!(exe_and_sync(&store, &enable_req).is_err()); let req_1 = RequestWrapper::from(AuthUserAddRequest { name: "root".to_owned(), password: String::new(), hashed_password: "123".to_owned(), options: None, }); - assert!(exe_and_sync(&store, &req_1, rev_gen.next()).is_ok()); + assert!(exe_and_sync(&store, &req_1).is_ok()); let req_2 = RequestWrapper::from(AuthRoleAddRequest { name: "root".to_owned(), }); - assert!(exe_and_sync(&store, &req_2, rev_gen.next()).is_ok()); + assert!(exe_and_sync(&store, &req_2).is_ok()); let req_3 = RequestWrapper::from(AuthUserGrantRoleRequest { user: "root".to_owned(), role: "root".to_owned(), }); - assert!(exe_and_sync(&store, &req_3, rev_gen.next()).is_ok()); + assert!(exe_and_sync(&store, &req_3).is_ok()); + assert_eq!(store.revision(), revision + 3); - assert!(exe_and_sync(&store, &enable_req, -1).is_ok()); + assert!(exe_and_sync(&store, &enable_req).is_ok()); assert_eq!(store.revision(), 8); assert!(store.is_enabled()); let disable_req = RequestWrapper::from(AuthDisableRequest {}); - assert!(exe_and_sync(&store, &disable_req, rev_gen.next()).is_ok()); + assert!(exe_and_sync(&store, &disable_req).is_ok()); assert_eq!(store.revision(), revision + 4); assert!(!store.is_enabled()); } @@ -1339,33 +1349,33 @@ mod test { fn init_auth_store(db: Arc) -> AuthStore { let store = init_empty_store(db); - let rev = Arc::clone(&store.revision); let req1 = RequestWrapper::from(AuthRoleAddRequest { name: "r".to_owned(), }); - assert!(exe_and_sync(&store, &req1, rev.next()).is_ok()); + assert!(exe_and_sync(&store, &req1).is_ok()); let req2 = RequestWrapper::from(AuthUserAddRequest { name: "u".to_owned(), password: String::new(), hashed_password: "123".to_owned(), options: 
None, }); - assert!(exe_and_sync(&store, &req2, rev.next()).is_ok()); + assert!(exe_and_sync(&store, &req2).is_ok()); let req3 = RequestWrapper::from(AuthUserGrantRoleRequest { user: "u".to_owned(), role: "r".to_owned(), }); - assert!(exe_and_sync(&store, &req3, rev.next()).is_ok()); + assert!(exe_and_sync(&store, &req3).is_ok()); let req4 = RequestWrapper::from(AuthRoleGrantPermissionRequest { name: "r".to_owned(), perm: Some(Permission { + #[allow(clippy::as_conversions)] // This cast is always valid perm_type: Type::Readwrite as i32, key: b"foo".to_vec(), range_end: vec![], }), }); - assert!(exe_and_sync(&store, &req4, rev.next()).is_ok()); + assert!(exe_and_sync(&store, &req4).is_ok()); assert_eq!( store.permission_cache(), PermissionCache { @@ -1392,10 +1402,12 @@ mod test { fn exe_and_sync( store: &AuthStore, req: &RequestWrapper, - revision: i64, ) -> Result<(CommandResponse, SyncResponse), ExecuteError> { let cmd_res = store.execute(req)?; - let (sync_res, ops) = store.after_sync(req, revision)?; + let rev_gen = store.revision_gen(); + let rev_gen_state = rev_gen.state(); + let (sync_res, ops) = store.after_sync(req, &rev_gen_state)?; + rev_gen_state.commit(); store.backend.flush_ops(ops)?; Ok((cmd_res, sync_res)) } diff --git a/crates/xline/src/storage/compact/mod.rs b/crates/xline/src/storage/compact/mod.rs index fcf183e4b..7768667e5 100644 --- a/crates/xline/src/storage/compact/mod.rs +++ b/crates/xline/src/storage/compact/mod.rs @@ -5,7 +5,7 @@ use curp::client::ClientApi; use event_listener::Event; use periodic_compactor::PeriodicCompactor; use revision_compactor::RevisionCompactor; -use tokio::{sync::mpsc::Receiver, time::sleep}; +use tokio::time::sleep; use utils::{ config::AutoCompactConfig, task_manager::{tasks::TaskName, Listener, TaskManager}, @@ -98,13 +98,13 @@ pub(crate) async fn compact_bg_task( index: Arc, batch_limit: usize, interval: Duration, - mut compact_task_rx: Receiver<(i64, Option>)>, + compact_task_rx: flume::Receiver<(i64, 
Option>)>, shutdown_listener: Listener, ) { loop { let (revision, listener) = tokio::select! { - recv = compact_task_rx.recv() => { - let Some((revision, listener)) = recv else { + recv = compact_task_rx.recv_async() => { + let Ok((revision, listener)) = recv else { return; }; (revision, listener) diff --git a/crates/xline/src/storage/compact/revision_compactor.rs b/crates/xline/src/storage/compact/revision_compactor.rs index 149830a39..cd6619a3b 100644 --- a/crates/xline/src/storage/compact/revision_compactor.rs +++ b/crates/xline/src/storage/compact/revision_compactor.rs @@ -129,16 +129,20 @@ mod test { let mut compactable = MockCompactable::new(); compactable.expect_compact().times(3).returning(Ok); let revision_gen = Arc::new(RevisionNumberGenerator::new(110)); + let revision_gen_state = revision_gen.state(); let revision_compactor = RevisionCompactor::new_arc(true, Arc::clone(&revision_gen), 100); revision_compactor.set_compactable(compactable).await; // auto_compactor works successfully assert_eq!(revision_compactor.do_compact(None).await, Some(10)); - revision_gen.next(); // current revision: 111 + revision_gen_state.next(); // current revision: 111 + revision_gen_state.commit(); assert_eq!(revision_compactor.do_compact(Some(10)).await, Some(11)); revision_compactor.pause(); - revision_gen.next(); // current revision 112 + revision_gen_state.next(); // current revision 112 + revision_gen_state.commit(); assert!(revision_compactor.do_compact(Some(11)).await.is_none()); - revision_gen.next(); // current revision 113 + revision_gen_state.next(); // current revision 113 + revision_gen_state.commit(); assert!(revision_compactor.do_compact(Some(11)).await.is_none()); revision_compactor.resume(); assert_eq!(revision_compactor.do_compact(Some(11)).await, Some(13)); diff --git a/crates/xline/src/storage/kv_store.rs b/crates/xline/src/storage/kv_store.rs index 0b35f7caf..19b8fb20a 100644 --- a/crates/xline/src/storage/kv_store.rs +++ 
b/crates/xline/src/storage/kv_store.rs @@ -2,7 +2,7 @@ use std::{ cmp::Ordering, - collections::{HashMap, VecDeque}, + collections::HashMap, sync::{ atomic::{AtomicI64, Ordering::Relaxed}, Arc, @@ -10,8 +10,8 @@ use std::{ }; use clippy_utilities::{NumericCast, OverflowArithmetic}; +use engine::{Transaction, TransactionApi}; use prost::Message; -use tokio::sync::mpsc; use tracing::{debug, warn}; use utils::table_names::{KV_TABLE, META_TABLE}; use xlineapi::{ @@ -23,12 +23,12 @@ use super::{ db::{DB, SCHEDULED_COMPACT_REVISION}, index::{Index, IndexOperate}, lease_store::LeaseCollection, - revision::Revision, + revision::{KeyRevision, Revision}, }; use crate::{ header_gen::HeaderGenerator, revision_check::RevisionCheck, - revision_number::RevisionNumberGenerator, + revision_number::{RevisionNumberGenerator, RevisionNumberGeneratorState}, rpc::{ CompactionRequest, CompactionResponse, Compare, CompareResult, CompareTarget, DeleteRangeRequest, DeleteRangeResponse, Event, EventType, KeyValue, PutRequest, @@ -51,9 +51,9 @@ pub(crate) struct KvStore { /// Header generator header_gen: Arc, /// KV update sender - kv_update_tx: mpsc::Sender<(i64, Vec)>, + kv_update_tx: flume::Sender<(i64, Vec)>, /// Compact task submit sender - compact_task_tx: mpsc::Sender<(i64, Option>)>, + compact_task_tx: flume::Sender<(i64, Option>)>, /// Lease collection lease_collection: Arc, } @@ -79,13 +79,16 @@ impl KvStoreInner { } } - /// Get `KeyValue` from the `KvStoreInner` - fn get_values(&self, revisions: &[Revision]) -> Result, ExecuteError> { + /// Get `KeyValue` from the `KvStore` + fn get_values(txn: &T, revisions: &[Revision]) -> Result, ExecuteError> + where + T: XlineStorageOps, + { let revisions = revisions .iter() .map(Revision::encode_to_vec) .collect::>>(); - let values = self.db.get_values(KV_TABLE, &revisions)?; + let values = txn.get_values(KV_TABLE, &revisions)?; let kvs: Vec = values .into_iter() .flatten() @@ -100,15 +103,59 @@ impl KvStoreInner { /// Get `KeyValue` of a 
range /// - /// If `range_end` is `&[]`, this function will return one or zero `KeyValue`. - fn get_range( - &self, + /// If `range_end` is `&[]`, this function will return one or zero + /// `KeyValue`. + fn get_range( + txn_db: &T, + index: &dyn IndexOperate, key: &[u8], range_end: &[u8], revision: i64, - ) -> Result, ExecuteError> { - let revisions = self.index.get(key, range_end, revision); - self.get_values(&revisions) + ) -> Result, ExecuteError> + where + T: XlineStorageOps, + { + let revisions = index.get(key, range_end, revision); + Self::get_values(txn_db, &revisions) + } + + /// Get `KeyValue` of a range with limit and count only, return kvs and + /// total count + fn get_range_with_opts( + txn_db: &T, + index: &dyn IndexOperate, + key: &[u8], + range_end: &[u8], + revision: i64, + limit: usize, + count_only: bool, + ) -> Result<(Vec, usize), ExecuteError> + where + T: XlineStorageOps, + { + let mut revisions = index.get(key, range_end, revision); + let total = revisions.len(); + if count_only || total == 0 { + return Ok((vec![], total)); + } + if limit != 0 { + revisions.truncate(limit); + } + let kvs = Self::get_values(txn_db, &revisions)?; + Ok((kvs, total)) + } + + /// Get previous `KeyValue` of a `KeyValue` + pub(crate) fn get_prev_kv(&self, kv: &KeyValue) -> Option { + Self::get_range( + self.db.as_ref(), + self.index.as_ref(), + &kv.key, + &[], + kv.mod_revision.overflow_sub(1), + ) + .ok()? + .pop() } /// Get `KeyValue` start from a revision and convert to `Event` @@ -120,8 +167,7 @@ impl KvStoreInner { let revisions = self.index .get_from_rev(key_range.range_start(), key_range.range_end(), revision); - let events: Vec = self - .get_values(&revisions)? + let events = Self::get_values(self.db.as_ref(), &revisions)? 
.into_iter() .map(|kv| { // Delete @@ -143,58 +189,44 @@ impl KvStoreInner { Ok(events) } - /// Get previous `KeyValue` of a `KeyValue` - pub(crate) fn get_prev_kv(&self, kv: &KeyValue) -> Option { - self.get_range(&kv.key, &[], kv.mod_revision.overflow_sub(1)) - .ok()? - .pop() - } - /// Get compacted revision of KV store pub(crate) fn compacted_revision(&self) -> i64 { self.compacted_rev.load(Relaxed) } - - /// Get `KeyValue` of a range with limit and count only, return kvs and total count - fn get_range_with_opts( - &self, - key: &[u8], - range_end: &[u8], - revision: i64, - limit: usize, - count_only: bool, - ) -> Result<(Vec, usize), ExecuteError> { - let mut revisions = self.index.get(key, range_end, revision); - let total = revisions.len(); - if count_only || total == 0 { - return Ok((vec![], total)); - } - if limit != 0 { - revisions.truncate(limit); - } - let kvs = self.get_values(&revisions)?; - Ok((kvs, total)) - } } impl KvStore { - /// execute a kv request + /// Executes a request pub(crate) fn execute( &self, request: &RequestWrapper, + as_ctx: Option<(&Transaction, &mut dyn IndexOperate)>, ) -> Result { - self.handle_kv_requests(request).map(CommandResponse::new) + if let Some((db, index)) = as_ctx { + self.execute_request(request, db, index) + } else { + self.execute_request( + request, + &self.inner.db.transaction(), + &mut self.inner.index.state(), + ) + } + .map(CommandResponse::new) } - /// sync a kv request - pub(crate) async fn after_sync( + /// After-Syncs a request + pub(crate) fn after_sync( &self, request: &RequestWrapper, - revision: i64, - ) -> Result<(SyncResponse, Vec), ExecuteError> { - self.sync_request(request, revision) - .await - .map(|(rev, ops)| (SyncResponse::new(rev), ops)) + txn_db: &T, + index: &(dyn IndexOperate + Send + Sync), + revision_gen: &RevisionNumberGeneratorState<'_>, + to_execute: bool, + ) -> Result<(SyncResponse, Option), ExecuteError> + where + T: XlineStorageOps + TransactionApi, + { + 
self.sync_request(request, txn_db, index, revision_gen, to_execute) } /// Recover data from persistent storage @@ -241,11 +273,7 @@ impl KvStore { if scheduled_rev > self.compacted_revision() { let event = Arc::new(event_listener::Event::new()); let listener = event.listen(); - if let Err(e) = self - .compact_task_tx - .send((scheduled_rev, Some(event))) - .await - { + if let Err(e) = self.compact_task_tx.send((scheduled_rev, Some(event))) { panic!("the compactor exited unexpectedly: {e:?}"); } listener.await; @@ -273,8 +301,8 @@ impl KvStore { pub(crate) fn new( inner: Arc, header_gen: Arc, - kv_update_tx: mpsc::Sender<(i64, Vec)>, - compact_task_tx: mpsc::Sender<(i64, Option>)>, + kv_update_tx: flume::Sender<(i64, Vec)>, + compact_task_tx: flume::Sender<(i64, Option>)>, lease_collection: Arc, ) -> Self { Self { @@ -303,9 +331,9 @@ impl KvStore { } /// Notify KV changes to KV watcher - async fn notify_updates(&self, revision: i64, updates: Vec) { + fn notify_updates(&self, revision: i64, updates: Vec) { assert!( - self.kv_update_tx.send((revision, updates)).await.is_ok(), + self.kv_update_tx.send((revision, updates)).is_ok(), "Failed to send updates to KV watcher" ); } @@ -442,11 +470,12 @@ impl KvStore { } /// Check result of a `Compare` - fn check_compare(&self, cmp: &Compare) -> bool { - let kvs = self - .inner - .get_range(&cmp.key, &cmp.range_end, 0) - .unwrap_or_default(); + fn check_compare(txn_db: &T, index: &dyn IndexOperate, cmp: &Compare) -> bool + where + T: XlineStorageOps, + { + let kvs = + KvStoreInner::get_range(txn_db, index, &cmp.key, &cmp.range_end, 0).unwrap_or_default(); if kvs.is_empty() { if let Some(TargetUnion::Value(_)) = cmp.target_union { false @@ -525,32 +554,63 @@ impl KvStore { } } -/// handle and sync kv requests +#[cfg(test)] +/// Test utils +impl KvStore { + pub(crate) fn db(&self) -> &DB { + self.inner.db.as_ref() + } +} + +// Speculatively execute requests impl KvStore { - /// Handle kv requests - fn handle_kv_requests( + /// 
execute requests + fn execute_request( &self, wrapper: &RequestWrapper, + txn_db: &Transaction, + index: &mut dyn IndexOperate, ) -> Result { debug!("Execute {:?}", wrapper); + #[allow(clippy::wildcard_enum_match_arm)] - let res = match *wrapper { - RequestWrapper::RangeRequest(ref req) => self.handle_range_request(req).map(Into::into), - RequestWrapper::PutRequest(ref req) => self.handle_put_request(req).map(Into::into), - RequestWrapper::DeleteRangeRequest(ref req) => { - self.handle_delete_range_request(req).map(Into::into) + let res: ResponseWrapper = match *wrapper { + RequestWrapper::RangeRequest(ref req) => { + self.execute_range(txn_db, index, req).map(Into::into)? + } + RequestWrapper::PutRequest(ref req) => { + self.execute_put(txn_db, index, req).map(Into::into)? + } + RequestWrapper::DeleteRangeRequest(ref req) => self + .execute_delete_range(txn_db, index, req) + .map(Into::into)?, + RequestWrapper::TxnRequest(ref req) => { + // As we use the revision as the key in the DB storage, + // a fake revision needs to be used during speculative execution + let fake_revision = i64::MAX; + self.execute_txn(txn_db, index, req, fake_revision, &mut 0) + .map(Into::into)? + } - RequestWrapper::TxnRequest(ref req) => self.handle_txn_request(req).map(Into::into), RequestWrapper::CompactionRequest(ref req) => { - self.handle_compaction_request(req).map(Into::into) + debug!("Receive CompactionRequest {:?}", req); + self.execute_compaction(req).map(Into::into)? 
} _ => unreachable!("Other request should not be sent to this store"), }; - res + + Ok(res) } - /// Handle `RangeRequest` - fn handle_range_request(&self, req: &RangeRequest) -> Result { + /// Execute `RangeRequest` + fn execute_range( + &self, + tnx_db: &T, + index: &dyn IndexOperate, + req: &RangeRequest, + ) -> Result + where + T: XlineStorageOps, + { req.check_revision(self.compacted_revision(), self.revision())?; let storage_fetch_limit = if (req.sort_order() != SortOrder::None) @@ -564,7 +624,9 @@ impl KvStore { } else { req.limit.overflow_add(1) // get one extra for "more" flag }; - let (mut kvs, total) = self.inner.get_range_with_opts( + let (mut kvs, total) = KvStoreInner::get_range_with_opts( + tnx_db, + index, &req.key, &req.range_end, req.revision, @@ -597,11 +659,20 @@ impl KvStore { kvs.iter_mut().for_each(|kv| kv.value.clear()); } response.kvs = kvs; + Ok(response) } - /// Handle `PutRequest` - fn handle_put_request(&self, req: &PutRequest) -> Result { + /// Generates `PutResponse` + fn generate_put_resp( + &self, + req: &PutRequest, + txn_db: &T, + prev_rev: Option, + ) -> Result<(PutResponse, Option), ExecuteError> + where + T: XlineStorageOps, + { let mut response = PutResponse { header: Some(self.header_gen.gen_header()), ..Default::default() @@ -609,24 +680,91 @@ impl KvStore { if req.lease != 0 && self.lease_collection.look_up(req.lease).is_none() { return Err(ExecuteError::LeaseNotFound(req.lease)); }; + if req.prev_kv || req.ignore_lease || req.ignore_value { - let prev_kv = self.inner.get_range(&req.key, &[], 0)?.pop(); + let prev_kv = + KvStoreInner::get_values(txn_db, &prev_rev.into_iter().collect::>())?.pop(); if prev_kv.is_none() && (req.ignore_lease || req.ignore_value) { return Err(ExecuteError::KeyNotFound); } if req.prev_kv { - response.prev_kv = prev_kv; + response.prev_kv = prev_kv.clone(); } + return Ok((response, prev_kv)); + } + + Ok((response, None)) + } + + /// Execute `PutRequest` + fn execute_put( + &self, + txn_db: 
&Transaction, + index: &dyn IndexOperate, + req: &PutRequest, + ) -> Result { + let prev_rev = (req.prev_kv || req.ignore_lease || req.ignore_value) + .then(|| index.current_rev(&req.key)) + .flatten(); + let (response, _prev_kv) = + self.generate_put_resp(req, txn_db, prev_rev.map(|key_rev| key_rev.as_revision()))?; + Ok(response) + } + + /// Execute `PutRequest` in Txn + fn execute_txn_put( + &self, + txn_db: &Transaction, + index: &dyn IndexOperate, + req: &PutRequest, + revision: i64, + sub_revision: &mut i64, + ) -> Result { + let (new_rev, prev_rev) = index.register_revision(req.key.clone(), revision, *sub_revision); + let (response, prev_kv) = + self.generate_put_resp(req, txn_db, prev_rev.map(|key_rev| key_rev.as_revision()))?; + let mut kv = KeyValue { + key: req.key.clone(), + value: req.value.clone(), + create_revision: new_rev.create_revision, + mod_revision: new_rev.mod_revision, + version: new_rev.version, + lease: req.lease, }; + if req.ignore_lease { + kv.lease = prev_kv + .as_ref() + .unwrap_or_else(|| { + unreachable!("Should returns an error when prev kv does not exist") + }) + .lease; + } + if req.ignore_value { + kv.value = prev_kv + .as_ref() + .unwrap_or_else(|| { + unreachable!("Should returns an error when prev kv does not exist") + }) + .value + .clone(); + } + txn_db.write_op(WriteOp::PutKeyValue(new_rev.as_revision(), kv.clone()))?; + *sub_revision = sub_revision.overflow_add(1); + Ok(response) } - /// Handle `DeleteRangeRequest` - fn handle_delete_range_request( + /// Generates `DeleteRangeResponse` + fn generate_delete_range_resp( &self, req: &DeleteRangeRequest, - ) -> Result { - let prev_kvs = self.inner.get_range(&req.key, &req.range_end, 0)?; + txn_db: &T, + index: &dyn IndexOperate, + ) -> Result + where + T: XlineStorageOps, + { + let prev_kvs = KvStoreInner::get_range(txn_db, index, &req.key, &req.range_end, 0)?; let mut response = DeleteRangeResponse { header: Some(self.header_gen.gen_header()), ..DeleteRangeResponse::default() 
@@ -638,33 +776,91 @@ impl KvStore { Ok(response) } - /// Handle `TxnRequest` - fn handle_txn_request(&self, req: &TxnRequest) -> Result { - req.check_revision(self.compacted_revision(), self.revision())?; + /// Execute `DeleteRangeRequest` + fn execute_delete_range( + &self, + txn_db: &T, + index: &dyn IndexOperate, + req: &DeleteRangeRequest, + ) -> Result + where + T: XlineStorageOps, + { + self.generate_delete_range_resp(req, txn_db, index) + } + + /// Execute `DeleteRangeRequest` in Txn + fn execute_txn_delete_range( + &self, + txn_db: &T, + index: &dyn IndexOperate, + req: &DeleteRangeRequest, + revision: i64, + sub_revision: &mut i64, + ) -> Result + where + T: XlineStorageOps, + { + let response = self.generate_delete_range_resp(req, txn_db, index)?; + let _keys = Self::delete_keys( + txn_db, + index, + &req.key, + &req.range_end, + revision, + sub_revision, + )?; - let success = req + Ok(response) + } + + /// Execute `TxnRequest` + fn execute_txn( + &self, + txn_db: &Transaction, + index: &mut dyn IndexOperate, + request: &TxnRequest, + revision: i64, + sub_revision: &mut i64, + ) -> Result { + let success = request .compare .iter() - .all(|compare| self.check_compare(compare)); + .all(|compare| Self::check_compare(txn_db, index, compare)); + tracing::warn!("txn success in execute: {success}"); let requests = if success { - req.success.iter() + request.success.iter() } else { - req.failure.iter() + request.failure.iter() }; - let mut responses = Vec::with_capacity(requests.len()); - for request_op in requests { - let response = self.handle_kv_requests(&request_op.clone().into())?; - responses.push(response.into()); - } + + let responses = requests + .filter_map(|op| op.request.as_ref()) + .map(|req| match *req { + Request::RequestRange(ref r) => { + self.execute_range(txn_db, index, r).map(Into::into) + } + Request::RequestTxn(ref r) => self + .execute_txn(txn_db, index, r, revision, sub_revision) + .map(Into::into), + Request::RequestPut(ref r) => self + 
.execute_txn_put(txn_db, index, r, revision, sub_revision) + .map(Into::into), + Request::RequestDeleteRange(ref r) => self + .execute_txn_delete_range(txn_db, index, r, revision, sub_revision) + .map(Into::into), + }) + .collect::, _>>()?; + Ok(TxnResponse { header: Some(self.header_gen.gen_header()), succeeded: success, - responses, + responses: responses.into_iter().map(Into::into).collect(), }) } - /// Handle `CompactionRequest` - fn handle_compaction_request( + /// Execute `CompactionRequest` + fn execute_compaction( &self, req: &CompactionRequest, ) -> Result { @@ -680,110 +876,102 @@ impl KvStore { header: Some(self.header_gen.gen_header()), }) } +} - /// Sync requests in kv store - async fn sync_request( +/// Sync requests +impl KvStore { + /// Sync kv requests + fn sync_request( &self, wrapper: &RequestWrapper, - revision: i64, - ) -> Result<(i64, Vec), ExecuteError> { - debug!("After Sync {:?} with revision {}", wrapper, revision); - #[allow(clippy::wildcard_enum_match_arm)] // only kv requests can be sent to kv store - let (ops, events) = match *wrapper { - RequestWrapper::RangeRequest(_) => (Vec::new(), Vec::new()), - RequestWrapper::PutRequest(ref req) => self.sync_put_request(req, revision, 0)?, - RequestWrapper::DeleteRangeRequest(ref req) => { - self.sync_delete_range_request(req, revision, 0) + txn_db: &T, + index: &(dyn IndexOperate + Send + Sync), + revision_gen: &RevisionNumberGeneratorState<'_>, + to_execute: bool, + ) -> Result<(SyncResponse, Option), ExecuteError> + where + T: XlineStorageOps + TransactionApi, + { + debug!("Execute {:?}", wrapper); + warn!("after sync: {wrapper:?}"); + + let next_revision = revision_gen.get().overflow_add(1); + + #[allow(clippy::wildcard_enum_match_arm)] + let (events, execute_response): (_, Option) = match *wrapper { + RequestWrapper::RangeRequest(ref req) => { + self.sync_range(txn_db, index, req, to_execute) } - RequestWrapper::TxnRequest(ref req) => self.sync_txn_request(req, revision)?, - 
RequestWrapper::CompactionRequest(ref req) => { - self.sync_compaction_request(req, revision).await? + RequestWrapper::PutRequest(ref req) => { + self.sync_put(txn_db, index, req, next_revision, &mut 0, to_execute) } - _ => { - unreachable!("only kv requests can be sent to kv store"); + RequestWrapper::DeleteRangeRequest(ref req) => { + self.sync_delete_range(txn_db, index, req, next_revision, &mut 0, to_execute) } - }; - self.notify_updates(revision, events).await; - Ok((revision, ops)) - } + RequestWrapper::TxnRequest(ref req) => { + self.sync_txn(txn_db, index, req, next_revision, &mut 0, to_execute) + } + RequestWrapper::CompactionRequest(ref req) => self.sync_compaction(req, to_execute), + _ => unreachable!("Other request should not be sent to this store"), + }?; - /// Sync `CompactionRequest` and return if kvstore is changed - async fn sync_compaction_request( - &self, - req: &CompactionRequest, - _revision: i64, - ) -> Result<(Vec, Vec), ExecuteError> { - let revision = req.revision; - let ops = vec![WriteOp::PutScheduledCompactRevision(revision)]; - // TODO: Remove the physical process logic here. 
It's better to move into the KvServer - let (event, listener) = if req.physical { - let event = Arc::new(event_listener::Event::new()); - let listener = event.listen(); - (Some(event), Some(listener)) + let sync_response = if events.is_empty() { + SyncResponse::new(revision_gen.get()) } else { - (None, None) + self.notify_updates(next_revision, events); + SyncResponse::new(revision_gen.next()) }; - if let Err(e) = self.compact_task_tx.send((revision, event)).await { - panic!("the compactor exited unexpectedly: {e:?}"); - } - if let Some(listener) = listener { - listener.await; - } - Ok((ops, Vec::new())) + + tracing::warn!("sync response: {sync_response:?}"); + + Ok((sync_response, execute_response.map(CommandResponse::new))) } - /// Sync `TxnRequest` and return if kvstore is changed - fn sync_txn_request( + /// Sync `RangeRequest` + fn sync_range( &self, - req: &TxnRequest, - revision: i64, - ) -> Result<(Vec, Vec), ExecuteError> { - let mut sub_revision = 0; - let mut origin_reqs = VecDeque::from([Request::RequestTxn(req.clone())]); - let mut all_events = Vec::new(); - let mut all_ops = Vec::new(); - while let Some(request) = origin_reqs.pop_front() { - let (mut ops, mut events) = match request { - Request::RequestRange(_) => (Vec::new(), Vec::new()), - Request::RequestPut(ref put_req) => { - self.sync_put_request(put_req, revision, sub_revision)? 
- } - Request::RequestDeleteRange(del_req) => { - self.sync_delete_range_request(&del_req, revision, sub_revision) - } - Request::RequestTxn(txn_req) => { - let success = txn_req - .compare - .iter() - .all(|compare| self.check_compare(compare)); - let reqs_iter = if success { - txn_req.success.into_iter() - } else { - txn_req.failure.into_iter() - }; - origin_reqs.extend(reqs_iter.filter_map(|req_op| req_op.request)); - continue; - } - }; - sub_revision = sub_revision.overflow_add(events.len().numeric_cast()); - all_events.append(&mut events); - all_ops.append(&mut ops); - } - Ok((all_ops, all_events)) + txn_db: &T, + index: &dyn IndexOperate, + req: &RangeRequest, + to_execute: bool, + ) -> Result<(Vec, Option), ExecuteError> + where + T: XlineStorageOps, + { + Ok(( + vec![], + to_execute + .then(|| self.execute_range(txn_db, index, req).map(Into::into)) + .transpose()?, + )) } - /// Sync `PutRequest` and return if kvstore is changed - fn sync_put_request( + /// Sync `PutRequest` + fn sync_put( &self, + txn_db: &T, + index: &dyn IndexOperate, req: &PutRequest, revision: i64, - sub_revision: i64, - ) -> Result<(Vec, Vec), ExecuteError> { - let mut ops = Vec::new(); - let (new_rev, prev_rev) = - self.inner - .index - .register_revision(req.key.clone(), revision, sub_revision); + sub_revision: &mut i64, + to_execute: bool, + ) -> Result<(Vec, Option), ExecuteError> + where + T: XlineStorageOps, + { + let (new_rev, prev_rev_opt) = + index.register_revision(req.key.clone(), revision, *sub_revision); + let execute_resp = to_execute + .then(|| { + self.generate_put_resp( + req, + txn_db, + prev_rev_opt.map(|key_rev| key_rev.as_revision()), + ) + .map(|(resp, _)| resp.into()) + }) + .transpose()?; + let mut kv = KeyValue { key: req.key.clone(), value: req.value.clone(), @@ -792,9 +980,12 @@ impl KvStore { version: new_rev.version, lease: req.lease, }; + if req.ignore_lease || req.ignore_value { - let pre_mod_rev = prev_rev.ok_or(ExecuteError::KeyNotFound)?.mod_revision; 
- let prev_kv = self.inner.get_range(&req.key, &[], pre_mod_rev)?.pop(); + let prev_rev = prev_rev_opt + .map(|key_rev| key_rev.as_revision()) + .ok_or(ExecuteError::KeyNotFound)?; + let prev_kv = KvStoreInner::get_values(txn_db, &[prev_rev])?.pop(); let prev = prev_kv.as_ref().ok_or(ExecuteError::KeyNotFound)?; if req.ignore_lease { kv.lease = prev.lease; @@ -802,7 +993,7 @@ impl KvStore { if req.ignore_value { kv.value = prev.value.clone(); } - } + }; let old_lease = self.get_lease(&kv.key); if old_lease != 0 { @@ -811,20 +1002,155 @@ impl KvStore { } if req.lease != 0 { self.attach(req.lease, kv.key.as_slice()) - .unwrap_or_else(|e| panic!("unexpected error from lease Attach: {e}")); + .unwrap_or_else(|e| warn!("unexpected error from lease Attach: {e}")); } - ops.push(WriteOp::PutKeyValue(new_rev.as_revision(), kv.clone())); - let event = Event { + + txn_db.write_op(WriteOp::PutKeyValue(new_rev.as_revision(), kv.clone()))?; + *sub_revision = sub_revision.overflow_add(1); + + let events = vec![Event { #[allow(clippy::as_conversions)] // This cast is always valid r#type: EventType::Put as i32, kv: Some(kv), prev_kv: None, + }]; + + Ok((events, execute_resp)) + } + + /// Sync `DeleteRangeRequest` + fn sync_delete_range( + &self, + txn_db: &T, + index: &dyn IndexOperate, + req: &DeleteRangeRequest, + revision: i64, + sub_revision: &mut i64, + to_execute: bool, + ) -> Result<(Vec, Option), ExecuteError> + where + T: XlineStorageOps, + { + let execute_resp = to_execute + .then(|| self.generate_delete_range_resp(req, txn_db, index)) + .transpose()? 
+ .map(Into::into); + + let keys = Self::delete_keys( + txn_db, + index, + &req.key, + &req.range_end, + revision, + sub_revision, + )?; + + Self::detach_leases(&keys, &self.lease_collection); + + Ok((Self::new_deletion_events(revision, keys), execute_resp)) + } + + /// Sync `TxnRequest` + fn sync_txn( + &self, + txn_db: &T, + index: &dyn IndexOperate, + request: &TxnRequest, + revision: i64, + sub_revision: &mut i64, + to_execute: bool, + ) -> Result<(Vec, Option), ExecuteError> + where + T: XlineStorageOps, + { + request.check_revision(self.compacted_revision(), self.revision())?; + let success = request + .compare + .iter() + .all(|compare| Self::check_compare(txn_db, index, compare)); + tracing::warn!("txn success: {success}"); + let requests = if success { + request.success.iter() + } else { + request.failure.iter() }; - Ok((ops, vec![event])) + + let (events, resps): (Vec<_>, Vec<_>) = requests + .filter_map(|op| op.request.as_ref()) + .map(|req| match *req { + Request::RequestRange(ref r) => self.sync_range(txn_db, index, r, to_execute), + Request::RequestTxn(ref r) => { + self.sync_txn(txn_db, index, r, revision, sub_revision, to_execute) + } + Request::RequestPut(ref r) => { + self.sync_put(txn_db, index, r, revision, sub_revision, to_execute) + } + Request::RequestDeleteRange(ref r) => { + self.sync_delete_range(txn_db, index, r, revision, sub_revision, to_execute) + } + }) + .collect::, _>>()? 
+ .into_iter() + .unzip(); + + let resp = to_execute.then(|| { + TxnResponse { + header: Some(self.header_gen.gen_header()), + succeeded: success, + responses: resps + .into_iter() + .flat_map(Option::into_iter) + .map(Into::into) + .collect(), + } + .into() + }); + + Ok((events.into_iter().flatten().collect(), resp)) + } + + /// Sync `CompactionRequest` and return if kvstore is changed + fn sync_compaction( + &self, + req: &CompactionRequest, + to_execute: bool, + ) -> Result<(Vec, Option), ExecuteError> { + let revision = req.revision; + let ops = vec![WriteOp::PutScheduledCompactRevision(revision)]; + // TODO: Remove the physical process logic here. It's better to move into the + // KvServer + // FIXME: madsim is single threaded, we cannot use synchronous wait here + let index = self.index(); + let target_revisions = index + .compact(revision) + .into_iter() + .map(|key_rev| key_rev.as_revision().encode_to_vec()) + .collect::>>(); + // Given that the Xline uses a lim-tree database with smaller write amplification as the storage backend , does using progressive compaction really good at improving performance? 
+ for revision_chunk in target_revisions.chunks(1000) { + if let Err(e) = self.compact(revision_chunk) { + panic!("failed to compact revision chunk {revision_chunk:?} due to {e}"); + } + } + if let Err(e) = self.compact_finished(revision) { + panic!("failed to set finished compact revision {revision:?} due to {e}"); + } + + self.inner.db.write_ops(ops)?; + + let resp = to_execute + .then(|| CompactionResponse { + header: Some(self.header_gen.gen_header()), + }) + .map(Into::into); + + Ok((vec![], resp)) } +} +impl KvStore { /// create events for a deletion - fn new_deletion_events(revision: i64, keys: Vec>) -> Vec { + pub(crate) fn new_deletion_events(revision: i64, keys: Vec>) -> Vec { keys.into_iter() .map(|key| { let kv = KeyValue { @@ -846,7 +1172,7 @@ impl KvStore { fn mark_deletions<'a>( revisions: &[(Revision, Revision)], keys: &[Vec], - ) -> Vec> { + ) -> (Vec>, Vec<(Vec, KeyRevision)>) { assert_eq!(keys.len(), revisions.len(), "Index doesn't match with DB"); keys.iter() .zip(revisions.iter()) @@ -856,49 +1182,67 @@ impl KvStore { mod_revision: new_rev.revision(), ..KeyValue::default() }; - WriteOp::PutKeyValue(new_rev, del_kv) - }) - .collect() - } - /// Sync `DeleteRangeRequest` and return if kvstore is changed - fn sync_delete_range_request( - &self, - req: &DeleteRangeRequest, - revision: i64, - sub_revision: i64, - ) -> (Vec, Vec) { - Self::delete_keys( - &self.inner.index, - &self.lease_collection, - &req.key, - &req.range_end, - revision, - sub_revision, - ) + let key_revision = ( + del_kv.key.clone(), + KeyRevision::new( + del_kv.create_revision, + del_kv.version, + new_rev.revision(), + new_rev.sub_revision(), + ), + ); + (WriteOp::PutKeyValue(new_rev, del_kv), key_revision) + }) + .unzip() } - /// Delete keys from index and detach them in lease collection, return all the write operations and events - pub(crate) fn delete_keys<'a>( - index: &Index, - lease_collection: &LeaseCollection, + /// Delete keys from index and detach them in lease 
collection, return all + /// the write operations and events + pub(crate) fn delete_keys( + txn_db: &T, + index: &dyn IndexOperate, key: &[u8], range_end: &[u8], revision: i64, - sub_revision: i64, - ) -> (Vec>, Vec) { - let mut ops = Vec::new(); - let (revisions, keys) = index.delete(key, range_end, revision, sub_revision); - let mut del_ops = Self::mark_deletions(&revisions, &keys); - ops.append(&mut del_ops); - for k in &keys { + sub_revision: &mut i64, + ) -> Result>, ExecuteError> + where + T: XlineStorageOps, + { + let (revisions, keys) = index.delete(key, range_end, revision, *sub_revision); + let (del_ops, key_revisions) = Self::mark_deletions(&revisions, &keys); + + index.insert(key_revisions); + + *sub_revision = sub_revision.overflow_add(del_ops.len().numeric_cast()); + for op in del_ops { + txn_db.write_op(op)?; + } + + Ok(keys) + } + + /// Detaches the leases + pub(crate) fn detach_leases(keys: &[Vec], lease_collection: &LeaseCollection) { + for k in keys { let lease_id = lease_collection.get_lease(k); lease_collection .detach(lease_id, k) .unwrap_or_else(|e| warn!("Failed to detach lease from a key, error: {:?}", e)); } - let events = Self::new_deletion_events(revision, keys); - (ops, events) + } +} + +impl KvStore { + /// Gets the index + pub(crate) fn index(&self) -> Arc { + Arc::clone(&self.inner.index) + } + + /// Gets the general revision generator + pub(crate) fn revision_gen(&self) -> Arc { + Arc::clone(&self.revision) } } @@ -963,9 +1307,7 @@ mod test { } } - async fn init_store( - db: Arc, - ) -> Result<(StoreWrapper, RevisionNumberGenerator), ExecuteError> { + fn init_store(db: Arc) -> Result<(StoreWrapper, RevisionNumberGenerator), ExecuteError> { let store = init_empty_store(db); let keys = vec!["a", "b", "c", "d", "e", "z", "z", "z"]; let vals = vec!["a", "b", "c", "d", "e", "z1", "z2", "z3"]; @@ -976,15 +1318,15 @@ mod test { value: val.into(), ..Default::default() }); - exe_as_and_flush(&store, &req, revision.next()).await?; + 
exe_as_and_flush(&store, &req)?; } Ok((store, revision)) } fn init_empty_store(db: Arc) -> StoreWrapper { let task_manager = Arc::new(TaskManager::new()); - let (compact_tx, compact_rx) = mpsc::channel(COMPACT_CHANNEL_SIZE); - let (kv_update_tx, kv_update_rx) = mpsc::channel(CHANNEL_SIZE); + let (compact_tx, compact_rx) = flume::bounded(COMPACT_CHANNEL_SIZE); + let (kv_update_tx, kv_update_rx) = flume::bounded(CHANNEL_SIZE); let lease_collection = Arc::new(LeaseCollection::new(0)); let header_gen = Arc::new(HeaderGenerator::new(0, 0)); let index = Arc::new(Index::new()); @@ -1015,13 +1357,18 @@ mod test { StoreWrapper(Some(storage), task_manager) } - async fn exe_as_and_flush( + fn exe_as_and_flush( store: &Arc, request: &RequestWrapper, - revision: i64, ) -> Result<(), ExecuteError> { - let (_sync_res, ops) = store.after_sync(request, revision).await?; - store.inner.db.write_ops(ops)?; + let txn_db = store.db().transaction(); + let index = store.index(); + let index_state = index.state(); + let rev_gen_state = store.revision.state(); + let _res = store.after_sync(request, &txn_db, &index_state, &rev_gen_state, false)?; + txn_db.commit().unwrap(); + index_state.commit(); + rev_gen_state.commit(); Ok(()) } @@ -1039,14 +1386,16 @@ mod test { #[abort_on_panic] async fn test_keys_only() -> Result<(), ExecuteError> { let db = DB::open(&EngineConfig::Memory)?; - let (store, _rev) = init_store(db).await?; + let (store, _rev) = init_store(db)?; let request = RangeRequest { key: vec![0], range_end: vec![0], keys_only: true, ..Default::default() }; - let response = store.handle_range_request(&request)?; + let txn_db = store.inner.db.transaction(); + let index = store.inner.index.state(); + let response = store.execute_range(&txn_db, &index, &request)?; assert_eq!(response.kvs.len(), 6); for kv in response.kvs { assert!(kv.value.is_empty()); @@ -1058,7 +1407,7 @@ mod test { #[abort_on_panic] async fn test_range_empty() -> Result<(), ExecuteError> { let db = 
DB::open(&EngineConfig::Memory)?; - let (store, _rev) = init_store(db).await?; + let (store, _rev) = init_store(db)?; let request = RangeRequest { key: "x".into(), @@ -1066,7 +1415,9 @@ mod test { keys_only: true, ..Default::default() }; - let response = store.handle_range_request(&request)?; + let txn_db = store.inner.db.transaction(); + let index = store.inner.index.state(); + let response = store.execute_range(&txn_db, &index, &request)?; assert_eq!(response.kvs.len(), 0); assert_eq!(response.count, 0); Ok(()) @@ -1076,7 +1427,7 @@ mod test { #[abort_on_panic] async fn test_range_filter() -> Result<(), ExecuteError> { let db = DB::open(&EngineConfig::Memory)?; - let (store, _rev) = init_store(db).await?; + let (store, _rev) = init_store(db)?; let request = RangeRequest { key: vec![0], @@ -1087,7 +1438,9 @@ mod test { min_mod_revision: 2, ..Default::default() }; - let response = store.handle_range_request(&request)?; + let txn_db = store.inner.db.transaction(); + let index = store.inner.index.state(); + let response = store.execute_range(&txn_db, &index, &request)?; assert_eq!(response.count, 6); assert_eq!(response.kvs.len(), 2); assert_eq!(response.kvs[0].create_revision, 2); @@ -1099,7 +1452,7 @@ mod test { #[abort_on_panic] async fn test_range_sort() -> Result<(), ExecuteError> { let db = DB::open(&EngineConfig::Memory)?; - let (store, _rev) = init_store(db).await?; + let (store, _rev) = init_store(db)?; let keys = ["a", "b", "c", "d", "e", "z"]; let reversed_keys = ["z", "e", "d", "c", "b", "a"]; let version_keys = ["z", "a", "b", "c", "d", "e"]; @@ -1111,7 +1464,9 @@ mod test { SortTarget::Mod, SortTarget::Value, ] { - let response = store.handle_range_request(&sort_req(order, target))?; + let txn_db = store.inner.db.transaction(); + let index = store.inner.index.state(); + let response = store.execute_range(&txn_db, &index, &sort_req(order, target))?; assert_eq!(response.count, 6); assert_eq!(response.kvs.len(), 6); let expected: [&str; 6] = match order { 
@@ -1132,7 +1487,10 @@ mod test { } } for order in [SortOrder::Ascend, SortOrder::Descend, SortOrder::None] { - let response = store.handle_range_request(&sort_req(order, SortTarget::Version))?; + let txn_db = store.inner.db.transaction(); + let index = store.inner.index.state(); + let response = + store.execute_range(&txn_db, &index, &sort_req(order, SortTarget::Version))?; assert_eq!(response.count, 6); assert_eq!(response.kvs.len(), 6); let expected = match order { @@ -1159,7 +1517,7 @@ mod test { let db = DB::open(&EngineConfig::Memory)?; let ops = vec![WriteOp::PutScheduledCompactRevision(8)]; db.write_ops(ops)?; - let (store, _rev_gen) = init_store(Arc::clone(&db)).await?; + let (store, _rev_gen) = init_store(Arc::clone(&db))?; assert_eq!(store.inner.index.get_from_rev(b"z", b"", 5).len(), 3); let new_store = init_empty_store(db); @@ -1169,13 +1527,18 @@ mod test { range_end: vec![], ..Default::default() }; - let res = new_store.handle_range_request(&range_req)?; + + let txn_db = new_store.inner.db.transaction(); + let index = new_store.inner.index.state(); + let res = new_store.execute_range(&txn_db, &index, &range_req)?; assert_eq!(res.kvs.len(), 0); assert_eq!(new_store.compacted_revision(), -1); new_store.recover().await?; - let res = new_store.handle_range_request(&range_req)?; + let txn_db_recovered = new_store.inner.db.transaction(); + let index_recovered = new_store.inner.index.state(); + let res = store.execute_range(&txn_db_recovered, &index_recovered, &range_req)?; assert_eq!(res.kvs.len(), 1); assert_eq!(res.kvs[0].key, b"a"); assert_eq!(new_store.compacted_revision(), 8); @@ -1224,14 +1587,17 @@ mod test { }], }); let db = DB::open(&EngineConfig::Memory)?; - let (store, rev) = init_store(db).await?; - exe_as_and_flush(&store, &txn_req, rev.next()).await?; + let (store, _rev) = init_store(db)?; + exe_as_and_flush(&store, &txn_req)?; let request = RangeRequest { key: "success".into(), range_end: vec![], ..Default::default() }; - let response = 
store.handle_range_request(&request)?; + + let txn_db = store.inner.db.transaction(); + let index = store.inner.index.state(); + let response = store.execute_range(&txn_db, &index, &request)?; assert_eq!(response.count, 1); assert_eq!(response.kvs.len(), 1); assert_eq!(response.kvs[0].value, "1".as_bytes()); @@ -1243,7 +1609,7 @@ mod test { #[abort_on_panic] async fn test_kv_store_index_available() { let db = DB::open(&EngineConfig::Memory).unwrap(); - let (store, revision) = init_store(Arc::clone(&db)).await.unwrap(); + let (store, _revision) = init_store(Arc::clone(&db)).unwrap(); let handle = tokio::spawn({ let store = Arc::clone(&store); async move { @@ -1253,15 +1619,13 @@ mod test { value: vec![i], ..Default::default() }); - exe_as_and_flush(&store, &req, revision.next()) - .await - .unwrap(); + exe_as_and_flush(&store, &req).unwrap(); } } }); tokio::time::sleep(std::time::Duration::from_micros(50)).await; let revs = store.inner.index.get_from_rev(b"foo", b"", 1); - let kvs = store.inner.get_values(&revs).unwrap(); + let kvs = KvStoreInner::get_values(&db.transaction(), &revs).unwrap(); assert_eq!( kvs.len(), revs.len(), @@ -1271,10 +1635,10 @@ mod test { } #[tokio::test(flavor = "multi_thread")] + #[allow(clippy::too_many_lines)] // TODO: splits this test async fn test_compaction() -> Result<(), ExecuteError> { let db = DB::open(&EngineConfig::Memory)?; let store = init_empty_store(db); - let revision = RevisionNumberGenerator::default(); // sample requests: (a, 1) (b, 2) (a, 3) (del a) // their revisions: 2 3 4 5 let requests = vec![ @@ -1300,20 +1664,25 @@ mod test { ]; for req in requests { - exe_as_and_flush(&store, &req, revision.next()) - .await - .unwrap(); + exe_as_and_flush(&store, &req).unwrap(); } let target_revisions = index_compact(&store, 3); store.compact(target_revisions.as_ref())?; + + let txn_db = store.inner.db.transaction(); + let index = store.inner.index.state(); assert_eq!( - store.inner.get_range(b"a", b"", 2).unwrap().len(), + 
KvStoreInner::get_range(&txn_db, &index, b"a", b"", 2) + .unwrap() + .len(), 1, "(a, 1) should not be removed" ); assert_eq!( - store.inner.get_range(b"b", b"", 3).unwrap().len(), + KvStoreInner::get_range(&txn_db, &index, b"b", b"", 3) + .unwrap() + .len(), 1, "(b, 2) should not be removed" ); @@ -1321,16 +1690,22 @@ mod test { let target_revisions = index_compact(&store, 4); store.compact(target_revisions.as_ref())?; assert!( - store.inner.get_range(b"a", b"", 2).unwrap().is_empty(), + KvStoreInner::get_range(&txn_db, &index, b"a", b"", 2) + .unwrap() + .is_empty(), "(a, 1) should be removed" ); assert_eq!( - store.inner.get_range(b"b", b"", 3).unwrap().len(), + KvStoreInner::get_range(&txn_db, &index, b"b", b"", 3) + .unwrap() + .len(), 1, "(b, 2) should not be removed" ); assert_eq!( - store.inner.get_range(b"a", b"", 4).unwrap().len(), + KvStoreInner::get_range(&txn_db, &index, b"a", b"", 4) + .unwrap() + .len(), 1, "(a, 3) should not be removed" ); @@ -1338,20 +1713,28 @@ mod test { let target_revisions = index_compact(&store, 5); store.compact(target_revisions.as_ref())?; assert!( - store.inner.get_range(b"a", b"", 2).unwrap().is_empty(), + KvStoreInner::get_range(&txn_db, &index, b"a", b"", 2) + .unwrap() + .is_empty(), "(a, 1) should be removed" ); assert_eq!( - store.inner.get_range(b"b", b"", 3).unwrap().len(), + KvStoreInner::get_range(&txn_db, &index, b"b", b"", 3) + .unwrap() + .len(), 1, "(b, 2) should not be removed" ); assert!( - store.inner.get_range(b"a", b"", 4).unwrap().is_empty(), + KvStoreInner::get_range(&txn_db, &index, b"a", b"", 4) + .unwrap() + .is_empty(), "(a, 3) should be removed" ); assert!( - store.inner.get_range(b"a", b"", 5).unwrap().is_empty(), + KvStoreInner::get_range(&txn_db, &index, b"a", b"", 5) + .unwrap() + .is_empty(), "(a, 4) should be removed" ); diff --git a/crates/xline/src/storage/kvwatcher.rs b/crates/xline/src/storage/kvwatcher.rs index ab6dd5955..6b8524b56 100644 --- a/crates/xline/src/storage/kvwatcher.rs +++ 
b/crates/xline/src/storage/kvwatcher.rs @@ -383,7 +383,7 @@ impl KvWatcher { /// Create a new `Arc` pub(crate) fn new_arc( kv_store_inner: Arc, - kv_update_rx: mpsc::Receiver<(i64, Vec)>, + kv_update_rx: flume::Receiver<(i64, Vec)>, sync_victims_interval: Duration, task_manager: &TaskManager, ) -> Arc { @@ -405,13 +405,13 @@ impl KvWatcher { #[allow(clippy::arithmetic_side_effects, clippy::ignored_unit_patterns)] // Introduced by tokio::select! async fn kv_updates_task( kv_watcher: Arc, - mut kv_update_rx: mpsc::Receiver<(i64, Vec)>, + kv_update_rx: flume::Receiver<(i64, Vec)>, shutdown_listener: Listener, ) { loop { tokio::select! { - updates = kv_update_rx.recv() => { - let Some(updates) = updates else { + updates = kv_update_rx.recv_async() => { + let Ok(updates) = updates else { return; }; kv_watcher.handle_kv_updates(updates); @@ -592,7 +592,7 @@ mod test { use std::{collections::BTreeMap, time::Duration}; - use clippy_utilities::{NumericCast, OverflowArithmetic}; + use engine::TransactionApi; use test_macros::abort_on_panic; use tokio::time::{sleep, timeout}; use utils::config::EngineConfig; @@ -604,18 +604,18 @@ mod test { rpc::PutRequest, storage::{ compact::COMPACT_CHANNEL_SIZE, db::DB, index::Index, lease_store::LeaseCollection, - storage_api::XlineStorageOps, KvStore, + KvStore, }, }; - fn init_empty_store(task_manager: &TaskManager) -> (Arc, Arc, Arc) { - let (compact_tx, _compact_rx) = mpsc::channel(COMPACT_CHANNEL_SIZE); + fn init_empty_store(task_manager: &TaskManager) -> (Arc, Arc) { + let (compact_tx, _compact_rx) = flume::bounded(COMPACT_CHANNEL_SIZE); let db = DB::open(&EngineConfig::Memory).unwrap(); let header_gen = Arc::new(HeaderGenerator::new(0, 0)); let index = Arc::new(Index::new()); let lease_collection = Arc::new(LeaseCollection::new(0)); - let (kv_update_tx, kv_update_rx) = mpsc::channel(128); - let kv_store_inner = Arc::new(KvStoreInner::new(index, Arc::clone(&db))); + let (kv_update_tx, kv_update_rx) = flume::bounded(128); + let 
kv_store_inner = Arc::new(KvStoreInner::new(index, db)); let store = Arc::new(KvStore::new( Arc::clone(&kv_store_inner), header_gen, @@ -630,14 +630,14 @@ mod test { sync_victims_interval, task_manager, ); - (store, db, kv_watcher) + (store, kv_watcher) } #[tokio::test(flavor = "multi_thread")] #[abort_on_panic] async fn watch_should_not_lost_events() { let task_manager = Arc::new(TaskManager::new()); - let (store, db, kv_watcher) = init_empty_store(&task_manager); + let (store, kv_watcher) = init_empty_store(&task_manager); let mut map = BTreeMap::new(); let (event_tx, mut event_rx) = mpsc::channel(128); let stop_notify = Arc::new(event_listener::Event::new()); @@ -654,14 +654,7 @@ mod test { let store = Arc::clone(&store); async move { for i in 0..100_u8 { - put( - store.as_ref(), - db.as_ref(), - "foo", - vec![i], - i.overflow_add(2).numeric_cast(), - ) - .await; + put(store.as_ref(), "foo", vec![i]); } } }); @@ -694,7 +687,7 @@ mod test { #[abort_on_panic] async fn test_victim() { let task_manager = Arc::new(TaskManager::new()); - let (store, db, kv_watcher) = init_empty_store(&task_manager); + let (store, kv_watcher) = init_empty_store(&task_manager); // response channel with capacity 1, so it will be full easily, then we can trigger victim let (event_tx, mut event_rx) = mpsc::channel(1); let stop_notify = Arc::new(event_listener::Event::new()); @@ -723,14 +716,7 @@ mod test { }); for i in 0..100_u8 { - put( - store.as_ref(), - db.as_ref(), - "foo", - vec![i], - i.numeric_cast(), - ) - .await; + put(store.as_ref(), "foo", vec![i]); } handle.await.unwrap(); drop(store); @@ -741,7 +727,7 @@ mod test { #[abort_on_panic] async fn test_cancel_watcher() { let task_manager = Arc::new(TaskManager::new()); - let (store, _db, kv_watcher) = init_empty_store(&task_manager); + let (store, kv_watcher) = init_empty_store(&task_manager); let (event_tx, _event_rx) = mpsc::channel(1); let stop_notify = Arc::new(event_listener::Event::new()); kv_watcher.watch( @@ -761,19 +747,22 
@@ mod test { task_manager.shutdown(true).await; } - async fn put( - store: &KvStore, - db: &DB, - key: impl Into>, - value: impl Into>, - revision: i64, - ) { + fn put(store: &KvStore, key: impl Into>, value: impl Into>) { let req = RequestWrapper::from(PutRequest { key: key.into(), value: value.into(), ..Default::default() }); - let (_sync_res, ops) = store.after_sync(&req, revision).await.unwrap(); - db.write_ops(ops).unwrap(); + let txn = store.db().transaction(); + let index = store.index(); + let index_state = index.state(); + let rev_gen = store.revision_gen(); + let rev_gen_state = rev_gen.state(); + store + .after_sync(&req, &txn, &index_state, &rev_gen_state, false) + .unwrap(); + txn.commit().unwrap(); + index_state.commit(); + rev_gen_state.commit(); } } diff --git a/crates/xline/src/storage/lease_store/mod.rs b/crates/xline/src/storage/lease_store/mod.rs index 619bb67c6..beeac074c 100644 --- a/crates/xline/src/storage/lease_store/mod.rs +++ b/crates/xline/src/storage/lease_store/mod.rs @@ -16,10 +16,11 @@ use std::{ time::Duration, }; +use clippy_utilities::OverflowArithmetic; +use engine::TransactionApi; use log::debug; use parking_lot::RwLock; use prost::Message; -use tokio::sync::mpsc; use utils::table_names::LEASE_TABLE; use xlineapi::{ command::{CommandResponse, SyncResponse}, @@ -29,10 +30,12 @@ use xlineapi::{ pub(crate) use self::{lease::Lease, lease_collection::LeaseCollection}; use super::{ db::{WriteOp, DB}, - index::Index, + index::{Index, IndexOperate}, + storage_api::XlineStorageOps, }; use crate::{ header_gen::HeaderGenerator, + revision_number::RevisionNumberGeneratorState, rpc::{ Event, LeaseGrantRequest, LeaseGrantResponse, LeaseLeasesRequest, LeaseLeasesResponse, LeaseRevokeRequest, LeaseRevokeResponse, LeaseStatus, PbLease, RequestWrapper, @@ -51,12 +54,13 @@ pub(crate) struct LeaseStore { lease_collection: Arc, /// Db to store lease db: Arc, + #[allow(unused)] // used in tests /// Key to revision index index: Arc, /// Header 
generator header_gen: Arc, /// KV update sender - kv_update_tx: mpsc::Sender<(i64, Vec)>, + kv_update_tx: flume::Sender<(i64, Vec)>, /// Primary flag is_primary: AtomicBool, /// cache unsynced lease id @@ -72,7 +76,7 @@ impl LeaseStore { header_gen: Arc, db: Arc, index: Arc, - kv_update_tx: mpsc::Sender<(i64, Vec)>, + kv_update_tx: flume::Sender<(i64, Vec)>, is_leader: bool, ) -> Self { Self { @@ -97,14 +101,26 @@ impl LeaseStore { } /// sync a lease request - pub(crate) async fn after_sync( + pub(crate) fn after_sync( &self, request: &RequestWrapper, - revision: i64, - ) -> Result<(SyncResponse, Vec), ExecuteError> { - self.sync_request(request, revision) - .await - .map(|(rev, ops)| (SyncResponse::new(rev), ops)) + revision_gen: &RevisionNumberGeneratorState<'_>, + txn_db: &T, + index: &I, + ) -> Result<(SyncResponse, Vec), ExecuteError> + where + T: XlineStorageOps + TransactionApi, + I: IndexOperate, + { + let next_revision = revision_gen.get().overflow_add(1); + let updated = self.sync_request(request, next_revision, txn_db, index)?; + let rev = if updated { + revision_gen.next() + } else { + revision_gen.get() + }; + // TODO: return only a `SyncResponse` + Ok((SyncResponse::new(rev), vec![])) } /// Get lease by id @@ -268,36 +284,47 @@ impl LeaseStore { } /// Sync `RequestWithToken` - async fn sync_request( + fn sync_request( &self, wrapper: &RequestWrapper, revision: i64, - ) -> Result<(i64, Vec), ExecuteError> { + txn_db: &T, + index: &I, + ) -> Result + where + T: XlineStorageOps + TransactionApi, + I: IndexOperate, + { #[allow(clippy::wildcard_enum_match_arm)] - let ops = match *wrapper { + let updated = match *wrapper { RequestWrapper::LeaseGrantRequest(ref req) => { debug!("Sync LeaseGrantRequest {:?}", req); - self.sync_lease_grant_request(req) + self.sync_lease_grant_request(req, txn_db)?; + false } RequestWrapper::LeaseRevokeRequest(ref req) => { debug!("Sync LeaseRevokeRequest {:?}", req); - self.sync_lease_revoke_request(req, revision).await? 
+ self.sync_lease_revoke_request(req, revision, txn_db, index)? } RequestWrapper::LeaseLeasesRequest(ref req) => { debug!("Sync LeaseLeasesRequest {:?}", req); - vec![] + false } _ => unreachable!("Other request should not be sent to this store"), }; - Ok((revision, ops)) + Ok(updated) } /// Sync `LeaseGrantRequest` - fn sync_lease_grant_request(&self, req: &LeaseGrantRequest) -> Vec { + fn sync_lease_grant_request( + &self, + req: &LeaseGrantRequest, + txn_db: &T, + ) -> Result<(), ExecuteError> { let lease = self .lease_collection .grant(req.id, req.ttl, self.is_primary()); - vec![WriteOp::PutLease(lease)] + txn_db.write_op(WriteOp::PutLease(lease)) } /// Get all `PbLease` @@ -315,14 +342,20 @@ impl LeaseStore { } /// Sync `LeaseRevokeRequest` - async fn sync_lease_revoke_request( + #[allow(clippy::trivially_copy_pass_by_ref)] // we can only get a reference in the caller + fn sync_lease_revoke_request( &self, req: &LeaseRevokeRequest, revision: i64, - ) -> Result, ExecuteError> { - let mut ops = Vec::new(); + txn_db: &T, + index: &I, + ) -> Result + where + T: XlineStorageOps + TransactionApi, + I: IndexOperate, + { let mut updates = Vec::new(); - ops.push(WriteOp::DeleteLease(req.id)); + txn_db.write_op(WriteOp::DeleteLease(req.id))?; let del_keys = match self.lease_collection.look_up(req.id) { Some(l) => l.keys(), @@ -331,28 +364,24 @@ impl LeaseStore { if del_keys.is_empty() { let _ignore = self.lease_collection.revoke(req.id); - return Ok(Vec::new()); + return Ok(false); } - for (key, sub_revision) in del_keys.iter().zip(0..) { - let (mut del_ops, mut del_event) = KvStore::delete_keys( - &self.index, - &self.lease_collection, - key, - &[], - revision, - sub_revision, - ); - ops.append(&mut del_ops); + for (key, mut sub_revision) in del_keys.iter().zip(0..) 
{ + let deleted = + KvStore::delete_keys(txn_db, index, key, &[], revision, &mut sub_revision)?; + KvStore::detach_leases(&deleted, &self.lease_collection); + let mut del_event = KvStore::new_deletion_events(revision, deleted); updates.append(&mut del_event); } let _ignore = self.lease_collection.revoke(req.id); assert!( - self.kv_update_tx.send((revision, updates)).await.is_ok(), + self.kv_update_tx.send((revision, updates)).is_ok(), "Failed to send updates to KV watcher" ); - Ok(ops) + + Ok(true) } } @@ -364,17 +393,20 @@ mod test { use utils::config::EngineConfig; use super::*; - use crate::storage::{db::DB, storage_api::XlineStorageOps}; + use crate::{ + revision_number::RevisionNumberGenerator, + storage::{db::DB, storage_api::XlineStorageOps}, + }; #[tokio::test(flavor = "multi_thread")] #[abort_on_panic] async fn test_lease_storage() -> Result<(), Box> { let db = DB::open(&EngineConfig::Memory)?; - let lease_store = init_store(db); - let revision_gen = lease_store.header_gen.general_revision_arc(); + let (lease_store, rev_gen) = init_store(db); + let rev_gen_state = rev_gen.state(); let req1 = RequestWrapper::from(LeaseGrantRequest { ttl: 10, id: 1 }); - let _ignore1 = exe_and_sync_req(&lease_store, &req1, -1).await?; + let _ignore1 = exe_and_sync_req(&lease_store, &req1, &rev_gen_state)?; let lo = lease_store.look_up(1).unwrap(); assert_eq!(lo.id(), 1); @@ -388,7 +420,7 @@ mod test { lease_store.lease_collection.detach(1, "key".as_bytes())?; let req2 = RequestWrapper::from(LeaseRevokeRequest { id: 1 }); - let _ignore2 = exe_and_sync_req(&lease_store, &req2, revision_gen.next()).await?; + let _ignore2 = exe_and_sync_req(&lease_store, &req2, &rev_gen_state)?; assert!(lease_store.look_up(1).is_none()); assert!(lease_store.leases().is_empty()); @@ -396,9 +428,9 @@ mod test { let req4 = RequestWrapper::from(LeaseGrantRequest { ttl: 10, id: 4 }); let req5 = RequestWrapper::from(LeaseRevokeRequest { id: 3 }); let req6 = RequestWrapper::from(LeaseLeasesRequest {}); 
- let _ignore3 = exe_and_sync_req(&lease_store, &req3, -1).await?; - let _ignore4 = exe_and_sync_req(&lease_store, &req4, -1).await?; - let resp_1 = exe_and_sync_req(&lease_store, &req6, -1).await?; + let _ignore3 = exe_and_sync_req(&lease_store, &req3, &rev_gen_state)?; + let _ignore4 = exe_and_sync_req(&lease_store, &req4, &rev_gen_state)?; + let resp_1 = exe_and_sync_req(&lease_store, &req6, &rev_gen_state)?; let ResponseWrapper::LeaseLeasesResponse(leases_1) = resp_1 else { panic!("wrong response type: {resp_1:?}"); @@ -406,8 +438,8 @@ mod test { assert_eq!(leases_1.leases[0].id, 3); assert_eq!(leases_1.leases[1].id, 4); - let _ignore5 = exe_and_sync_req(&lease_store, &req5, -1).await?; - let resp_2 = exe_and_sync_req(&lease_store, &req6, -1).await?; + let _ignore5 = exe_and_sync_req(&lease_store, &req5, &rev_gen_state)?; + let resp_2 = exe_and_sync_req(&lease_store, &req6, &rev_gen_state)?; let ResponseWrapper::LeaseLeasesResponse(leases_2) = resp_2 else { panic!("wrong response type: {resp_2:?}"); }; @@ -419,7 +451,10 @@ mod test { #[tokio::test(flavor = "multi_thread")] async fn test_lease_sync() -> Result<(), Box> { let db = DB::open(&EngineConfig::Memory)?; - let lease_store = init_store(db); + let txn = db.transaction(); + let index = Index::new(); + let (lease_store, rev_gen) = init_store(Arc::clone(&db)); + let rev_gen_state = rev_gen.state(); let wait_duration = Duration::from_millis(1); let req1 = RequestWrapper::from(LeaseGrantRequest { ttl: 10, id: 1 }); @@ -432,7 +467,7 @@ mod test { "the future should block until the lease is synced" ); - let (_ignore, ops) = lease_store.after_sync(&req1, -1).await?; + let (_ignore, ops) = lease_store.after_sync(&req1, &rev_gen_state, &txn, &index)?; lease_store.db.write_ops(ops)?; lease_store.mark_lease_synced(&req1); @@ -453,7 +488,7 @@ mod test { "the future should block until the lease is synced" ); - let (_ignore, ops) = lease_store.after_sync(&req2, -1).await?; + let (_ignore, ops) = 
lease_store.after_sync(&req2, &rev_gen_state, &txn, &index)?; lease_store.db.write_ops(ops)?; lease_store.mark_lease_synced(&req2); @@ -471,13 +506,14 @@ mod test { #[abort_on_panic] async fn test_recover() -> Result<(), ExecuteError> { let db = DB::open(&EngineConfig::Memory)?; - let store = init_store(Arc::clone(&db)); + let (store, rev_gen) = init_store(Arc::clone(&db)); + let rev_gen_state = rev_gen.state(); let req1 = RequestWrapper::from(LeaseGrantRequest { ttl: 10, id: 1 }); - let _ignore1 = exe_and_sync_req(&store, &req1, -1).await?; + let _ignore1 = exe_and_sync_req(&store, &req1, &rev_gen_state)?; store.lease_collection.attach(1, "key".into())?; - let new_store = init_store(db); + let (new_store, _) = init_store(db); assert!(new_store.look_up(1).is_none()); new_store.recover()?; @@ -492,22 +528,30 @@ mod test { Ok(()) } - fn init_store(db: Arc) -> LeaseStore { + fn init_store(db: Arc) -> (LeaseStore, RevisionNumberGenerator) { let lease_collection = Arc::new(LeaseCollection::new(0)); - let (kv_update_tx, _) = mpsc::channel(1); + let (kv_update_tx, _) = flume::bounded(1); let header_gen = Arc::new(HeaderGenerator::new(0, 0)); let index = Arc::new(Index::new()); - LeaseStore::new(lease_collection, header_gen, db, index, kv_update_tx, true) + ( + LeaseStore::new(lease_collection, header_gen, db, index, kv_update_tx, true), + RevisionNumberGenerator::new(1), + ) } - async fn exe_and_sync_req( + fn exe_and_sync_req( ls: &LeaseStore, req: &RequestWrapper, - revision: i64, + rev_gen: &RevisionNumberGeneratorState<'_>, ) -> Result { let cmd_res = ls.execute(req)?; - let (_ignore, ops) = ls.after_sync(req, revision).await?; - ls.db.write_ops(ops)?; + let txn = ls.db.transaction(); + let index = ls.index.state(); + let (_ignore, _ops) = ls.after_sync(req, rev_gen, &txn, &index)?; + txn.commit() + .map_err(|e| ExecuteError::DbError(e.to_string()))?; + index.commit(); + rev_gen.commit(); Ok(cmd_res.into_inner()) } } diff --git a/crates/xline/tests/it/lease_test.rs 
b/crates/xline/tests/it/lease_test.rs index ca4b8b67f..036235913 100644 --- a/crates/xline/tests/it/lease_test.rs +++ b/crates/xline/tests/it/lease_test.rs @@ -43,7 +43,7 @@ async fn test_lease_keep_alive() -> Result<(), Box> { let non_leader_ep = cluster.get_client_url(1); let client = cluster.client().await; - let res = client.lease_client().grant(1, None).await?; + let res = client.lease_client().grant(3, None).await?; let lease_id = res.id; assert!(lease_id > 0); @@ -65,7 +65,7 @@ async fn test_lease_keep_alive() -> Result<(), Box> { let (mut keeper, mut stream) = c.keep_alive(lease_id).await?; let handle = tokio::spawn(async move { loop { - tokio::time::sleep(Duration::from_millis(500)).await; + tokio::time::sleep(Duration::from_millis(1500)).await; let _ = keeper.keep_alive(); if let Ok(Some(r)) = stream.message().await { info!("keep alive response: {:?}", r); @@ -79,7 +79,7 @@ async fn test_lease_keep_alive() -> Result<(), Box> { assert_eq!(res.kvs[0].value, b"bar"); handle.abort(); - tokio::time::sleep(Duration::from_secs(2)).await; + tokio::time::sleep(Duration::from_secs(6)).await; let res = client.kv_client().range("foo", None).await?; assert_eq!(res.kvs.len(), 0); diff --git a/crates/xlineapi/src/lib.rs b/crates/xlineapi/src/lib.rs index c152912b8..1b88bb8e6 100644 --- a/crates/xlineapi/src/lib.rs +++ b/crates/xlineapi/src/lib.rs @@ -543,6 +543,11 @@ impl RequestWrapper { ) } + /// Check whether the kv request or lease request should skip the revision or not + pub fn skip_lease_revision(&self) -> bool { + matches!(self, RequestWrapper::LeaseGrantRequest(_)) + } + /// Check whether the kv request or lease request should skip the revision or not pub fn skip_general_revision(&self) -> bool { match self {