From 076c6f050d91c0ba5bfa114aad929090adc4126f Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 21 Nov 2024 09:24:57 +0100 Subject: [PATCH 1/8] Add new termination modes to the Admin API --- crates/admin/src/rest_api/invocations.rs | 10 ++++++++++ crates/types/src/invocation.rs | 22 +++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/crates/admin/src/rest_api/invocations.rs b/crates/admin/src/rest_api/invocations.rs index 5f25d4452..c0282660d 100644 --- a/crates/admin/src/rest_api/invocations.rs +++ b/crates/admin/src/rest_api/invocations.rs @@ -31,6 +31,10 @@ pub enum DeletionMode { Kill, #[serde(alias = "purge")] Purge, + #[serde(alias = "kill-and-restart")] + KillAndRestart, + #[serde(alias = "cancel-and-restart")] + CancelAndRestart, } #[derive(Debug, Default, Deserialize, JsonSchema)] pub struct DeleteInvocationParams { @@ -90,6 +94,12 @@ pub async fn delete_invocation( Command::TerminateInvocation(InvocationTermination::kill(invocation_id)) } DeletionMode::Purge => Command::PurgeInvocation(PurgeInvocationRequest { invocation_id }), + DeletionMode::CancelAndRestart => { + Command::TerminateInvocation(InvocationTermination::cancel_and_restart(invocation_id)) + } + DeletionMode::KillAndRestart => { + Command::TerminateInvocation(InvocationTermination::kill_and_restart(invocation_id)) + } }; let partition_key = invocation_id.partition_key(); diff --git a/crates/types/src/invocation.rs b/crates/types/src/invocation.rs index 240760f7c..125dd6e44 100644 --- a/crates/types/src/invocation.rs +++ b/crates/types/src/invocation.rs @@ -773,15 +773,35 @@ impl InvocationTermination { flavor: TerminationFlavor::Cancel, } } + + pub const fn kill_and_restart(invocation_id: InvocationId) -> Self { + Self { + invocation_id, + flavor: TerminationFlavor::KillAndRestart, + } + } + + pub const fn cancel_and_restart(invocation_id: InvocationId) -> Self { + Self { + invocation_id, + flavor: TerminationFlavor::CancelAndRestart, + } + } } -/// Flavor of the termination. Can be kill (hard stop) or graceful cancel. +/// Flavor of the termination. #[derive(Debug, Clone, Copy, Eq, PartialEq, serde::Serialize, serde::Deserialize)] pub enum TerminationFlavor { /// hard termination, no clean up Kill, + /// hard termination with restart afterward using same input + #[serde(alias = "kill-and-restart")] + KillAndRestart, /// graceful termination allowing the invocation to clean up Cancel, + /// graceful termination allowing the invocation to clean up with restart afterward using same input + #[serde(alias = "cancel-and-restart")] + CancelAndRestart, } /// Message to purge an invocation. From f41ef980e5862779ae6c49c3e13dcc407a7cbb71 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Fri, 22 Nov 2024 12:34:43 +0100 Subject: [PATCH 2/8] Add field InFlightInvocationMetadata.restate_when_completed, used to store when the invocation should be restarted. Add new termination flavors to Outbox table. When doing so, I reorganized a bit the data structure of outbox termination. --- .../tests/invocation_status_table_test/mod.rs | 3 + .../proto/dev/restate/storage/v1/domain.proto | 17 +++++ .../src/invocation_status_table/mod.rs | 5 ++ crates/storage-api/src/storage.rs | 71 ++++++++++++++++++- 4 files changed, 95 insertions(+), 1 deletion(-) diff --git a/crates/partition-store/src/tests/invocation_status_table_test/mod.rs b/crates/partition-store/src/tests/invocation_status_table_test/mod.rs index 7da572ea7..54bbe81b7 100644 --- a/crates/partition-store/src/tests/invocation_status_table_test/mod.rs +++ b/crates/partition-store/src/tests/invocation_status_table_test/mod.rs @@ -92,6 +92,7 @@ fn invoked_status(invocation_target: InvocationTarget) -> InvocationStatus { source: Source::Ingress(*RPC_REQUEST_ID), completion_retention_duration: Duration::ZERO, idempotency_key: None, + restart_when_completed: false, }) } @@ -105,6 +106,7 @@ fn killed_status(invocation_target: InvocationTarget) -> InvocationStatus { source: Source::Ingress(*RPC_REQUEST_ID), completion_retention_duration: Duration::ZERO, idempotency_key: None, + restart_when_completed: false, }) } @@ -119,6 +121,7 @@ fn suspended_status(invocation_target: InvocationTarget) -> InvocationStatus { source: Source::Ingress(*RPC_REQUEST_ID), completion_retention_duration: Duration::ZERO, idempotency_key: None, + restart_when_completed: false, }, waiting_for_completed_entries: HashSet::default(), } diff --git a/crates/storage-api/proto/dev/restate/storage/v1/domain.proto b/crates/storage-api/proto/dev/restate/storage/v1/domain.proto index 3083aaf25..babc960f1 100644 --- a/crates/storage-api/proto/dev/restate/storage/v1/domain.proto +++ b/crates/storage-api/proto/dev/restate/storage/v1/domain.proto @@ -153,6 +153,7 @@ message InvocationStatusV2 { uint32 journal_length = 14; optional string deployment_id = 15; optional dev.restate.service.protocol.ServiceProtocolVersion service_protocol_version = 16; + bool restart_when_completed = 23; // Suspended repeated uint32 waiting_for_completed_entries = 17; @@ -519,14 +520,29 @@ message OutboxMessage { ResponseResult response_result = 3; } + // TODO remove this in Restate 1.3 message OutboxKill { InvocationId invocation_id = 1; } + // TODO remove this in Restate 1.3 message OutboxCancel { InvocationId invocation_id = 1; } + message OutboxTermination { + enum TerminationFlavor { + UNKNOWN = 0; + KILL = 1; + KILL_AND_RESTART = 2; + CANCEL = 3; + CANCEL_AND_RESTART = 4; + } + + InvocationId invocation_id = 1; + TerminationFlavor flavor = 2; + } + message AttachInvocationRequest { oneof query { InvocationId invocation_id = 1; @@ -543,6 +559,7 @@ message OutboxMessage { OutboxKill kill = 4; OutboxCancel cancel = 5; AttachInvocationRequest attach_invocation_request = 6; + OutboxTermination termination = 7; } } diff --git a/crates/storage-api/src/invocation_status_table/mod.rs b/crates/storage-api/src/invocation_status_table/mod.rs index 677112f3d..2530fc202 100644 --- a/crates/storage-api/src/invocation_status_table/mod.rs +++ b/crates/storage-api/src/invocation_status_table/mod.rs @@ -473,6 +473,9 @@ pub struct InFlightInvocationMetadata { /// If zero, the invocation completion will not be retained. pub completion_retention_duration: Duration, pub idempotency_key: Option, + + /// When the invocation completes, restart it. + pub restart_when_completed: bool, } impl InFlightInvocationMetadata { @@ -496,6 +499,7 @@ impl InFlightInvocationMetadata { completion_retention_duration: pre_flight_invocation_metadata .completion_retention_duration, idempotency_key: pre_flight_invocation_metadata.idempotency_key, + restart_when_completed: false, }, InvocationInput { argument: pre_flight_invocation_metadata.argument, @@ -624,6 +628,7 @@ mod test_util { source: Source::Ingress(PartitionProcessorRpcRequestId::default()), completion_retention_duration: Duration::ZERO, idempotency_key: None, + restart_when_completed: false, } } } diff --git a/crates/storage-api/src/storage.rs b/crates/storage-api/src/storage.rs index 7d609df1a..9101a0204 100644 --- a/crates/storage-api/src/storage.rs +++ b/crates/storage-api/src/storage.rs @@ -91,7 +91,8 @@ pub mod v1 { use crate::storage::v1::journal_entry::completion_result::{Empty, Failure, Success}; use crate::storage::v1::journal_entry::{completion_result, CompletionResult, Entry, Kind}; use crate::storage::v1::outbox_message::{ - OutboxCancel, OutboxKill, OutboxServiceInvocation, OutboxServiceInvocationResponse, + outbox_termination, OutboxCancel, OutboxKill, OutboxServiceInvocation, + OutboxServiceInvocationResponse, OutboxTermination, }; use crate::storage::v1::service_invocation_response_sink::{ Ingress, PartitionProcessor, ResponseSink, @@ -349,6 +350,7 @@ pub mod v1 { journal_length, deployment_id, service_protocol_version, + restart_when_completed, waiting_for_completed_entries, result, } = value; @@ -443,6 +445,7 @@ pub mod v1 { .unwrap_or_default() .try_into()?, idempotency_key: idempotency_key.map(ByteString::from), + restart_when_completed, }, )) } @@ -465,6 +468,7 @@ pub mod v1 { .unwrap_or_default() .try_into()?, idempotency_key: idempotency_key.map(ByteString::from), + restart_when_completed, }, waiting_for_completed_entries: waiting_for_completed_entries .into_iter() @@ -490,6 +494,7 @@ pub mod v1 { .unwrap_or_default() .try_into()?, idempotency_key: idempotency_key.map(ByteString::from), + restart_when_completed, }, )) } @@ -568,6 +573,7 @@ pub mod v1 { journal_length: 0, deployment_id: None, service_protocol_version: None, + restart_when_completed: false, waiting_for_completed_entries: vec![], result: None, }, @@ -621,6 +627,7 @@ pub mod v1 { journal_length: 0, deployment_id: None, service_protocol_version: None, + restart_when_completed: false, waiting_for_completed_entries: vec![], result: None, }, @@ -634,6 +641,7 @@ pub mod v1 { source, completion_retention_duration, idempotency_key, + restart_when_completed, }, ) => { let (deployment_id, service_protocol_version) = match pinned_deployment { @@ -683,6 +691,7 @@ pub mod v1 { journal_length: journal_metadata.length, deployment_id, service_protocol_version, + restart_when_completed, waiting_for_completed_entries: vec![], result: None, } @@ -698,6 +707,7 @@ pub mod v1 { source, completion_retention_duration, idempotency_key, + restart_when_completed, }, waiting_for_completed_entries, } => { @@ -748,6 +758,7 @@ pub mod v1 { journal_length: journal_metadata.length, deployment_id, service_protocol_version, + restart_when_completed, waiting_for_completed_entries: waiting_for_completed_entries .into_iter() .collect(), @@ -764,6 +775,7 @@ pub mod v1 { source, completion_retention_duration, idempotency_key, + restart_when_completed, }, ) => { let (deployment_id, service_protocol_version) = match pinned_deployment { @@ -813,6 +825,7 @@ pub mod v1 { journal_length: journal_metadata.length, deployment_id, service_protocol_version, + restart_when_completed, waiting_for_completed_entries: vec![], result: None, } @@ -857,6 +870,7 @@ pub mod v1 { journal_length: 0, deployment_id: None, service_protocol_version: None, + restart_when_completed: false, waiting_for_completed_entries: vec![], result: Some(response_result.into()), }, @@ -1044,6 +1058,7 @@ pub mod v1 { source, completion_retention_duration: completion_retention_time, idempotency_key, + restart_when_completed: false, }) } } @@ -1155,6 +1170,7 @@ pub mod v1 { source: caller, completion_retention_duration: completion_retention_time, idempotency_key, + restart_when_completed: false, }, waiting_for_completed_entries, )) @@ -2647,6 +2663,41 @@ pub mod v1 { ), ) } + outbox_message::OutboxMessage::Termination(outbox_termination) => { + crate::outbox_table::OutboxMessage::InvocationTermination( + InvocationTermination { + invocation_id: restate_types::identifiers::InvocationId::try_from( + outbox_termination + .invocation_id + .ok_or(ConversionError::missing_field("invocation_id"))?, + )?, + flavor: match outbox_termination::TerminationFlavor::try_from( + outbox_termination.flavor, + ) + .map_err(|e| ConversionError::invalid_data(e))? + { + outbox_termination::TerminationFlavor::Unknown => { + return Err(ConversionError::UnexpectedEnumVariant( + "termination flavor", + outbox_termination.flavor, + )) + } + outbox_termination::TerminationFlavor::Kill => { + TerminationFlavor::Kill + } + outbox_termination::TerminationFlavor::KillAndRestart => { + TerminationFlavor::KillAndRestart + } + outbox_termination::TerminationFlavor::Cancel => { + TerminationFlavor::Cancel + } + outbox_termination::TerminationFlavor::CancelAndRestart => { + TerminationFlavor::CancelAndRestart + } + }, + }, + ) + } outbox_message::OutboxMessage::AttachInvocationRequest( outbox_message::AttachInvocationRequest { block_on_inflight, @@ -2731,6 +2782,24 @@ pub mod v1 { )), }) } + TerminationFlavor::KillAndRestart => { + outbox_message::OutboxMessage::Termination(OutboxTermination { + invocation_id: Some(InvocationId::from( + invocation_termination.invocation_id, + )), + flavor: outbox_termination::TerminationFlavor::KillAndRestart + .into(), + }) + } + TerminationFlavor::CancelAndRestart => { + outbox_message::OutboxMessage::Termination(OutboxTermination { + invocation_id: Some(InvocationId::from( + invocation_termination.invocation_id, + )), + flavor: outbox_termination::TerminationFlavor::CancelAndRestart + .into(), + }) + } }, crate::outbox_table::OutboxMessage::AttachInvocation( restate_types::invocation::AttachInvocationRequest { From 083751da2ca47d22b762c4c96b105391010aeac3 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Fri, 22 Nov 2024 12:35:08 +0100 Subject: [PATCH 3/8] Add functionality in journal table to delete a range of the journal, rather than the whole journal. --- .../partition-store/src/journal_table/mod.rs | 18 +++++++++++------- crates/storage-api/src/journal_table/mod.rs | 10 +++++++++- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/crates/partition-store/src/journal_table/mod.rs b/crates/partition-store/src/journal_table/mod.rs index b54f19f7a..c0007348a 100644 --- a/crates/partition-store/src/journal_table/mod.rs +++ b/crates/partition-store/src/journal_table/mod.rs @@ -25,7 +25,7 @@ use restate_types::identifiers::{ }; use restate_types::storage::StorageCodec; use std::io::Cursor; -use std::ops::RangeInclusive; +use std::ops::{Range, RangeInclusive}; define_table_key!( Journal, @@ -121,14 +121,14 @@ fn all_journals( })) } -fn delete_journal( +fn delete_journal_range( storage: &mut S, invocation_id: &InvocationId, - journal_length: EntryIndex, + journal_range: Range, ) { let mut key = write_journal_entry_key(invocation_id, 0); let k = &mut key; - for journal_index in 0..journal_length { + for journal_index in journal_range { k.journal_index = Some(journal_index); storage.delete_key(k); } @@ -201,10 +201,14 @@ impl<'a> JournalTable for PartitionStoreTransaction<'a> { put_journal_entry(self, invocation_id, journal_index, journal_entry) } - async fn delete_journal(&mut self, invocation_id: &InvocationId, journal_length: EntryIndex) { + async fn delete_journal_range( + &mut self, + invocation_id: &InvocationId, + journal_range: Range, + ) { self.assert_partition_key(invocation_id); - let _x = RocksDbPerfGuard::new("delete-journal"); - delete_journal(self, invocation_id, journal_length) + let _x = RocksDbPerfGuard::new("delete-journal-range"); + delete_journal_range(self, invocation_id, journal_range) } } diff --git a/crates/storage-api/src/journal_table/mod.rs b/crates/storage-api/src/journal_table/mod.rs index cd7469943..c2cb8b0f6 100644 --- a/crates/storage-api/src/journal_table/mod.rs +++ b/crates/storage-api/src/journal_table/mod.rs @@ -14,7 +14,7 @@ use restate_types::identifiers::{EntryIndex, InvocationId, JournalEntryId, Parti use restate_types::journal::enriched::EnrichedRawEntry; use restate_types::journal::{CompletionResult, EntryType}; use std::future::Future; -use std::ops::RangeInclusive; +use std::ops::{Range, RangeInclusive}; /// Different types of journal entries persisted by the runtime #[derive(Debug, Clone, PartialEq, Eq)] @@ -78,5 +78,13 @@ pub trait JournalTable: ReadOnlyJournalTable { &mut self, invocation_id: &InvocationId, journal_length: EntryIndex, + ) -> impl Future + Send { + self.delete_journal_range(invocation_id, 0..journal_length) + } + + fn delete_journal_range( + &mut self, + invocation_id: &InvocationId, + journal_range: Range, ) -> impl Future + Send; } From bdd4b9b24185d491b98fbec00404f5ba6ced5626 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Fri, 22 Nov 2024 12:35:34 +0100 Subject: [PATCH 4/8] Implement kill and retry/cancel and retry functionality --- .../worker/src/partition/state_machine/mod.rs | 246 +++++++++++++++++- 1 file changed, 235 insertions(+), 11 deletions(-) diff --git a/crates/worker/src/partition/state_machine/mod.rs b/crates/worker/src/partition/state_machine/mod.rs index 8c9906aaf..8c0f4879f 100644 --- a/crates/worker/src/partition/state_machine/mod.rs +++ b/crates/worker/src/partition/state_machine/mod.rs @@ -87,7 +87,7 @@ use std::fmt::{Debug, Formatter}; use std::marker::PhantomData; use std::ops::RangeInclusive; use std::time::Instant; -use tracing::error; +use tracing::{error, info}; use utils::SpanExt; #[derive(Debug, Hash, enumset::EnumSetType, strum::Display)] @@ -852,12 +852,20 @@ impl StateMachine { ctx: &mut StateMachineApplyContext<'_, State>, InvocationTermination { invocation_id, - flavor: termination_flavor, + flavor, }: InvocationTermination, ) -> Result<(), Error> { - match termination_flavor { + match flavor { TerminationFlavor::Kill => self.on_kill_invocation(ctx, invocation_id).await, TerminationFlavor::Cancel => self.on_cancel_invocation(ctx, invocation_id).await, + TerminationFlavor::KillAndRestart => { + self.on_kill_and_restart_invocation(ctx, invocation_id) + .await + } + TerminationFlavor::CancelAndRestart => { + self.on_cancel_and_restart_invocation(ctx, invocation_id) + .await + } } } @@ -880,11 +888,11 @@ impl StateMachine { match status { InvocationStatus::Invoked(metadata) => { - self.kill_invoked_invocation(ctx, invocation_id, metadata) + self.kill_invoked_invocation(ctx, invocation_id, metadata, false) .await?; } InvocationStatus::Suspended { metadata, .. } => { - self.kill_suspended_invocation(ctx, invocation_id, metadata) + self.kill_suspended_invocation(ctx, invocation_id, metadata, false) .await?; } InvocationStatus::Inboxed(inboxed) => { @@ -1013,6 +1021,157 @@ impl StateMachine { Ok(()) } + async fn on_kill_and_restart_invocation< + State: VirtualObjectStatusTable + + InvocationStatusTable + + InboxTable + + FsmTable + + StateTable + + JournalTable + + OutboxTable + + TimerTable + + FsmTable, + >( + &mut self, + ctx: &mut StateMachineApplyContext<'_, State>, + invocation_id: InvocationId, + ) -> Result<(), Error> { + let status = ctx.get_invocation_status(&invocation_id).await?; + + match status { + InvocationStatus::Invoked(metadata) => { + self.kill_invoked_invocation(ctx, invocation_id, metadata, true) + .await?; + } + InvocationStatus::Suspended { metadata, .. } => { + self.kill_suspended_invocation(ctx, invocation_id, metadata, true) + .await?; + } + InvocationStatus::Inboxed(_) => { + info!("Received kill and restart command for invocation '{invocation_id}' in inboxed state. Ignoring it as it was not yet executed."); + } + InvocationStatus::Scheduled(_) => { + info!("Received kill and restart command for invocation '{invocation_id}' in scheduled state. Ignoring it as it was not yet executed."); + } + InvocationStatus::Killed(mut metadata) => { + if !metadata.restart_when_completed { + // Update the restart_when_completed. + // This might happen if the user sent in a quick succession two kill commands, the first without retry and the second with retry + metadata.restart_when_completed = true; + ctx.storage + .put_invocation_status(&invocation_id, &InvocationStatus::Killed(metadata)) + .await; + } + + // Nothing to do here really, let's send again the abort signal to the invoker just in case + Self::do_send_abort_invocation_to_invoker(ctx, invocation_id, true); + } + InvocationStatus::Completed(_) => { + info!("Received kill and restart command for completed invocation '{invocation_id}'. To cleanup the invocation after it's been completed, use the purge invocation command."); + } + InvocationStatus::Free => { + trace!("Received kill and restart command for unknown invocation with id '{invocation_id}'."); + // We still try to send the abort signal to the invoker, + // as it might be the case that previously the user sent an abort signal + // but some message was still between the invoker/PP queues. + // This can happen because the invoke/resume and the abort invoker messages end up in different queues, + // and the abort message can overtake the invoke/resume. + // Consequently the invoker might have not received the abort and the user tried to send it again. + Self::do_send_abort_invocation_to_invoker(ctx, invocation_id, false); + } + }; + + Ok(()) + } + + async fn on_cancel_and_restart_invocation< + State: VirtualObjectStatusTable + + InvocationStatusTable + + InboxTable + + FsmTable + + StateTable + + JournalTable + + OutboxTable + + TimerTable, + >( + &mut self, + ctx: &mut StateMachineApplyContext<'_, State>, + invocation_id: InvocationId, + ) -> Result<(), Error> { + let status = ctx.get_invocation_status(&invocation_id).await?; + + match status { + InvocationStatus::Invoked(mut metadata) => { + self.cancel_journal_leaves( + ctx, + invocation_id, + InvocationStatusProjection::Invoked, + metadata.journal_metadata.length, + ) + .await?; + if !metadata.restart_when_completed { + // Update the restart_when_completed. + metadata.restart_when_completed = true; + ctx.storage + .put_invocation_status(&invocation_id, &InvocationStatus::Invoked(metadata)) + .await; + } + } + InvocationStatus::Suspended { + mut metadata, + waiting_for_completed_entries, + } => { + if self + .cancel_journal_leaves( + ctx, + invocation_id, + InvocationStatusProjection::Suspended(waiting_for_completed_entries), + metadata.journal_metadata.length, + ) + .await? + { + metadata.restart_when_completed = true; + Self::do_resume_service(ctx, invocation_id, metadata).await?; + } + } + InvocationStatus::Inboxed(_) => { + info!("Received cancel and restart command for invocation '{invocation_id}' in inboxed state. Ignoring it as it was not yet executed."); + } + InvocationStatus::Scheduled(_) => { + info!("Received cancel and restart command for invocation '{invocation_id}' in scheduled state. Ignoring it as it was not yet executed."); + } + InvocationStatus::Killed(mut metadata) => { + if !metadata.restart_when_completed { + // Update the restart_when_completed. + // This might happen if the user sent in a quick succession two kill commands, the first without retry and the second with retry + metadata.restart_when_completed = true; + ctx.storage + .put_invocation_status(&invocation_id, &InvocationStatus::Killed(metadata)) + .await; + } + + // Nothing to do here really, let's send again the abort signal to the invoker just in case + Self::do_send_abort_invocation_to_invoker(ctx, invocation_id, true); + } + InvocationStatus::Completed(_) => { + debug!("Received cancel command for completed invocation '{invocation_id}'. To cleanup the invocation after it's been completed, use the purge invocation command."); + } + InvocationStatus::Free => { + trace!("Received cancel command for unknown invocation with id '{invocation_id}'."); + // We still try to send the abort signal to the invoker, + // as it might be the case that previously the user sent an abort signal + // but some message was still between the invoker/PP queues. + // This can happen because the invoke/resume and the abort invoker messages end up in different queues, + // and the abort message can overtake the invoke/resume. + // Consequently the invoker might have not received the abort and the user tried to send it again. + // TODO + Self::do_send_abort_invocation_to_invoker(ctx, invocation_id, false); + } + }; + + Ok(()) + } + async fn terminate_inboxed_invocation< State: InvocationStatusTable + InboxTable + OutboxTable + FsmTable, >( @@ -1023,8 +1182,10 @@ impl StateMachine { inboxed_invocation: InboxedInvocation, ) -> Result<(), Error> { let error = match termination_flavor { - TerminationFlavor::Kill => KILLED_INVOCATION_ERROR, - TerminationFlavor::Cancel => CANCELED_INVOCATION_ERROR, + TerminationFlavor::Kill | TerminationFlavor::KillAndRestart => KILLED_INVOCATION_ERROR, + TerminationFlavor::Cancel | TerminationFlavor::CancelAndRestart => { + CANCELED_INVOCATION_ERROR + } }; let InboxedInvocation { @@ -1082,8 +1243,10 @@ impl StateMachine { scheduled_invocation: ScheduledInvocation, ) -> Result<(), Error> { let error = match termination_flavor { - TerminationFlavor::Kill => KILLED_INVOCATION_ERROR, - TerminationFlavor::Cancel => CANCELED_INVOCATION_ERROR, + TerminationFlavor::Kill | TerminationFlavor::KillAndRestart => KILLED_INVOCATION_ERROR, + TerminationFlavor::Cancel | TerminationFlavor::CancelAndRestart => { + CANCELED_INVOCATION_ERROR + } }; let ScheduledInvocation { @@ -1145,7 +1308,8 @@ impl StateMachine { &mut self, ctx: &mut StateMachineApplyContext<'_, State>, invocation_id: InvocationId, - metadata: InFlightInvocationMetadata, + mut metadata: InFlightInvocationMetadata, + restart: bool, ) -> Result<(), Error> { self.kill_child_invocations(ctx, &invocation_id, metadata.journal_metadata.length) .await?; @@ -1160,11 +1324,17 @@ impl StateMachine { "Effect: Store killed invocation" ); + metadata.restart_when_completed = restart; ctx.storage .put_invocation_status(&invocation_id, &InvocationStatus::Killed(metadata)) .await; Self::do_send_abort_invocation_to_invoker(ctx, invocation_id, true); } else { + if restart { + // Kill and restart won't work without ExperimentalFeature::InvocationStatusKilled + warn!("Ignoring the kill and restart command for '{invocation_id}' and simply executing kill, as this command is not implemented yet") + } + self.end_invocation( ctx, invocation_id, @@ -1190,11 +1360,16 @@ impl StateMachine { &mut self, ctx: &mut StateMachineApplyContext<'_, State>, invocation_id: InvocationId, - metadata: InFlightInvocationMetadata, + mut metadata: InFlightInvocationMetadata, + restart: bool, ) -> Result<(), Error> { self.kill_child_invocations(ctx, &invocation_id, metadata.journal_metadata.length) .await?; + if restart { + metadata.restart_when_completed = true + } + // No need to go through the Killed state when we're suspended, // because it means we already got a terminal state from the invoker. self.end_invocation( @@ -1683,6 +1858,12 @@ impl StateMachine { // If given, this will override any Output Entry available in the journal table response_result_override: Option, ) -> Result<(), Error> { + if invocation_metadata.restart_when_completed { + return self + .restart_invocation(ctx, invocation_id, invocation_metadata) + .await; + } + let invocation_target = invocation_metadata.invocation_target.clone(); let journal_length = invocation_metadata.journal_metadata.length; let completion_retention_time = invocation_metadata.completion_retention_duration; @@ -1761,6 +1942,49 @@ impl StateMachine { Ok(()) } + async fn restart_invocation< + State: InboxTable + + VirtualObjectStatusTable + + JournalTable + + OutboxTable + + FsmTable + + InvocationStatusTable + + StateTable, + >( + &mut self, + ctx: &mut StateMachineApplyContext<'_, State>, + invocation_id: InvocationId, + mut invocation_metadata: InFlightInvocationMetadata, + ) -> Result<(), Error> { + info!("Restarting invocation"); + + // We need to cleanup the journal except the first item. + let journal_length = invocation_metadata.journal_metadata.length; + debug_if_leader!( + ctx.is_leader, + restate.journal.length = journal_length, + "Effect: Drop journal except first entry" + ); + ctx.storage + .delete_journal_range(&invocation_id, 1..journal_length) + .await; + + // Let's reset a bunch of parameters in the InFlightInvocationMetadata + invocation_metadata.journal_metadata.length = 1; + invocation_metadata.pinned_deployment = None; + invocation_metadata.restart_when_completed = false; + + Self::invoke( + ctx, + invocation_id, + invocation_metadata, + InvokeInputJournal::NoCachedJournal, + ) + .await?; + + Ok(()) + } + #[allow(clippy::too_many_arguments)] async fn send_response_to_sinks( &mut self, From 2fc253f4f33e0bbbb865791f0197e9c13d8266fd Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Fri, 22 Nov 2024 12:36:48 +0100 Subject: [PATCH 5/8] Expose the cancel and retry/kill and retry commands in the CLI --- cli/src/clients/admin_interface.rs | 21 ++++++++++++++++++--- cli/src/commands/invocations/cancel.rs | 18 ++++++++++++------ 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/cli/src/clients/admin_interface.rs b/cli/src/clients/admin_interface.rs index 8cf91859d..2fcdbecbd 100644 --- a/cli/src/clients/admin_interface.rs +++ b/cli/src/clients/admin_interface.rs @@ -41,7 +41,12 @@ pub trait AdminClientInterface { async fn purge_invocation(&self, id: &str) -> reqwest::Result>; - async fn cancel_invocation(&self, id: &str, kill: bool) -> reqwest::Result>; + async fn cancel_invocation( + &self, + id: &str, + kill: bool, + retry: bool, + ) -> reqwest::Result>; async fn patch_state( &self, @@ -132,7 +137,12 @@ impl AdminClientInterface for AdminClient { self.run(reqwest::Method::DELETE, url).await } - async fn cancel_invocation(&self, id: &str, kill: bool) -> reqwest::Result> { + async fn cancel_invocation( + &self, + id: &str, + kill: bool, + retry: bool, + ) -> reqwest::Result> { let mut url = self .base_url .join(&format!("/invocations/{id}")) @@ -140,7 +150,12 @@ impl AdminClientInterface for AdminClient { url.set_query(Some(&format!( "mode={}", - if kill { "kill" } else { "cancel" } + match (kill, retry) { + (false, false) => "cancel", + (false, true) => "cancel-and-restart", + (true, false) => "kill", + (true, true) => "kill-and-restart", + } ))); self.run(reqwest::Method::DELETE, url).await diff --git a/cli/src/commands/invocations/cancel.rs b/cli/src/commands/invocations/cancel.rs index 8cd8586e0..d026811e5 100644 --- a/cli/src/commands/invocations/cancel.rs +++ b/cli/src/commands/invocations/cancel.rs @@ -37,6 +37,9 @@ pub struct Cancel { /// Ungracefully kill the invocation and its children #[clap(long)] kill: bool, + /// After cancelling/killing, restart the invocation using the same input. + #[clap(long, alias = "retry")] + restart: bool, } pub async fn run_cancel(State(env): State, opts: &Cancel) -> Result<()> { @@ -67,16 +70,19 @@ pub async fn run_cancel(State(env): State, opts: &Cancel) -> Result<()> // Get the invocation and confirm let prompt = format!( "Are you sure you want to {} these invocations?", - if opts.kill { - Styled(Style::Danger, "kill") - } else { - Styled(Style::Warn, "cancel") - }, + match (opts.kill, opts.restart) { + (false, false) => Styled(Style::Warn, "cancel"), + (false, true) => Styled(Style::Warn, "cancel and restart"), + (true, false) => Styled(Style::Danger, "kill"), + (true, true) => Styled(Style::Danger, "kill and restart"), + } ); confirm_or_exit(&prompt)?; for inv in invocations { - let result = client.cancel_invocation(&inv.id, opts.kill).await?; + let result = client + .cancel_invocation(&inv.id, opts.kill, opts.restart) + .await?; let _ = result.success_or_error()?; } From fef23e458f2c688e80e666edc44efd6349866397 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Fri, 22 Nov 2024 14:03:02 +0100 Subject: [PATCH 6/8] Add unit tests --- .../state_machine/tests/kill_cancel.rs | 284 ++++++++++++++++++ .../partition/state_machine/tests/matchers.rs | 13 + 2 files changed, 297 insertions(+) diff --git a/crates/worker/src/partition/state_machine/tests/kill_cancel.rs b/crates/worker/src/partition/state_machine/tests/kill_cancel.rs index 81675c136..a2c48764f 100644 --- a/crates/worker/src/partition/state_machine/tests/kill_cancel.rs +++ b/crates/worker/src/partition/state_machine/tests/kill_cancel.rs @@ -10,6 +10,7 @@ use super::{fixtures, matchers, *}; +use crate::partition::state_machine::tests::matchers::{invoked, killed, suspended}; use assert2::assert; use assert2::let_assert; use googletest::any; @@ -157,6 +158,7 @@ async fn terminate_scheduled_invocation( response: eq(IngressResponseResult::Failure(match termination_flavor { TerminationFlavor::Kill => KILLED_INVOCATION_ERROR, TerminationFlavor::Cancel => CANCELED_INVOCATION_ERROR, + _ => panic!("Unexpected termination flavor"), })) })) ); @@ -172,6 +174,288 @@ async fn terminate_scheduled_invocation( Ok(()) } +#[rstest] +#[case(ExperimentalFeature::InvocationStatusKilled.into(), TerminationFlavor::KillAndRestart)] +#[case(EnumSet::empty(), TerminationFlavor::KillAndRestart)] +#[case(EnumSet::empty(), TerminationFlavor::CancelAndRestart)] +#[tokio::test] +async fn terminate_and_restart_scheduled_invocation_has_no_effect( + #[case] experimental_features: EnumSet, + #[case] termination_flavor: TerminationFlavor, +) -> anyhow::Result<()> { + let mut test_env = TestEnv::create_with_experimental_features(experimental_features).await; + + let invocation_id = InvocationId::mock_random(); + + let _ = test_env + .apply(Command::Invoke(ServiceInvocation { + invocation_id, + execution_time: Some(MillisSinceEpoch::MAX), + ..ServiceInvocation::mock() + })) + .await; + + // assert that inboxed invocation is in invocation_status + let scheduled_invocation_status = test_env + .storage() + .get_invocation_status(&invocation_id) + .await?; + assert!(let InvocationStatus::Scheduled(_) = scheduled_invocation_status); + + // This has no effect + let actions = test_env + .apply(Command::TerminateInvocation(InvocationTermination { + invocation_id, + flavor: termination_flavor, + })) + .await; + assert_that!(actions, empty()); + + // Invocation status didn't change at all + assert_eq!( + scheduled_invocation_status, + test_env + .storage() + .get_invocation_status(&invocation_id) + .await + .unwrap() + ); + + test_env.shutdown().await; + Ok(()) +} + +#[rstest] +#[case(ExperimentalFeature::InvocationStatusKilled.into(), TerminationFlavor::KillAndRestart)] +#[case(EnumSet::empty(), TerminationFlavor::KillAndRestart)] +#[case(EnumSet::empty(), TerminationFlavor::CancelAndRestart)] +#[tokio::test] +async fn terminate_and_restart_inboxed_invocation_has_no_effect( + #[case] experimental_features: EnumSet, + #[case] termination_flavor: TerminationFlavor, +) -> anyhow::Result<()> { + let mut test_env = TestEnv::create_with_experimental_features(experimental_features).await; + + let invocation_target = InvocationTarget::mock_virtual_object(); + let invocation_id = InvocationId::mock_generate(&invocation_target); + + // First invocation takes the lock + let _ = test_env + .apply(Command::Invoke(ServiceInvocation { + invocation_id: InvocationId::mock_generate(&invocation_target), + invocation_target: invocation_target.clone(), + ..ServiceInvocation::mock() + })) + .await; + + // This invocation will be inboxed + let _ = test_env + .apply(Command::Invoke(ServiceInvocation { + invocation_id, + invocation_target: invocation_target.clone(), + ..ServiceInvocation::mock() + })) + .await; + + // assert that inboxed invocation is in invocation_status + let inboxed_invocation_status = test_env + .storage() + .get_invocation_status(&invocation_id) + .await?; + assert!(let InvocationStatus::Inboxed(_) = inboxed_invocation_status); + + // This has no effect + let actions = test_env + .apply(Command::TerminateInvocation(InvocationTermination { + invocation_id, + flavor: termination_flavor, + })) + .await; + assert_that!(actions, empty()); + + // Invocation status didn't change at all + assert_eq!( + inboxed_invocation_status, + test_env + .storage() + .get_invocation_status(&invocation_id) + .await + .unwrap() + ); + + test_env.shutdown().await; + Ok(()) +} + +#[tokio::test] +async fn kill_and_restart_when_invoked() -> anyhow::Result<()> { + let mut test_env = TestEnv::create_with_experimental_features( + ExperimentalFeature::InvocationStatusKilled.into(), + ) + .await; + + let invocation_id = InvocationId::mock_random(); + + // Start invocation and pin the deployment and add one entry (doesn't matter which one) + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + ..ServiceInvocation::mock() + }), + Command::InvokerEffect(InvokerEffect { + invocation_id, + kind: InvokerEffectKind::PinnedDeployment(PinnedDeployment { + deployment_id: Default::default(), + service_protocol_version: Default::default(), + }), + }), + Command::InvokerEffect(InvokerEffect { + invocation_id, + kind: InvokerEffectKind::JournalEntry { + entry_index: 1, + entry: ProtobufRawEntryCodec::serialize_enriched(Entry::ClearAllState), + }, + }), + ]) + .await; + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await?, + invoked() + ); + + // First we should transition to killed status + let _ = test_env + .apply(Command::TerminateInvocation(InvocationTermination { + invocation_id, + flavor: TerminationFlavor::KillAndRestart, + })) + .await; + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await?, + killed() + ); + + // Now send the Failed invoker effect to complete the kill procedure + let actions = test_env + .apply(Command::InvokerEffect(InvokerEffect { + invocation_id, + kind: InvokerEffectKind::Failed(KILLED_INVOCATION_ERROR), + })) + .await; + + // Should have restarted the invocation + assert_that!( + actions, + contains(matchers::actions::invoke_for_id(invocation_id)) + ); + + // We should be back to invoked state with a reset journal and reset deployment id + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await?, + pat!(InvocationStatus::Invoked(pat!( + InFlightInvocationMetadata { + journal_metadata: pat!(JournalMetadata { length: eq(1) }), + pinned_deployment: none(), + restart_when_completed: eq(false) + } + ))) + ); + + test_env.shutdown().await; + Ok(()) +} + +#[tokio::test] +async fn kill_and_restart_when_suspended() -> anyhow::Result<()> { + let mut test_env = TestEnv::create_with_experimental_features( + ExperimentalFeature::InvocationStatusKilled.into(), + ) + .await; + + let invocation_id = InvocationId::mock_random(); + + // Start invocation, pin the deployment, add one completable entry, suspend + let _ = test_env + .apply_multiple([ + Command::Invoke(ServiceInvocation { + invocation_id, + ..ServiceInvocation::mock() + }), + Command::InvokerEffect(InvokerEffect { + invocation_id, + kind: InvokerEffectKind::PinnedDeployment(PinnedDeployment { + deployment_id: Default::default(), + service_protocol_version: Default::default(), + }), + }), + Command::InvokerEffect(InvokerEffect { + invocation_id, + kind: InvokerEffectKind::JournalEntry { + entry_index: 1, + entry: ProtobufRawEntryCodec::serialize_enriched(Entry::Awakeable( + AwakeableEntry { result: None }, + )), + }, + }), + Command::InvokerEffect(InvokerEffect { + invocation_id, + kind: InvokerEffectKind::Suspended { + waiting_for_completed_entries: [1].into(), + }, + }), + ]) + .await; + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await?, + suspended() + ); + + // This should immediately restart + let actions = test_env + .apply(Command::TerminateInvocation(InvocationTermination { + invocation_id, + flavor: TerminationFlavor::KillAndRestart, + })) + .await; + + // Should have restarted the invocation + assert_that!( + actions, + contains(matchers::actions::invoke_for_id(invocation_id)) + ); + + // We should be back to invoked state with a reset journal and reset deployment id + assert_that!( + test_env + .storage() + .get_invocation_status(&invocation_id) + .await?, + pat!(InvocationStatus::Invoked(pat!( + InFlightInvocationMetadata { + journal_metadata: pat!(JournalMetadata { length: eq(1) }), + pinned_deployment: none(), + restart_when_completed: eq(false) + } + ))) + ); + + test_env.shutdown().await; + Ok(()) +} + #[rstest] #[case(ExperimentalFeature::InvocationStatusKilled.into())] #[case(EnumSet::empty())] diff --git a/crates/worker/src/partition/state_machine/tests/matchers.rs b/crates/worker/src/partition/state_machine/tests/matchers.rs index eee3267af..9d84d3eef 100644 --- a/crates/worker/src/partition/state_machine/tests/matchers.rs +++ b/crates/worker/src/partition/state_machine/tests/matchers.rs @@ -11,6 +11,7 @@ use bytes::Bytes; use bytestring::ByteString; use googletest::prelude::*; +use restate_storage_api::invocation_status_table::InvocationStatus; use restate_storage_api::timer_table::{TimerKey, TimerKeyKind}; use restate_types::errors::codes; use restate_types::identifiers::EntryIndex; @@ -152,6 +153,18 @@ pub mod outbox { } } +pub fn invoked() -> impl Matcher { + pat!(InvocationStatus::Invoked { .. }) +} + +pub fn suspended() -> impl Matcher { + pat!(InvocationStatus::Suspended { .. }) +} + +pub fn killed() -> impl Matcher { + pat!(InvocationStatus::Killed { .. }) +} + pub fn completion( entry_index: EntryIndex, completion_result: CompletionResult, From 527f61b674226b3d1780b602aaba39c11f1b3a15 Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Fri, 6 Dec 2024 10:25:39 +0100 Subject: [PATCH 7/8] Rebase --- .../src/partition/state_machine/tests/kill_cancel.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/worker/src/partition/state_machine/tests/kill_cancel.rs b/crates/worker/src/partition/state_machine/tests/kill_cancel.rs index a2c48764f..f24f078e6 100644 --- a/crates/worker/src/partition/state_machine/tests/kill_cancel.rs +++ b/crates/worker/src/partition/state_machine/tests/kill_cancel.rs @@ -178,7 +178,7 @@ async fn terminate_scheduled_invocation( #[case(ExperimentalFeature::InvocationStatusKilled.into(), TerminationFlavor::KillAndRestart)] #[case(EnumSet::empty(), TerminationFlavor::KillAndRestart)] #[case(EnumSet::empty(), TerminationFlavor::CancelAndRestart)] -#[tokio::test] +#[restate_core::test] async fn terminate_and_restart_scheduled_invocation_has_no_effect( #[case] experimental_features: EnumSet, #[case] termination_flavor: TerminationFlavor, @@ -229,7 +229,7 @@ async fn terminate_and_restart_scheduled_invocation_has_no_effect( #[case(ExperimentalFeature::InvocationStatusKilled.into(), TerminationFlavor::KillAndRestart)] #[case(EnumSet::empty(), TerminationFlavor::KillAndRestart)] #[case(EnumSet::empty(), TerminationFlavor::CancelAndRestart)] -#[tokio::test] +#[restate_core::test] async fn terminate_and_restart_inboxed_invocation_has_no_effect( #[case] experimental_features: EnumSet, #[case] termination_flavor: TerminationFlavor, @@ -287,7 +287,7 @@ async fn terminate_and_restart_inboxed_invocation_has_no_effect( Ok(()) } -#[tokio::test] +#[restate_core::test] async fn kill_and_restart_when_invoked() -> anyhow::Result<()> { let mut test_env = TestEnv::create_with_experimental_features( ExperimentalFeature::InvocationStatusKilled.into(), @@ -375,7 +375,7 @@ async fn kill_and_restart_when_invoked() -> anyhow::Result<()> { Ok(()) } -#[tokio::test] +#[restate_core::test] async fn kill_and_restart_when_suspended() -> anyhow::Result<()> { let mut test_env = TestEnv::create_with_experimental_features( ExperimentalFeature::InvocationStatusKilled.into(), From 71f1cd47dcdb5c008bf3909a223b6feb92a94bfb Mon Sep 17 00:00:00 2001 From: slinkydeveloper Date: Thu, 19 Dec 2024 09:17:28 +0100 Subject: [PATCH 8/8] Plain kill overrides cancel and retry/kill and retry --- crates/worker/src/partition/state_machine/mod.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/worker/src/partition/state_machine/mod.rs b/crates/worker/src/partition/state_machine/mod.rs index 8c0f4879f..a2c6baa7b 100644 --- a/crates/worker/src/partition/state_machine/mod.rs +++ b/crates/worker/src/partition/state_machine/mod.rs @@ -913,10 +913,21 @@ impl StateMachine { ) .await? } - InvocationStatus::Killed(_) => { + InvocationStatus::Killed(mut in_flight_invocation_metadata) => { trace!("Received kill command for an already killed invocation with id '{invocation_id}'."); // Nothing to do here really, let's send again the abort signal to the invoker just in case Self::do_send_abort_invocation_to_invoker(ctx, invocation_id, true); + + if in_flight_invocation_metadata.restart_when_completed { + // Reset restart when completed flag + in_flight_invocation_metadata.restart_when_completed = false; + ctx.storage + .put_invocation_status( + &invocation_id, + &InvocationStatus::Killed(in_flight_invocation_metadata), + ) + .await; + } } InvocationStatus::Completed(_) => { debug!("Received kill command for completed invocation '{invocation_id}'. To cleanup the invocation after it's been completed, use the purge invocation command.");