diff --git a/cmon/src/main.rs b/cmon/src/main.rs index 4f3fe1952..1f3595808 100644 --- a/cmon/src/main.rs +++ b/cmon/src/main.rs @@ -41,7 +41,6 @@ enum DtraceDisplay { Replaced, ExtentLiveRepair, ExtentLimit, - Backpressure, NextJobId, JobDelta, DsDelay, @@ -61,7 +60,6 @@ impl fmt::Display for DtraceDisplay { DtraceDisplay::Replaced => write!(f, "replaced"), DtraceDisplay::ExtentLiveRepair => write!(f, "extent_live_repair"), DtraceDisplay::ExtentLimit => write!(f, "extent_under_repair"), - DtraceDisplay::Backpressure => write!(f, "backpressure"), DtraceDisplay::NextJobId => write!(f, "next_job_id"), DtraceDisplay::JobDelta => write!(f, "job_delta"), DtraceDisplay::DsDelay => write!(f, "ds_delay"), @@ -229,9 +227,6 @@ fn print_dtrace_header(dd: &[DtraceDisplay]) { DtraceDisplay::ExtentLimit => { print!(" {:>4}", "EXTL"); } - DtraceDisplay::Backpressure => { - print!(" {:>5}", "BAKPR"); - } DtraceDisplay::NextJobId => { print!(" {:>7}", "NEXTJOB"); } @@ -348,9 +343,6 @@ fn print_dtrace_row(d_out: Arg, dd: &[DtraceDisplay], last_job_id: &mut u64) { DtraceDisplay::ExtentLimit => { print!(" {:4}", d_out.ds_extent_limit); } - DtraceDisplay::Backpressure => { - print!(" {:>5}", d_out.up_backpressure); - } DtraceDisplay::NextJobId => { print!(" {:>7}", d_out.next_job_id); } diff --git a/tools/dtrace/get-ds-state.d b/tools/dtrace/get-ds-state.d new file mode 100644 index 000000000..92aaf1dd5 --- /dev/null +++ b/tools/dtrace/get-ds-state.d @@ -0,0 +1,23 @@ +/* + * Print a status line for all matching probes. + * Exit after 5 seconds. + */ +#pragma D option quiet +#pragma D option strsize=1k + +crucible_upstairs*:::up-status +{ + my_sesh = json(copyinstr(arg1), "ok.session_id"); + + printf("%6d %8s %17s %17s %17s\n", + pid, + substr(my_sesh, 0, 8), + json(copyinstr(arg1), "ok.ds_state[0]"), + json(copyinstr(arg1), "ok.ds_state[1]"), + json(copyinstr(arg1), "ok.ds_state[2]")); +} + +tick-5s +{ + exit(0); +} diff --git a/tools/dtrace/get-ds-state.sh b/tools/dtrace/get-ds-state.sh index 7662c5fc1..5f5a13490 100755 --- a/tools/dtrace/get-ds-state.sh +++ b/tools/dtrace/get-ds-state.sh @@ -1,9 +1,16 @@ #!/bin/bash # -# This script will display the downstairs states for any propolis zones it -# finds running on a system. -for zzz in $(zoneadm list | grep propolis); do - echo -n "$zzz " - ppid=$(zlogin "$zzz" pgrep propolis-server) - dtrace -xstrsize=1k -p $ppid -q -n 'crucible_upstairs*:::up-status { printf("%6d %17s %17s %17s", pid, json(copyinstr(arg1), "ok.ds_state[0]"), json(copyinstr(arg1), "ok.ds_state[1]"), json(copyinstr(arg1), "ok.ds_state[2]")); exit(0); }' -done +# This script will display the downstairs states for each pid/session +# it finds running on a system. +filename='/tmp/get-ds-state.out' + +# Gather state on all running propolis servers, record summary to a file +dtrace -s /opt/oxide/crucible_dtrace/get-ds-state.d | sort -n | uniq | awk 'NF' > "$filename" +# Walk the lines in the file, append the zone name to each line. +while read -r p; do + # For each line in the file, pull out the PID we are looking at and + # print the zone that process is running in. + pid=$(echo $p | awk '{print $1}') + zone=$(ps -o zone -p $pid | tail -1 | cut -c 1-28) + echo "$zone $p" +done < "$filename" diff --git a/tools/dtrace/get-lr-state.d b/tools/dtrace/get-lr-state.d new file mode 100755 index 000000000..983628021 --- /dev/null +++ b/tools/dtrace/get-lr-state.d @@ -0,0 +1,26 @@ +/* + * Print a live repair status line for all matching probes. + * Exit after 5 seconds. 
+ */ +#pragma D option quiet +#pragma D option strsize=1k + +crucible_upstairs*:::up-status +{ + my_sesh = json(copyinstr(arg1), "ok.session_id"); + + printf("%6d %8s %s %s %s %s %s %s\n", + pid, + substr(my_sesh, 0, 8), + json(copyinstr(arg1), "ok.ds_live_repair_completed[0]"), + json(copyinstr(arg1), "ok.ds_live_repair_completed[1]"), + json(copyinstr(arg1), "ok.ds_live_repair_completed[2]"), + json(copyinstr(arg1), "ok.ds_live_repair_aborted[0]"), + json(copyinstr(arg1), "ok.ds_live_repair_aborted[1]"), + json(copyinstr(arg1), "ok.ds_live_repair_aborted[2]")); +} + +tick-5s +{ + exit(0); +} diff --git a/tools/dtrace/get-lr-state.sh b/tools/dtrace/get-lr-state.sh index 690e20438..5af032379 100755 --- a/tools/dtrace/get-lr-state.sh +++ b/tools/dtrace/get-lr-state.sh @@ -1,9 +1,16 @@ #!/bin/bash -# -# This script will log into every propolis zone it finds and get the -# DTrace live repair counters from propolis-server in each zone. -for zzz in $(zoneadm list | grep propolis); do - echo -n "$zzz " - ppid=$(zlogin "$zzz" pgrep propolis-server) - dtrace -xstrsize=1k -p $ppid -q -n 'crucible_upstairs*:::up-status { printf("%6d %s %s %s %s %s %s", pid, json(copyinstr(arg1), "ok.ds_live_repair_completed[0]"), json(copyinstr(arg1), "ok.ds_live_repair_completed[1]"), json(copyinstr(arg1), "ok.ds_live_repair_completed[2]"), json(copyinstr(arg1), "ok.ds_live_repair_aborted[0]"), json(copyinstr(arg1), "ok.ds_live_repair_aborted[1]"), json(copyinstr(arg1), "ok.ds_live_repair_aborted[2]")); exit(0); }' -done + +# This script will display the downstairs live repair for each +# pid/session it finds running on a system. +filename='/tmp/get-lr-state.out' + +# Gather state on all running propolis servers, record summary to a file +dtrace -s /opt/oxide/crucible_dtrace/get-lr-state.d | sort -n | uniq | awk 'NF' > "$filename" +# Walk the lines in the file, append the zone name to each line. +while read -r p; do + # For each line in the file, pull out the PID we are looking at and + # print the zone that process is running in. + pid=$(echo $p | awk '{print $1}') + zone=$(ps -o zone -p $pid | tail -1 | cut -c 1-28) + echo "$zone $p" +done < "$filename" diff --git a/tools/dtrace/single_up_info.d b/tools/dtrace/single_up_info.d index 0eb1f4e1b..476edb8eb 100755 --- a/tools/dtrace/single_up_info.d +++ b/tools/dtrace/single_up_info.d @@ -41,7 +41,7 @@ crucible_upstairs*:::up-status * I'm not very happy about this, but if we don't print it all on one * line, then multiple sessions will clobber each others output. */ - printf("%8s %17s %17s %17s %5s %5s %9s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", + printf("%8s %17s %17s %17s %5s %5s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", substr(session_id, 0, 8), @@ -62,7 +62,6 @@ crucible_upstairs*:::up-status * Job ID delta and backpressure */ json(copyinstr(arg1), "ok.next_job_id"), - json(copyinstr(arg1), "ok.up_backpressure"), json(copyinstr(arg1), "ok.write_bytes_out"), /* diff --git a/tools/dtrace/sled_upstairs_info.d b/tools/dtrace/sled_upstairs_info.d index 277b8757e..a9366992d 100755 --- a/tools/dtrace/sled_upstairs_info.d +++ b/tools/dtrace/sled_upstairs_info.d @@ -46,8 +46,7 @@ crucible_upstairs*:::up-status * we don't print it all on one line, then multiple sessions will * clobber each others output. 
*/ - printf("%5d %8s %17s %17s %17s %5s %5s %9s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", - + printf("%5d %8s %17s %17s %17s %5s %5s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", pid, substr(session_id, 0, 8), @@ -62,7 +61,6 @@ crucible_upstairs*:::up-status /* Job ID and backpressure */ json(copyinstr(arg1), "ok.next_job_id"), - json(copyinstr(arg1), "ok.up_backpressure"), json(copyinstr(arg1), "ok.write_bytes_out"), /* In progress jobs on the work list for each downstairs */ diff --git a/tools/dtrace/upstairs_info.d b/tools/dtrace/upstairs_info.d index 64294fc2c..5cb4fc178 100755 --- a/tools/dtrace/upstairs_info.d +++ b/tools/dtrace/upstairs_info.d @@ -21,7 +21,7 @@ tick-1s { printf("%6s ", "PID"); printf("%17s %17s %17s", "DS STATE 0", "DS STATE 1", "DS STATE 2"); - printf(" %5s %5s %9s %5s", "UPW", "DSW", "JOBID", "BAKPR"); + printf(" %5s %5s %9s", "UPW", "DSW", "JOBID"); printf(" %10s", "WRITE_BO"); printf(" %5s %5s %5s", "IP0", "IP1", "IP2"); printf(" %5s %5s %5s", "D0", "D1", "D2"); @@ -49,10 +49,9 @@ crucible_upstairs*:::up-status printf(" %5s", json(copyinstr(arg1), "ok.ds_count")); /* - * Job ID and backpressure + * Job ID and outstanding bytes */ printf(" %9s", json(copyinstr(arg1), "ok.next_job_id")); - printf(" %5s", json(copyinstr(arg1), "ok.up_backpressure")); printf(" %10s", json(copyinstr(arg1), "ok.write_bytes_out")); /* diff --git a/tools/make-dtrace.sh b/tools/make-dtrace.sh index 8b1324234..d1c4aba64 100755 --- a/tools/make-dtrace.sh +++ b/tools/make-dtrace.sh @@ -23,7 +23,9 @@ tar cvf ../../out/crucible-dtrace.tar \ README.md \ all_downstairs.d \ downstairs_count.d \ + get-ds-state.d \ get-ds-state.sh \ + get-lr-state.d \ get-lr-state.sh \ perf-downstairs-os.d \ perf-downstairs-three.d \ diff --git a/upstairs/src/backpressure.rs b/upstairs/src/backpressure.rs deleted file mode 100644 index feaf4a33f..000000000 --- a/upstairs/src/backpressure.rs +++ /dev/null @@ -1,375 +0,0 @@ -// Copyright 2024 Oxide Computer Company - -use crate::{IOop, IO_OUTSTANDING_MAX_BYTES, IO_OUTSTANDING_MAX_JOBS}; -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::Duration, -}; - -/// Helper struct to contain a set of backpressure counters -#[derive(Debug)] -pub struct BackpressureCounters(Arc); - -/// Inner data structure for individual backpressure counters -#[derive(Debug)] -struct BackpressureCountersInner { - /// Number of bytes from `Write` and `WriteUnwritten` operations - /// - /// This value is used for global backpressure, to avoid buffering too many - /// writes (which otherwise return immediately, and are not persistent until - /// a flush) - write_bytes: AtomicU64, - - /// Number of jobs in the queue - /// - /// This value is also used for global backpressure - // XXX should we only count write jobs here? Or should we also count read - // bytes for global backpressure? Much to ponder... - jobs: AtomicU64, - - /// Number of bytes from `Write`, `WriteUnwritten`, and `Read` operations - /// - /// This value is used for local backpressure, to keep the 3x Downstairs - /// roughly in sync. Otherwise, the fastest Downstairs will answer all read - /// requests, and the others can get arbitrarily far behind. 
- io_bytes: AtomicU64, -} - -/// Guard to automatically decrement backpressure bytes when dropped -#[derive(Debug)] -pub struct BackpressureGuard { - counter: Arc, - write_bytes: u64, - io_bytes: u64, - // There's also an implicit "1 job" here -} - -impl Drop for BackpressureGuard { - fn drop(&mut self) { - self.counter - .write_bytes - .fetch_sub(self.write_bytes, Ordering::Relaxed); - self.counter - .io_bytes - .fetch_sub(self.io_bytes, Ordering::Relaxed); - self.counter.jobs.fetch_sub(1, Ordering::Relaxed); - } -} - -impl BackpressureGuard { - #[cfg(test)] - pub fn dummy() -> Self { - let counter = Arc::new(BackpressureCountersInner { - write_bytes: 0.into(), - io_bytes: 0.into(), - jobs: 1.into(), - }); - Self { - counter, - write_bytes: 0, - io_bytes: 0, - } - } -} - -impl BackpressureCounters { - pub fn new() -> Self { - Self(Arc::new(BackpressureCountersInner { - write_bytes: AtomicU64::new(0), - io_bytes: AtomicU64::new(0), - jobs: AtomicU64::new(0), - })) - } - - pub fn get_write_bytes(&self) -> u64 { - self.0.write_bytes.load(Ordering::Relaxed) - } - - pub fn get_io_bytes(&self) -> u64 { - self.0.io_bytes.load(Ordering::Relaxed) - } - - pub fn get_jobs(&self) -> u64 { - self.0.jobs.load(Ordering::Relaxed) - } - - /// Stores write / IO bytes (and 1 job) for a pending write - #[must_use] - pub fn early_write_increment(&mut self, bytes: u64) -> BackpressureGuard { - self.0.write_bytes.fetch_add(bytes, Ordering::Relaxed); - self.0.io_bytes.fetch_add(bytes, Ordering::Relaxed); - self.0.jobs.fetch_add(1, Ordering::Relaxed); - BackpressureGuard { - counter: self.0.clone(), - write_bytes: bytes, - io_bytes: bytes, - // implicit 1 job - } - } - - /// Stores write / IO bytes (and 1 job) in the backpressure counters - #[must_use] - pub fn increment(&mut self, io: &IOop) -> BackpressureGuard { - let write_bytes = io.write_bytes(); - let io_bytes = io.job_bytes(); - self.0.write_bytes.fetch_add(write_bytes, Ordering::Relaxed); - self.0.io_bytes.fetch_add(io_bytes, Ordering::Relaxed); - self.0.jobs.fetch_add(1, Ordering::Relaxed); - BackpressureGuard { - counter: self.0.clone(), - write_bytes, - io_bytes, - // implicit 1 job - } - } -} - -/// Configuration for host-side backpressure -/// -/// Backpressure adds an artificial delay to host write messages (which are -/// otherwise acked immediately, before actually being complete). The delay is -/// varied based on two metrics: -/// -/// - number of write bytes outstanding -/// - queue length (in jobs) -/// -/// We compute backpressure delay based on both metrics, then pick the larger of -/// the two delays. -#[derive(Copy, Clone, Debug)] -pub struct BackpressureConfig { - pub bytes: BackpressureChannelConfig, - pub queue: BackpressureChannelConfig, -} - -impl Default for BackpressureConfig { - fn default() -> BackpressureConfig { - // `max_value` values below must be higher than `IO_OUTSTANDING_MAX_*`; - // otherwise, replaying jobs to a previously-Offline Downstairs could - // immediately kick us into the saturated regime, which would be - // unfortunate. 
- BackpressureConfig { - // Byte-based backpressure - bytes: BackpressureChannelConfig { - start_value: 50 * 1024u64.pow(2), // 50 MiB - max_value: IO_OUTSTANDING_MAX_BYTES * 2, - delay_scale: Duration::from_millis(100), - delay_max: Duration::from_millis(30_000), - }, - - // Queue-based backpressure - queue: BackpressureChannelConfig { - start_value: 500, - max_value: IO_OUTSTANDING_MAX_JOBS as u64 * 2, - delay_scale: Duration::from_millis(5), - delay_max: Duration::from_millis(30_000), - }, - } - } -} - -#[derive(Copy, Clone, Debug)] -pub struct BackpressureChannelConfig { - /// When should backpressure start - pub start_value: u64, - /// Value at which backpressure is saturated - pub max_value: u64, - - /// Characteristic scale of backpressure - /// - /// This scale sets the backpressure delay halfway between `start`_value and - /// `max_value` - pub delay_scale: Duration, - - /// Maximum delay (returned at `max_value`) - pub delay_max: Duration, -} - -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub enum BackpressureAmount { - Duration(Duration), - Saturated, -} - -impl std::cmp::Ord for BackpressureAmount { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match (self, other) { - (BackpressureAmount::Saturated, BackpressureAmount::Saturated) => { - std::cmp::Ordering::Equal - } - (BackpressureAmount::Saturated, _) => std::cmp::Ordering::Greater, - (_, BackpressureAmount::Saturated) => std::cmp::Ordering::Less, - ( - BackpressureAmount::Duration(a), - BackpressureAmount::Duration(b), - ) => a.cmp(b), - } - } -} - -impl std::cmp::PartialOrd for BackpressureAmount { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl BackpressureAmount { - /// Converts to a delay in microseconds - /// - /// The saturated case is marked by `u64::MAX` - pub fn as_micros(&self) -> u64 { - match self { - BackpressureAmount::Duration(t) => t.as_micros() as u64, - BackpressureAmount::Saturated => u64::MAX, - } - } -} - -/// Helper `struct` to store a shared backpressure amount -/// -/// Under the hood, this stores a value in microseconds in an `AtomicU64`, so it -/// can be read and written atomically. `BackpressureAmount::Saturated` is -/// indicated by `u64::MAX`. -#[derive(Clone, Debug)] -pub struct SharedBackpressureAmount(Arc); - -impl SharedBackpressureAmount { - pub fn new() -> Self { - Self(Arc::new(AtomicU64::new(0))) - } - - pub fn store(&self, b: BackpressureAmount) { - let v = match b { - BackpressureAmount::Duration(d) => d.as_micros() as u64, - BackpressureAmount::Saturated => u64::MAX, - }; - self.0.store(v, Ordering::Relaxed); - } - - pub fn load(&self) -> BackpressureAmount { - match self.0.load(Ordering::Relaxed) { - u64::MAX => BackpressureAmount::Saturated, - delay_us => { - BackpressureAmount::Duration(Duration::from_micros(delay_us)) - } - } - } -} - -impl BackpressureChannelConfig { - fn get_backpressure(&self, value: u64) -> BackpressureAmount { - // Return a special value if we're saturated - if value >= self.max_value { - return BackpressureAmount::Saturated; - } - - // This ratio starts at 0 (at start_value) and hits 1 when backpressure - // should be maxed out. 
- let frac = value.saturating_sub(self.start_value) as f64 - / (self.max_value - self.start_value) as f64; - - let v = if frac < 0.5 { - frac * 2.0 - } else { - 1.0 / (2.0 * (1.0 - frac)) - }; - - BackpressureAmount::Duration( - self.delay_scale.mul_f64(v.powi(2)).min(self.delay_max), - ) - } -} - -impl BackpressureConfig { - pub fn get_backpressure( - &self, - bytes: u64, - jobs: u64, - ) -> BackpressureAmount { - let bp_bytes = self.bytes.get_backpressure(bytes); - let bp_queue = self.queue.get_backpressure(jobs); - bp_bytes.max(bp_queue) - } -} - -#[cfg(test)] -mod test { - use super::*; - - /// Confirm that replaying the max number of jobs / bytes will not saturate - #[test] - fn check_outstanding_backpressure() { - let cfg = BackpressureConfig::default(); - let BackpressureAmount::Duration(_) = - cfg.get_backpressure(IO_OUTSTANDING_MAX_BYTES, 0) - else { - panic!("backpressure saturated") - }; - - let BackpressureAmount::Duration(_) = - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64) - else { - panic!("backpressure saturated") - }; - } - - #[test] - fn check_max_backpressure() { - let cfg = BackpressureConfig::default(); - let BackpressureAmount::Duration(timeout) = cfg - .get_backpressure(IO_OUTSTANDING_MAX_BYTES * 2 - 1024u64.pow(2), 0) - else { - panic!("backpressure saturated") - }; - println!( - "max byte-based delay: {}", - humantime::format_duration(timeout) - ); - assert!( - timeout >= Duration::from_secs(30), - "max byte-based backpressure delay is too low; - expected >= 30 secs, got {}", - humantime::format_duration(timeout) - ); - - let BackpressureAmount::Duration(timeout) = - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64 * 2 - 1) - else { - panic!("backpressure saturated") - }; - println!( - "max job-based delay: {}", - humantime::format_duration(timeout) - ); - assert!( - timeout >= Duration::from_secs(30), - "max job-based backpressure delay is too low; - expected >= 30 secs, got {}", - humantime::format_duration(timeout) - ); - } - - #[test] - fn check_saturated_backpressure() { - let cfg = BackpressureConfig::default(); - assert!(matches!( - cfg.get_backpressure(IO_OUTSTANDING_MAX_BYTES * 2 + 1, 0), - BackpressureAmount::Saturated - )); - assert!(matches!( - cfg.get_backpressure(IO_OUTSTANDING_MAX_BYTES * 2, 0), - BackpressureAmount::Saturated - )); - - assert!(matches!( - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64 * 2 + 1), - BackpressureAmount::Saturated - )); - assert!(matches!( - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64 * 2), - BackpressureAmount::Saturated - )); - } -} diff --git a/upstairs/src/client.rs b/upstairs/src/client.rs index 15b36684c..59406787f 100644 --- a/upstairs/src/client.rs +++ b/upstairs/src/client.rs @@ -1,9 +1,9 @@ // Copyright 2023 Oxide Computer Company use crate::{ - backpressure::BackpressureCounters, cdt, integrity_hash, - live_repair::ExtentInfo, upstairs::UpstairsConfig, upstairs::UpstairsState, - ClientIOStateCount, ClientId, CrucibleDecoder, CrucibleError, DownstairsIO, - DsState, EncryptionContext, IOState, IOop, JobId, Message, RawReadResponse, + cdt, integrity_hash, io_limits::ClientIOLimits, live_repair::ExtentInfo, + upstairs::UpstairsConfig, upstairs::UpstairsState, ClientIOStateCount, + ClientId, CrucibleDecoder, CrucibleError, DownstairsIO, DsState, + EncryptionContext, IOState, IOop, JobId, Message, RawReadResponse, ReconcileIO, ReconcileIOState, RegionDefinitionStatus, RegionMetadata, }; use crucible_common::{x509::TLSContext, ExtentId, VerboseTimeout}; @@ -122,11 +122,8 @@ 
pub(crate) struct DownstairsClient { /// Number of bytes associated with each IO state io_state_byte_count: ClientIOStateCount, - /// Jobs, write bytes, and total IO bytes in this client's queue - /// - /// These values are used for both global and local (per-client) - /// backpressure. - pub(crate) backpressure_counters: BackpressureCounters, + /// Absolute IO limits for this client + io_limits: ClientIOLimits, /// UUID for this downstairs region /// @@ -200,6 +197,7 @@ impl DownstairsClient { client_id: ClientId, cfg: Arc, target_addr: Option, + io_limits: ClientIOLimits, log: Logger, tls_context: Option>, ) -> Self { @@ -216,6 +214,7 @@ impl DownstairsClient { &log, ), client_id, + io_limits, region_uuid: None, needs_replay: false, negotiation_state: NegotiationState::Start, @@ -232,7 +231,6 @@ impl DownstairsClient { repair_info: None, io_state_job_count: ClientIOStateCount::default(), io_state_byte_count: ClientIOStateCount::default(), - backpressure_counters: BackpressureCounters::new(), connection_id: ConnectionId(0), client_delay_us, } @@ -256,6 +254,10 @@ impl DownstairsClient { cfg, client_task: Self::new_dummy_task(false), client_id: ClientId::new(0), + io_limits: ClientIOLimits::new( + crate::IO_OUTSTANDING_MAX_JOBS * 3 / 2, + crate::IO_OUTSTANDING_MAX_BYTES as usize * 3 / 2, + ), region_uuid: None, needs_replay: false, negotiation_state: NegotiationState::Start, @@ -272,7 +274,6 @@ impl DownstairsClient { repair_info: None, io_state_job_count: ClientIOStateCount::default(), io_state_byte_count: ClientIOStateCount::default(), - backpressure_counters: BackpressureCounters::new(), connection_id: ConnectionId(0), client_delay_us, } @@ -359,19 +360,24 @@ impl DownstairsClient { // Update our bytes-in-flight counter if was_running && !is_running { // Because the job is no longer running, it shouldn't count for - // backpressure. Remove the backpressure guard for this client, - // which decrements backpressure counters on drop. - job.backpressure_guard.take(&self.client_id); - } else if is_running - && !was_running - && !job.backpressure_guard.contains(&self.client_id) - { - // This should only happen if a job is replayed, but that still - // counts! - job.backpressure_guard.insert( - self.client_id, - self.backpressure_counters.increment(&job.work), - ); + // backpressure or IO limits. Remove the backpressure guard for + // this client, which decrements backpressure counters on drop. + job.io_limits.take(&self.client_id); + } else if is_running && !was_running { + match self.io_limits.try_claim(job.work.job_bytes() as u32) { + Ok(g) => { + job.io_limits.insert(self.client_id, g); + } + Err(e) => { + // We can't handle the case of "running out of permits + // during replay", because waiting for a permit would + // deadlock the worker task. Log the error and continue. + warn!( + self.log, + "could not claim IO permits when replaying job: {e:?}" + ) + } + } } old_state @@ -547,7 +553,11 @@ impl DownstairsClient { /// /// # Panics /// If `self.client_task` is not `None`, or `self.target_addr` is `None` - pub(crate) fn reinitialize(&mut self, up_state: &UpstairsState) { + pub(crate) fn reinitialize( + &mut self, + up_state: &UpstairsState, + can_replay: bool, + ) { // Clear this Downstair's repair address, and let the YesItsMe set it. 
// This works if this Downstairs is new, reconnecting, or was replaced // entirely; the repair address could have changed in any of these @@ -570,6 +580,9 @@ impl DownstairsClient { let current = &self.state; let new_state = match current { + DsState::Active | DsState::Offline if !can_replay => { + Some(DsState::Faulted) + } DsState::Active => Some(DsState::Offline), DsState::LiveRepair | DsState::LiveRepairReady => { Some(DsState::Faulted) @@ -840,8 +853,12 @@ impl DownstairsClient { reason: ClientStopReason, ) { let new_state = match self.state { - DsState::Active => DsState::Offline, - DsState::Offline => DsState::Offline, + DsState::Active | DsState::Offline + if matches!(reason, ClientStopReason::IneligibleForReplay) => + { + DsState::Faulted + } + DsState::Active | DsState::Offline => DsState::Offline, DsState::Faulted => DsState::Faulted, DsState::Deactivated => DsState::New, DsState::Reconcile => DsState::New, @@ -2234,7 +2251,7 @@ impl DownstairsClient { } pub(crate) fn total_bytes_outstanding(&self) -> usize { - self.backpressure_counters.get_io_bytes() as usize + self.io_state_byte_count.in_progress as usize } /// Returns a unique ID for the current connection, or `None` @@ -2408,6 +2425,9 @@ pub(crate) enum ClientStopReason { /// The upstairs has requested that we deactivate when we were offline OfflineDeactivated, + + /// The Upstairs has dropped jobs that would be needed for replay + IneligibleForReplay, } /// Response received from the I/O task diff --git a/upstairs/src/deferred.rs b/upstairs/src/deferred.rs index cbe43932e..301575a64 100644 --- a/upstairs/src/deferred.rs +++ b/upstairs/src/deferred.rs @@ -3,9 +3,8 @@ use std::sync::Arc; use crate::{ - backpressure::BackpressureGuard, client::ConnectionId, - upstairs::UpstairsConfig, BlockContext, BlockOp, ClientData, ClientId, - ImpactedBlocks, Message, RawWrite, + client::ConnectionId, io_limits::IOLimitGuard, upstairs::UpstairsConfig, + BlockContext, BlockOp, ClientId, ImpactedBlocks, Message, RawWrite, }; use bytes::BytesMut; use crucible_common::{integrity_hash, CrucibleError, RegionDefinition}; @@ -114,7 +113,7 @@ pub(crate) struct DeferredWrite { pub data: BytesMut, pub is_write_unwritten: bool, pub cfg: Arc, - pub guard: ClientData, + pub io_guard: IOLimitGuard, } /// Result of a deferred `BlockOp` @@ -135,7 +134,7 @@ pub(crate) struct EncryptedWrite { pub data: RawWrite, pub impacted_blocks: ImpactedBlocks, pub is_write_unwritten: bool, - pub guard: ClientData, + pub io_guard: IOLimitGuard, } impl DeferredWrite { @@ -183,7 +182,7 @@ impl DeferredWrite { data, impacted_blocks: self.impacted_blocks, is_write_unwritten: self.is_write_unwritten, - guard: self.guard, + io_guard: self.io_guard, } } } diff --git a/upstairs/src/downstairs.rs b/upstairs/src/downstairs.rs index 23a889169..35d80d6ed 100644 --- a/upstairs/src/downstairs.rs +++ b/upstairs/src/downstairs.rs @@ -1,16 +1,16 @@ // Copyright 2023 Oxide Computer Company use std::{ - collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque}, + collections::{BTreeMap, HashMap, HashSet, VecDeque}, net::SocketAddr, sync::Arc, time::Duration, }; use crate::{ - backpressure::BackpressureGuard, cdt, client::{ClientAction, ClientStopReason, DownstairsClient, EnqueueResult}, guest::GuestBlockRes, + io_limits::{IOLimitGuard, IOLimits}, live_repair::ExtentInfo, stats::DownstairsStatOuter, upstairs::{UpstairsConfig, UpstairsState}, @@ -88,6 +88,19 @@ pub(crate) struct Downstairs { /// (including automatic flushes). 
next_flush: u64, + /// Indicates whether we are eligible for replay + /// + /// We are only eligible for replay if all jobs since the last flush are + /// buffered (i.e. none have been retired by a `Barrier` operation). + can_replay: bool, + + /// How many `Flush` or `Barrier` operations are pending? + /// + /// We only want to send a `Barrier` if there isn't already one pending, so + /// we track it here (incrementing in `submit_flush` / `submit_barrier` and + /// decrementing in `retire_check`). + pending_barrier: usize, + /// Ringbuf of recently acked job IDs. acked_ids: AllocRingBuffer, @@ -126,11 +139,6 @@ pub(crate) struct Downstairs { /// Data for an in-progress live repair repair: Option, - /// Jobs that are ready to be acked - /// - /// This must be handled after every event - ackable_work: BTreeSet, - /// Region definition (copied from the upstairs) ddef: Option, @@ -142,49 +150,70 @@ pub(crate) struct Downstairs { reqwest_client: reqwest::Client, } +/// Tuple storing a job and an optional result +#[derive(Debug)] +pub(crate) struct PendingJob { + id: JobId, + result: Option>, +} + +impl PendingJob { + fn new(id: JobId) -> Self { + Self { id, result: None } + } +} + /// State machine for a live-repair operation /// /// We pass through all states (except `FinalFlush`) in order for each extent, /// then pass through the `FinalFlush` state before completion. In each state, -/// we're waiting for a particular job to finish, which is indicated by a -/// `BlockOpWaiter`. -/// -/// Early states carry around reserved IDs (both `JobId` and guest work IDs), as -/// well as a reserved `BlockOpWaiter` for the final flush. -#[derive(Copy, Clone, Debug)] +/// we're waiting for a particular job to finish. We store a `PendingJob` for +/// every pending job, because it may be possible for more than one to finish +/// before checking live-repair state. +#[derive(Debug)] pub(crate) enum LiveRepairState { Closing { - close_id: JobId, - repair_id: JobId, - noop_id: JobId, - reopen_id: JobId, + close_job: PendingJob, + repair_job: PendingJob, + noop_job: PendingJob, + reopen_job: PendingJob, }, Repairing { - repair_id: JobId, - noop_id: JobId, - reopen_id: JobId, + repair_job: PendingJob, + noop_job: PendingJob, + reopen_job: PendingJob, }, Noop { - noop_id: JobId, - reopen_id: JobId, + noop_job: PendingJob, + reopen_job: PendingJob, }, Reopening { - reopen_id: JobId, + reopen_job: PendingJob, }, FinalFlush { - flush_id: JobId, + flush_job: PendingJob, }, } impl LiveRepairState { - /// Returns the job ID that we're waiting on at the moment - fn active_job_id(&self) -> JobId { + fn dummy() -> Self { + LiveRepairState::Noop { + noop_job: PendingJob::new(JobId(0)), + reopen_job: PendingJob::new(JobId(0)), + } + } + + /// Returns the result for the job that we're waiting on at the moment + /// + /// This is a consuming function; the result is removed from the object. + #[must_use] + fn active_job_result(&mut self) -> Option> { match self { - LiveRepairState::Closing { close_id, .. } => *close_id, - LiveRepairState::Repairing { repair_id, .. } => *repair_id, - LiveRepairState::Noop { noop_id, .. } => *noop_id, - LiveRepairState::Reopening { reopen_id, .. } => *reopen_id, - LiveRepairState::FinalFlush { flush_id } => *flush_id, + LiveRepairState::Closing { close_job: j, .. } + | LiveRepairState::Repairing { repair_job: j, .. } + | LiveRepairState::Noop { noop_job: j, .. } + | LiveRepairState::Reopening { reopen_job: j, .. 
} + | LiveRepairState::FinalFlush { flush_job: j } => j.result.take(), } } } @@ -233,6 +262,47 @@ pub(crate) struct LiveRepairData { state: LiveRepairState, } +impl LiveRepairData { + /// If the given job is one that we're waiting for, store its result + fn on_job_complete(&mut self, ds_id: JobId, io: &DownstairsIO) { + // Build an array of `Option<&mut PendingJob>` to check + let jobs = match &mut self.state { + LiveRepairState::Closing { + close_job, + repair_job, + noop_job, + reopen_job, + } => [ + Some(close_job), + Some(repair_job), + Some(noop_job), + Some(reopen_job), + ], + LiveRepairState::Repairing { + repair_job, + noop_job, + reopen_job, + } => [Some(repair_job), Some(noop_job), Some(reopen_job), None], + LiveRepairState::Noop { + noop_job, + reopen_job, + } => [Some(noop_job), Some(reopen_job), None, None], + LiveRepairState::Reopening { reopen_job } => { + [Some(reopen_job), None, None, None] + } + LiveRepairState::FinalFlush { flush_job } => { + [Some(flush_job), None, None, None] + } + }; + for pending in jobs.into_iter().flatten() { + if pending.id == ds_id { + assert!(pending.result.is_none()); + pending.result = Some(io.result()) + } + } + } +} + #[derive(Debug)] pub(crate) struct ReconcileData { /// An ID uniquely identifying this reconciliation @@ -257,6 +327,7 @@ impl Downstairs { ds_target: ClientMap, tls_context: Option>, stats: DownstairsStatOuter, + io_limits: &IOLimits, log: Logger, ) -> Self { let mut clients = [None, None, None]; @@ -265,6 +336,7 @@ impl Downstairs { i, cfg.clone(), ds_target.get(&i).copied(), + io_limits.client_limits(i), log.new(o!("client" => i.get().to_string())), tls_context.clone(), )); @@ -291,6 +363,8 @@ impl Downstairs { }, cfg, next_flush: 0, + can_replay: true, + pending_barrier: 0, ds_active: ActiveJobs::new(), gw_active: HashSet::new(), acked_ids: AllocRingBuffer::new(2048), @@ -304,7 +378,6 @@ impl Downstairs { reconcile_repair_needed: 0, reconcile_repair_aborted: 0, log: log.new(o!("" => "downstairs".to_string())), - ackable_work: BTreeSet::new(), repair: None, ddef: None, stats, @@ -334,7 +407,12 @@ impl Downstairs { }); let stats = DownstairsStatOuter::default(); - let mut ds = Self::new(cfg, ClientMap::new(), None, stats, log); + // Build a set of fake IO limits that we won't hit + let io_limits = IOLimits::new(u32::MAX as usize, u32::MAX as usize); + + let mut ds = + Self::new(cfg, ClientMap::new(), None, stats, &io_limits, log); + // Create a fake repair address so this field is populated. for cid in ClientId::iter() { ds.clients[cid].repair_addr = @@ -396,23 +474,6 @@ impl Downstairs { } } - /// Checks whether we have ackable work - pub(crate) fn has_ackable_jobs(&self) -> bool { - !self.ackable_work.is_empty() - } - - /// Send back acks for all jobs that are `AckReady` - pub(crate) fn ack_jobs(&mut self) { - debug!(self.log, "ack_jobs called in Downstairs"); - - let ack_list = std::mem::take(&mut self.ackable_work); - let jobs_checked = ack_list.len(); - for ds_id_done in ack_list.iter() { - self.ack_job(*ds_id_done); - } - debug!(self.log, "ack_ready handled {jobs_checked} jobs"); - } - /// Send the ack for a single job back upstairs through `GuestWork` /// /// Update stats for the upstairs as well @@ -471,6 +532,10 @@ impl Downstairs { } debug!(self.log, "[A] ack job {}", ds_id); + if let Some(r) = &mut self.repair { + r.on_job_complete(ds_id, done); + } + // Copy (if present) read data back to the guest buffer they // provided to us, and notify any waiters. 
if let Some(res) = done.res.take() { @@ -486,7 +551,6 @@ impl Downstairs { } else { panic!("job {ds_id} not on gw_active list"); } - self.retire_check(ds_id); } /// Helper function to calculate pruned deps for a given job @@ -523,7 +587,7 @@ impl Downstairs { // Restart the IO task for that specific client, transitioning to a new // state. - self.clients[client_id].reinitialize(up_state); + self.clients[client_id].reinitialize(up_state, self.can_replay); for i in ClientId::iter() { // Clear per-client delay, because we're starting a new session @@ -939,33 +1003,19 @@ impl Downstairs { /// Checks whether live-repair can continue /// - /// If live-repair can continue, returns the relevant `JobId`, which should - /// be passed into `self.continue_live_repair(..)` - /// - /// This must be called before `Downstairs::ack_jobs`, because it looks for - /// the repair job in `self.ackable_work` to decide if it's done. - fn get_live_repair_job(&mut self) -> Option { - if let Some(repair) = &self.repair { - let ds_id = repair.state.active_job_id(); - if self.ackable_work.contains(&ds_id) { - Some(ds_id) - } else { - // The job that live-repair is waiting on isn't yet ackable, and - // it better not have already been acked. - let job = self.ds_active.get(&ds_id).unwrap(); - assert!(!job.acked); - None - } - } else { - // No live-repair in progress, nothing to do - None - } + /// If live-repair can continue, returns the relevant job result, which + /// should be passed into `self.continue_live_repair(..)`. + #[must_use] + fn get_live_repair_job(&mut self) -> Option> { + self.repair + .as_mut() + .and_then(|r| r.state.active_job_result()) } /// Pushes live-repair forward, if possible /// - /// It's possible that handling the current live-repair job will make - /// subsequent live-repair jobs ackable immediately + /// It's possible that handling the current live-repair job will cause + /// subsequent live-repair jobs to be completed immediately /// (`test_repair_extent_fail_noop_out_of_order` exercises this case). As /// such, this function will continue running until the next live-repair job /// is not ready. @@ -973,17 +1023,16 @@ impl Downstairs { &mut self, up_state: &UpstairsState, ) { - while let Some(ds_id) = self.get_live_repair_job() { - self.continue_live_repair(ds_id, up_state); + while let Some(r) = self.get_live_repair_job() { + self.continue_live_repair(r, up_state); } } - fn continue_live_repair(&mut self, ds_id: JobId, up_state: &UpstairsState) { - let done = self.ds_active.get(&ds_id).unwrap(); - assert!(!done.acked); - assert!(self.ackable_work.contains(&ds_id)); - let r = done.result(); - + fn continue_live_repair( + &mut self, + r: Result<(), CrucibleError>, + up_state: &UpstairsState, + ) { let Some(repair) = &self.repair else { panic!("cannot continue live-repair without self.repair"); }; @@ -1027,23 +1076,27 @@ impl Downstairs { // calling such functions; we have to extract everything we want from // the `LiveRepairData` before calling anything on `&mut self`. 
let repair = self.repair.as_mut().unwrap(); // reborrow - match repair.state { + + // steal `repair.state`, because we're about to replace it + match std::mem::replace(&mut repair.state, LiveRepairState::dummy()) { LiveRepairState::Closing { - close_id, - repair_id, - noop_id, - reopen_id, + close_job, + repair_job, + noop_job, + reopen_job, } => { info!( self.log, "RE:{} Wait for result from repair command {}", repair.active_extent, - repair_id, + repair_job.id, ); + let close_id = close_job.id; + let repair_id = repair_job.id; repair.state = LiveRepairState::Repairing { - repair_id, - noop_id, - reopen_id, + repair_job, + noop_job, + reopen_job, }; if repair.aborting_repair { self.create_and_enqueue_noop_io(vec![close_id], repair_id); @@ -1061,28 +1114,33 @@ impl Downstairs { }; } LiveRepairState::Repairing { - repair_id, - noop_id, - reopen_id, + repair_job, + noop_job, + reopen_job, } => { info!( self.log, "RE:{} Wait for result from NoOp command {}", repair.active_extent, - noop_id, + noop_job.id, ); - repair.state = LiveRepairState::Noop { noop_id, reopen_id }; + let repair_id = repair_job.id; + let noop_id = noop_job.id; + repair.state = LiveRepairState::Noop { + noop_job, + reopen_job, + }; self.create_and_enqueue_noop_io(vec![repair_id], noop_id); } - LiveRepairState::Noop { reopen_id, .. } => { + LiveRepairState::Noop { reopen_job, .. } => { info!( self.log, "RE:{} Wait for result from reopen command {}", repair.active_extent, - reopen_id, + reopen_job.id, ); // The reopen job was already queued, so just change state - repair.state = LiveRepairState::Reopening { reopen_id }; + repair.state = LiveRepairState::Reopening { reopen_job }; } LiveRepairState::Reopening { .. } => { // It's possible that we've reached the end of our extents! @@ -1100,14 +1158,16 @@ impl Downstairs { // reassignment is handled below. if finished || (repair.aborting_repair && !have_reserved_jobs) { // We're done, submit a final flush! - let flush_id = self.submit_flush(None, None); + let flush_id = self.submit_flush(None, None, None); info!(self.log, "LiveRepair final flush submitted"); cdt::up__to__ds__flush__start!(|| (flush_id.0)); // The borrow was dropped earlier, so reborrow `self.repair` self.repair.as_mut().unwrap().state = - LiveRepairState::FinalFlush { flush_id } + LiveRepairState::FinalFlush { + flush_job: PendingJob::new(flush_id), + } } else { // Keep going! 
repair.active_extent = next_extent; @@ -1169,7 +1229,7 @@ impl Downstairs { let nio = IOop::ExtentLiveNoOp { dependencies: deps }; cdt::gw__noop__start!(|| (noop_id.0)); - self.enqueue(noop_id, nio, None, ClientMap::new()) + self.enqueue(noop_id, nio, None, None) } #[allow(clippy::too_many_arguments)] @@ -1185,7 +1245,7 @@ impl Downstairs { cdt::gw__repair__start!(|| (repair_id.0, eid.0)); - self.enqueue(repair_id, repair_io, None, ClientMap::new()) + self.enqueue(repair_id, repair_io, None, None) } fn repair_or_noop( @@ -1320,10 +1380,10 @@ impl Downstairs { ); let state = LiveRepairState::Closing { - close_id, - repair_id, - reopen_id, - noop_id, + close_job: PendingJob::new(close_id), + repair_job: PendingJob::new(repair_id), + reopen_job: PendingJob::new(reopen_id), + noop_job: PendingJob::new(noop_id), }; if let Some(extent_count) = extent_count { self.repair = Some(LiveRepairData { @@ -1374,7 +1434,7 @@ impl Downstairs { cdt::gw__reopen__start!(|| (reopen_id.0, eid.0)); - self.enqueue(reopen_id, reopen_io, None, ClientMap::new()) + self.enqueue(reopen_id, reopen_io, None, None) } #[cfg(test)] @@ -1402,7 +1462,7 @@ impl Downstairs { block_size: 512, }; - self.enqueue(ds_id, aread, None, ClientMap::new()); + self.enqueue(ds_id, aread, None, None); ds_id } @@ -1433,7 +1493,7 @@ impl Downstairs { iblocks, request, is_write_unwritten, - ClientData::from_fn(|_| BackpressureGuard::dummy()), + IOLimitGuard::dummy(), ) } @@ -1442,7 +1502,7 @@ impl Downstairs { blocks: ImpactedBlocks, write: RawWrite, is_write_unwritten: bool, - bp_guard: ClientData, + io_guard: IOLimitGuard, ) -> JobId { let ds_id = self.next_id(); if is_write_unwritten { @@ -1485,7 +1545,7 @@ impl Downstairs { ds_id, awrite, Some(GuestBlockRes::Acked), // writes are always acked - bp_guard.into(), + Some(io_guard), ); ds_id } @@ -1515,7 +1575,7 @@ impl Downstairs { let close_io = self.create_close_io(eid, deps, repair.to_vec()); cdt::gw__close__start!(|| (close_id.0, eid.0)); - self.enqueue(close_id, close_io, None, ClientMap::new()) + self.enqueue(close_id, close_io, None, None) } /// Get the repair IDs and dependencies for this extent. @@ -1859,6 +1919,7 @@ impl Downstairs { &mut self, snapshot_details: Option, res: Option, + io_guard: Option, ) -> JobId { let next_id = self.next_id(); cdt::gw__flush__start!(|| (next_id.0)); @@ -1887,15 +1948,52 @@ impl Downstairs { extent_limit: extent_under_repair, }; - self.enqueue( - next_id, - flush, - res.map(GuestBlockRes::Other), - ClientMap::new(), - ); + self.pending_barrier += 1; + self.enqueue(next_id, flush, res.map(GuestBlockRes::Other), io_guard); next_id } + /// Checks to see whether a `Barrier` operation is needed + /// + /// A `Barrier` is needed if we have buffered more than + /// `IO_CACHED_MAX_BYTES/JOBS` worth of complete jobs, and there are no + /// other barrier (or flush) operations in flight + pub(crate) fn needs_barrier(&self) -> bool { + if self.pending_barrier > 0 { + return false; + } + + // n.b. This may not be 100% reliable: if different Downstairs have + // finished a different subset of jobs, then it's theoretically possible + // for each DownstairsClient to be under our limits, but for the true + // number of cached bytes/jobs to be over the limits. + // + // It's hard to imagine how we could encounter such a situation, given + // job dependencies and no out-of-order execution, so this is more of a + // "fun fact" and less an actual concern. 
+ let max_jobs = self + .clients + .iter() + .map(|c| { + let i = c.io_state_job_count(); + i.skipped + i.done + i.error + }) + .max() + .unwrap(); + let max_bytes = self + .clients + .iter() + .map(|c| { + let i = c.io_state_byte_count(); + i.skipped + i.done + i.error + }) + .max() + .unwrap(); + + max_jobs as u64 >= crate::IO_CACHED_MAX_JOBS + || max_bytes >= crate::IO_CACHED_MAX_BYTES + } + pub(crate) fn submit_barrier(&mut self) -> JobId { let next_id = self.next_id(); cdt::gw__barrier__start!(|| (next_id.0)); @@ -1906,12 +2004,8 @@ impl Downstairs { let dependencies = self.ds_active.deps_for_flush(next_id); debug!(self.log, "IO Barrier {next_id} has deps {dependencies:?}"); - self.enqueue( - next_id, - IOop::Barrier { dependencies }, - None, - ClientMap::new(), - ); + self.pending_barrier += 1; + self.enqueue(next_id, IOop::Barrier { dependencies }, None, None); next_id } @@ -1990,6 +2084,7 @@ impl Downstairs { blocks: ImpactedBlocks, data: Buffer, res: BlockRes, + io_guard: IOLimitGuard, ) -> JobId { // If there is a live-repair in progress that intersects with this read, // then reserve job IDs for those jobs. We must do this before @@ -2019,7 +2114,7 @@ impl Downstairs { }; let res = GuestBlockRes::Read(data, res); - self.enqueue(ds_id, aread, Some(res), ClientMap::new()); + self.enqueue(ds_id, aread, Some(res), Some(io_guard)); ds_id } @@ -2029,7 +2124,7 @@ impl Downstairs { blocks: ImpactedBlocks, write: RawWrite, is_write_unwritten: bool, - backpressure_guard: ClientData, + io_guard: IOLimitGuard, ) -> JobId { // If there is a live-repair in progress that intersects with this read, // then reserve job IDs for those jobs. @@ -2039,7 +2134,7 @@ impl Downstairs { blocks, write, is_write_unwritten, - backpressure_guard, + io_guard, ) } @@ -2083,7 +2178,7 @@ impl Downstairs { /// /// - enqueue the job in each of [Self::clients] (clients may skip the job) /// - add the job to [Self::ds_active] - /// - Mark the job as ackable if it was skipped by all downstairs + /// - Ack the job immediately if it was skipped by all downstairs /// - Check that the job was already acked if it's a write (the "fast ack" /// optimization, which is performed elsewhere) /// - Send the job to each downstairs client task (if not skipped) @@ -2092,39 +2187,33 @@ impl Downstairs { ds_id: JobId, io: IOop, res: Option, - mut bp_guard: ClientMap, + io_limits: Option, ) { let mut skipped = 0; let last_repair_extent = self.last_repair_extent(); + let mut io_limits = io_limits + .map(ClientMap::from) + .unwrap_or_else(ClientMap::new); + // Send the job to each client! 
let state = ClientData::from_fn(|cid| { let client = &mut self.clients[cid]; let r = client.enqueue(ds_id, &io, last_repair_extent); match r { - EnqueueResult::Send | EnqueueResult::Hold => { - // Update the per-client backpressure guard - if !bp_guard.contains(&cid) { - let g = client.backpressure_counters.increment(&io); - bp_guard.insert(cid, g); - } - if matches!(r, EnqueueResult::Send) { - self.send(ds_id, io.clone(), cid); - } + EnqueueResult::Send => self.send(ds_id, io.clone(), cid), + EnqueueResult::Hold => (), + EnqueueResult::Skip => { + // Take and drop the ClientIOLimitGuard, freeing up tokens + let _ = io_limits.take(&cid); + skipped += 1; } - EnqueueResult::Skip => skipped += 1, } r.state() }); // Writes are acked immediately, before being enqueued let acked = io.is_write(); - if skipped == 3 { - if !acked { - self.ackable_work.insert(ds_id); - } - warn!(self.log, "job {} skipped on all downstairs", &ds_id); - } // Puts the IO onto the downstairs work queue. self.ds_active.insert( @@ -2136,14 +2225,25 @@ impl Downstairs { res, replay: false, data: None, - backpressure_guard: bp_guard, + io_limits, }, ); + if acked { self.acked_ids.push(ds_id) } else { self.gw_active.insert(ds_id); } + + // Ack the job immediately if it was skipped on all 3x downstairs + // (and wasn't previously acked, i.e. isn't a write) + if skipped == 3 { + if !acked { + self.ack_job(ds_id); + } + self.retire_check(ds_id); + warn!(self.log, "job {} skipped on all downstairs", &ds_id); + } } /// Sends the given job to the given client @@ -2439,11 +2539,17 @@ impl Downstairs { Ok(ReplaceResult::Started) } + /// Checks whether the given client state should go from Offline -> Faulted + /// + /// # Panics + /// If the given client is not in the `Offline` state pub(crate) fn check_gone_too_long( &mut self, client_id: ClientId, up_state: &UpstairsState, ) { + assert_eq!(self.clients[client_id].state(), DsState::Offline); + let byte_count = self.clients[client_id].total_bytes_outstanding(); let work_count = self.clients[client_id].total_live_work(); let failed = if work_count > crate::IO_OUTSTANDING_MAX_JOBS { @@ -2458,6 +2564,13 @@ impl Downstairs { "downstairs failed, too many outstanding bytes {byte_count}" ); Some(ClientStopReason::TooManyOutstandingBytes) + } else if !self.can_replay { + // XXX can this actually happen? + warn!( + self.log, + "downstairs became ineligible for replay while offline" + ); + Some(ClientStopReason::IneligibleForReplay) } else { None }; @@ -2470,8 +2583,8 @@ impl Downstairs { /// Move all `New` and `InProgress` jobs for the given client to `Skipped` /// - /// This may lead to jobs being marked as ackable, since a skipped job - /// counts as complete in some circumstances. + /// This may lead to jobs being acked, since a skipped job counts as + /// complete in some circumstances. pub(crate) fn skip_all_jobs(&mut self, client_id: ClientId) { info!( self.log, @@ -2479,6 +2592,7 @@ impl Downstairs { self.ds_active.len(), ); + let mut ack_jobs = vec![]; let mut retire_check = vec![]; let mut number_jobs_skipped = 0; @@ -2502,12 +2616,16 @@ impl Downstairs { self.log, "[{}] notify = true for {}", client_id, ds_id ); - self.ackable_work.insert(*ds_id); + ack_jobs.push(*ds_id); } } } }); + for ds_id in ack_jobs { + self.ack_job(ds_id); + } + info!( self.log, "[{}] changed {} jobs to fault skipped", @@ -2589,9 +2707,12 @@ impl Downstairs { /// writes and if they aren't included in replay then the write will /// never start. 
fn retire_check(&mut self, ds_id: JobId) { - if !self.is_flush(ds_id) { - return; - } + let job = self.ds_active.get(&ds_id).expect("checked missing job"); + let can_replay = match job.work { + IOop::Flush { .. } => true, + IOop::Barrier { .. } => false, + _ => return, + }; // Only a completed flush will remove jobs from the active queue - // currently we have to keep everything around for use during replay @@ -2645,15 +2766,22 @@ impl Downstairs { for &id in &retired { let job = self.ds_active.remove(&id); + // Update our barrier count for the removed job + if matches!(job.work, IOop::Flush { .. } | IOop::Barrier { .. }) + { + self.pending_barrier = + self.pending_barrier.checked_sub(1).unwrap(); + } + // Jobs should have their backpressure contribution removed when // they are completed (in `process_io_completion_inner`), // **not** when they are retired. We'll do a sanity check here // and print a warning if that's not the case. for c in ClientId::iter() { - if job.backpressure_guard.contains(&c) { + if job.io_limits.contains(&c) { warn!( self.log, - "job {ds_id} had pending backpressure bytes \ + "job {ds_id} had pending io limits \ for client {c}" ); // Backpressure is decremented on drop @@ -2666,6 +2794,9 @@ impl Downstairs { for cid in ClientId::iter() { self.clients[cid].skipped_jobs.retain(|&x| x >= ds_id); } + + // Update the flag indicating whether replay is allowed + self.can_replay = can_replay; } } @@ -3091,11 +3222,10 @@ impl Downstairs { /// Wrapper for marking a single job as done from the given client /// - /// This can be used to test handling of ackable work, etc. + /// This can be used to test handling of job acks, etc /// - /// Returns true if the given job has gone from not ackable (not present in - /// `self.ackable_work`) to ackable. This is for historical reasons, - /// because it's often used in existing tests. + /// Returns true if the given job has gone from not acked to acked. This is + /// for historical reasons, because it's often used in existing tests. #[cfg(test)] pub fn process_ds_completion( &mut self, @@ -3105,7 +3235,7 @@ impl Downstairs { up_state: &UpstairsState, extent_info: Option, ) -> bool { - let was_ackable = self.ackable_work.contains(&ds_id); + let was_acked = self.ds_active.get(&ds_id).unwrap().acked; self.process_io_completion_inner( ds_id, @@ -3114,9 +3244,11 @@ impl Downstairs { up_state, extent_info, ); - let now_ackable = self.ackable_work.contains(&ds_id); - !was_ackable && now_ackable - // TODO should this also ack the job, to mimick our event loop? + // It's possible for the IO completion to have retired this job; in that + // case, it must have been acked before being retired. + let now_acked = + self.ds_active.get(&ds_id).map(|j| j.acked).unwrap_or(true); + !was_acked && now_acked } fn process_io_completion_inner( @@ -3152,38 +3284,22 @@ impl Downstairs { return; }; - if self.clients[client_id].process_io_completion( + let was_acked = job.acked; + let should_ack = self.clients[client_id].process_io_completion( ds_id, job, responses, deactivate, extent_info, - ) { - self.ackable_work.insert(ds_id); - } + ); - /* - * If all 3 jobs are done, we can check here to see if we can - * remove this job from the DS list. If we have completed the ack - * to the guest, then there will be no more work on this job - * but messages may still be unprocessed. - */ - if job.acked { - self.retire_check(ds_id); - } else { - // If we reach this then the job probably has errors and - // hasn't acked back yet. 
We check for NotAcked so we don't - // double count three done and return true if we already have - // AckReady set. + // If all 3 jobs are done, we can check here to see if we can remove + // this job from the DS list. + let wc = job.state_count(); + let complete = (wc.error + wc.skipped + wc.done) == 3; - // If we are a write or a flush with one success, then - // we must switch our state to failed. This condition is - // handled when we check the job result. - let wc = job.state_count(); - if (wc.error + wc.skipped + wc.done) == 3 { - self.ackable_work.insert(ds_id); - debug!(self.log, "[{}] Set AckReady {}", client_id, ds_id); - } + if !was_acked && (should_ack || complete) { + self.ack_job(ds_id); } // Decide what to do when we have an error from this IO. @@ -3246,6 +3362,10 @@ impl Downstairs { // Nothing to do here, no error! } } + + if complete { + self.retire_check(ds_id); + } } /// Accessor for [`Downstairs::reconcile_repaired`] @@ -3278,48 +3398,15 @@ impl Downstairs { } pub(crate) fn write_bytes_outstanding(&self) -> u64 { + // XXX this overlaps with io_state_byte_count self.clients .iter() .filter(|c| matches!(c.state(), DsState::Active)) - .map(|c| c.backpressure_counters.get_write_bytes()) + .map(|c| c.io_state_byte_count().in_progress) .max() .unwrap_or(0) } - pub(crate) fn jobs_outstanding(&self) -> u64 { - self.clients - .iter() - .filter(|c| matches!(c.state(), DsState::Active)) - .map(|c| c.backpressure_counters.get_jobs()) - .max() - .unwrap_or(0) - } - - /// Marks a single job as acked - /// - /// The job is removed from `self.ackable_work` and `acked` is set to `true` - /// in the job's state. - /// - /// This is only useful in tests; in real code, we'd also want to reply to - /// the guest when acking a job. - #[cfg(test)] - fn ack(&mut self, ds_id: JobId) { - /* - * Move AckReady to Acked. 
- */ - let Some(job) = self.ds_active.get_mut(&ds_id) else { - panic!("reqid {} is not active", ds_id); - }; - if !self.ackable_work.remove(&ds_id) { - panic!("Job {ds_id} is not ackable"); - } - - if job.acked { - panic!("Job {ds_id} already acked!"); - } - job.acked = true; - } - /// Returns all jobs in sorted order by [`JobId`] /// /// This function is used in unit tests for the Upstairs @@ -3338,11 +3425,6 @@ impl Downstairs { self.ds_active.get_extents_for(ds_id) } - #[cfg(test)] - pub fn ackable_work(&self) -> &BTreeSet { - &self.ackable_work - } - #[cfg(test)] pub fn completed(&self) -> &AllocRingBuffer { &self.retired_ids @@ -3418,7 +3500,7 @@ impl Downstairs { data, }, is_write_unwritten, - ClientData::from_fn(|_| BackpressureGuard::dummy()), + IOLimitGuard::dummy(), ) } @@ -3449,7 +3531,7 @@ impl Downstairs { let ddef = self.ddef.as_ref().unwrap(); let block_count = blocks.blocks(ddef).len(); let buf = Buffer::new(block_count, ddef.block_size() as usize); - self.submit_read(blocks, buf, BlockRes::dummy()) + self.submit_read(blocks, buf, BlockRes::dummy(), IOLimitGuard::dummy()) } /// Create a test downstairs which has all clients Active @@ -4159,23 +4241,17 @@ impl Downstairs { }); } - /// Assign the given number of write bytes to the backpressure counters - #[must_use] - pub(crate) fn early_write_backpressure( - &mut self, - bytes: u64, - ) -> ClientData { - ClientData::from_fn(|i| { - self.clients[i] - .backpressure_counters - .early_write_increment(bytes) - }) - } - pub(crate) fn set_ddef(&mut self, ddef: RegionDefinition) { self.ddef = Some(ddef); } + /// Checks whether there are any in-progress jobs present + pub(crate) fn has_live_jobs(&self) -> bool { + self.clients + .iter() + .any(|c| c.io_state_job_count().in_progress > 0) + } + /// Returns the per-client state for the given job /// /// This is a helper function to make unit tests shorter @@ -4222,14 +4298,15 @@ struct DownstairsBackpressureConfig { #[cfg(test)] pub(crate) mod test { - use super::Downstairs; + use super::{Downstairs, PendingJob}; use crate::{ downstairs::{LiveRepairData, LiveRepairState, ReconcileData}, live_repair::ExtentInfo, upstairs::UpstairsState, - ClientId, CrucibleError, DsState, ExtentFix, ExtentRepairIDs, IOState, - IOop, ImpactedAddr, ImpactedBlocks, JobId, RawReadResponse, - ReconcileIO, ReconcileIOState, ReconciliationId, SnapshotDetails, + BlockOpWaiter, ClientId, CrucibleError, DsState, ExtentFix, + ExtentRepairIDs, IOState, IOop, ImpactedAddr, ImpactedBlocks, JobId, + RawReadResponse, ReconcileIO, ReconcileIOState, ReconciliationId, + SnapshotDetails, }; use bytes::BytesMut; @@ -4269,10 +4346,6 @@ pub(crate) mod test { None, ); } - // Writes are fast-acked when first submitted - if !ds.ds_active.get(&ds_id).unwrap().work.is_write() { - ds.ack(ds_id); - } } fn set_all_reconcile(ds: &mut Downstairs) { @@ -4297,7 +4370,7 @@ pub(crate) mod test { let mut ds = Downstairs::test_default(); ds.force_active(); - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); assert!(!ds.process_ds_completion( next_id, @@ -4306,7 +4379,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4316,12 +4388,9 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); + assert!(ds.ds_active.get(&next_id).unwrap().acked); assert!(ds.retired_ids.is_empty()); - assert!(!ds.ds_active.get(&next_id).unwrap().acked); - 
ds.ack(next_id); - assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -4329,7 +4398,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert_eq!(ds.retired_ids.len(), 1); } @@ -4344,6 +4412,7 @@ pub(crate) mod test { snapshot_name: String::from("snap"), }), None, + None, ); assert!(!ds.process_ds_completion( @@ -4353,8 +4422,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4364,8 +4431,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4376,11 +4441,6 @@ pub(crate) mod test { None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); - assert_eq!(ds.retired_ids.len(), 1); } @@ -4389,7 +4449,7 @@ pub(crate) mod test { let mut ds = Downstairs::test_default(); ds.force_active(); - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); assert!(!ds.process_ds_completion( next_id, @@ -4398,7 +4458,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4408,7 +4467,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4418,10 +4476,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); assert_eq!(ds.retired_ids.len(), 1); // No skipped jobs here. 
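// --- Editor's note (not part of the patch) ---------------------------------
// A minimal, self-contained sketch of the ack/retire decision that the hunks
// above fold into `process_ds_completion`: a job is acked the first time it
// becomes ackable (writes are fast-acked at submit time), and a retire check
// runs once all three per-client states are terminal. Every name below
// (JobState, terminal_count, on_completion) is a hypothetical stand-in for
// illustration only, not the real Crucible types.
#[derive(Clone, Copy, PartialEq)]
enum JobState {
    InProgress,
    Done,
    Error,
    Skipped,
}

/// Counts clients whose state is terminal (done, error, or skipped).
fn terminal_count(states: &[JobState; 3]) -> usize {
    states
        .iter()
        .filter(|&&s| !matches!(s, JobState::InProgress))
        .count()
}

/// Models the per-completion decision: the first element says whether to ack
/// the job to the guest now, the second whether to run a retire check.
fn on_completion(
    states: &[JobState; 3],
    already_acked: bool,
    ackable: bool,
) -> (bool, bool) {
    let complete = terminal_count(states) == 3;
    let ack_now = !already_acked && (ackable || complete);
    (ack_now, complete)
}

fn main() {
    // A read with one successful completion is acked right away, but is only
    // retired once all three clients have reached a terminal state.
    let states = [JobState::Done, JobState::InProgress, JobState::InProgress];
    assert_eq!(on_completion(&states, false, true), (true, false));

    // A fast-acked write is not acked again when its last client finishes,
    // but it does become eligible for retirement.
    let states = [JobState::Done, JobState::Done, JobState::Skipped];
    assert_eq!(on_completion(&states, true, true), (false, true));
}
// --- End editor's note ------------------------------------------------------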
@@ -4435,7 +4489,7 @@ pub(crate) mod test { let mut ds = Downstairs::test_default(); ds.force_active(); - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); assert!(!ds.process_ds_completion( next_id, @@ -4444,7 +4498,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4454,7 +4507,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4464,10 +4516,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); assert_eq!(ds.retired_ids.len(), 1); } @@ -4488,11 +4536,8 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); assert!(ds.retired_ids.is_empty()); - ds.ack(next_id); - let response = Ok(build_read_response(&[])); assert!(!ds.process_ds_completion( @@ -4502,7 +4547,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); let response = Ok(build_read_response(&[])); @@ -4514,7 +4558,7 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); + // A flush is required to move work to completed assert!(ds.retired_ids.is_empty()); } @@ -4533,7 +4577,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); let response = Ok(build_read_response(&[])); @@ -4545,11 +4588,8 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); assert!(ds.retired_ids.is_empty()); - ds.ack(next_id); - let response = Ok(build_read_response(&[])); assert!(!ds.process_ds_completion( @@ -4559,7 +4599,7 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); + // A flush is required to move work to completed // That this is still zero is part of the test assert!(ds.retired_ids.is_empty()); @@ -4579,7 +4619,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4589,7 +4628,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); let response = Ok(build_read_response(&[])); @@ -4601,13 +4639,8 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); // A flush is required to move work to completed - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); } @@ -4625,7 +4658,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4635,7 +4667,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4645,12 +4676,7 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - ds.ack(next_id); - ds.retire_check(next_id); - - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); } @@ -4664,6 +4690,7 @@ pub(crate) mod test { let response = || 
Ok(build_read_response(&[])); + // ack on first reply assert!(ds.process_ds_completion( next_id, ClientId::new(2), @@ -4680,10 +4707,6 @@ pub(crate) mod test { None, )); - // emulated run in up_ds_listen - ds.ack(next_id); - ds.retire_check(next_id); - assert!(!ds.process_ds_completion( next_id, ClientId::new(1), @@ -4692,7 +4715,6 @@ pub(crate) mod test { None, )); - assert!(ds.ackable_work.is_empty()); // Work won't be completed until we get a flush. assert!(ds.retired_ids.is_empty()); } @@ -4716,6 +4738,7 @@ pub(crate) mod test { ds.force_active(); // send a write, and clients 0 and 1 will return errors + // job is fast-acked, so it starts as acked let next_id = ds.create_and_enqueue_generic_write_eob(is_write_unwritten); @@ -4740,25 +4763,14 @@ pub(crate) mod test { assert!(ds.ds_active.get(&next_id).unwrap().data.is_none()); - // Jobs should be acked - assert!(ds.ackable_work.is_empty()); - assert!(ds.ds_active.get(&next_id).unwrap().acked); - let response = Ok(Default::default()); - let res = ds.process_ds_completion( + assert!(!ds.process_ds_completion( next_id, ClientId::new(2), response, &UpstairsState::Active, None, - ); - - // The IO should not have been marked as ackable, because it was - // fast-acked - assert!(!res); - - // Both write and write_unwritten should be fast-acked - assert!(ds.ackable_work.is_empty()); + )); assert!(ds.clients[ClientId::new(0)].stats.downstairs_errors > 0); assert!(ds.clients[ClientId::new(1)].stats.downstairs_errors > 0); @@ -4790,9 +4802,8 @@ pub(crate) mod test { // Simulate completing both writes to downstairs 0 and 1 // - // write_unwritten jobs become ackable upon the second completion; - // normal writes were ackable from the start (and hence - // process_ds_completion always returns `false`) + // all writes are acked immediately upon submission, so + // process_ds_completion always returns `false`. assert!(!ds.process_ds_completion( id1, ClientId::new(0), @@ -4827,11 +4838,10 @@ pub(crate) mod test { assert!(ds.ds_active.get(&id2).unwrap().acked); // Work stays on active queue till the flush - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); // Create the flush and send it to the downstairs - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); // Simulate completing the flush to downstairs 0 and 1 assert!(!ds.process_ds_completion( @@ -4849,20 +4859,12 @@ pub(crate) mod test { None, )); - // Ack the flush back to the guest - ds.ack(flush_id); - // Make sure downstairs 0 and 1 update their last flush id and // that downstairs 2 does not. assert_eq!(ds.clients[ClientId::new(0)].last_flush, flush_id); assert_eq!(ds.clients[ClientId::new(1)].last_flush, flush_id); assert_eq!(ds.clients[ClientId::new(2)].last_flush, JobId(0)); - // Should not retire yet. 
- ds.retire_check(flush_id); - - assert!(ds.ackable_work.is_empty()); - // Make sure all work is still on the active side assert!(ds.retired_ids.is_empty()); @@ -4939,6 +4941,7 @@ pub(crate) mod test { None )); + // Confirm that we've stored this block let responses = ds.ds_active.get(&next_id).unwrap().data.as_ref(); assert!(responses.is_some()); assert_eq!( @@ -5014,7 +5017,7 @@ pub(crate) mod test { )); // One completion of a read means we can ACK - assert_eq!(ds.ackable_work.len(), 1); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Complete downstairs 1 and 2 let response = Ok(build_read_response(&[])); @@ -5035,19 +5038,13 @@ pub(crate) mod test { None )); - // Make sure the job is still active - assert!(ds.retired_ids.is_empty()); - - // Ack the job to the guest (this checks whether it's ack ready) - ds.ack(next_id); - - // Nothing left to ACK, but until the flush we keep the IO data. - assert!(ds.ackable_work.is_empty()); + // Make sure the job is not yet retired assert!(ds.retired_ids.is_empty()); // A flush is required to move work to completed // Create the flush then send it to all downstairs. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); // Complete the Flush at each downstairs. assert!(!ds.process_ds_completion( @@ -5073,12 +5070,9 @@ pub(crate) mod test { None )); - // ACK the flush and let retire_check move things along. - ds.ack(next_id); - ds.retire_check(next_id); + // Verify the flush has been acked + assert!(rx.try_wait().unwrap().is_ok()); - // Verify no more work to ack. - assert!(ds.ackable_work.is_empty()); // The read and the flush should now be moved to completed. assert_eq!(ds.retired_ids.len(), 2); } @@ -5091,12 +5085,12 @@ pub(crate) mod test { ds.force_active(); // Build our read, put it into the work queue - let next_id = ds.create_and_enqueue_generic_read_eob(); + let read_id = ds.create_and_enqueue_generic_read_eob(); // Downstairs 0 now has completed this work. let response = Ok(build_read_response(&[])); assert!(ds.process_ds_completion( - next_id, + read_id, ClientId::new(0), response, &UpstairsState::Active, @@ -5104,34 +5098,27 @@ pub(crate) mod test { )); // One completion of a read means we can ACK - assert_eq!(ds.ackable_work.len(), 1); - - // But, don't send the ack just yet. - // The job should be ack ready - assert!(ds.ackable_work.contains(&next_id)); - assert!(!ds.ds_active.get(&next_id).unwrap().acked); + assert!(ds.ds_active.get(&read_id).unwrap().acked); // A flush is required to move work to completed // Create the flush then send it to all downstairs. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_id = ds.submit_flush(None, Some(tx), None); // Send and complete the Flush at each downstairs. for cid in ClientId::iter() { ds.process_ds_completion( - next_id, + flush_id, cid, Ok(Default::default()), &UpstairsState::Active, None, ); } + assert!(rx.try_wait().unwrap().is_ok()); - // ACK the flush and let retire_check move things along. - ds.ack(next_id); - ds.retire_check(next_id); - - // Verify the read is still ack ready. - assert_eq!(ds.ackable_work.len(), 1); + // Verify the read is still acked + assert!(ds.ds_active.get(&read_id).unwrap().acked); // The the flush should now be moved to completed. assert_eq!(ds.retired_ids.len(), 1); // The read should still be on the queue. 
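// --- Editor's note (not part of the patch) ---------------------------------
// The reworked tests above no longer drive `ds.ack(..)` / `ds.retire_check(..)`
// by hand; instead they hand a completion handle into
// `submit_flush(None, Some(tx), None)` and later poll it with `rx.try_wait()`.
// Below is a std-only analogue of that waiter pattern, for orientation only:
// `Waiter`, `Completer`, and `pair` are made-up names, not Crucible's
// BlockOpWaiter API.
use std::sync::mpsc;

struct Completer<T>(mpsc::Sender<T>);
struct Waiter<T>(mpsc::Receiver<T>);

impl<T> Completer<T> {
    /// Reports the final result of the operation to whoever is waiting.
    fn complete(self, value: T) {
        // Ignore the case where the waiter has already gone away.
        let _ = self.0.send(value);
    }
}

impl<T> Waiter<T> {
    /// Non-blocking check, mirroring how the tests call `try_wait()` after
    /// they have driven enough per-client completions.
    fn try_wait(&mut self) -> Option<T> {
        self.0.try_recv().ok()
    }
}

/// Builds a connected (waiter, completer) pair.
fn pair<T>() -> (Waiter<T>, Completer<T>) {
    let (tx, rx) = mpsc::channel();
    (Waiter(rx), Completer(tx))
}

fn main() {
    let (mut rx, tx) = pair::<Result<(), String>>();
    assert!(rx.try_wait().is_none()); // nothing acked yet
    tx.complete(Ok(()));              // the flush is acked on quorum
    assert!(rx.try_wait().unwrap().is_ok());
}
// --- End editor's note ------------------------------------------------------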
@@ -5186,13 +5173,13 @@ pub(crate) mod test { assert!(ds.ds_active.get(&next_id).unwrap().acked); // Work stays on active queue till the flush - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); // Create the flush IO - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); - // Complete the flush on all three downstairs. + // Complete the flush on all three downstairs, acking on the 2nd assert!(!ds.process_ds_completion( next_id, ClientId::new(0), @@ -5215,10 +5202,7 @@ pub(crate) mod test { None )); - ds.ack(next_id); - ds.retire_check(next_id); - - assert!(ds.ackable_work.is_empty()); + assert!(rx.try_wait().unwrap().is_ok()); // The write and flush should now be completed. assert_eq!(ds.retired_ids.len(), 2); } @@ -5281,11 +5265,10 @@ pub(crate) mod test { assert!(ds.ds_active.get(&id2).unwrap().acked); // Work stays on active queue till the flush. - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); // Create and send the flush. - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); // Complete the flush on those downstairs. assert!(!ds.process_ds_completion( @@ -5303,13 +5286,6 @@ pub(crate) mod test { None )); - // Ack the flush - ds.ack(flush_id); - - // Should not retire yet - ds.retire_check(flush_id); - - assert!(ds.ackable_work.is_empty()); // Not done yet, until all clients do the work. assert!(ds.retired_ids.is_empty()); @@ -5373,12 +5349,7 @@ pub(crate) mod test { )); // One completion should allow for an ACK - assert_eq!(ds.ackable_work.len(), 1); - assert!(ds.ackable_work.contains(&next_id)); - assert!(!ds.ds_active.get(&next_id).unwrap().acked); - - // The Downstairs will ack jobs immediately, during the event handler - ds.ack(next_id); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Be sure the job is not yet in replay assert!(!ds.ds_active.get(&next_id).unwrap().replay); @@ -5387,7 +5358,6 @@ pub(crate) mod test { assert!(ds.ds_active.get(&next_id).unwrap().replay); // The IO is still acked - assert!(!ds.ackable_work.contains(&next_id)); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -5410,8 +5380,7 @@ pub(crate) mod test { &UpstairsState::Active, None )); - // We always ack jobs right away - ds.ack(next_id); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Complete the read on a 2nd downstairs. let response = Ok(build_read_response(&[1, 2, 3, 4])); @@ -5444,7 +5413,6 @@ pub(crate) mod test { &UpstairsState::Active, None )); - assert!(ds.ackable_work.is_empty()); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -5467,14 +5435,11 @@ pub(crate) mod test { &UpstairsState::Active, None )); - // Immediately ack the job (which checks that it's ready) - ds.ack(next_id); + assert!(ds.ds_active.get(&next_id).unwrap().acked); - // Should not retire yet - ds.retire_check(next_id); + // The read has been acked + assert!(ds.ds_active.get(&next_id).unwrap().acked); - // No new ackable work. - assert!(ds.ackable_work.is_empty()); // Verify the IO has not completed yet. assert!(ds.retired_ids.is_empty()); @@ -5494,7 +5459,6 @@ pub(crate) mod test { &UpstairsState::Active, None )); - assert!(ds.ackable_work.is_empty()); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -5540,11 +5504,9 @@ pub(crate) mod test { &UpstairsState::Active, None )); + assert!(ds.ds_active.get(&next_id).unwrap().acked); - // Ack the read to the guest. 
- ds.ack(next_id); - - // Before re replay_jobs, the IO is not replay + // Before we replay_jobs, the IO is not replay assert!(!ds.ds_active.get(&next_id).unwrap().replay); // Now, take that downstairs offline ds.replay_jobs(ClientId::new(0)); @@ -5612,6 +5574,7 @@ pub(crate) mod test { &UpstairsState::Active, None )); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Construct our fake response for another downstairs. let response = Ok(build_read_response( @@ -5627,9 +5590,6 @@ pub(crate) mod test { None )); - // Ack the read to the guest. - ds.ack(next_id); - // Now, take the second downstairs offline ds.replay_jobs(ClientId::new(1)); // Now the IO should be replay @@ -5708,7 +5668,7 @@ pub(crate) mod test { // The job should remain acked assert!(ds.ds_active.get(&id1).unwrap().acked); - // Re-submit and complete the write + // Re-submit and complete the write, which should not ack again assert_eq!(ds.job_state(id1, 1), IOState::InProgress); assert!(!ds.process_ds_completion( id1, @@ -5717,10 +5677,6 @@ pub(crate) mod test { &UpstairsState::Active, None )); - - // State should remain acked and not ackable - assert!(ds.ds_active.get(&id1).unwrap().acked); - assert!(ds.ackable_work.is_empty()) } #[test] @@ -5761,9 +5717,6 @@ pub(crate) mod test { // Check that it should have been fast-acked assert!(ds.ds_active.get(&id1).unwrap().acked); - // Verify no more ackable work - assert!(ds.ackable_work.is_empty()); - // Now, take that downstairs offline ds.replay_jobs(ClientId::new(0)); @@ -6390,10 +6343,8 @@ pub(crate) mod test { None, ); - let ack_list = ds.ackable_work().clone(); - assert!(ack_list.is_empty()); let done = ds.ds_active.get(&next_id).unwrap(); - assert!(done.acked); + assert!(done.acked); // fast-acked assert!(done.result().is_ok()); } @@ -6438,7 +6389,6 @@ pub(crate) mod test { let done = ds.ds_active.get(&next_id).unwrap(); assert!(done.acked); - assert!(ds.ackable_work().is_empty()); assert!(done.result().is_err()); } @@ -6492,7 +6442,6 @@ pub(crate) mod test { let done = ds.ds_active.get(&next_id).unwrap(); assert!(done.acked); - assert!(ds.ackable_work().is_empty()); assert!(done.result().is_err()); } @@ -6507,37 +6456,27 @@ pub(crate) mod test { // Create a flush, enqueue it on both the downstairs // and the guest work queues. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); let response = || Ok(Default::default()); - ds.process_ds_completion( + assert!(!ds.process_ds_completion( next_id, ClientId::new(0), response(), &UpstairsState::Active, None, - ); + )); - ds.process_ds_completion( + assert!(ds.process_ds_completion( next_id, ClientId::new(2), response(), &UpstairsState::Active, None, - ); - - let ack_list = ds.ackable_work().clone(); - assert_eq!(ack_list.len(), 1); - - // Simulation of what happens in up_ds_listen - for ds_id_done in ack_list.iter() { - assert_eq!(*ds_id_done, next_id); - - ds.ack(*ds_id_done); + )); // acked! - let done = ds.ds_active.get(ds_id_done).unwrap(); - assert!(done.result().is_ok()); - } + assert!(rx.try_wait().unwrap().is_ok()); } #[test] @@ -6553,7 +6492,9 @@ pub(crate) mod test { // Create a flush, enqueue it on both the downstairs // and the guest work queues. 
- let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); + + // It should be acked when the first completion returns assert!(ds.process_ds_completion( next_id, ClientId::new(0), @@ -6561,19 +6502,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - - let ack_list = ds.ackable_work().clone(); - assert_eq!(ack_list.len(), 1); - - // Simulation of what happens in up_ds_listen - for ds_id_done in ack_list.iter() { - assert_eq!(*ds_id_done, next_id); - - ds.ack(*ds_id_done); - - let done = ds.ds_active.get(ds_id_done).unwrap(); - assert!(done.result().is_err()); - } } #[test] @@ -6587,7 +6515,8 @@ pub(crate) mod test { // Create a flush, enqueue it on both the downstairs // and the guest work queues. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); // DS 1 has a failure, and this won't return true as we don't // have enough success yet to ACK to the guest. @@ -6609,18 +6538,7 @@ pub(crate) mod test { None )); - let ack_list = ds.ackable_work().clone(); - assert_eq!(ack_list.len(), 1); - - // Simulation of what happens in up_ds_listen - for ds_id_done in ack_list.iter() { - assert_eq!(*ds_id_done, next_id); - - ds.ack(*ds_id_done); - - let done = ds.ds_active.get(ds_id_done).unwrap(); - assert!(done.result().is_err()); - } + assert!(rx.try_wait().unwrap().is_err()); } #[test] @@ -6677,7 +6595,6 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Faulted); // Verify that this work should have been fast-acked - assert_eq!(ds.ackable_work().len(), 0); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -6718,7 +6635,7 @@ pub(crate) mod test { None )); - // the operation was previously marked as ackable, because it's a write + // the operation was already fast-acked, because it's a write assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6751,7 +6668,7 @@ pub(crate) mod test { let response = || Ok(build_read_response(&[])); - // Process the operation for client 1 this should return true + // Process the operation for client 1; we send the ack here assert!(ds.process_ds_completion( next_id, ClientId::new(1), @@ -6760,7 +6677,7 @@ pub(crate) mod test { None, )); - // Process the operation for client 2 this should return false + // Process the operation for client 2, which does not ack assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6769,13 +6686,9 @@ pub(crate) mod test { None )); - // Verify we can ack this work, then ack it. - assert_eq!(ds.ackable_work().len(), 1); - ds.ack(next_id); - // Perform the flush. let next_id = { - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); // As this DS is failed, it should have been skipped assert_eq!( @@ -6796,7 +6709,7 @@ pub(crate) mod test { None, )); - // process_ds_operation should return true after we process this. + // process_ds_operation should ack after we process this. assert!(ds.process_ds_completion( next_id, ClientId::new(2), @@ -6805,13 +6718,6 @@ pub(crate) mod test { None )); - // ACK the flush and let retire_check move things along. - assert_eq!(ds.ackable_work().len(), 1); - ds.ack(next_id); - ds.retire_check(next_id); - - assert_eq!(ds.ackable_work().len(), 0); - // The write, the read, and now the flush should be completed. 
assert_eq!(ds.completed().len(), 3); @@ -6895,7 +6801,7 @@ pub(crate) mod test { let response = Ok(build_read_response(&[])); - // Process the operation for client 2, which makes the job ackable + // Process the operation for client 2, which acks the job assert!(ds.process_ds_completion( next_id, ClientId::new(2), @@ -6939,8 +6845,8 @@ pub(crate) mod test { None )); - // process_ds_operation for client 2; the job was already ackable - // (because it's a write) so this returns false + // process_ds_operation for client 2; the job was already acked (because + // it's a write) so this returns false assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6961,7 +6867,6 @@ pub(crate) mod test { // Verify this work should have been fast-acked assert!(ds.ds_active.get(&next_id).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 0); // Now, do another write. let next_id = ds.create_and_enqueue_generic_write_eob(false); @@ -6981,7 +6886,7 @@ pub(crate) mod test { )); // We don't process client 1, it had failed - // again, the job was ackable immediately + // again, the job was acked immediately because it's a write assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6991,7 +6896,6 @@ pub(crate) mod test { )); // Verify we should have fast-acked this work too assert!(ds.ds_active.get(&next_id).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 0); // One downstairs should have a skipped job on its list. assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 0); @@ -7002,7 +6906,7 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 0); // Enqueue the flush. - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!( ds.job_states(flush_id), @@ -7019,7 +6923,7 @@ pub(crate) mod test { None )); - // process_ds_operation should return true after we process client 2. + // process_ds_operation should ack after we process client 2. assert!(ds.process_ds_completion( flush_id, ClientId::new(2), @@ -7028,14 +6932,6 @@ pub(crate) mod test { None )); - // ACK all the jobs and let retire_check move things along. - assert_eq!(ds.ackable_work().len(), 1); - // first two writes should have been fast-acked - ds.ack(flush_id); - ds.retire_check(flush_id); - - assert_eq!(ds.ackable_work().len(), 0); - // The two writes and the flush should be completed. assert_eq!(ds.completed().len(), 3); @@ -7091,7 +6987,7 @@ pub(crate) mod test { None )); - // Process the operation for client 2. The job was already ackable, so + // Process the operation for client 2. 
The job was already acked, so // this returns `false` assert!(!ds.process_ds_completion( write_id, @@ -7106,9 +7002,6 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(1)].state(), DsState::Faulted); assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Active); - // This job was immediately acked - assert_eq!(ds.ackable_work().len(), 0); - // Verify the read switched from InProgress to Skipped let job = ds.ds_active.get(&read_id).unwrap(); @@ -7175,7 +7068,7 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Active); // The write was fast-acked, and the read is still going - assert!(ds.ackable_work().is_empty()); + assert!(ds.ds_active.get(&write_id).unwrap().acked); // Verify the read switched from new to skipped @@ -7233,8 +7126,9 @@ pub(crate) mod test { ); } - // The write has been fast-acked; the read is ackable - assert_eq!(ds.ackable_work().len(), 1); + // The write has been fast-acked; the read has been acked + assert!(ds.ds_active.get(&write_one).unwrap().acked); + assert!(ds.ds_active.get(&read_one).unwrap().acked); // Verify all IOs are done @@ -7254,39 +7148,35 @@ pub(crate) mod test { let err_response = Err(CrucibleError::GenericError("bad".to_string())); - // Process the operation for client 0, 1 - ds.process_ds_completion( + // Process the operation for client 0, 1, 2. It should already be acked + assert!(!ds.process_ds_completion( write_fail, ClientId::new(0), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_fail, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_fail, ClientId::new(2), err_response, &UpstairsState::Active, None, - ); + )); // Verify client states assert_eq!(ds.clients[ClientId::new(0)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(1)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Faulted); - // Only the first read remains ackable - assert_eq!(ds.ackable_work().len(), 1); - // Verify all IOs are done - for cid in ClientId::iter() { let job = ds.ds_active.get(&read_one).unwrap(); assert_eq!(job.state[cid], IOState::Done); @@ -7343,8 +7233,9 @@ pub(crate) mod test { ); } - // Verify the read can be acked (the write was fast-acked) - assert_eq!(ds.ackable_work().len(), 1); + // Verify the read has been acked (the write was fast-acked) + assert!(ds.ds_active.get(&read_one).unwrap().acked); + assert!(ds.ds_active.get(&write_one).unwrap().acked); // Verify all IOs are done @@ -7366,39 +7257,35 @@ pub(crate) mod test { let read_two = ds.create_and_enqueue_generic_read_eob(); // Process the write operation for downstairs 0, 1 - ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_fail, ClientId::new(0), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_fail, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); + )); // Have downstairs 2 return error. 
- ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_fail, ClientId::new(2), err_response, &UpstairsState::Active, None, - ); + )); // Verify client states assert_eq!(ds.clients[ClientId::new(0)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(1)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Faulted); - // The first read remains the only ackable work - assert_eq!(ds.ackable_work().len(), 1); - // Verify all IOs are done - for cid in ClientId::iter() { // First read, still Done let job = ds.ds_active.get(&read_one).unwrap(); @@ -7444,7 +7331,7 @@ pub(crate) mod test { let read_one = ds.create_and_enqueue_generic_read_eob(); // Finally, add a flush - let flush_one = ds.submit_flush(None, None); + let flush_one = ds.submit_flush(None, None, None); let job = ds.ds_active.get(&write_one).unwrap(); assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); @@ -7467,87 +7354,75 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 0); // Do the write - ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_one, ClientId::new(2), Ok(Default::default()), &UpstairsState::Active, None, - ); + )); // Make the read ok response, do the read let rr = || Ok(build_read_response(&[])); - ds.process_ds_completion( + assert!(ds.process_ds_completion( read_one, ClientId::new(1), rr(), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( read_one, ClientId::new(2), rr(), &UpstairsState::Active, None, - ); + )); + + let job = ds.ds_active.get(&read_one).unwrap(); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + assert_eq!(job.state[ClientId::new(2)], IOState::Done); + let job = ds.ds_active.get(&write_one).unwrap(); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + assert_eq!(job.state[ClientId::new(2)], IOState::Done); // Do the flush - ds.process_ds_completion( + assert!(!ds.process_ds_completion( flush_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + let job = ds.ds_active.get(&flush_one).unwrap(); + assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + assert_eq!(job.state[ClientId::new(2)], IOState::InProgress); + assert!(ds.process_ds_completion( flush_one, ClientId::new(2), Ok(Default::default()), &UpstairsState::Active, None, - ); - - // Verify three jobs can be acked (or should have been fast-acked) - assert!(ds.ds_active.get(&write_one).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 2); - - // Verify all IOs are done - - let job = ds.ds_active.get(&read_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - assert_eq!(job.state[ClientId::new(2)], IOState::Done); - let job = ds.ds_active.get(&write_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - assert_eq!(job.state[ClientId::new(2)], IOState::Done); - let job = ds.ds_active.get(&flush_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - assert_eq!(job.state[ClientId::new(2)], IOState::Done); - - ds.ack(read_one); - // write has already been fast-acked - ds.ack(flush_one); - ds.retire_check(flush_one); + )); + // The flush retires itself and preceding jobs // Skipped jobs just has the flush 
assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); assert!(ds.clients[ClientId::new(0)] .skipped_jobs .contains(&JobId(1002))); - assert_eq!(ds.ackable_work().len(), 0); // The writes, the read, and the flush should be completed. assert_eq!(ds.completed().len(), 3); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); } @@ -7570,7 +7445,8 @@ pub(crate) mod test { let read_one = ds.create_and_enqueue_generic_read_eob(); // Finally, add a flush - let flush_one = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_one = ds.submit_flush(None, Some(tx), None); let job = ds.ds_active.get(&write_one).unwrap(); assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); @@ -7593,51 +7469,42 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 3); // Do the write - ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); + )); + let job = ds.ds_active.get(&write_one).unwrap(); + assert!(job.acked); // fast-ack + assert_eq!(job.state[ClientId::new(1)], IOState::Done); // Make the read ok response, do the read let rr = Ok(build_read_response(&[])); - ds.process_ds_completion( + assert!(ds.process_ds_completion( read_one, ClientId::new(1), rr, &UpstairsState::Active, None, - ); + )); - // Do the flush - ds.process_ds_completion( + // Verify that the read IO is done + let job = ds.ds_active.get(&read_one).unwrap(); + assert!(job.acked); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + + // Do the flush, which retires other jobs + assert!(ds.process_ds_completion( flush_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - - // Verify the write should be fast-acked and the others are ackable - assert!(ds.ds_active.get(&write_one).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 2); - - // Verify all IOs are done - - let job = ds.ds_active.get(&read_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - let job = ds.ds_active.get(&write_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - let job = ds.ds_active.get(&flush_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - - ds.ack(read_one); - // write should be fast-acked - ds.ack(flush_one); - ds.retire_check(flush_one); + )); + assert!(rx.try_wait().unwrap().is_err()); // Skipped jobs now just have the flush. assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); @@ -7648,12 +7515,9 @@ pub(crate) mod test { assert!(ds.clients[ClientId::new(2)] .skipped_jobs .contains(&JobId(1002))); - assert_eq!(ds.ackable_work().len(), 0); // The writes, the read, and the flush should be completed. assert_eq!(ds.completed().len(), 3); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); } @@ -7684,21 +7548,12 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); // Verify jobs can be acked. - assert_eq!(ds.ackable_work().len(), 1); - - // Verify all IOs are done - // We are simulating what would happen here by the up_ds_listen - // task, after it receives a notification from the ds_done_tx. - - ds.ack(read_one); - - ds.retire_check(read_one); + assert!(ds.ds_active.get(&read_one).unwrap().acked); // Our skipped jobs have not yet been cleared. 
assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(1)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); - assert_eq!(ds.ackable_work().len(), 0); } #[test] @@ -7727,17 +7582,10 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(1)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); - // Verify all IOs are done - // We are simulating what would happen here by the up_ds_listen - // task, after it receives a notification from the ds_done_tx. - - ds.retire_check(write_one); // No flush, no change in skipped jobs. assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(1)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); - - assert_eq!(ds.ackable_work().len(), 0); } #[test] @@ -7754,36 +7602,19 @@ pub(crate) mod test { } // Create a flush. - let flush_one = ds.submit_flush(None, None); - let job = ds.ds_active.get(&flush_one).unwrap(); - assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); - assert_eq!(job.state[ClientId::new(1)], IOState::Skipped); - assert_eq!(job.state[ClientId::new(2)], IOState::Skipped); + let (mut rx, tx) = BlockOpWaiter::pair(); + let _ = ds.submit_flush(None, Some(tx), None); + // This job should be insta-skipped and then retired, so it's gone + assert!(rx.try_wait().unwrap().is_err()); for cid in ClientId::iter() { assert_eq!(ds.clients[cid].skipped_jobs.len(), 1); assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); } - // Verify jobs can be acked. - assert_eq!(ds.ackable_work().len(), 1); - - ds.ack(flush_one); - ds.retire_check(flush_one); - - assert_eq!(ds.ackable_work().len(), 0); - // The flush should remove all work from the ds queue. assert_eq!(ds.completed().len(), 1); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); - - // Skipped jobs still has the flush. - for cid in ClientId::iter() { - assert_eq!(ds.clients[cid].skipped_jobs.len(), 1); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); - } } #[test] @@ -7803,40 +7634,30 @@ pub(crate) mod test { // Create a read. let read_one = ds.create_and_enqueue_generic_read_eob(); + // The read should be acked because all downstairs are offline + assert!(ds.ds_active.get(&read_one).unwrap().acked); + // Create a write. let write_one = ds.create_and_enqueue_generic_write_eob(false); - // Create a flush - let flush_one = ds.submit_flush(None, None); - // Verify all jobs can be acked (or should have been fast-acked) - let write_job = ds.ds_active.get(&write_one).unwrap(); - assert!(write_job.acked); - assert_eq!(ds.ackable_work().len(), 2); + // The write should be fast-acked + assert!(ds.ds_active.get(&write_one).unwrap().acked); // Skipped jobs are not yet cleared. for cid in ClientId::iter() { - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1001))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1002))); - assert_eq!(ds.clients[cid].skipped_jobs.len(), 3); + assert!(ds.clients[cid].skipped_jobs.contains(&read_one)); + assert!(ds.clients[cid].skipped_jobs.contains(&write_one)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 2); } - // Verify all IOs are done - // We are simulating what would happen here by the up_ds_listen - // task, after it receives a notification from the ds_done_tx. 
- ds.ack(read_one); - // write has already been fast-acked - ds.ack(flush_one); - - // Don't bother with retire check for read/write, just flush - ds.retire_check(flush_one); - - assert_eq!(ds.ackable_work().len(), 0); + // Create a flush + let (mut rx, tx) = BlockOpWaiter::pair(); + let _flush_one = ds.submit_flush(None, Some(tx), None); + // The flush instantly returns an error and retires old jobs + assert!(rx.try_wait().unwrap().is_err()); // The flush should remove all work from the ds queue. assert_eq!(ds.completed().len(), 3); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); @@ -7866,41 +7687,39 @@ pub(crate) mod test { // Create a read, write, flush let read_one = ds.create_and_enqueue_generic_read_eob(); let write_one = ds.create_and_enqueue_generic_write_eob(false); - let flush_one = ds.submit_flush(None, None); - - // Create more IOs. - let _read_two = ds.create_and_enqueue_generic_read_eob(); - let _write_two = ds.create_and_enqueue_generic_write_eob(false); - let _flush_two = ds.submit_flush(None, None); - // Six jobs have been skipped. + // The read and write are skipped + acked for cid in ClientId::iter() { - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1001))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1002))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1003))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1004))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1005))); - assert_eq!(ds.clients[cid].skipped_jobs.len(), 6); + assert!(ds.clients[cid].skipped_jobs.contains(&read_one)); + assert!(ds.clients[cid].skipped_jobs.contains(&write_one)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 2); } - - // Ack the read and flush, and confirm that the write was fast-acked - ds.ack(read_one); + assert!(ds.ds_active.get(&read_one).unwrap().acked); assert!(ds.ds_active.get(&write_one).unwrap().acked); - ds.ack(flush_one); - assert_eq!(ds.ackable_work().len(), 2); - // Don't bother with retire check for read/write, just flush - ds.retire_check(flush_one); + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_one = ds.submit_flush(None, Some(tx), None); + assert!(rx.try_wait().unwrap().is_err()); + + let read_two = ds.create_and_enqueue_generic_read_eob(); + let write_two = ds.create_and_enqueue_generic_write_eob(false); - // The first two skipped jobs are now cleared and the non-acked - // jobs remain on the list, as well as the last flush. 
for cid in ClientId::iter() { - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1002))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1003))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1004))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1005))); - assert_eq!(ds.clients[cid].skipped_jobs.len(), 4); + assert!(ds.clients[cid].skipped_jobs.contains(&flush_one)); + assert!(ds.clients[cid].skipped_jobs.contains(&read_two)); + assert!(ds.clients[cid].skipped_jobs.contains(&write_two)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 3); + } + assert!(ds.ds_active.get(&read_two).unwrap().acked); + assert!(ds.ds_active.get(&write_two).unwrap().acked); + + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_two = ds.submit_flush(None, Some(tx), None); + assert!(rx.try_wait().unwrap().is_err()); + + for cid in ClientId::iter() { + assert!(ds.clients[cid].skipped_jobs.contains(&flush_two)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 1); } } @@ -8670,10 +8489,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: extent_repair_ids.close_id, - repair_id: extent_repair_ids.repair_id, - noop_id: extent_repair_ids.noop_id, - reopen_id: extent_repair_ids.reopen_id, + close_job: PendingJob::new(extent_repair_ids.close_id), + repair_job: PendingJob::new(extent_repair_ids.repair_id), + noop_job: PendingJob::new(extent_repair_ids.noop_id), + reopen_job: PendingJob::new(extent_repair_ids.reopen_id), }, }); } @@ -8806,7 +8625,7 @@ pub(crate) mod test { let read_id = ds.submit_read_block(eid, BlockOffset(0)); let write_id = ds.submit_test_write_block(eid, BlockOffset(1), false); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); let repair_ids = create_and_enqueue_repair_ops(&mut ds, eid); @@ -8923,7 +8742,7 @@ pub(crate) mod test { let eid = ExtentId(1); let repair_ids = create_and_enqueue_repair_ops(&mut ds, eid); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 5); @@ -9003,9 +8822,9 @@ pub(crate) mod test { let mut ds = Downstairs::repair_test_one_repair(); - let flush_id1 = ds.submit_flush(None, None); + let flush_id1 = ds.submit_flush(None, None, None); let repair_ids = create_and_enqueue_repair_ops(&mut ds, ExtentId(1)); - let flush_id2 = ds.submit_flush(None, None); + let flush_id2 = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 6); @@ -9038,7 +8857,7 @@ pub(crate) mod test { let repair_ids1 = create_and_enqueue_repair_ops(&mut ds, ExtentId(0)); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); let repair_ids2 = create_and_enqueue_repair_ops(&mut ds, ExtentId(1)); @@ -9531,7 +9350,7 @@ pub(crate) mod test { let read_id = ds.submit_read_block(ExtentId(0), BlockOffset(1)); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 3); @@ -9583,10 +9402,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); @@ -9649,10 +9468,10 @@ pub(crate) mod test { 
repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); @@ -9711,14 +9530,14 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 1); assert!(ds.get_deps(flush_id).is_empty()); @@ -9740,10 +9559,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1004), - repair_id: JobId(1005), - noop_id: JobId(1006), - reopen_id: JobId(1007), + close_job: PendingJob::new(JobId(1004)), + repair_job: PendingJob::new(JobId(1005)), + noop_job: PendingJob::new(JobId(1006)), + reopen_job: PendingJob::new(JobId(1007)), }, }); @@ -9804,10 +9623,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(next_id), - repair_id: JobId(next_id + 1), - noop_id: JobId(next_id + 2), - reopen_id: JobId(next_id + 3), + close_job: PendingJob::new(JobId(next_id)), + repair_job: PendingJob::new(JobId(next_id + 1)), + noop_job: PendingJob::new(JobId(next_id + 2)), + reopen_job: PendingJob::new(JobId(next_id + 3)), }, }); @@ -9826,10 +9645,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(next_id), - repair_id: JobId(next_id + 1), - noop_id: JobId(next_id + 2), - reopen_id: JobId(next_id + 3), + close_job: PendingJob::new(JobId(next_id)), + repair_job: PendingJob::new(JobId(next_id + 1)), + noop_job: PendingJob::new(JobId(next_id + 2)), + reopen_job: PendingJob::new(JobId(next_id + 3)), }, }); @@ -9972,10 +9791,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(next_id), - repair_id: JobId(next_id + 1), - noop_id: JobId(next_id + 2), - reopen_id: JobId(next_id + 3), + close_job: PendingJob::new(JobId(next_id)), + repair_job: PendingJob::new(JobId(next_id + 1)), + noop_job: PendingJob::new(JobId(next_id + 2)), + reopen_job: PendingJob::new(JobId(next_id + 3)), }, }); @@ -10228,10 +10047,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); @@ -10323,10 +10142,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: 
PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); diff --git a/upstairs/src/dummy_downstairs_tests.rs b/upstairs/src/dummy_downstairs_tests.rs index e9e277a31..fb7254fbd 100644 --- a/upstairs/src/dummy_downstairs_tests.rs +++ b/upstairs/src/dummy_downstairs_tests.rs @@ -14,7 +14,10 @@ use crate::BlockIO; use crate::Buffer; use crate::CrucibleError; use crate::DsState; -use crate::{IO_OUTSTANDING_MAX_BYTES, IO_OUTSTANDING_MAX_JOBS}; +use crate::{ + IO_CACHED_MAX_BYTES, IO_CACHED_MAX_JOBS, IO_OUTSTANDING_MAX_BYTES, + IO_OUTSTANDING_MAX_JOBS, +}; use crucible_client_types::CrucibleOpts; use crucible_common::Block; use crucible_common::BlockIndex; @@ -264,23 +267,26 @@ impl DownstairsHandle { /// # Panics /// If a non-flush message arrives pub async fn ack_flush(&mut self) -> u64 { - let Message::Flush { - job_id, - flush_number, - upstairs_id, - .. - } = self.recv().await.unwrap() - else { - panic!("saw non flush!"); - }; - self.send(Message::FlushAck { - upstairs_id, - session_id: self.upstairs_session_id.unwrap(), - job_id, - result: Ok(()), - }) - .unwrap(); - flush_number + match self.recv().await.unwrap() { + Message::Flush { + job_id, + flush_number, + upstairs_id, + .. + } => { + self.send(Message::FlushAck { + upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id, + result: Ok(()), + }) + .unwrap(); + flush_number + } + m => { + panic!("saw non flush {m:?}"); + } + } } /// Awaits a `Message::Write { .. }` and sends a `WriteAck` @@ -290,17 +296,45 @@ impl DownstairsHandle { /// # Panics /// If a non-write message arrives pub async fn ack_write(&mut self) -> JobId { - let Message::Write { header, .. } = self.recv().await.unwrap() else { - panic!("saw non write!"); - }; - self.send(Message::WriteAck { - upstairs_id: header.upstairs_id, - session_id: self.upstairs_session_id.unwrap(), - job_id: header.job_id, - result: Ok(()), - }) - .unwrap(); - header.job_id + match self.recv().await.unwrap() { + Message::Write { header, .. } => { + self.send(Message::WriteAck { + upstairs_id: header.upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id: header.job_id, + result: Ok(()), + }) + .unwrap(); + header.job_id + } + m => panic!("saw non write: {m:?}"), + } + } + + /// Awaits a `Message::Barrier { .. }` and sends a `BarrierAck` + /// + /// Returns the job ID for further checks. + /// + /// # Panics + /// If a non-write message arrives + pub async fn ack_barrier(&mut self) -> JobId { + match self.recv().await.unwrap() { + Message::Barrier { + upstairs_id, + job_id, + .. + } => { + self.send(Message::BarrierAck { + upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id, + result: Ok(()), + }) + .unwrap(); + job_id + } + m => panic!("saw non barrier: {m:?}"), + } } /// Awaits a `Message::Read` and sends a blank `ReadResponse` @@ -310,26 +344,27 @@ impl DownstairsHandle { /// # Panics /// If a non-read message arrives pub async fn ack_read(&mut self) -> JobId { - let Message::ReadRequest { - job_id, - upstairs_id, - .. 
- } = self.recv().await.unwrap() - else { - panic!("saw non write!"); - }; - let (block, data) = make_blank_read_response(); - self.send(Message::ReadResponse { - header: ReadResponseHeader { - upstairs_id, - session_id: self.upstairs_session_id.unwrap(), + match self.recv().await.unwrap() { + Message::ReadRequest { job_id, - blocks: Ok(vec![block]), - }, - data: data.clone(), - }) - .unwrap(); - job_id + upstairs_id, + .. + } => { + let (block, data) = make_blank_read_response(); + self.send(Message::ReadResponse { + header: ReadResponseHeader { + upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id, + blocks: Ok(vec![block]), + }, + data: data.clone(), + }) + .unwrap(); + job_id + } + m => panic!("saw non read {m:?}"), + } } } @@ -471,14 +506,31 @@ pub struct TestHarness { /// Number of extents in `TestHarness::default_config` const DEFAULT_EXTENT_COUNT: u32 = 25; +const DEFAULT_BLOCK_COUNT: u64 = 10; + +struct TestOpts { + flush_timeout: f32, + read_only: bool, + disable_backpressure: bool, +} impl TestHarness { pub async fn new() -> TestHarness { - Self::new_(false).await + Self::new_with_opts(TestOpts { + flush_timeout: 86400.0, + read_only: false, + disable_backpressure: true, + }) + .await } pub async fn new_ro() -> TestHarness { - Self::new_(true).await + Self::new_with_opts(TestOpts { + flush_timeout: 86400.0, + read_only: true, + disable_backpressure: true, + }) + .await } pub fn ds1(&mut self) -> &mut DownstairsHandle { @@ -494,7 +546,7 @@ impl TestHarness { // IO_OUTSTANDING_MAX_BYTES in less than IO_OUTSTANDING_MAX_JOBS, // i.e. letting us test both byte and job fault conditions. extent_count: DEFAULT_EXTENT_COUNT, - extent_size: Block::new_512(10), + extent_size: Block::new_512(DEFAULT_BLOCK_COUNT), gen_numbers: vec![0u64; DEFAULT_EXTENT_COUNT as usize], flush_numbers: vec![0u64; DEFAULT_EXTENT_COUNT as usize], @@ -502,10 +554,10 @@ impl TestHarness { } } - async fn new_(read_only: bool) -> TestHarness { + async fn new_with_opts(opts: TestOpts) -> TestHarness { let log = csl(); - let cfg = Self::default_config(read_only); + let cfg = Self::default_config(opts.read_only); let ds1 = cfg.clone().start(log.new(o!("downstairs" => 1))).await; let ds2 = cfg.clone().start(log.new(o!("downstairs" => 2))).await; @@ -514,15 +566,16 @@ impl TestHarness { // Configure our guest without backpressure, to speed up tests which // require triggering a timeout let (g, mut io) = Guest::new(Some(log.clone())); - io.disable_queue_backpressure(); - io.disable_byte_backpressure(); + if opts.disable_backpressure { + io.disable_backpressure(); + } let guest = Arc::new(g); let crucible_opts = CrucibleOpts { id: Uuid::new_v4(), target: vec![ds1.local_addr, ds2.local_addr, ds3.local_addr], - flush_timeout: Some(86400.0), - read_only, + flush_timeout: Some(opts.flush_timeout), + read_only: opts.read_only, ..Default::default() }; @@ -1446,11 +1499,12 @@ async fn test_byte_fault_condition() { // out. 
const WRITE_SIZE: usize = 105 * 1024; // 105 KiB let write_buf = BytesMut::from(vec![1; WRITE_SIZE].as_slice()); // 50 KiB + let barrier_point = IO_CACHED_MAX_BYTES as usize / write_buf.len(); let num_jobs = IO_OUTSTANDING_MAX_BYTES as usize / write_buf.len() + 10; assert!(num_jobs < IO_OUTSTANDING_MAX_JOBS); // First, we'll send jobs until the timeout - for _ in 0..num_jobs { + for i in 0..num_jobs { // We must `spawn` here because `write` will wait for the response // to come back before returning let write_buf = write_buf.clone(); @@ -1458,11 +1512,11 @@ async fn test_byte_fault_condition() { guest.write(BlockIndex(0), write_buf).await.unwrap(); }); - // Before we're kicked out, assert we're seeing the read requests - assert!(matches!( - harness.ds1().recv().await.unwrap(), - Message::Write { .. }, - )); + // Before we're kicked out, assert we're seeing the write requests + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::Write { .. },) { + panic!("got unexpected message {m:?}"); + } harness.ds2.ack_write().await; harness.ds3.ack_write().await; @@ -1474,6 +1528,17 @@ async fn test_byte_fault_condition() { assert_eq!(ds[ClientId::new(0)], DsState::Active); assert_eq!(ds[ClientId::new(1)], DsState::Active); assert_eq!(ds[ClientId::new(2)], DsState::Active); + + // Once one of the Downstairs has cached more that a certain amount of + // bytes, it will automatically send a Barrier operation. + if i == barrier_point { + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::Barrier { .. }) { + panic!("expected Barrier, got message {m:?}"); + } + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } } // Sleep until we're confident that the Downstairs is kicked out @@ -1605,6 +1670,13 @@ async fn test_byte_fault_condition_offline() { assert_eq!(ds[ClientId::new(1)], DsState::Active); assert_eq!(ds[ClientId::new(2)], DsState::Active); } + + // If we've sent IO_CACHED_MAX_BYTES, then we expect the Upstairs to + // insert a Barrier (because it's trying to clean out finished jobs). + if IO_CACHED_MAX_BYTES as usize / WRITE_SIZE == i { + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } } // Confirm that the system comes up after live-repair @@ -1760,7 +1832,7 @@ async fn test_job_fault_condition() { // timeout, so that when timeout hits, the downstairs will become Faulted // instead of Offline. let num_jobs = IO_OUTSTANDING_MAX_JOBS + 200; - for _ in 0..num_jobs { + for i in 0..num_jobs { // We must `spawn` here because `write` will wait for the response to // come back before returning let h = harness.spawn(|guest| async move { @@ -1769,10 +1841,10 @@ async fn test_job_fault_condition() { }); // DS1 should be receiving messages - assert!(matches!( - harness.ds1().recv().await.unwrap(), - Message::ReadRequest { .. }, - )); + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::ReadRequest { .. },) { + panic!("got unexpected message {m:?}"); + } // Respond with read responses for downstairs 2 and 3 harness.ds2.ack_read().await; @@ -1786,6 +1858,18 @@ async fn test_job_fault_condition() { assert_eq!(ds[ClientId::new(0)], DsState::Active); assert_eq!(ds[ClientId::new(1)], DsState::Active); assert_eq!(ds[ClientId::new(2)], DsState::Active); + + // When we hit IO_CACHED_MAX_JOBS, the Upstairs will attempt to send a + // barrier to retire jobs. The barrier won't succeed, because we're not + // acking it on DS1. 
+ if i + 1 == IO_CACHED_MAX_JOBS as usize { + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::Barrier { .. }) { + panic!("expected Barrier, got message {m:?}"); + } + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } } // Sleep until we're confident that the Downstairs is kicked out @@ -1888,6 +1972,7 @@ async fn test_job_fault_condition_offline() { // transition it to `Faulted` by sending it enough to hit // `IO_OUTSTANDING_MAX_JOBS` info!(harness.log, "sending more jobs to fault DS1"); + let mut barrier_count = 0; for i in num_jobs..IO_OUTSTANDING_MAX_JOBS + 200 { let h = harness.spawn(|guest| async move { let mut buffer = Buffer::new(1, 512); @@ -1910,8 +1995,17 @@ async fn test_job_fault_condition_offline() { // the Upstairs has finished updating its state). h.await.unwrap(); + // If we've sent IO_CACHED_MAX_JOBS, then we expect the Upstairs to + // insert a Barrier (because it's trying to clean out finished jobs). + if i + 1 == IO_CACHED_MAX_JOBS as usize { + // nothing arrives on DS1, because it's offline + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + barrier_count += 1; + } + let ds = harness.guest.downstairs_state().await.unwrap(); - if i < IO_OUTSTANDING_MAX_JOBS { + if i + barrier_count < IO_OUTSTANDING_MAX_JOBS { // At this point, we should still be offline assert_eq!(ds[ClientId::new(0)], DsState::Offline); assert_eq!(ds[ClientId::new(1)], DsState::Active); @@ -2716,3 +2810,83 @@ async fn test_no_send_offline() { e => panic!("invalid message {e:?}; expected Write"), } } + +/// Test that barrier operations are sent periodically +#[tokio::test] +async fn test_jobs_based_barrier() { + let mut harness = TestHarness::new_with_opts(TestOpts { + flush_timeout: 0.5, + read_only: false, + disable_backpressure: false, + }) + .await; + + for i in 1..IO_CACHED_MAX_JOBS * 3 { + // Send a write, which will succeed + let write_handle = harness.spawn(|guest| async move { + let mut data = BytesMut::new(); + data.resize(512, 1u8); + guest.write(BlockIndex(0), data).await.unwrap(); + }); + + // Ensure that all three clients got the write request + harness.ds1().ack_write().await; + harness.ds2.ack_write().await; + harness.ds3.ack_write().await; + + write_handle.await.unwrap(); + + if i % IO_CACHED_MAX_JOBS == 0 { + harness.ds1().ack_barrier().await; + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } + } + + // We should also automatically send a flush here + harness.ds1().ack_flush().await; + harness.ds2.ack_flush().await; + harness.ds3.ack_flush().await; +} + +/// Test that barrier operations are sent periodically +#[tokio::test] +async fn test_bytes_based_barrier() { + let mut harness = TestHarness::new_with_opts(TestOpts { + flush_timeout: 0.5, + read_only: false, + disable_backpressure: false, + }) + .await; + + const WRITE_SIZE: usize = 105 * 1024; // 105 KiB + let write_buf = BytesMut::from(vec![1; WRITE_SIZE].as_slice()); // 50 KiB + let barrier_point = + (IO_CACHED_MAX_BYTES as usize).div_ceil(write_buf.len()); + + for i in 1..barrier_point * 3 + 10 { + // Send a write, which will succeed + let write_buf = write_buf.clone(); + let write_handle = harness.spawn(|guest| async move { + guest.write(BlockIndex(0), write_buf).await.unwrap(); + }); + + // Ensure that all three clients got the write request + harness.ds1().ack_write().await; + harness.ds2.ack_write().await; + harness.ds3.ack_write().await; + + write_handle.await.unwrap(); + + if i % barrier_point == 0 { + 
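            // Editorial note (not part of the diff): every `barrier_point`
            // writes the cached-byte threshold (IO_CACHED_MAX_BYTES) is
            // crossed again, so the Upstairs should insert another Barrier;
            // all three Downstairs are healthy here, so all three ack it.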
harness.ds1().ack_barrier().await; + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } + } + + // We should also automatically send a flush here + harness.ds1().ack_flush().await; + harness.ds2.ack_flush().await; + harness.ds3.ack_flush().await; +} diff --git a/upstairs/src/guest.rs b/upstairs/src/guest.rs index 077f4c676..da14210fa 100644 --- a/upstairs/src/guest.rs +++ b/upstairs/src/guest.rs @@ -2,13 +2,10 @@ use std::{ net::SocketAddr, sync::atomic::{AtomicU64, Ordering}, - time::Duration, }; use crate::{ - backpressure::{ - BackpressureAmount, BackpressureConfig, SharedBackpressureAmount, - }, + io_limits::{IOLimitView, IOLimits}, BlockIO, BlockOp, BlockOpWaiter, BlockRes, Buffer, ReadBlockContext, ReplaceResult, UpstairsAction, }; @@ -18,8 +15,8 @@ use crucible_protocol::SnapshotDetails; use async_trait::async_trait; use bytes::BytesMut; -use slog::{error, info, warn, Logger}; -use tokio::sync::{mpsc, Mutex}; +use slog::{info, warn, Logger}; +use tokio::sync::mpsc; use tracing::{instrument, span, Level}; use uuid::Uuid; @@ -110,19 +107,8 @@ pub struct Guest { /// it can be read from a `&self` reference. block_size: AtomicU64, - /// Backpressure is implemented as a delay on host write operations - /// - /// It is stored in an `Arc` so that the `GuestIoHandle` can update it from - /// the IO task. - backpressure: SharedBackpressureAmount, - - /// Lock held during backpressure delay - /// - /// Without this lock, multiple tasks could submit jobs to the upstairs and - /// wait in parallel, which defeats the purpose of backpressure (since you - /// could send arbitrarily many jobs at high speed by sending them from - /// different tasks). - backpressure_lock: Mutex<()>, + /// View into global IO limits + io_limits: IOLimitView, /// Logger for the guest log: Logger, @@ -150,21 +136,31 @@ impl Guest { // time spent waiting for the queue versus time spent in Upstairs code). let (req_tx, req_rx) = mpsc::channel(500); - let backpressure = SharedBackpressureAmount::new(); + // We have to set limits above `IO_OUTSTANDING_MAX_JOBS/BYTES`: + // an `Offline` downstairs must hit that threshold to transition to + // `Faulted`, so we can't be IO-limited before that point. + let io_limits = IOLimits::new( + crate::IO_OUTSTANDING_MAX_JOBS * 3 / 2, + crate::IO_OUTSTANDING_MAX_BYTES as usize * 3 / 2, + ); + let io_limits_view = io_limits.view(); + let io = GuestIoHandle { req_rx, - backpressure: backpressure.clone(), - backpressure_config: BackpressureConfig::default(), + io_limits, + + #[cfg(test)] + disable_backpressure: false, + log: log.clone(), }; let guest = Guest { req_tx, block_size: AtomicU64::new(0), + io_limits: io_limits_view, - backpressure, - backpressure_lock: Mutex::new(()), log, }; (guest, io) @@ -198,28 +194,6 @@ impl Guest { rx.wait().await } - /// Sleeps for a backpressure-dependent amount, holding the lock - /// - /// If backpressure is saturated, logs and returns an error. 
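// Editorial note (not part of the diff): the removed `backpressure_sleep`
// below is replaced by claiming an `IOLimitGuard` before each submission, as
// the Write/WriteUnwritten/Flush paths later in this file now do:
//
//     let io_guard =
//         self.io_limits.claim(buf.len() as u32).await.map_err(|e| {
//             CrucibleError::IoError(format!(
//                 "could not get IO guard for Write: {e:?}"
//             ))
//         })?;
//
// The permits are held by the guard and released when it is dropped, so a
// slow Downstairs throttles the guest by exhausting permits rather than
// through an explicit per-write delay.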
- async fn backpressure_sleep(&self) -> Result<(), CrucibleError> { - let bp = self.backpressure.load(); - match bp { - BackpressureAmount::Saturated => { - let err = "write queue is saturated"; - error!(self.log, "{err}"); - Err(CrucibleError::IoError(err.to_owned())) - } - BackpressureAmount::Duration(d) => { - if d > Duration::ZERO { - let _guard = self.backpressure_lock.lock().await; - tokio::time::sleep(d).await; - drop(_guard); - } - Ok(()) - } - } - } - #[cfg(test)] pub async fn downstairs_state( &self, @@ -353,11 +327,20 @@ impl BlockIO for Guest { assert_eq!(chunk.len() as u64 % bs, 0); let offset_change = chunk.len() as u64 / bs; + let io_guard = + self.io_limits.claim(chunk.len() as u32).await.map_err( + |e| { + CrucibleError::IoError(format!( + "could not get IO guard for Read: {e:?}" + )) + }, + )?; let (rx, done) = BlockOpWaiter::pair(); let rio = BlockOp::Read { offset, data: chunk, done, + io_guard, }; // Our return value always includes the buffer, so we can splice it @@ -405,9 +388,8 @@ impl BlockIO for Guest { } // We split writes into chunks to bound the maximum (typical) latency of - // any single `BlockOp::Write`. Otherwise, the host could send writes - // which are large enough that our maximum backpressure delay wouldn't - // compensate for them. + // any single `BlockOp::Write`. This makes the system's performance + // characteristics easier to think about. const MDTS: usize = 1024 * 1024; // 1 MiB while !data.is_empty() { @@ -415,13 +397,19 @@ impl BlockIO for Guest { assert_eq!(buf.len() as u64 % bs, 0); let offset_change = buf.len() as u64 / bs; - self.backpressure_sleep().await?; + let io_guard = + self.io_limits.claim(buf.len() as u32).await.map_err(|e| { + CrucibleError::IoError(format!( + "could not get IO guard for Write: {e:?}" + )) + })?; let reply = self .send_and_wait(|done| BlockOp::Write { offset, data: buf, done, + io_guard, }) .await; reply?; @@ -442,11 +430,17 @@ impl BlockIO for Guest { return Ok(()); } - self.backpressure_sleep().await?; + let io_guard = + self.io_limits.claim(data.len() as u32).await.map_err(|e| { + CrucibleError::IoError(format!( + "could not get IO guard for WriteUnwritten: {e:?}" + )) + })?; self.send_and_wait(|done| BlockOp::WriteUnwritten { offset, data, done, + io_guard, }) .await } @@ -455,9 +449,15 @@ impl BlockIO for Guest { &self, snapshot_details: Option, ) -> Result<(), CrucibleError> { + let io_guard = self.io_limits.claim(0).await.map_err(|e| { + CrucibleError::IoError(format!( + "could not get IO guard for flush: {e:?}" + )) + })?; self.send_and_wait(|done| BlockOp::Flush { snapshot_details, done, + io_guard, }) .await } @@ -514,14 +514,15 @@ pub struct GuestIoHandle { /// Queue to receive new blockreqs req_rx: mpsc::Receiver, - /// Current backpressure (shared with the `Guest`) - backpressure: SharedBackpressureAmount, - - /// Backpressure configuration, as a starting point and max delay - backpressure_config: BackpressureConfig, + /// IO limiting (shared with the `Guest`) + io_limits: IOLimits, /// Log handle, mainly to pass it into the [`Upstairs`] pub log: Logger, + + /// Flag to disable backpressure during unit tests + #[cfg(test)] + disable_backpressure: bool, } impl GuestIoHandle { @@ -536,29 +537,17 @@ impl GuestIoHandle { } #[cfg(test)] - pub fn disable_queue_backpressure(&mut self) { - self.backpressure_config.queue.delay_scale = Duration::ZERO; + pub fn disable_backpressure(&mut self) { + self.disable_backpressure = true; } #[cfg(test)] - pub fn disable_byte_backpressure(&mut self) { - 
self.backpressure_config.bytes.delay_scale = Duration::ZERO; - } - - #[cfg(test)] - pub fn is_queue_backpressure_disabled(&self) -> bool { - self.backpressure_config.queue.delay_scale == Duration::ZERO - } - - /// Set `self.backpressure` based on outstanding IO ratio - pub fn set_backpressure(&self, bytes: u64, jobs: u64) { - let bp = self.backpressure_config.get_backpressure(bytes, jobs); - self.backpressure.store(bp); + pub fn is_backpressure_disabled(&self) -> bool { + self.disable_backpressure } - /// Looks up current backpressure - pub fn get_backpressure(&self) -> BackpressureAmount { - self.backpressure.load() + pub(crate) fn io_limits(&self) -> &IOLimits { + &self.io_limits } } diff --git a/upstairs/src/io_limits.rs b/upstairs/src/io_limits.rs new file mode 100644 index 000000000..14958f2a2 --- /dev/null +++ b/upstairs/src/io_limits.rs @@ -0,0 +1,158 @@ +// Copyright 2024 Oxide Computer Company +use crate::{ClientData, ClientId, ClientMap}; +use std::sync::Arc; +use tokio::sync::{ + AcquireError, OwnedSemaphorePermit, Semaphore, TryAcquireError, +}; + +/// Per-client IO limits +#[derive(Clone, Debug)] +pub struct ClientIOLimits { + /// Semaphore to claim IO bytes on behalf of a job + io_bytes: Arc, + + /// Semaphore to claim individual IO jobs + jobs: Arc, +} + +impl ClientIOLimits { + /// Builds a new `ClientIOLimits` object with the given limits + pub fn new(max_jobs: usize, max_io_bytes: usize) -> Self { + ClientIOLimits { + io_bytes: Semaphore::new(max_io_bytes).into(), + jobs: Semaphore::new(max_jobs).into(), + } + } + + /// Claims a certain number of bytes (and one job) + /// + /// This function waits until the given resources are available; as such, it + /// should not be called from the same task which is processing jobs (since + /// that could create a deadlock). + async fn claim( + &self, + bytes: u32, + ) -> Result { + let io_bytes = self.io_bytes.clone().acquire_many_owned(bytes).await?; + let jobs = self.jobs.clone().acquire_owned().await?; + Ok(ClientIOLimitGuard { io_bytes, jobs }) + } + + /// Tries to claim a certain number of bytes (and one job) + pub fn try_claim( + &self, + bytes: u32, + ) -> Result { + let io_bytes = self.io_bytes.clone().try_acquire_many_owned(bytes)?; + let jobs = self.jobs.clone().try_acquire_owned()?; + Ok(ClientIOLimitGuard { io_bytes, jobs }) + } +} + +/// Read-write handle for IO limits across all 3x clients +#[derive(Clone, Debug)] +pub struct IOLimits(ClientData); + +impl IOLimits { + /// Builds a new set of IO limits + pub fn new(max_jobs: usize, max_io_bytes: usize) -> Self { + Self(ClientData::from_fn(|_i| { + ClientIOLimits::new(max_jobs, max_io_bytes) + })) + } + + /// Returns a per-client IO limit handle + /// + /// This handle shares permits with the parent + pub fn client_limits(&self, i: ClientId) -> ClientIOLimits { + self.0[i].clone() + } + + /// Returns a view handle for the IO limits + pub fn view(&self) -> IOLimitView { + IOLimitView(self.clone()) + } + + /// Try to claim some number of bytes (and one job) + /// + /// Returns `Err((ClientId, e))` if any of the claims fail + pub fn try_claim( + &self, + bytes: u32, + ) -> Result { + let mut out = ClientData::from_fn(|_| None); + for i in ClientId::iter() { + out[i] = Some(self.0[i].try_claim(bytes).map_err(|e| (i, e))?); + } + + Ok(IOLimitGuard(out.map(Option::unwrap))) + } +} + +/// View into global IO limits +/// +/// This is equivalent to an [`IOLimits`], but exposes a different API. 
It +/// should be owned by a separate task, to avoid deadlocks when trying to claim +/// resources. +#[derive(Clone, Debug)] +pub struct IOLimitView(IOLimits); + +impl IOLimitView { + /// Claim some number of bytes (and one job) + /// + /// This function waits until the given resources are available; as such, it + /// should not be called from the same task which is processing jobs (since + /// that could create a deadlock). + pub async fn claim( + &self, + bytes: u32, + ) -> Result { + let mut out = ClientData::from_fn(|_| None); + let lim = &self.0; + for i in ClientId::iter() { + out[i] = Some(lim.0[i].claim(bytes).await?); + } + Ok(IOLimitGuard(out.map(Option::unwrap))) + } +} + +//////////////////////////////////////////////////////////////////////////////// + +/// Handle owning some amount of per-client IO +/// +/// The IO permits are released when this handle is dropped +#[derive(Debug)] +pub struct ClientIOLimitGuard { + #[expect(unused)] + io_bytes: OwnedSemaphorePermit, + #[expect(unused)] + jobs: OwnedSemaphorePermit, +} + +impl ClientIOLimitGuard { + #[cfg(test)] + pub fn dummy() -> Self { + let a = Arc::new(Semaphore::new(1)); + let b = Arc::new(Semaphore::new(1)); + let io_bytes = a.try_acquire_owned().unwrap(); + let jobs = b.try_acquire_owned().unwrap(); + ClientIOLimitGuard { io_bytes, jobs } + } +} + +/// Handle which stores IO limit guards for all 3x clients +#[derive(Debug)] +pub struct IOLimitGuard(ClientData); + +impl From for ClientMap { + fn from(i: IOLimitGuard) -> Self { + i.0.into() + } +} + +impl IOLimitGuard { + #[cfg(test)] + pub fn dummy() -> Self { + Self(ClientData::from_fn(|_i| ClientIOLimitGuard::dummy())) + } +} diff --git a/upstairs/src/lib.rs b/upstairs/src/lib.rs index 02df2783d..4e798cfce 100644 --- a/upstairs/src/lib.rs +++ b/upstairs/src/lib.rs @@ -48,9 +48,6 @@ pub use in_memory::InMemoryBlockIO; pub mod block_io; pub use block_io::{FileBlockIO, ReqwestBlockIO}; -pub(crate) mod backpressure; -use backpressure::BackpressureGuard; - pub mod block_req; pub(crate) use block_req::{BlockOpWaiter, BlockRes}; @@ -86,17 +83,32 @@ mod downstairs; mod upstairs; use upstairs::{UpCounters, UpstairsAction}; +mod io_limits; +use io_limits::IOLimitGuard; + /// Max number of write bytes between the upstairs and an offline downstairs /// -/// If we exceed this value, the upstairs will give -/// up and mark the offline downstairs as faulted. -const IO_OUTSTANDING_MAX_BYTES: u64 = 1024 * 1024 * 1024; // 1 GiB +/// If we exceed this value, the upstairs will give up and mark the offline +/// downstairs as faulted. +const IO_OUTSTANDING_MAX_BYTES: u64 = 50 * 1024 * 1024; // 50 MiB /// Max number of outstanding IOs between the upstairs and an offline downstairs /// /// If we exceed this value, the upstairs will give up and mark that offline /// downstairs as faulted. -pub const IO_OUTSTANDING_MAX_JOBS: usize = 10000; +pub const IO_OUTSTANDING_MAX_JOBS: usize = 1000; + +/// Maximum of bytes to cache from complete (but un-flushed) IO +/// +/// Caching complete jobs allows us to replay them if a Downstairs goes offline +/// them comes back. +const IO_CACHED_MAX_BYTES: u64 = 1024 * 1024 * 1024; // 1 GiB + +/// Maximum of jobs to cache from complete (but un-flushed) IO +/// +/// Caching complete jobs allows us to replay them if a Downstairs goes offline +/// them comes back. 
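// Editorial note (not part of the diff): these thresholds interact with the
// guest-side IO limits set up in guest.rs, which are sized at 3/2 of the
// IO_OUTSTANDING_MAX_* values -- i.e. 1500 jobs and 75 MiB with the numbers
// in this change -- so an Offline downstairs can still accumulate enough
// outstanding work to cross IO_OUTSTANDING_MAX_BYTES/JOBS and be marked
// Faulted before the guest itself is blocked by the limiter.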
+const IO_CACHED_MAX_JOBS: u64 = 10000; /// The BlockIO trait behaves like a physical NVMe disk (or a virtio virtual /// disk): there is no contract about what order operations that are submitted @@ -436,6 +448,11 @@ impl ClientData { ]) } + /// Builds a new `ClientData` by applying a function to each item + pub fn map U>(self, f: F) -> ClientData { + ClientData(self.0.map(f)) + } + #[cfg(test)] pub fn get(&self) -> &[T; 3] { &self.0 @@ -914,11 +931,7 @@ struct DownstairsIO { /// consistency checking with subsequent replies. data: Option, - /// Handle for this job's contribution to guest backpressure - /// - /// Each of these guard handles will automatically decrement the - /// backpressure count for their respective Downstairs when dropped. - backpressure_guard: ClientMap, + io_limits: ClientMap, } impl DownstairsIO { @@ -1275,6 +1288,11 @@ impl IOop { // the downstairs to act based on that. true } + IOop::Barrier { .. } => { + // The Barrier IOop doesn't actually touch any extents; it's + // purely for dependency management. + true + } _ => { panic!("Unsupported IO check {:?}", self); } @@ -1298,16 +1316,6 @@ impl IOop { _ => 0, } } - - /// Returns the number of bytes written - fn write_bytes(&self) -> u64 { - match &self { - IOop::Write { data, .. } | IOop::WriteUnwritten { data, .. } => { - data.len() as u64 - } - _ => 0, - } - } } #[allow(clippy::derive_partial_eq_without_eq)] @@ -1485,20 +1493,24 @@ pub(crate) enum BlockOp { offset: BlockIndex, data: Buffer, done: BlockRes, + io_guard: IOLimitGuard, }, Write { offset: BlockIndex, data: BytesMut, done: BlockRes, + io_guard: IOLimitGuard, }, WriteUnwritten { offset: BlockIndex, data: BytesMut, done: BlockRes, + io_guard: IOLimitGuard, }, Flush { snapshot_details: Option, done: BlockRes, + io_guard: IOLimitGuard, }, GoActive { done: BlockRes, @@ -1567,8 +1579,6 @@ pub struct Arg { pub up_counters: UpCounters, /// Next JobID pub next_job_id: JobId, - /// Backpressure value - pub up_backpressure: u64, /// Jobs on the downstairs work queue. 
pub ds_count: u32, /// Number of write bytes in flight @@ -1647,7 +1657,7 @@ pub fn up_main( }; #[cfg(test)] - let disable_backpressure = guest.is_queue_backpressure_disabled(); + let disable_backpressure = guest.is_backpressure_disabled(); /* * Build the Upstairs struct that we use to share data between @@ -1658,7 +1668,7 @@ pub fn up_main( #[cfg(test)] if disable_backpressure { - up.disable_client_backpressure(); + up.downstairs.disable_client_backpressure(); } if let Some(pr) = producer_registry { diff --git a/upstairs/src/upstairs.rs b/upstairs/src/upstairs.rs index 884143e5d..58ce09638 100644 --- a/upstairs/src/upstairs.rs +++ b/upstairs/src/upstairs.rs @@ -10,6 +10,7 @@ use crate::{ }, downstairs::{Downstairs, DownstairsAction}, extent_from_offset, + io_limits::IOLimitGuard, stats::UpStatOuter, BlockOp, BlockRes, Buffer, ClientId, ClientMap, CrucibleOpts, DsState, EncryptionContext, GuestIoHandle, Message, RegionDefinition, @@ -169,10 +170,9 @@ impl UpCounters { /// /// For example, we _always_ do things like /// - Send all pending IO to the client work tasks -/// - Ack all ackable jobs to the guest /// - Step through the live-repair state machine (if it's running) /// - Check for client-side deactivation (if it's pending) -/// - Set backpressure time in the guest +/// - Set backpressure time in the clients /// /// Keeping the `Upstairs` "clean" through this invariant maintenance makes it /// easier to think about its state, because it's guaranteed to be clean when we @@ -206,12 +206,11 @@ pub(crate) struct Upstairs { /// Marks whether a flush is needed /// - /// The Upstairs keeps all IOs in memory until a flush is ACK'd back from - /// all three downstairs. If there are IOs we have accepted into the work - /// queue that don't end with a flush, then we set this to indicate that the - /// upstairs may need to issue a flush of its own to be sure that data is - /// pushed to disk. Note that this is not an indication of an ACK'd flush, - /// just that the last IO command we put on the work queue was not a flush. + /// If there are IOs we have accepted into the work queue that don't end + /// with a flush, then we set this to indicate that the upstairs may need to + /// issue a flush of its own to be sure that data is pushed to disk. Note + /// that this is not an indication of an ACK'd flush, just that the last IO + /// command we put on the work queue was not a flush. need_flush: bool, /// Statistics for this upstairs @@ -390,6 +389,7 @@ impl Upstairs { ds_target, tls_context, stats.ds_stats(), + guest.io_limits(), log.new(o!("" => "downstairs")), ); let flush_timeout_secs = opt.flush_timeout.unwrap_or(0.5); @@ -424,11 +424,6 @@ impl Upstairs { } } - #[cfg(test)] - pub(crate) fn disable_client_backpressure(&mut self) { - self.downstairs.disable_client_backpressure(); - } - /// Build an Upstairs for simple tests #[cfg(test)] pub fn test_default(ddef: Option) -> Self { @@ -531,6 +526,10 @@ impl Upstairs { /// Apply an action returned from [`Upstairs::select`] pub(crate) fn apply(&mut self, action: UpstairsAction) { + // Check whether the downstairs has live jobs before performing the + // action, because the action may cause it to retire live jobs. 
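        // Editorial sketch of the resulting `apply` flow (paraphrasing the
        // code in this hunk, not part of the diff):
        //
        //     let has_jobs = self.downstairs.has_live_jobs();  // sample first
        //     /* ...handle the action... */
        //     self.gone_too_long();
        //     if self.downstairs.needs_barrier() {
        //         self.submit_barrier() // retire complete-but-unflushed jobs
        //     }
        //     self.downstairs.check_and_continue_live_repair(&self.state);
        //     /* ...deactivation / invariant checks... */
        //     self.downstairs.set_client_backpressure();
        //     if has_jobs {
        //         // reset last, so slow handling above (e.g. in debug
        //         // builds) doesn't leave the flush deadline already expired
        //         self.flush_deadline = Instant::now() + self.flush_interval;
        //     }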
+ let has_jobs = self.downstairs.has_live_jobs(); + match action { UpstairsAction::Downstairs(d) => { self.counters.action_downstairs += 1; @@ -567,7 +566,8 @@ impl Upstairs { .counters .action_flush_check)); if self.need_flush { - self.submit_flush(None, None); + let io_guard = self.try_acquire_io(0); + self.submit_flush(None, None, io_guard); } self.flush_deadline = Instant::now() + self.flush_interval; } @@ -603,17 +603,15 @@ impl Upstairs { // because too many jobs have piled up. self.gone_too_long(); + // Check whether we need to send a Barrier operation to clean out + // complete-but-unflushed jobs. + if self.downstairs.needs_barrier() { + self.submit_barrier() + } + // Check to see whether live-repair can continue - // - // This must be called before acking jobs, because it looks in - // `Downstairs::ackable_jobs` to see which jobs are done. self.downstairs.check_and_continue_live_repair(&self.state); - // Handle any jobs that have become ready for acks - if self.downstairs.has_ackable_jobs() { - self.downstairs.ack_jobs() - } - // Check for client-side deactivation if matches!(&self.state, UpstairsState::Deactivating(..)) { info!(self.log, "checking for deactivation"); @@ -652,7 +650,41 @@ impl Upstairs { // For now, check backpressure after every event. We may want to make // this more nuanced in the future. - self.set_backpressure(); + self.downstairs.set_client_backpressure(); + + // We do this last because some of the code above can be slow + // (especially during debug builds), and we don't want to set our flush + // deadline such that it fires immediately. + if has_jobs { + self.flush_deadline = Instant::now() + self.flush_interval; + } + } + + /// Attempts to acquire permits to perform an IO job with the given bytes + /// + /// Upon failure, logs an error and returns `None`. + /// + /// This function is used by messages generated internally to the Upstairs + /// for best-effort IO limiting. If the message would exceed our available + /// permits, it's still allowed (because to do otherwise would deadlock the + /// upstairs task). In other words, internally generated messages can limit + /// guest IO work, but not the other way around + fn try_acquire_io(&self, bytes: usize) -> Option { + let Ok(bytes) = u32::try_from(bytes) else { + warn!(self.log, "too many bytes for try_acquire_io"); + return None; + }; + match self.guest.io_limits().try_claim(bytes) { + Ok(v) => Some(v), + Err((i, e)) => { + warn!( + self.log, + "could not apply IO limits to upstairs work: \ + client {i} returned {e:?}" + ); + None + } + } } /// Helper function to await all deferred block requests @@ -710,7 +742,6 @@ impl Upstairs { up_count: self.downstairs.gw_active.len() as u32, up_counters: self.counters, next_job_id: self.downstairs.peek_next_id(), - up_backpressure: self.guest.get_backpressure().as_micros(), write_bytes_out: self.downstairs.write_bytes_outstanding(), ds_count: self.downstairs.active_count() as u32, ds_state: self.downstairs.collect_stats(|c| c.state()), @@ -903,11 +934,21 @@ impl Upstairs { match op { // All Write operations are deferred, because they will offload // encryption to a separate thread pool. 
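            // Editorial note (not part of the diff): each of these guest ops
            // now arrives carrying the `IOLimitGuard` claimed in guest.rs.
            // The guard travels with the deferred write (see `DeferredWrite`
            // below) and presumably ends up as the per-client guards on
            // `DownstairsIO::io_limits` via the `From<IOLimitGuard>` impl in
            // io_limits.rs, so the job/byte permits stay claimed until the
            // job is dropped -- replacing the old `early_write_backpressure`
            // guard.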
- BlockOp::Write { offset, data, done } => { - self.submit_deferred_write(offset, data, done, false); + BlockOp::Write { + offset, + data, + done, + io_guard, + } => { + self.submit_deferred_write(offset, data, done, false, io_guard); } - BlockOp::WriteUnwritten { offset, data, done } => { - self.submit_deferred_write(offset, data, done, true); + BlockOp::WriteUnwritten { + offset, + data, + done, + io_guard, + } => { + self.submit_deferred_write(offset, data, done, true, io_guard); } // If we have any deferred requests in the FuturesOrdered, then we // have to keep using it for subsequent requests (even ones that are @@ -1074,15 +1115,19 @@ impl Upstairs { done.send_ok(self.show_all_work()); } - BlockOp::Read { offset, data, done } => { - self.submit_read(offset, data, done) - } + BlockOp::Read { + offset, + data, + done, + io_guard, + } => self.submit_read(offset, data, done, io_guard), BlockOp::Write { .. } | BlockOp::WriteUnwritten { .. } => { panic!("writes must always be deferred") } BlockOp::Flush { snapshot_details, done, + io_guard, } => { /* * Submit for read and write both check if the upstairs is @@ -1095,7 +1140,7 @@ impl Upstairs { done.send_err(CrucibleError::UpstairsInactive); return; } - self.submit_flush(Some(done), snapshot_details); + self.submit_flush(Some(done), snapshot_details, Some(io_guard)); } BlockOp::ReplaceDownstairs { id, old, new, done } => { let r = self.downstairs.replace(id, old, new, &self.state); @@ -1235,7 +1280,8 @@ impl Upstairs { } if !self.downstairs.can_deactivate_immediately() { debug!(self.log, "not ready to deactivate; submitting final flush"); - self.submit_flush(None, None); + let io_guard = self.try_acquire_io(0); + self.submit_flush(None, None, io_guard); } else { debug!(self.log, "ready to deactivate right away"); // Deactivation is handled in the invariant-checking portion of @@ -1249,6 +1295,7 @@ impl Upstairs { &mut self, res: Option, snapshot_details: Option, + io_guard: Option, ) { // Notice that unlike submit_read and submit_write, we do not check for // guest_io_ready here. The upstairs itself can call submit_flush @@ -1266,18 +1313,18 @@ impl Upstairs { if snapshot_details.is_some() { info!(self.log, "flush with snap requested"); } - let ds_id = self.downstairs.submit_flush(snapshot_details, res); + let ds_id = + self.downstairs + .submit_flush(snapshot_details, res, io_guard); cdt::up__to__ds__flush__start!(|| (ds_id.0)); } - #[allow(dead_code)] // XXX this will be used soon! fn submit_barrier(&mut self) { // Notice that unlike submit_read and submit_write, we do not check for // guest_io_ready here. The upstairs itself calls submit_barrier // without the guest being involved; indeed the guest is not allowed to // call it! - let ds_id = self.downstairs.submit_barrier(); cdt::up__to__ds__barrier__start!(|| (ds_id.0)); @@ -1291,8 +1338,7 @@ impl Upstairs { offset: BlockIndex, data: Buffer, ) { - let br = BlockRes::dummy(); - self.submit_read(offset, data, br) + self.submit_read(offset, data, BlockRes::dummy(), IOLimitGuard::dummy()) } /// Submit a read job to the downstairs @@ -1301,6 +1347,7 @@ impl Upstairs { offset: BlockIndex, data: Buffer, res: BlockRes, + io_guard: IOLimitGuard, ) { if !self.guest_io_ready() { res.send_err((data, CrucibleError::UpstairsInactive)); @@ -1339,7 +1386,9 @@ impl Upstairs { * Grab this ID after extent_from_offset: in case of Err we don't * want to create a gap in the IDs. 
*/ - let ds_id = self.downstairs.submit_read(impacted_blocks, data, res); + let ds_id = + self.downstairs + .submit_read(impacted_blocks, data, res, io_guard); cdt::up__to__ds__read__start!(|| (ds_id.0)); } @@ -1359,6 +1408,7 @@ impl Upstairs { data, BlockRes::dummy(), is_write_unwritten, + IOLimitGuard::dummy(), ) { self.submit_write(DeferredWrite::run(w)) } @@ -1375,14 +1425,19 @@ impl Upstairs { data: BytesMut, res: BlockRes, is_write_unwritten: bool, + io_guard: IOLimitGuard, ) { // It's possible for the write to be invalid out of the gate, in which // case `compute_deferred_write` replies to the `res` itself and returns // `None`. Otherwise, we have to store a future to process the write // result. - if let Some(w) = - self.compute_deferred_write(offset, data, res, is_write_unwritten) - { + if let Some(w) = self.compute_deferred_write( + offset, + data, + res, + is_write_unwritten, + io_guard, + ) { let should_defer = !self.deferred_ops.is_empty() || w.data.len() > MIN_DEFER_SIZE_BYTES as usize; if should_defer { @@ -1404,6 +1459,7 @@ impl Upstairs { data: BytesMut, res: BlockRes, is_write_unwritten: bool, + io_guard: IOLimitGuard, ) -> Option { if !self.guest_io_ready() { res.send_err(CrucibleError::UpstairsInactive); @@ -1431,8 +1487,6 @@ impl Upstairs { let impacted_blocks = extent_from_offset(&ddef, offset, ddef.bytes_to_blocks(data.len())); - let guard = self.downstairs.early_write_backpressure(data.len() as u64); - // Fast-ack, pretending to be done immediately operations res.send_ok(()); @@ -1442,7 +1496,7 @@ impl Upstairs { data, is_write_unwritten, cfg: self.cfg.clone(), - guard, + io_guard, }) } @@ -1462,7 +1516,7 @@ impl Upstairs { write.impacted_blocks, write.data, write.is_write_unwritten, - write.guard, + write.io_guard, ); if write.is_write_unwritten { @@ -1592,7 +1646,7 @@ impl Upstairs { // IO operation replies // - // This may cause jobs to become ackable! + // This may cause jobs to be acked! Message::WriteAck { .. } | Message::WriteUnwrittenAck { .. } | Message::FlushAck { .. 
} @@ -2010,16 +2064,6 @@ impl Upstairs { self.downstairs.reinitialize(client_id, &self.state); } - /// Sets both guest and per-client backpressure - fn set_backpressure(&self) { - self.guest.set_backpressure( - self.downstairs.write_bytes_outstanding(), - self.downstairs.jobs_outstanding(), - ); - - self.downstairs.set_client_backpressure(); - } - /// Returns the `RegionDefinition` /// /// # Panics @@ -2341,7 +2385,7 @@ pub(crate) mod test { ); // op 1 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // op 2 upstairs.submit_dummy_write( @@ -2386,7 +2430,7 @@ pub(crate) mod test { } // op 3 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // ops 4 to 6 for i in 3..6 { @@ -2717,7 +2761,7 @@ pub(crate) mod test { ); // op 1 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // op 2 upstairs.submit_dummy_read(BlockIndex(0), Buffer::new(2, 512)); @@ -2744,11 +2788,11 @@ pub(crate) mod test { let mut upstairs = make_upstairs(); upstairs.force_active().unwrap(); - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); let jobs = upstairs.downstairs.get_all_jobs(); assert_eq!(jobs.len(), 3); @@ -2778,7 +2822,7 @@ pub(crate) mod test { upstairs.force_active().unwrap(); // op 0 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // ops 1 to 2 for i in 0..2 { @@ -2790,7 +2834,7 @@ pub(crate) mod test { } // op 3 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // ops 4 to 6 for i in 0..3 { @@ -2802,7 +2846,7 @@ pub(crate) mod test { } // op 7 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); let jobs = upstairs.downstairs.get_all_jobs(); assert_eq!(jobs.len(), 8); @@ -3256,7 +3300,7 @@ pub(crate) mod test { upstairs.submit_dummy_read(BlockIndex(95), Buffer::new(2, 512)); // op 1 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // op 2 upstairs.submit_dummy_write( @@ -3475,10 +3519,21 @@ pub(crate) mod test { let offset = BlockIndex(7); let data = BytesMut::from([1; 512].as_slice()); let (_write_res, done) = BlockOpWaiter::pair(); + let io_guard = IOLimitGuard::dummy(); let op = if is_write_unwritten { - BlockOp::WriteUnwritten { offset, data, done } + BlockOp::WriteUnwritten { + offset, + data, + done, + io_guard, + } } else { - BlockOp::Write { offset, data, done } + BlockOp::Write { + offset, + data, + done, + io_guard, + } }; up.apply(UpstairsAction::Guest(op)); up.await_deferred_ops().await; @@ -3591,7 +3646,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will successfully decrypt let mut data = Vec::from([1u8; 512]); @@ -3639,7 +3700,13 @@ pub(crate) mod test { let data = Buffer::new(blocks, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let mut data = 
Vec::from([1u8; 512]); @@ -3697,7 +3764,13 @@ pub(crate) mod test { let data = Buffer::new(blocks, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will fail decryption let mut data = Vec::from([1u8; 512]); @@ -3773,7 +3846,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will fail decryption let mut data = Vec::from([1u8; 512]); @@ -3836,7 +3915,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will fail integrity hash // check @@ -3881,7 +3966,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let data = BytesMut::from([1u8; 512].as_slice()); let hash = integrity_hash(&[&data]); @@ -3942,7 +4033,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); for client_id in [ClientId::new(0), ClientId::new(1)] { let data = BytesMut::from([1u8; 512].as_slice()); @@ -4005,7 +4102,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let data = BytesMut::from([1u8; 512].as_slice()); let hash = integrity_hash(&[&data]); @@ -4065,7 +4168,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // The first read has no block contexts, because it was unwritten let data = BytesMut::from([0u8; 512].as_slice()); @@ -4121,7 +4230,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // The first read 
has no block contexts, because it was unwritten let data = BytesMut::from([0u8; 512].as_slice()); @@ -4181,7 +4296,13 @@ pub(crate) mod test { data.extend_from_slice(vec![1; NODEFER_SIZE].as_slice()); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Write { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Write { + offset, + data, + done, + io_guard, + })); assert_eq!(up.deferred_ops.len(), 0); // Submit a long write, which should be deferred @@ -4189,7 +4310,13 @@ pub(crate) mod test { data.extend_from_slice(vec![2; DEFER_SIZE].as_slice()); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Write { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Write { + offset, + data, + done, + io_guard, + })); assert_eq!(up.deferred_ops.len(), 1); assert_eq!(up.deferred_msgs.len(), 0); @@ -4199,7 +4326,13 @@ pub(crate) mod test { data.extend_from_slice(vec![3; NODEFER_SIZE].as_slice()); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Write { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Write { + offset, + data, + done, + io_guard, + })); assert_eq!(up.deferred_ops.len(), 2); assert_eq!(up.deferred_msgs.len(), 0); } @@ -4221,7 +4354,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let reply = res.wait_raw().await.unwrap(); match reply {