diff --git a/cmon/src/main.rs b/cmon/src/main.rs index 4f3fe1952..1f3595808 100644 --- a/cmon/src/main.rs +++ b/cmon/src/main.rs @@ -41,7 +41,6 @@ enum DtraceDisplay { Replaced, ExtentLiveRepair, ExtentLimit, - Backpressure, NextJobId, JobDelta, DsDelay, @@ -61,7 +60,6 @@ impl fmt::Display for DtraceDisplay { DtraceDisplay::Replaced => write!(f, "replaced"), DtraceDisplay::ExtentLiveRepair => write!(f, "extent_live_repair"), DtraceDisplay::ExtentLimit => write!(f, "extent_under_repair"), - DtraceDisplay::Backpressure => write!(f, "backpressure"), DtraceDisplay::NextJobId => write!(f, "next_job_id"), DtraceDisplay::JobDelta => write!(f, "job_delta"), DtraceDisplay::DsDelay => write!(f, "ds_delay"), @@ -229,9 +227,6 @@ fn print_dtrace_header(dd: &[DtraceDisplay]) { DtraceDisplay::ExtentLimit => { print!(" {:>4}", "EXTL"); } - DtraceDisplay::Backpressure => { - print!(" {:>5}", "BAKPR"); - } DtraceDisplay::NextJobId => { print!(" {:>7}", "NEXTJOB"); } @@ -348,9 +343,6 @@ fn print_dtrace_row(d_out: Arg, dd: &[DtraceDisplay], last_job_id: &mut u64) { DtraceDisplay::ExtentLimit => { print!(" {:4}", d_out.ds_extent_limit); } - DtraceDisplay::Backpressure => { - print!(" {:>5}", d_out.up_backpressure); - } DtraceDisplay::NextJobId => { print!(" {:>7}", d_out.next_job_id); } diff --git a/tools/dtrace/get-ds-state.d b/tools/dtrace/get-ds-state.d new file mode 100644 index 000000000..92aaf1dd5 --- /dev/null +++ b/tools/dtrace/get-ds-state.d @@ -0,0 +1,23 @@ +/* + * Print a status line for all matching probes. + * Exit after 5 seconds. + */ +#pragma D option quiet +#pragma D option strsize=1k + +crucible_upstairs*:::up-status +{ + my_sesh = json(copyinstr(arg1), "ok.session_id"); + + printf("%6d %8s %17s %17s %17s\n", + pid, + substr(my_sesh, 0, 8), + json(copyinstr(arg1), "ok.ds_state[0]"), + json(copyinstr(arg1), "ok.ds_state[1]"), + json(copyinstr(arg1), "ok.ds_state[2]")); +} + +tick-5s +{ + exit(0); +} diff --git a/tools/dtrace/get-ds-state.sh b/tools/dtrace/get-ds-state.sh index 7662c5fc1..5f5a13490 100755 --- a/tools/dtrace/get-ds-state.sh +++ b/tools/dtrace/get-ds-state.sh @@ -1,9 +1,16 @@ #!/bin/bash # -# This script will display the downstairs states for any propolis zones it -# finds running on a system. -for zzz in $(zoneadm list | grep propolis); do - echo -n "$zzz " - ppid=$(zlogin "$zzz" pgrep propolis-server) - dtrace -xstrsize=1k -p $ppid -q -n 'crucible_upstairs*:::up-status { printf("%6d %17s %17s %17s", pid, json(copyinstr(arg1), "ok.ds_state[0]"), json(copyinstr(arg1), "ok.ds_state[1]"), json(copyinstr(arg1), "ok.ds_state[2]")); exit(0); }' -done +# This script will display the downstairs states for each pid/session +# it finds running on a system. +filename='/tmp/get-ds-state.out' + +# Gather state on all running propolis servers, record summary to a file +dtrace -s /opt/oxide/crucible_dtrace/get-ds-state.d | sort -n | uniq | awk 'NF' > "$filename" +# Walk the lines in the file, append the zone name to each line. +while read -r p; do + # For each line in the file, pull out the PID we are looking at and + # print the zone that process is running in. + pid=$(echo $p | awk '{print $1}') + zone=$(ps -o zone -p $pid | tail -1 | cut -c 1-28) + echo "$zone $p" +done < "$filename" diff --git a/tools/dtrace/get-lr-state.d b/tools/dtrace/get-lr-state.d new file mode 100755 index 000000000..983628021 --- /dev/null +++ b/tools/dtrace/get-lr-state.d @@ -0,0 +1,26 @@ +/* + * Print a live repair status line for all matching probes. + * Exit after 5 seconds. 
+ */ +#pragma D option quiet +#pragma D option strsize=1k + +crucible_upstairs*:::up-status +{ + my_sesh = json(copyinstr(arg1), "ok.session_id"); + + printf("%6d %8s %s %s %s %s %s %s\n", + pid, + substr(my_sesh, 0, 8), + json(copyinstr(arg1), "ok.ds_live_repair_completed[0]"), + json(copyinstr(arg1), "ok.ds_live_repair_completed[1]"), + json(copyinstr(arg1), "ok.ds_live_repair_completed[2]"), + json(copyinstr(arg1), "ok.ds_live_repair_aborted[0]"), + json(copyinstr(arg1), "ok.ds_live_repair_aborted[1]"), + json(copyinstr(arg1), "ok.ds_live_repair_aborted[2]")); +} + +tick-5s +{ + exit(0); +} diff --git a/tools/dtrace/get-lr-state.sh b/tools/dtrace/get-lr-state.sh index 690e20438..5af032379 100755 --- a/tools/dtrace/get-lr-state.sh +++ b/tools/dtrace/get-lr-state.sh @@ -1,9 +1,16 @@ #!/bin/bash -# -# This script will log into every propolis zone it finds and get the -# DTrace live repair counters from propolis-server in each zone. -for zzz in $(zoneadm list | grep propolis); do - echo -n "$zzz " - ppid=$(zlogin "$zzz" pgrep propolis-server) - dtrace -xstrsize=1k -p $ppid -q -n 'crucible_upstairs*:::up-status { printf("%6d %s %s %s %s %s %s", pid, json(copyinstr(arg1), "ok.ds_live_repair_completed[0]"), json(copyinstr(arg1), "ok.ds_live_repair_completed[1]"), json(copyinstr(arg1), "ok.ds_live_repair_completed[2]"), json(copyinstr(arg1), "ok.ds_live_repair_aborted[0]"), json(copyinstr(arg1), "ok.ds_live_repair_aborted[1]"), json(copyinstr(arg1), "ok.ds_live_repair_aborted[2]")); exit(0); }' -done + +# This script will display the downstairs live repair for each +# pid/session it finds running on a system. +filename='/tmp/get-lr-state.out' + +# Gather state on all running propolis servers, record summary to a file +dtrace -s /opt/oxide/crucible_dtrace/get-lr-state.d | sort -n | uniq | awk 'NF' > "$filename" +# Walk the lines in the file, append the zone name to each line. +while read -r p; do + # For each line in the file, pull out the PID we are looking at and + # print the zone that process is running in. + pid=$(echo $p | awk '{print $1}') + zone=$(ps -o zone -p $pid | tail -1 | cut -c 1-28) + echo "$zone $p" +done < "$filename" diff --git a/tools/dtrace/single_up_info.d b/tools/dtrace/single_up_info.d index 0eb1f4e1b..476edb8eb 100755 --- a/tools/dtrace/single_up_info.d +++ b/tools/dtrace/single_up_info.d @@ -41,7 +41,7 @@ crucible_upstairs*:::up-status * I'm not very happy about this, but if we don't print it all on one * line, then multiple sessions will clobber each others output. */ - printf("%8s %17s %17s %17s %5s %5s %9s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", + printf("%8s %17s %17s %17s %5s %5s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", substr(session_id, 0, 8), @@ -62,7 +62,6 @@ crucible_upstairs*:::up-status * Job ID delta and backpressure */ json(copyinstr(arg1), "ok.next_job_id"), - json(copyinstr(arg1), "ok.up_backpressure"), json(copyinstr(arg1), "ok.write_bytes_out"), /* diff --git a/tools/dtrace/sled_upstairs_info.d b/tools/dtrace/sled_upstairs_info.d index 277b8757e..a9366992d 100755 --- a/tools/dtrace/sled_upstairs_info.d +++ b/tools/dtrace/sled_upstairs_info.d @@ -46,8 +46,7 @@ crucible_upstairs*:::up-status * we don't print it all on one line, then multiple sessions will * clobber each others output. 
*/ - printf("%5d %8s %17s %17s %17s %5s %5s %9s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", - + printf("%5d %8s %17s %17s %17s %5s %5s %5s %10s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s %5s\n", pid, substr(session_id, 0, 8), @@ -62,7 +61,6 @@ crucible_upstairs*:::up-status /* Job ID and backpressure */ json(copyinstr(arg1), "ok.next_job_id"), - json(copyinstr(arg1), "ok.up_backpressure"), json(copyinstr(arg1), "ok.write_bytes_out"), /* In progress jobs on the work list for each downstairs */ diff --git a/tools/dtrace/upstairs_info.d b/tools/dtrace/upstairs_info.d index 64294fc2c..5cb4fc178 100755 --- a/tools/dtrace/upstairs_info.d +++ b/tools/dtrace/upstairs_info.d @@ -21,7 +21,7 @@ tick-1s { printf("%6s ", "PID"); printf("%17s %17s %17s", "DS STATE 0", "DS STATE 1", "DS STATE 2"); - printf(" %5s %5s %9s %5s", "UPW", "DSW", "JOBID", "BAKPR"); + printf(" %5s %5s %9s", "UPW", "DSW", "JOBID"); printf(" %10s", "WRITE_BO"); printf(" %5s %5s %5s", "IP0", "IP1", "IP2"); printf(" %5s %5s %5s", "D0", "D1", "D2"); @@ -49,10 +49,9 @@ crucible_upstairs*:::up-status printf(" %5s", json(copyinstr(arg1), "ok.ds_count")); /* - * Job ID and backpressure + * Job ID and outstanding bytes */ printf(" %9s", json(copyinstr(arg1), "ok.next_job_id")); - printf(" %5s", json(copyinstr(arg1), "ok.up_backpressure")); printf(" %10s", json(copyinstr(arg1), "ok.write_bytes_out")); /* diff --git a/tools/make-dtrace.sh b/tools/make-dtrace.sh index 8b1324234..d1c4aba64 100755 --- a/tools/make-dtrace.sh +++ b/tools/make-dtrace.sh @@ -23,7 +23,9 @@ tar cvf ../../out/crucible-dtrace.tar \ README.md \ all_downstairs.d \ downstairs_count.d \ + get-ds-state.d \ get-ds-state.sh \ + get-lr-state.d \ get-lr-state.sh \ perf-downstairs-os.d \ perf-downstairs-three.d \ diff --git a/upstairs/src/backpressure.rs b/upstairs/src/backpressure.rs deleted file mode 100644 index feaf4a33f..000000000 --- a/upstairs/src/backpressure.rs +++ /dev/null @@ -1,375 +0,0 @@ -// Copyright 2024 Oxide Computer Company - -use crate::{IOop, IO_OUTSTANDING_MAX_BYTES, IO_OUTSTANDING_MAX_JOBS}; -use std::{ - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, - time::Duration, -}; - -/// Helper struct to contain a set of backpressure counters -#[derive(Debug)] -pub struct BackpressureCounters(Arc); - -/// Inner data structure for individual backpressure counters -#[derive(Debug)] -struct BackpressureCountersInner { - /// Number of bytes from `Write` and `WriteUnwritten` operations - /// - /// This value is used for global backpressure, to avoid buffering too many - /// writes (which otherwise return immediately, and are not persistent until - /// a flush) - write_bytes: AtomicU64, - - /// Number of jobs in the queue - /// - /// This value is also used for global backpressure - // XXX should we only count write jobs here? Or should we also count read - // bytes for global backpressure? Much to ponder... - jobs: AtomicU64, - - /// Number of bytes from `Write`, `WriteUnwritten`, and `Read` operations - /// - /// This value is used for local backpressure, to keep the 3x Downstairs - /// roughly in sync. Otherwise, the fastest Downstairs will answer all read - /// requests, and the others can get arbitrarily far behind. 
- io_bytes: AtomicU64, -} - -/// Guard to automatically decrement backpressure bytes when dropped -#[derive(Debug)] -pub struct BackpressureGuard { - counter: Arc, - write_bytes: u64, - io_bytes: u64, - // There's also an implicit "1 job" here -} - -impl Drop for BackpressureGuard { - fn drop(&mut self) { - self.counter - .write_bytes - .fetch_sub(self.write_bytes, Ordering::Relaxed); - self.counter - .io_bytes - .fetch_sub(self.io_bytes, Ordering::Relaxed); - self.counter.jobs.fetch_sub(1, Ordering::Relaxed); - } -} - -impl BackpressureGuard { - #[cfg(test)] - pub fn dummy() -> Self { - let counter = Arc::new(BackpressureCountersInner { - write_bytes: 0.into(), - io_bytes: 0.into(), - jobs: 1.into(), - }); - Self { - counter, - write_bytes: 0, - io_bytes: 0, - } - } -} - -impl BackpressureCounters { - pub fn new() -> Self { - Self(Arc::new(BackpressureCountersInner { - write_bytes: AtomicU64::new(0), - io_bytes: AtomicU64::new(0), - jobs: AtomicU64::new(0), - })) - } - - pub fn get_write_bytes(&self) -> u64 { - self.0.write_bytes.load(Ordering::Relaxed) - } - - pub fn get_io_bytes(&self) -> u64 { - self.0.io_bytes.load(Ordering::Relaxed) - } - - pub fn get_jobs(&self) -> u64 { - self.0.jobs.load(Ordering::Relaxed) - } - - /// Stores write / IO bytes (and 1 job) for a pending write - #[must_use] - pub fn early_write_increment(&mut self, bytes: u64) -> BackpressureGuard { - self.0.write_bytes.fetch_add(bytes, Ordering::Relaxed); - self.0.io_bytes.fetch_add(bytes, Ordering::Relaxed); - self.0.jobs.fetch_add(1, Ordering::Relaxed); - BackpressureGuard { - counter: self.0.clone(), - write_bytes: bytes, - io_bytes: bytes, - // implicit 1 job - } - } - - /// Stores write / IO bytes (and 1 job) in the backpressure counters - #[must_use] - pub fn increment(&mut self, io: &IOop) -> BackpressureGuard { - let write_bytes = io.write_bytes(); - let io_bytes = io.job_bytes(); - self.0.write_bytes.fetch_add(write_bytes, Ordering::Relaxed); - self.0.io_bytes.fetch_add(io_bytes, Ordering::Relaxed); - self.0.jobs.fetch_add(1, Ordering::Relaxed); - BackpressureGuard { - counter: self.0.clone(), - write_bytes, - io_bytes, - // implicit 1 job - } - } -} - -/// Configuration for host-side backpressure -/// -/// Backpressure adds an artificial delay to host write messages (which are -/// otherwise acked immediately, before actually being complete). The delay is -/// varied based on two metrics: -/// -/// - number of write bytes outstanding -/// - queue length (in jobs) -/// -/// We compute backpressure delay based on both metrics, then pick the larger of -/// the two delays. -#[derive(Copy, Clone, Debug)] -pub struct BackpressureConfig { - pub bytes: BackpressureChannelConfig, - pub queue: BackpressureChannelConfig, -} - -impl Default for BackpressureConfig { - fn default() -> BackpressureConfig { - // `max_value` values below must be higher than `IO_OUTSTANDING_MAX_*`; - // otherwise, replaying jobs to a previously-Offline Downstairs could - // immediately kick us into the saturated regime, which would be - // unfortunate. 
- BackpressureConfig { - // Byte-based backpressure - bytes: BackpressureChannelConfig { - start_value: 50 * 1024u64.pow(2), // 50 MiB - max_value: IO_OUTSTANDING_MAX_BYTES * 2, - delay_scale: Duration::from_millis(100), - delay_max: Duration::from_millis(30_000), - }, - - // Queue-based backpressure - queue: BackpressureChannelConfig { - start_value: 500, - max_value: IO_OUTSTANDING_MAX_JOBS as u64 * 2, - delay_scale: Duration::from_millis(5), - delay_max: Duration::from_millis(30_000), - }, - } - } -} - -#[derive(Copy, Clone, Debug)] -pub struct BackpressureChannelConfig { - /// When should backpressure start - pub start_value: u64, - /// Value at which backpressure is saturated - pub max_value: u64, - - /// Characteristic scale of backpressure - /// - /// This scale sets the backpressure delay halfway between `start`_value and - /// `max_value` - pub delay_scale: Duration, - - /// Maximum delay (returned at `max_value`) - pub delay_max: Duration, -} - -#[derive(Copy, Clone, Debug, Eq, PartialEq)] -pub enum BackpressureAmount { - Duration(Duration), - Saturated, -} - -impl std::cmp::Ord for BackpressureAmount { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - match (self, other) { - (BackpressureAmount::Saturated, BackpressureAmount::Saturated) => { - std::cmp::Ordering::Equal - } - (BackpressureAmount::Saturated, _) => std::cmp::Ordering::Greater, - (_, BackpressureAmount::Saturated) => std::cmp::Ordering::Less, - ( - BackpressureAmount::Duration(a), - BackpressureAmount::Duration(b), - ) => a.cmp(b), - } - } -} - -impl std::cmp::PartialOrd for BackpressureAmount { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl BackpressureAmount { - /// Converts to a delay in microseconds - /// - /// The saturated case is marked by `u64::MAX` - pub fn as_micros(&self) -> u64 { - match self { - BackpressureAmount::Duration(t) => t.as_micros() as u64, - BackpressureAmount::Saturated => u64::MAX, - } - } -} - -/// Helper `struct` to store a shared backpressure amount -/// -/// Under the hood, this stores a value in microseconds in an `AtomicU64`, so it -/// can be read and written atomically. `BackpressureAmount::Saturated` is -/// indicated by `u64::MAX`. -#[derive(Clone, Debug)] -pub struct SharedBackpressureAmount(Arc); - -impl SharedBackpressureAmount { - pub fn new() -> Self { - Self(Arc::new(AtomicU64::new(0))) - } - - pub fn store(&self, b: BackpressureAmount) { - let v = match b { - BackpressureAmount::Duration(d) => d.as_micros() as u64, - BackpressureAmount::Saturated => u64::MAX, - }; - self.0.store(v, Ordering::Relaxed); - } - - pub fn load(&self) -> BackpressureAmount { - match self.0.load(Ordering::Relaxed) { - u64::MAX => BackpressureAmount::Saturated, - delay_us => { - BackpressureAmount::Duration(Duration::from_micros(delay_us)) - } - } - } -} - -impl BackpressureChannelConfig { - fn get_backpressure(&self, value: u64) -> BackpressureAmount { - // Return a special value if we're saturated - if value >= self.max_value { - return BackpressureAmount::Saturated; - } - - // This ratio starts at 0 (at start_value) and hits 1 when backpressure - // should be maxed out. 
- let frac = value.saturating_sub(self.start_value) as f64 - / (self.max_value - self.start_value) as f64; - - let v = if frac < 0.5 { - frac * 2.0 - } else { - 1.0 / (2.0 * (1.0 - frac)) - }; - - BackpressureAmount::Duration( - self.delay_scale.mul_f64(v.powi(2)).min(self.delay_max), - ) - } -} - -impl BackpressureConfig { - pub fn get_backpressure( - &self, - bytes: u64, - jobs: u64, - ) -> BackpressureAmount { - let bp_bytes = self.bytes.get_backpressure(bytes); - let bp_queue = self.queue.get_backpressure(jobs); - bp_bytes.max(bp_queue) - } -} - -#[cfg(test)] -mod test { - use super::*; - - /// Confirm that replaying the max number of jobs / bytes will not saturate - #[test] - fn check_outstanding_backpressure() { - let cfg = BackpressureConfig::default(); - let BackpressureAmount::Duration(_) = - cfg.get_backpressure(IO_OUTSTANDING_MAX_BYTES, 0) - else { - panic!("backpressure saturated") - }; - - let BackpressureAmount::Duration(_) = - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64) - else { - panic!("backpressure saturated") - }; - } - - #[test] - fn check_max_backpressure() { - let cfg = BackpressureConfig::default(); - let BackpressureAmount::Duration(timeout) = cfg - .get_backpressure(IO_OUTSTANDING_MAX_BYTES * 2 - 1024u64.pow(2), 0) - else { - panic!("backpressure saturated") - }; - println!( - "max byte-based delay: {}", - humantime::format_duration(timeout) - ); - assert!( - timeout >= Duration::from_secs(30), - "max byte-based backpressure delay is too low; - expected >= 30 secs, got {}", - humantime::format_duration(timeout) - ); - - let BackpressureAmount::Duration(timeout) = - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64 * 2 - 1) - else { - panic!("backpressure saturated") - }; - println!( - "max job-based delay: {}", - humantime::format_duration(timeout) - ); - assert!( - timeout >= Duration::from_secs(30), - "max job-based backpressure delay is too low; - expected >= 30 secs, got {}", - humantime::format_duration(timeout) - ); - } - - #[test] - fn check_saturated_backpressure() { - let cfg = BackpressureConfig::default(); - assert!(matches!( - cfg.get_backpressure(IO_OUTSTANDING_MAX_BYTES * 2 + 1, 0), - BackpressureAmount::Saturated - )); - assert!(matches!( - cfg.get_backpressure(IO_OUTSTANDING_MAX_BYTES * 2, 0), - BackpressureAmount::Saturated - )); - - assert!(matches!( - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64 * 2 + 1), - BackpressureAmount::Saturated - )); - assert!(matches!( - cfg.get_backpressure(0, IO_OUTSTANDING_MAX_JOBS as u64 * 2), - BackpressureAmount::Saturated - )); - } -} diff --git a/upstairs/src/client.rs b/upstairs/src/client.rs index 15b36684c..59406787f 100644 --- a/upstairs/src/client.rs +++ b/upstairs/src/client.rs @@ -1,9 +1,9 @@ // Copyright 2023 Oxide Computer Company use crate::{ - backpressure::BackpressureCounters, cdt, integrity_hash, - live_repair::ExtentInfo, upstairs::UpstairsConfig, upstairs::UpstairsState, - ClientIOStateCount, ClientId, CrucibleDecoder, CrucibleError, DownstairsIO, - DsState, EncryptionContext, IOState, IOop, JobId, Message, RawReadResponse, + cdt, integrity_hash, io_limits::ClientIOLimits, live_repair::ExtentInfo, + upstairs::UpstairsConfig, upstairs::UpstairsState, ClientIOStateCount, + ClientId, CrucibleDecoder, CrucibleError, DownstairsIO, DsState, + EncryptionContext, IOState, IOop, JobId, Message, RawReadResponse, ReconcileIO, ReconcileIOState, RegionDefinitionStatus, RegionMetadata, }; use crucible_common::{x509::TLSContext, ExtentId, VerboseTimeout}; @@ -122,11 +122,8 @@ 
pub(crate) struct DownstairsClient { /// Number of bytes associated with each IO state io_state_byte_count: ClientIOStateCount, - /// Jobs, write bytes, and total IO bytes in this client's queue - /// - /// These values are used for both global and local (per-client) - /// backpressure. - pub(crate) backpressure_counters: BackpressureCounters, + /// Absolute IO limits for this client + io_limits: ClientIOLimits, /// UUID for this downstairs region /// @@ -200,6 +197,7 @@ impl DownstairsClient { client_id: ClientId, cfg: Arc, target_addr: Option, + io_limits: ClientIOLimits, log: Logger, tls_context: Option>, ) -> Self { @@ -216,6 +214,7 @@ impl DownstairsClient { &log, ), client_id, + io_limits, region_uuid: None, needs_replay: false, negotiation_state: NegotiationState::Start, @@ -232,7 +231,6 @@ impl DownstairsClient { repair_info: None, io_state_job_count: ClientIOStateCount::default(), io_state_byte_count: ClientIOStateCount::default(), - backpressure_counters: BackpressureCounters::new(), connection_id: ConnectionId(0), client_delay_us, } @@ -256,6 +254,10 @@ impl DownstairsClient { cfg, client_task: Self::new_dummy_task(false), client_id: ClientId::new(0), + io_limits: ClientIOLimits::new( + crate::IO_OUTSTANDING_MAX_JOBS * 3 / 2, + crate::IO_OUTSTANDING_MAX_BYTES as usize * 3 / 2, + ), region_uuid: None, needs_replay: false, negotiation_state: NegotiationState::Start, @@ -272,7 +274,6 @@ impl DownstairsClient { repair_info: None, io_state_job_count: ClientIOStateCount::default(), io_state_byte_count: ClientIOStateCount::default(), - backpressure_counters: BackpressureCounters::new(), connection_id: ConnectionId(0), client_delay_us, } @@ -359,19 +360,24 @@ impl DownstairsClient { // Update our bytes-in-flight counter if was_running && !is_running { // Because the job is no longer running, it shouldn't count for - // backpressure. Remove the backpressure guard for this client, - // which decrements backpressure counters on drop. - job.backpressure_guard.take(&self.client_id); - } else if is_running - && !was_running - && !job.backpressure_guard.contains(&self.client_id) - { - // This should only happen if a job is replayed, but that still - // counts! - job.backpressure_guard.insert( - self.client_id, - self.backpressure_counters.increment(&job.work), - ); + // backpressure or IO limits. Remove the backpressure guard for + // this client, which decrements backpressure counters on drop. + job.io_limits.take(&self.client_id); + } else if is_running && !was_running { + match self.io_limits.try_claim(job.work.job_bytes() as u32) { + Ok(g) => { + job.io_limits.insert(self.client_id, g); + } + Err(e) => { + // We can't handle the case of "running out of permits + // during replay", because waiting for a permit would + // deadlock the worker task. Log the error and continue. + warn!( + self.log, + "could not claim IO permits when replaying job: {e:?}" + ) + } + } } old_state @@ -547,7 +553,11 @@ impl DownstairsClient { /// /// # Panics /// If `self.client_task` is not `None`, or `self.target_addr` is `None` - pub(crate) fn reinitialize(&mut self, up_state: &UpstairsState) { + pub(crate) fn reinitialize( + &mut self, + up_state: &UpstairsState, + can_replay: bool, + ) { // Clear this Downstair's repair address, and let the YesItsMe set it. 
// This works if this Downstairs is new, reconnecting, or was replaced // entirely; the repair address could have changed in any of these @@ -570,6 +580,9 @@ impl DownstairsClient { let current = &self.state; let new_state = match current { + DsState::Active | DsState::Offline if !can_replay => { + Some(DsState::Faulted) + } DsState::Active => Some(DsState::Offline), DsState::LiveRepair | DsState::LiveRepairReady => { Some(DsState::Faulted) @@ -840,8 +853,12 @@ impl DownstairsClient { reason: ClientStopReason, ) { let new_state = match self.state { - DsState::Active => DsState::Offline, - DsState::Offline => DsState::Offline, + DsState::Active | DsState::Offline + if matches!(reason, ClientStopReason::IneligibleForReplay) => + { + DsState::Faulted + } + DsState::Active | DsState::Offline => DsState::Offline, DsState::Faulted => DsState::Faulted, DsState::Deactivated => DsState::New, DsState::Reconcile => DsState::New, @@ -2234,7 +2251,7 @@ impl DownstairsClient { } pub(crate) fn total_bytes_outstanding(&self) -> usize { - self.backpressure_counters.get_io_bytes() as usize + self.io_state_byte_count.in_progress as usize } /// Returns a unique ID for the current connection, or `None` @@ -2408,6 +2425,9 @@ pub(crate) enum ClientStopReason { /// The upstairs has requested that we deactivate when we were offline OfflineDeactivated, + + /// The Upstairs has dropped jobs that would be needed for replay + IneligibleForReplay, } /// Response received from the I/O task diff --git a/upstairs/src/deferred.rs b/upstairs/src/deferred.rs index cbe43932e..301575a64 100644 --- a/upstairs/src/deferred.rs +++ b/upstairs/src/deferred.rs @@ -3,9 +3,8 @@ use std::sync::Arc; use crate::{ - backpressure::BackpressureGuard, client::ConnectionId, - upstairs::UpstairsConfig, BlockContext, BlockOp, ClientData, ClientId, - ImpactedBlocks, Message, RawWrite, + client::ConnectionId, io_limits::IOLimitGuard, upstairs::UpstairsConfig, + BlockContext, BlockOp, ClientId, ImpactedBlocks, Message, RawWrite, }; use bytes::BytesMut; use crucible_common::{integrity_hash, CrucibleError, RegionDefinition}; @@ -114,7 +113,7 @@ pub(crate) struct DeferredWrite { pub data: BytesMut, pub is_write_unwritten: bool, pub cfg: Arc, - pub guard: ClientData, + pub io_guard: IOLimitGuard, } /// Result of a deferred `BlockOp` @@ -135,7 +134,7 @@ pub(crate) struct EncryptedWrite { pub data: RawWrite, pub impacted_blocks: ImpactedBlocks, pub is_write_unwritten: bool, - pub guard: ClientData, + pub io_guard: IOLimitGuard, } impl DeferredWrite { @@ -183,7 +182,7 @@ impl DeferredWrite { data, impacted_blocks: self.impacted_blocks, is_write_unwritten: self.is_write_unwritten, - guard: self.guard, + io_guard: self.io_guard, } } } diff --git a/upstairs/src/downstairs.rs b/upstairs/src/downstairs.rs index 23a889169..35d80d6ed 100644 --- a/upstairs/src/downstairs.rs +++ b/upstairs/src/downstairs.rs @@ -1,16 +1,16 @@ // Copyright 2023 Oxide Computer Company use std::{ - collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque}, + collections::{BTreeMap, HashMap, HashSet, VecDeque}, net::SocketAddr, sync::Arc, time::Duration, }; use crate::{ - backpressure::BackpressureGuard, cdt, client::{ClientAction, ClientStopReason, DownstairsClient, EnqueueResult}, guest::GuestBlockRes, + io_limits::{IOLimitGuard, IOLimits}, live_repair::ExtentInfo, stats::DownstairsStatOuter, upstairs::{UpstairsConfig, UpstairsState}, @@ -88,6 +88,19 @@ pub(crate) struct Downstairs { /// (including automatic flushes). 
next_flush: u64, + /// Indicates whether we are eligible for replay + /// + /// We are only eligible for replay if all jobs since the last flush are + /// buffered (i.e. none have been retired by a `Barrier` operation). + can_replay: bool, + + /// How many `Flush` or `Barrier` operations are pending? + /// + /// We only want to send a `Barrier` if there isn't already one pending, so + /// we track it here (incrementing in `submit_flush` / `submit_barrier` and + /// decrementing in `retire_check`). + pending_barrier: usize, + /// Ringbuf of recently acked job IDs. acked_ids: AllocRingBuffer, @@ -126,11 +139,6 @@ pub(crate) struct Downstairs { /// Data for an in-progress live repair repair: Option, - /// Jobs that are ready to be acked - /// - /// This must be handled after every event - ackable_work: BTreeSet, - /// Region definition (copied from the upstairs) ddef: Option, @@ -142,49 +150,70 @@ pub(crate) struct Downstairs { reqwest_client: reqwest::Client, } +/// Tuple storing a job and an optional result +#[derive(Debug)] +pub(crate) struct PendingJob { + id: JobId, + result: Option>, +} + +impl PendingJob { + fn new(id: JobId) -> Self { + Self { id, result: None } + } +} + /// State machine for a live-repair operation /// /// We pass through all states (except `FinalFlush`) in order for each extent, /// then pass through the `FinalFlush` state before completion. In each state, -/// we're waiting for a particular job to finish, which is indicated by a -/// `BlockOpWaiter`. -/// -/// Early states carry around reserved IDs (both `JobId` and guest work IDs), as -/// well as a reserved `BlockOpWaiter` for the final flush. -#[derive(Copy, Clone, Debug)] +/// we're waiting for a particular job to finish. We store a `PendingJob` for +/// every pending job, because it may be possible for more than one to finish +/// before checking live-repair state. +#[derive(Debug)] pub(crate) enum LiveRepairState { Closing { - close_id: JobId, - repair_id: JobId, - noop_id: JobId, - reopen_id: JobId, + close_job: PendingJob, + repair_job: PendingJob, + noop_job: PendingJob, + reopen_job: PendingJob, }, Repairing { - repair_id: JobId, - noop_id: JobId, - reopen_id: JobId, + repair_job: PendingJob, + noop_job: PendingJob, + reopen_job: PendingJob, }, Noop { - noop_id: JobId, - reopen_id: JobId, + noop_job: PendingJob, + reopen_job: PendingJob, }, Reopening { - reopen_id: JobId, + reopen_job: PendingJob, }, FinalFlush { - flush_id: JobId, + flush_job: PendingJob, }, } impl LiveRepairState { - /// Returns the job ID that we're waiting on at the moment - fn active_job_id(&self) -> JobId { + fn dummy() -> Self { + LiveRepairState::Noop { + noop_job: PendingJob::new(JobId(0)), + reopen_job: PendingJob::new(JobId(0)), + } + } + + /// Returns the result for the job that we're waiting on at the moment + /// + /// This is a consuming function; the result is removed from the object. + #[must_use] + fn active_job_result(&mut self) -> Option> { match self { - LiveRepairState::Closing { close_id, .. } => *close_id, - LiveRepairState::Repairing { repair_id, .. } => *repair_id, - LiveRepairState::Noop { noop_id, .. } => *noop_id, - LiveRepairState::Reopening { reopen_id, .. } => *reopen_id, - LiveRepairState::FinalFlush { flush_id } => *flush_id, + LiveRepairState::Closing { close_job: j, .. } + | LiveRepairState::Repairing { repair_job: j, .. } + | LiveRepairState::Noop { noop_job: j, .. } + | LiveRepairState::Reopening { reopen_job: j, .. 
} + | LiveRepairState::FinalFlush { flush_job: j } => j.result.take(), } } } @@ -233,6 +262,47 @@ pub(crate) struct LiveRepairData { state: LiveRepairState, } +impl LiveRepairData { + /// If the given job is one that we're waiting for, store its result + fn on_job_complete(&mut self, ds_id: JobId, io: &DownstairsIO) { + // Build an array of `Option<&mut PendingJob>` to check + let jobs = match &mut self.state { + LiveRepairState::Closing { + close_job, + repair_job, + noop_job, + reopen_job, + } => [ + Some(close_job), + Some(repair_job), + Some(noop_job), + Some(reopen_job), + ], + LiveRepairState::Repairing { + repair_job, + noop_job, + reopen_job, + } => [Some(repair_job), Some(noop_job), Some(reopen_job), None], + LiveRepairState::Noop { + noop_job, + reopen_job, + } => [Some(noop_job), Some(reopen_job), None, None], + LiveRepairState::Reopening { reopen_job } => { + [Some(reopen_job), None, None, None] + } + LiveRepairState::FinalFlush { flush_job } => { + [Some(flush_job), None, None, None] + } + }; + for pending in jobs.into_iter().flatten() { + if pending.id == ds_id { + assert!(pending.result.is_none()); + pending.result = Some(io.result()) + } + } + } +} + #[derive(Debug)] pub(crate) struct ReconcileData { /// An ID uniquely identifying this reconciliation @@ -257,6 +327,7 @@ impl Downstairs { ds_target: ClientMap, tls_context: Option>, stats: DownstairsStatOuter, + io_limits: &IOLimits, log: Logger, ) -> Self { let mut clients = [None, None, None]; @@ -265,6 +336,7 @@ impl Downstairs { i, cfg.clone(), ds_target.get(&i).copied(), + io_limits.client_limits(i), log.new(o!("client" => i.get().to_string())), tls_context.clone(), )); @@ -291,6 +363,8 @@ impl Downstairs { }, cfg, next_flush: 0, + can_replay: true, + pending_barrier: 0, ds_active: ActiveJobs::new(), gw_active: HashSet::new(), acked_ids: AllocRingBuffer::new(2048), @@ -304,7 +378,6 @@ impl Downstairs { reconcile_repair_needed: 0, reconcile_repair_aborted: 0, log: log.new(o!("" => "downstairs".to_string())), - ackable_work: BTreeSet::new(), repair: None, ddef: None, stats, @@ -334,7 +407,12 @@ impl Downstairs { }); let stats = DownstairsStatOuter::default(); - let mut ds = Self::new(cfg, ClientMap::new(), None, stats, log); + // Build a set of fake IO limits that we won't hit + let io_limits = IOLimits::new(u32::MAX as usize, u32::MAX as usize); + + let mut ds = + Self::new(cfg, ClientMap::new(), None, stats, &io_limits, log); + // Create a fake repair address so this field is populated. for cid in ClientId::iter() { ds.clients[cid].repair_addr = @@ -396,23 +474,6 @@ impl Downstairs { } } - /// Checks whether we have ackable work - pub(crate) fn has_ackable_jobs(&self) -> bool { - !self.ackable_work.is_empty() - } - - /// Send back acks for all jobs that are `AckReady` - pub(crate) fn ack_jobs(&mut self) { - debug!(self.log, "ack_jobs called in Downstairs"); - - let ack_list = std::mem::take(&mut self.ackable_work); - let jobs_checked = ack_list.len(); - for ds_id_done in ack_list.iter() { - self.ack_job(*ds_id_done); - } - debug!(self.log, "ack_ready handled {jobs_checked} jobs"); - } - /// Send the ack for a single job back upstairs through `GuestWork` /// /// Update stats for the upstairs as well @@ -471,6 +532,10 @@ impl Downstairs { } debug!(self.log, "[A] ack job {}", ds_id); + if let Some(r) = &mut self.repair { + r.on_job_complete(ds_id, done); + } + // Copy (if present) read data back to the guest buffer they // provided to us, and notify any waiters. 
if let Some(res) = done.res.take() { @@ -486,7 +551,6 @@ impl Downstairs { } else { panic!("job {ds_id} not on gw_active list"); } - self.retire_check(ds_id); } /// Helper function to calculate pruned deps for a given job @@ -523,7 +587,7 @@ impl Downstairs { // Restart the IO task for that specific client, transitioning to a new // state. - self.clients[client_id].reinitialize(up_state); + self.clients[client_id].reinitialize(up_state, self.can_replay); for i in ClientId::iter() { // Clear per-client delay, because we're starting a new session @@ -939,33 +1003,19 @@ impl Downstairs { /// Checks whether live-repair can continue /// - /// If live-repair can continue, returns the relevant `JobId`, which should - /// be passed into `self.continue_live_repair(..)` - /// - /// This must be called before `Downstairs::ack_jobs`, because it looks for - /// the repair job in `self.ackable_work` to decide if it's done. - fn get_live_repair_job(&mut self) -> Option { - if let Some(repair) = &self.repair { - let ds_id = repair.state.active_job_id(); - if self.ackable_work.contains(&ds_id) { - Some(ds_id) - } else { - // The job that live-repair is waiting on isn't yet ackable, and - // it better not have already been acked. - let job = self.ds_active.get(&ds_id).unwrap(); - assert!(!job.acked); - None - } - } else { - // No live-repair in progress, nothing to do - None - } + /// If live-repair can continue, returns the relevant job result, which + /// should be passed into `self.continue_live_repair(..)`. + #[must_use] + fn get_live_repair_job(&mut self) -> Option> { + self.repair + .as_mut() + .and_then(|r| r.state.active_job_result()) } /// Pushes live-repair forward, if possible /// - /// It's possible that handling the current live-repair job will make - /// subsequent live-repair jobs ackable immediately + /// It's possible that handling the current live-repair job will cause + /// subsequent live-repair jobs to be completed immediately /// (`test_repair_extent_fail_noop_out_of_order` exercises this case). As /// such, this function will continue running until the next live-repair job /// is not ready. @@ -973,17 +1023,16 @@ impl Downstairs { &mut self, up_state: &UpstairsState, ) { - while let Some(ds_id) = self.get_live_repair_job() { - self.continue_live_repair(ds_id, up_state); + while let Some(r) = self.get_live_repair_job() { + self.continue_live_repair(r, up_state); } } - fn continue_live_repair(&mut self, ds_id: JobId, up_state: &UpstairsState) { - let done = self.ds_active.get(&ds_id).unwrap(); - assert!(!done.acked); - assert!(self.ackable_work.contains(&ds_id)); - let r = done.result(); - + fn continue_live_repair( + &mut self, + r: Result<(), CrucibleError>, + up_state: &UpstairsState, + ) { let Some(repair) = &self.repair else { panic!("cannot continue live-repair without self.repair"); }; @@ -1027,23 +1076,27 @@ impl Downstairs { // calling such functions; we have to extract everything we want from // the `LiveRepairData` before calling anything on `&mut self`. 
let repair = self.repair.as_mut().unwrap(); // reborrow - match repair.state { + + // steal `repair.state`, because we're about to replace it + match std::mem::replace(&mut repair.state, LiveRepairState::dummy()) { LiveRepairState::Closing { - close_id, - repair_id, - noop_id, - reopen_id, + close_job, + repair_job, + noop_job, + reopen_job, } => { info!( self.log, "RE:{} Wait for result from repair command {}", repair.active_extent, - repair_id, + repair_job.id, ); + let close_id = close_job.id; + let repair_id = repair_job.id; repair.state = LiveRepairState::Repairing { - repair_id, - noop_id, - reopen_id, + repair_job, + noop_job, + reopen_job, }; if repair.aborting_repair { self.create_and_enqueue_noop_io(vec![close_id], repair_id); @@ -1061,28 +1114,33 @@ impl Downstairs { }; } LiveRepairState::Repairing { - repair_id, - noop_id, - reopen_id, + repair_job, + noop_job, + reopen_job, } => { info!( self.log, "RE:{} Wait for result from NoOp command {}", repair.active_extent, - noop_id, + noop_job.id, ); - repair.state = LiveRepairState::Noop { noop_id, reopen_id }; + let repair_id = repair_job.id; + let noop_id = noop_job.id; + repair.state = LiveRepairState::Noop { + noop_job, + reopen_job, + }; self.create_and_enqueue_noop_io(vec![repair_id], noop_id); } - LiveRepairState::Noop { reopen_id, .. } => { + LiveRepairState::Noop { reopen_job, .. } => { info!( self.log, "RE:{} Wait for result from reopen command {}", repair.active_extent, - reopen_id, + reopen_job.id, ); // The reopen job was already queued, so just change state - repair.state = LiveRepairState::Reopening { reopen_id }; + repair.state = LiveRepairState::Reopening { reopen_job }; } LiveRepairState::Reopening { .. } => { // It's possible that we've reached the end of our extents! @@ -1100,14 +1158,16 @@ impl Downstairs { // reassignment is handled below. if finished || (repair.aborting_repair && !have_reserved_jobs) { // We're done, submit a final flush! - let flush_id = self.submit_flush(None, None); + let flush_id = self.submit_flush(None, None, None); info!(self.log, "LiveRepair final flush submitted"); cdt::up__to__ds__flush__start!(|| (flush_id.0)); // The borrow was dropped earlier, so reborrow `self.repair` self.repair.as_mut().unwrap().state = - LiveRepairState::FinalFlush { flush_id } + LiveRepairState::FinalFlush { + flush_job: PendingJob::new(flush_id), + } } else { // Keep going! 
repair.active_extent = next_extent; @@ -1169,7 +1229,7 @@ impl Downstairs { let nio = IOop::ExtentLiveNoOp { dependencies: deps }; cdt::gw__noop__start!(|| (noop_id.0)); - self.enqueue(noop_id, nio, None, ClientMap::new()) + self.enqueue(noop_id, nio, None, None) } #[allow(clippy::too_many_arguments)] @@ -1185,7 +1245,7 @@ impl Downstairs { cdt::gw__repair__start!(|| (repair_id.0, eid.0)); - self.enqueue(repair_id, repair_io, None, ClientMap::new()) + self.enqueue(repair_id, repair_io, None, None) } fn repair_or_noop( @@ -1320,10 +1380,10 @@ impl Downstairs { ); let state = LiveRepairState::Closing { - close_id, - repair_id, - reopen_id, - noop_id, + close_job: PendingJob::new(close_id), + repair_job: PendingJob::new(repair_id), + reopen_job: PendingJob::new(reopen_id), + noop_job: PendingJob::new(noop_id), }; if let Some(extent_count) = extent_count { self.repair = Some(LiveRepairData { @@ -1374,7 +1434,7 @@ impl Downstairs { cdt::gw__reopen__start!(|| (reopen_id.0, eid.0)); - self.enqueue(reopen_id, reopen_io, None, ClientMap::new()) + self.enqueue(reopen_id, reopen_io, None, None) } #[cfg(test)] @@ -1402,7 +1462,7 @@ impl Downstairs { block_size: 512, }; - self.enqueue(ds_id, aread, None, ClientMap::new()); + self.enqueue(ds_id, aread, None, None); ds_id } @@ -1433,7 +1493,7 @@ impl Downstairs { iblocks, request, is_write_unwritten, - ClientData::from_fn(|_| BackpressureGuard::dummy()), + IOLimitGuard::dummy(), ) } @@ -1442,7 +1502,7 @@ impl Downstairs { blocks: ImpactedBlocks, write: RawWrite, is_write_unwritten: bool, - bp_guard: ClientData, + io_guard: IOLimitGuard, ) -> JobId { let ds_id = self.next_id(); if is_write_unwritten { @@ -1485,7 +1545,7 @@ impl Downstairs { ds_id, awrite, Some(GuestBlockRes::Acked), // writes are always acked - bp_guard.into(), + Some(io_guard), ); ds_id } @@ -1515,7 +1575,7 @@ impl Downstairs { let close_io = self.create_close_io(eid, deps, repair.to_vec()); cdt::gw__close__start!(|| (close_id.0, eid.0)); - self.enqueue(close_id, close_io, None, ClientMap::new()) + self.enqueue(close_id, close_io, None, None) } /// Get the repair IDs and dependencies for this extent. @@ -1859,6 +1919,7 @@ impl Downstairs { &mut self, snapshot_details: Option, res: Option, + io_guard: Option, ) -> JobId { let next_id = self.next_id(); cdt::gw__flush__start!(|| (next_id.0)); @@ -1887,15 +1948,52 @@ impl Downstairs { extent_limit: extent_under_repair, }; - self.enqueue( - next_id, - flush, - res.map(GuestBlockRes::Other), - ClientMap::new(), - ); + self.pending_barrier += 1; + self.enqueue(next_id, flush, res.map(GuestBlockRes::Other), io_guard); next_id } + /// Checks to see whether a `Barrier` operation is needed + /// + /// A `Barrier` is needed if we have buffered more than + /// `IO_CACHED_MAX_BYTES/JOBS` worth of complete jobs, and there are no + /// other barrier (or flush) operations in flight + pub(crate) fn needs_barrier(&self) -> bool { + if self.pending_barrier > 0 { + return false; + } + + // n.b. This may not be 100% reliable: if different Downstairs have + // finished a different subset of jobs, then it's theoretically possible + // for each DownstairsClient to be under our limits, but for the true + // number of cached bytes/jobs to be over the limits. + // + // It's hard to imagine how we could encounter such a situation, given + // job dependencies and no out-of-order execution, so this is more of a + // "fun fact" and less an actual concern. 
+ let max_jobs = self + .clients + .iter() + .map(|c| { + let i = c.io_state_job_count(); + i.skipped + i.done + i.error + }) + .max() + .unwrap(); + let max_bytes = self + .clients + .iter() + .map(|c| { + let i = c.io_state_byte_count(); + i.skipped + i.done + i.error + }) + .max() + .unwrap(); + + max_jobs as u64 >= crate::IO_CACHED_MAX_JOBS + || max_bytes >= crate::IO_CACHED_MAX_BYTES + } + pub(crate) fn submit_barrier(&mut self) -> JobId { let next_id = self.next_id(); cdt::gw__barrier__start!(|| (next_id.0)); @@ -1906,12 +2004,8 @@ impl Downstairs { let dependencies = self.ds_active.deps_for_flush(next_id); debug!(self.log, "IO Barrier {next_id} has deps {dependencies:?}"); - self.enqueue( - next_id, - IOop::Barrier { dependencies }, - None, - ClientMap::new(), - ); + self.pending_barrier += 1; + self.enqueue(next_id, IOop::Barrier { dependencies }, None, None); next_id } @@ -1990,6 +2084,7 @@ impl Downstairs { blocks: ImpactedBlocks, data: Buffer, res: BlockRes, + io_guard: IOLimitGuard, ) -> JobId { // If there is a live-repair in progress that intersects with this read, // then reserve job IDs for those jobs. We must do this before @@ -2019,7 +2114,7 @@ impl Downstairs { }; let res = GuestBlockRes::Read(data, res); - self.enqueue(ds_id, aread, Some(res), ClientMap::new()); + self.enqueue(ds_id, aread, Some(res), Some(io_guard)); ds_id } @@ -2029,7 +2124,7 @@ impl Downstairs { blocks: ImpactedBlocks, write: RawWrite, is_write_unwritten: bool, - backpressure_guard: ClientData, + io_guard: IOLimitGuard, ) -> JobId { // If there is a live-repair in progress that intersects with this read, // then reserve job IDs for those jobs. @@ -2039,7 +2134,7 @@ impl Downstairs { blocks, write, is_write_unwritten, - backpressure_guard, + io_guard, ) } @@ -2083,7 +2178,7 @@ impl Downstairs { /// /// - enqueue the job in each of [Self::clients] (clients may skip the job) /// - add the job to [Self::ds_active] - /// - Mark the job as ackable if it was skipped by all downstairs + /// - Ack the job immediately if it was skipped by all downstairs /// - Check that the job was already acked if it's a write (the "fast ack" /// optimization, which is performed elsewhere) /// - Send the job to each downstairs client task (if not skipped) @@ -2092,39 +2187,33 @@ impl Downstairs { ds_id: JobId, io: IOop, res: Option, - mut bp_guard: ClientMap, + io_limits: Option, ) { let mut skipped = 0; let last_repair_extent = self.last_repair_extent(); + let mut io_limits = io_limits + .map(ClientMap::from) + .unwrap_or_else(ClientMap::new); + // Send the job to each client! 
let state = ClientData::from_fn(|cid| { let client = &mut self.clients[cid]; let r = client.enqueue(ds_id, &io, last_repair_extent); match r { - EnqueueResult::Send | EnqueueResult::Hold => { - // Update the per-client backpressure guard - if !bp_guard.contains(&cid) { - let g = client.backpressure_counters.increment(&io); - bp_guard.insert(cid, g); - } - if matches!(r, EnqueueResult::Send) { - self.send(ds_id, io.clone(), cid); - } + EnqueueResult::Send => self.send(ds_id, io.clone(), cid), + EnqueueResult::Hold => (), + EnqueueResult::Skip => { + // Take and drop the ClientIOLimitGuard, freeing up tokens + let _ = io_limits.take(&cid); + skipped += 1; } - EnqueueResult::Skip => skipped += 1, } r.state() }); // Writes are acked immediately, before being enqueued let acked = io.is_write(); - if skipped == 3 { - if !acked { - self.ackable_work.insert(ds_id); - } - warn!(self.log, "job {} skipped on all downstairs", &ds_id); - } // Puts the IO onto the downstairs work queue. self.ds_active.insert( @@ -2136,14 +2225,25 @@ impl Downstairs { res, replay: false, data: None, - backpressure_guard: bp_guard, + io_limits, }, ); + if acked { self.acked_ids.push(ds_id) } else { self.gw_active.insert(ds_id); } + + // Ack the job immediately if it was skipped on all 3x downstairs + // (and wasn't previously acked, i.e. isn't a write) + if skipped == 3 { + if !acked { + self.ack_job(ds_id); + } + self.retire_check(ds_id); + warn!(self.log, "job {} skipped on all downstairs", &ds_id); + } } /// Sends the given job to the given client @@ -2439,11 +2539,17 @@ impl Downstairs { Ok(ReplaceResult::Started) } + /// Checks whether the given client state should go from Offline -> Faulted + /// + /// # Panics + /// If the given client is not in the `Offline` state pub(crate) fn check_gone_too_long( &mut self, client_id: ClientId, up_state: &UpstairsState, ) { + assert_eq!(self.clients[client_id].state(), DsState::Offline); + let byte_count = self.clients[client_id].total_bytes_outstanding(); let work_count = self.clients[client_id].total_live_work(); let failed = if work_count > crate::IO_OUTSTANDING_MAX_JOBS { @@ -2458,6 +2564,13 @@ impl Downstairs { "downstairs failed, too many outstanding bytes {byte_count}" ); Some(ClientStopReason::TooManyOutstandingBytes) + } else if !self.can_replay { + // XXX can this actually happen? + warn!( + self.log, + "downstairs became ineligible for replay while offline" + ); + Some(ClientStopReason::IneligibleForReplay) } else { None }; @@ -2470,8 +2583,8 @@ impl Downstairs { /// Move all `New` and `InProgress` jobs for the given client to `Skipped` /// - /// This may lead to jobs being marked as ackable, since a skipped job - /// counts as complete in some circumstances. + /// This may lead to jobs being acked, since a skipped job counts as + /// complete in some circumstances. pub(crate) fn skip_all_jobs(&mut self, client_id: ClientId) { info!( self.log, @@ -2479,6 +2592,7 @@ impl Downstairs { self.ds_active.len(), ); + let mut ack_jobs = vec![]; let mut retire_check = vec![]; let mut number_jobs_skipped = 0; @@ -2502,12 +2616,16 @@ impl Downstairs { self.log, "[{}] notify = true for {}", client_id, ds_id ); - self.ackable_work.insert(*ds_id); + ack_jobs.push(*ds_id); } } } }); + for ds_id in ack_jobs { + self.ack_job(ds_id); + } + info!( self.log, "[{}] changed {} jobs to fault skipped", @@ -2589,9 +2707,12 @@ impl Downstairs { /// writes and if they aren't included in replay then the write will /// never start. 
fn retire_check(&mut self, ds_id: JobId) { - if !self.is_flush(ds_id) { - return; - } + let job = self.ds_active.get(&ds_id).expect("checked missing job"); + let can_replay = match job.work { + IOop::Flush { .. } => true, + IOop::Barrier { .. } => false, + _ => return, + }; // Only a completed flush will remove jobs from the active queue - // currently we have to keep everything around for use during replay @@ -2645,15 +2766,22 @@ impl Downstairs { for &id in &retired { let job = self.ds_active.remove(&id); + // Update our barrier count for the removed job + if matches!(job.work, IOop::Flush { .. } | IOop::Barrier { .. }) + { + self.pending_barrier = + self.pending_barrier.checked_sub(1).unwrap(); + } + // Jobs should have their backpressure contribution removed when // they are completed (in `process_io_completion_inner`), // **not** when they are retired. We'll do a sanity check here // and print a warning if that's not the case. for c in ClientId::iter() { - if job.backpressure_guard.contains(&c) { + if job.io_limits.contains(&c) { warn!( self.log, - "job {ds_id} had pending backpressure bytes \ + "job {ds_id} had pending io limits \ for client {c}" ); // Backpressure is decremented on drop @@ -2666,6 +2794,9 @@ impl Downstairs { for cid in ClientId::iter() { self.clients[cid].skipped_jobs.retain(|&x| x >= ds_id); } + + // Update the flag indicating whether replay is allowed + self.can_replay = can_replay; } } @@ -3091,11 +3222,10 @@ impl Downstairs { /// Wrapper for marking a single job as done from the given client /// - /// This can be used to test handling of ackable work, etc. + /// This can be used to test handling of job acks, etc /// - /// Returns true if the given job has gone from not ackable (not present in - /// `self.ackable_work`) to ackable. This is for historical reasons, - /// because it's often used in existing tests. + /// Returns true if the given job has gone from not acked to acked. This is + /// for historical reasons, because it's often used in existing tests. #[cfg(test)] pub fn process_ds_completion( &mut self, @@ -3105,7 +3235,7 @@ impl Downstairs { up_state: &UpstairsState, extent_info: Option, ) -> bool { - let was_ackable = self.ackable_work.contains(&ds_id); + let was_acked = self.ds_active.get(&ds_id).unwrap().acked; self.process_io_completion_inner( ds_id, @@ -3114,9 +3244,11 @@ impl Downstairs { up_state, extent_info, ); - let now_ackable = self.ackable_work.contains(&ds_id); - !was_ackable && now_ackable - // TODO should this also ack the job, to mimick our event loop? + // It's possible for the IO completion to have retired this job; in that + // case, it must have been acked before being retired. + let now_acked = + self.ds_active.get(&ds_id).map(|j| j.acked).unwrap_or(true); + !was_acked && now_acked } fn process_io_completion_inner( @@ -3152,38 +3284,22 @@ impl Downstairs { return; }; - if self.clients[client_id].process_io_completion( + let was_acked = job.acked; + let should_ack = self.clients[client_id].process_io_completion( ds_id, job, responses, deactivate, extent_info, - ) { - self.ackable_work.insert(ds_id); - } + ); - /* - * If all 3 jobs are done, we can check here to see if we can - * remove this job from the DS list. If we have completed the ack - * to the guest, then there will be no more work on this job - * but messages may still be unprocessed. - */ - if job.acked { - self.retire_check(ds_id); - } else { - // If we reach this then the job probably has errors and - // hasn't acked back yet. 
We check for NotAcked so we don't - // double count three done and return true if we already have - // AckReady set. + // If all 3 jobs are done, we can check here to see if we can remove + // this job from the DS list. + let wc = job.state_count(); + let complete = (wc.error + wc.skipped + wc.done) == 3; - // If we are a write or a flush with one success, then - // we must switch our state to failed. This condition is - // handled when we check the job result. - let wc = job.state_count(); - if (wc.error + wc.skipped + wc.done) == 3 { - self.ackable_work.insert(ds_id); - debug!(self.log, "[{}] Set AckReady {}", client_id, ds_id); - } + if !was_acked && (should_ack || complete) { + self.ack_job(ds_id); } // Decide what to do when we have an error from this IO. @@ -3246,6 +3362,10 @@ impl Downstairs { // Nothing to do here, no error! } } + + if complete { + self.retire_check(ds_id); + } } /// Accessor for [`Downstairs::reconcile_repaired`] @@ -3278,48 +3398,15 @@ impl Downstairs { } pub(crate) fn write_bytes_outstanding(&self) -> u64 { + // XXX this overlaps with io_state_byte_count self.clients .iter() .filter(|c| matches!(c.state(), DsState::Active)) - .map(|c| c.backpressure_counters.get_write_bytes()) + .map(|c| c.io_state_byte_count().in_progress) .max() .unwrap_or(0) } - pub(crate) fn jobs_outstanding(&self) -> u64 { - self.clients - .iter() - .filter(|c| matches!(c.state(), DsState::Active)) - .map(|c| c.backpressure_counters.get_jobs()) - .max() - .unwrap_or(0) - } - - /// Marks a single job as acked - /// - /// The job is removed from `self.ackable_work` and `acked` is set to `true` - /// in the job's state. - /// - /// This is only useful in tests; in real code, we'd also want to reply to - /// the guest when acking a job. - #[cfg(test)] - fn ack(&mut self, ds_id: JobId) { - /* - * Move AckReady to Acked. 
- */ - let Some(job) = self.ds_active.get_mut(&ds_id) else { - panic!("reqid {} is not active", ds_id); - }; - if !self.ackable_work.remove(&ds_id) { - panic!("Job {ds_id} is not ackable"); - } - - if job.acked { - panic!("Job {ds_id} already acked!"); - } - job.acked = true; - } - /// Returns all jobs in sorted order by [`JobId`] /// /// This function is used in unit tests for the Upstairs @@ -3338,11 +3425,6 @@ impl Downstairs { self.ds_active.get_extents_for(ds_id) } - #[cfg(test)] - pub fn ackable_work(&self) -> &BTreeSet { - &self.ackable_work - } - #[cfg(test)] pub fn completed(&self) -> &AllocRingBuffer { &self.retired_ids @@ -3418,7 +3500,7 @@ impl Downstairs { data, }, is_write_unwritten, - ClientData::from_fn(|_| BackpressureGuard::dummy()), + IOLimitGuard::dummy(), ) } @@ -3449,7 +3531,7 @@ impl Downstairs { let ddef = self.ddef.as_ref().unwrap(); let block_count = blocks.blocks(ddef).len(); let buf = Buffer::new(block_count, ddef.block_size() as usize); - self.submit_read(blocks, buf, BlockRes::dummy()) + self.submit_read(blocks, buf, BlockRes::dummy(), IOLimitGuard::dummy()) } /// Create a test downstairs which has all clients Active @@ -4159,23 +4241,17 @@ impl Downstairs { }); } - /// Assign the given number of write bytes to the backpressure counters - #[must_use] - pub(crate) fn early_write_backpressure( - &mut self, - bytes: u64, - ) -> ClientData { - ClientData::from_fn(|i| { - self.clients[i] - .backpressure_counters - .early_write_increment(bytes) - }) - } - pub(crate) fn set_ddef(&mut self, ddef: RegionDefinition) { self.ddef = Some(ddef); } + /// Checks whether there are any in-progress jobs present + pub(crate) fn has_live_jobs(&self) -> bool { + self.clients + .iter() + .any(|c| c.io_state_job_count().in_progress > 0) + } + /// Returns the per-client state for the given job /// /// This is a helper function to make unit tests shorter @@ -4222,14 +4298,15 @@ struct DownstairsBackpressureConfig { #[cfg(test)] pub(crate) mod test { - use super::Downstairs; + use super::{Downstairs, PendingJob}; use crate::{ downstairs::{LiveRepairData, LiveRepairState, ReconcileData}, live_repair::ExtentInfo, upstairs::UpstairsState, - ClientId, CrucibleError, DsState, ExtentFix, ExtentRepairIDs, IOState, - IOop, ImpactedAddr, ImpactedBlocks, JobId, RawReadResponse, - ReconcileIO, ReconcileIOState, ReconciliationId, SnapshotDetails, + BlockOpWaiter, ClientId, CrucibleError, DsState, ExtentFix, + ExtentRepairIDs, IOState, IOop, ImpactedAddr, ImpactedBlocks, JobId, + RawReadResponse, ReconcileIO, ReconcileIOState, ReconciliationId, + SnapshotDetails, }; use bytes::BytesMut; @@ -4269,10 +4346,6 @@ pub(crate) mod test { None, ); } - // Writes are fast-acked when first submitted - if !ds.ds_active.get(&ds_id).unwrap().work.is_write() { - ds.ack(ds_id); - } } fn set_all_reconcile(ds: &mut Downstairs) { @@ -4297,7 +4370,7 @@ pub(crate) mod test { let mut ds = Downstairs::test_default(); ds.force_active(); - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); assert!(!ds.process_ds_completion( next_id, @@ -4306,7 +4379,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4316,12 +4388,9 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); + assert!(ds.ds_active.get(&next_id).unwrap().acked); assert!(ds.retired_ids.is_empty()); - assert!(!ds.ds_active.get(&next_id).unwrap().acked); - 
ds.ack(next_id); - assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -4329,7 +4398,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert_eq!(ds.retired_ids.len(), 1); } @@ -4344,6 +4412,7 @@ pub(crate) mod test { snapshot_name: String::from("snap"), }), None, + None, ); assert!(!ds.process_ds_completion( @@ -4353,8 +4422,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4364,8 +4431,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4376,11 +4441,6 @@ pub(crate) mod test { None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); - assert_eq!(ds.retired_ids.len(), 1); } @@ -4389,7 +4449,7 @@ pub(crate) mod test { let mut ds = Downstairs::test_default(); ds.force_active(); - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); assert!(!ds.process_ds_completion( next_id, @@ -4398,7 +4458,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4408,7 +4467,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4418,10 +4476,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); assert_eq!(ds.retired_ids.len(), 1); // No skipped jobs here. 
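// --- Editor's note (not part of the patch) ---------------------------------
// A minimal, self-contained sketch of the ack/retire decision that the hunks
// above fold into `process_ds_completion`: a job is acked the first time it
// becomes ackable (writes are fast-acked at submit time), and a retire check
// runs once all three per-client states are terminal. Every name below
// (JobState, terminal_count, on_completion) is a hypothetical stand-in for
// illustration only, not the real Crucible types.
#[derive(Clone, Copy, PartialEq)]
enum JobState {
    InProgress,
    Done,
    Error,
    Skipped,
}

/// Counts clients whose state is terminal (done, error, or skipped).
fn terminal_count(states: &[JobState; 3]) -> usize {
    states
        .iter()
        .filter(|&&s| !matches!(s, JobState::InProgress))
        .count()
}

/// Models the per-completion decision: the first element says whether to ack
/// the job to the guest now, the second whether to run a retire check.
fn on_completion(
    states: &[JobState; 3],
    already_acked: bool,
    ackable: bool,
) -> (bool, bool) {
    let complete = terminal_count(states) == 3;
    let ack_now = !already_acked && (ackable || complete);
    (ack_now, complete)
}

fn main() {
    // A read with one successful completion is acked right away, but is only
    // retired once all three clients have reached a terminal state.
    let states = [JobState::Done, JobState::InProgress, JobState::InProgress];
    assert_eq!(on_completion(&states, false, true), (true, false));

    // A fast-acked write is not acked again when its last client finishes,
    // but it does become eligible for retirement.
    let states = [JobState::Done, JobState::Done, JobState::Skipped];
    assert_eq!(on_completion(&states, true, true), (false, true));
}
// --- End editor's note ------------------------------------------------------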
@@ -4435,7 +4489,7 @@ pub(crate) mod test { let mut ds = Downstairs::test_default(); ds.force_active(); - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); assert!(!ds.process_ds_completion( next_id, @@ -4444,7 +4498,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4454,7 +4507,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4464,10 +4516,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); assert_eq!(ds.retired_ids.len(), 1); } @@ -4488,11 +4536,8 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); assert!(ds.retired_ids.is_empty()); - ds.ack(next_id); - let response = Ok(build_read_response(&[])); assert!(!ds.process_ds_completion( @@ -4502,7 +4547,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); let response = Ok(build_read_response(&[])); @@ -4514,7 +4558,7 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); + // A flush is required to move work to completed assert!(ds.retired_ids.is_empty()); } @@ -4533,7 +4577,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); let response = Ok(build_read_response(&[])); @@ -4545,11 +4588,8 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); assert!(ds.retired_ids.is_empty()); - ds.ack(next_id); - let response = Ok(build_read_response(&[])); assert!(!ds.process_ds_completion( @@ -4559,7 +4599,7 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); + // A flush is required to move work to completed // That this is still zero is part of the test assert!(ds.retired_ids.is_empty()); @@ -4579,7 +4619,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4589,7 +4628,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); let response = Ok(build_read_response(&[])); @@ -4601,13 +4639,8 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - - ds.ack(next_id); - ds.retire_check(next_id); // A flush is required to move work to completed - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); } @@ -4625,7 +4658,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(!ds.process_ds_completion( @@ -4635,7 +4667,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); assert!(ds.process_ds_completion( @@ -4645,12 +4676,7 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - assert_eq!(ds.ackable_work.len(), 1); - ds.ack(next_id); - ds.retire_check(next_id); - - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); } @@ -4664,6 +4690,7 @@ pub(crate) mod test { let response = || 
Ok(build_read_response(&[])); + // ack on first reply assert!(ds.process_ds_completion( next_id, ClientId::new(2), @@ -4680,10 +4707,6 @@ pub(crate) mod test { None, )); - // emulated run in up_ds_listen - ds.ack(next_id); - ds.retire_check(next_id); - assert!(!ds.process_ds_completion( next_id, ClientId::new(1), @@ -4692,7 +4715,6 @@ pub(crate) mod test { None, )); - assert!(ds.ackable_work.is_empty()); // Work won't be completed until we get a flush. assert!(ds.retired_ids.is_empty()); } @@ -4716,6 +4738,7 @@ pub(crate) mod test { ds.force_active(); // send a write, and clients 0 and 1 will return errors + // job is fast-acked, so it starts as acked let next_id = ds.create_and_enqueue_generic_write_eob(is_write_unwritten); @@ -4740,25 +4763,14 @@ pub(crate) mod test { assert!(ds.ds_active.get(&next_id).unwrap().data.is_none()); - // Jobs should be acked - assert!(ds.ackable_work.is_empty()); - assert!(ds.ds_active.get(&next_id).unwrap().acked); - let response = Ok(Default::default()); - let res = ds.process_ds_completion( + assert!(!ds.process_ds_completion( next_id, ClientId::new(2), response, &UpstairsState::Active, None, - ); - - // The IO should not have been marked as ackable, because it was - // fast-acked - assert!(!res); - - // Both write and write_unwritten should be fast-acked - assert!(ds.ackable_work.is_empty()); + )); assert!(ds.clients[ClientId::new(0)].stats.downstairs_errors > 0); assert!(ds.clients[ClientId::new(1)].stats.downstairs_errors > 0); @@ -4790,9 +4802,8 @@ pub(crate) mod test { // Simulate completing both writes to downstairs 0 and 1 // - // write_unwritten jobs become ackable upon the second completion; - // normal writes were ackable from the start (and hence - // process_ds_completion always returns `false`) + // all writes are acked immediately upon submission, so + // process_ds_completion always returns `false`. assert!(!ds.process_ds_completion( id1, ClientId::new(0), @@ -4827,11 +4838,10 @@ pub(crate) mod test { assert!(ds.ds_active.get(&id2).unwrap().acked); // Work stays on active queue till the flush - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); // Create the flush and send it to the downstairs - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); // Simulate completing the flush to downstairs 0 and 1 assert!(!ds.process_ds_completion( @@ -4849,20 +4859,12 @@ pub(crate) mod test { None, )); - // Ack the flush back to the guest - ds.ack(flush_id); - // Make sure downstairs 0 and 1 update their last flush id and // that downstairs 2 does not. assert_eq!(ds.clients[ClientId::new(0)].last_flush, flush_id); assert_eq!(ds.clients[ClientId::new(1)].last_flush, flush_id); assert_eq!(ds.clients[ClientId::new(2)].last_flush, JobId(0)); - // Should not retire yet. 
- ds.retire_check(flush_id); - - assert!(ds.ackable_work.is_empty()); - // Make sure all work is still on the active side assert!(ds.retired_ids.is_empty()); @@ -4939,6 +4941,7 @@ pub(crate) mod test { None )); + // Confirm that we've stored this block let responses = ds.ds_active.get(&next_id).unwrap().data.as_ref(); assert!(responses.is_some()); assert_eq!( @@ -5014,7 +5017,7 @@ pub(crate) mod test { )); // One completion of a read means we can ACK - assert_eq!(ds.ackable_work.len(), 1); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Complete downstairs 1 and 2 let response = Ok(build_read_response(&[])); @@ -5035,19 +5038,13 @@ pub(crate) mod test { None )); - // Make sure the job is still active - assert!(ds.retired_ids.is_empty()); - - // Ack the job to the guest (this checks whether it's ack ready) - ds.ack(next_id); - - // Nothing left to ACK, but until the flush we keep the IO data. - assert!(ds.ackable_work.is_empty()); + // Make sure the job is not yet retired assert!(ds.retired_ids.is_empty()); // A flush is required to move work to completed // Create the flush then send it to all downstairs. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); // Complete the Flush at each downstairs. assert!(!ds.process_ds_completion( @@ -5073,12 +5070,9 @@ pub(crate) mod test { None )); - // ACK the flush and let retire_check move things along. - ds.ack(next_id); - ds.retire_check(next_id); + // Verify the flush has been acked + assert!(rx.try_wait().unwrap().is_ok()); - // Verify no more work to ack. - assert!(ds.ackable_work.is_empty()); // The read and the flush should now be moved to completed. assert_eq!(ds.retired_ids.len(), 2); } @@ -5091,12 +5085,12 @@ pub(crate) mod test { ds.force_active(); // Build our read, put it into the work queue - let next_id = ds.create_and_enqueue_generic_read_eob(); + let read_id = ds.create_and_enqueue_generic_read_eob(); // Downstairs 0 now has completed this work. let response = Ok(build_read_response(&[])); assert!(ds.process_ds_completion( - next_id, + read_id, ClientId::new(0), response, &UpstairsState::Active, @@ -5104,34 +5098,27 @@ pub(crate) mod test { )); // One completion of a read means we can ACK - assert_eq!(ds.ackable_work.len(), 1); - - // But, don't send the ack just yet. - // The job should be ack ready - assert!(ds.ackable_work.contains(&next_id)); - assert!(!ds.ds_active.get(&next_id).unwrap().acked); + assert!(ds.ds_active.get(&read_id).unwrap().acked); // A flush is required to move work to completed // Create the flush then send it to all downstairs. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_id = ds.submit_flush(None, Some(tx), None); // Send and complete the Flush at each downstairs. for cid in ClientId::iter() { ds.process_ds_completion( - next_id, + flush_id, cid, Ok(Default::default()), &UpstairsState::Active, None, ); } + assert!(rx.try_wait().unwrap().is_ok()); - // ACK the flush and let retire_check move things along. - ds.ack(next_id); - ds.retire_check(next_id); - - // Verify the read is still ack ready. - assert_eq!(ds.ackable_work.len(), 1); + // Verify the read is still acked + assert!(ds.ds_active.get(&read_id).unwrap().acked); // The the flush should now be moved to completed. assert_eq!(ds.retired_ids.len(), 1); // The read should still be on the queue. 
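// --- Editor's note (not part of the patch) ---------------------------------
// The reworked tests above no longer drive `ds.ack(..)` / `ds.retire_check(..)`
// by hand; instead they hand a completion handle into
// `submit_flush(None, Some(tx), None)` and later poll it with `rx.try_wait()`.
// Below is a std-only analogue of that waiter pattern, for orientation only:
// `Waiter`, `Completer`, and `pair` are made-up names, not Crucible's
// BlockOpWaiter API.
use std::sync::mpsc;

struct Completer<T>(mpsc::Sender<T>);
struct Waiter<T>(mpsc::Receiver<T>);

impl<T> Completer<T> {
    /// Reports the final result of the operation to whoever is waiting.
    fn complete(self, value: T) {
        // Ignore the case where the waiter has already gone away.
        let _ = self.0.send(value);
    }
}

impl<T> Waiter<T> {
    /// Non-blocking check, mirroring how the tests call `try_wait()` after
    /// they have driven enough per-client completions.
    fn try_wait(&mut self) -> Option<T> {
        self.0.try_recv().ok()
    }
}

/// Builds a connected (waiter, completer) pair.
fn pair<T>() -> (Waiter<T>, Completer<T>) {
    let (tx, rx) = mpsc::channel();
    (Waiter(rx), Completer(tx))
}

fn main() {
    let (mut rx, tx) = pair::<Result<(), String>>();
    assert!(rx.try_wait().is_none()); // nothing acked yet
    tx.complete(Ok(()));              // the flush is acked on quorum
    assert!(rx.try_wait().unwrap().is_ok());
}
// --- End editor's note ------------------------------------------------------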
@@ -5186,13 +5173,13 @@ pub(crate) mod test { assert!(ds.ds_active.get(&next_id).unwrap().acked); // Work stays on active queue till the flush - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); // Create the flush IO - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); - // Complete the flush on all three downstairs. + // Complete the flush on all three downstairs, acking on the 2nd assert!(!ds.process_ds_completion( next_id, ClientId::new(0), @@ -5215,10 +5202,7 @@ pub(crate) mod test { None )); - ds.ack(next_id); - ds.retire_check(next_id); - - assert!(ds.ackable_work.is_empty()); + assert!(rx.try_wait().unwrap().is_ok()); // The write and flush should now be completed. assert_eq!(ds.retired_ids.len(), 2); } @@ -5281,11 +5265,10 @@ pub(crate) mod test { assert!(ds.ds_active.get(&id2).unwrap().acked); // Work stays on active queue till the flush. - assert!(ds.ackable_work.is_empty()); assert!(ds.retired_ids.is_empty()); // Create and send the flush. - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); // Complete the flush on those downstairs. assert!(!ds.process_ds_completion( @@ -5303,13 +5286,6 @@ pub(crate) mod test { None )); - // Ack the flush - ds.ack(flush_id); - - // Should not retire yet - ds.retire_check(flush_id); - - assert!(ds.ackable_work.is_empty()); // Not done yet, until all clients do the work. assert!(ds.retired_ids.is_empty()); @@ -5373,12 +5349,7 @@ pub(crate) mod test { )); // One completion should allow for an ACK - assert_eq!(ds.ackable_work.len(), 1); - assert!(ds.ackable_work.contains(&next_id)); - assert!(!ds.ds_active.get(&next_id).unwrap().acked); - - // The Downstairs will ack jobs immediately, during the event handler - ds.ack(next_id); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Be sure the job is not yet in replay assert!(!ds.ds_active.get(&next_id).unwrap().replay); @@ -5387,7 +5358,6 @@ pub(crate) mod test { assert!(ds.ds_active.get(&next_id).unwrap().replay); // The IO is still acked - assert!(!ds.ackable_work.contains(&next_id)); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -5410,8 +5380,7 @@ pub(crate) mod test { &UpstairsState::Active, None )); - // We always ack jobs right away - ds.ack(next_id); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Complete the read on a 2nd downstairs. let response = Ok(build_read_response(&[1, 2, 3, 4])); @@ -5444,7 +5413,6 @@ pub(crate) mod test { &UpstairsState::Active, None )); - assert!(ds.ackable_work.is_empty()); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -5467,14 +5435,11 @@ pub(crate) mod test { &UpstairsState::Active, None )); - // Immediately ack the job (which checks that it's ready) - ds.ack(next_id); + assert!(ds.ds_active.get(&next_id).unwrap().acked); - // Should not retire yet - ds.retire_check(next_id); + // The read has been acked + assert!(ds.ds_active.get(&next_id).unwrap().acked); - // No new ackable work. - assert!(ds.ackable_work.is_empty()); // Verify the IO has not completed yet. assert!(ds.retired_ids.is_empty()); @@ -5494,7 +5459,6 @@ pub(crate) mod test { &UpstairsState::Active, None )); - assert!(ds.ackable_work.is_empty()); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -5540,11 +5504,9 @@ pub(crate) mod test { &UpstairsState::Active, None )); + assert!(ds.ds_active.get(&next_id).unwrap().acked); - // Ack the read to the guest. 
- ds.ack(next_id); - - // Before re replay_jobs, the IO is not replay + // Before we replay_jobs, the IO is not replay assert!(!ds.ds_active.get(&next_id).unwrap().replay); // Now, take that downstairs offline ds.replay_jobs(ClientId::new(0)); @@ -5612,6 +5574,7 @@ pub(crate) mod test { &UpstairsState::Active, None )); + assert!(ds.ds_active.get(&next_id).unwrap().acked); // Construct our fake response for another downstairs. let response = Ok(build_read_response( @@ -5627,9 +5590,6 @@ pub(crate) mod test { None )); - // Ack the read to the guest. - ds.ack(next_id); - // Now, take the second downstairs offline ds.replay_jobs(ClientId::new(1)); // Now the IO should be replay @@ -5708,7 +5668,7 @@ pub(crate) mod test { // The job should remain acked assert!(ds.ds_active.get(&id1).unwrap().acked); - // Re-submit and complete the write + // Re-submit and complete the write, which should not ack again assert_eq!(ds.job_state(id1, 1), IOState::InProgress); assert!(!ds.process_ds_completion( id1, @@ -5717,10 +5677,6 @@ pub(crate) mod test { &UpstairsState::Active, None )); - - // State should remain acked and not ackable - assert!(ds.ds_active.get(&id1).unwrap().acked); - assert!(ds.ackable_work.is_empty()) } #[test] @@ -5761,9 +5717,6 @@ pub(crate) mod test { // Check that it should have been fast-acked assert!(ds.ds_active.get(&id1).unwrap().acked); - // Verify no more ackable work - assert!(ds.ackable_work.is_empty()); - // Now, take that downstairs offline ds.replay_jobs(ClientId::new(0)); @@ -6390,10 +6343,8 @@ pub(crate) mod test { None, ); - let ack_list = ds.ackable_work().clone(); - assert!(ack_list.is_empty()); let done = ds.ds_active.get(&next_id).unwrap(); - assert!(done.acked); + assert!(done.acked); // fast-acked assert!(done.result().is_ok()); } @@ -6438,7 +6389,6 @@ pub(crate) mod test { let done = ds.ds_active.get(&next_id).unwrap(); assert!(done.acked); - assert!(ds.ackable_work().is_empty()); assert!(done.result().is_err()); } @@ -6492,7 +6442,6 @@ pub(crate) mod test { let done = ds.ds_active.get(&next_id).unwrap(); assert!(done.acked); - assert!(ds.ackable_work().is_empty()); assert!(done.result().is_err()); } @@ -6507,37 +6456,27 @@ pub(crate) mod test { // Create a flush, enqueue it on both the downstairs // and the guest work queues. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); let response = || Ok(Default::default()); - ds.process_ds_completion( + assert!(!ds.process_ds_completion( next_id, ClientId::new(0), response(), &UpstairsState::Active, None, - ); + )); - ds.process_ds_completion( + assert!(ds.process_ds_completion( next_id, ClientId::new(2), response(), &UpstairsState::Active, None, - ); - - let ack_list = ds.ackable_work().clone(); - assert_eq!(ack_list.len(), 1); - - // Simulation of what happens in up_ds_listen - for ds_id_done in ack_list.iter() { - assert_eq!(*ds_id_done, next_id); - - ds.ack(*ds_id_done); + )); // acked! - let done = ds.ds_active.get(ds_id_done).unwrap(); - assert!(done.result().is_ok()); - } + assert!(rx.try_wait().unwrap().is_ok()); } #[test] @@ -6553,7 +6492,9 @@ pub(crate) mod test { // Create a flush, enqueue it on both the downstairs // and the guest work queues. 
- let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); + + // It should be acked when the first completion returns assert!(ds.process_ds_completion( next_id, ClientId::new(0), @@ -6561,19 +6502,6 @@ pub(crate) mod test { &UpstairsState::Active, None, )); - - let ack_list = ds.ackable_work().clone(); - assert_eq!(ack_list.len(), 1); - - // Simulation of what happens in up_ds_listen - for ds_id_done in ack_list.iter() { - assert_eq!(*ds_id_done, next_id); - - ds.ack(*ds_id_done); - - let done = ds.ds_active.get(ds_id_done).unwrap(); - assert!(done.result().is_err()); - } } #[test] @@ -6587,7 +6515,8 @@ pub(crate) mod test { // Create a flush, enqueue it on both the downstairs // and the guest work queues. - let next_id = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let next_id = ds.submit_flush(None, Some(tx), None); // DS 1 has a failure, and this won't return true as we don't // have enough success yet to ACK to the guest. @@ -6609,18 +6538,7 @@ pub(crate) mod test { None )); - let ack_list = ds.ackable_work().clone(); - assert_eq!(ack_list.len(), 1); - - // Simulation of what happens in up_ds_listen - for ds_id_done in ack_list.iter() { - assert_eq!(*ds_id_done, next_id); - - ds.ack(*ds_id_done); - - let done = ds.ds_active.get(ds_id_done).unwrap(); - assert!(done.result().is_err()); - } + assert!(rx.try_wait().unwrap().is_err()); } #[test] @@ -6677,7 +6595,6 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Faulted); // Verify that this work should have been fast-acked - assert_eq!(ds.ackable_work().len(), 0); assert!(ds.ds_active.get(&next_id).unwrap().acked); } @@ -6718,7 +6635,7 @@ pub(crate) mod test { None )); - // the operation was previously marked as ackable, because it's a write + // the operation was already fast-acked, because it's a write assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6751,7 +6668,7 @@ pub(crate) mod test { let response = || Ok(build_read_response(&[])); - // Process the operation for client 1 this should return true + // Process the operation for client 1; we send the ack here assert!(ds.process_ds_completion( next_id, ClientId::new(1), @@ -6760,7 +6677,7 @@ pub(crate) mod test { None, )); - // Process the operation for client 2 this should return false + // Process the operation for client 2, which does not ack assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6769,13 +6686,9 @@ pub(crate) mod test { None )); - // Verify we can ack this work, then ack it. - assert_eq!(ds.ackable_work().len(), 1); - ds.ack(next_id); - // Perform the flush. let next_id = { - let next_id = ds.submit_flush(None, None); + let next_id = ds.submit_flush(None, None, None); // As this DS is failed, it should have been skipped assert_eq!( @@ -6796,7 +6709,7 @@ pub(crate) mod test { None, )); - // process_ds_operation should return true after we process this. + // process_ds_operation should ack after we process this. assert!(ds.process_ds_completion( next_id, ClientId::new(2), @@ -6805,13 +6718,6 @@ pub(crate) mod test { None )); - // ACK the flush and let retire_check move things along. - assert_eq!(ds.ackable_work().len(), 1); - ds.ack(next_id); - ds.retire_check(next_id); - - assert_eq!(ds.ackable_work().len(), 0); - // The write, the read, and now the flush should be completed. 
assert_eq!(ds.completed().len(), 3); @@ -6895,7 +6801,7 @@ pub(crate) mod test { let response = Ok(build_read_response(&[])); - // Process the operation for client 2, which makes the job ackable + // Process the operation for client 2, which acks the job assert!(ds.process_ds_completion( next_id, ClientId::new(2), @@ -6939,8 +6845,8 @@ pub(crate) mod test { None )); - // process_ds_operation for client 2; the job was already ackable - // (because it's a write) so this returns false + // process_ds_operation for client 2; the job was already acked (because + // it's a write) so this returns false assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6961,7 +6867,6 @@ pub(crate) mod test { // Verify this work should have been fast-acked assert!(ds.ds_active.get(&next_id).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 0); // Now, do another write. let next_id = ds.create_and_enqueue_generic_write_eob(false); @@ -6981,7 +6886,7 @@ pub(crate) mod test { )); // We don't process client 1, it had failed - // again, the job was ackable immediately + // again, the job was acked immediately because it's a write assert!(!ds.process_ds_completion( next_id, ClientId::new(2), @@ -6991,7 +6896,6 @@ pub(crate) mod test { )); // Verify we should have fast-acked this work too assert!(ds.ds_active.get(&next_id).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 0); // One downstairs should have a skipped job on its list. assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 0); @@ -7002,7 +6906,7 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 0); // Enqueue the flush. - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!( ds.job_states(flush_id), @@ -7019,7 +6923,7 @@ pub(crate) mod test { None )); - // process_ds_operation should return true after we process client 2. + // process_ds_operation should ack after we process client 2. assert!(ds.process_ds_completion( flush_id, ClientId::new(2), @@ -7028,14 +6932,6 @@ pub(crate) mod test { None )); - // ACK all the jobs and let retire_check move things along. - assert_eq!(ds.ackable_work().len(), 1); - // first two writes should have been fast-acked - ds.ack(flush_id); - ds.retire_check(flush_id); - - assert_eq!(ds.ackable_work().len(), 0); - // The two writes and the flush should be completed. assert_eq!(ds.completed().len(), 3); @@ -7091,7 +6987,7 @@ pub(crate) mod test { None )); - // Process the operation for client 2. The job was already ackable, so + // Process the operation for client 2. 
The job was already acked, so // this returns `false` assert!(!ds.process_ds_completion( write_id, @@ -7106,9 +7002,6 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(1)].state(), DsState::Faulted); assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Active); - // This job was immediately acked - assert_eq!(ds.ackable_work().len(), 0); - // Verify the read switched from InProgress to Skipped let job = ds.ds_active.get(&read_id).unwrap(); @@ -7175,7 +7068,7 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Active); // The write was fast-acked, and the read is still going - assert!(ds.ackable_work().is_empty()); + assert!(ds.ds_active.get(&write_id).unwrap().acked); // Verify the read switched from new to skipped @@ -7233,8 +7126,9 @@ pub(crate) mod test { ); } - // The write has been fast-acked; the read is ackable - assert_eq!(ds.ackable_work().len(), 1); + // The write has been fast-acked; the read has been acked + assert!(ds.ds_active.get(&write_one).unwrap().acked); + assert!(ds.ds_active.get(&read_one).unwrap().acked); // Verify all IOs are done @@ -7254,39 +7148,35 @@ pub(crate) mod test { let err_response = Err(CrucibleError::GenericError("bad".to_string())); - // Process the operation for client 0, 1 - ds.process_ds_completion( + // Process the operation for client 0, 1, 2. It should already be acked + assert!(!ds.process_ds_completion( write_fail, ClientId::new(0), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_fail, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_fail, ClientId::new(2), err_response, &UpstairsState::Active, None, - ); + )); // Verify client states assert_eq!(ds.clients[ClientId::new(0)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(1)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Faulted); - // Only the first read remains ackable - assert_eq!(ds.ackable_work().len(), 1); - // Verify all IOs are done - for cid in ClientId::iter() { let job = ds.ds_active.get(&read_one).unwrap(); assert_eq!(job.state[cid], IOState::Done); @@ -7343,8 +7233,9 @@ pub(crate) mod test { ); } - // Verify the read can be acked (the write was fast-acked) - assert_eq!(ds.ackable_work().len(), 1); + // Verify the read has been acked (the write was fast-acked) + assert!(ds.ds_active.get(&read_one).unwrap().acked); + assert!(ds.ds_active.get(&write_one).unwrap().acked); // Verify all IOs are done @@ -7366,39 +7257,35 @@ pub(crate) mod test { let read_two = ds.create_and_enqueue_generic_read_eob(); // Process the write operation for downstairs 0, 1 - ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_fail, ClientId::new(0), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_fail, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); + )); // Have downstairs 2 return error. 
- ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_fail, ClientId::new(2), err_response, &UpstairsState::Active, None, - ); + )); // Verify client states assert_eq!(ds.clients[ClientId::new(0)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(1)].state(), DsState::Active); assert_eq!(ds.clients[ClientId::new(2)].state(), DsState::Faulted); - // The first read remains the only ackable work - assert_eq!(ds.ackable_work().len(), 1); - // Verify all IOs are done - for cid in ClientId::iter() { // First read, still Done let job = ds.ds_active.get(&read_one).unwrap(); @@ -7444,7 +7331,7 @@ pub(crate) mod test { let read_one = ds.create_and_enqueue_generic_read_eob(); // Finally, add a flush - let flush_one = ds.submit_flush(None, None); + let flush_one = ds.submit_flush(None, None, None); let job = ds.ds_active.get(&write_one).unwrap(); assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); @@ -7467,87 +7354,75 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 0); // Do the write - ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( write_one, ClientId::new(2), Ok(Default::default()), &UpstairsState::Active, None, - ); + )); // Make the read ok response, do the read let rr = || Ok(build_read_response(&[])); - ds.process_ds_completion( + assert!(ds.process_ds_completion( read_one, ClientId::new(1), rr(), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + assert!(!ds.process_ds_completion( read_one, ClientId::new(2), rr(), &UpstairsState::Active, None, - ); + )); + + let job = ds.ds_active.get(&read_one).unwrap(); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + assert_eq!(job.state[ClientId::new(2)], IOState::Done); + let job = ds.ds_active.get(&write_one).unwrap(); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + assert_eq!(job.state[ClientId::new(2)], IOState::Done); // Do the flush - ds.process_ds_completion( + assert!(!ds.process_ds_completion( flush_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - ds.process_ds_completion( + )); + let job = ds.ds_active.get(&flush_one).unwrap(); + assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + assert_eq!(job.state[ClientId::new(2)], IOState::InProgress); + assert!(ds.process_ds_completion( flush_one, ClientId::new(2), Ok(Default::default()), &UpstairsState::Active, None, - ); - - // Verify three jobs can be acked (or should have been fast-acked) - assert!(ds.ds_active.get(&write_one).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 2); - - // Verify all IOs are done - - let job = ds.ds_active.get(&read_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - assert_eq!(job.state[ClientId::new(2)], IOState::Done); - let job = ds.ds_active.get(&write_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - assert_eq!(job.state[ClientId::new(2)], IOState::Done); - let job = ds.ds_active.get(&flush_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - assert_eq!(job.state[ClientId::new(2)], IOState::Done); - - ds.ack(read_one); - // write has already been fast-acked - ds.ack(flush_one); - ds.retire_check(flush_one); + )); + // The flush retires itself and preceding jobs // Skipped jobs just has the flush 
assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); assert!(ds.clients[ClientId::new(0)] .skipped_jobs .contains(&JobId(1002))); - assert_eq!(ds.ackable_work().len(), 0); // The writes, the read, and the flush should be completed. assert_eq!(ds.completed().len(), 3); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); } @@ -7570,7 +7445,8 @@ pub(crate) mod test { let read_one = ds.create_and_enqueue_generic_read_eob(); // Finally, add a flush - let flush_one = ds.submit_flush(None, None); + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_one = ds.submit_flush(None, Some(tx), None); let job = ds.ds_active.get(&write_one).unwrap(); assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); @@ -7593,51 +7469,42 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 3); // Do the write - ds.process_ds_completion( + assert!(!ds.process_ds_completion( write_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); + )); + let job = ds.ds_active.get(&write_one).unwrap(); + assert!(job.acked); // fast-ack + assert_eq!(job.state[ClientId::new(1)], IOState::Done); // Make the read ok response, do the read let rr = Ok(build_read_response(&[])); - ds.process_ds_completion( + assert!(ds.process_ds_completion( read_one, ClientId::new(1), rr, &UpstairsState::Active, None, - ); + )); - // Do the flush - ds.process_ds_completion( + // Verify that the read IO is done + let job = ds.ds_active.get(&read_one).unwrap(); + assert!(job.acked); + assert_eq!(job.state[ClientId::new(1)], IOState::Done); + + // Do the flush, which retires other jobs + assert!(ds.process_ds_completion( flush_one, ClientId::new(1), Ok(Default::default()), &UpstairsState::Active, None, - ); - - // Verify the write should be fast-acked and the others are ackable - assert!(ds.ds_active.get(&write_one).unwrap().acked); - assert_eq!(ds.ackable_work().len(), 2); - - // Verify all IOs are done - - let job = ds.ds_active.get(&read_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - let job = ds.ds_active.get(&write_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - let job = ds.ds_active.get(&flush_one).unwrap(); - assert_eq!(job.state[ClientId::new(1)], IOState::Done); - - ds.ack(read_one); - // write should be fast-acked - ds.ack(flush_one); - ds.retire_check(flush_one); + )); + assert!(rx.try_wait().unwrap().is_err()); // Skipped jobs now just have the flush. assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); @@ -7648,12 +7515,9 @@ pub(crate) mod test { assert!(ds.clients[ClientId::new(2)] .skipped_jobs .contains(&JobId(1002))); - assert_eq!(ds.ackable_work().len(), 0); // The writes, the read, and the flush should be completed. assert_eq!(ds.completed().len(), 3); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); } @@ -7684,21 +7548,12 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); // Verify jobs can be acked. - assert_eq!(ds.ackable_work().len(), 1); - - // Verify all IOs are done - // We are simulating what would happen here by the up_ds_listen - // task, after it receives a notification from the ds_done_tx. - - ds.ack(read_one); - - ds.retire_check(read_one); + assert!(ds.ds_active.get(&read_one).unwrap().acked); // Our skipped jobs have not yet been cleared. 
assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(1)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); - assert_eq!(ds.ackable_work().len(), 0); } #[test] @@ -7727,17 +7582,10 @@ pub(crate) mod test { assert_eq!(ds.clients[ClientId::new(1)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); - // Verify all IOs are done - // We are simulating what would happen here by the up_ds_listen - // task, after it receives a notification from the ds_done_tx. - - ds.retire_check(write_one); // No flush, no change in skipped jobs. assert_eq!(ds.clients[ClientId::new(0)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(1)].skipped_jobs.len(), 1); assert_eq!(ds.clients[ClientId::new(2)].skipped_jobs.len(), 1); - - assert_eq!(ds.ackable_work().len(), 0); } #[test] @@ -7754,36 +7602,19 @@ pub(crate) mod test { } // Create a flush. - let flush_one = ds.submit_flush(None, None); - let job = ds.ds_active.get(&flush_one).unwrap(); - assert_eq!(job.state[ClientId::new(0)], IOState::Skipped); - assert_eq!(job.state[ClientId::new(1)], IOState::Skipped); - assert_eq!(job.state[ClientId::new(2)], IOState::Skipped); + let (mut rx, tx) = BlockOpWaiter::pair(); + let _ = ds.submit_flush(None, Some(tx), None); + // This job should be insta-skipped and then retired, so it's gone + assert!(rx.try_wait().unwrap().is_err()); for cid in ClientId::iter() { assert_eq!(ds.clients[cid].skipped_jobs.len(), 1); assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); } - // Verify jobs can be acked. - assert_eq!(ds.ackable_work().len(), 1); - - ds.ack(flush_one); - ds.retire_check(flush_one); - - assert_eq!(ds.ackable_work().len(), 0); - // The flush should remove all work from the ds queue. assert_eq!(ds.completed().len(), 1); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); - - // Skipped jobs still has the flush. - for cid in ClientId::iter() { - assert_eq!(ds.clients[cid].skipped_jobs.len(), 1); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); - } } #[test] @@ -7803,40 +7634,30 @@ pub(crate) mod test { // Create a read. let read_one = ds.create_and_enqueue_generic_read_eob(); + // The read should be acked because all downstairs are offline + assert!(ds.ds_active.get(&read_one).unwrap().acked); + // Create a write. let write_one = ds.create_and_enqueue_generic_write_eob(false); - // Create a flush - let flush_one = ds.submit_flush(None, None); - // Verify all jobs can be acked (or should have been fast-acked) - let write_job = ds.ds_active.get(&write_one).unwrap(); - assert!(write_job.acked); - assert_eq!(ds.ackable_work().len(), 2); + // The write should be fast-acked + assert!(ds.ds_active.get(&write_one).unwrap().acked); // Skipped jobs are not yet cleared. for cid in ClientId::iter() { - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1001))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1002))); - assert_eq!(ds.clients[cid].skipped_jobs.len(), 3); + assert!(ds.clients[cid].skipped_jobs.contains(&read_one)); + assert!(ds.clients[cid].skipped_jobs.contains(&write_one)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 2); } - // Verify all IOs are done - // We are simulating what would happen here by the up_ds_listen - // task, after it receives a notification from the ds_done_tx. 
- ds.ack(read_one); - // write has already been fast-acked - ds.ack(flush_one); - - // Don't bother with retire check for read/write, just flush - ds.retire_check(flush_one); - - assert_eq!(ds.ackable_work().len(), 0); + // Create a flush + let (mut rx, tx) = BlockOpWaiter::pair(); + let _flush_one = ds.submit_flush(None, Some(tx), None); + // The flush instantly returns an error and retires old jobs + assert!(rx.try_wait().unwrap().is_err()); // The flush should remove all work from the ds queue. assert_eq!(ds.completed().len(), 3); - // No more ackable work - assert_eq!(ds.ackable_work().len(), 0); // No more jobs on the queue assert_eq!(ds.active_count(), 0); @@ -7866,41 +7687,39 @@ pub(crate) mod test { // Create a read, write, flush let read_one = ds.create_and_enqueue_generic_read_eob(); let write_one = ds.create_and_enqueue_generic_write_eob(false); - let flush_one = ds.submit_flush(None, None); - - // Create more IOs. - let _read_two = ds.create_and_enqueue_generic_read_eob(); - let _write_two = ds.create_and_enqueue_generic_write_eob(false); - let _flush_two = ds.submit_flush(None, None); - // Six jobs have been skipped. + // The read and write are skipped + acked for cid in ClientId::iter() { - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1000))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1001))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1002))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1003))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1004))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1005))); - assert_eq!(ds.clients[cid].skipped_jobs.len(), 6); + assert!(ds.clients[cid].skipped_jobs.contains(&read_one)); + assert!(ds.clients[cid].skipped_jobs.contains(&write_one)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 2); } - - // Ack the read and flush, and confirm that the write was fast-acked - ds.ack(read_one); + assert!(ds.ds_active.get(&read_one).unwrap().acked); assert!(ds.ds_active.get(&write_one).unwrap().acked); - ds.ack(flush_one); - assert_eq!(ds.ackable_work().len(), 2); - // Don't bother with retire check for read/write, just flush - ds.retire_check(flush_one); + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_one = ds.submit_flush(None, Some(tx), None); + assert!(rx.try_wait().unwrap().is_err()); + + let read_two = ds.create_and_enqueue_generic_read_eob(); + let write_two = ds.create_and_enqueue_generic_write_eob(false); - // The first two skipped jobs are now cleared and the non-acked - // jobs remain on the list, as well as the last flush. 
for cid in ClientId::iter() { - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1002))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1003))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1004))); - assert!(ds.clients[cid].skipped_jobs.contains(&JobId(1005))); - assert_eq!(ds.clients[cid].skipped_jobs.len(), 4); + assert!(ds.clients[cid].skipped_jobs.contains(&flush_one)); + assert!(ds.clients[cid].skipped_jobs.contains(&read_two)); + assert!(ds.clients[cid].skipped_jobs.contains(&write_two)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 3); + } + assert!(ds.ds_active.get(&read_two).unwrap().acked); + assert!(ds.ds_active.get(&write_two).unwrap().acked); + + let (mut rx, tx) = BlockOpWaiter::pair(); + let flush_two = ds.submit_flush(None, Some(tx), None); + assert!(rx.try_wait().unwrap().is_err()); + + for cid in ClientId::iter() { + assert!(ds.clients[cid].skipped_jobs.contains(&flush_two)); + assert_eq!(ds.clients[cid].skipped_jobs.len(), 1); } } @@ -8670,10 +8489,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: extent_repair_ids.close_id, - repair_id: extent_repair_ids.repair_id, - noop_id: extent_repair_ids.noop_id, - reopen_id: extent_repair_ids.reopen_id, + close_job: PendingJob::new(extent_repair_ids.close_id), + repair_job: PendingJob::new(extent_repair_ids.repair_id), + noop_job: PendingJob::new(extent_repair_ids.noop_id), + reopen_job: PendingJob::new(extent_repair_ids.reopen_id), }, }); } @@ -8806,7 +8625,7 @@ pub(crate) mod test { let read_id = ds.submit_read_block(eid, BlockOffset(0)); let write_id = ds.submit_test_write_block(eid, BlockOffset(1), false); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); let repair_ids = create_and_enqueue_repair_ops(&mut ds, eid); @@ -8923,7 +8742,7 @@ pub(crate) mod test { let eid = ExtentId(1); let repair_ids = create_and_enqueue_repair_ops(&mut ds, eid); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 5); @@ -9003,9 +8822,9 @@ pub(crate) mod test { let mut ds = Downstairs::repair_test_one_repair(); - let flush_id1 = ds.submit_flush(None, None); + let flush_id1 = ds.submit_flush(None, None, None); let repair_ids = create_and_enqueue_repair_ops(&mut ds, ExtentId(1)); - let flush_id2 = ds.submit_flush(None, None); + let flush_id2 = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 6); @@ -9038,7 +8857,7 @@ pub(crate) mod test { let repair_ids1 = create_and_enqueue_repair_ops(&mut ds, ExtentId(0)); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); let repair_ids2 = create_and_enqueue_repair_ops(&mut ds, ExtentId(1)); @@ -9531,7 +9350,7 @@ pub(crate) mod test { let read_id = ds.submit_read_block(ExtentId(0), BlockOffset(1)); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 3); @@ -9583,10 +9402,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); @@ -9649,10 +9468,10 @@ pub(crate) mod test { 
repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); @@ -9711,14 +9530,14 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); - let flush_id = ds.submit_flush(None, None); + let flush_id = ds.submit_flush(None, None, None); assert_eq!(ds.ds_active.len(), 1); assert!(ds.get_deps(flush_id).is_empty()); @@ -9740,10 +9559,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1004), - repair_id: JobId(1005), - noop_id: JobId(1006), - reopen_id: JobId(1007), + close_job: PendingJob::new(JobId(1004)), + repair_job: PendingJob::new(JobId(1005)), + noop_job: PendingJob::new(JobId(1006)), + reopen_job: PendingJob::new(JobId(1007)), }, }); @@ -9804,10 +9623,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(next_id), - repair_id: JobId(next_id + 1), - noop_id: JobId(next_id + 2), - reopen_id: JobId(next_id + 3), + close_job: PendingJob::new(JobId(next_id)), + repair_job: PendingJob::new(JobId(next_id + 1)), + noop_job: PendingJob::new(JobId(next_id + 2)), + reopen_job: PendingJob::new(JobId(next_id + 3)), }, }); @@ -9826,10 +9645,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(next_id), - repair_id: JobId(next_id + 1), - noop_id: JobId(next_id + 2), - reopen_id: JobId(next_id + 3), + close_job: PendingJob::new(JobId(next_id)), + repair_job: PendingJob::new(JobId(next_id + 1)), + noop_job: PendingJob::new(JobId(next_id + 2)), + reopen_job: PendingJob::new(JobId(next_id + 3)), }, }); @@ -9972,10 +9791,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(next_id), - repair_id: JobId(next_id + 1), - noop_id: JobId(next_id + 2), - reopen_id: JobId(next_id + 3), + close_job: PendingJob::new(JobId(next_id)), + repair_job: PendingJob::new(JobId(next_id + 1)), + noop_job: PendingJob::new(JobId(next_id + 2)), + reopen_job: PendingJob::new(JobId(next_id + 3)), }, }); @@ -10228,10 +10047,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); @@ -10323,10 +10142,10 @@ pub(crate) mod test { repair_downstairs: vec![ClientId::new(1)], aborting_repair: false, state: LiveRepairState::Closing { - close_id: JobId(1000), - repair_id: JobId(1001), - noop_id: JobId(1002), - reopen_id: JobId(1003), + close_job: 
PendingJob::new(JobId(1000)), + repair_job: PendingJob::new(JobId(1001)), + noop_job: PendingJob::new(JobId(1002)), + reopen_job: PendingJob::new(JobId(1003)), }, }); diff --git a/upstairs/src/dummy_downstairs_tests.rs b/upstairs/src/dummy_downstairs_tests.rs index e9e277a31..fb7254fbd 100644 --- a/upstairs/src/dummy_downstairs_tests.rs +++ b/upstairs/src/dummy_downstairs_tests.rs @@ -14,7 +14,10 @@ use crate::BlockIO; use crate::Buffer; use crate::CrucibleError; use crate::DsState; -use crate::{IO_OUTSTANDING_MAX_BYTES, IO_OUTSTANDING_MAX_JOBS}; +use crate::{ + IO_CACHED_MAX_BYTES, IO_CACHED_MAX_JOBS, IO_OUTSTANDING_MAX_BYTES, + IO_OUTSTANDING_MAX_JOBS, +}; use crucible_client_types::CrucibleOpts; use crucible_common::Block; use crucible_common::BlockIndex; @@ -264,23 +267,26 @@ impl DownstairsHandle { /// # Panics /// If a non-flush message arrives pub async fn ack_flush(&mut self) -> u64 { - let Message::Flush { - job_id, - flush_number, - upstairs_id, - .. - } = self.recv().await.unwrap() - else { - panic!("saw non flush!"); - }; - self.send(Message::FlushAck { - upstairs_id, - session_id: self.upstairs_session_id.unwrap(), - job_id, - result: Ok(()), - }) - .unwrap(); - flush_number + match self.recv().await.unwrap() { + Message::Flush { + job_id, + flush_number, + upstairs_id, + .. + } => { + self.send(Message::FlushAck { + upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id, + result: Ok(()), + }) + .unwrap(); + flush_number + } + m => { + panic!("saw non flush {m:?}"); + } + } } /// Awaits a `Message::Write { .. }` and sends a `WriteAck` @@ -290,17 +296,45 @@ impl DownstairsHandle { /// # Panics /// If a non-write message arrives pub async fn ack_write(&mut self) -> JobId { - let Message::Write { header, .. } = self.recv().await.unwrap() else { - panic!("saw non write!"); - }; - self.send(Message::WriteAck { - upstairs_id: header.upstairs_id, - session_id: self.upstairs_session_id.unwrap(), - job_id: header.job_id, - result: Ok(()), - }) - .unwrap(); - header.job_id + match self.recv().await.unwrap() { + Message::Write { header, .. } => { + self.send(Message::WriteAck { + upstairs_id: header.upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id: header.job_id, + result: Ok(()), + }) + .unwrap(); + header.job_id + } + m => panic!("saw non write: {m:?}"), + } + } + + /// Awaits a `Message::Barrier { .. }` and sends a `BarrierAck` + /// + /// Returns the job ID for further checks. + /// + /// # Panics + /// If a non-write message arrives + pub async fn ack_barrier(&mut self) -> JobId { + match self.recv().await.unwrap() { + Message::Barrier { + upstairs_id, + job_id, + .. + } => { + self.send(Message::BarrierAck { + upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id, + result: Ok(()), + }) + .unwrap(); + job_id + } + m => panic!("saw non barrier: {m:?}"), + } } /// Awaits a `Message::Read` and sends a blank `ReadResponse` @@ -310,26 +344,27 @@ impl DownstairsHandle { /// # Panics /// If a non-read message arrives pub async fn ack_read(&mut self) -> JobId { - let Message::ReadRequest { - job_id, - upstairs_id, - .. 
- } = self.recv().await.unwrap() - else { - panic!("saw non write!"); - }; - let (block, data) = make_blank_read_response(); - self.send(Message::ReadResponse { - header: ReadResponseHeader { - upstairs_id, - session_id: self.upstairs_session_id.unwrap(), + match self.recv().await.unwrap() { + Message::ReadRequest { job_id, - blocks: Ok(vec![block]), - }, - data: data.clone(), - }) - .unwrap(); - job_id + upstairs_id, + .. + } => { + let (block, data) = make_blank_read_response(); + self.send(Message::ReadResponse { + header: ReadResponseHeader { + upstairs_id, + session_id: self.upstairs_session_id.unwrap(), + job_id, + blocks: Ok(vec![block]), + }, + data: data.clone(), + }) + .unwrap(); + job_id + } + m => panic!("saw non read {m:?}"), + } } } @@ -471,14 +506,31 @@ pub struct TestHarness { /// Number of extents in `TestHarness::default_config` const DEFAULT_EXTENT_COUNT: u32 = 25; +const DEFAULT_BLOCK_COUNT: u64 = 10; + +struct TestOpts { + flush_timeout: f32, + read_only: bool, + disable_backpressure: bool, +} impl TestHarness { pub async fn new() -> TestHarness { - Self::new_(false).await + Self::new_with_opts(TestOpts { + flush_timeout: 86400.0, + read_only: false, + disable_backpressure: true, + }) + .await } pub async fn new_ro() -> TestHarness { - Self::new_(true).await + Self::new_with_opts(TestOpts { + flush_timeout: 86400.0, + read_only: true, + disable_backpressure: true, + }) + .await } pub fn ds1(&mut self) -> &mut DownstairsHandle { @@ -494,7 +546,7 @@ impl TestHarness { // IO_OUTSTANDING_MAX_BYTES in less than IO_OUTSTANDING_MAX_JOBS, // i.e. letting us test both byte and job fault conditions. extent_count: DEFAULT_EXTENT_COUNT, - extent_size: Block::new_512(10), + extent_size: Block::new_512(DEFAULT_BLOCK_COUNT), gen_numbers: vec![0u64; DEFAULT_EXTENT_COUNT as usize], flush_numbers: vec![0u64; DEFAULT_EXTENT_COUNT as usize], @@ -502,10 +554,10 @@ impl TestHarness { } } - async fn new_(read_only: bool) -> TestHarness { + async fn new_with_opts(opts: TestOpts) -> TestHarness { let log = csl(); - let cfg = Self::default_config(read_only); + let cfg = Self::default_config(opts.read_only); let ds1 = cfg.clone().start(log.new(o!("downstairs" => 1))).await; let ds2 = cfg.clone().start(log.new(o!("downstairs" => 2))).await; @@ -514,15 +566,16 @@ impl TestHarness { // Configure our guest without backpressure, to speed up tests which // require triggering a timeout let (g, mut io) = Guest::new(Some(log.clone())); - io.disable_queue_backpressure(); - io.disable_byte_backpressure(); + if opts.disable_backpressure { + io.disable_backpressure(); + } let guest = Arc::new(g); let crucible_opts = CrucibleOpts { id: Uuid::new_v4(), target: vec![ds1.local_addr, ds2.local_addr, ds3.local_addr], - flush_timeout: Some(86400.0), - read_only, + flush_timeout: Some(opts.flush_timeout), + read_only: opts.read_only, ..Default::default() }; @@ -1446,11 +1499,12 @@ async fn test_byte_fault_condition() { // out. 
const WRITE_SIZE: usize = 105 * 1024; // 105 KiB let write_buf = BytesMut::from(vec![1; WRITE_SIZE].as_slice()); // 50 KiB + let barrier_point = IO_CACHED_MAX_BYTES as usize / write_buf.len(); let num_jobs = IO_OUTSTANDING_MAX_BYTES as usize / write_buf.len() + 10; assert!(num_jobs < IO_OUTSTANDING_MAX_JOBS); // First, we'll send jobs until the timeout - for _ in 0..num_jobs { + for i in 0..num_jobs { // We must `spawn` here because `write` will wait for the response // to come back before returning let write_buf = write_buf.clone(); @@ -1458,11 +1512,11 @@ async fn test_byte_fault_condition() { guest.write(BlockIndex(0), write_buf).await.unwrap(); }); - // Before we're kicked out, assert we're seeing the read requests - assert!(matches!( - harness.ds1().recv().await.unwrap(), - Message::Write { .. }, - )); + // Before we're kicked out, assert we're seeing the write requests + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::Write { .. },) { + panic!("got unexpected message {m:?}"); + } harness.ds2.ack_write().await; harness.ds3.ack_write().await; @@ -1474,6 +1528,17 @@ async fn test_byte_fault_condition() { assert_eq!(ds[ClientId::new(0)], DsState::Active); assert_eq!(ds[ClientId::new(1)], DsState::Active); assert_eq!(ds[ClientId::new(2)], DsState::Active); + + // Once one of the Downstairs has cached more that a certain amount of + // bytes, it will automatically send a Barrier operation. + if i == barrier_point { + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::Barrier { .. }) { + panic!("expected Barrier, got message {m:?}"); + } + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } } // Sleep until we're confident that the Downstairs is kicked out @@ -1605,6 +1670,13 @@ async fn test_byte_fault_condition_offline() { assert_eq!(ds[ClientId::new(1)], DsState::Active); assert_eq!(ds[ClientId::new(2)], DsState::Active); } + + // If we've sent IO_CACHED_MAX_BYTES, then we expect the Upstairs to + // insert a Barrier (because it's trying to clean out finished jobs). + if IO_CACHED_MAX_BYTES as usize / WRITE_SIZE == i { + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } } // Confirm that the system comes up after live-repair @@ -1760,7 +1832,7 @@ async fn test_job_fault_condition() { // timeout, so that when timeout hits, the downstairs will become Faulted // instead of Offline. let num_jobs = IO_OUTSTANDING_MAX_JOBS + 200; - for _ in 0..num_jobs { + for i in 0..num_jobs { // We must `spawn` here because `write` will wait for the response to // come back before returning let h = harness.spawn(|guest| async move { @@ -1769,10 +1841,10 @@ async fn test_job_fault_condition() { }); // DS1 should be receiving messages - assert!(matches!( - harness.ds1().recv().await.unwrap(), - Message::ReadRequest { .. }, - )); + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::ReadRequest { .. },) { + panic!("got unexpected message {m:?}"); + } // Respond with read responses for downstairs 2 and 3 harness.ds2.ack_read().await; @@ -1786,6 +1858,18 @@ async fn test_job_fault_condition() { assert_eq!(ds[ClientId::new(0)], DsState::Active); assert_eq!(ds[ClientId::new(1)], DsState::Active); assert_eq!(ds[ClientId::new(2)], DsState::Active); + + // When we hit IO_CACHED_MAX_JOBS, the Upstairs will attempt to send a + // barrier to retire jobs. The barrier won't succeed, because we're not + // acking it on DS1. 
+ if i + 1 == IO_CACHED_MAX_JOBS as usize { + let m = harness.ds1().recv().await.unwrap(); + if !matches!(m, Message::Barrier { .. }) { + panic!("expected Barrier, got message {m:?}"); + } + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } } // Sleep until we're confident that the Downstairs is kicked out @@ -1888,6 +1972,7 @@ async fn test_job_fault_condition_offline() { // transition it to `Faulted` by sending it enough to hit // `IO_OUTSTANDING_MAX_JOBS` info!(harness.log, "sending more jobs to fault DS1"); + let mut barrier_count = 0; for i in num_jobs..IO_OUTSTANDING_MAX_JOBS + 200 { let h = harness.spawn(|guest| async move { let mut buffer = Buffer::new(1, 512); @@ -1910,8 +1995,17 @@ async fn test_job_fault_condition_offline() { // the Upstairs has finished updating its state). h.await.unwrap(); + // If we've sent IO_CACHED_MAX_JOBS, then we expect the Upstairs to + // insert a Barrier (because it's trying to clean out finished jobs). + if i + 1 == IO_CACHED_MAX_JOBS as usize { + // nothing arrives on DS1, because it's offline + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + barrier_count += 1; + } + let ds = harness.guest.downstairs_state().await.unwrap(); - if i < IO_OUTSTANDING_MAX_JOBS { + if i + barrier_count < IO_OUTSTANDING_MAX_JOBS { // At this point, we should still be offline assert_eq!(ds[ClientId::new(0)], DsState::Offline); assert_eq!(ds[ClientId::new(1)], DsState::Active); @@ -2716,3 +2810,83 @@ async fn test_no_send_offline() { e => panic!("invalid message {e:?}; expected Write"), } } + +/// Test that barrier operations are sent periodically +#[tokio::test] +async fn test_jobs_based_barrier() { + let mut harness = TestHarness::new_with_opts(TestOpts { + flush_timeout: 0.5, + read_only: false, + disable_backpressure: false, + }) + .await; + + for i in 1..IO_CACHED_MAX_JOBS * 3 { + // Send a write, which will succeed + let write_handle = harness.spawn(|guest| async move { + let mut data = BytesMut::new(); + data.resize(512, 1u8); + guest.write(BlockIndex(0), data).await.unwrap(); + }); + + // Ensure that all three clients got the write request + harness.ds1().ack_write().await; + harness.ds2.ack_write().await; + harness.ds3.ack_write().await; + + write_handle.await.unwrap(); + + if i % IO_CACHED_MAX_JOBS == 0 { + harness.ds1().ack_barrier().await; + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } + } + + // We should also automatically send a flush here + harness.ds1().ack_flush().await; + harness.ds2.ack_flush().await; + harness.ds3.ack_flush().await; +} + +/// Test that barrier operations are sent periodically +#[tokio::test] +async fn test_bytes_based_barrier() { + let mut harness = TestHarness::new_with_opts(TestOpts { + flush_timeout: 0.5, + read_only: false, + disable_backpressure: false, + }) + .await; + + const WRITE_SIZE: usize = 105 * 1024; // 105 KiB + let write_buf = BytesMut::from(vec![1; WRITE_SIZE].as_slice()); // 50 KiB + let barrier_point = + (IO_CACHED_MAX_BYTES as usize).div_ceil(write_buf.len()); + + for i in 1..barrier_point * 3 + 10 { + // Send a write, which will succeed + let write_buf = write_buf.clone(); + let write_handle = harness.spawn(|guest| async move { + guest.write(BlockIndex(0), write_buf).await.unwrap(); + }); + + // Ensure that all three clients got the write request + harness.ds1().ack_write().await; + harness.ds2.ack_write().await; + harness.ds3.ack_write().await; + + write_handle.await.unwrap(); + + if i % barrier_point == 0 { + 
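            // Editorial note (not part of the diff): every `barrier_point`
            // writes the cached-byte threshold (IO_CACHED_MAX_BYTES) is
            // crossed again, so the Upstairs should insert another Barrier;
            // all three Downstairs are healthy here, so all three ack it.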
harness.ds1().ack_barrier().await; + harness.ds2.ack_barrier().await; + harness.ds3.ack_barrier().await; + } + } + + // We should also automatically send a flush here + harness.ds1().ack_flush().await; + harness.ds2.ack_flush().await; + harness.ds3.ack_flush().await; +} diff --git a/upstairs/src/guest.rs b/upstairs/src/guest.rs index 077f4c676..da14210fa 100644 --- a/upstairs/src/guest.rs +++ b/upstairs/src/guest.rs @@ -2,13 +2,10 @@ use std::{ net::SocketAddr, sync::atomic::{AtomicU64, Ordering}, - time::Duration, }; use crate::{ - backpressure::{ - BackpressureAmount, BackpressureConfig, SharedBackpressureAmount, - }, + io_limits::{IOLimitView, IOLimits}, BlockIO, BlockOp, BlockOpWaiter, BlockRes, Buffer, ReadBlockContext, ReplaceResult, UpstairsAction, }; @@ -18,8 +15,8 @@ use crucible_protocol::SnapshotDetails; use async_trait::async_trait; use bytes::BytesMut; -use slog::{error, info, warn, Logger}; -use tokio::sync::{mpsc, Mutex}; +use slog::{info, warn, Logger}; +use tokio::sync::mpsc; use tracing::{instrument, span, Level}; use uuid::Uuid; @@ -110,19 +107,8 @@ pub struct Guest { /// it can be read from a `&self` reference. block_size: AtomicU64, - /// Backpressure is implemented as a delay on host write operations - /// - /// It is stored in an `Arc` so that the `GuestIoHandle` can update it from - /// the IO task. - backpressure: SharedBackpressureAmount, - - /// Lock held during backpressure delay - /// - /// Without this lock, multiple tasks could submit jobs to the upstairs and - /// wait in parallel, which defeats the purpose of backpressure (since you - /// could send arbitrarily many jobs at high speed by sending them from - /// different tasks). - backpressure_lock: Mutex<()>, + /// View into global IO limits + io_limits: IOLimitView, /// Logger for the guest log: Logger, @@ -150,21 +136,31 @@ impl Guest { // time spent waiting for the queue versus time spent in Upstairs code). let (req_tx, req_rx) = mpsc::channel(500); - let backpressure = SharedBackpressureAmount::new(); + // We have to set limits above `IO_OUTSTANDING_MAX_JOBS/BYTES`: + // an `Offline` downstairs must hit that threshold to transition to + // `Faulted`, so we can't be IO-limited before that point. + let io_limits = IOLimits::new( + crate::IO_OUTSTANDING_MAX_JOBS * 3 / 2, + crate::IO_OUTSTANDING_MAX_BYTES as usize * 3 / 2, + ); + let io_limits_view = io_limits.view(); + let io = GuestIoHandle { req_rx, - backpressure: backpressure.clone(), - backpressure_config: BackpressureConfig::default(), + io_limits, + + #[cfg(test)] + disable_backpressure: false, + log: log.clone(), }; let guest = Guest { req_tx, block_size: AtomicU64::new(0), + io_limits: io_limits_view, - backpressure, - backpressure_lock: Mutex::new(()), log, }; (guest, io) @@ -198,28 +194,6 @@ impl Guest { rx.wait().await } - /// Sleeps for a backpressure-dependent amount, holding the lock - /// - /// If backpressure is saturated, logs and returns an error. 
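// Editorial note (not part of the diff): the removed `backpressure_sleep`
// below is replaced by claiming an `IOLimitGuard` before each submission, as
// the Write/WriteUnwritten/Flush paths later in this file now do:
//
//     let io_guard =
//         self.io_limits.claim(buf.len() as u32).await.map_err(|e| {
//             CrucibleError::IoError(format!(
//                 "could not get IO guard for Write: {e:?}"
//             ))
//         })?;
//
// The permits are held by the guard and released when it is dropped, so a
// slow Downstairs throttles the guest by exhausting permits rather than
// through an explicit per-write delay.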
- async fn backpressure_sleep(&self) -> Result<(), CrucibleError> { - let bp = self.backpressure.load(); - match bp { - BackpressureAmount::Saturated => { - let err = "write queue is saturated"; - error!(self.log, "{err}"); - Err(CrucibleError::IoError(err.to_owned())) - } - BackpressureAmount::Duration(d) => { - if d > Duration::ZERO { - let _guard = self.backpressure_lock.lock().await; - tokio::time::sleep(d).await; - drop(_guard); - } - Ok(()) - } - } - } - #[cfg(test)] pub async fn downstairs_state( &self, @@ -353,11 +327,20 @@ impl BlockIO for Guest { assert_eq!(chunk.len() as u64 % bs, 0); let offset_change = chunk.len() as u64 / bs; + let io_guard = + self.io_limits.claim(chunk.len() as u32).await.map_err( + |e| { + CrucibleError::IoError(format!( + "could not get IO guard for Read: {e:?}" + )) + }, + )?; let (rx, done) = BlockOpWaiter::pair(); let rio = BlockOp::Read { offset, data: chunk, done, + io_guard, }; // Our return value always includes the buffer, so we can splice it @@ -405,9 +388,8 @@ impl BlockIO for Guest { } // We split writes into chunks to bound the maximum (typical) latency of - // any single `BlockOp::Write`. Otherwise, the host could send writes - // which are large enough that our maximum backpressure delay wouldn't - // compensate for them. + // any single `BlockOp::Write`. This makes the system's performance + // characteristics easier to think about. const MDTS: usize = 1024 * 1024; // 1 MiB while !data.is_empty() { @@ -415,13 +397,19 @@ impl BlockIO for Guest { assert_eq!(buf.len() as u64 % bs, 0); let offset_change = buf.len() as u64 / bs; - self.backpressure_sleep().await?; + let io_guard = + self.io_limits.claim(buf.len() as u32).await.map_err(|e| { + CrucibleError::IoError(format!( + "could not get IO guard for Write: {e:?}" + )) + })?; let reply = self .send_and_wait(|done| BlockOp::Write { offset, data: buf, done, + io_guard, }) .await; reply?; @@ -442,11 +430,17 @@ impl BlockIO for Guest { return Ok(()); } - self.backpressure_sleep().await?; + let io_guard = + self.io_limits.claim(data.len() as u32).await.map_err(|e| { + CrucibleError::IoError(format!( + "could not get IO guard for WriteUnwritten: {e:?}" + )) + })?; self.send_and_wait(|done| BlockOp::WriteUnwritten { offset, data, done, + io_guard, }) .await } @@ -455,9 +449,15 @@ impl BlockIO for Guest { &self, snapshot_details: Option, ) -> Result<(), CrucibleError> { + let io_guard = self.io_limits.claim(0).await.map_err(|e| { + CrucibleError::IoError(format!( + "could not get IO guard for flush: {e:?}" + )) + })?; self.send_and_wait(|done| BlockOp::Flush { snapshot_details, done, + io_guard, }) .await } @@ -514,14 +514,15 @@ pub struct GuestIoHandle { /// Queue to receive new blockreqs req_rx: mpsc::Receiver, - /// Current backpressure (shared with the `Guest`) - backpressure: SharedBackpressureAmount, - - /// Backpressure configuration, as a starting point and max delay - backpressure_config: BackpressureConfig, + /// IO limiting (shared with the `Guest`) + io_limits: IOLimits, /// Log handle, mainly to pass it into the [`Upstairs`] pub log: Logger, + + /// Flag to disable backpressure during unit tests + #[cfg(test)] + disable_backpressure: bool, } impl GuestIoHandle { @@ -536,29 +537,17 @@ impl GuestIoHandle { } #[cfg(test)] - pub fn disable_queue_backpressure(&mut self) { - self.backpressure_config.queue.delay_scale = Duration::ZERO; + pub fn disable_backpressure(&mut self) { + self.disable_backpressure = true; } #[cfg(test)] - pub fn disable_byte_backpressure(&mut self) { - 
self.backpressure_config.bytes.delay_scale = Duration::ZERO; - } - - #[cfg(test)] - pub fn is_queue_backpressure_disabled(&self) -> bool { - self.backpressure_config.queue.delay_scale == Duration::ZERO - } - - /// Set `self.backpressure` based on outstanding IO ratio - pub fn set_backpressure(&self, bytes: u64, jobs: u64) { - let bp = self.backpressure_config.get_backpressure(bytes, jobs); - self.backpressure.store(bp); + pub fn is_backpressure_disabled(&self) -> bool { + self.disable_backpressure } - /// Looks up current backpressure - pub fn get_backpressure(&self) -> BackpressureAmount { - self.backpressure.load() + pub(crate) fn io_limits(&self) -> &IOLimits { + &self.io_limits } } diff --git a/upstairs/src/io_limits.rs b/upstairs/src/io_limits.rs new file mode 100644 index 000000000..14958f2a2 --- /dev/null +++ b/upstairs/src/io_limits.rs @@ -0,0 +1,158 @@ +// Copyright 2024 Oxide Computer Company +use crate::{ClientData, ClientId, ClientMap}; +use std::sync::Arc; +use tokio::sync::{ + AcquireError, OwnedSemaphorePermit, Semaphore, TryAcquireError, +}; + +/// Per-client IO limits +#[derive(Clone, Debug)] +pub struct ClientIOLimits { + /// Semaphore to claim IO bytes on behalf of a job + io_bytes: Arc, + + /// Semaphore to claim individual IO jobs + jobs: Arc, +} + +impl ClientIOLimits { + /// Builds a new `ClientIOLimits` object with the given limits + pub fn new(max_jobs: usize, max_io_bytes: usize) -> Self { + ClientIOLimits { + io_bytes: Semaphore::new(max_io_bytes).into(), + jobs: Semaphore::new(max_jobs).into(), + } + } + + /// Claims a certain number of bytes (and one job) + /// + /// This function waits until the given resources are available; as such, it + /// should not be called from the same task which is processing jobs (since + /// that could create a deadlock). + async fn claim( + &self, + bytes: u32, + ) -> Result { + let io_bytes = self.io_bytes.clone().acquire_many_owned(bytes).await?; + let jobs = self.jobs.clone().acquire_owned().await?; + Ok(ClientIOLimitGuard { io_bytes, jobs }) + } + + /// Tries to claim a certain number of bytes (and one job) + pub fn try_claim( + &self, + bytes: u32, + ) -> Result { + let io_bytes = self.io_bytes.clone().try_acquire_many_owned(bytes)?; + let jobs = self.jobs.clone().try_acquire_owned()?; + Ok(ClientIOLimitGuard { io_bytes, jobs }) + } +} + +/// Read-write handle for IO limits across all 3x clients +#[derive(Clone, Debug)] +pub struct IOLimits(ClientData); + +impl IOLimits { + /// Builds a new set of IO limits + pub fn new(max_jobs: usize, max_io_bytes: usize) -> Self { + Self(ClientData::from_fn(|_i| { + ClientIOLimits::new(max_jobs, max_io_bytes) + })) + } + + /// Returns a per-client IO limit handle + /// + /// This handle shares permits with the parent + pub fn client_limits(&self, i: ClientId) -> ClientIOLimits { + self.0[i].clone() + } + + /// Returns a view handle for the IO limits + pub fn view(&self) -> IOLimitView { + IOLimitView(self.clone()) + } + + /// Try to claim some number of bytes (and one job) + /// + /// Returns `Err((ClientId, e))` if any of the claims fail + pub fn try_claim( + &self, + bytes: u32, + ) -> Result { + let mut out = ClientData::from_fn(|_| None); + for i in ClientId::iter() { + out[i] = Some(self.0[i].try_claim(bytes).map_err(|e| (i, e))?); + } + + Ok(IOLimitGuard(out.map(Option::unwrap))) + } +} + +/// View into global IO limits +/// +/// This is equivalent to an [`IOLimits`], but exposes a different API. 
It +/// should be owned by a separate task, to avoid deadlocks when trying to claim +/// resources. +#[derive(Clone, Debug)] +pub struct IOLimitView(IOLimits); + +impl IOLimitView { + /// Claim some number of bytes (and one job) + /// + /// This function waits until the given resources are available; as such, it + /// should not be called from the same task which is processing jobs (since + /// that could create a deadlock). + pub async fn claim( + &self, + bytes: u32, + ) -> Result { + let mut out = ClientData::from_fn(|_| None); + let lim = &self.0; + for i in ClientId::iter() { + out[i] = Some(lim.0[i].claim(bytes).await?); + } + Ok(IOLimitGuard(out.map(Option::unwrap))) + } +} + +//////////////////////////////////////////////////////////////////////////////// + +/// Handle owning some amount of per-client IO +/// +/// The IO permits are released when this handle is dropped +#[derive(Debug)] +pub struct ClientIOLimitGuard { + #[expect(unused)] + io_bytes: OwnedSemaphorePermit, + #[expect(unused)] + jobs: OwnedSemaphorePermit, +} + +impl ClientIOLimitGuard { + #[cfg(test)] + pub fn dummy() -> Self { + let a = Arc::new(Semaphore::new(1)); + let b = Arc::new(Semaphore::new(1)); + let io_bytes = a.try_acquire_owned().unwrap(); + let jobs = b.try_acquire_owned().unwrap(); + ClientIOLimitGuard { io_bytes, jobs } + } +} + +/// Handle which stores IO limit guards for all 3x clients +#[derive(Debug)] +pub struct IOLimitGuard(ClientData); + +impl From for ClientMap { + fn from(i: IOLimitGuard) -> Self { + i.0.into() + } +} + +impl IOLimitGuard { + #[cfg(test)] + pub fn dummy() -> Self { + Self(ClientData::from_fn(|_i| ClientIOLimitGuard::dummy())) + } +} diff --git a/upstairs/src/lib.rs b/upstairs/src/lib.rs index 02df2783d..4e798cfce 100644 --- a/upstairs/src/lib.rs +++ b/upstairs/src/lib.rs @@ -48,9 +48,6 @@ pub use in_memory::InMemoryBlockIO; pub mod block_io; pub use block_io::{FileBlockIO, ReqwestBlockIO}; -pub(crate) mod backpressure; -use backpressure::BackpressureGuard; - pub mod block_req; pub(crate) use block_req::{BlockOpWaiter, BlockRes}; @@ -86,17 +83,32 @@ mod downstairs; mod upstairs; use upstairs::{UpCounters, UpstairsAction}; +mod io_limits; +use io_limits::IOLimitGuard; + /// Max number of write bytes between the upstairs and an offline downstairs /// -/// If we exceed this value, the upstairs will give -/// up and mark the offline downstairs as faulted. -const IO_OUTSTANDING_MAX_BYTES: u64 = 1024 * 1024 * 1024; // 1 GiB +/// If we exceed this value, the upstairs will give up and mark the offline +/// downstairs as faulted. +const IO_OUTSTANDING_MAX_BYTES: u64 = 50 * 1024 * 1024; // 50 MiB /// Max number of outstanding IOs between the upstairs and an offline downstairs /// /// If we exceed this value, the upstairs will give up and mark that offline /// downstairs as faulted. -pub const IO_OUTSTANDING_MAX_JOBS: usize = 10000; +pub const IO_OUTSTANDING_MAX_JOBS: usize = 1000; + +/// Maximum of bytes to cache from complete (but un-flushed) IO +/// +/// Caching complete jobs allows us to replay them if a Downstairs goes offline +/// them comes back. +const IO_CACHED_MAX_BYTES: u64 = 1024 * 1024 * 1024; // 1 GiB + +/// Maximum of jobs to cache from complete (but un-flushed) IO +/// +/// Caching complete jobs allows us to replay them if a Downstairs goes offline +/// them comes back. 
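// Editorial note (not part of the diff): these thresholds interact with the
// guest-side IO limits set up in guest.rs, which are sized at 3/2 of the
// IO_OUTSTANDING_MAX_* values -- i.e. 1500 jobs and 75 MiB with the numbers
// in this change -- so an Offline downstairs can still accumulate enough
// outstanding work to cross IO_OUTSTANDING_MAX_BYTES/JOBS and be marked
// Faulted before the guest itself is blocked by the limiter.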
+const IO_CACHED_MAX_JOBS: u64 = 10000; /// The BlockIO trait behaves like a physical NVMe disk (or a virtio virtual /// disk): there is no contract about what order operations that are submitted @@ -436,6 +448,11 @@ impl ClientData { ]) } + /// Builds a new `ClientData` by applying a function to each item + pub fn map U>(self, f: F) -> ClientData { + ClientData(self.0.map(f)) + } + #[cfg(test)] pub fn get(&self) -> &[T; 3] { &self.0 @@ -914,11 +931,7 @@ struct DownstairsIO { /// consistency checking with subsequent replies. data: Option, - /// Handle for this job's contribution to guest backpressure - /// - /// Each of these guard handles will automatically decrement the - /// backpressure count for their respective Downstairs when dropped. - backpressure_guard: ClientMap, + io_limits: ClientMap, } impl DownstairsIO { @@ -1275,6 +1288,11 @@ impl IOop { // the downstairs to act based on that. true } + IOop::Barrier { .. } => { + // The Barrier IOop doesn't actually touch any extents; it's + // purely for dependency management. + true + } _ => { panic!("Unsupported IO check {:?}", self); } @@ -1298,16 +1316,6 @@ impl IOop { _ => 0, } } - - /// Returns the number of bytes written - fn write_bytes(&self) -> u64 { - match &self { - IOop::Write { data, .. } | IOop::WriteUnwritten { data, .. } => { - data.len() as u64 - } - _ => 0, - } - } } #[allow(clippy::derive_partial_eq_without_eq)] @@ -1485,20 +1493,24 @@ pub(crate) enum BlockOp { offset: BlockIndex, data: Buffer, done: BlockRes, + io_guard: IOLimitGuard, }, Write { offset: BlockIndex, data: BytesMut, done: BlockRes, + io_guard: IOLimitGuard, }, WriteUnwritten { offset: BlockIndex, data: BytesMut, done: BlockRes, + io_guard: IOLimitGuard, }, Flush { snapshot_details: Option, done: BlockRes, + io_guard: IOLimitGuard, }, GoActive { done: BlockRes, @@ -1567,8 +1579,6 @@ pub struct Arg { pub up_counters: UpCounters, /// Next JobID pub next_job_id: JobId, - /// Backpressure value - pub up_backpressure: u64, /// Jobs on the downstairs work queue. 
pub ds_count: u32, /// Number of write bytes in flight @@ -1647,7 +1657,7 @@ pub fn up_main( }; #[cfg(test)] - let disable_backpressure = guest.is_queue_backpressure_disabled(); + let disable_backpressure = guest.is_backpressure_disabled(); /* * Build the Upstairs struct that we use to share data between @@ -1658,7 +1668,7 @@ pub fn up_main( #[cfg(test)] if disable_backpressure { - up.disable_client_backpressure(); + up.downstairs.disable_client_backpressure(); } if let Some(pr) = producer_registry { diff --git a/upstairs/src/upstairs.rs b/upstairs/src/upstairs.rs index 884143e5d..58ce09638 100644 --- a/upstairs/src/upstairs.rs +++ b/upstairs/src/upstairs.rs @@ -10,6 +10,7 @@ use crate::{ }, downstairs::{Downstairs, DownstairsAction}, extent_from_offset, + io_limits::IOLimitGuard, stats::UpStatOuter, BlockOp, BlockRes, Buffer, ClientId, ClientMap, CrucibleOpts, DsState, EncryptionContext, GuestIoHandle, Message, RegionDefinition, @@ -169,10 +170,9 @@ impl UpCounters { /// /// For example, we _always_ do things like /// - Send all pending IO to the client work tasks -/// - Ack all ackable jobs to the guest /// - Step through the live-repair state machine (if it's running) /// - Check for client-side deactivation (if it's pending) -/// - Set backpressure time in the guest +/// - Set backpressure time in the clients /// /// Keeping the `Upstairs` "clean" through this invariant maintenance makes it /// easier to think about its state, because it's guaranteed to be clean when we @@ -206,12 +206,11 @@ pub(crate) struct Upstairs { /// Marks whether a flush is needed /// - /// The Upstairs keeps all IOs in memory until a flush is ACK'd back from - /// all three downstairs. If there are IOs we have accepted into the work - /// queue that don't end with a flush, then we set this to indicate that the - /// upstairs may need to issue a flush of its own to be sure that data is - /// pushed to disk. Note that this is not an indication of an ACK'd flush, - /// just that the last IO command we put on the work queue was not a flush. + /// If there are IOs we have accepted into the work queue that don't end + /// with a flush, then we set this to indicate that the upstairs may need to + /// issue a flush of its own to be sure that data is pushed to disk. Note + /// that this is not an indication of an ACK'd flush, just that the last IO + /// command we put on the work queue was not a flush. need_flush: bool, /// Statistics for this upstairs @@ -390,6 +389,7 @@ impl Upstairs { ds_target, tls_context, stats.ds_stats(), + guest.io_limits(), log.new(o!("" => "downstairs")), ); let flush_timeout_secs = opt.flush_timeout.unwrap_or(0.5); @@ -424,11 +424,6 @@ impl Upstairs { } } - #[cfg(test)] - pub(crate) fn disable_client_backpressure(&mut self) { - self.downstairs.disable_client_backpressure(); - } - /// Build an Upstairs for simple tests #[cfg(test)] pub fn test_default(ddef: Option) -> Self { @@ -531,6 +526,10 @@ impl Upstairs { /// Apply an action returned from [`Upstairs::select`] pub(crate) fn apply(&mut self, action: UpstairsAction) { + // Check whether the downstairs has live jobs before performing the + // action, because the action may cause it to retire live jobs. 
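        // Editorial sketch of the resulting `apply` flow (paraphrasing the
        // code in this hunk, not part of the diff):
        //
        //     let has_jobs = self.downstairs.has_live_jobs();  // sample first
        //     /* ...handle the action... */
        //     self.gone_too_long();
        //     if self.downstairs.needs_barrier() {
        //         self.submit_barrier() // retire complete-but-unflushed jobs
        //     }
        //     self.downstairs.check_and_continue_live_repair(&self.state);
        //     /* ...deactivation / invariant checks... */
        //     self.downstairs.set_client_backpressure();
        //     if has_jobs {
        //         // reset last, so slow handling above (e.g. in debug
        //         // builds) doesn't leave the flush deadline already expired
        //         self.flush_deadline = Instant::now() + self.flush_interval;
        //     }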
+ let has_jobs = self.downstairs.has_live_jobs(); + match action { UpstairsAction::Downstairs(d) => { self.counters.action_downstairs += 1; @@ -567,7 +566,8 @@ impl Upstairs { .counters .action_flush_check)); if self.need_flush { - self.submit_flush(None, None); + let io_guard = self.try_acquire_io(0); + self.submit_flush(None, None, io_guard); } self.flush_deadline = Instant::now() + self.flush_interval; } @@ -603,17 +603,15 @@ impl Upstairs { // because too many jobs have piled up. self.gone_too_long(); + // Check whether we need to send a Barrier operation to clean out + // complete-but-unflushed jobs. + if self.downstairs.needs_barrier() { + self.submit_barrier() + } + // Check to see whether live-repair can continue - // - // This must be called before acking jobs, because it looks in - // `Downstairs::ackable_jobs` to see which jobs are done. self.downstairs.check_and_continue_live_repair(&self.state); - // Handle any jobs that have become ready for acks - if self.downstairs.has_ackable_jobs() { - self.downstairs.ack_jobs() - } - // Check for client-side deactivation if matches!(&self.state, UpstairsState::Deactivating(..)) { info!(self.log, "checking for deactivation"); @@ -652,7 +650,41 @@ impl Upstairs { // For now, check backpressure after every event. We may want to make // this more nuanced in the future. - self.set_backpressure(); + self.downstairs.set_client_backpressure(); + + // We do this last because some of the code above can be slow + // (especially during debug builds), and we don't want to set our flush + // deadline such that it fires immediately. + if has_jobs { + self.flush_deadline = Instant::now() + self.flush_interval; + } + } + + /// Attempts to acquire permits to perform an IO job with the given bytes + /// + /// Upon failure, logs an error and returns `None`. + /// + /// This function is used by messages generated internally to the Upstairs + /// for best-effort IO limiting. If the message would exceed our available + /// permits, it's still allowed (because to do otherwise would deadlock the + /// upstairs task). In other words, internally generated messages can limit + /// guest IO work, but not the other way around + fn try_acquire_io(&self, bytes: usize) -> Option { + let Ok(bytes) = u32::try_from(bytes) else { + warn!(self.log, "too many bytes for try_acquire_io"); + return None; + }; + match self.guest.io_limits().try_claim(bytes) { + Ok(v) => Some(v), + Err((i, e)) => { + warn!( + self.log, + "could not apply IO limits to upstairs work: \ + client {i} returned {e:?}" + ); + None + } + } } /// Helper function to await all deferred block requests @@ -710,7 +742,6 @@ impl Upstairs { up_count: self.downstairs.gw_active.len() as u32, up_counters: self.counters, next_job_id: self.downstairs.peek_next_id(), - up_backpressure: self.guest.get_backpressure().as_micros(), write_bytes_out: self.downstairs.write_bytes_outstanding(), ds_count: self.downstairs.active_count() as u32, ds_state: self.downstairs.collect_stats(|c| c.state()), @@ -903,11 +934,21 @@ impl Upstairs { match op { // All Write operations are deferred, because they will offload // encryption to a separate thread pool. 
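            // Editorial note (not part of the diff): each of these guest ops
            // now arrives carrying the `IOLimitGuard` claimed in guest.rs.
            // The guard travels with the deferred write (see `DeferredWrite`
            // below) and presumably ends up as the per-client guards on
            // `DownstairsIO::io_limits` via the `From<IOLimitGuard>` impl in
            // io_limits.rs, so the job/byte permits stay claimed until the
            // job is dropped -- replacing the old `early_write_backpressure`
            // guard.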
- BlockOp::Write { offset, data, done } => { - self.submit_deferred_write(offset, data, done, false); + BlockOp::Write { + offset, + data, + done, + io_guard, + } => { + self.submit_deferred_write(offset, data, done, false, io_guard); } - BlockOp::WriteUnwritten { offset, data, done } => { - self.submit_deferred_write(offset, data, done, true); + BlockOp::WriteUnwritten { + offset, + data, + done, + io_guard, + } => { + self.submit_deferred_write(offset, data, done, true, io_guard); } // If we have any deferred requests in the FuturesOrdered, then we // have to keep using it for subsequent requests (even ones that are @@ -1074,15 +1115,19 @@ impl Upstairs { done.send_ok(self.show_all_work()); } - BlockOp::Read { offset, data, done } => { - self.submit_read(offset, data, done) - } + BlockOp::Read { + offset, + data, + done, + io_guard, + } => self.submit_read(offset, data, done, io_guard), BlockOp::Write { .. } | BlockOp::WriteUnwritten { .. } => { panic!("writes must always be deferred") } BlockOp::Flush { snapshot_details, done, + io_guard, } => { /* * Submit for read and write both check if the upstairs is @@ -1095,7 +1140,7 @@ impl Upstairs { done.send_err(CrucibleError::UpstairsInactive); return; } - self.submit_flush(Some(done), snapshot_details); + self.submit_flush(Some(done), snapshot_details, Some(io_guard)); } BlockOp::ReplaceDownstairs { id, old, new, done } => { let r = self.downstairs.replace(id, old, new, &self.state); @@ -1235,7 +1280,8 @@ impl Upstairs { } if !self.downstairs.can_deactivate_immediately() { debug!(self.log, "not ready to deactivate; submitting final flush"); - self.submit_flush(None, None); + let io_guard = self.try_acquire_io(0); + self.submit_flush(None, None, io_guard); } else { debug!(self.log, "ready to deactivate right away"); // Deactivation is handled in the invariant-checking portion of @@ -1249,6 +1295,7 @@ impl Upstairs { &mut self, res: Option, snapshot_details: Option, + io_guard: Option, ) { // Notice that unlike submit_read and submit_write, we do not check for // guest_io_ready here. The upstairs itself can call submit_flush @@ -1266,18 +1313,18 @@ impl Upstairs { if snapshot_details.is_some() { info!(self.log, "flush with snap requested"); } - let ds_id = self.downstairs.submit_flush(snapshot_details, res); + let ds_id = + self.downstairs + .submit_flush(snapshot_details, res, io_guard); cdt::up__to__ds__flush__start!(|| (ds_id.0)); } - #[allow(dead_code)] // XXX this will be used soon! fn submit_barrier(&mut self) { // Notice that unlike submit_read and submit_write, we do not check for // guest_io_ready here. The upstairs itself calls submit_barrier // without the guest being involved; indeed the guest is not allowed to // call it! - let ds_id = self.downstairs.submit_barrier(); cdt::up__to__ds__barrier__start!(|| (ds_id.0)); @@ -1291,8 +1338,7 @@ impl Upstairs { offset: BlockIndex, data: Buffer, ) { - let br = BlockRes::dummy(); - self.submit_read(offset, data, br) + self.submit_read(offset, data, BlockRes::dummy(), IOLimitGuard::dummy()) } /// Submit a read job to the downstairs @@ -1301,6 +1347,7 @@ impl Upstairs { offset: BlockIndex, data: Buffer, res: BlockRes, + io_guard: IOLimitGuard, ) { if !self.guest_io_ready() { res.send_err((data, CrucibleError::UpstairsInactive)); @@ -1339,7 +1386,9 @@ impl Upstairs { * Grab this ID after extent_from_offset: in case of Err we don't * want to create a gap in the IDs. 
*/ - let ds_id = self.downstairs.submit_read(impacted_blocks, data, res); + let ds_id = + self.downstairs + .submit_read(impacted_blocks, data, res, io_guard); cdt::up__to__ds__read__start!(|| (ds_id.0)); } @@ -1359,6 +1408,7 @@ impl Upstairs { data, BlockRes::dummy(), is_write_unwritten, + IOLimitGuard::dummy(), ) { self.submit_write(DeferredWrite::run(w)) } @@ -1375,14 +1425,19 @@ impl Upstairs { data: BytesMut, res: BlockRes, is_write_unwritten: bool, + io_guard: IOLimitGuard, ) { // It's possible for the write to be invalid out of the gate, in which // case `compute_deferred_write` replies to the `res` itself and returns // `None`. Otherwise, we have to store a future to process the write // result. - if let Some(w) = - self.compute_deferred_write(offset, data, res, is_write_unwritten) - { + if let Some(w) = self.compute_deferred_write( + offset, + data, + res, + is_write_unwritten, + io_guard, + ) { let should_defer = !self.deferred_ops.is_empty() || w.data.len() > MIN_DEFER_SIZE_BYTES as usize; if should_defer { @@ -1404,6 +1459,7 @@ impl Upstairs { data: BytesMut, res: BlockRes, is_write_unwritten: bool, + io_guard: IOLimitGuard, ) -> Option { if !self.guest_io_ready() { res.send_err(CrucibleError::UpstairsInactive); @@ -1431,8 +1487,6 @@ impl Upstairs { let impacted_blocks = extent_from_offset(&ddef, offset, ddef.bytes_to_blocks(data.len())); - let guard = self.downstairs.early_write_backpressure(data.len() as u64); - // Fast-ack, pretending to be done immediately operations res.send_ok(()); @@ -1442,7 +1496,7 @@ impl Upstairs { data, is_write_unwritten, cfg: self.cfg.clone(), - guard, + io_guard, }) } @@ -1462,7 +1516,7 @@ impl Upstairs { write.impacted_blocks, write.data, write.is_write_unwritten, - write.guard, + write.io_guard, ); if write.is_write_unwritten { @@ -1592,7 +1646,7 @@ impl Upstairs { // IO operation replies // - // This may cause jobs to become ackable! + // This may cause jobs to be acked! Message::WriteAck { .. } | Message::WriteUnwrittenAck { .. } | Message::FlushAck { .. 
} @@ -2010,16 +2064,6 @@ impl Upstairs { self.downstairs.reinitialize(client_id, &self.state); } - /// Sets both guest and per-client backpressure - fn set_backpressure(&self) { - self.guest.set_backpressure( - self.downstairs.write_bytes_outstanding(), - self.downstairs.jobs_outstanding(), - ); - - self.downstairs.set_client_backpressure(); - } - /// Returns the `RegionDefinition` /// /// # Panics @@ -2341,7 +2385,7 @@ pub(crate) mod test { ); // op 1 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // op 2 upstairs.submit_dummy_write( @@ -2386,7 +2430,7 @@ pub(crate) mod test { } // op 3 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // ops 4 to 6 for i in 3..6 { @@ -2717,7 +2761,7 @@ pub(crate) mod test { ); // op 1 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // op 2 upstairs.submit_dummy_read(BlockIndex(0), Buffer::new(2, 512)); @@ -2744,11 +2788,11 @@ pub(crate) mod test { let mut upstairs = make_upstairs(); upstairs.force_active().unwrap(); - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); let jobs = upstairs.downstairs.get_all_jobs(); assert_eq!(jobs.len(), 3); @@ -2778,7 +2822,7 @@ pub(crate) mod test { upstairs.force_active().unwrap(); // op 0 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // ops 1 to 2 for i in 0..2 { @@ -2790,7 +2834,7 @@ pub(crate) mod test { } // op 3 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // ops 4 to 6 for i in 0..3 { @@ -2802,7 +2846,7 @@ pub(crate) mod test { } // op 7 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); let jobs = upstairs.downstairs.get_all_jobs(); assert_eq!(jobs.len(), 8); @@ -3256,7 +3300,7 @@ pub(crate) mod test { upstairs.submit_dummy_read(BlockIndex(95), Buffer::new(2, 512)); // op 1 - upstairs.submit_flush(None, None); + upstairs.submit_flush(None, None, None); // op 2 upstairs.submit_dummy_write( @@ -3475,10 +3519,21 @@ pub(crate) mod test { let offset = BlockIndex(7); let data = BytesMut::from([1; 512].as_slice()); let (_write_res, done) = BlockOpWaiter::pair(); + let io_guard = IOLimitGuard::dummy(); let op = if is_write_unwritten { - BlockOp::WriteUnwritten { offset, data, done } + BlockOp::WriteUnwritten { + offset, + data, + done, + io_guard, + } } else { - BlockOp::Write { offset, data, done } + BlockOp::Write { + offset, + data, + done, + io_guard, + } }; up.apply(UpstairsAction::Guest(op)); up.await_deferred_ops().await; @@ -3591,7 +3646,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will successfully decrypt let mut data = Vec::from([1u8; 512]); @@ -3639,7 +3700,13 @@ pub(crate) mod test { let data = Buffer::new(blocks, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let mut data = 
Vec::from([1u8; 512]); @@ -3697,7 +3764,13 @@ pub(crate) mod test { let data = Buffer::new(blocks, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will fail decryption let mut data = Vec::from([1u8; 512]); @@ -3773,7 +3846,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will fail decryption let mut data = Vec::from([1u8; 512]); @@ -3836,7 +3915,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // fake read response from downstairs that will fail integrity hash // check @@ -3881,7 +3966,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let data = BytesMut::from([1u8; 512].as_slice()); let hash = integrity_hash(&[&data]); @@ -3942,7 +4033,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); for client_id in [ClientId::new(0), ClientId::new(1)] { let data = BytesMut::from([1u8; 512].as_slice()); @@ -4005,7 +4102,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let data = BytesMut::from([1u8; 512].as_slice()); let hash = integrity_hash(&[&data]); @@ -4065,7 +4168,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // The first read has no block contexts, because it was unwritten let data = BytesMut::from([0u8; 512].as_slice()); @@ -4121,7 +4230,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); // The first read 
has no block contexts, because it was unwritten let data = BytesMut::from([0u8; 512].as_slice()); @@ -4181,7 +4296,13 @@ pub(crate) mod test { data.extend_from_slice(vec![1; NODEFER_SIZE].as_slice()); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Write { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Write { + offset, + data, + done, + io_guard, + })); assert_eq!(up.deferred_ops.len(), 0); // Submit a long write, which should be deferred @@ -4189,7 +4310,13 @@ pub(crate) mod test { data.extend_from_slice(vec![2; DEFER_SIZE].as_slice()); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Write { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Write { + offset, + data, + done, + io_guard, + })); assert_eq!(up.deferred_ops.len(), 1); assert_eq!(up.deferred_msgs.len(), 0); @@ -4199,7 +4326,13 @@ pub(crate) mod test { data.extend_from_slice(vec![3; NODEFER_SIZE].as_slice()); let offset = BlockIndex(7); let (_res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Write { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Write { + offset, + data, + done, + io_guard, + })); assert_eq!(up.deferred_ops.len(), 2); assert_eq!(up.deferred_msgs.len(), 0); } @@ -4221,7 +4354,13 @@ pub(crate) mod test { let data = Buffer::new(1, 512); let offset = BlockIndex(7); let (res, done) = BlockOpWaiter::pair(); - up.apply(UpstairsAction::Guest(BlockOp::Read { offset, data, done })); + let io_guard = IOLimitGuard::dummy(); + up.apply(UpstairsAction::Guest(BlockOp::Read { + offset, + data, + done, + io_guard, + })); let reply = res.wait_raw().await.unwrap(); match reply {