From 6ad07c0dc6717c8d0b0bc64ff7f39292c87beaab Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 1 Mar 2024 15:48:53 +1100 Subject: [PATCH 1/3] zts: test for correct fsync() response to ZIL flush failure If fsync() (zil_commit()) writes successfully, but then the flush fails, fsync() should not return success, but instead should fall into a full transaction wait. Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc. Signed-off-by: Rob Norris --- tests/runfiles/linux.run | 4 + tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/include/blkdev.shlib | 9 +- tests/zfs-tests/tests/Makefile.am | 3 + .../tests/functional/flush/cleanup.ksh | 28 ++ .../tests/functional/flush/setup.ksh | 30 ++ .../functional/flush/zil_flush_error.ksh | 259 ++++++++++++++++++ 7 files changed, 331 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/flush/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/flush/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5817e649003c..4a3fcd2cbec8 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -124,6 +124,10 @@ tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'scrub_after_resilver', 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] +[tests/functional/flush:Linux] +tests = ['zil_flush_error'] +tags = ['functional', 'flush'] + [tests/functional/features/large_dnode:Linux] tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 1177e80e1a75..98e6c994f348 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -379,6 +379,7 @@ if os.environ.get('CI') == 'true': 'fault/auto_spare_ashift': ['SKIP', ci_reason], 'fault/auto_spare_shared': ['SKIP', ci_reason], 'fault/suspend_resume_single': ['SKIP', ci_reason], + 'flush/zil_flush_error': ['SKIP', ci_reason], 'procfs/pool_state': ['SKIP', ci_reason], }) diff --git a/tests/zfs-tests/include/blkdev.shlib b/tests/zfs-tests/include/blkdev.shlib index 51eff3023e73..bd8557c94b34 100644 --- a/tests/zfs-tests/include/blkdev.shlib +++ b/tests/zfs-tests/include/blkdev.shlib @@ -462,13 +462,16 @@ function unload_scsi_debug # Get scsi_debug device name. # Returns basename of scsi_debug device (for example "sdb"). # -function get_debug_device +# $1 (optional): Return the first $1 number of SCSI debug device names. +function get_debug_device #num { + typeset num=${1:-1} + for i in {1..10} ; do - val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3) + val=$(lsscsi | awk '/scsi_debug/ {print $6}' | cut -d/ -f3 | head -n$num) # lsscsi can take time to settle - if [ "$val" != "-" ] ; then + if [[ ! 
"$val" =~ "-" ]] ; then break fi sleep 1 diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbeabc6dfb42..34a5a0f0756f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1516,6 +1516,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ + functional/flush/cleanup.ksh \ + functional/flush/zil_flush_error.ksh \ + functional/flush/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/flush/cleanup.ksh b/tests/zfs-tests/tests/functional/flush/cleanup.ksh new file mode 100755 index 000000000000..4eb59574e4ec --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/cleanup.ksh @@ -0,0 +1,28 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2024, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/flush/setup.ksh b/tests/zfs-tests/tests/functional/flush/setup.ksh new file mode 100755 index 000000000000..94a3936ce2cd --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/setup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2024, Klara, Inc. +# + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +log_pass diff --git a/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh b/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh new file mode 100755 index 000000000000..e053c5d3bac6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/flush/zil_flush_error.ksh @@ -0,0 +1,259 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). 
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2024, Klara, Inc.
+#
+
+#
+# This tests that if the ZIL write sequence fails, it correctly falls back
+# and waits until the transaction has fully committed before returning.
+#
+# When this test was written, the ZIL had a flaw - it assumed that if its
+# writes succeeded, then the data was definitely on disk and available for
+# replay if the pool failed. It issues a flush immediately after the write,
+# but does not check its result. If a disk fails after the data has been
+# accepted into the disk cache, but before it can be written to permanent
+# storage, then fsync() will have returned success even though the data is
+# not stored in the ZIL for replay.
+#
+# If the main pool then fails before the transaction can be written, then data
+# is lost, and fsync() returning success was premature.
+#
+# To prove this, we create a pool with a separate log device. We inject two
+# faults:
+#
+# - ZIL writes appear to succeed, but never make it to disk
+# - ZIL flushes fail, and return an error
+#
+# We then remove the main pool device, and do a write+fsync. This goes to the
+# ZIL, and appears to succeed. When the txg closes, the write will fail, and
+# the pool suspends.
+#
+# Then, we simulate a reboot by copying the content of the pool devices aside.
+# We restore the pool devices, bring the pool back online, and export it - we
+# don't need it anymore, but we have to clean up properly. Then we restore the
+# copied content and import the pool, in whatever state it was in when it
+# suspended.
+#
+# Finally, we check the content of the file we wrote to. If it matches what we
+# wrote, then the fsync() was correct, and all is well. If it doesn't match,
+# then the flaw is present, and the test fails.
+#
+# We run the test twice: once without the log device injections, once with.
+# The first confirms the expected behaviour of the ZIL - when the pool is
+# imported, the log is replayed. The second fails as above. When the flaw is
+# corrected, both will succeed, and this overall test succeeds.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
+
+BACKUP_MAIN="$TMPDIR/backup_main"
+BACKUP_LOG="$TMPDIR/backup_log"
+
+LOOP_LOG="$TMPDIR/loop_log"
+
+DATA_FILE="$TMPDIR/data_file"
+
+verify_runnable "global"
+
+function cleanup
+{
+	zinject -c all
+	destroy_pool $TESTPOOL
+	unload_scsi_debug
+	rm -f $BACKUP_MAIN $BACKUP_LOG $DATA_FILE
+}
+
+log_onexit cleanup
+
+log_assert "verify fsync() waits if the ZIL commit fails"
+
+# create 128K of random data, and take its checksum. we do this up front to
+# ensure we don't get messed up by any latency from reading /dev/random or
+# checksumming the file on the pool
+log_must dd if=/dev/random of=$DATA_FILE bs=128K count=1
+typeset sum=$(sha256digest $DATA_FILE)
+
+# create a virtual scsi device with two device nodes. these are backed by the
+# same memory. we do this because we need to be able to take the device offline
+# properly in order to get the pool to suspend; fault injection on a loop
+# device can't do it. once offline, we can use the second node to take a copy
+# of its state.
+load_scsi_debug 100 1 2 1 '512b'
+set -A sd $(get_debug_device 2)
+
+# create a loop device for the log.
+truncate -s 100M $LOOP_LOG
+typeset ld=$(basename $(losetup -f))
+log_must losetup /dev/$ld $LOOP_LOG
+
+# this function runs the entire test sequence. the option decides if faults
+# are injected on the slog device, mimicking the trigger situation that causes
+# the fsync() bug to occur
+function test_fsync
+{
+	typeset -i do_fault_log="$1"
+
+	log_note "setting up test"
+
+	# create the pool. the main data store is on the scsi device, with the
+	# log on a loopback. we bias the ZIL towards the log device to try
+	# to ensure that fsync() never involves the main device
+	log_must zpool create -f -O logbias=latency $TESTPOOL ${sd[0]} log $ld
+
+	# create the file ahead of time. the ZIL head structure is created on
+	# first use, and does a full txg wait, which we need to avoid
+	log_must dd if=/dev/zero of=/$TESTPOOL/data_file \
+	    bs=128k count=1 conv=fsync
+	log_must zpool sync
+
+	# arrange for writes to the log device to appear to succeed, and
+	# flushes to fail. this simulates a loss of the device after it has
+	# accepted the write into its cache, but before it can be written out
+	if [[ $do_fault_log != 0 ]] ; then
+		log_note "injecting log device faults"
+		log_must zinject -d $ld -e noop -T write $TESTPOOL
+		log_must zinject -d $ld -e io -T flush $TESTPOOL
+	fi
+
+	# take the main device offline. there is no IO in flight, so ZFS won't
+	# notice immediately
+	log_note "taking main pool offline"
+	log_must eval "echo offline > /sys/block/${sd[0]}/device/state"
+
+	# write out some data, then call fsync(). there are three possible
+	# results:
+	#
+	# - if the bug is present, fsync() will return success, and dd will
+	#   succeed "immediately"; before the pool suspends
+	# - if the bug is fixed, fsync() will block, the pool will suspend, and
+	#   dd will return success after the pool returns to service
+	# - if something else goes wrong, dd will fail; this may happen before
+	#   or after the pool suspends or returns. this shouldn't happen, and
+	#   should abort the test
+	#
+	# we have to put dd in the background, otherwise if it blocks we will
+	# block with it. what we're interested in is whether or not it succeeds
+	# before the pool is suspended. if it does, then we expect that after
+	# the suspended pool is reimported, the data will have been written
+	log_note "running dd in background to write data and call fsync()"
+	dd if=$DATA_FILE of=/$TESTPOOL/data_file bs=128k count=1 conv=fsync &
+	fsync_pid=$!
+
+	# wait for the pool to suspend. this should happen within ~5s, when the
+	# txg sync tries to write the change to the main device
+	log_note "waiting for pool to suspend"
+	typeset -i tries=10
+	until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
+		if ((tries-- == 0)); then
+			log_fail "pool didn't suspend"
+		fi
+		sleep 1
+	done
+
+	# the pool is suspended. see if dd is still present; if it is, then
+	# it's blocked in fsync(), and we have no expectation that the write
+	# made it to disk. if dd has exited, then its return code will tell
+	# us whether fsync() returned success, or it failed for some other
+	# reason
+	typeset -i fsync_success=0
+	if kill -0 $fsync_pid ; then
+		log_note "dd is blocked; fsync() has not returned"
+	else
+		log_note "dd has finished, ensuring it was successful"
+		log_must wait $fsync_pid
+		fsync_success=1
+	fi
+
+	# pool is suspended. if we online the main device right now, it will
+	# retry writing the transaction, which will succeed, and everything
+	# will continue as it's supposed to. that's the opposite of what we
+	# want; we want to do an import, as if after reboot, to force the pool
+	# to try to replay the ZIL, so we can compare the final result against
+	# what fsync() told us
+	#
+	# however, right now the pool is wedged. we need to get it back online
+	# so we can export it, so we can do the import. so we need to copy the
+	# entire pool state away. for the scsi device, we can do this through
+	# the second device node. for the loopback, we can copy it directly
+	log_note "taking copy of suspended pool"
+	log_must cp /dev/${sd[1]} $BACKUP_MAIN
+	log_must cp /dev/$ld $BACKUP_LOG
+
+	# bring the entire pool back online, by clearing error injections and
+	# restoring the main device. this will unblock anything still waiting
+	# on it, and tidy up all the internals so we can reset it
+	log_note "bringing pool back online"
+	if [[ $do_fault_log != 0 ]] ; then
+		log_must zinject -c all
+	fi
+	log_must eval "echo running > /sys/block/${sd[0]}/device/state"
+	log_must zpool clear $TESTPOOL
+
+	# now the pool is back online. if dd was blocked, it should now
+	# complete successfully. make sure that's true
+	if [[ $fsync_success == 0 ]] ; then
+		log_note "ensuring blocked dd has now finished"
+		log_must wait $fsync_pid
+	fi
+
+	log_note "exporting pool"
+
+	# pool now clean, export it
+	log_must zpool export $TESTPOOL
+
+	log_note "reverting pool to suspended state"
+
+	# restore the pool to the suspended state, mimicking a reboot
+	log_must cp $BACKUP_MAIN /dev/${sd[0]}
+	log_must cp $BACKUP_LOG /dev/$ld
+
+	# import the crashed pool
+	log_must zpool import $TESTPOOL
+
+	# if fsync() succeeded before the pool suspended, then the ZIL should
+	# have replayed properly and the data is now available on the pool
+	#
+	# note that we don't check the alternative; fsync() blocking does not
+	# mean that data _didn't_ make it to disk, just that ZFS never claimed
+	# that it did. in that case we can't know what _should_ be on disk
+	# right now, so can't check
+	if [[ $fsync_success == 1 ]] ; then
+		log_note "fsync() succeeded earlier; checking data was written correctly"
+		typeset newsum=$(sha256digest /$TESTPOOL/data_file)
+		log_must test "$sum" = "$newsum"
+	fi
+
+	log_note "test finished, cleaning up"
+	log_must zpool destroy -f $TESTPOOL
+}
+
+log_note "first run: ZIL succeeds, and repairs the pool at import"
+test_fsync 0
+
+log_note "second run: ZIL commit fails, and falls back to txg sync"
+test_fsync 1
+
+log_pass "fsync() waits if the ZIL commit fails"

From e7966e581a4ed8462900cd549c4ee626f9e8c8f0 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Thu, 9 May 2024 11:50:58 +1000
Subject: [PATCH 2/3] zio_flush: propagate flush errors to the ZIL

Since the beginning, ZFS' "flush" operation has always ignored
errors[1]. Write errors are captured and dealt with, but if a write
succeeds but the subsequent flush fails, the operation as a whole will
appear to succeed[2].

In the end-of-transaction uberblock+label write+flush ceremony, it's
very difficult for this situation to occur. Since all devices are
written to, typically the first write will succeed, the first flush
will fail unobserved, but then the second write will fail, and the
entire transaction is aborted. It's difficult to imagine a real-world
scenario where all the writes in that sequence could succeed even as
the flushes are failing (understanding that the OS is still seeing
hardware problems and taking devices offline).

In the ZIL however, it's another story. Since only the write response
is checked, if that write succeeds but the flush then fails, the ZIL
will believe that it succeeded, and zil_commit() (and thus fsync())
will return success rather than the "correct" behaviour of falling
back into txg_wait_synced()[3].

This commit fixes this by adding a simple flag to zio_flush() to
indicate whether or not the caller wants to receive flush errors. This
flag is enabled for ZIL calls. The existing zio chaining inside the ZIL
and the flush handler zil_lwb_flush_vdevs_done() already have all the
necessary support to properly handle a flush failure and fail the
entire zio chain. This causes zil_commit() to correctly fall back to
txg_wait_synced() rather than returning success prematurely.

1. The ZFS birth commit (illumos/illumos-gate@fa9e4066f0) had support
   for flushing devices with write caches with the DKIOCFLUSHWRITECACHE
   ioctl. No errors are checked. The comment in `zil_flush_vdevs()` from
   the time shows the thinking:

    /*
     * Wait for all the flushes to complete. Not all devices actually
     * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
     */

2. It's not entirely clear from the code history why this was
   acceptable for devices that _do_ have write caches. Our best guess
   is that this was an oversight: between the combination of hardware,
   pool topology and application behaviour required to hit this, it
   basically didn't come up.

3. Somewhat frustratingly, zil.c contains comments describing this
   exact behaviour, and there was further discussion in #12443
   (September 2021). It appears that those involved saw the potential,
   but were looking at a different problem and so didn't have the
   context to recognise it for what it was.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris --- include/sys/zio.h | 2 +- module/zfs/vdev_label.c | 14 ++++++++------ module/zfs/vdev_raidz.c | 6 +++--- module/zfs/zil.c | 20 +++----------------- module/zfs/zio.c | 8 ++++---- 5 files changed, 19 insertions(+), 31 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 446b64ccd8ab..6492d17360f6 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -595,7 +595,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t size, boolean_t *slog); -extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_flush(zio_t *zio, vdev_t *vd, boolean_t propagate); extern void zio_shrink(zio_t *zio, uint64_t size); extern int zio_wait(zio_t *zio); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 47346dd5acff..468390b9acd3 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1830,19 +1830,21 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) for (int v = 0; v < svdcount; v++) { if (vdev_writeable(svd[v])) { - zio_flush(zio, svd[v]); + zio_flush(zio, svd[v], B_FALSE); } } if (spa->spa_aux_sync_uber) { spa->spa_aux_sync_uber = B_FALSE; for (int v = 0; v < spa->spa_spares.sav_count; v++) { if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) { - zio_flush(zio, spa->spa_spares.sav_vdevs[v]); + zio_flush(zio, spa->spa_spares.sav_vdevs[v], + B_FALSE); } } for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) { - zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]); + zio_flush(zio, spa->spa_l2cache.sav_vdevs[v], + B_FALSE); } } } @@ -2007,13 +2009,13 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) zio = zio_root(spa, NULL, NULL, flags); for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) - zio_flush(zio, vd); + zio_flush(zio, vd, B_FALSE); for (int i = 0; i < 2; i++) { if (!sav[i]->sav_label_sync) continue; for (int v = 0; v < sav[i]->sav_count; v++) - zio_flush(zio, sav[i]->sav_vdevs[v]); + zio_flush(zio, sav[i]->sav_vdevs[v], B_FALSE); if (l == 1) sav[i]->sav_label_sync = B_FALSE; } @@ -2091,7 +2093,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) for (vdev_t *vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) - zio_flush(zio, vd); + zio_flush(zio, vd, B_FALSE); (void) zio_wait(zio); diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 15c8b8ca6016..187d3908ff50 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -4172,7 +4172,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", @@ -4231,7 +4231,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) goto io_error_exit; } pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", @@ -4339,7 +4339,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa) } zio_wait(pio); pio = zio_root(spa, NULL, NULL, 0); - zio_flush(pio, raidvd); + zio_flush(pio, raidvd, B_FALSE); zio_wait(pio); zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 3983da6aa424..f451c170fcec 
100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright (c) 2018 Datto Inc.
+ * Copyright (c) 2024, Klara, Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -1495,12 +1496,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 		 * includes ZIO errors from either this LWB's write or
 		 * flush, as well as any errors from other dependent LWBs
 		 * (e.g. a root LWB ZIO that might be a child of this LWB).
-		 *
-		 * With that said, it's important to note that LWB flush
-		 * errors are not propagated up to the LWB root ZIO.
-		 * This is incorrect behavior, and results in VDEV flush
-		 * errors not being handled correctly here. See the
-		 * comment above the call to "zio_flush" for details.
 		 */
 		zcw->zcw_zio_error = zio->io_error;
 
@@ -1650,17 +1645,8 @@ zil_lwb_write_done(zio_t *zio)
 
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 			vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-			if (vd != NULL) {
-				/*
-				 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
-				 * always used within "zio_flush". This means,
-				 * any errors when flushing the vdev(s), will
-				 * (unfortunately) not be handled correctly,
-				 * since these "zio_flush" errors will not be
-				 * propagated up to "zil_lwb_flush_vdevs_done".
-				 */
-				zio_flush(lwb->lwb_root_zio, vd);
-			}
+			if (vd != NULL)
+				zio_flush(lwb->lwb_root_zio, vd, B_TRUE);
 			kmem_free(zv, sizeof (*zv));
 		}
 	}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 1f3acb9b921e..43e4e7ff15c8 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1633,10 +1633,10 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
  * the flushes complete.
  */
 void
-zio_flush(zio_t *pio, vdev_t *vd)
+zio_flush(zio_t *pio, vdev_t *vd, boolean_t propagate)
 {
-	const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
-	    ZIO_FLAG_DONT_RETRY;
+	const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
+	    (propagate ? 0 : ZIO_FLAG_DONT_PROPAGATE);
 
 	if (vd->vdev_nowritecache)
 		return;
@@ -1647,7 +1647,7 @@ zio_flush(zio_t *pio, vdev_t *vd)
 		    NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
-			zio_flush(pio, vd->vdev_child[c]);
+			zio_flush(pio, vd->vdev_child[c], propagate);
 	}
 }
 

From 2331d19dab7c0ea9bd8f895f9a652bf02535a8d7 Mon Sep 17 00:00:00 2001
From: Rob Norris
Date: Mon, 1 Jul 2024 11:19:16 +1000
Subject: [PATCH 3/3] flush: don't report flush error when disabling flush
 support

The first time a device returns ENOTSUP in response to a flush request,
we set vdev_nowritecache so we don't issue flushes in the future and
instead just pretend they succeeded. However, we still return an error
for the initial flush, even though we just decided such errors are
meaningless!

So, when setting vdev_nowritecache in response to a flush error, also
reset the error code to assume success.

Along the way, it seems there's no good reason for vdev_disk & vdev_geom
to explicitly detect no support for flush and set vdev_nowritecache;
just letting the error through to zio_vdev_io_assess() will cause it all
to fall out nicely. So remove those checks.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris --- module/os/freebsd/zfs/vdev_geom.c | 15 --------------- module/os/linux/zfs/vdev_disk.c | 3 --- module/zfs/zio.c | 7 +++++-- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index b7ff1063b089..7aaa42bfb1a8 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -1014,21 +1014,6 @@ vdev_geom_io_intr(struct bio *bp) zio->io_error = SET_ERROR(EIO); switch (zio->io_error) { - case ENOTSUP: - /* - * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know - * that future attempts will never succeed. In this case - * we set a persistent flag so that we don't bother with - * requests in the future. - */ - switch (bp->bio_cmd) { - case BIO_FLUSH: - vd->vdev_nowritecache = B_TRUE; - break; - case BIO_DELETE: - break; - } - break; case ENXIO: if (!vd->vdev_remove_wanted) { /* diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e69c5f3841ec..2c963bb05c80 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1232,9 +1232,6 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) zio->io_error = -error; #endif - if (zio->io_error && (zio->io_error == EOPNOTSUPP)) - zio->io_vd->vdev_nowritecache = B_TRUE; - bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 43e4e7ff15c8..7af3eb922d7b 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4532,11 +4532,14 @@ zio_vdev_io_assess(zio_t *zio) /* * If a cache flush returns ENOTSUP or ENOTTY, we know that no future * attempts will ever succeed. In this case we set a persistent - * boolean flag so that we don't bother with it in the future. + * boolean flag so that we don't bother with it in the future, and + * then we act like the flush succeeded. */ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && - zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) + zio->io_type == ZIO_TYPE_FLUSH && vd != NULL) { vd->vdev_nowritecache = B_TRUE; + zio->io_error = 0; + } if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
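
---

A note on exercising the new test group on its own: the ZTS runner can
select tests by tag, so the "flush" tag added by the first patch is enough
to run zil_flush_error without the rest of the suite. A minimal sketch,
assuming a packaged install where zfs-tests.sh lands under /usr/share/zfs
(in-tree builds use ./scripts/zfs-tests.sh instead), and that the -T
tag-selection and -d scratch-directory options exist in your version of the
runner; paths and flags may differ on your system:

    #!/bin/sh
    # Run only the tests tagged 'flush' (i.e. zil_flush_error), as root,
    # using /var/tmp for any file-backed devices the suite creates.
    sudo /usr/share/zfs/zfs-tests.sh -T flush -d /var/tmp

The test relies on scsi_debug and loop devices, so it needs a Linux host
with those modules available; the runfile change above only adds it to
linux.run.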