diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 220466550258..55be253ffbff 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -46,6 +46,7 @@ typedef struct zfsvfs zfsvfs_t; struct znode; extern int zfs_bclone_enabled; +extern int zfs_bclone_wait_dirty; /* * This structure emulates the vfs_t from other platforms. It's purpose diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 47471a805907..30c168253f96 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1159,6 +1159,15 @@ Enable the experimental block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, attempts to clone blocks will act as though the feature is disabled. . +.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int +When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be +written to disk. +This allows the clone operation to reliably succeed when a file is +modified and then immediately cloned. +For small files this may be slower than making a copy of the file. +Therefore, this setting defaults to 0 which causes a clone operation to +immediately fail when encountering a dirty block. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index f2d5391037c4..b730bcb9509d 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -93,6 +93,10 @@ int zfs_bclone_enabled = 1; SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, &zfs_bclone_enabled, 0, "Enable block cloning"); +int zfs_bclone_wait_dirty = 0; +SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_wait_dirty, CTLFLAG_RWTUN, + &zfs_bclone_wait_dirty, 0, "Wait for dirty blocks when cloning"); + struct zfs_jailparam { int mount_snapshot; }; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index b7b89b8afc56..4b9bfd71e926 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -4257,7 +4257,11 @@ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); /* CSTYLED */ -module_param(zfs_bclone_enabled, uint, 0644); +module_param(zfs_bclone_enabled, int, 0644); MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); +/* CSTYLED */ +module_param(zfs_bclone_wait_dirty, int, 0644); +MODULE_PARM_DESC(zfs_bclone_wait_dirty, "Wait for dirty blocks when cloning"); + #endif diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 73476ff40ebf..0682e84768e3 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -32,6 +32,7 @@ #include int zfs_bclone_enabled = 1; +int zfs_bclone_wait_dirty = 0; /* * Clone part of a file via block cloning. @@ -40,7 +41,7 @@ int zfs_bclone_enabled = 1; * care of that depending on how it was called. */ static ssize_t -__zpl_clone_file_range(struct file *src_file, loff_t src_off, +zpl_clone_file_range_impl(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len) { struct inode *src_i = file_inode(src_file); @@ -96,11 +97,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, { ssize_t ret; + /* Flags is reserved for future extensions and must be zero. */ if (flags != 0) return (-EINVAL); - /* Try to do it via zfs_clone_range() */ - ret = __zpl_clone_file_range(src_file, src_off, + /* Try to do it via zfs_clone_range() and allow shortening. */ + ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE @@ -137,6 +139,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the * range in both files and if they're the same, arrange for them to be backed * by the same storage. + * + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range + * if we want. It's designed for filesystems that may need to shorten the + * length for alignment, EOF, or any other requirement. ZFS may shorten the + * request when there is outstanding dirty data which hasn't been written. */ loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, @@ -145,24 +152,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off, if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) return (-EINVAL); - /* - * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given - * range if we want. Its designed for filesystems that make data past - * EOF available, and don't want it to be visible in both files. ZFS - * doesn't do that, so we just turn the flag off. - */ - flags &= ~REMAP_FILE_CAN_SHORTEN; - + /* No support for dedup yet */ if (flags & REMAP_FILE_DEDUP) - /* No support for dedup yet */ return (-EOPNOTSUPP); /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ @@ -179,8 +183,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off, if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + /* The entire length must be cloned or this is an error. */ + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ @@ -214,8 +224,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) size_t len = i_size_read(file_inode(src_file)); - ssize_t ret = - __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len); fput(src_file); @@ -253,7 +262,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (len == 0) len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; - ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, dst_file, fcr.fcr_dest_offset, len); fput(src_file); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c8ff7b6432fd..9c7929ad22f5 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1049,6 +1049,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, size_t maxblocks, nbps; uint_t inblksz; uint64_t clear_setid_bits_txg = 0; + uint64_t last_synced_txg = 0; inoff = *inoffp; outoff = *outoffp; @@ -1287,15 +1288,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } nbps = maxblocks; + last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created - * in the current transaction group, error will be - * EAGAIN here, which we can just return to the caller - * so it can fallback if it likes. + * in the current transaction group, the error will be + * EAGAIN here. Based on sync_wait either return a + * shortened range to the caller so it can fallback, + * or wait for the next sync and check again. */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool(inos), + last_synced_txg + 1); + continue; + } + break; } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 7e0990b5d9f9..05e8cdc8f234 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos', 'cp_stress'] +tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ae4aa6275465..27c0d3e6c209 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'): 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], + 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index e4e380aa7fd5..a619b846dd11 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -94,6 +94,7 @@ VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled +BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 4040e60434a7..44fa0bf6869c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ + functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh new file mode 100755 index 000000000000..03719bf56706 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -0,0 +1,159 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify all cp --reflink modes work with modified file. +# +# STRATEGY: +# 1. Verify "cp --reflink=never|auto|always" behaves as expected. +# Two different modes of operation are tested. +# +# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL +# when there are dirty blocks which cannot be immediately cloned. +# This is the default behavior. +# +# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for +# dirty blocks to be written to disk allowing the clone to succeed. +# The downside to this is it may be slow which depending on the +# situtation may defeat the point of making a clone. +# + +verify_runnable "global" + +if ! is_linux; then + log_unsupported "cp --reflink is a GNU coreutils option" +fi + +function cleanup +{ + datasetexists $TESTPOOL/cp-reflink && \ + destroy_dataset $$TESTPOOL/cp-reflink -f + log_must set_tunable32 BCLONE_WAIT_DIRTY 0 +} + +function verify_copy +{ + src_cksum=$(sha256digest $1) + dst_cksum=$(sha256digest $2) + + if [[ "$src_cksum" != "$dst_cksum" ]]; then + log_must ls -l $CP_TESTDIR + log_fail "checksum mismatch ($src_cksum != $dst_cksum)" + fi +} + +log_assert "Verify all cp --reflink modes work with modified file" + +log_onexit cleanup + +SRC_FILE=src.data +DST_FILE=dst.data +SRC_SIZE=$(($RANDOM % 2048)) + +# A smaller recordsize is used merely to speed up the test. +RECORDSIZE=4096 + +log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink +CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink) + +log_must cd $CP_TESTDIR + +# Never wait on dirty blocks (zfs_bclone_wait_dirty=0) +log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $SRC_FILE $DST_FILE +done + +# Wait on dirty blocks (zfs_bclone_wait_dirty=1) +log_must set_tunable32 BCLONE_WAIT_DIRTY 1 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $SRC_FILE $DST_FILE +done + +log_pass