diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 9568d2bbfe38..4b9921d47b81 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include @@ -5342,12 +5343,20 @@ static const char *zdb_ot_extname[] = { #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) +typedef struct zdb_brt_entry { + dva_t zbre_dva; + uint64_t zbre_refcount; + avl_node_t zbre_node; +} zdb_brt_entry_t; + typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; + uint64_t zcb_clone_asize; + uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; @@ -5368,6 +5377,8 @@ typedef struct zdb_cb { int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; + avl_tree_t zcb_brt; + boolean_t zcb_brt_is_active; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ @@ -5662,6 +5673,45 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); + if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { + /* + * Cloned blocks are special. We need to count them, so we can + * later uncount them when reporting leaked space, and we must + * only claim them them once. + * + * To do this, we keep our own in-memory BRT. For each block + * we haven't seen before, we look it up in the real BRT and + * if its there, we note it and its refcount then proceed as + * normal. If we see the block again, we count it as a clone + * and then give it no further consideration. + */ + zdb_brt_entry_t zbre_search, *zbre; + avl_index_t where; + + zbre_search.zbre_dva = bp->blk_dva[0]; + zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); + if (zbre != NULL) { + zcb->zcb_clone_asize += BP_GET_ASIZE(bp); + zcb->zcb_clone_blocks++; + + zbre->zbre_refcount--; + if (zbre->zbre_refcount == 0) { + avl_remove(&zcb->zcb_brt, zbre); + umem_free(zbre, sizeof (zdb_brt_entry_t)); + } + return; + } + + uint64_t crefcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); + if (crefcnt > 0) { + zbre = umem_zalloc(sizeof (zdb_brt_entry_t), + UMEM_NOFAIL); + zbre->zbre_dva = bp->blk_dva[0]; + zbre->zbre_refcount = crefcnt; + avl_insert(&zcb->zcb_brt, zbre, where); + } + } + if (dump_opt['L']) return; @@ -6664,6 +6714,20 @@ deleted_livelists_dump_mos(spa_t *spa) iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } +static int +zdb_brt_entry_compare(const void *zcn1, const void *zcn2) +{ + const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; + const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; + int cmp; + + cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (cmp == 0) + cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); + + return (cmp); +} + static int dump_block_stats(spa_t *spa) { @@ -6678,6 +6742,13 @@ dump_block_stats(spa_t *spa) zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); + if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { + avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, + sizeof (zdb_brt_entry_t), + offsetof(zdb_brt_entry_t, zbre_node)); + zcb->zcb_brt_is_active = B_TRUE; + } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", @@ -6779,7 +6850,8 @@ dump_block_stats(spa_t *spa) metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); - total_found = tzb->zb_asize - zcb->zcb_dedup_asize + + total_found = + tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { @@ -6820,6 +6892,9 @@ dump_block_stats(spa_t *spa) "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu count: %6llu\n", + "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, + (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); diff --git a/include/sys/brt.h b/include/sys/brt.h index 0761159e3f5f..f73df95058d9 100644 --- a/include/sys/brt.h +++ b/include/sys/brt.h @@ -36,6 +36,7 @@ extern "C" { #endif extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp); +extern uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp); extern uint64_t brt_get_dspace(spa_t *spa); extern uint64_t brt_get_used(spa_t *spa); diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 18efebfc1dec..72384b638be5 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -106,6 +106,13 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, if (ret == -EOPNOTSUPP || ret == -EXDEV) ret = generic_copy_file_range(src_file, src_off, dst_file, dst_off, len, flags); +#else + /* + * Before Linux 5.3 the filesystem has to return -EOPNOTSUPP to signal + * to the kernel that it should fallback to a content copy. + */ + if (ret == -EXDEV) + ret = -EOPNOTSUPP; #endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */ return (ret); diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 877b503a1bf2..e8218fb26888 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -1544,6 +1544,37 @@ brt_entry_decref(spa_t *spa, const blkptr_t *bp) return (B_FALSE); } +uint64_t +brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) +{ + brt_t *brt = spa->spa_brt; + brt_vdev_t *brtvd; + brt_entry_t bre_search, *bre; + uint64_t vdevid, refcnt; + int error; + + brt_entry_fill(bp, &bre_search, &vdevid); + + brt_rlock(brt); + + brtvd = brt_vdev(brt, vdevid); + ASSERT(brtvd != NULL); + + bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); + if (bre == NULL) { + error = brt_entry_lookup(brt, brtvd, &bre_search); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) + refcnt = 0; + else + refcnt = bre_search.bre_refcount; + } else + refcnt = bre->bre_refcount; + + brt_unlock(brt); + return (refcnt); +} + static void brt_prefetch(brt_t *brt, const blkptr_t *bp) { diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index b68202d84924..4747b9837337 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -36,6 +36,7 @@ tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_copyfilerange_fallback', 'block_cloning_ficlone', 'block_cloning_ficlonerange', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index c9a2b4179aec..5c4b3a7bcdc1 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -300,6 +300,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_partial': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_fallback': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], }) diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c index a7e7277ae411..696dc471d8c3 100644 --- a/tests/zfs-tests/cmd/clonefile.c +++ b/tests/zfs-tests/cmd/clonefile.c @@ -212,7 +212,7 @@ main(int argc, char **argv) int dfd = open(argv[optind+1], O_WRONLY|O_CREAT, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); - if (sfd < 0) { + if (dfd < 0) { fprintf(stderr, "open: %s: %s\n", argv[optind+1], strerror(errno)); close(sfd); diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 0819cb6b576e..3b6b2ef734d0 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -443,6 +443,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ + functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh new file mode 100755 index 000000000000..87f99eb5c0f0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Klara Inc. +# Copyright (c) 2023, Rob Norris +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +claim="copy_file_range will fall back to copy when cloning not possible." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +log_onexit cleanup + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=128K count=4 +log_must sync_pool $TESTPOOL + + +log_note "Copying entire file with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 0 0 524288 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "1 2 3 4" ] + + +log_note "Copying within a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 32768 32768 65536 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2 3 4" ] + + +log_note "Copying across a block with copy_file_range" + +log_must clonefile -f /$TESTPOOL/file /$TESTPOOL/clone 327680 327680 131072 +log_must sync_pool $TESTPOOL + +log_must have_same_content /$TESTPOOL/file /$TESTPOOL/clone + +typeset blocks=$(unique_blocks $TESTPOOL file $TESTPOOL clone) +log_must [ "$blocks" = "2" ] + +log_pass $claim