From ade46fa089b60f680130f39fd401938715c86c4c Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Sat, 26 Aug 2023 11:34:43 -0700 Subject: [PATCH] Increase limit of redaction list by using spill block Currently redaction bookmarks and their associated redaction lists have a relatively low limit of 36 redaction snapshots. This is imposed by the number of snapshot GUIDs that fit in the bonus buffer of the redaction list object. While this is more than enough for most use cases, there are some limited cases where larger numbers would be useful to support. We tweak the redaction list creation code to use a spill block if the number of redaction snapshots is above the amount that would fit in the bonus buffer. We also make a small change to allow spill blocks to be use for types of data besides SA. In order to fully leverage this logic, we also change the redaction code to use vmem_alloc, to handle extremely large allocations if needed. Finally, small tweaks were made to the zfs commands and the test suite. Reviewed-by: Matthew Ahrens Signed-off-by: Paul Dagnelie Closes #15018 --- cmd/zdb/zdb.c | 15 ++++- cmd/zfs/zfs_main.c | 4 ++ include/sys/dsl_bookmark.h | 1 + include/zfeature_common.h | 1 + lib/libzfs/libzfs.abi | 9 +-- man/man7/zpool-features.7 | 12 ++++ module/zcommon/zfeature_common.c | 12 ++++ module/zfs/dmu_redact.c | 17 +++-- module/zfs/dnode.c | 1 + module/zfs/dsl_bookmark.c | 67 ++++++++++++++----- module/zfs/dsl_destroy.c | 10 +++ .../cli_root/zpool_get/zpool_get.cfg | 1 + .../redacted_send/redacted_many_clones.ksh | 12 ++-- 13 files changed, 128 insertions(+), 34 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b7b1a8ffbc90..09a551b2d4cc 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -5299,8 +5299,18 @@ dump_one_objset(const char *dsname, void *arg) avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); - if (dbn->dbn_phys.zbm_redaction_obj != 0) - global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + global_feature_count[ + SPA_FEATURE_REDACTION_BOOKMARKS]++; + objset_t *mos = os->os_spa->spa_meta_objset; + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); + if (rl->dn_have_spill) { + global_feature_count[ + SPA_FEATURE_REDACTION_LIST_SPILL]++; + } + } if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } @@ -8141,6 +8151,7 @@ dump_zpool(spa_t *spa) for (spa_feature_t f = 0; f < SPA_FEATURES; f++) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; + global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; global_feature_count[SPA_FEATURE_LIVELIST] = 0; diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9692bc974140..25ae52c56771 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3979,6 +3979,10 @@ zfs_do_redact(int argc, char **argv) (void) fprintf(stderr, gettext("potentially invalid redaction " "snapshot; full dataset names required\n")); break; + case ESRCH: + (void) fprintf(stderr, gettext("attempted to resume redaction " + " with a mismatched redaction list\n")); + break; default: (void) fprintf(stderr, gettext("internal error: %s\n"), strerror(errno)); diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h index 353c5c2d260f..d4e559a09037 100644 --- a/include/sys/dsl_bookmark.h +++ b/include/sys/dsl_bookmark.h @@ -72,6 +72,7 @@ typedef struct redaction_list_phys { typedef struct redaction_list { dmu_buf_user_t rl_dbu; redaction_list_phys_t *rl_phys; + dmu_buf_t *rl_bonus; dmu_buf_t *rl_dbuf; uint64_t rl_object; zfs_refcount_t rl_longholds; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 7066c699e203..1025c44738ba 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -80,6 +80,7 @@ typedef enum spa_feature { SPA_FEATURE_BLAKE3, SPA_FEATURE_BLOCK_CLONING, SPA_FEATURE_AVZ_V2, + SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index e3cfbb08d88d..3185c4a17397 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -599,7 +599,7 @@ - + @@ -5814,7 +5814,8 @@ - + + @@ -8726,8 +8727,8 @@ - - + + diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b901ce6c2935..3c7b0b345d96 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -947,6 +947,18 @@ once all filesystems that have ever had their property set to .Sy zstd are destroyed. +. +.feature com.delphix redaction_list_spill no redaction_bookmarks +This feature enables the redaction list created by zfs redact to store +many more entries. +It becomes +.Sy active +when a redaction list is created with more than 36 entries, +and returns to being +.Sy enabled +when no long redaction lists remain in the pool. +For more information about redacted sends, see +.Xr zfs-send 8 . .El . .Sh SEE ALSO diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 4c9b7ed72a0f..2c74d10f43ff 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -737,6 +737,18 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t redact_list_spill_deps[] = { + SPA_FEATURE_REDACTION_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTION_LIST_SPILL, + "com.delphix:redaction_list_spill", "redaction_list_spill", + "Support for increased number of redaction_snapshot " + "arguments in zfs redact.", 0, ZFEATURE_TYPE_BOOLEAN, + redact_list_spill_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c index 6bd35713ff18..5ac14edfca12 100644 --- a/module/zfs/dmu_redact.c +++ b/module/zfs/dmu_redact.c @@ -746,7 +746,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, bqueue_enqueue(q, record, sizeof (*record)); return (0); } - redact_nodes = kmem_zalloc(num_threads * + redact_nodes = vmem_zalloc(num_threads * sizeof (*redact_nodes), KM_SLEEP); avl_create(&start_tree, redact_node_compare_start, @@ -820,7 +820,7 @@ perform_thread_merge(bqueue_t *q, uint32_t num_threads, avl_destroy(&start_tree); avl_destroy(&end_tree); - kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); + vmem_free(redact_nodes, num_threads * sizeof (*redact_nodes)); if (current_record != NULL) bqueue_enqueue(q, current_record, sizeof (*current_record)); return (err); @@ -1030,7 +1030,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, numsnaps = fnvlist_num_pairs(redactnvl); if (numsnaps > 0) - args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + args = vmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); nvpair_t *pair = NULL; for (int i = 0; i < numsnaps; i++) { @@ -1079,7 +1079,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN); if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); return (SET_ERROR(ENAMETOOLONG)); } err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); @@ -1119,7 +1119,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, } else { uint64_t *guids = NULL; if (numsnaps > 0) { - guids = kmem_zalloc(numsnaps * sizeof (uint64_t), + guids = vmem_zalloc(numsnaps * sizeof (uint64_t), KM_SLEEP); } for (int i = 0; i < numsnaps; i++) { @@ -1131,10 +1131,9 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, dp = NULL; err = dsl_bookmark_create_redacted(newredactbook, snapname, numsnaps, guids, FTAG, &new_rl); - kmem_free(guids, numsnaps * sizeof (uint64_t)); - if (err != 0) { + vmem_free(guids, numsnaps * sizeof (uint64_t)); + if (err != 0) goto out; - } } for (int i = 0; i < numsnaps; i++) { @@ -1188,7 +1187,7 @@ dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, } if (args != NULL) - kmem_free(args, numsnaps * sizeof (*args)); + vmem_free(args, numsnaps * sizeof (*args)); if (dp != NULL) dsl_pool_rele(dp, FTAG); if (ds != NULL) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 7cf03264dce2..79fd02dcb9aa 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -720,6 +720,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || + (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c index e04796a0814f..03d9420dbdb9 100644 --- a/module/zfs/dsl_bookmark.c +++ b/module/zfs/dsl_bookmark.c @@ -34,6 +34,7 @@ #include #include #include +#include static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, @@ -459,25 +460,42 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); if (redaction_list != NULL || bookmark_redacted) { redaction_list_t *local_rl; + boolean_t spill = B_FALSE; if (bookmark_redacted) { redact_snaps = dsredactsnaps; num_redact_snaps = dsnumsnaps; } + int bonuslen = sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t); + if (bonuslen > dmu_bonus_max()) + spill = B_TRUE; dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, - DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + - num_redact_snaps * sizeof (uint64_t), tx); + DMU_OTN_UINT64_METADATA, spill ? 0 : bonuslen, tx); spa_feature_incr(dp->dp_spa, SPA_FEATURE_REDACTION_BOOKMARKS, tx); + if (spill) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } VERIFY0(dsl_redaction_list_hold_obj(dp, dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); dsl_redaction_list_long_hold(dp, local_rl, tag); - ASSERT3U((local_rl)->rl_dbuf->db_size, >=, - sizeof (redaction_list_phys_t) + num_redact_snaps * - sizeof (uint64_t)); - dmu_buf_will_dirty(local_rl->rl_dbuf, tx); + if (!spill) { + ASSERT3U(local_rl->rl_bonus->db_size, >=, bonuslen); + dmu_buf_will_dirty(local_rl->rl_bonus, tx); + } else { + dmu_buf_t *db; + VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus, + DB_RF_MUST_SUCCEED, FTAG, &db)); + dmu_buf_will_fill(db, tx); + VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen, + SPA_MINBLOCKSIZE), tx)); + local_rl->rl_phys = db->db_data; + local_rl->rl_dbuf = db; + } memcpy(local_rl->rl_phys->rlp_snaps, redact_snaps, sizeof (uint64_t) * num_redact_snaps); local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; @@ -636,11 +654,15 @@ dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) SPA_FEATURE_REDACTION_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); /* - * If the list of redact snaps will not fit in the bonus buffer with - * the furthest reached object and offset, fail. + * If the list of redact snaps will not fit in the bonus buffer (or + * spill block, with the REDACTION_LIST_SPILL feature) with the + * furthest reached object and offset, fail. */ - if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - - sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + uint64_t snaplimit = ((spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_LIST_SPILL) ? spa_maxblocksize(dp->dp_spa) : + dmu_bonus_max()) - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t); + if (dbcra->dbcra_numsnaps > snaplimit) return (SET_ERROR(E2BIG)); if (dsl_bookmark_create_nvl_validate_pair( @@ -1040,6 +1062,14 @@ dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, } if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), @@ -1213,7 +1243,9 @@ redaction_list_evict_sync(void *rlu) void dsl_redaction_list_rele(redaction_list_t *rl, const void *tag) { - dmu_buf_rele(rl->rl_dbuf, tag); + if (rl->rl_bonus != rl->rl_dbuf) + dmu_buf_rele(rl->rl_dbuf, tag); + dmu_buf_rele(rl->rl_bonus, tag); } int @@ -1221,7 +1253,7 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag, redaction_list_t **rlp) { objset_t *mos = dp->dp_meta_objset; - dmu_buf_t *dbuf; + dmu_buf_t *dbuf, *spill_dbuf; redaction_list_t *rl; int err; @@ -1236,13 +1268,18 @@ dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, const void *tag, redaction_list_t *winner = NULL; rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); - rl->rl_dbuf = dbuf; + rl->rl_bonus = dbuf; + if (dmu_spill_hold_existing(dbuf, tag, &spill_dbuf) == 0) { + rl->rl_dbuf = spill_dbuf; + } else { + rl->rl_dbuf = dbuf; + } rl->rl_object = rlobj; - rl->rl_phys = dbuf->db_data; + rl->rl_phys = rl->rl_dbuf->db_data; rl->rl_mos = dp->dp_meta_objset; zfs_refcount_create(&rl->rl_longholds); dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, - &rl->rl_dbuf); + &rl->rl_bonus); if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { kmem_free(rl, sizeof (*rl)); rl = winner; diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 053f26878cf1..d9d88a981e05 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -1125,6 +1125,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { if (dbn->dbn_phys.zbm_redaction_obj != 0) { + dnode_t *rl; + VERIFY0(dnode_hold(mos, + dbn->dbn_phys.zbm_redaction_obj, FTAG, + &rl)); + if (rl->dn_have_spill) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_LIST_SPILL, + tx); + } + dnode_rele(rl, FTAG); VERIFY0(dmu_object_free(mos, dbn->dbn_phys.zbm_redaction_obj, tx)); spa_feature_decr(dmu_objset_spa(mos), diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 160a0ca2e6db..4248578cde16 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -86,6 +86,7 @@ typeset -a properties=( "feature@log_spacemap" "feature@device_rebuild" "feature@draid" + "feature@redaction_list_spill" ) if is_linux || is_freebsd; then diff --git a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh index 3386643b295e..f2150be3bc95 100755 --- a/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh +++ b/tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh @@ -27,7 +27,6 @@ # second (the last block in the file) is common to them all. # 2. Verify a redacted stream with a reasonable redaction list length can # be correctly processed. -# 3. Verify that if the list is too long, the send fails gracefully. # typeset ds_name="many_clones" @@ -56,13 +55,18 @@ for i in {1..64}; do log_must zfs snapshot ${clone}$i@snap done -# The limit isn't necessarily 32 snapshots. The maximum number of snapshots in +# The limit isn't necessarily 64 snapshots. The maximum number of snapshots in # the redacted list is determined in dsl_bookmark_create_redacted_check(). -log_must zfs redact $sendfs@snap book1 $clone{1..32}@snap +log_must zfs redact $sendfs@snap book1 $clone{1..64}@snap log_must eval "zfs send --redact book1 $sendfs@snap >$stream" log_must eval "zfs recv $recvfs <$stream" compare_files $sendfs $recvfs "f2" "$RANGE8" -log_mustnot zfs redact $sendfs@snap book2 $clone{1..64}@snap +rls_value="$(zpool get -H -o value feature@redaction_list_spill $POOL)" +if [ "$rls_value" = "active" ]; then + log_note "redaction_list_spill feature active" +else + log_fail "redaction_list_spill feature not active" +fi log_pass "Redacted send can deal with a large redaction list."