From 2b84af87b8ce99fb5b197f7057aefead678713ba Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 20 Jun 2023 12:06:13 +1000 Subject: [PATCH] ddt: add FDT feature and support for legacy and new on-disk formats This is the supporting infrastructure for the upcoming dedup features. Traditionally, dedup objects live directly in the MOS root. While their details vary (checksum, type and class), they are all the same "kind" of thing - a store of dedup entries. The new features are more varied than that, and are better thought of as a set of related stores for the overall state of a dedup table. This adds a new feature flag, SPA_FEATURE_FAST_DEDUP. Enabling this will cause new DDTs to be created as a ZAP in the MOS root, named DDT-CHECKSUM, where CHECKSUM is the name of the table's checksum algorithm. This ZAP is used as the root object for the normal type/class store objects, but will also be a place for any storage required by new features. This commit adds two new fields to ddt_t, for version and flags. These are intended to describe the structure and features of the overall dedup table, and are stored as-is in the DDT root. In this commit, flags are always zero, but the intent is that they can be used to hang optional logic or state onto for new dedup features. For new-format tables the version is always 1. For a "legacy" dedup table, where no DDT root directory exists, the version will be 0. ddt_configure() is expected to determine the version and feature flags currently in operation based on whether or not the fast_dedup feature is enabled, and from what's available on disk. In this way, it's possible to support both old and new tables. This also provides a migration path. A legacy setup can be upgraded to FDT by creating the DDT root ZAP, moving the existing objects into it, and setting version and flags appropriately. There's no support for that here, but it would be straightforward to add later and allows the possibility that newer features could be applied to existing dedup tables. Co-authored-by: Allan Jude Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. 
Sponsored-by: iXsystems, Inc. --- include/sys/ddt.h | 18 +- include/sys/ddt_impl.h | 8 + include/sys/dmu.h | 1 + include/zfeature_common.h | 1 + man/man7/zpool-features.7 | 17 +- module/zcommon/zfeature_common.c | 6 + module/zfs/ddt.c | 263 +++++++++++++++++- .../cli_root/zpool_get/zpool_get.cfg | 1 + 8 files changed, 302 insertions(+), 13 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 66d59cebacde..02d0cf5daab0 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -39,6 +39,12 @@ extern "C" { struct abd; +/* + * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). + */ +/* No flags yet. */ +#define DDT_FLAG_MASK (0) + /* * DDT on-disk storage object types. Each one corresponds to specific * implementation, see ddt_ops_t. The value itself is not stored on disk. @@ -185,11 +191,15 @@ typedef struct { avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ - avl_tree_t ddt_repair_tree; /* entries being repaired */ + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ + spa_t *ddt_spa; /* pool this ddt is on */ + objset_t *ddt_os; /* ddt objset (always MOS) */ - enum zio_checksum ddt_checksum; /* checksum algorithm in use */ - spa_t *ddt_spa; /* pool this ddt is on */ - objset_t *ddt_os; /* ddt objset (always MOS) */ + uint64_t ddt_dir_object; /* MOS dir holding ddt objects */ + uint64_t ddt_version; /* DDT version */ + uint64_t ddt_flags; /* FDT option flags */ /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 7072cfb4ecae..187734ffa98c 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -33,6 +33,14 @@ extern "C" { #endif +/* DDT version numbers */ +#define DDT_VERSION_LEGACY (0) +#define DDT_VERSION_FDT (1) + +/* Names of interesting objects in the DDT root dir */ +#define DDT_DIR_VERSION "version" +#define DDT_DIR_FLAGS 
"flags" + /* * Ops vector to access a specific DDT object type. */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 8174b21f63dd..b2fc454c748c 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -376,6 +376,7 @@ typedef struct dmu_buf { #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_DDT_DIR "DDT-%s" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2515ba321759..5733a8187a95 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -82,6 +82,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_FAST_DEDUP, SPA_FEATURES } spa_feature_t; diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ea3c68dc6083..ff6e485a4819 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -17,8 +17,9 @@ .\" Copyright (c) 2019, Klara Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley +.\" Copyright (c) 2023, Klara Inc. .\" -.Dd June 23, 2022 +.Dd February 14, 2024 .Dt ZPOOL-FEATURES 7 .Os . @@ -550,6 +551,20 @@ when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . +.feature com.klarasystems fast_dedup yes +This feature allows more advanced deduplication features to be enabled on new +dedup tables. +.Pp +This feature will be +.Sy active +when the first deduplicated block is written after a new dedup table is created +(ie after a new pool creation, or new checksum used on a dataset with +.Sy dedup +enabled). +It will be returned to the +.Sy enabled +state when all deduplicated blocks using it are freed. +. 
.feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 309d9bf14cd4..8dec5f27b0af 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -754,6 +754,12 @@ zpool_feature_init(void) "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_FAST_DEDUP, + "com.klarasystems:fast_dedup", "fast_dedup", + "Support for advanced deduplication", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index bbb49fdcbd20..fc20a105f4d2 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -39,6 +39,7 @@ #include #include #include +#include /* * # DDT: Deduplication tables @@ -185,6 +186,21 @@ static const char *const ddt_class_name[DDT_CLASSES] = { "unique", }; +/* + * DDT feature flags automatically enabled for each on-disk version. Note that + * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. 
+ */ +static const uint64_t ddt_version_flags[] = { + [DDT_VERSION_LEGACY] = 0, + [DDT_VERSION_FDT] = 0, +}; + +/* New tables get this version */ +#define DDT_VERSION_DEFAULT (DDT_VERSION_FDT) + +/* Dummy version to signal that configure is still necessary */ +#define DDT_VERSION_UNCONFIGURED (UINT64_MAX) + static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) @@ -196,14 +212,18 @@ ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, ==, 0); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); - VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + + VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, + objectp, tx)); VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), @@ -220,13 +240,15 @@ ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t count; char name[DDT_NAMELEN]; + ASSERT3U(ddt->ddt_dir_object, >, 0); + ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY0(ddt_object_count(ddt, type, class, &count)); VERIFY0(count); - VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); @@ -243,9 +265,18 @@ ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) char name[DDT_NAMELEN]; int error; + if (ddt->ddt_dir_object == 0) { + /* + * If we're configured but the containing dir doesn't exist 
+ * yet, then this object can't possibly exist either. + */ + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + return (SET_ERROR(ENOENT)); + } + ddt_object_name(ddt, type, class, name); - error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error != 0) return (error); @@ -669,6 +700,8 @@ ddt_prefetch_all(spa_t *spa) } } +static int ddt_configure(ddt_t *ddt); + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { @@ -682,6 +715,15 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { + /* + * First use of this DDT since pool creation; finish setting + * it up. There is no error path here, so failure is fatal. + */ + VERIFY0(ddt_configure(ddt)); + ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); + } + ddt_key_fill(&search, bp); /* Find an existing live entry */ @@ -822,6 +864,177 @@ ddt_key_compare(const void *x1, const void *x2) return (TREE_ISIGN(cmp)); } +/* Create the containing dir for this DDT and bump the feature count */ +static void +ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + ASSERT3U(ddt->ddt_dir_object, ==, 0); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); + + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, + sizeof (uint64_t), 1, &ddt->ddt_version, tx)); + VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, + sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); + + spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* Destroy the containing dir and deactivate the feature */ +static void +ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) +{ + 
+ ASSERT3U(ddt->ddt_dir_object, !=, 0); + ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); + ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); + + char name[DDT_NAMELEN]; + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ASSERT(!ddt_object_exists(ddt, type, class)); + } + } + + uint64_t count; + ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION)); + ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS)); + ASSERT3U(count, ==, 2); + + VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); + + ddt->ddt_dir_object = 0; + + spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); +} + +/* + * Determine the version, flags and on-disk layout from what's already stored, + * or if there is nothing stored, select based on pool config. + */ +static int +ddt_configure(ddt_t *ddt) +{ + spa_t *spa = ddt->ddt_spa; + char name[DDT_NAMELEN]; + int error; + + ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); + + boolean_t fdt_enabled = + spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); + boolean_t fdt_active = + spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); + + /* + * First, look for the global DDT stats object. If it's not there, then + * there's never been a DDT written before ever, and we know we're + * starting from scratch. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + if (error != 0) { + if (error != ENOENT) + return (error); + goto new; + } + + if (fdt_active) { + /* + * Now look for a DDT directory. If it exists, then it has + * everything we need. 
+ */ + snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, + zio_checksum_table[ddt->ddt_checksum].ci_name); + + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, + &ddt->ddt_dir_object); + if (error == 0) { + ASSERT3P(spa->spa_meta_objset, ==, ddt->ddt_os); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_VERSION, sizeof (uint64_t), 1, + &ddt->ddt_version); + if (error != 0) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, + DDT_DIR_FLAGS, sizeof (uint64_t), 1, + &ddt->ddt_flags); + if (error != 0) + return (error); + + if (ddt->ddt_version != DDT_VERSION_FDT) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "unknown version %llu", spa->spa_name, + name, (u_longlong_t)ddt->ddt_version); + return (SET_ERROR(EINVAL)); + } + + if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { + zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " + "version=%llu unknown flags %llx", + spa->spa_name, name, + (u_longlong_t)ddt->ddt_version, + (u_longlong_t)ddt->ddt_flags); + return (SET_ERROR(EINVAL)); + } + + return (0); + } + if (error != ENOENT) + return (error); + } + + /* Any object in the root indicates a traditional setup. 
*/ + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { + ddt_object_name(ddt, type, class, name); + uint64_t obj; + error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), + 1, &obj); + if (error == ENOENT) + continue; + if (error != 0) + return (error); + + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + + return (0); + } + } + +new: + /* Nothing on disk, so set up for the best version we can */ + if (fdt_enabled) { + ddt->ddt_version = DDT_VERSION_FDT; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = 0; /* create on first use */ + } else { + ddt->ddt_version = DDT_VERSION_LEGACY; + ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; + ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; + } + + return (0); +} + static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { @@ -838,6 +1051,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; return (ddt); } @@ -871,10 +1085,24 @@ ddt_load(spa_t *spa) ddt_create(spa); + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (!DDT_CHECKSUM_VALID(c)) + continue; + + /* + * Version, flags and dir object are per-table state, so + * configure each table exactly once rather than once per + * type/class pair. + */ + ddt_t *ddt = spa->spa_ddt[c]; + error = ddt_configure(ddt); + if (error != 0) + return (error); + } + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); - if (error) return (error == ENOENT ? 
0 : error); @@ -1132,25 +1360,44 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) DMU_POOL_DDT_STATS, tx); } + if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) + ddt_create_dir(ddt, tx); + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ddt_sync_entry(ddt, dde, tx, txg); ddt_free(dde); } + uint64_t count = 0; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { - uint64_t add, count = 0; + uint64_t add, tcount = 0; for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); VERIFY0(ddt_object_count(ddt, type, class, &add)); - count += add; + tcount += add; } } for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - if (count == 0 && ddt_object_exists(ddt, type, class)) + if (tcount == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } + count += tcount; + } + + if (count == 0) { + /* + * No entries left on the DDT, so reset the version for next + * time. This allows us to handle the feature being changed + * since the DDT was originally created. New entries should get + * whatever the feature currently demands. + */ + if (ddt->ddt_version == DDT_VERSION_FDT) + ddt_destroy_dir(ddt, tx); + + ddt->ddt_version = DDT_VERSION_UNCONFIGURED; + ddt->ddt_flags = 0; } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index e8a94ce209bc..50c1b7a9d09e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -109,5 +109,6 @@ if is_linux || is_freebsd; then "feature@block_cloning" "feature@vdev_zaps_v2" "feature@raidz_expansion" + "feature@fast_dedup" ) fi