From c201bb47319890f062ca39cc2a22022c2f8c04bf Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 27 Dec 2023 16:46:07 -0800 Subject: [PATCH] Special failsafe feature Special failsafe is a feature that allows your special allocation class vdevs ('special' and 'dedup') to fail without losing any data. It works by automatically backing up all special data to the pool. This has the added benefit that you can safely create pools with non-matching alloc class redundancy (like a mirrored pool with a single special device). This behavior is controlled via two properties: 1. feature@special_failsafe - This feature flag enables the special failsafe subsystem. It prevents the backed-up pool from being imported read/write on an older version of ZFS that does not support special failsafe. 2. special_failsafe - This pool property is the main on/off switch to control special failsafe. If you want to use special failsafe, simply turn it on either at creation time or with `zpool set` prior to adding a special alloc class device. After special devices have been added, you can either leave the property on or turn it off, but once it's off you can't turn it back on again. Note that special failsafe may incur a performance penalty relative to pure alloc class writes due to the extra backup copy written to the pool. Alloc class reads should not be affected as they always read from DVA 0 first (the copy of the data on the special device). It can also inflate disk usage on dRAID pools. Closes: #15118 Signed-off-by: Tony Hutter --- cmd/zpool/zpool_main.c | 34 +++ cmd/zpool/zpool_vdev.c | 94 ++++++- include/sys/fs/zfs.h | 2 + include/sys/spa.h | 3 +- include/sys/spa_impl.h | 10 + include/sys/vdev_impl.h | 5 + include/zfeature_common.h | 1 + lib/libnvpair/libnvpair.abi | 10 +- lib/libuutil/libuutil.abi | 47 +++- lib/libzfs/libzfs.abi | 21 +- lib/libzfs/libzfs_util.c | 9 + lib/libzfs_core/libzfs_core.abi | 51 +++- lib/libzutil/zutil_import.c | 10 +- man/man7/zpool-features.7 | 34 +++ man/man7/zpoolconcepts.7 | 8 +- man/man7/zpoolprops.7 | 28 ++ module/zcommon/zfeature_common.c | 12 + module/zcommon/zpool_prop.c | 4 + module/zfs/metaslab.c | 28 +- module/zfs/spa.c | 189 ++++++++++++- module/zfs/spa_misc.c | 19 +- module/zfs/vdev.c | 106 +++++++- module/zfs/vdev_label.c | 44 ++- module/zfs/vdev_root.c | 18 +- module/zfs/zio.c | 29 ++ tests/runfiles/common.run | 8 + tests/zfs-tests/include/libtest.shlib | 37 ++- tests/zfs-tests/tests/Makefile.am | 12 + .../alloc_class/alloc_class_001_pos.ksh | 16 +- .../alloc_class/alloc_class_003_pos.ksh | 44 +-- .../alloc_class/alloc_class_004_pos.ksh | 48 ++-- .../alloc_class/alloc_class_005_pos.ksh | 66 ++--- .../alloc_class/alloc_class_006_pos.ksh | 14 +- .../alloc_class/alloc_class_007_pos.ksh | 14 +- .../alloc_class/alloc_class_008_pos.ksh | 34 +-- .../alloc_class/alloc_class_009_pos.ksh | 58 ++-- .../alloc_class/alloc_class_010_pos.ksh | 29 +- .../alloc_class/alloc_class_011_neg.ksh | 16 +- .../alloc_class/alloc_class_012_pos.ksh | 116 +++++--- .../alloc_class/alloc_class_013_pos.ksh | 41 +-- .../alloc_class/alloc_class_014_neg.ksh | 16 +- .../alloc_class/alloc_class_015_pos.ksh | 30 ++- .../functional/special_failsafe/cleanup.ksh | 27 ++ .../functional/special_failsafe/setup.ksh | 24 ++ .../special_failsafe/special_failsafe.cfg | 36 +++ .../special_failsafe/special_failsafe.kshlib | 255 ++++++++++++++++++ .../special_failsafe/special_failsafe_add.ksh | 96 +++++++ .../special_failsafe_create.ksh | 86 ++++++ .../special_failsafe_files.ksh | 124 +++++++++
.../special_failsafe_import.ksh | 93 +++++++ .../special_failsafe_offline.ksh | 124 +++++++++ .../special_failsafe_prop.ksh | 118 ++++++++ .../special_failsafe_scrub.ksh | 106 ++++++++ .../special_failsafe_split.ksh | 94 +++++++ 54 files changed, 2303 insertions(+), 295 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg create mode 100644 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh create mode 100755 tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d670cd1afeb1..71507540e60b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -1161,6 +1161,23 @@ zpool_do_add(int argc, char **argv) } } + /* + * Special case: + * + * We need to know the special_failsafe pool property value to determine + * if the new vdev configuration has the correct redundancy requirements + * for special and dedup vdevs. + * + * Pass in the current value for special_failsafe to the proplist. + */ + char strval[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval, + ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) { + verify(add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval, + &props, B_TRUE) == 0); + } + /* pass off to make_root_vdev for processing */ nvroot = make_root_vdev(zhp, props, !check_inuse, check_replication, B_FALSE, dryrun, argc, argv); @@ -6810,6 +6827,23 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing) } } + /* + * Special case: + * + * We need to know the special_failsafe pool property value to determine + * if the new vdev configuration has the correct redundancy requirements + * for special and dedup vdevs. + * + * Pass in the current value for special_failsafe to the proplist. + */ + char strval[ZFS_MAXPROPLEN]; + if (zpool_get_prop(zhp, ZPOOL_PROP_SPECIAL_FAILSAFE, strval, + ZFS_MAXPROPLEN, NULL, B_FALSE) == 0) { + verify(add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), strval, + &props, B_TRUE) == 0); + } + nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE, argc, argv); if (nvroot == NULL) { diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index fbd4b81dfacc..cd2996953084 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -85,6 +85,7 @@ */ boolean_t error_seen; boolean_t is_force; +boolean_t is_alloc_class; void vdev_error(const char *fmt, ...) @@ -94,8 +95,15 @@ vdev_error(const char *fmt, ...) 
if (!error_seen) { (void) fprintf(stderr, gettext("invalid vdev specification\n")); - if (!is_force) - (void) fprintf(stderr, gettext("use '-f' to override " - "the following errors:\n")); - else + if (!is_force) { + if (is_alloc_class) { + (void) fprintf(stderr, gettext("Turn on the " + "special_failsafe pool property or use '-f'" + " to override the following errors:\n")); + is_alloc_class = B_FALSE; + } else { + (void) fprintf(stderr, gettext("use '-f' to " + "override the following errors:\n")); + } + } else (void) fprintf(stderr, gettext("the following errors " "must be manually repaired:\n")); @@ -442,6 +450,7 @@ typedef struct replication_level { const char *zprl_type; uint64_t zprl_children; uint64_t zprl_parity; + boolean_t zprl_is_alloc_class; } replication_level_t; #define ZPOOL_FUZZ (16 * 1024 * 1024) @@ -480,13 +489,43 @@ is_raidz_draid(replication_level_t *a, replication_level_t *b) return (B_FALSE); } +/* + * Return true if 'props' contains: + * + * special_failsafe=on + * + * ... and feature@special_failsafe is NOT disabled. + */ +static boolean_t +is_special_failsafe_enabled_in_props(nvlist_t *props) +{ + const char *str = NULL; + + if (nvlist_lookup_string(props, "feature@special_failsafe", + &str) == 0) { + if ((str != NULL) && strcmp(str, "disabled") == 0) { + return (B_FALSE); + } + } + + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_SPECIAL_FAILSAFE), + &str) == 0) { + if ((str != NULL) && strcmp(str, "on") == 0) { + return (B_TRUE); /* It is enabled */ + } + } + + return (B_FALSE); +} + /* * Given a list of toplevel vdevs, return the current replication level. If * the config is inconsistent, then NULL is returned. If 'fatal' is set, then * an error message will be displayed for each self-inconsistent vdev. */ static replication_level_t * -get_replication(nvlist_t *nvroot, boolean_t fatal) +get_replication(nvlist_t *props, nvlist_t *nvroot, boolean_t fatal) { nvlist_t **top; uint_t t, toplevels; @@ -495,7 +534,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) nvlist_t *nv; const char *type; replication_level_t lastrep = {0}; - replication_level_t rep; + replication_level_t rep = {0}; replication_level_t *ret; replication_level_t *raidz, *mirror; boolean_t dontreport; @@ -507,6 +546,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) for (t = 0; t < toplevels; t++) { uint64_t is_log = B_FALSE; + const char *str = NULL; nv = top[t]; @@ -528,12 +568,29 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) strcmp(type, VDEV_TYPE_INDIRECT) == 0) continue; + rep.zprl_type = type; + + /* + * If special_failsafe=on then we know the special allocation + * class devices have at least one copy of their data on the + * pool, so we can ignore their replication level. + */ + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &str); + if (str && + ((strcmp(str, VDEV_ALLOC_BIAS_SPECIAL) == 0) || + (strcmp(str, VDEV_ALLOC_BIAS_DEDUP) == 0))) { + rep.zprl_is_alloc_class = B_TRUE; + if (is_special_failsafe_enabled_in_props(props)) { + continue; /* We're backed up, skip redundancy */ + } + } + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { /* * This is a 'file' or 'disk' vdev. */ - rep.zprl_type = type; rep.zprl_children = 1; rep.zprl_parity = 0; } else { @@ -548,7 +605,6 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * We also check that the size of each vdev (if it can * be determined) is the same.
*/ - rep.zprl_type = type; rep.zprl_children = 0; if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || @@ -808,7 +864,7 @@ get_replication(nvlist_t *nvroot, boolean_t fatal) * report any difference between the two. */ static int -check_replication(nvlist_t *config, nvlist_t *newroot) +check_replication(nvlist_t *props, nvlist_t *config, nvlist_t *newroot) { nvlist_t **child; uint_t children; @@ -825,7 +881,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot) verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - if ((current = get_replication(nvroot, B_FALSE)) == NULL) + if ((current = get_replication(props, nvroot, B_FALSE)) == NULL) return (0); } /* @@ -850,17 +906,31 @@ check_replication(nvlist_t *config, nvlist_t *newroot) * Get the replication level of the new vdev spec, reporting any * inconsistencies found. */ - if ((new = get_replication(newroot, B_TRUE)) == NULL) { + if ((new = get_replication(props, newroot, B_TRUE)) == NULL) { free(current); return (-1); } - /* * Check to see if the new vdev spec matches the replication level of * the current pool. */ ret = 0; if (current != NULL) { + if (current->zprl_is_alloc_class || new->zprl_is_alloc_class) + is_alloc_class = B_TRUE; + else + is_alloc_class = B_FALSE; + + /* + * Special case: + * If there were any redundancy problems with alloc class vdevs + * BUT the pool had special_failsafe on, then we're fine since + * all the alloc class data has a copy in the main pool. + */ + if (is_special_failsafe_enabled_in_props(props) && + is_alloc_class) + goto out; + if (is_raidz_mirror(current, new, &raidz, &mirror) || is_raidz_mirror(new, current, &raidz, &mirror)) { if (raidz->zprl_parity != mirror->zprl_children - 1) { @@ -899,7 +969,7 @@ check_replication(nvlist_t *config, nvlist_t *newroot) ret = -1; } } - +out: free(new); if (current != NULL) free(current); @@ -1888,7 +1958,7 @@ make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep, * found. We include the existing pool spec, if any, as we need to * catch changes against the existing replication level. 
*/ - if (check_rep && check_replication(poolconfig, newroot) != 0) { + if (check_rep && check_replication(props, poolconfig, newroot) != 0) { nvlist_free(newroot); return (NULL); } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 21f99bacccf3..b8c13f8c3440 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -258,6 +258,7 @@ typedef enum { ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, + ZPOOL_PROP_SPECIAL_FAILSAFE, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -1604,6 +1605,7 @@ typedef enum { ZFS_ERR_CRYPTO_NOTSUP, ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS, ZFS_ERR_ASHIFT_MISMATCH, + ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE, } zfs_errno_t; /* diff --git a/include/sys/spa.h b/include/sys/spa.h index b969f05afe48..6405b0496523 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1113,7 +1113,8 @@ extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, extern uint64_t spa_get_last_removal_txg(spa_t *spa); extern boolean_t spa_trust_config(spa_t *spa); extern uint64_t spa_missing_tvds_allowed(spa_t *spa); -extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); +extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing, + uint64_t missing_special); extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); extern uint64_t spa_total_metaslabs(spa_t *spa); extern boolean_t spa_multihost(spa_t *spa); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 0cd0c4720fbe..a0f39df21d1b 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -327,6 +327,13 @@ struct spa { uint64_t spa_missing_tvds; /* unopenable tvds on load */ uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ + /* + * Number of 'spa_missing_tvds' that are alloc class devices + * in the pool that has special_failsafe on, and are thus recoverable + * from errors. 
+ */ + uint64_t spa_missing_recovered_tvds; + uint64_t spa_nonallocating_dspace; spa_removing_phys_t spa_removing_phys; spa_vdev_removal_t *spa_vdev_removal; @@ -465,6 +472,9 @@ struct spa { */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ + + /* Backup special/dedup devices data to the pool */ + boolean_t spa_special_failsafe; }; extern char *spa_config_path; diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 2a93f7c680bc..755f4214be66 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -641,6 +641,11 @@ extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); void vdev_metaslab_group_create(vdev_t *vd); uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b); +extern boolean_t vdev_is_leaf(vdev_t *vd); +extern boolean_t vdev_is_special(vdev_t *vd); +extern boolean_t vdev_is_dedup(vdev_t *vd); +extern boolean_t vdev_is_alloc_class(vdev_t *vd); +extern boolean_t vdev_is_special_failsafe(vdev_t *vd); /* * Vdev ashift optimization tunables diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 2515ba321759..be74255b31c6 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -82,6 +82,7 @@ typedef enum spa_feature { SPA_FEATURE_AVZ_V2, SPA_FEATURE_REDACTION_LIST_SPILL, SPA_FEATURE_RAIDZ_EXPANSION, + SPA_FEATURE_SPECIAL_FAILSAFE, SPA_FEATURES } spa_feature_t; diff --git a/lib/libnvpair/libnvpair.abi b/lib/libnvpair/libnvpair.abi index ef92f3e9bda6..b99a0d6a3373 100644 --- a/lib/libnvpair/libnvpair.abi +++ b/lib/libnvpair/libnvpair.abi @@ -1156,6 +1156,11 @@ + + + + + @@ -2536,11 +2541,6 @@ - - - - - diff --git a/lib/libuutil/libuutil.abi b/lib/libuutil/libuutil.abi index e942d24c6531..620f384d8f5b 100644 --- a/lib/libuutil/libuutil.abi +++ b/lib/libuutil/libuutil.abi @@ -596,14 +596,11 @@ - + - - - - + @@ -800,9 +797,16 @@ + + + + + + + @@ -912,6 +916,25 @@ + + + + + + + + + + + + + + + + + + + @@ -920,12 +943,23 @@ + + + + + + + + + + + @@ -937,8 +971,9 @@ - + + diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 2bbaae6345ab..bf00ab88da06 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -606,7 +606,7 @@ - + @@ -2895,7 +2895,8 @@ - + + @@ -5936,7 +5937,8 @@ - + + @@ -6251,6 +6253,11 @@ + + + + + @@ -6362,7 +6369,7 @@ - + @@ -8987,8 +8994,8 @@ - - + + @@ -9065,7 +9072,7 @@ - + diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 73ae0950ccb6..378de5a6f8ee 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -774,6 +774,15 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
case ZFS_ERR_ASHIFT_MISMATCH: zfs_verror(hdl, EZFS_ASHIFT_MISMATCH, fmt, ap); break; + case ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Cannot set pool prop special_failsafe=on since " + "feature@special_failsafe is not set to 'enabled'.\n" + "This could be because the special_failsafe pool prop was " + "manually turned off while the special_failsafe feature " + "flag was active, or the feature flag was disabled.")); + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; default: zfs_error_aux(hdl, "%s", zfs_strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 5b95c8f779db..ab7231971c26 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -594,14 +594,11 @@ - + - - - - + @@ -770,6 +767,13 @@ + + + + + + + @@ -873,12 +877,42 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -895,8 +929,9 @@ - + + @@ -1119,7 +1154,7 @@ - + @@ -1127,7 +1162,7 @@ - + diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 06705ff4d9b4..6e349920d21f 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1924,7 +1924,7 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp, /* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. */ static boolean_t -vdev_is_leaf(nvlist_t *nv) +vdev_is_leaf_nv(nvlist_t *nv) { uint_t children = 0; nvlist_t **child; @@ -1937,10 +1937,10 @@ vdev_is_leaf(nvlist_t *nv) /* Return if a vdev is a leaf vdev and a real device (disk or file) */ static boolean_t -vdev_is_real_leaf(nvlist_t *nv) +vdev_is_real_leaf_nv(nvlist_t *nv) { const char *type = NULL; - if (!vdev_is_leaf(nv)) + if (!vdev_is_leaf_nv(nv)) return (B_FALSE); (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type); @@ -1973,7 +1973,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv, /* The very first entry in the NV list is a special case */ if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) { - if (real_leaves_only && !vdev_is_real_leaf(nv)) + if (real_leaves_only && !vdev_is_real_leaf_nv(nv)) return (0); *((nvlist_t **)last_nv) = nv; @@ -1996,7 +1996,7 @@ __for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv, * we want. */ if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) { - if (real_leaves_only && !vdev_is_real_leaf_nv(nv)) + if (real_leaves_only && !vdev_is_real_leaf_nv(nv)) return (0); *((nvlist_t **)last_nv) = nv; diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index ea3c68dc6083..9316f7983336 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -322,6 +322,40 @@ With device removal, it can be returned to the .Sy enabled state if all the dedicated allocation class vdevs are removed. . +.feature org.openzfs special_failsafe yes allocation_classes +This feature allows the +.Sy special_failsafe +pool property to be used. +When the +.Sy special_failsafe +pool property is set to "on", all subsequent writes to allocation class vdevs +(like special and dedup vdevs) will also generate an additional copy of the data +to be written to the pool. +This allows alloc class vdev data to be "backed up" to the pool. +A fully backed-up allocation class vdev can fail without causing the pool to be +suspended, even if the alloc class device is not redundant.
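A brief illustration of the behavior described above (a sketch, not part of the patch; the pool and device names are hypothetical, and it assumes feature@special_failsafe is enabled, as it is by default for newly created pools):

    # A single, non-redundant special vdev on an otherwise mirrored pool
    # is accepted without -f because its data is also copied to the pool.
    zpool create -o special_failsafe=on tank mirror sda sdb special sdc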
+.Pp +It is important to note the difference between the +.Sy special_failsafe +feature flag and the +.Sy special_failsafe +pool property, since they appear similar. +The +.Sy special_failsafe +feature flag is a safeguard to prevent a pool that is using special_failsafe +from being imported read/write on an older version of ZFS that does not support +special_failsafe (and possibly compromising the integrity of the backup +guarantees). +The pool property is what actually allows you to turn on/off the backup copy +writes. +The +.Sy special_failsafe +feature will switch from "enabled" to "active" when allocation class devices +are added. +See the +.Sy special_failsafe +pool property for more details. +. .feature com.delphix async_destroy yes Destroying a file system requires traversing all of its data in order to return its used space to the pool. diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 18dfca6dc8ac..5e6b2c0e0db4 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -181,14 +181,18 @@ section. .It Sy dedup A device solely dedicated for deduplication tables. The redundancy of this device should match the redundancy of the other normal -devices in the pool. +devices in the pool unless the +.Sy special_failsafe +pool property is enabled. If more than one dedup device is specified, then allocations are load-balanced between those devices. .It Sy special A device dedicated solely for allocating various kinds of internal metadata, and optionally small file blocks. The redundancy of this device should match the redundancy of the other normal -devices in the pool. +devices in the pool unless the +.Sy special_failsafe +pool property is enabled. If more than one special device is specified, then allocations are load-balanced between those devices. .Pp diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 5428ab8d3076..129f8de52731 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -437,6 +437,34 @@ command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. +.It Sy special_failsafe Ns = Ns Sy on Ns | Ns Sy off +Controls the special failsafe subsystem for special allocation +class vdevs. +When it's turned on, all writes to special allocation class vdevs +(like 'special' and 'dedup' vdevs) will also write an additional copy of the +data to the main pool. +This allows alloc class vdev data to be "backed up" to the pool. +When +.Sy special_failsafe +is turned on, alloc class vdevs can fail regardless of their redundancy level +without the pool losing data. +To use +.Sy special_failsafe +simply turn it on at zpool create time, or turn it on prior to adding +alloc class devices. +It's important to note that after alloc class vdevs are added to the pool with +.Sy special_failsafe +on, you can still turn +.Sy special_failsafe +off again, but once it's off you can't turn it back on. +.Sy special_failsafe +can be freely toggled on/off if alloc class devices haven't been added to the +pool, since the pool prop would have no effect. +The +.Sy feature@special_failsafe +feature flag must be enabled in order to use the +.Sy special_failsafe +pool property. .El .
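A short workflow sketch of the property semantics described above (hypothetical pool and device names; assumes feature@special_failsafe is enabled):

    # The property can be toggled freely until alloc class devices exist.
    zpool create -o special_failsafe=on tank mirror sda sdb
    zpool add tank special sdc            # feature flag becomes active
    zpool set special_failsafe=off tank   # allowed, but one-way from here
    zpool set special_failsafe=on tank    # fails: cannot be re-enabled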
.Ss User Properties diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 309d9bf14cd4..a3583faa8195 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -753,6 +753,18 @@ zpool_feature_init(void) "org.openzfs:raidz_expansion", "raidz_expansion", "Support for raidz expansion", ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t special_failsafe_deps[] = { + SPA_FEATURE_ALLOCATION_CLASSES, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_SPECIAL_FAILSAFE, + "org.openzfs:special_failsafe", "special_failsafe", + "Save a copy of allocation class device data to the main pool", + ZFEATURE_FLAG_MOS, + ZFEATURE_TYPE_BOOLEAN, special_failsafe_deps, + sfeatures); + } zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e2e3bf5be69e..e767c0e3193e 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -153,6 +153,10 @@ zpool_prop_init(void) zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_SPECIAL_FAILSAFE, + "special_failsafe", 0, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "SPECIAL_FAILSAFE", boolean_table, + sfeatures); /* default index properties */ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index c4aa98ced433..036e57a6f83c 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5845,10 +5845,22 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, dva_t *dva = bp->blk_dva; dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; int error = 0; + boolean_t is_special_failsafe = B_FALSE; + + if ((spa->spa_special_failsafe && ((mc == spa_special_class(spa)) || + (mc == spa_dedup_class(spa))))) { + is_special_failsafe = B_TRUE; + } ASSERT0(BP_GET_LOGICAL_BIRTH(bp)); ASSERT0(BP_GET_PHYSICAL_BIRTH(bp)); + /* + * Earlier layers of the code should set ndvas > 1 if the + * alloc class vdev is being backed up. + */ + ASSERT(!(is_special_failsafe && ndvas == 1)); + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); if (mc->mc_allocator[allocator].mca_rotor == NULL) { @@ -5863,7 +5875,21 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, ASSERT3P(zal, !=, NULL); for (int d = 0; d < ndvas; d++) { - error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, + metaslab_class_t *_mc; + if (is_special_failsafe && (d == 1)) { + /* + * If we have the special_failsafe prop set, then make + * the 2nd copy of the data we are going to write go to + * the regular pool rather than yet another copy to the + * alloc class device. That way, if the special device + * is lost, there's still a backup in the pool.
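+			 *
+			 * For example, with ndvas == 2 for a special class
+			 * block, DVA 0 is allocated from the special class
+			 * and DVA 1 from the normal class; reads still try
+			 * DVA 0 (the copy on the special device) first.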
+ */ + _mc = spa_normal_class(spa); + } else { + _mc = mc; + } + + error = metaslab_alloc_dva(spa, _mc, psize, dva, d, hintdva, txg, flags, zal, allocator); if (error != 0) { for (d--; d >= 0; d--) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 3704ffd08820..0c44dfb05f6e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -471,6 +471,22 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) DNODE_MIN_SIZE, ZPROP_SRC_NONE); } + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) { + zprop_source_t src; + if ((uint64_t)spa->spa_special_failsafe == + zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + + spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE, + NULL, spa->spa_special_failsafe, src); + } else { + /* special_failsafe not used */ + spa_prop_add_list(*nvp, ZPOOL_PROP_SPECIAL_FAILSAFE, + NULL, B_FALSE, ZPROP_SRC_NONE); + } + if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, @@ -604,6 +620,27 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; + boolean_t special_failsafe_prop = B_FALSE; + + /* + * The way the feature flags work here is a little interesting. + * + * At zpool creation time, this feature will not be initialized yet when + * spa_prop_validate() gets called. This works out though, as the + * feature flag will be passed in the nvlist if the feature is enabled. + * + * After the pool is created, calls to this function (like zpool set) + * will not include the feature flag in the props nvlist, but the + * feature table will be initialized, so we can use + * spa_feature_is_active(). + */ + boolean_t special_failsafe_feature_disabled; + special_failsafe_feature_disabled = !(spa_feature_is_enabled(spa, + SPA_FEATURE_SPECIAL_FAILSAFE) || spa_feature_is_active(spa, + SPA_FEATURE_SPECIAL_FAILSAFE)); + + /* Did they explicitly pass feature@special_failsafe=enabled? */ + boolean_t special_failsafe_feature_passed = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { @@ -611,6 +648,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) const char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); + spa_feature_t fid = 0; switch (prop) { case ZPOOL_PROP_INVAL: @@ -645,11 +683,30 @@ } fname = strchr(propname, '@') + 1; - if (zfeature_lookup_name(fname, NULL) != 0) { + if (zfeature_lookup_name(fname, &fid) != 0) { error = SET_ERROR(EINVAL); break; } - + /* + * Special case - If both: + * + * SPA_FEATURE_SPECIAL_FAILSAFE = disabled + * + * ... and ... + * + * ZPOOL_PROP_SPECIAL_FAILSAFE = on + * + * then we need to fail. Note that the presence + * of SPA_FEATURE_SPECIAL_FAILSAFE in the + * nvlist means it is enabled (although its + * intval will be 0). If it's disabled, then + * SPA_FEATURE_SPECIAL_FAILSAFE will not + * be in the nvlist at all.
+ */ + if (fid == SPA_FEATURE_SPECIAL_FAILSAFE) { + special_failsafe_feature_passed = + B_TRUE; + } has_feature = B_TRUE; } else { error = SET_ERROR(EINVAL); } @@ -793,6 +850,13 @@ if (strlen(strval) > ZPROP_MAX_COMMENT) error = SET_ERROR(E2BIG); break; + case ZPOOL_PROP_SPECIAL_FAILSAFE: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = SET_ERROR(EINVAL); + if (intval == 1) + special_failsafe_prop = B_TRUE; + break; default: break; @@ -805,6 +869,26 @@ (void) nvlist_remove_all(props, zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); + if (special_failsafe_prop && special_failsafe_feature_disabled && + !special_failsafe_feature_passed) { + /* + * We can't enable the SPECIAL_FAILSAFE pool prop if the + * feature flag SPA_FEATURE_SPECIAL_FAILSAFE is + * disabled. + */ + error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE); + } + + /* + * If the user wants to turn on the special_failsafe prop, but it + * was turned off (while the feature was active), then it can't be + * turned on again. + */ + if (spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE) && + !spa->spa_special_failsafe && special_failsafe_prop) { + error = SET_ERROR(ZFS_ERR_SPECIAL_FAILSAFE_NOT_POSSIBLE); + } + if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); @@ -2485,6 +2569,53 @@ spa_check_removed(vdev_t *vd) } } +/* + * Decide what to do if we have missing/corrupted alloc class devices. + * + * If we have missing top-level vdevs and they are all alloc class devices with + * special_failsafe set, then we may still be able to import the pool. + */ +static int +spa_check_for_bad_alloc_class_devices(spa_t *spa) +{ + if (spa->spa_missing_recovered_tvds == 0) + return (0); + + /* + * Are there missing alloc class devices but + * SPA_FEATURE_SPECIAL_FAILSAFE is not active? If so, + * then we can't import. + */ + if (!spa_feature_is_active(spa, SPA_FEATURE_SPECIAL_FAILSAFE)) { + spa_load_note(spa, "some alloc class devices are missing, " + "cannot import."); + return (SET_ERROR(ENXIO)); + } + + /* + * If all the missing top-level devices are alloc class devices, and + * if they have all their data backed up to the pool, then we can still + * import the pool. + */ + if (spa->spa_missing_tvds > 0 && + spa->spa_missing_tvds == spa->spa_missing_recovered_tvds) { + spa_load_note(spa, "only alloc class devices are missing, and " + "the normal pool has copies of the alloc class data, so " + "it's still possible to import."); + return (0); + } + + /* + * If we're here, then it means that not all the missing top-level vdevs + * were alloc class devices. This should have been caught earlier. + */ + spa_load_note(spa, "some alloc class devices that do not have a " + "special_failsafe backup copy are among those that are missing, " + "cannot import"); + + return (SET_ERROR(ENXIO)); +} + static int spa_check_for_missing_logs(spa_t *spa) { @@ -3914,7 +4045,24 @@ spa_ld_open_vdevs(spa_t *spa) error = vdev_open(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); - if (spa->spa_missing_tvds != 0) { + if (spa->spa_missing_tvds != 0 && + spa->spa_missing_tvds == spa->spa_missing_recovered_tvds && + (error == 0 || error == ENOENT)) { + /* + * Special case: If all the missing top-level vdevs are special + * devices, we may or may not be able to import the pool, + * depending on whether the relevant special_failsafe feature and + * property are set.
At this early stage of import we do not + * have the feature flags loaded yet, so for now proceed + * with the import. We will do the backup checks later after + * the feature flags are loaded. + */ + spa_load_note(spa, "vdev tree has %lld missing special " + "top-level vdevs. Keep importing for now until we " + "can check the feature flags.", + (u_longlong_t)spa->spa_missing_tvds); + error = 0; + } else if (spa->spa_missing_tvds != 0) { spa_load_note(spa, "vdev tree has %lld missing top-level " "vdevs.", (u_longlong_t)spa->spa_missing_tvds); if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { @@ -4673,6 +4821,7 @@ spa_ld_get_props(spa_t *spa) if (error == 0) { uint64_t autoreplace = 0; + uint64_t special_failsafe = 0; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); @@ -4681,7 +4830,11 @@ spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); + spa_prop_find(spa, ZPOOL_PROP_SPECIAL_FAILSAFE, + &special_failsafe); + spa->spa_autoreplace = (autoreplace != 0); + spa->spa_special_failsafe = (special_failsafe != 0); } /* @@ -5337,6 +5490,13 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) if (error != 0) return (error); + spa_import_progress_set_notes(spa, "Checking for bad alloc class " + "devices"); + error = spa_check_for_bad_alloc_class_devices(spa); + if (error != 0) + return (error); + + spa_import_progress_set_notes(spa, "Loading dedup tables"); error = spa_ld_load_dedup_tables(spa); if (error != 0) @@ -6521,6 +6681,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); + /* + * Set initial special_failsafe settings. These may change after the + * nvlist properties are processed a little later in spa_sync_props(). + */ + spa->spa_special_failsafe = (boolean_t) + zpool_prop_default_numeric(ZPOOL_PROP_SPECIAL_FAILSAFE); + if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); @@ -9381,6 +9548,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) const char *elemname = nvpair_name(elem); zprop_type_t proptype; spa_feature_t fid; + boolean_t boolval; switch (prop = zpool_name_to_prop(elemname)) { case ZPOOL_PROP_VERSION: @@ -9444,6 +9612,21 @@ "%s=%s", nvpair_name(elem), strval); break; + case ZPOOL_PROP_SPECIAL_FAILSAFE: + boolval = (boolean_t)fnvpair_value_uint64(elem); + spa->spa_special_failsafe = boolval; + /* + * Dirty the configuration on vdevs as above. + */ + if (tx->tx_txg != TXG_INITIAL) { + vdev_config_dirty(spa->spa_root_vdev); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), boolval ?
"on" : "off"); + break; + case ZPOOL_PROP_INVAL: if (zpool_prop_feature(elemname)) { fname = strchr(elemname, '@') + 1; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 68b907614196..63dc686aafe9 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -721,6 +721,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); + + spa->spa_special_failsafe = B_TRUE; + spa_set_deadman_failmode(spa, zfs_deadman_failmode); spa_set_allocator(spa, zfs_active_allocator); @@ -1644,6 +1647,9 @@ spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) */ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); + + if (spa->spa_special_failsafe) + spa_feature_incr(spa, SPA_FEATURE_SPECIAL_FAILSAFE, tx); } /* @@ -2812,10 +2818,21 @@ spa_syncing_log_sm(spa_t *spa) return (spa->spa_syncing_log_sm); } +/* + * Record the total number of missing top-level vdevs ('missing'), and the + * number of missing top-level vdevs that are recoverable ('missing_recovered'). + * In this case, missing_recovered is the number of top-level alloc class vdevs + * that are recoverable since the special_failsafe pool prop was on, and thus + * their data is "backed up" to the main pool. + * + * The separate 'missing_recovered' count is used during pool import to + * determine if we can import a pool with missing alloc class vdevs. + */ void -spa_set_missing_tvds(spa_t *spa, uint64_t missing) +spa_set_missing_tvds(spa_t *spa, uint64_t missing, uint64_t missing_recovered) { spa->spa_missing_tvds = missing; + spa->spa_missing_recovered_tvds = missing_recovered; } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ebba453e2b14..af675e88a8c9 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -722,6 +722,60 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) return (vd); } +boolean_t +vdev_is_leaf(vdev_t *vd) +{ + return (vd->vdev_children == 0); +} + +/* Return true if vdev or TLD vdev is special alloc class */ +boolean_t +vdev_is_special(vdev_t *vd) +{ + if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL) + return (B_TRUE); + + /* + * If the vdev is a leaf vdev, and is part of a mirror, its parent + * 'mirror' TLD will have vdev_alloc_bias == VDEV_BIAS_SPECIAL, but the + * leaf vdev itself will not. So we also need to check the parent + * in those cases. + */ + if (vdev_is_leaf(vd) && + (vd->vdev_parent != NULL && vdev_is_special(vd->vdev_parent))) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* Return true if vdev or TLD vdev is dedup alloc class */ +boolean_t +vdev_is_dedup(vdev_t *vd) +{ + if (vd->vdev_alloc_bias == VDEV_BIAS_DEDUP) + return (B_TRUE); + + /* + * If the vdev is a leaf vdev, and is part of a mirror, it's parent + * 'mirror' TLD will have vdev_alloc_bias == VDEV_BIAS_DEDUP, but the + * leaf vdev itself will not. So we also need to check the parent + * in those cases. + */ + if (vdev_is_leaf(vd) && + (vd->vdev_parent != NULL && vdev_is_dedup(vd->vdev_parent))) { + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +vdev_is_alloc_class(vdev_t *vd) +{ + return (vdev_is_special(vd) || vdev_is_dedup(vd)); +} + /* * Allocate a new vdev. 
The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly @@ -740,6 +794,7 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); + const char *bias = NULL; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -791,8 +846,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { - const char *bias; - /* * If creating a top-level vdev, check for allocation * classes input. @@ -834,6 +887,11 @@ vd->vdev_tsd = tsd; vd->vdev_islog = islog; + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + alloc_bias = vdev_derive_alloc_bias(bias); + } + if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; @@ -3680,8 +3738,9 @@ vdev_load(vdev_t *vd) VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { - ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); - vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); + if (vd->vdev_alloc_bias == VDEV_BIAS_NONE) + vd->vdev_alloc_bias = + vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -4140,7 +4199,8 @@ vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { + if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) && + vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -4356,8 +4416,8 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) * don't allow it to be offlined. Log devices are always * expendable. */ - if (!tvd->vdev_islog && vd->vdev_aux == NULL && - vdev_dtl_required(vd)) + if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) && + vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); @@ -4413,7 +4473,8 @@ vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) vd->vdev_offline = B_TRUE; vdev_reopen(tvd); - if (!tvd->vdev_islog && vd->vdev_aux == NULL && + if (!tvd->vdev_islog && !vdev_is_special_failsafe(vd) && + vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); @@ -5259,10 +5320,14 @@ vdev_propagate_state(vdev_t *vd) * device, treat the root vdev as if it were * degraded. */ - if (child->vdev_islog && vd == rvd) + if ((child->vdev_islog || + vdev_is_special_failsafe(child)) && + (vd == rvd)) { degraded++; - else + } else { faulted++; + } + } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } @@ -5438,8 +5503,9 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) zfs_post_state_change(spa, vd, save_state); } - if (!isopen && vd->vdev_parent) + if (!isopen && vd->vdev_parent) { vdev_propagate_state(vd->vdev_parent); + } } boolean_t @@ -5507,6 +5573,24 @@ vdev_log_state_valid(vdev_t *vd) return (B_FALSE); } +/* + * Is the vdev an alloc class vdev that is part of a pool that has + * special_failsafe on, and thus has all its data backed up to the main pool?
+ * + * This function works for both top-level vdevs and leaf vdevs. + */ +boolean_t +vdev_is_special_failsafe(vdev_t *vd) +{ + if (vdev_is_alloc_class(vd)) + return (vd->vdev_spa->spa_special_failsafe); + + if (vdev_is_leaf(vd)) + return (vdev_is_special_failsafe(vd->vdev_parent)); + + return (B_FALSE); +} + /* * Expand a vdev if possible. */ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index c31f48028bbc..9e1a3088815a 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -521,8 +521,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_removing); } - /* zpool command expects alloc class data */ - if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) { const char *bias = NULL; switch (vd->vdev_alloc_bias) { @@ -539,6 +538,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, ASSERT3U(vd->vdev_alloc_bias, ==, VDEV_BIAS_NONE); } + fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, bias); } @@ -1804,9 +1804,10 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) spa_t *spa = svd[0]->vdev_spa; zio_t *zio; uint64_t good_writes = 0; + boolean_t failure_but_special_failsafe = B_FALSE; + int rc; zio = zio_root(spa, NULL, NULL, flags); - for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); @@ -1850,7 +1851,38 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) (void) zio_wait(zio); - return (good_writes >= 1 ? 0 : EIO); + /* + * Special case: + * + * If we had zero good writes, but all the writes were to alloc class + * disks that were on a pool with special_failsafe on, then it's not + * fatal. + */ + if (good_writes == 0) { + failure_but_special_failsafe = B_TRUE; + for (int v = 0; v < svdcount; v++) { + if (!vdev_is_special_failsafe(svd[v])) { + failure_but_special_failsafe = B_FALSE; + break; + } + } + } + + if (good_writes >= 1) { + /* success */ + rc = 0; + } else if (failure_but_special_failsafe) { + /* + * All the failures are on allocation class disks that were + * fully backed up to the pool, so this isn't fatal. + */ + rc = 0; + } else { + /* failure */ + rc = EIO; + } + + return (rc); } /* @@ -1966,7 +1998,8 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); zio_t *vio = zio_null(zio, spa, NULL, - (vd->vdev_islog || vd->vdev_aux != NULL) ? + (vd->vdev_islog || vd->vdev_aux != NULL || + vdev_is_special_failsafe(vd)) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); vdev_label_sync(vio, good_writes, vd, l, txg, flags); @@ -2019,6 +2052,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) if (error != 0) { if ((flags & ZIO_FLAG_TRYHARD) != 0) return (error); + flags |= ZIO_FLAG_TRYHARD; } diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index e132643dc330..3833bdf89d8d 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -32,6 +32,7 @@ #include #include #include +#include /* * Virtual device vector for the pool's root vdev. 
@@ -46,6 +47,7 @@ vdev_root_core_tvds(vdev_t *vd) vdev_t *cvd = vd->vdev_child[c]; if (!cvd->vdev_ishole && !cvd->vdev_islog && + !vdev_is_special_failsafe(cvd) && cvd->vdev_ops != &vdev_indirect_ops) { tvds++; } @@ -87,6 +89,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, spa_t *spa = vd->vdev_spa; int lasterror = 0; int numerrors = 0; + int numerrors_recovered = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; @@ -97,18 +100,25 @@ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - if (cvd->vdev_open_error && !cvd->vdev_islog && cvd->vdev_ops != &vdev_indirect_ops) { lasterror = cvd->vdev_open_error; numerrors++; + if (vdev_is_special_failsafe(cvd)) + numerrors_recovered++; } } - if (spa_load_state(spa) != SPA_LOAD_NONE) - spa_set_missing_tvds(spa, numerrors); + if (spa_load_state(spa) != SPA_LOAD_NONE) { + spa_set_missing_tvds(spa, numerrors, numerrors_recovered); + } - if (too_many_errors(vd, numerrors)) { + if (numerrors != 0 && (numerrors == numerrors_recovered)) { + vdev_dbgmsg(vd, "there were %lu top-level errors, but they were" + " all alloc class vdevs with special_failsafe. Keep trying " + "to import.", + (long unsigned) numerrors); + } else if (too_many_errors(vd, numerrors)) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 08d56eef83e9..70d5a2d23946 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3505,6 +3505,19 @@ zio_ddt_write(zio_t *zio) ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); + /* + * Dedup writes can go either to a dedicated dedup device or to a + * dedicated special device. If we have special_failsafe on, we need + * to make an extra copy of the data go to the pool. To do this + * we need to adjust the ZIO's copies here so the later stages in the + * ZIO pipeline work correctly. + */ + if (spa->spa_special_failsafe && zp->zp_copies == 1) { + zp->zp_copies = 2; + } + + p = zp->zp_copies; + ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); ddp = &dde->dde_phys[p]; @@ -3635,6 +3648,22 @@ zio_dva_throttle(zio_t *zio) mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); + /* + * If the special_failsafe pool prop is enabled, we will do the regular + * write to the special/dedup device and an additional "backup" + * write to the normal pool. That way if the special/dedup devices + * all fail, we don't lose all data in our pool. + * + * Reserve that 2nd write to the regular pool here. The DVAs + * for both writes will later be allocated in the + * next step in the ZIO pipeline in + * zio_dva_allocate()->metaslab_alloc().
+ */ + if ((spa->spa_special_failsafe && (mc == spa_special_class(spa) || + mc == spa_dedup_class(spa))) && zio->io_prop.zp_copies == 1) { + zio->io_prop.zp_copies = 2; + } + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || !mc->mc_alloc_throttle_enabled || zio->io_child_type == ZIO_CHILD_GANG || diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 912344b4edde..2ae12b112bcd 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -53,6 +53,14 @@ tags = ['functional', 'arc'] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] + +[tests/functional/special_failsafe] +tests = ['special_failsafe_add', 'special_failsafe_create', + 'special_failsafe_files', 'special_failsafe_import', + 'special_failsafe_offline', 'special_failsafe_prop', + 'special_failsafe_scrub', 'special_failsafe_split'] +tags = ['functional', 'special_failsafe'] + [tests/functional/bclone] tests = ['bclone_crossfs_corner_cases_limited', 'bclone_crossfs_data', diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index dfab48d2cdaf..49a508cc5972 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1081,6 +1081,16 @@ function get_pool_prop # property pool zpool get -Hpo value "$prop" "$pool" || log_fail "zpool get $prop $pool" } +# Get the specified vdev property in parsable format or fail +function get_vdev_prop +{ + typeset prop=$1 + typeset pool=$2 + typeset vdev=$3 + + zpool get -Hpo value "$prop" "$pool" "$vdev" || log_fail "zpool get $prop $pool $vdev" +} + # Return 0 if a pool exists; $? otherwise # # $1 - pool name @@ -1815,7 +1825,7 @@ function verify_pool function get_disklist # pool { echo $(zpool iostat -v $1 | awk '(NR > 4) {print $1}' | \ - grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)|\-[0-9]$") + grep -vEe '^-----' -e "^(mirror|raidz[1-3]|draid[1-3]|spare|log|cache|special|dedup)(\-[0-9])+$") } # @@ -3907,3 +3917,28 @@ function pop_coredump_pattern ;; esac } + +# Get a list of all vdevs in the pool that are a certain type. 
+# +# The returned list is in a space-separated string, with the full path of each +# vdev included: +# +# "/dev/sda /dev/sdb /dev/sdc" +# +# $1: Type of disk to get ('special', 'dedup', 'log', 'cache', 'spare') +# $2: (optional) pool name +function get_list_of_vdevs_that_are { + poolname=${2:-$TESTPOOL} + + zpool status -P $poolname | sed -r '/\s+(mirror|draid|raidz)/d' | \ + awk -v token="$1" '{ + if (tmp == 1 && substr($1,1,1) == "/") { + if (first != 1) { + printf "%s", $1; + first=1; + } else { + printf " %s", $1; + } + } else {tmp=0}; if ($1 == token) {tmp=1}} + END {print ""}' +} diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index db6b4c0146a7..cd5cbdb92eb3 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -90,6 +90,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ + functional/special_failsafe/special_failsafe.cfg \ + functional/special_failsafe/special_failsafe.kshlib \ functional/bclone/bclone.cfg \ functional/bclone/bclone_common.kshlib \ functional/bclone/bclone_corner_cases.kshlib \ @@ -441,6 +443,16 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ + functional/special_failsafe/special_failsafe_add.ksh \ + functional/special_failsafe/special_failsafe_create.ksh \ + functional/special_failsafe/special_failsafe_files.ksh \ + functional/special_failsafe/special_failsafe_import.ksh \ + functional/special_failsafe/special_failsafe_prop.ksh \ + functional/special_failsafe/special_failsafe_offline.ksh \ + functional/special_failsafe/special_failsafe_scrub.ksh \ + functional/special_failsafe/special_failsafe_split.ksh \ + functional/special_failsafe/cleanup.ksh \ + functional/special_failsafe/setup.ksh \ functional/bclone/bclone_crossfs_corner_cases.ksh \ functional/bclone/bclone_crossfs_corner_cases_limited.ksh \ functional/bclone/bclone_crossfs_data.ksh \ diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh index 3237d7cb784f..4ea64f8318e6 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh @@ -32,12 +32,16 @@ log_assert $claim log_onexit cleanup log_must disk_setup -for type in special dedup; do - log_mustnot zpool create -d $TESTPOOL $CLASS_DISK0 $type $CLASS_DISK1 + +for arg in '-o special_failsafe=on' '' ; do + for type in special dedup; do + log_mustnot zpool create $arg -d $TESTPOOL $CLASS_DISK0 $type \ + $CLASS_DISK1 + done + log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must display_status "$TESTPOOL" + log_must zpool destroy -f "$TESTPOOL" done -log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 -log_must display_status "$TESTPOOL" -log_must zpool destroy -f "$TESTPOOL" log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh index 78d40ce56d4e..7ab6552ebb0c 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh @@ -31,27 +31,29 @@ log_onexit cleanup log_must disk_setup
-for type in "" "mirror" "raidz" -do - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS - - if [ "$type" = "mirror" ]; then - log_must zpool add $TESTPOOL special mirror \ - $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 - elif [ "$type" = "raidz" ]; then - log_must zpool add $TESTPOOL special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 - else - log_must zpool add $TESTPOOL special $CLASS_DISK0 - log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 - fi - - log_must zpool destroy -f $TESTPOOL +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS + + if [ "$type" = "mirror" ]; then + log_must zpool add $TESTPOOL special mirror \ + $CLASS_DISK0 $CLASS_DISK1 $CLASS_DISK2 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK2 + elif [ "$type" = "raidz" ]; then + log_must zpool add $TESTPOOL special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK1 + else + log_must zpool add $TESTPOOL special $CLASS_DISK0 + log_must zpool iostat -H $TESTPOOL $CLASS_DISK0 + fi + + log_must zpool destroy -f $TESTPOOL + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh index 04ce486adb83..131bf79ff306 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh @@ -36,31 +36,35 @@ typeset ac_value typeset stype="" typeset sdisks="" -for type in "" "mirror" "raidz" -do - if [ "$type" = "mirror" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" - elif [ "$type" = "raidz" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1}" - else - stype="" - sdisks="${CLASS_DISK0}" - fi +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + if [ "$type" = "mirror" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" + elif [ "$type" = "raidz" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1}" + else + stype="" + sdisks="${CLASS_DISK0}" + fi - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \ - special $stype $sdisks + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \ + special $stype $sdisks - ac_value="$(zpool get -H -o property,value all | awk '/allocation_classes/ {print $2}')" - if [ "$ac_value" = "active" ]; then - log_note "feature@allocation_classes is active" - else - log_fail "feature@allocation_classes not active, \ - status = $ac_value" - fi + ac_value="$(zpool get -H -o property,value \ + feature@allocation_classes | \ + awk '/allocation_classes/ {print $2}')" + if [ "$ac_value" = "active" ]; then + log_note "feature@allocation_classes is active" + else + log_fail "feature@allocation_classes not active, \ + status = $ac_value" + fi - log_must zpool destroy -f $TESTPOOL + log_must zpool destroy -f $TESTPOOL + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh 
b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh index 08c703e21acb..6e74b0a6b465 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh @@ -34,38 +34,44 @@ log_must disk_setup typeset ac_value -for type in "" "mirror" "raidz" -do - if [ "$type" = "mirror" ]; then - log_must zpool create $TESTPOOL $type $ZPOOL_DISK0 $ZPOOL_DISK1 - else - log_must zpool create $TESTPOOL $type $ZPOOL_DISKS - fi - ac_value="$(zpool get -H -o property,value all | \ - awk '/allocation_classes/ {print $2}')" - if [ "$ac_value" = "enabled" ]; then - log_note "feature@allocation_classes is enabled" - else - log_fail "feature@allocation_classes not enabled, \ - status = $ac_value" - fi +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + if [ "$type" = "mirror" ]; then + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISK0 \ + $ZPOOL_DISK1 + else + log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS + fi + ac_value="$(zpool get -H -o property,value \ + feature@allocation_classes | \ + awk '/allocation_classes/ {print $2}')" + if [ "$ac_value" = "enabled" ]; then + log_note "feature@allocation_classes is enabled" + else + log_fail "feature@allocation_classes not enabled, \ + status = $ac_value" + fi - if [ "$type" = "" ]; then - log_must zpool add $TESTPOOL special $CLASS_DISK0 - else - log_must zpool add $TESTPOOL special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - fi - ac_value="$(zpool get -H -o property,value all | \ - awk '/allocation_classes/ {print $2}')" - if [ "$ac_value" = "active" ]; then - log_note "feature@allocation_classes is active" - else - log_fail "feature@allocation_classes not active, \ - status = $ac_value" - fi + if [ "$type" = "" ]; then + log_must zpool add $TESTPOOL special $CLASS_DISK0 + else + log_must zpool add $TESTPOOL special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + fi + ac_value="$(zpool get -H -o property,value \ + feature@allocation_classes | \ + awk '/allocation_classes/ {print $2}')" - log_must zpool destroy -f $TESTPOOL + if [ "$ac_value" = "active" ]; then + log_note "feature@allocation_classes is active" + else + log_fail "feature@allocation_classes not active, \ + status = $ac_value" + fi + + log_must zpool destroy -f $TESTPOOL + done done log_pass "Values of allocation_classes feature flag correct." 
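The alloc_class changes above all follow one pattern: the existing test body is wrapped in an outer loop that runs it twice, once with '-o special_failsafe=on' and once with no extra create-time option, so every legacy alloc class scenario is also exercised with the failsafe copy enabled. Below is a minimal standalone sketch of that pattern; the pool name and file-backed vdev paths are placeholders invented for illustration, not part of this patch:

	#!/bin/ksh -p
	# Sketch only: run the same test body with and without special_failsafe.
	POOL=sketchpool
	DISKS="/var/tmp/sk-0 /var/tmp/sk-1 /var/tmp/sk-2"
	SPECIAL_DISKS="/var/tmp/sk-3 /var/tmp/sk-4"
	for f in $DISKS $SPECIAL_DISKS ; do truncate -s 200M $f ; done

	for arg in '-o special_failsafe=on' '' ; do
		zpool create $arg $POOL raidz $DISKS special mirror $SPECIAL_DISKS

		# With the property on at create time the feature flag should be
		# 'active'; with the property off it should only be 'enabled'
		# (this mirrors the boilerplate_check helper used by the new
		# special_failsafe tests later in this patch).
		state=$(zpool get -H -o value feature@special_failsafe $POOL)
		if [ -n "$arg" ] ; then expected="active" ; else expected="enabled" ; fi
		[ "$state" = "$expected" ] || echo "unexpected feature state: $state"

		zpool destroy -f $POOL
	done
	rm -f $DISKS $SPECIAL_DISKS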
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh
index 5852b2876e89..fc20fea6d096 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh
@@ -32,10 +32,14 @@ log_onexit cleanup
 log_must disk_setup
-log_must zpool create $TESTPOOL \
-	mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
-	special mirror $CLASS_DISK0 $CLASS_DISK1
-log_must zpool split $TESTPOOL split_pool
-log_must zpool destroy -f $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+	log_must zpool create $arg $TESTPOOL \
+		mirror $ZPOOL_DISK0 $ZPOOL_DISK1 \
+		special mirror $CLASS_DISK0 $CLASS_DISK1
+	log_must zpool split $TESTPOOL split_pool
+	log_must zpool import -d $(dirname $CLASS_DISK1) split_pool
+	log_must zpool destroy -f $TESTPOOL
+	log_must zpool destroy -f split_pool
+done
 log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
index 106a6d933aac..a08732e6248f 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
@@ -31,11 +31,13 @@ log_onexit cleanup
 log_must disk_setup
-log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS \
-	special mirror $CLASS_DISK0 $CLASS_DISK1
-log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
-log_must sleep 10
-log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
-log_must zpool destroy -f $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+	log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS \
+		special mirror $CLASS_DISK0 $CLASS_DISK1
+	log_must zpool replace $TESTPOOL $CLASS_DISK1 $CLASS_DISK2
+	log_must sleep 10
+	log_must zpool iostat -H $TESTPOOL $CLASS_DISK2
+	log_must zpool destroy -f $TESTPOOL
+done
 log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh
index f73fbbe38c9b..2ac1024e351d 100755
--- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh
+++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh
@@ -35,22 +35,24 @@ typeset special_type=""
 typeset create_disks=""
 typeset added_disks=""
-for type in "" "raidz"
-do
-	if [ "$type" = "raidz" ]; then
-		special_type="mirror"
-		create_disks="${CLASS_DISK0} ${CLASS_DISK1}"
-		added_disks="${CLASS_DISK2} ${CLASS_DISK3}"
-	else
-		special_type=""
-		create_disks="${CLASS_DISK0}"
-		added_disks="${CLASS_DISK1}"
-	fi
-	log_must zpool create $TESTPOOL $type $ZPOOL_DISKS \
-		special $special_type $create_disks
-	log_must zpool add $TESTPOOL special $special_type $added_disks
-	log_must zpool iostat $TESTPOOL $added_disks
-	log_must zpool destroy -f $TESTPOOL
+for arg in '-o special_failsafe=on' '' ; do
+	for type in "" "raidz"
+	do
+		if [ "$type" = "raidz" ]; then
+			special_type="mirror"
+			create_disks="${CLASS_DISK0} ${CLASS_DISK1}"
+			added_disks="${CLASS_DISK2} ${CLASS_DISK3}"
+		else
+			special_type=""
+			create_disks="${CLASS_DISK0}"
+			added_disks="${CLASS_DISK1}"
+		fi
+		log_must zpool create $arg $TESTPOOL $type $ZPOOL_DISKS \
+			special $special_type $create_disks
+		log_must zpool add $TESTPOOL special $special_type $added_disks
+		log_must zpool iostat $TESTPOOL $added_disks
+		log_must zpool destroy -f $TESTPOOL
+	done
 done
 log_pass $claim
diff --git
a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh index e8061fdabcbd..db9fa468eab2 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh @@ -35,35 +35,39 @@ typeset stype="" typeset sdisks="" typeset props="" -for type in "" "mirror" "raidz" -do - if [ "$type" = "mirror" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" - props="-o ashift=12" - elif [ "$type" = "raidz" ]; then - stype="mirror" - sdisks="${CLASS_DISK0} ${CLASS_DISK1}" - else - stype="" - sdisks="${CLASS_DISK0}" - fi +for arg in '-o special_failsafe=on' '' ; do + for type in "" "mirror" "raidz" + do + if [ "$type" = "mirror" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2}" + props="-o ashift=12" + elif [ "$type" = "raidz" ]; then + stype="mirror" + sdisks="${CLASS_DISK0} ${CLASS_DISK1}" + else + stype="" + sdisks="${CLASS_DISK0}" + fi - # - # 1/3 of the time add the special vdev after creating the pool - # - if [ $((RANDOM % 3)) -eq 0 ]; then - log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS - log_must zpool add ${props} $TESTPOOL special $stype $sdisks - else - log_must zpool create ${props} $TESTPOOL $type $ZPOOL_DISKS \ - special $stype $sdisks - fi + # + # 1/3 of the time add the special vdev after creating the pool + # + if [ $((RANDOM % 3)) -eq 0 ]; then + log_must zpool create $arg ${props} $TESTPOOL $type \ + $ZPOOL_DISKS + log_must zpool add ${props} $TESTPOOL special $stype \ + $sdisks + else + log_must zpool create $arg ${props} $TESTPOOL $type \ + $ZPOOL_DISKS special $stype $sdisks + fi - log_must zpool export $TESTPOOL - log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL - log_must display_status $TESTPOOL - log_must zpool destroy -f $TESTPOOL + log_must zpool export $TESTPOOL + log_must zpool import -d $TEST_BASE_DIR -s $TESTPOOL + log_must display_status $TESTPOOL + log_must zpool destroy -f $TESTPOOL + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh index cbf5cbf89bdc..913f03f72fcb 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh @@ -32,19 +32,22 @@ log_onexit cleanup log_must disk_setup -log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - -for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072 -do - log_must zfs set special_small_blocks=$value $TESTPOOL - ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \ - awk '/special_small_blocks/ {print $3}') - if [ "$ACTUAL" != "$value" ] - then - log_fail "v. $ACTUAL set for $TESTPOOL, expected v. $value!" - fi +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + + for value in 0 512 1024 2048 4096 8192 16384 32768 65536 131072 + do + log_must zfs set special_small_blocks=$value $TESTPOOL + ACTUAL=$(zfs get -p special_small_blocks $TESTPOOL | \ + awk '/special_small_blocks/ {print $3}') + if [ "$ACTUAL" != "$value" ] + then + log_fail "v. $ACTUAL set for $TESTPOOL, expected v. 
$value" + fi + done + + log_must zpool destroy -f "$TESTPOOL" done -log_must zpool destroy -f "$TESTPOOL" log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh index 0be49b858758..ffc8b84468dc 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh @@ -32,13 +32,17 @@ log_assert $claim log_onexit cleanup log_must disk_setup -log_must zpool create $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 -for value in 256 1025 33554432 -do - log_mustnot zfs set special_small_blocks=$value $TESTPOOL +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + + for value in 256 1025 33554432 + do + log_mustnot zfs set special_small_blocks=$value $TESTPOOL + done + + log_must zpool destroy -f "$TESTPOOL" done -log_must zpool destroy -f "$TESTPOOL" log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh index 0b1c18bafdaf..16d25a3f282a 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh @@ -25,20 +25,20 @@ verify_runnable "global" # -# Verify the file identified by the input is written on a special vdev -# According to the pool layout used in this test vdev_id 3 and 4 are special -# XXX: move this function to libtest.shlib once we get "Vdev Properties" +# Given a dataset and an inode number, return a list of all the vdev numbers +# that the inode has blocks on. # -function file_in_special_vdev # +# For example, if the inode has blocks on vdevs 0, 1 and 2, this would return +# the string "0 1 2" +# +function vdevs_file_is_on # { typeset dataset="$1" typeset inum="$2" - typeset num_normal=$(echo $ZPOOL_DISKS | wc -w) - num_normal=${num_normal##* } - - zdb -dddddd $dataset $inum | awk -v d=$num_normal '{ + zdb -dddddd $dataset $inum | awk ' +/L0 [0-9]+/{ # find DVAs from string "offset level dva" only for L0 (data) blocks -if (match($0,"L0 [0-9]+")) { +# if (match($0,"L0 [0-9]+")) { dvas[0]=$3 dvas[1]=$4 dvas[2]=$5 @@ -50,25 +50,46 @@ if (match($0,"L0 [0-9]+")) { print "Error parsing DVA: <" dva ">"; exit 1; } - # verify vdev is "special" - if (arr[1] < d) { - exit 1; - } + count[arr[1]]++; } } -}}' +#} +} +END { + # Print out the unique vdev numbers that had data + firstprint=1; + for (i in count) { + if (firstprint==1) { + printf("%d", i); + firstprint=0; + } else { + printf(" %d", i); + } + } +} +' } # # Check that device removal works for special class vdevs # +# $1: Set to 1 to backup alloc class data to the pool. Leave blank to disable +# backup. 
function check_removal { + typeset backup + if [ "$1" == "1" ] ; then + backup=1 + args="-o special_failsafe=on" + else + backup=0 + args="" + fi + # # Create a non-raidz pool so we can remove top-level vdevs # - log_must disk_setup - log_must zpool create $TESTPOOL $ZPOOL_DISKS \ + log_must zpool create $args $TESTPOOL $ZPOOL_DISKS \ special $CLASS_DISK0 special $CLASS_DISK1 log_must display_status "$TESTPOOL" @@ -93,19 +114,49 @@ function check_removal for i in 1 2 3 4; do dataset="$TESTPOOL/$TESTFS" inum="$(get_objnum /$TESTPOOL/$TESTFS/testfile.$i)" - log_must file_in_special_vdev $dataset $inum + + # Get a list of all the vdevs 'testfile.$i' has blocks on. + # The list will be string like "0 1 2 3" if the blocks are on + # vdevs 0-3. + on_vdevs="$(vdevs_file_is_on $dataset $inum)" + + # Get the number of normal (non-special) pool disks + num_pool_disks=$(echo $ZPOOL_DISKS | wc -w) + num_pool_disks=${num_pool_disks##* } + + if [ "$backup" == "1" ] ; then + # Data should be on all vdevs (both pool and special + # devices). + lowest_data_disk=0 + highest_data_disk=$(($num_pool_disks + 1)) + else + + # Data should only be on special devices + lowest_data_disk=$num_pool_disks + highest_data_disk=$(($lowest_data_disk + 1)) + fi + + # Get the starting disks that we expect the data to be on. + # We assume two special devices are attached to the pool. + # Disk numbers start at zero. + expected_on_vdevs="$(seq -s ' ' $lowest_data_disk $highest_data_disk)" + + # Compare the disks we expect to see the blocks on with + # the actual disks they're on. + if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then + # Data distribution is not what we expected, break out of + # the loop so we can properly tear down the pool. We will + # error out after the loop. + break; + fi done log_must zpool remove $TESTPOOL $CLASS_DISK0 - - sleep 5 - sync_pool $TESTPOOL - sleep 1 - - log_must zdb -bbcc $TESTPOOL - log_must zpool list -v $TESTPOOL log_must zpool destroy -f "$TESTPOOL" - log_must disk_cleanup + + if [ "$on_vdevs" != "$expected_on_vdevs" ] ; then + log_fail "Expected data on disks $expected_on_vdevs, got $on_vdevs" + fi } claim="Removing a special device from a pool succeeds." @@ -113,12 +164,15 @@ claim="Removing a special device from a pool succeeds." 
log_assert $claim log_onexit cleanup -typeset CLASS_DEVSIZE=$CLASS_DEVSIZE -for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do - typeset ZPOOL_DISKS=$ZPOOL_DISKS - for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do - check_removal +log_must disk_setup +for backup in "1" "" ; do + typeset CLASS_DEVSIZE=$CLASS_DEVSIZE + for CLASS_DEVSIZE in $CLASS_DEVSIZE $ZPOOL_DEVSIZE; do + typeset ZPOOL_DISKS=$ZPOOL_DISKS + for ZPOOL_DISKS in "$ZPOOL_DISKS" $ZPOOL_DISK0; do + check_removal $backup + done done done - +log_must disk_cleanup log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh index 624cab88af0c..789bf816eabb 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh @@ -33,31 +33,34 @@ log_onexit cleanup # Create a non-raidz pool so we can remove top-level vdevs # log_must disk_setup -log_must zpool create $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0 -log_must display_status "$TESTPOOL" -# -# Generate some dedup data in the dedup class before removal -# +for arg in '-o special_failsafe=on' '' ; do + log_must zpool create $arg $TESTPOOL $ZPOOL_DISKS dedup $CLASS_DISK0 + log_must display_status "$TESTPOOL" -log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL -block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL" -log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null" + # + # Generate some dedup data in the dedup class before removal + # -sync_pool -log_must zpool list -v $TESTPOOL + log_must zfs create -o dedup=on -V 2G $TESTPOOL/$TESTVOL + block_device_wait "$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL" + log_must eval "new_fs $ZVOL_DEVDIR/$TESTPOOL/$TESTVOL >/dev/null" -# -# remove a dedup allocation vdev -# -log_must zpool remove $TESTPOOL $CLASS_DISK0 + sync_pool + log_must zpool list -v $TESTPOOL + + # + # remove a dedup allocation vdev + # + log_must zpool remove $TESTPOOL $CLASS_DISK0 -sleep 5 -sync_pool $TESTPOOL -sleep 1 + sleep 5 + sync_pool $TESTPOOL + sleep 1 -log_must zdb -bbcc $TESTPOOL + log_must zdb -bbcc $TESTPOOL -log_must zpool destroy -f "$TESTPOOL" + log_must zpool destroy -f "$TESTPOOL" +done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh index 1b52014fd2d9..aae7ecbe9568 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh @@ -26,13 +26,15 @@ log_assert $claim log_onexit cleanup log_must disk_setup -for size in 512 4096 32768 131072 524288 1048576 -do - let bigger=$size*2 - log_mustnot zpool create -O recordsize=$size \ - -O special_small_blocks=$bigger \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 +for arg in '-o special_failsafe=on' '' ; do + for size in 512 4096 32768 131072 524288 1048576 + do + let bigger=$size*2 + log_mustnot zpool create $arg -O recordsize=$size \ + -O special_small_blocks=$bigger \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh index 49c468af6702..3922f8cb7bf9 100755 --- a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh @@ -26,20 +26,22 @@ log_assert $claim log_onexit cleanup log_must disk_setup -for size in 8192 32768 131072 524288 1048576 -do - let smaller=$size/2 - log_must zpool create -O recordsize=$size \ - -O special_small_blocks=$smaller \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool destroy -f "$TESTPOOL" - - log_must zpool create -O recordsize=$size \ - -O special_small_blocks=$size \ - $TESTPOOL raidz $ZPOOL_DISKS special mirror \ - $CLASS_DISK0 $CLASS_DISK1 - log_must zpool destroy -f "$TESTPOOL" +for arg in '-o special_failsafe=on' '' ; do + for size in 8192 32768 131072 524288 1048576 + do + let smaller=$size/2 + log_must zpool create $arg -O recordsize=$size \ + -O special_small_blocks=$smaller \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool destroy -f "$TESTPOOL" + + log_must zpool create $arg -O recordsize=$size \ + -O special_small_blocks=$size \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool destroy -f "$TESTPOOL" + done done log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh b/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh new file mode 100755 index 000000000000..5681caecfc52 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/cleanup.ksh @@ -0,0 +1,27 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018, Delphix +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +verify_runnable "global" + +default_cleanup_noexit +disk_cleanup + +log_pass diff --git a/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh b/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh new file mode 100755 index 000000000000..5c2e45c8dc2e --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/setup.ksh @@ -0,0 +1,24 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018 by Delphix. All rights reserved. + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +verify_runnable "global" + +disk_cleanup + +log_pass diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg new file mode 100644 index 000000000000..84200593eb38 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.cfg @@ -0,0 +1,36 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +export ZPOOL_DISK0="$TEST_BASE_DIR/device-0" +export ZPOOL_DISK1="$TEST_BASE_DIR/device-1" +export ZPOOL_DISK2="$TEST_BASE_DIR/device-2" +export ZPOOL_DISKS="${ZPOOL_DISK0} ${ZPOOL_DISK1} ${ZPOOL_DISK2}" + +export CLASS_DISK0="$TEST_BASE_DIR/device-3" +export CLASS_DISK1="$TEST_BASE_DIR/device-4" +export CLASS_DISK2="$TEST_BASE_DIR/device-5" +export CLASS_DISK3="$TEST_BASE_DIR/device-6" +export CLASS_DISK4="$TEST_BASE_DIR/device-7" +export CLASS_DISK5="$TEST_BASE_DIR/device-8" + +export CLASS_DISKS="${CLASS_DISK0} ${CLASS_DISK1} ${CLASS_DISK2} ${CLASS_DISK3} ${CLASS_DISK4} ${CLASS_DISK5}" + +export ZPOOL_DEVSIZE=200M +export CLASS_DEVSIZE=200M + +export IMPORTDIR="$TEST_BASE_DIR" diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib new file mode 100644 index 000000000000..25ae21d72c96 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe.kshlib @@ -0,0 +1,255 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017, Intel Corporation. +# Copyright (c) 2018 by Delphix. All rights reserved. +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.cfg + +BACKUP_DIR=$TEST_BASE_DIR/backups + +function disk_setup +{ + truncate -s $ZPOOL_DEVSIZE $ZPOOL_DISKS + truncate -s $CLASS_DEVSIZE $CLASS_DISKS + + if [ -d $BACKUP_DIR ] ; then + log_fail "Existing $TEST_BASE_DIR/backups directory (maybe leftover from failed test run?)" + fi + + mkdir -p $BACKUP_DIR +} + +function disk_cleanup +{ + rm -f $ZPOOL_DEVSIZE $ZPOOL_DISKS 2> /dev/null + rm -f $CLASS_DEVSIZE $CLASS_DISKS 2> /dev/null + + rm -f special_failsafe.key + rm -fr $BACKUP_DIR +} + +function cleanup +{ + if datasetexists $TESTPOOL ; then + zpool destroy -f $TESTPOOL 2> /dev/null + fi + + disk_cleanup +} + +# Write zeros to an existing file, keeping the same size. +function zero_file { + dd status=none if=/dev/zero of="$1" bs=$(stat_size "$1") count=1 +} + +# Write a verifiable file that will end up on a 'dedup' or 'special' vdev. 
+# The filename will include the sha256 of the file for easy verification later. +# +# $1: Write type - "dedup" or "special" +# $2: Path to directory to write the file to +# +# Note: we don't use log_must here since this can get really chatty and +# we don't want to spam the logs. It will log_fail if there is an error. +function write_verifiable_file { + class="$1" + writedir="$2" + + if [[ "$class" == "dedup" ]] ; then + # Our dedup file size can be up to a megabyte-ish + filesize=$((32768 + ($RANDOM * $RANDOM % 1000000))) + + # Make write a multiple of the recordsize for dedup + bs=32768 + count=$(($filesize / $bs)) + + # Fill data with the letter 'a' for dedup + file_write -b $bs -c $count -d 'a' -o create -f $writedir/tmp || return + else + # Make all files less than the 32k special_small_blocks size we + # setup at dataset creation time + filesize=$((($RANDOM % 32767) + 1)) + bs=$filesize + count=1 + dd status=none if=/dev/urandom bs=$bs count=$count of="$writedir/tmp" || return + fi + + + csum=$(sha256digest "$writedir/tmp") + newfile=$csum.$class$totalwritten + mv "$writedir/tmp" "$writedir/$newfile" + + # Basic sanity that we created our final file, and it has a non-zero size + expectedsize=$(($bs * $count)) + actualsize=$(stat_size "$writedir/$newfile") + if [[ "$actualsize" != "$expectedsize" ]] || [[ "$actualsize" == "0" ]] ; then + log_fail "File $writedir/$newfile bad size $actualsize (expected $expectedsize)" + return + fi + + totalwritten=$(($totalwritten + 1)) +} + +# Write some files to all our datasets. +# +# For each dataset: +# +# - 10 files should hit special vdevs +# - 10 files should hit dedup vdevs +function write_some_files { + typeset i + for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do + for j in $(seq 1 10) ; do + write_verifiable_file special /$TESTPOOL/$i + write_verifiable_file dedup /$TESTPOOL/$i + done + done +} + +# Given a directory containing only files created by write_verifiable_file(), +# verify that the contents of the file match the sha256sum in the file's name. 
+#
+# $1: Dir path with files to verify
+function verify_directory {
+	typeset verifydir="$1"
+	typeset i
+	for i in $(ls $verifydir) ; do
+
+		# Files will look like:
+		#
+		# ed324386045fa39d3f41d4f13c8c3e6a4698466e2b694c327f7e490be9e4e33f.dedup13
+		#
+		# Just grab the sha256 part
+
+		shaname="$(echo $i | cut -f1 -d'.')"
+		if [[ $(sha256digest "$verifydir/$i") != "$shaname" ]] ; then
+			log_fail "$verifydir/$i sha256 not $shaname"
+			false
+			return
+		fi
+	done
+	true
+}
+
+function backup_alloc_class_disks {
+	typeset i
+	for i in $@ ; do
+		cp ${i} $BACKUP_DIR/$(basename $i)
+	done
+}
+
+function restore_alloc_class_disks {
+	typeset i
+	for i in $@ ; do
+		mv $BACKUP_DIR/$(basename $i) ${i}
+	done
+}
+
+function zero_alloc_class_disks {
+	typeset i
+	for i in $@ ; do
+		zero_file "${i}"
+	done
+}
+
+# Create multiple datasets with different permutations of copies and encryption
+function special_failsafe_make_datasets {
+
+	log_must zfs create -o compression=off -o special_small_blocks=32K -o recordsize=32K \
+	    -o dedup=on $TESTPOOL/$TESTFS
+
+	keyfile=$(pwd)/special_failsafe.key
+	dd if=/dev/random of=$keyfile bs=32 count=1
+
+	log_must zfs create -o copies=2 -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+	    $TESTPOOL/2copies
+
+	log_must zfs create -o copies=3 -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+	    $TESTPOOL/3copies
+
+	log_must zfs create -o encryption=on -o keylocation=file:///$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+	    $TESTPOOL/encrypted
+
+	log_must zfs create -o copies=2 -o encryption=on -o keylocation=file:///$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+	    $TESTPOOL/encrypted2copies
+
+	log_must zfs create -o copies=3 -o encryption=on -o keylocation=file:///$keyfile -o keyformat=raw -o special_small_blocks=32K -o recordsize=32K -o dedup=on \
+	    $TESTPOOL/encrypted3copies
+}
+
+# For each dataset we created in special_failsafe_make_datasets, go through
+# and check that all the files in the datasets have the correct data.
+function verify_all_directories {
+	typeset i
+	for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
+		verify_directory /$TESTPOOL/$i
+	done
+
+	# ...we should also have the correct number of files
+	totalfiles=0
+	for i in $TESTFS 2copies 3copies encrypted encrypted2copies encrypted3copies ; do
+		totalfiles=$(($totalfiles + $(ls /$TESTPOOL/$i | wc -w)))
+	done
+
+	if [[ "$totalfiles" != "$totalwritten" ]] ; then
+		log_fail "Wrong file count: expected $totalwritten, got $totalfiles"
+	else
+		log_note "Verified $totalfiles files"
+	fi
+}
+
+# Return a space-separated string of disks that are alloc class vdevs. Disk
+# names will include the full path.
+function get_list_of_alloc_class_disks {
+	typeset special_disks=$(get_list_of_vdevs_that_are "special")
+	typeset dedup_disks=$(get_list_of_vdevs_that_are "dedup")
+	typeset disks="$dedup_disks"
+
+	if [ -n "$special_disks" ] ; then
+		disks="$special_disks $disks"
+	fi
+
+	echo "$disks"
+}
+
+# Check that the pool/vdev properties and features for alloc class backups
+# are sane.
+function check_pool_alloc_class_props {
+	typeset special_failsafe_feature=$(get_pool_prop feature@special_failsafe $TESTPOOL)
+	typeset special_failsafe_prop=$(get_pool_prop special_failsafe $TESTPOOL)
+	if [ "$special_failsafe_feature" == "disabled" ] ; then
+		log_must [ "$special_failsafe_prop" == "off" ]
+	fi
+}
+
+# Simple function to check pool and vdev properties are what we expect.
The +# values we expect are passed to this function: +# +# $1: 'feature@special_failsafe' pool feature +# $2: 'special_failsafe' pool prop +# +# This function will log_fail on error. +function boilerplate_check { + typeset special_failsafe_feature=$1 + typeset special_failsafe_prop=$2 + + if [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" != "$special_failsafe_feature" ] ; then + log_fail "feature@special_failsafe = $(get_pool_prop feature@special_failsafe $TESTPOOL), expected $special_failsafe_feature" + fi + + if [ "$(get_pool_prop special_failsafe $TESTPOOL)" != "$special_failsafe_prop" ] ; then + log_fail "special_failsafe = $(get_pool_prop special_failsafe $TESTPOOL), expected $special_failsafe_prop" + fi +} diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh new file mode 100755 index 000000000000..36ff874cb00e --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_add.ksh @@ -0,0 +1,96 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# +# DESCRIPTION: +# Verify that 'zpool add' and 'zpool attach' disks have the correct +# special_failsafe settings. + +verify_runnable "global" + +claim="zpool add|attach disks have correct special_failsafe settings" + +log_assert $claim +log_onexit cleanup + +# Try different pool configurations +configs="mirror $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 +mirror $ZPOOL_DISK0 $ZPOOL_DISK1 dedup mirror $CLASS_DISK0 $CLASS_DISK1" + +log_must disk_setup + +function do_test { + typeset config="$1" + typeset initial=$2 + typeset new=$3 + + log_must zpool create -o special_failsafe=$initial $TESTPOOL $config + totalwritten=0 + + # Sanity check that feature@special_failsafe aligns with the + # pool prop + if [ $initial == "on" ] ; then + feature_expected="active" + else + feature_expected="enabled" + fi + boilerplate_check "$feature_expected" "$initial" + + special_failsafe_make_datasets + write_some_files + + if [ $initial != "off" ] ; then + log_must zpool set special_failsafe=$new $TESTPOOL + fi + + write_some_files + + # Now add a new special/dedup disk to the special mirror + log_must zpool attach $TESTPOOL $CLASS_DISK0 $CLASS_DISK2 + write_some_files + + # Add another special & dedup disk in RAID0 with the existing + # special mirror + log_must zpool add $TESTPOOL special $CLASS_DISK3 + log_must zpool add $TESTPOOL dedup $CLASS_DISK4 + + write_some_files + verify_all_directories + + log_must zpool export $TESTPOOL + + alloc_class_disks="$(get_list_of_alloc_class_disks)" + zero_alloc_class_disks $alloc_class_disks + + log_must zpool import -l -d $IMPORTDIR $TESTPOOL + + verify_all_directories + + log_must zpool destroy $TESTPOOL +} + +# Create a pool that is initially not special_failsafe. Then, enable +# special_failsafe and add/attach a disk. 
+echo "$configs" | while read config ; do + for initial in "on" "off" ; do + for new in "on" "off" ; do + do_test "$config" $initial $new + done + done +done + +cleanup + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh new file mode 100755 index 000000000000..1905fba16073 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_create.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# DESCRIPTION: +# Verify 'zpool create' with different alloc class redundancy +# levels will correctly succeed or fail. + +verify_runnable "global" + +claim="zpool create with different special_failsafe and disk permutations work" + +log_assert $claim +log_onexit cleanup + +# These should always pass since they have same redundancy level +configs_pass="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup mirror $CLASS_DISK0 $CLASS_DISK1 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3" + +# These should always pass with special_failsafe enabled or when '-f' is passed. +# They should fail otherwise. +configs_pass_failsafe="mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 dedup $CLASS_DISK0 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special $CLASS_DISK0 dedup $CLASS_DISK2 +mirror $ZPOOL_DISK1 $ZPOOL_DISK2 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2" + +log_must disk_setup + +# Try configs with matching redundancy levels. They should all pass. +echo "$configs_pass" | while read config ; do + log_must zpool create -o feature@special_failsafe=disabled $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o feature@special_failsafe=disabled $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL +done + +# Try configs with lower redundancy level. They should fail if special_failsafe +# is turned off and -f is not used. 
+echo "$configs_pass_failsafe" | while read config ; do + log_mustnot zpool create -o feature@special_failsafe=disabled $TESTPOOL $config + + log_must zpool create -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o feature@special_failsafe=disabled $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_must zpool create -f -o special_failsafe=on $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config + + log_must zpool create -f -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL $config + log_must zpool destroy $TESTPOOL + + log_mustnot zpool create -o feature@special_failsafe=enabled -o special_failsafe=off $TESTPOOL $config +done + +cleanup + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh new file mode 100755 index 000000000000..808df272a4c7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_files.ksh @@ -0,0 +1,124 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# +# DESCRIPTION: +# Test multiple different special_failsafe permutations. After each step +# write a bunch of known files. Verify all files are present and correct +# after all the steps are complete. + +verify_runnable "global" + +claim="Files on special_failsafe enabled disks do not get corrupted" + +log_assert $claim +log_onexit cleanup + +# Try different pool configurations +configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3 +raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3 +$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1 +$ZPOOL_DISKS special $CLASS_DISK0 +$ZPOOL_DISKS dedup $CLASS_DISK0" + +echo "$configs" | while read config ; do + log_must disk_setup + log_must zpool create -o special_failsafe=on $TESTPOOL $config + totalwritten=0 + special_failsafe_make_datasets + + write_some_files + verify_all_directories + + alloc_class_disks="$(get_list_of_alloc_class_disks)" + log_must zpool export $TESTPOOL + + backup_alloc_class_disks $alloc_class_disks + zero_alloc_class_disks $alloc_class_disks + + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + + # Our pool is imported but has all its special devices zeroed out. Try + # writing some files to it and export the pool + write_some_files + + log_must zpool export $TESTPOOL + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + + write_some_files + + log_must zpool export $TESTPOOL + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + + write_some_files + + # Make our old disks appear again (which have older data). Do a zpool + # clear to make them come back online and resilver. 
+	restore_alloc_class_disks $alloc_class_disks
+	log_must zpool clear $TESTPOOL
+
+	write_some_files
+
+	# At this point the pool should be normal. The next test is to
+	# corrupt the alloc class devices while the pool is running.
+	zero_alloc_class_disks $alloc_class_disks
+
+	# Trigger a scrub with our newly-zeroed alloc class disks
+	log_must zpool scrub $TESTPOOL
+
+	# The pool should be degraded, but still alive.
+	check_state $TESTPOOL "" "DEGRADED"
+
+	write_some_files
+
+	# Replace all the alloc class disks. This should get the pool
+	# back to normal.
+	for disk in $alloc_class_disks ; do
+		log_must zpool replace $TESTPOOL $disk
+	done
+
+	write_some_files
+
+	log_must zpool export $TESTPOOL
+
+	# Backup special disks, then totally remove them.
+	backup_alloc_class_disks $alloc_class_disks
+
+	rm -f $alloc_class_disks
+
+	# Try to import with the alloc class disks missing - it should work.
+	log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL
+
+	# After all the pain we've put our pool through, it should still have
+	# all the correct file data.
+	log_must verify_all_directories
+
+	if [[ "$totalwritten" != "840" ]] ; then
+		log_fail "Didn't see 840 files, saw $totalwritten"
+	fi
+
+	# We've checked all the files. Do some more verifications.
+	verify_pool $TESTPOOL
+	verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR
+
+	# Record a few stats that show metadata is in use
+	zpool get dedup $TESTPOOL
+	zdb -bb $TESTPOOL 2>&1 | grep -Ei 'normal|special|dedup|ddt'
+
+	log_must zpool destroy $TESTPOOL
+	cleanup
+done
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh
new file mode 100755
index 000000000000..d8ba52c702b3
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_import.ksh
@@ -0,0 +1,93 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# 	Verify we can import a special_failsafe pool even if all its alloc class
+# 	devices are missing.
+# +verify_runnable "global" + +claim="Verify imports work on special_failsafe pools when vdevs missing" + +log_assert $claim +log_onexit cleanup + +TWO_ZPOOL_DISKS="$ZPOOL_DISK0 $ZPOOL_DISK1" +REPLACE_DISK="$ZPOOL_DISK2" + +# Try a bunch of different pool configurations +configs="$TWO_ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3 +raidz $TWO_ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3 +$TWO_ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1 +$TWO_ZPOOL_DISKS special $CLASS_DISK0 +$TWO_ZPOOL_DISKS dedup $CLASS_DISK0" + +function do_test { + typeset config="$1" + typeset action="$2" + typeset onoff="$3" + + totalwritten=0 + log_must disk_setup + log_must zpool create -o special_failsafe=$onoff $TESTPOOL $config + + alloc_class_disks="$(get_list_of_alloc_class_disks)" + + special_failsafe_make_datasets + write_some_files + verify_all_directories + + log_must zpool export $TESTPOOL + + # Backup alloc class disk before removing them + backup_alloc_class_disks $alloc_class_disks + if [ "$action" == "remove" ] ; then + rm -f $alloc_class_disks + else + zero_alloc_class_disks $alloc_class_disks + fi + + # import should succeed or fail depending on how we're backed up + if [ "$onoff" == "on" ] ; then + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + else + log_mustnot zpool import -l -d "$IMPORTDIR" $TESTPOOL + + # With the disks restored, we should be able to import + restore_alloc_class_disks $alloc_class_disks + log_must zpool import -l -d "$IMPORTDIR" $TESTPOOL + fi + write_some_files + + # Do a scrub and verify everything is correct + verify_pool $TESTPOOL + + verify_all_directories + + zpool destroy $TESTPOOL + + cleanup +} + +echo "$configs" | while read config ; do + for action in "remove" "zero" ; do + for onoff in "off" "on" ; do + do_test "$config" "$action" "$onoff" + done + done +done + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh new file mode 100755 index 000000000000..8f5722dfd8d0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_offline.ksh @@ -0,0 +1,124 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# +# DESCRIPTION: +# Verify we can offline special_failsafe alloc class disks. +# Verify we cannot offline non-special_failsafe alloc class disks. 
+# +verify_runnable "global" + +claim="Verify correct behavior when we force fault an alloc class disk" + +log_assert $claim +log_onexit cleanup + +# Try a bunch of different pool configurations +configs="mirror $ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3 +raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3 +$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1 +$ZPOOL_DISKS special $CLASS_DISK0 +$ZPOOL_DISKS dedup $CLASS_DISK0" + +function do_test { + prop="$1" + config="$2" + log_must disk_setup + log_must zpool create -f $prop $TESTPOOL $config + check_pool_alloc_class_props + + special_failsafe_make_datasets + totalwritten=0 + write_some_files + + alloc_class_disks=$(get_list_of_alloc_class_disks) + alloc_class_disks_arr=($alloc_class_disks) + + if [ "$prop" == "-o special_failsafe=on" ] ; then + log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "active" ] + else + log_must [ "$(get_pool_prop feature@special_failsafe $TESTPOOL)" == "enabled" ] + fi + + for ((i = 0; i < ${#alloc_class_disks_arr[@]}; i++)); do + disk="${alloc_class_disks_arr[$i]}" + if [ "$prop" == "-o special_failsafe=on" ] ; then + # Everything is backed-up. We should be able to + # offline all the disks. + log_must zpool offline $TESTPOOL $disk + log_must check_state $TESTPOOL "$disk" "OFFLINE" + log_must check_state $TESTPOOL "" "DEGRADED" + else + PARENT=$(get_vdev_prop parent $TESTPOOL $disk) + if [ "$PARENT" == "$TESTPOOL" ] ; then + # Leaf is TLD, offline should fail + log_mustnot zpool offline $TESTPOOL $disk + log_must check_state $TESTPOOL "$disk" "ONLINE" + log_must check_state $TESTPOOL "" "ONLINE" + else + # We're part of a mirror. We know all + # mirrors in our test pool are two disk + # so we should be able to offline the + # first disk, but not the second. + if [ "$i" == "0" ] ; then + # First alloc class disk - pretend + # "previous" disk was online to + # make things easy. + prev_online=1 + else + if check_state $TESTPOOL "${alloc_class_disks_arr[$i - 1]}" "ONLINE" ; then + prev_online=1 + else + prev_online=0 + fi + fi + + if [ "$prev_online" == "1" ] ; then + # First disk in mirror, can offline + log_must zpool offline $TESTPOOL $disk + log_must check_state $TESTPOOL "$disk" "OFFLINE" + log_must check_state $TESTPOOL "" "DEGRADED" + else + # Second disk in mirror, can't offline + # but we should still be in a pool + # degraded state from the first disk + # going offline. + log_mustnot zpool offline $TESTPOOL $disk + log_must check_state $TESTPOOL "$disk" "ONLINE" + log_must check_state $TESTPOOL "" "DEGRADED" + fi + fi + fi + done + + write_some_files + verify_all_directories + + # We've checked all the files. Do some more verifications. + verify_pool $TESTPOOL + verify_filesys $TESTPOOL $TESTPOOL $IMPORTDIR + + zpool clear $TESTPOOL + zpool destroy $TESTPOOL + cleanup +} + +for prop in "-o special_failsafe=on" "" ; do + echo "$configs" | while read config ; do + do_test "$prop" "$config" + done +done + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh new file mode 100755 index 000000000000..2c5c60251545 --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_prop.ksh @@ -0,0 +1,118 @@ +#!/bin/ksh -p + +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. 
+# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# +# DESCRIPTION: +# Verify that special_failsafe prop does not work if +# SPA_FEATURE_SPECIAL_FAILSAFE is disabled. Also, test upgrades. + +verify_runnable "global" + +claim="special_failsafe prop shouldn't work without SPA_FEATURE_SPECIAL_FAILSAFE" + +log_assert $claim +log_onexit cleanup + +# Try a bunch of different pool configurations +configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3 +raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3 +$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1 +$ZPOOL_DISKS special $CLASS_DISK0 +$ZPOOL_DISKS dedup $CLASS_DISK0" + +# Make the pool disks smaller to make them quicker to back up. We don't use +# much data on them. +export ZPOOL_DEVSIZE=200M +export CLASS_DEVSIZE=200M + +log_must disk_setup + +echo "$configs" | while read config ; do + # We should not be able to set special_failsafe=on if the feature + # flag is disabled. + log_mustnot zpool create -o feature@special_failsafe=disabled -o special_failsafe=on $TESTPOOL $config + + # Try a few permutations that should succeed + log_must zpool create -o special_failsafe=off $TESTPOOL $config + boilerplate_check "enabled" "off" + log_must zpool destroy $TESTPOOL + + log_must zpool create -o special_failsafe=on $TESTPOOL $config + boilerplate_check "active" "on" + log_must zpool destroy $TESTPOOL + + log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=on $TESTPOOL $config + boilerplate_check "active" "on" + log_must zpool destroy $TESTPOOL +done + +# Now let's do a multi-step test where we upgrade an older pool +for cmd in "zpool set feature@special_failsafe=enabled $TESTPOOL" "zpool upgrade $TESTPOOL" ; do + + # Make a pool with no special devices + log_must zpool create -o feature@special_failsafe=disabled -o special_failsafe=off $TESTPOOL mirror $ZPOOL_DISKS + totalwritten=0 + + boilerplate_check "disabled" "off" + special_failsafe_make_datasets + write_some_files + + # Test enabling the feature in two different ways: + # + # zpool set feature@special_failsafe=enabled ... + # zpool upgrade ... + # + log_must eval "$cmd" + boilerplate_check "enabled" "off" + write_some_files + + # Shouldn't be able to add with special_failsafe prop off + log_mustnot zpool add $TESTPOOL special $CLASS_DISK0 + + log_must zpool set special_failsafe=on $TESTPOOL + boilerplate_check "enabled" "on" + write_some_files + + log_must zpool add $TESTPOOL special $CLASS_DISK0 + + boilerplate_check "active" "on" + + write_some_files + + zpool add $TESTPOOL dedup $CLASS_DISK1 + + write_some_files + + log_must zpool export $TESTPOOL + log_must zpool import -l -d $IMPORTDIR $TESTPOOL + + verify_all_directories + + # You should be able to turn special_failsafe off if it was on + log_must zpool set special_failsafe=off $TESTPOOL + + boilerplate_check "active" "off" + + # If special_failsafe prop was on and the feature active, and then you + # turned the prop off, you cannot turn it back on again. 
+	log_mustnot zpool set special_failsafe=on $TESTPOOL
+
+	log_must zpool destroy $TESTPOOL
+done
+
+cleanup
+
+log_pass $claim
diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh
new file mode 100755
index 000000000000..7ccb32b7bf82
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_scrub.ksh
@@ -0,0 +1,106 @@
+#!/bin/ksh -p
+
+# Copyright (C) 2024 Lawrence Livermore National Security, LLC.
+# Refer to the OpenZFS git commit log for authoritative copyright attribution.
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License Version 1.0 (CDDL-1.0).
+# You can obtain a copy of the license from the top-level file
+# "OPENSOLARIS.LICENSE" or at .
+# You may not use this file except in compliance with the license.
+#
+# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049)
+
+. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib
+
+#
+# DESCRIPTION:
+# 	Destroy alloc class disks and then do a scrub on both a
+# 	special_failsafe and non-special_failsafe pool. The special_failsafe
+# 	pool should only be DEGRADED, while the non-special_failsafe pool should
+# 	be SUSPENDED.
+
+verify_runnable "global"
+
+claim="special_failsafe pools survive a normally fatal scrub with bad disks"
+
+log_assert $claim
+log_onexit cleanup
+
+# Try different pool configurations
+configs="$ZPOOL_DISKS special $CLASS_DISK0 $CLASS_DISK1 dedup $CLASS_DISK2 $CLASS_DISK3
+raidz $ZPOOL_DISKS special mirror $CLASS_DISK0 $CLASS_DISK1 dedup mirror $CLASS_DISK2 $CLASS_DISK3
+$ZPOOL_DISKS special $CLASS_DISK0 dedup $CLASS_DISK1
+$ZPOOL_DISKS special $CLASS_DISK0
+$ZPOOL_DISKS dedup $CLASS_DISK0"
+
+function do_test {
+	typeset config="$1"
+	typeset action="$2"
+	typeset onoff="$3"
+	totalwritten=0
+
+	log_must disk_setup
+	log_must zpool create -o feature@special_failsafe=enabled -o special_failsafe=$onoff $TESTPOOL $config
+
+	special_failsafe_make_datasets
+
+	totalwritten=0
+	write_some_files
+
+	# When we do a scrub later, we will either want it to suspend or not
+	# suspend the pool, depending on our backup settings. Make sure we are
+	# able to ride through the suspended pool so we can continue with our
+	# tests.
+	log_must zpool set failmode=continue $TESTPOOL
+
+	alloc_class_disks="$(get_list_of_alloc_class_disks)"
+	backup_alloc_class_disks $alloc_class_disks
+	zero_alloc_class_disks $alloc_class_disks
+
+	# Spawn scrub into the background since the pool may be suspended and
+	# it will hang. We need to continue past the hung scrub so we
+	# can restore the bad disks and do a 'zpool clear' to remove the
+	# suspended pool.
+	zpool scrub $TESTPOOL &
+
+	wait_scrubbed $TESTPOOL 3
+	if [ "$onoff" == "on" ] ; then
+		log_must check_state $TESTPOOL "" "DEGRADED"
+
+		verify_pool $TESTPOOL
+
+		write_some_files
+		verify_all_directories
+	else
+		log_must check_state $TESTPOOL "" "SUSPENDED"
+
+		# Pool should be suspended. Restore the old disks so we can
+		# clear the suspension. 'zpool clear' here will delete the
+		# pool.
+		restore_alloc_class_disks $alloc_class_disks
+		log_must zpool clear $TESTPOOL
+	fi
+
+	cleanup
+}
+
+# Stop zed in case we left it running from an old, aborted, test run.
+zed_stop +zed_cleanup + +log_must zed_setup +log_must zed_start +log_must zed_events_drain + +# Verify scrubs work as expected with different permutations of special_failsafe +echo "$configs" | while read config ; do + for i in "on" "off" ; do + do_test "$config" "zero" "$i" + done +done + +log_must zed_stop +log_must zed_cleanup + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh new file mode 100755 index 000000000000..79a3008740fc --- /dev/null +++ b/tests/zfs-tests/tests/functional/special_failsafe/special_failsafe_split.ksh @@ -0,0 +1,94 @@ +#!/bin/ksh -p + +# Copyright (C) 2024 Lawrence Livermore National Security, LLC. +# Refer to the OpenZFS git commit log for authoritative copyright attribution. +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) + +. $STF_SUITE/tests/functional/special_failsafe/special_failsafe.kshlib + +# +# DESCRIPTION: +# Verify we can split a pool with special_failsafe, and the new pool +# keeps the special_failsafe settings. Also verify the new pool has +# all the data if the pool has special_failsafe. +# +verify_runnable "global" + +claim="zpool split works with special_failsafe" + +log_assert $claim +log_onexit cleanup + +# Create a normal, special_failsafe pool +log_must disk_setup +log_must zpool create -o special_failsafe=on $TESTPOOL mirror \ + $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \ + mirror $CLASS_DISK2 $CLASS_DISK3 + +totalwritten=0 +special_failsafe_make_datasets +write_some_files +verify_all_directories + +# Split the pool and verify the old pool has all the data +newpool="${TESTPOOL}-2" + +log_must zpool split $TESTPOOL $newpool +check_pool_alloc_class_props +verify_all_directories + +# Forcefault alloc class devices on the old pool and verify we have all the +# data. +log_must zpool offline -f $TESTPOOL $CLASS_DISK0 +log_must zpool offline -f $TESTPOOL $CLASS_DISK2 +log_must check_state $TESTPOOL $CLASS_DISK0 "FAULTED" +log_must check_state $TESTPOOL $CLASS_DISK2 "FAULTED" + +log_must check_state $TESTPOOL "" "DEGRADED" +verify_all_directories + +log_must zpool clear $TESTPOOL + +# All done with the old pool +log_must zpool destroy $TESTPOOL + +# Import the new split pool and rename it $TESTPOOL since all our verification +# functions expect the pool to be called $TESTPOOL. +log_must zpool import -l -f -d $IMPORTDIR $newpool $TESTPOOL + +check_pool_alloc_class_props +verify_all_directories + +# zero alloc class devices on the old pool and verify we have all the +# data. +log_must zpool export $TESTPOOL + +zero_file $CLASS_DISK1 +zero_file $CLASS_DISK3 + +log_must zpool import -l -f -d $IMPORTDIR $TESTPOOL + +verify_all_directories +log_must zpool destroy $TESTPOOL + +# Create a non-special_failsafe pool, split it, and verify the split pool is +# also not special_failsafe. 
+log_must zpool create -o special_failsafe=off $TESTPOOL mirror \ + $ZPOOL_DISK0 $ZPOOL_DISK1 special mirror $CLASS_DISK0 $CLASS_DISK1 dedup \ + mirror $CLASS_DISK2 $CLASS_DISK3 + +log_must zpool split $TESTPOOL $newpool +check_pool_alloc_class_props +log_must zpool destroy $TESTPOOL +log_must zpool import -l -f -d $IMPORTDIR $newpool $TESTPOOL +check_pool_alloc_class_props +log_must zpool destroy $TESTPOOL + +log_pass $claim
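Taken together, the tests above pin down the user-visible workflow for the feature. The following condensed sketch restates that workflow as an administrator would see it; the pool name "tank" and the /dev/sd* device paths are hypothetical placeholders, not names from this patch:

	# Create a pool whose special and dedup data is also copied to the
	# main pool from day one:
	zpool create -o special_failsafe=on tank raidz /dev/sda /dev/sdb /dev/sdc \
	    special mirror /dev/sdd /dev/sde

	# Because every alloc class block is duplicated in the pool, a lone
	# (non-mirrored) special vdev on a redundant pool no longer needs -f
	# (exercised by special_failsafe_create.ksh above):
	zpool create -o special_failsafe=on tank2 mirror /dev/sdf /dev/sdg \
	    special /dev/sdh

	# The property can be turned off later, but once the feature is active
	# and the property is off it can never be turned back on
	# (exercised by special_failsafe_prop.ksh above):
	zpool set special_failsafe=off tank
	zpool set special_failsafe=on tank	# expected to fail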