From c937bc8edfcdcb4c8d6a6b2b2ae4ae88bbde45e2 Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Fri, 12 Apr 2024 04:51:21 +0000 Subject: [PATCH] ddt: add support for prefetching tables into the ARC This change adds a new `zpool prefetch -t ddt $pool` command which causes a pool's DDT to be loaded into the ARC. The primary goal is to remove the need to "warm" a pool's cache before deduplication stops slowing write performance. It may also provide a way to reload portions of a DDT if they have been flushed due to inactivity. Sponsored-by: iXsystems, Inc. Sponsored-by: Catalogics, Inc. Sponsored-by: Klara, Inc. Co-authored-by: Will Andrews Co-authored-by: Allan Jude Co-authored-by: Don Brady Signed-off-by: Allan Jude Signed-off-by: Will Andrews Signed-off-by: Fred Weigel Signed-off-by: Rob Norris Signed-off-by: Don Brady --- cmd/zdb/zdb.c | 4 +- cmd/zpool/zpool_main.c | 112 ++++++++- cmd/ztest.c | 18 ++ contrib/debian/openzfs-zfsutils.install | 1 + include/libzfs.h | 3 + include/libzfs_core.h | 3 + include/sys/arc.h | 11 + include/sys/ddt.h | 4 +- include/sys/ddt_impl.h | 1 + include/sys/dmu.h | 8 + include/sys/fs/zfs.h | 23 ++ include/sys/spa.h | 2 + include/sys/zap.h | 1 + lib/libzfs/libzfs.abi | 57 ++++- lib/libzfs/libzfs_impl.h | 3 + lib/libzfs/libzfs_pool.c | 47 ++++ lib/libzfs_core/libzfs_core.abi | 13 ++ lib/libzfs_core/libzfs_core.c | 20 ++ man/Makefile.am | 1 + man/man7/zpoolprops.7 | 4 + man/man8/zpool-prefetch.8 | 46 ++++ man/man8/zpool-status.8 | 4 +- man/man8/zpool.8 | 5 +- module/zcommon/zpool_prop.c | 3 + module/zfs/arc.c | 53 ++++- module/zfs/ddt.c | 34 ++- module/zfs/ddt_stats.c | 29 +++ module/zfs/ddt_zap.c | 7 + module/zfs/dmu.c | 212 +++++++++++++++++- module/zfs/spa.c | 59 ++++- module/zfs/zap_micro.c | 16 ++ module/zfs/zfs_ioctl.c | 98 ++++++-- tests/runfiles/common.run | 4 + tests/zfs-tests/tests/Makefile.am | 4 + .../cli_root/zpool_prefetch/cleanup.ksh | 30 +++ .../cli_root/zpool_prefetch/setup.ksh | 32 +++ .../zpool_prefetch/zpool_prefetch.cfg | 
26 +++ .../zpool_prefetch/zpool_prefetch_001_pos.ksh | 128 +++++++++++ 38 files changed, 1084 insertions(+), 42 deletions(-) create mode 100644 man/man8/zpool-prefetch.8 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index bb593b44f50a..9c316d0a7348 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1985,8 +1985,8 @@ dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", name, (u_longlong_t)count, - (u_longlong_t)(dspace / count), - (u_longlong_t)(mspace / count)); + (u_longlong_t)dspace, + (u_longlong_t)mspace); if (dump_opt['D'] < 3) return; diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 57170c8ae717..1d4f59c4b863 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -32,7 +32,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, loli10K * Copyright (c) 2021, Colm Buckley - * Copyright (c) 2021, Klara Inc. + * Copyright (c) 2021, 2023, Klara Inc. 
* Copyright [2021] Hewlett Packard Enterprise Development LP */ @@ -90,6 +90,7 @@ static int zpool_do_remove(int, char **); static int zpool_do_labelclear(int, char **); static int zpool_do_checkpoint(int, char **); +static int zpool_do_prefetch(int, char **); static int zpool_do_list(int, char **); static int zpool_do_iostat(int, char **); @@ -176,6 +177,7 @@ typedef enum { HELP_LIST, HELP_OFFLINE, HELP_ONLINE, + HELP_PREFETCH, HELP_REPLACE, HELP_REMOVE, HELP_INITIALIZE, @@ -307,6 +309,7 @@ static zpool_command_t command_table[] = { { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, { NULL }, { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, + { "prefetch", zpool_do_prefetch, HELP_PREFETCH }, { NULL }, { "list", zpool_do_list, HELP_LIST }, { "iostat", zpool_do_iostat, HELP_IOSTAT }, @@ -398,6 +401,9 @@ get_usage(zpool_help_t idx) return (gettext("\tlist [-gHLpPv] [-o property[,...]] " "[-T d|u] [pool] ... \n" "\t [interval [count]]\n")); + case HELP_PREFETCH: + return (gettext("\tprefetch -t [] \n" + "\t -t ddt \n")); case HELP_OFFLINE: return (gettext("\toffline [--power]|[[-f][-t]] " " ...\n")); @@ -3827,6 +3833,72 @@ zpool_do_checkpoint(int argc, char **argv) #define CHECKPOINT_OPT 1024 +/* + * zpool prefetch [] + * + * Prefetches a particular type of data in the specified pool. 
+ */ +int +zpool_do_prefetch(int argc, char **argv) +{ + int c; + char *poolname; + char *typestr = NULL; + zpool_prefetch_type_t type; + zpool_handle_t *zhp; + int err = 0; + + while ((c = getopt(argc, argv, ":t:")) != -1) { + switch (c) { + case 't': + typestr = optarg; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + argc--; + argv++; + + if (typestr != NULL && strcmp(typestr, "ddt") == 0) { + type = ZPOOL_PREFETCH_DDT; + } else { + (void) fprintf(stderr, gettext("unsupported prefetch type\n")); + usage(B_FALSE); + } + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + err = zpool_prefetch(zhp, type); + + zpool_close(zhp); + + return (err); +} + /* * zpool import [-d dir] [-D] * import [-o mntopts] [-o prop=value] ... 
[-R root] [-D] [-l] @@ -6446,6 +6518,7 @@ print_one_column(zpool_prop_t prop, uint64_t value, const char *str, case ZPOOL_PROP_EXPANDSZ: case ZPOOL_PROP_CHECKPOINT: case ZPOOL_PROP_DEDUPRATIO: + case ZPOOL_PROP_DEDUPCACHED: if (value == 0) (void) strlcpy(propval, "-", sizeof (propval)); else @@ -8792,13 +8865,17 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, } static void -print_dedup_stats(nvlist_t *config) +print_dedup_stats(zpool_handle_t *zhp, nvlist_t *config, boolean_t literal) { ddt_histogram_t *ddh; ddt_stat_t *dds; ddt_object_t *ddo; uint_t c; - char dspace[6], mspace[6]; + /* Extra space provided for literal display */ + char dspace[32], mspace[32], cspace[32]; + uint64_t cspace_prop; + enum zfs_nicenum_format format; + zprop_source_t src; /* * If the pool was faulted then we may not have been able to @@ -8816,12 +8893,26 @@ print_dedup_stats(nvlist_t *config) return; } - zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); - zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); - (void) printf("DDT entries %llu, size %s on disk, %s in core\n", + /* + * Squash cached size into in-core size to handle race. + * Only include cached size if it is available. + */ + cspace_prop = zpool_get_prop_int(zhp, ZPOOL_PROP_DEDUPCACHED, &src); + cspace_prop = MIN(cspace_prop, ddo->ddo_mspace); + format = literal ? 
ZFS_NICENUM_RAW : ZFS_NICENUM_1024; + zfs_nicenum_format(cspace_prop, cspace, sizeof (cspace), format); + zfs_nicenum_format(ddo->ddo_dspace, dspace, sizeof (dspace), format); + zfs_nicenum_format(ddo->ddo_mspace, mspace, sizeof (mspace), format); + (void) printf("DDT entries %llu, size %s on disk, %s in core", (u_longlong_t)ddo->ddo_count, dspace, mspace); + if (src != ZPROP_SRC_DEFAULT) { + (void) printf(", %s cached (%.02f%%)", + cspace, + (double)cspace_prop / (double)ddo->ddo_mspace * 100.0); + } + (void) printf("\n"); verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, (uint64_t **)&dds, &c) == 0); @@ -8857,6 +8948,10 @@ status_callback(zpool_handle_t *zhp, void *data) uint_t c; vdev_stat_t *vs; + /* If dedup stats were requested, also fetch dedupcached. */ + if (cbp->cb_dedup_stats > 1) + zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME); + config = zpool_get_config(zhp, NULL); reason = zpool_get_status(zhp, &msgid, &errata); @@ -9338,7 +9433,7 @@ status_callback(zpool_handle_t *zhp, void *data) } if (cbp->cb_dedup_stats) - print_dedup_stats(config); + print_dedup_stats(zhp, config, cbp->cb_literal); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); @@ -9412,7 +9507,8 @@ zpool_do_status(int argc, char **argv) cmd = optarg; break; case 'D': - cb.cb_dedup_stats = B_TRUE; + if (++cb.cb_dedup_stats > 2) + cb.cb_dedup_stats = 2; break; case 'e': cb.cb_print_unhealthy = B_TRUE; diff --git a/cmd/ztest.c b/cmd/ztest.c index 3775e2ef2516..6a9264ddcc4c 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -26,6 +26,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2023, Klara, Inc. 
*/ /* @@ -444,6 +445,7 @@ ztest_func_t ztest_blake3; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; +ztest_func_t ztest_pool_prefetch_ddt; static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ @@ -499,6 +501,7 @@ static ztest_info_t ztest_info[] = { ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), + ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely), }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) @@ -6993,6 +6996,21 @@ ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) } } +void +ztest_pool_prefetch_ddt(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + spa_t *spa; + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + + ddt_prefetch_all(spa); + + spa_close(spa, FTAG); + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + static int ztest_set_global_vars(void) { diff --git a/contrib/debian/openzfs-zfsutils.install b/contrib/debian/openzfs-zfsutils.install index 741014398ade..10083351abb5 100644 --- a/contrib/debian/openzfs-zfsutils.install +++ b/contrib/debian/openzfs-zfsutils.install @@ -111,6 +111,7 @@ usr/share/man/man8/zpool-labelclear.8 usr/share/man/man8/zpool-list.8 usr/share/man/man8/zpool-offline.8 usr/share/man/man8/zpool-online.8 +usr/share/man/man8/zpool-prefetch.8 usr/share/man/man8/zpool-reguid.8 usr/share/man/man8/zpool-remove.8 usr/share/man/man8/zpool-reopen.8 diff --git a/include/libzfs.h b/include/libzfs.h index 7836c2325f4e..979b919ce2fa 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -458,6 +458,7 @@ _LIBZFS_H nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); _LIBZFS_H nvlist_t *zpool_get_features(zpool_handle_t *); _LIBZFS_H int zpool_refresh_stats(zpool_handle_t *, boolean_t *); _LIBZFS_H int 
zpool_get_errlog(zpool_handle_t *, nvlist_t **); +_LIBZFS_H void zpool_add_propname(zpool_handle_t *, const char *); /* * Import and export functions @@ -504,6 +505,8 @@ _LIBZFS_H int zpool_checkpoint(zpool_handle_t *); _LIBZFS_H int zpool_discard_checkpoint(zpool_handle_t *); _LIBZFS_H boolean_t zpool_is_draid_spare(const char *); +_LIBZFS_H int zpool_prefetch(zpool_handle_t *, zpool_prefetch_type_t); + /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. diff --git a/include/libzfs_core.h b/include/libzfs_core.h index b2fd97372cd4..206e5e5c2bf6 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -148,6 +148,9 @@ _LIBZFS_CORE_H int lzc_pool_checkpoint_discard(const char *); _LIBZFS_CORE_H int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); + +_LIBZFS_CORE_H int lzc_pool_prefetch(const char *, zpool_prefetch_type_t); + _LIBZFS_CORE_H int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); _LIBZFS_CORE_H int lzc_set_bootenv(const char *, const nvlist_t *); diff --git a/include/sys/arc.h b/include/sys/arc.h index 05307aab99e3..c92b3eee618c 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -250,6 +250,16 @@ typedef struct arc_buf_info { enum zio_compress abi_l2arc_compress; } arc_buf_info_t; +/* + * Flags returned by arc_cached; describes which part of the arc + * the block is cached in. 
+ */ +#define ARC_CACHED_EMBEDDED (1U << 0) +#define ARC_CACHED_IN_L1 (1U << 1) +#define ARC_CACHED_IN_MRU (1U << 2) +#define ARC_CACHED_IN_MFU (1U << 3) +#define ARC_CACHED_IN_L2 (1U << 4) + void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); boolean_t arc_is_metadata(arc_buf_t *buf); @@ -310,6 +320,7 @@ zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); void arc_freed(spa_t *spa, const blkptr_t *bp); +int arc_cached(spa_t *spa, const blkptr_t *bp); void arc_flush(spa_t *spa, boolean_t retry); void arc_tempreserve_clear(uint64_t reserve); diff --git a/include/sys/ddt.h b/include/sys/ddt.h index e0129eda5cf5..66d59cebacde 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -236,6 +236,7 @@ extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); extern uint64_t ddt_get_dedup_dspace(spa_t *spa); extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); +extern int ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize); extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); extern void ddt_enter(ddt_t *ddt); @@ -243,8 +244,9 @@ extern void ddt_exit(ddt_t *ddt); extern void ddt_init(void); extern void ddt_fini(void); extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); -extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); +extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); +extern void ddt_prefetch_all(spa_t *spa); extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 52b927b7519d..4aaab10c8737 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -47,6 +47,7 @@ typedef struct { const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t 
*os, uint64_t object, const ddt_key_t *ddk); + void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index b5fed64da4ad..1376cbef763c 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -505,6 +505,12 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); +/* + * Get an estimated cache size for an object. Caller must expect races. + */ +int dmu_object_cached_size(objset_t *os, uint64_t object, + uint64_t *l1sz, uint64_t *l2sz); + void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); @@ -903,6 +909,8 @@ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); +int dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 674112cd41fd..4358654c7484 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -260,6 +260,7 @@ typedef enum { ZPOOL_PROP_BCLONERATIO, ZPOOL_PROP_DEDUP_TABLE_SIZE, ZPOOL_PROP_DEDUP_TABLE_QUOTA, + ZPOOL_PROP_DEDUPCACHED, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -1514,6 +1515,7 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ + ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
@@ -1645,6 +1647,11 @@ typedef enum { ZFS_WAIT_NUM_ACTIVITIES } zfs_wait_activity_t; +typedef enum { + ZPOOL_PREFETCH_NONE = 0, + ZPOOL_PREFETCH_DDT +} zpool_prefetch_type_t; + /* * Bookmark name values. */ @@ -1683,6 +1690,17 @@ typedef enum { */ #define ZPOOL_HIDDEN_ARGS "hidden_args" +/* + * The following is used when invoking ZFS_IOC_POOL_GET_PROPS. + */ +#define ZPOOL_GET_PROPS_NAMES "get_props_names" + +/* + * Opt-in property names used with ZPOOL_GET_PROPS_NAMES. + * For example, properties that are hidden or expensive to compute. + */ +#define ZPOOL_DEDUPCACHED_PROP_NAME "dedupcached" + /* * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. */ @@ -1722,6 +1740,11 @@ typedef enum { #define ZFS_WAIT_ACTIVITY "wait_activity" #define ZFS_WAIT_WAITED "wait_waited" +/* + * The following are names used when invoking ZFS_IOC_POOL_PREFETCH. + */ +#define ZPOOL_PREFETCH_TYPE "prefetch_type" + /* * Flags for ZFS_IOC_VDEV_SET_STATE */ diff --git a/include/sys/spa.h b/include/sys/spa.h index df41002ed09b..12321c6562be 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1198,6 +1198,8 @@ extern void spa_boot_init(void); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern int spa_prop_get_nvlist(spa_t *spa, char **props, + unsigned int n_props, nvlist_t **outnvl); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); diff --git a/include/sys/zap.h b/include/sys/zap.h index 96ddcc324b65..0027f7c5103e 100644 --- a/include/sys/zap.h +++ b/include/sys/zap.h @@ -225,6 +225,7 @@ int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name); +int zap_prefetch_object(objset_t 
*os, uint64_t zapobj); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 6127438ef92c..8893a36a125e 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -457,6 +457,7 @@ + @@ -520,6 +521,7 @@ + @@ -1642,6 +1644,9 @@ + + + @@ -2096,7 +2101,7 @@ - + @@ -2109,19 +2114,25 @@ + + + + + + - + - + - + - + @@ -2923,7 +2934,8 @@ - + + @@ -5893,6 +5905,7 @@ + @@ -5921,6 +5934,12 @@ + + + + + + @@ -5968,6 +5987,8 @@ + + @@ -6060,6 +6081,11 @@ + + + + + @@ -6205,6 +6231,13 @@ + + + + + + + @@ -6393,6 +6426,11 @@ + + + + + @@ -6587,6 +6625,11 @@ + + + + + @@ -8686,7 +8729,6 @@ - @@ -9148,7 +9190,6 @@ - diff --git a/lib/libzfs/libzfs_impl.h b/lib/libzfs/libzfs_impl.h index ef0359f45ea0..e98ede51e4ba 100644 --- a/lib/libzfs/libzfs_impl.h +++ b/lib/libzfs/libzfs_impl.h @@ -94,12 +94,15 @@ struct zfs_handle { * snapshots of volumes. */ #define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) +#define ZHP_MAX_PROPNAMES 4 struct zpool_handle { libzfs_handle_t *zpool_hdl; zpool_handle_t *zpool_next; char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; int zpool_state; + unsigned int zpool_n_propnames; + const char *zpool_propnames[ZHP_MAX_PROPNAMES]; size_t zpool_config_size; nvlist_t *zpool_config; nvlist_t *zpool_old_config; diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index a5cd34736766..3f2dc7314cf6 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -79,6 +79,13 @@ zpool_get_all_props(zpool_handle_t *zhp) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (zhp->zpool_n_propnames > 0) { + nvlist_t *innvl = fnvlist_alloc(); + fnvlist_add_string_array(innvl, ZPOOL_GET_PROPS_NAMES, + zhp->zpool_propnames, zhp->zpool_n_propnames); + zcmd_write_src_nvlist(hdl, &zc, innvl); + } + zcmd_alloc_dst_nvlist(hdl, &zc, 0); while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { @@ -318,6 +325,15 @@ 
zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, return (0); } + /* + * ZPOOL_PROP_DEDUPCACHED can be fetched by name only using + * the ZPOOL_GET_PROPS_NAMES mechanism + */ + if (prop == ZPOOL_PROP_DEDUPCACHED) { + zpool_add_propname(zhp, ZPOOL_DEDUPCACHED_PROP_NAME); + (void) zpool_get_all_props(zhp); + } + if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && prop != ZPOOL_PROP_NAME) return (-1); @@ -361,6 +377,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, case ZPOOL_PROP_BCLONESAVED: case ZPOOL_PROP_BCLONEUSED: case ZPOOL_PROP_DEDUP_TABLE_SIZE: + case ZPOOL_PROP_DEDUPCACHED: if (literal) (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); @@ -1738,6 +1755,28 @@ zpool_discard_checkpoint(zpool_handle_t *zhp) return (0); } +/* + * Load data type for the given pool. + */ +int +zpool_prefetch(zpool_handle_t *zhp, zpool_prefetch_type_t type) +{ + libzfs_handle_t *hdl = zhp->zpool_hdl; + char msg[1024]; + int error; + + error = lzc_pool_prefetch(zhp->zpool_name, type); + if (error != 0) { + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot prefetch %s in '%s'"), + type == ZPOOL_PREFETCH_DDT ? "ddt" : "", zhp->zpool_name); + (void) zpool_standard_error(hdl, error, msg); + return (-1); + } + + return (0); +} + /* * Add the given vdevs to the pool. The caller must have already performed the * necessary verification to ensure that the vdev specification is well-formed. @@ -4401,6 +4440,25 @@ zbookmark_mem_compare(const void *a, const void *b) return (memcmp(a, b, sizeof (zbookmark_phys_t))); } +/* + * Register an opt-in property name to be requested by later + * zpool_get_all_props() calls on this handle. A name that is already + * registered is not added again, so repeated lookups cannot exhaust + * the ZHP_MAX_PROPNAMES slots. + */ +void +zpool_add_propname(zpool_handle_t *zhp, const char *propname) +{ + for (unsigned int i = 0; i < zhp->zpool_n_propnames; i++) { + if (strcmp(zhp->zpool_propnames[i], propname) == 0) + return; + } + + assert(zhp->zpool_n_propnames < ZHP_MAX_PROPNAMES); + zhp->zpool_propnames[zhp->zpool_n_propnames] = propname; + zhp->zpool_n_propnames++; +} + /* * Retrieve the persistent error log, uniquify the members, and return to the * caller. 
diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index c20698580ee7..02b2dc7182b1 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -176,6 +176,7 @@ + @@ -1428,6 +1429,7 @@ + @@ -1462,6 +1464,12 @@ + + + + + + @@ -2892,6 +2900,11 @@ + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 070f8c1be678..ec8b0ff4f61c 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1629,6 +1629,26 @@ lzc_pool_checkpoint_discard(const char *pool) return (error); } +/* + * Load the requested data type for the specified pool. + */ +int +lzc_pool_prefetch(const char *pool, zpool_prefetch_type_t type) +{ + int error; + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_int32(args, ZPOOL_PREFETCH_TYPE, type); + + error = lzc_ioctl(ZFS_IOC_POOL_PREFETCH, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + /* * Executes a read-only channel program. * diff --git a/man/Makefile.am b/man/Makefile.am index 43bb014ddd32..194bb4721619 100644 --- a/man/Makefile.am +++ b/man/Makefile.am @@ -83,6 +83,7 @@ dist_man_MANS = \ %D%/man8/zpool-list.8 \ %D%/man8/zpool-offline.8 \ %D%/man8/zpool-online.8 \ + %D%/man8/zpool-prefetch.8 \ %D%/man8/zpool-reguid.8 \ %D%/man8/zpool-remove.8 \ %D%/man8/zpool-reopen.8 \ diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index ff21e5300ce7..9aba11ad5bf3 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -73,6 +73,10 @@ The amount of storage used by cloned blocks. Percentage of pool space used. This property can also be referred to by its shortened column name, .Sy cap . +.It Sy dedupcached +Total size of the deduplication table currently loaded into the ARC. +See +.Xr zpool-prefetch 8 . .It Sy dedup_table_size Total on-disk size of the deduplication table. 
.It Sy expandsize diff --git a/man/man8/zpool-prefetch.8 b/man/man8/zpool-prefetch.8 new file mode 100644 index 000000000000..57445bd4a655 --- /dev/null +++ b/man/man8/zpool-prefetch.8 @@ -0,0 +1,46 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2023, Klara Inc. +.\" +.Dd February 14, 2024 +.Dt ZPOOL-PREFETCH 8 +.Os +. +.Sh NAME +.Nm zpool-prefetch +.Nd Loads specific types of data for the given pool +.Sh SYNOPSIS +.Nm zpool +.Cm prefetch +.Fl t Ar type +.Ar pool +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm zpool +.Cm prefetch +.Fl t Li ddt +.Ar pool +.Xc +Prefetch data of a specific type for the given pool; specifically the DDT, +which will improve write I/O performance when the DDT is resident in the ARC. +.El diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index bbe7a45aa0c6..d570c852d787 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd February 14, 2024 .Dt ZPOOL-STATUS 8 .Os . 
@@ -75,6 +75,8 @@ Display a histogram of deduplication statistics, showing the allocated and referenced .Pq logically referenced in the pool block counts and sizes by reference count. +If repeated (-DD), also shows statistics on how much of the DDT is resident +in the ARC. .It Fl e Only show unhealthy vdevs (not-ONLINE or with errors). .It Fl g diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index fe44e15cabe1..2b966b72bf4c 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd February 14, 2024 .Dt ZPOOL 8 .Os . @@ -168,6 +168,8 @@ specified. . .Ss Maintenance .Bl -tag -width Ds +.It Xr zpool-prefetch 8 +Prefetches specific types of pool data. .It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. .It Xr zpool-checkpoint 8 @@ -598,6 +600,7 @@ don't wait. .Xr zpool-list 8 , .Xr zpool-offline 8 , .Xr zpool-online 8 , +.Xr zpool-prefetch 8 , .Xr zpool-reguid 8 , .Xr zpool-remove 8 , .Xr zpool-reopen 8 , diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index 31b393ba6444..8fc5313e3da6 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -183,6 +183,9 @@ zpool_prop_init(void) zprop_register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO", B_FALSE, sfeatures); + zprop_register_hidden(ZPOOL_PROP_DEDUPCACHED, + ZPOOL_DEDUPCACHED_PROP_NAME, PROP_TYPE_NUMBER, PROP_READONLY, + ZFS_TYPE_POOL, "DEDUPCACHED", B_FALSE, sfeatures); zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 30d30b98a6c6..3fa13e8dbd43 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -26,7 +26,7 @@ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. 
- * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * @@ -5454,6 +5454,57 @@ arc_read_done(zio_t *zio) } } +/* + * Lookup the block at the specified DVA (in bp), and return the manner in + * which the block is cached. A zero return indicates not cached. + */ +int +arc_cached(spa_t *spa, const blkptr_t *bp) +{ + arc_buf_hdr_t *hdr = NULL; + kmutex_t *hash_lock = NULL; + uint64_t guid = spa_load_guid(spa); + int flags = 0; + + if (BP_IS_EMBEDDED(bp)) + return (ARC_CACHED_EMBEDDED); + + hdr = buf_hash_find(guid, bp, &hash_lock); + if (hdr == NULL) + return (0); + + if (HDR_HAS_L1HDR(hdr)) { + arc_state_t *state = hdr->b_l1hdr.b_state; + /* + * We switch to ensure that any future arc_state_type_t + * changes are handled. This is just a shift to promote + * more compile-time checking. + */ + switch (state->arcs_state) { + case ARC_STATE_ANON: + break; + case ARC_STATE_MRU: + flags |= ARC_CACHED_IN_MRU | ARC_CACHED_IN_L1; + break; + case ARC_STATE_MFU: + flags |= ARC_CACHED_IN_MFU | ARC_CACHED_IN_L1; + break; + case ARC_STATE_UNCACHED: + /* The header is still in L1, probably not for long */ + flags |= ARC_CACHED_IN_L1; + break; + default: + break; + } + } + if (HDR_HAS_L2HDR(hdr)) + flags |= ARC_CACHED_IN_L2; + + mutex_exit(hash_lock); + + return (flags); +} + /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index ca73f1a31408..d70ae1a031d5 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -23,7 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2022 by Pawel Jakub Dawidek - * Copyright (c) 2023, Klara Inc. + * Copyright (c) 2019, 2023, Klara Inc. 
*/ #include @@ -340,6 +340,16 @@ ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt->ddt_object[type][class], ddk); } +static void +ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) +{ + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os, + ddt->ddt_object[type][class]); +} + static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde, dmu_tx_t *tx) @@ -652,6 +662,28 @@ ddt_over_quota(spa_t *spa) return (B_FALSE); } +void +ddt_prefetch_all(spa_t *spa) +{ + /* + * Load all DDT entries for each type/class combination. This is + * intended to perform a prefetch on all such blocks. For the same + * reason that ddt_prefetch isn't locked, this is also not locked. + */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + ddt_object_prefetch_all(ddt, type, class); + } + } + } +} + ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 39b4edfc0f6a..82b682019ae9 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -248,3 +248,32 @@ ddt_get_pool_dedup_ratio(spa_t *spa) return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); } + +int +ddt_get_pool_dedup_cached(spa_t *spa, uint64_t *psize) +{ + uint64_t l1sz, l1tot, l2sz, l2tot; + int err = 0; + + l1tot = l2tot = 0; + *psize = 0; + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; + class++) { + err = dmu_object_cached_size(ddt->ddt_os, + ddt->ddt_object[type][class], &l1sz, &l2sz); + if (err != 0) + return (err); + l1tot 
+= l1sz; + l2tot += l2sz; + } + } + } + + *psize = l1tot + l2tot; + return (err); +} diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 741554de3c60..7ce7461a2b25 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -147,6 +147,12 @@ ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk) (void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS); } +static void +ddt_zap_prefetch_all(objset_t *os, uint64_t object) +{ + (void) zap_prefetch_object(os, object); +} + static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) @@ -231,6 +237,7 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_lookup, ddt_zap_contains, ddt_zap_prefetch, + ddt_zap_prefetch_all, ddt_zap_update, ddt_zap_remove, ddt_zap_walk, diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8b440aafba43..c23d0ccbdabe 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -26,7 +26,7 @@ * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. - * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, 2023, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek @@ -701,7 +701,7 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. If the range - * it too long, prefetch the first dmu_prefetch_max bytes as requested, while + * is too long, prefetch the first dmu_prefetch_max bytes as requested, while * for the rest only a higher level, also fitting within dmu_prefetch_max. 
It * should primarily help random reads, since for long sequential reads there is * a speculative prefetcher. @@ -777,6 +777,106 @@ dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset, rw_exit(&dn->dn_struct_rwlock); } +typedef struct { + kmutex_t dpa_lock; + kcondvar_t dpa_cv; + uint64_t dpa_pending_io; +} dmu_prefetch_arg_t; + +static void +dmu_prefetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t issued) +{ + (void) level; (void) blkid; (void)issued; + dmu_prefetch_arg_t *dpa = arg; + + ASSERT0(level); + + mutex_enter(&dpa->dpa_lock); + ASSERT3U(dpa->dpa_pending_io, >, 0); + if (--dpa->dpa_pending_io == 0) + cv_broadcast(&dpa->dpa_cv); + mutex_exit(&dpa->dpa_lock); +} + +static void +dmu_prefetch_wait_by_dnode(dnode_t *dn, uint64_t offset, uint64_t len) +{ + dmu_prefetch_arg_t dpa; + + mutex_init(&dpa.dpa_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dpa.dpa_cv, NULL, CV_DEFAULT, NULL); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + uint64_t start = dbuf_whichblock(dn, 0, offset); + uint64_t end = dbuf_whichblock(dn, 0, offset + len - 1) + 1; + dpa.dpa_pending_io = end - start; + + for (uint64_t blk = start; blk < end; blk++) { + (void) dbuf_prefetch_impl(dn, 0, blk, ZIO_PRIORITY_ASYNC_READ, + 0, dmu_prefetch_done, &dpa); + } + + rw_exit(&dn->dn_struct_rwlock); + + /* wait for prefetch L0 reads to finish */ + mutex_enter(&dpa.dpa_lock); + while (dpa.dpa_pending_io > 0) { + cv_wait(&dpa.dpa_cv, &dpa.dpa_lock); + + } + mutex_exit(&dpa.dpa_lock); + + mutex_destroy(&dpa.dpa_lock); + cv_destroy(&dpa.dpa_cv); +} + +/* + * Issue prefetch I/Os for the given L0 block range and wait for the I/O + * to complete. This does not enforce dmu_prefetch_max and will prefetch + * the entire range. The blocks are read from disk into the ARC but no + * decompression occurs (i.e., the dbuf cache is not required). 
+ */ +int +dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size) +{ + dnode_t *dn; + int err = 0; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return (err); + + /* + * Chunk the requests (16 indirects worth) so that we can be interrupted + */ + uint64_t chunksize; + if (dn->dn_indblkshift) { + uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1); + chunksize = (nbps * 16) << dn->dn_datablkshift; + } else { + chunksize = dn->dn_datablksz; + } + + while (size > 0) { + uint64_t mylen = MIN(size, chunksize); + + dmu_prefetch_wait_by_dnode(dn, offset, mylen); + + offset += mylen; + size -= mylen; + + if (issig()) { + err = SET_ERROR(EINTR); + break; + } + } + + dnode_rele(dn, FTAG); + + return (err); +} + /* * Issue prefetch I/Os for the given object's dnode. */ @@ -1444,6 +1544,114 @@ dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, } #endif /* _KERNEL */ +static void +dmu_cached_bps(spa_t *spa, blkptr_t *bps, uint_t nbps, + uint64_t *l1sz, uint64_t *l2sz) +{ + int cached_flags; + + if (bps == NULL) + return; + + for (size_t blk_off = 0; blk_off < nbps; blk_off++) { + blkptr_t *bp = &bps[blk_off]; + + if (BP_IS_HOLE(bp)) + continue; + + cached_flags = arc_cached(spa, bp); + if (cached_flags == 0) + continue; + + if ((cached_flags & (ARC_CACHED_IN_L1 | ARC_CACHED_IN_L2)) == + ARC_CACHED_IN_L2) + *l2sz += BP_GET_LSIZE(bp); + else + *l1sz += BP_GET_LSIZE(bp); + } +} + +/* + * Estimate DMU object cached size. + */ +int +dmu_object_cached_size(objset_t *os, uint64_t object, + uint64_t *l1sz, uint64_t *l2sz) +{ + dnode_t *dn; + dmu_object_info_t doi; + int err = 0; + + *l1sz = *l2sz = 0; + + if (dnode_hold(os, object, FTAG, &dn) != 0) + return (0); + + if (dn->dn_nlevels < 2) { + dnode_rele(dn, FTAG); + return (0); + } + + dmu_object_info_from_dnode(dn, &doi); + + for (uint64_t off = 0; off < doi.doi_max_offset; + off += dmu_prefetch_max) { + /* dbuf_read doesn't prefetch L1 blocks. 
*/ + dmu_prefetch_by_dnode(dn, 1, off, + dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ); + } + + /* + * Hold all valid L1 blocks, asking ARC the status of each BP + * contained in each such L1 block. + */ + uint_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1); + uint64_t l1blks = 1 + (dn->dn_maxblkid / nbps); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + for (uint64_t blk = 0; blk < l1blks; blk++) { + dmu_buf_impl_t *db = NULL; + + if (issig()) { + /* + * On interrupt, get out, and bubble up EINTR + */ + err = EINTR; + break; + } + + /* + * If we get an i/o error here, the L1 can't be read, + * and nothing under it could be cached, so we just + * continue. Ignoring the error from dbuf_hold_impl + * or from dbuf_read is then a reasonable choice. + */ + err = dbuf_hold_impl(dn, 1, blk, B_TRUE, B_FALSE, FTAG, &db); + if (err != 0) { + /* + * ignore error and continue + */ + err = 0; + continue; + } + + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err == 0) { + dmu_cached_bps(dmu_objset_spa(os), db->db.db_data, + nbps, l1sz, l2sz); + } + /* + * error may be ignored, and we continue + */ + err = 0; + dbuf_rele(db, FTAG); + } + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); + return (err); +} + /* * Allocate a loaned anonymous arc buffer. */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1095c0af37f0..66f766cf9b94 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -34,7 +34,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. - * Copyright (c) 2024, Klara Inc. + * Copyright (c) 2023, 2024, Klara Inc. 
*/ /* @@ -337,6 +337,55 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, nvlist_free(propval); } +static int +spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) +{ + zpool_prop_t prop = zpool_name_to_prop(propname); + zprop_source_t src = ZPROP_SRC_NONE; + uint64_t intval; + int err; + + /* + * NB: Not all property lookups via this API require + * the spa props lock, so they must explicitly grab it here. + */ + switch (prop) { + case ZPOOL_PROP_DEDUPCACHED: + err = ddt_get_pool_dedup_cached(spa, &intval); + if (err != 0) + return (SET_ERROR(err)); + break; + default: + return (SET_ERROR(EINVAL)); + } + + spa_prop_add_list(outnvl, prop, NULL, intval, src); + + return (0); +} + +int +spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, + nvlist_t **outnvl) +{ + int err = 0; + + if (props == NULL) + return (0); + + if (*outnvl == NULL) { + err = nvlist_alloc(outnvl, NV_UNIQUE_NAME, KM_SLEEP); + if (err) + return (err); + } + + for (unsigned int i = 0; i < n_props && err == 0; i++) { + err = spa_prop_add(spa, props[i], *outnvl); + } + + return (err); +} + /* * Add a user property (source=src, propname=propval) to an nvlist. 
*/ @@ -503,9 +552,11 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) dsl_pool_t *dp; int err; - err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); - if (err) - return (err); + if (*nvp == NULL) { + err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); + if (err) + return (err); + } dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index d806988af96d..0d6533b0e131 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -1072,6 +1072,21 @@ zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) return (err); } +int +zap_prefetch_object(objset_t *os, uint64_t zapobj) +{ + int error; + dmu_object_info_t doi; + + error = dmu_object_info(os, zapobj, &doi); + if (error == 0 && DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP) + error = SET_ERROR(EINVAL); + if (error == 0) + dmu_prefetch_wait(os, zapobj, 0, doi.doi_max_offset); + + return (error); +} + int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) @@ -1784,6 +1799,7 @@ EXPORT_SYMBOL(zap_lookup_uint64); EXPORT_SYMBOL(zap_contains); EXPORT_SYMBOL(zap_prefetch); EXPORT_SYMBOL(zap_prefetch_uint64); +EXPORT_SYMBOL(zap_prefetch_object); EXPORT_SYMBOL(zap_add); EXPORT_SYMBOL(zap_add_by_dnode); EXPORT_SYMBOL(zap_add_uint64); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 7b527eb75e83..897335dd4e4f 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -38,7 +38,7 @@ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. - * Copyright (c) 2019, 2021, 2024, Klara Inc. + * Copyright (c) 2019, 2021, 2023, 2024, Klara Inc. 
* Copyright (c) 2019, Allan Jude * Copyright 2024 Oxide Computer Company */ @@ -3009,34 +3009,51 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) return (error); } +/* + * innvl: { + * "get_props_names": [ "prop1", "prop2", ..., "propN" ] + * } + */ + +static const zfs_ioc_key_t zfs_keys_get_props[] = { + { ZPOOL_GET_PROPS_NAMES, DATA_TYPE_STRING_ARRAY, ZK_OPTIONAL }, +}; + static int -zfs_ioc_pool_get_props(zfs_cmd_t *zc) +zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { + nvlist_t *nvp = outnvl; spa_t *spa; + char **props = NULL; + unsigned int n_props = 0; int error; - nvlist_t *nvp = NULL; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, + &props, &n_props) != 0) { + props = NULL; + } + + if ((error = spa_open(pool, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(zc->zc_name)) != NULL) + if ((spa = spa_lookup(pool)) != NULL) { error = spa_prop_get(spa, &nvp); + if (error == 0 && props != NULL) + error = spa_prop_get_nvlist(spa, props, n_props, + &nvp); + } mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); + if (error == 0 && props != NULL) + error = spa_prop_get_nvlist(spa, props, n_props, &nvp); spa_close(spa, FTAG); } - if (error == 0 && zc->zc_nvlist_dst != 0) - error = put_nvlist(zc, nvp); - else - error = SET_ERROR(EFAULT); - - nvlist_free(nvp); return (error); } @@ -4031,6 +4048,52 @@ zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, return (spa_checkpoint_discard(poolname)); } +/* + * Loads specific types of data for the given pool + * + * innvl: { + * "prefetch_type" -> int32_t + * } + * + * outnvl: empty + */ +static const zfs_ioc_key_t zfs_keys_pool_prefetch[] = { + {ZPOOL_PREFETCH_TYPE, DATA_TYPE_INT32, 0}, +}; + +static int 
+zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + (void) outnvl; + + int error; + spa_t *spa; + int32_t type; + + /* + * Currently, only ZPOOL_PREFETCH_DDT is supported + */ + if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0 || + type != ZPOOL_PREFETCH_DDT) { + return (EINVAL); + } + + error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + hrtime_t start_time = gethrtime(); + + ddt_prefetch_all(spa); + + zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", spa->spa_name, + (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); + + spa_close(spa, FTAG); + + return (error); +} + /* * inputs: * zc_name name of dataset to destroy @@ -7283,6 +7346,12 @@ zfs_ioctl_init(void) zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); + zfs_ioctl_register("zpool_prefetch", + ZFS_IOC_POOL_PREFETCH, zfs_ioc_pool_prefetch, + zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, + zfs_keys_pool_prefetch, ARRAY_SIZE(zfs_keys_pool_prefetch)); + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, @@ -7328,6 +7397,11 @@ zfs_ioctl_init(void) POOL_CHECK_NONE, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); + zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, + zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, + zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, @@ -7383,8 +7457,6 @@ zfs_ioctl_init(void) zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); - zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, - zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); 
zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index d48b243eefed..51a38d70bc66 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -208,6 +208,10 @@ tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', 'zfs_create_verbose'] tags = ['functional', 'cli_root', 'zfs_create'] +[tests/functional/cli_root/zpool_prefetch] +tests = ['zpool_prefetch_001_pos'] +tags = ['functional', 'cli_root', 'zpool_prefetch'] + [tests/functional/cli_root/zfs_destroy] tests = ['zfs_clone_livelist_condense_and_disable', 'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup', diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index a55c86bd4d4f..c38b93817fc0 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -192,6 +192,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_import/zpool_import.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ + functional/cli_root/zpool_prefetch/zpool_prefetch.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.shlib \ @@ -1176,6 +1177,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_online/setup.ksh \ functional/cli_root/zpool_online/zpool_online_001_pos.ksh \ functional/cli_root/zpool_online/zpool_online_002_neg.ksh \ + functional/cli_root/zpool_prefetch/cleanup.ksh \ + functional/cli_root/zpool_prefetch/setup.ksh \ + functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \ functional/cli_root/zpool_remove/cleanup.ksh \ functional/cli_root/zpool_remove/setup.ksh \ functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \ diff --git 
a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh new file mode 100755 index 000000000000..79cd6e9f908e --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/cleanup.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh new file mode 100755 index 000000000000..6a9af3bc28c3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +. $STF_SUITE/include/libtest.shlib + +DISK=${DISKS%% *} + +default_setup $DISK diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg new file mode 100644 index 000000000000..70da58df084b --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch.cfg @@ -0,0 +1,26 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# + diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh new file mode 100755 index 000000000000..a96a38ff178a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh @@ -0,0 +1,128 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2019, 2023 by Klara Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# 'zpool prefetch -t ddt <pool>' can successfully load a pool's DDT on demand. +# +# STRATEGY: +# 1. Build up storage pool with deduplicated dataset. +# 2. Export the pool. +# 3. Import the pool, and use zpool prefetch -t ddt to load its table. +# 4. Verify the DDT was loaded successfully using ddt cache stats +# + +verify_runnable "both" + +log_assert "'zpool prefetch -t ddt <pool>' can successfully load the DDT for a pool." 
+ +function getddtstats +{ + typeset -n gds=$1 + typeset pool=$2 + + out=$(zpool status -DDp $pool | awk '/^ dedup: / {print $6 " " $9 " " $12}') + log_note "status -DDp output: ${out}" + + gds.ondisk=$(echo $out | cut -d" " -f1) + gds.incore=$(echo $out | cut -d" " -f2) + gds.cached=$(echo $out | cut -d" " -f3) + + # In case of missing data, reset to 0. This should normally be due + # to a pool without any DDT. + [ -z "${gds.ondisk}" ] && gds.ondisk="0" + [ -z "${gds.incore}" ] && gds.incore="0" + [ -z "${gds.cached}" ] && gds.cached="0" + + return true +} + +# Confirm that nothing happens on a standard pool config. +typeset -A before +log_must getddtstats before $TESTPOOL +log_note "before stats: ${before}" +log_must test "${before.ondisk}" -eq "0" +log_must test "${before.incore}" -eq "0" +log_must test "${before.cached}" -eq "0" +log_must zpool prefetch -t ddt $TESTPOOL + +# Build up the deduplicated dataset. This consists of creating enough files +# to generate a reasonable size DDT for testing purposes. + +DATASET=$TESTPOOL/ddt +log_must zfs create -o dedup=on $DATASET +MNTPOINT=$(get_prop mountpoint $TESTPOOL/ddt) + +log_note "Generating dataset ..." +typeset -i i=0 +while (( i < 16384 )); do + echo -n $i > $MNTPOINT/f.$i + + # Create some copies of the original mainly for the purpose of + # having duplicate entries. About half will have no copies, while + # the remainder will have an equal distribution of 1-4 copies, + # depending on the number put into the original. + typeset -i j + ((j = i % 8)) + while (( j < 4 )); do + cp $MNTPOINT/f.$i $MNTPOINT/f.$i.$j + ((j += 1)) + done + ((i += 1)) +done +log_note "Dataset generation completed." 
+ +typeset -A generated +log_must getddtstats generated $TESTPOOL +log_note "generated stats: ${generated}" +log_must test "${generated.ondisk}" -ge "1048576" +log_must test "${generated.incore}" -ge "1048576" +log_must test "${generated.cached}" -ge "1048576" +log_must zpool prefetch -t ddt $TESTPOOL + +# Do an export/import series to flush the DDT dataset cache. +typeset -A reimport +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must getddtstats reimport $TESTPOOL +log_note "reimport stats: ${reimport}" +log_must test "${reimport.ondisk}" -ge "1048576" +log_must test "${reimport.incore}" -ge "1048576" +# On reimport, only the first block or two should be cached. +log_must test "${reimport.cached}" -le "65536" + +# Finally, reload it and check again. +typeset -A reloaded +log_must zpool prefetch -t ddt $TESTPOOL +log_must getddtstats reloaded $TESTPOOL +log_note "reloaded stats: ${reloaded}" +log_must test "${reloaded.ondisk}" -ge "1048576" +log_must test "${reloaded.incore}" -ge "1048576" +log_must test "${reloaded.cached}" -eq "${reloaded.incore}" + +log_pass "'zpool prefetch -t ddt <pool>' success."