From 63f8a472afaec9aa4604d68376de08f94aaa9f65 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 17 Jan 2022 11:11:03 +0000 Subject: [PATCH 1/4] Shared L2ARC - Proof of Concept (I will give a talk on this PoC at the OpenZFS Developer Summit 2022.) The ARC dynamically shares DRAM capacity among all currently imported zpools. However, the L2ARC does not do the same for block capacity: the L2ARC vdevs of one zpool only cache buffers of that zpool. This can be undesirable on systems that host multiple zpools because it inhibits dynamic sharing of the cache device capacity which is desirable if the need for L2ARC varies among zpools over time, or if the set of zpools that are imported in the system varies over time. Shared L2ARC addresses this need by decoupling the L2ARC vdevs from the zpools that store actual data. The mechanism that we use is to place the L2ARC vdevs into a special zpool, and to adjust the L2ARC feed thread logic to use that special zpool's L2ARC vdevs for all zpools' buffers. High-level changes: * Reserve "NTNX-fsvm-local-l2arc" as a magic zpool name. We call this "the l2arc pool". All other pools are called "primary pools". * Make l2arc feed thread feed ARC buffers from any zpool to the l2arc zpool. (Before this patch, the l2arc feed thread would only feed ARC buffers to l2arc devices if they are for the same spa_t). * Change the locking to ensure that the l2arc zpool cannot be removed while there are ongoing reads initiated by arc_read on one of the primary pools. This is sufficient and retains correctness of the ARC because nothing about the fundamental operation of L2ARC changes. The only thing that changes is that the L2ARC data is stored on vdevs outside the primary pool. Proof Of Concept => Production ============================== This commit is a proof-of-concept. It works, it results in the desired performance improvement, and it's stable. But to make it production ready, more work needs to be done. (1) The design is based on a version of ZFS that does not support encryption nor Persisent L2ARC. I'm no expert in either of these features. Encryption might work just fine as long as the l2arc feed thread can access the encryption keys for l2arc_apply_transforms. But Persistent L2ARC definitely needs more design work (multiple L2ARC headers?). (2) Remove hard-coded magic name; use a property instead. Make it opt-in so that existing setups are not affected. Example: zpool create -o share_l2arc_vdevs=on my-l2arc-pool (3) Coexistence with non-shared L2ARC; also via property. Make it opt-in so that existing setups are not affected. Example: zpool set use_shared_l2arc=on my-data-pool Signed-off-by: Christian Schwarz --- cmd/zpool/zpool_main.c | 20 ++++ include/libzfs.h | 5 + include/sys/arc.h | 1 + include/sys/fs/zfs.h | 2 + lib/libzfs/libzfs_pool.c | 2 + module/zfs/arc.c | 250 ++++++++++++++++++++++++++++++++++----- module/zfs/dbuf.c | 23 +++- module/zfs/spa.c | 14 +++ 8 files changed, 289 insertions(+), 28 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 8a4f3dd16271..72b2bf85ba85 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2437,6 +2437,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) printf(gettext("invalid label")); break; + case VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC: + (void) printf(gettext("ignored in favor of cache devices in pool " NTNX_L2ARC_POOL_NAME)); + break; + default: (void) printf(gettext("corrupted data")); break; @@ -2596,6 +2600,10 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, (void) printf(gettext("invalid label")); break; + case VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC: + (void) printf(gettext("ignored in favor of cache devices in pool " NTNX_L2ARC_POOL_NAME)); + break; + default: (void) printf(gettext("corrupted data")); break; @@ -2896,6 +2904,11 @@ show_import(nvlist_t *config, boolean_t report_error) "\tExpect reduced performance.\n")); break; + case ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC: + (void) printf_color(ANSI_BOLD, + gettext(" status: The pool has L2ARC devices that will be ignored in favor of the shared L2ARC devices in pool " NTNX_L2ARC_POOL_NAME ".\n")); + break; + default: /* * No other status can be seen when importing pools. @@ -3035,6 +3048,9 @@ show_import(nvlist_t *config, boolean_t report_error) (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; + case ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC: + (void) printf(gettext(" action: The pool's L2ARC devices will be ignored in favor of the shared L2ARC devices in pool " NTNX_L2ARC_POOL_NAME ".\n")); + break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); @@ -8523,6 +8539,10 @@ status_callback(zpool_handle_t *zhp, void *data) } break; + case ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC: + (void) printf(gettext("status: The pool has L2ARC devices that will be ignored in favor of the shared L2ARC devices in pool " NTNX_L2ARC_POOL_NAME ".\n")); + break; + default: /* * The remaining errors can't actually be generated, yet. diff --git a/include/libzfs.h b/include/libzfs.h index df17873369ad..76bdcc16712f 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -419,6 +419,11 @@ typedef enum { ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ + /* + * Pool won't use the given L2ARC because this software version uses + * the Nutanix shared L2ARC. + */ + ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC, /* * Finally, the following indicates a healthy pool. diff --git a/include/sys/arc.h b/include/sys/arc.h index 532a2fe4bc03..fd2f0494e730 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -328,6 +328,7 @@ void arc_fini(void); * Level 2 ARC */ +#define NTNX_L2ARC_POOL_NAME "NTNX-fsvm-local-l2arc" void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index b3fecf489eb0..421994657d76 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -956,6 +956,7 @@ typedef enum vdev_state { VDEV_STATE_REMOVED, /* Explicitly removed from system */ VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ VDEV_STATE_FAULTED, /* External request to fault device */ + VDEV_STATE_UNUSED, VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ VDEV_STATE_HEALTHY /* Presumed good */ } vdev_state_t; @@ -985,6 +986,7 @@ typedef enum vdev_aux { VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ + VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC, VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index b9806dc30dac..2c183a7cd589 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -201,6 +201,8 @@ zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) return (gettext("DEGRADED")); case VDEV_STATE_HEALTHY: return (gettext("ONLINE")); + case VDEV_STATE_UNUSED: + return (gettext("UNUSED")); default: break; diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 9254d8b84902..f81fa8f1c80b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -810,6 +810,8 @@ typedef struct l2arc_read_callback { zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ + spa_t *l2rcb_primary_spa; /* original spa */ + boolean_t l2rcb_need_exit_scl_l2arc_primary_spa; } l2arc_read_callback_t; typedef struct l2arc_data_free { @@ -865,7 +867,7 @@ static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); -static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static boolean_t l2arc_write_eligible(arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, @@ -3980,7 +3982,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted) if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { - if (l2arc_write_eligible(hdr->b_spa, hdr)) { + if (l2arc_write_eligible(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); @@ -5919,6 +5921,24 @@ arc_read_done(zio_t *zio) arc_hdr_destroy(hdr); } +static boolean_t +arc_read_l2arc_scl_l2arc_tryenter(vdev_t *l2arc_vdev) +{ + ASSERT(l2arc_vdev); + + /* we only use l2arc vdevs from the dedicated L2ARC pool */ + ASSERT(l2arc_vdev->vdev_spa); + ASSERT(strcmp(spa_name(l2arc_vdev->vdev_spa), NTNX_L2ARC_POOL_NAME) == 0); + + return spa_config_tryenter(l2arc_vdev->vdev_spa, SCL_L2ARC, l2arc_vdev, RW_READER); +} + +static void +arc_read_l2arc_scl_l2arc_exit(vdev_t *l2arc_vdev) +{ + spa_config_exit(l2arc_vdev->vdev_spa, SCL_L2ARC, l2arc_vdev); +} + /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided @@ -6316,7 +6336,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || - !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) + !arc_read_l2arc_scl_l2arc_tryenter(vd)) vd = NULL; } @@ -6354,8 +6374,13 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, } /* Check if the spa even has l2 configured */ - const boolean_t spa_has_l2 = l2arc_ndev != 0 && - spa->spa_l2cache.sav_count > 0; + /* XXX shared l2arc: if this pool is supposed to use the shared + * l2arc vdevs, the value should be the result of a check whether + * shared l2arc vdevs are available. + * On spa's that aren't supposed to use shared l2arc, bring back + * the original check from OpenZFS commit 666aa69f. + */ + const boolean_t spa_has_l2 = B_TRUE; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* @@ -6374,16 +6399,15 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, abd_t *abd; uint64_t asize; - DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_l2_hits); - hdr->b_l2hdr.b_hits++; - cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; + cb->l2rcb_primary_spa = spa; + cb->l2rcb_need_exit_scl_l2arc_primary_spa = + B_FALSE; /* * When Compressed ARC is disabled, but the @@ -6409,6 +6433,38 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); + /* + * Make sure the primary spa is still around + * for the fallback read that will be initiated + * by l2arc_read_done if the read from the l2arc + * vdev fails and it's ARC_FLAG_NOWAIT. + * l2arc_read_done is responsible for releasing. + * We use `cb` as the tag because unlike `vd` + * or `rzio`, we control its lifetime. + */ + if (spa != vd->vdev_spa) { + /* + * See comment in l2arc_read_done on how + * we are careful to avoid recursive + * calls to this function, which would + * result in an error. + */ + int ok = spa_config_tryenter(spa, + SCL_L2ARC, cb, RW_READER); + if (!ok) { + arc_read_l2arc_scl_l2arc_exit(vd); + /* XXX separate stat & probe?? */ + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + goto l2arc_read_error; + } + } + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + hdr->b_l2hdr.b_hits++; + /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). @@ -6417,6 +6473,12 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, */ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); + /* + * NB: if pio != NULL and spa != vd->vdev_spa, + * then this zio tree spans across multiple spa's. + * A manual code audit at the time of writing showed + * that the zio code is fine with that. + */ rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, @@ -6444,6 +6506,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, if (zio_wait(rzio) == 0) goto out; +l2arc_read_error: /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); @@ -6453,11 +6516,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); - spa_config_exit(spa, SCL_L2ARC, vd); + arc_read_l2arc_scl_l2arc_exit(vd); } } else { if (vd != NULL) - spa_config_exit(spa, SCL_L2ARC, vd); + arc_read_l2arc_scl_l2arc_exit(vd); /* * Only a spa with l2 should contribute to l2 @@ -8414,7 +8477,7 @@ arc_fini(void) */ static boolean_t -l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) +l2arc_write_eligible(arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: @@ -8423,7 +8486,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ - if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || + if (HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); @@ -8864,9 +8927,31 @@ l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) } +static void +l2arc_fallback_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb = zio->io_private; + ASSERT3P(zio->io_spa, ==, cb->l2rcb_primary_spa); + if (cb->l2rcb_need_exit_scl_l2arc_primary_spa) { + /* + * Release SCL_L2ARC before arc_read_done to avoid recursively + * spa_config_tryenter if arc_read_done calls arc_read again. + */ + spa_config_exit(cb->l2rcb_primary_spa, SCL_L2ARC, cb); + } + arc_buf_hdr_t *hdr = cb->l2rcb_hdr; + zio->io_private = hdr; + kmem_free(cb, sizeof (l2arc_read_callback_t)); + arc_read_done(zio); +} + /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. + * + * We take care of cleaning up any L2ARC read resources before calling + * arc_read_done to minimize overhead if the arc_read callbacks issue arc_read + * themselves. Ideally, the compiler turns the arc_read_done call into a jump. */ static void l2arc_read_done(zio_t *zio) @@ -8882,11 +8967,39 @@ l2arc_read_done(zio_t *zio) ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); - spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + ASSERT(strcmp(spa_name(zio->io_spa), NTNX_L2ARC_POOL_NAME) == 0); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); + ASSERT(cb->l2rcb_primary_spa); + + boolean_t primay_spa_is_not_l2arc_spa = + zio->io_spa != cb->l2rcb_primary_spa; + /* Unblock L2ARC device removal. */ + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + if (primay_spa_is_not_l2arc_spa) { + /* We must keep holding SCL_L2ARC for l2rcb_primary_spa until + * after we have issued the fallback read, in order to prevent + * l2rcb_primary_spa from being exported (or worse, freed). + + * However, we must release it before we call the `done` + * callback that was passed to the original arc_read. The + * reason is that the callback could issue another arc_read, + * resulting in a recursive call to spa_config_tryenter(primary + * spa, SCL_L2ARC). That call would fail if we still held + * SCL_L2ARC, and the code + * would treat is like an L2ARC miss, whereas it's most likely + * a hit. + */ + /* XXX: we should actually use refcount_held to assert that `cb` + * still holds the config lock */ + ASSERT(spa_config_held(cb->l2rcb_primary_spa, SCL_L2ARC, + RW_READER)); + } else { + /* XXX: assert that `cb` is not holding SCL_L2ARC */ + } + hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); @@ -8939,10 +9052,27 @@ l2arc_read_done(zio_t *zio) */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); + /* + * Set the zio's bp to the one we're trying to fill from L2ARC. + * The bp contains the checksum, and both the arc_cksum_is_equal + * and the fallback zio_read depend on it being set correctly. + */ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; +#ifdef _KERNEL + if (0) { + zio_t *tmp = zio_unique_parent(zio); + cmn_err(CE_WARN, "zio->io_waiter=%p zio->io_spa=%s " + "zio_unique_parent(zio)=%p parent_is_godfather=%d " + "zio_unique_parent(zio)->io_spa=%s ", + zio->io_waiter, spa_name(zio->io_spa), + tmp, tmp ? !!(tmp->io_flags & ZIO_FLAG_GODFATHER) : -1, + tmp ? spa_name(tmp->io_spa) : ""); + } +#endif + valid_cksum = arc_cksum_is_equal(hdr, zio); /* @@ -8956,6 +9086,10 @@ l2arc_read_done(zio_t *zio) if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); + if (primay_spa_is_not_l2arc_spa) { + spa_config_exit(cb->l2rcb_primary_spa, SCL_L2ARC, cb); + } + kmem_free(cb, sizeof (l2arc_read_callback_t)); zio->io_private = hdr; arc_read_done(zio); } else { @@ -8966,6 +9100,10 @@ l2arc_read_done(zio_t *zio) if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { + /* + * IO error ensures that arc_read does the fallback + * read for the case where zio->io_waiter != NULL. + */ zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) @@ -8973,20 +9111,62 @@ l2arc_read_done(zio_t *zio) /* * If there's no waiter, issue an async i/o to the primary - * storage now. If there *is* a waiter, the caller must - * issue the i/o in a context where it's OK to block. + * storage now. If there *is* a waiter, the caller must issue + * the i/o in a context where it's OK to block. + * + * The above in more technical terms: If the arc_read was + * ARC_FLAG_NOWAIT, we issued the zio_read_phys to read from + * the l2arc vdev using zio_nowait and immediately returned + * from arc_read. But arc_read is supposed to transparently + * fall back to the primary storage. So, for the + * ARC_FLAG_NOWAIT case, we do that here. However, if there + * *is* a waiter, it's the ARC_FLAG_WAIT case in which case + * arc_read does the fallback read itself. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; - ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + /* + * The `pio` is the one that was passed to arc_read. + * We added `zio` as its (only) child. + * But arc_read can also be called with pio==NULL. + * In that case, zio_nowait sets zio's parent to + * spa_async_zio_root (aka the grandfather zio). + */ + if (pio) { + ASSERT3S(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + /* + * If we violated the follwoing assertion, we + * would be reading from the wrong spa in the + * fallback zio_read below. The caller of + * arc_read would likely observe an EIO due to + * invalid checksum. It depends on the caller + * what happens then. If the arc_read was for + * ZFS metadata it's likely that they have a + * VERIFY somewhere which would result in a + * kernel panic higher up the stack. If it's + * for user data, it's likely that we bubble up + * the error to userspace. + * + * Either way, the resulting behavior is + * misleading and would likely spark + * misdirected investigations into data + * integrity issues. So, just take the hit and + * do a VERIFY here to get a precise panic in + * case we have an implementation issue. + */ + VERIFY3P(pio->io_spa, ==, cb->l2rcb_primary_spa); + } - zio = zio_read(pio, zio->io_spa, zio->io_bp, - abd, zio->io_size, arc_read_done, - hdr, zio->io_priority, cb->l2rcb_flags, - &cb->l2rcb_zb); + /* NB: io_bp has been copied from cb->l2rcb_bp above */ + cb->l2rcb_need_exit_scl_l2arc_primary_spa = + primay_spa_is_not_l2arc_spa; + zio_t *fallback_read = zio_read(pio, + cb->l2rcb_primary_spa, zio->io_bp, + abd, zio->io_size, l2arc_fallback_read_done, cb, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update @@ -8998,13 +9178,22 @@ l2arc_read_done(zio_t *zio) acb->acb_zio_head = zio; mutex_exit(hash_lock); - zio_nowait(zio); + zio_nowait(fallback_read); + /* + * We release the SCL_L2ARC on l2rcb_primary_spa in + * l2arc_fallback_read_done. See comment in there, at + * at the beginning of the function, for why we can't + * release it here. + */ } else { mutex_exit(hash_lock); + if (primay_spa_is_not_l2arc_spa) { + spa_config_exit(cb->l2rcb_primary_spa, SCL_L2ARC, cb); + } + kmem_free(cb, sizeof (l2arc_read_callback_t)); + /* arc_read will do the fallback read */ } } - - kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* @@ -9252,7 +9441,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) } if (!HDR_HAS_L1HDR(hdr)) { - ASSERT(!HDR_L2_READING(hdr)); + ASSERT(!HDR_L2_READING(hdr)); // why does this hold? /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() @@ -9478,7 +9667,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) boolean_t full; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; - uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); @@ -9548,7 +9736,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (!l2arc_write_eligible(guid, hdr)) { + if (!l2arc_write_eligible(hdr)) { mutex_exit(hash_lock); continue; } @@ -9789,6 +9977,12 @@ l2arc_feed_thread(void *unused) spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); + /* + * We only allow add l2arc devices in pool NTNX_L2ARC_POOL_NAME + * to l2arc_dev_list. + */ + ASSERT0(strcmp(spa_name(spa), NTNX_L2ARC_POOL_NAME)); + /* * If the pool is read-only then force the feed thread to * sleep a little longer. @@ -9948,6 +10142,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) ASSERT(!l2arc_vdev_present(vd)); + VERIFY(strcmp(spa_name(spa), NTNX_L2ARC_POOL_NAME) == 0); + /* * Create a new l2arc device entry. */ diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 85de95baf61a..35fdc29ec1c7 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3299,7 +3299,28 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } - ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + /* XXX: with system-wide l2arc, this ASSERT is incorrect. + * After conversation with Paul Dagnelie, who added it, + * we concluded that it's not necessary. + * The stack trace was : + * VERIFY3(zio->io_spa == dpa->dpa_spa) failed + * (ffff8fc81ba6c000== ffff8fc9c4aa8000) + * ^ l2arc spa ^primary spa + * + * dump_stack+0x6d/0x8b + * spl_dumpstack+0x29/0x2b [spl] + * spl_panic+0xd1/0xd3 [spl] + * dbuf_prefetch_indirect_done+0xdd/0x420 [zfs] + * arc_read_done+0x3e6/0x990 [zfs] + * l2arc_read_done+0x5a9/0x9a0 [zfs] + * zio_done+0x54d/0x1680 [zfs] + * zio_execute+0xe9/0x2e0 [zfs] + * taskq_thread+0x2ec/0x650 [spl] + * kthread+0x128/0x140 + * ret_from_fork+0x35/0x40 + * + * ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + */ dpa->dpa_dnode = NULL; } else if (dpa->dpa_dnode != NULL) { diff --git a/module/zfs/spa.c b/module/zfs/spa.c index cc367745e486..9c0a908819e1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1977,6 +1977,20 @@ spa_load_l2cache(spa_t *spa) (void) vdev_validate_aux(vd); + if (!vdev_is_dead(vd) && + strcmp(spa_name(spa), NTNX_L2ARC_POOL_NAME) != 0) { + cmn_err(CE_WARN, "pool '%s': l2arc devices outside of pool " NTNX_L2ARC_POOL_NAME " will not be used and marked offline.", spa_name(spa)); + /* mark device as dead */ + vdev_set_state(vd, B_TRUE, VDEV_STATE_UNUSED, + VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC); + /* + * Ensure that vdev_is_dead() returns true, so + * we don't l2arc_add_vdev below. The whole + * system-wide L2ARC code relies on this. + */ + VERIFY(vdev_is_dead(vd)); + } + if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); From e6504443f9532e3ff39b687b4a73fd8b3d59e13e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 24 Oct 2022 12:52:46 +0000 Subject: [PATCH 2/4] remove unused ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC --- cmd/zpool/zpool_main.c | 12 ------------ include/libzfs.h | 5 ----- 2 files changed, 17 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 72b2bf85ba85..84767f65b4b5 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2904,11 +2904,6 @@ show_import(nvlist_t *config, boolean_t report_error) "\tExpect reduced performance.\n")); break; - case ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC: - (void) printf_color(ANSI_BOLD, - gettext(" status: The pool has L2ARC devices that will be ignored in favor of the shared L2ARC devices in pool " NTNX_L2ARC_POOL_NAME ".\n")); - break; - default: /* * No other status can be seen when importing pools. @@ -3048,9 +3043,6 @@ show_import(nvlist_t *config, boolean_t report_error) (void) printf(gettext(" action: Set a unique system " "hostid with the zgenhostid(8) command.\n")); break; - case ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC: - (void) printf(gettext(" action: The pool's L2ARC devices will be ignored in favor of the shared L2ARC devices in pool " NTNX_L2ARC_POOL_NAME ".\n")); - break; default: (void) printf(gettext(" action: The pool cannot be " "imported due to damaged devices or data.\n")); @@ -8539,10 +8531,6 @@ status_callback(zpool_handle_t *zhp, void *data) } break; - case ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC: - (void) printf(gettext("status: The pool has L2ARC devices that will be ignored in favor of the shared L2ARC devices in pool " NTNX_L2ARC_POOL_NAME ".\n")); - break; - default: /* * The remaining errors can't actually be generated, yet. diff --git a/include/libzfs.h b/include/libzfs.h index 76bdcc16712f..df17873369ad 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -419,11 +419,6 @@ typedef enum { ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ ZPOOL_STATUS_COMPATIBILITY_ERR, /* bad 'compatibility' property */ ZPOOL_STATUS_INCOMPATIBLE_FEAT, /* feature set outside compatibility */ - /* - * Pool won't use the given L2ARC because this software version uses - * the Nutanix shared L2ARC. - */ - ZPOOL_STATUS_USES_NTNX_SHARED_L2ARC, /* * Finally, the following indicates a healthy pool. From 95e0c3a5513f84851dc7df2bb6b2c085b52d9bd2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 24 Oct 2022 12:54:14 +0000 Subject: [PATCH 3/4] remove references to Nutanix from identifiers Now the only reference left is the special pool name. That whole concept is going to replaced by zpool properties in the future. --- cmd/zpool/zpool_main.c | 8 ++++---- include/sys/arc.h | 2 +- include/sys/fs/zfs.h | 2 +- module/zfs/arc.c | 8 ++++---- module/zfs/spa.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 84767f65b4b5..e3e7617d4c44 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2437,8 +2437,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) printf(gettext("invalid label")); break; - case VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC: - (void) printf(gettext("ignored in favor of cache devices in pool " NTNX_L2ARC_POOL_NAME)); + case VDEV_AUX_POOL_USES_SHARED_L2ARC: + (void) printf(gettext("ignored in favor of cache devices in pool " ZFS_SHARED_L2ARC_POOL_NAME)); break; default: @@ -2600,8 +2600,8 @@ print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, (void) printf(gettext("invalid label")); break; - case VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC: - (void) printf(gettext("ignored in favor of cache devices in pool " NTNX_L2ARC_POOL_NAME)); + case VDEV_AUX_POOL_USES_SHARED_L2ARC: + (void) printf(gettext("ignored in favor of cache devices in pool " ZFS_SHARED_L2ARC_POOL_NAME)); break; default: diff --git a/include/sys/arc.h b/include/sys/arc.h index fd2f0494e730..9ded65f7d23e 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -328,7 +328,7 @@ void arc_fini(void); * Level 2 ARC */ -#define NTNX_L2ARC_POOL_NAME "NTNX-fsvm-local-l2arc" +#define ZFS_SHARED_L2ARC_POOL_NAME "NTNX-fsvm-local-l2arc" void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 421994657d76..cc58d02d99fd 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -986,7 +986,7 @@ typedef enum vdev_aux { VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ - VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC, + VDEV_AUX_POOL_USES_SHARED_L2ARC, VDEV_AUX_ACTIVE, /* vdev active on a different host */ VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index f81fa8f1c80b..da2a48652b53 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5928,7 +5928,7 @@ arc_read_l2arc_scl_l2arc_tryenter(vdev_t *l2arc_vdev) /* we only use l2arc vdevs from the dedicated L2ARC pool */ ASSERT(l2arc_vdev->vdev_spa); - ASSERT(strcmp(spa_name(l2arc_vdev->vdev_spa), NTNX_L2ARC_POOL_NAME) == 0); + ASSERT(strcmp(spa_name(l2arc_vdev->vdev_spa), ZFS_SHARED_L2ARC_POOL_NAME) == 0); return spa_config_tryenter(l2arc_vdev->vdev_spa, SCL_L2ARC, l2arc_vdev, RW_READER); } @@ -8967,7 +8967,7 @@ l2arc_read_done(zio_t *zio) ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); - ASSERT(strcmp(spa_name(zio->io_spa), NTNX_L2ARC_POOL_NAME) == 0); + ASSERT(strcmp(spa_name(zio->io_spa), ZFS_SHARED_L2ARC_POOL_NAME) == 0); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; @@ -9981,7 +9981,7 @@ l2arc_feed_thread(void *unused) * We only allow add l2arc devices in pool NTNX_L2ARC_POOL_NAME * to l2arc_dev_list. */ - ASSERT0(strcmp(spa_name(spa), NTNX_L2ARC_POOL_NAME)); + ASSERT0(strcmp(spa_name(spa), ZFS_SHARED_L2ARC_POOL_NAME)); /* * If the pool is read-only then force the feed thread to @@ -10142,7 +10142,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) ASSERT(!l2arc_vdev_present(vd)); - VERIFY(strcmp(spa_name(spa), NTNX_L2ARC_POOL_NAME) == 0); + VERIFY(strcmp(spa_name(spa), ZFS_SHARED_L2ARC_POOL_NAME) == 0); /* * Create a new l2arc device entry. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9c0a908819e1..5397f7a88a8a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1978,11 +1978,11 @@ spa_load_l2cache(spa_t *spa) (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd) && - strcmp(spa_name(spa), NTNX_L2ARC_POOL_NAME) != 0) { - cmn_err(CE_WARN, "pool '%s': l2arc devices outside of pool " NTNX_L2ARC_POOL_NAME " will not be used and marked offline.", spa_name(spa)); + strcmp(spa_name(spa), ZFS_SHARED_L2ARC_POOL_NAME) != 0) { + cmn_err(CE_WARN, "pool '%s': l2arc devices outside of pool " ZFS_SHARED_L2ARC_POOL_NAME " will not be used and marked offline.", spa_name(spa)); /* mark device as dead */ vdev_set_state(vd, B_TRUE, VDEV_STATE_UNUSED, - VDEV_AUX_POOL_USES_NTNX_SHARED_L2ARC); + VDEV_AUX_POOL_USES_SHARED_L2ARC); /* * Ensure that vdev_is_dead() returns true, so * we don't l2arc_add_vdev below. The whole From f004d01ce452aacc6eabf75d7ef7ab1768e8ee67 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 24 Oct 2022 12:56:52 +0000 Subject: [PATCH 4/4] consistent naming: s/system-wide L2ARC/shared L2ARC/ --- module/zfs/dbuf.c | 2 +- module/zfs/spa.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 35fdc29ec1c7..ba4997365f6b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3299,7 +3299,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); } - /* XXX: with system-wide l2arc, this ASSERT is incorrect. + /* XXX: with shared l2arc, this ASSERT is incorrect. * After conversation with Paul Dagnelie, who added it, * we concluded that it's not necessary. * The stack trace was : diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 5397f7a88a8a..2408eeac6b3e 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1986,7 +1986,7 @@ spa_load_l2cache(spa_t *spa) /* * Ensure that vdev_is_dead() returns true, so * we don't l2arc_add_vdev below. The whole - * system-wide L2ARC code relies on this. + * Shared L2ARC code relies on this. */ VERIFY(vdev_is_dead(vd)); }