diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 346fdbdf540f..81f5b1c27333 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1922,14 +1922,16 @@ dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, blkptr_t blk; int p; - for (p = 0; p < ddlwe->ddlwe_nphys; p++) { - const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; - if (ddp->ddp_phys_birth == 0) + for (p = 0; p < DDT_NPHYS(ddt); p++) { + const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu phys %d %s\n", - (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, + (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), p, blkbuf); } } @@ -5798,9 +5800,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, if (dde == NULL) { refcnt = 0; } else { - ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp); - ddt_phys_decref(ddp); - refcnt = ddp->ddp_refcnt; + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + refcnt = ddt_phys_decref(dde->dde_phys, v); if (ddt_phys_total_refcnt(ddt, dde) == 0) ddt_remove(ddt, dde); } @@ -6139,18 +6140,21 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) VERIFY(ddt); uint64_t refcnt = 0; - for (int p = 0; p < ddlwe.ddlwe_nphys; p++) { - ddt_phys_t *ddp = &ddlwe.ddlwe_phys[p]; - if (ddp->ddp_phys_birth == 0) + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_univ_phys_t *ddp = &ddlwe.ddlwe_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0) continue; - refcnt += ddp->ddp_refcnt; + refcnt += ddt_phys_refcnt(ddp, v); + ddt_bp_create(ddb.ddb_checksum, - &ddlwe.ddlwe_key, ddp, &blk); + &ddlwe.ddlwe_key, ddp, v, &blk); if (DDT_PHYS_IS_DITTO(ddt, p)) { zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); } else { - zcb->zcb_dedup_asize += - 
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) * + (ddt_phys_refcnt(ddp, v) - 1); zcb->zcb_dedup_blocks++; } } diff --git a/include/sys/ddt.h b/include/sys/ddt.h index af1f8444090e..a54fa71999ca 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -42,8 +42,8 @@ struct abd; /* * DDT-wide feature flags. These are set in ddt_flags by ddt_configure(). */ -/* No flags yet. */ -#define DDT_FLAG_MASK (0) +#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */ +#define DDT_FLAG_MASK (DDT_FLAG_FLAT) /* * DDT on-disk storage object types. Each one corresponds to specific @@ -126,21 +126,80 @@ typedef struct { * characteristics of the stored block, such as its location on disk (DVAs), * birth txg and ref count. * - * Note that an entry has an array of four ddt_phys_t, one for each number of - * DVAs (copies= property) and another for additional "ditto" copies. Most - * users of ddt_phys_t will handle indexing into or counting the phys they - * want. + * The "traditional" entry has an array of four, one for each number of DVAs + * (copies= property) and another for additional "ditto" copies. Users of the + * traditional struct will specify the variant (index) of the one they want. + * + * The newer "flat" entry has only a single form that is specified using the + * DDT_PHYS_FLAT variant. + * + * Since the value size varies, use one of the size macros when interfacing + * with the ddt zap. */ -typedef struct { - dva_t ddp_dva[SPA_DVAS_PER_BP]; - uint64_t ddp_refcnt; - uint64_t ddp_phys_birth; -} ddt_phys_t; -#define DDT_PHYS_MAX (4) -#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX) -#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0) -#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p)) +#define DDT_PHYS_MAX (4) + +/* + * Note - this can be used in a flexible array and allocated for + * a specific size (ddp_trad or ddp_flat). So be careful not to + * copy using "=" assignment but instead use ddt_phys_copy(). 
+ */ +typedef union { + /* + * Traditional physical payload value for DDT zap (256 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; + } ddp_trad[DDT_PHYS_MAX]; + + /* + * Flat physical payload value for DDT zap (72 bytes) + */ + struct { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; /* txg based from BP */ + uint64_t ddp_class_start; /* in realtime seconds */ + } ddp_flat; +} ddt_univ_phys_t; + +/* + * This enum denotes which variant of a ddt_univ_phys_t to target. For + * a traditional DDT entry, it represents the indexes into the ddp_trad + * array. Any consumer of a ddt_univ_phys_t needs to know which variant + * is being targeted. + * + * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, + * we maintain the ability to free existing dedup-ditto blocks. + */ + +typedef enum { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_FLAT = 4, + DDT_PHYS_NONE = 5 +} ddt_phys_variant_t; + +#define DDT_PHYS_VARIANT(ddt, p) \ + (ASSERT((p) < DDT_PHYS_NONE), \ + ((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p))) + +#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad) +#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat) + +#define _DDT_PHYS_SWITCH(ddt, flat, trad) \ + (((ddt)->ddt_flags & DDT_FLAG_FLAT) ? 
(flat) : (trad)) + +#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \ + DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE) + +#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX) +#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p) +#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, ((p) == 0)) /* * A "live" entry, holding changes to an entry made this txg, and other data to @@ -159,6 +218,9 @@ typedef struct { /* copy of data after a repair read, to be rewritten */ abd_t *dde_repair_abd; + /* original phys contents before update, for error handling */ + ddt_univ_phys_t dde_orig_phys; + /* in-flight update IOs */ zio_t *dde_lead_zio[DDT_PHYS_MAX]; } ddt_entry_io_t; @@ -178,7 +240,7 @@ typedef struct { ddt_entry_io_t *dde_io; /* IO support, when required */ - ddt_phys_t dde_phys[]; /* physical data */ + ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */ } ddt_entry_t; /* @@ -189,8 +251,7 @@ typedef struct { ddt_key_t ddlwe_key; ddt_type_t ddlwe_type; ddt_class_t ddlwe_class; - uint8_t ddlwe_nphys; - ddt_phys_t ddlwe_phys[DDT_PHYS_MAX]; + ddt_univ_phys_t ddlwe_phys; } ddt_lightweight_entry_t; /* @@ -236,17 +297,26 @@ typedef struct { uint64_t ddb_cursor; } ddt_bookmark_t; -extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, - uint64_t txg); +extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, - const ddt_phys_t *ddp, blkptr_t *bp); + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp); -extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); -extern void ddt_phys_clear(ddt_phys_t *ddp); -extern void ddt_phys_addref(ddt_phys_t *ddp); -extern void ddt_phys_decref(ddt_phys_t *ddp); -extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, +extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp); +extern void 
ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v); +extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v); +extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, + const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, + ddt_phys_variant_t v); +extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index e88a046ab8ae..c4e681fb117b 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -42,14 +42,12 @@ extern "C" { #define DDT_DIR_FLAGS "flags" /* Fill a lightweight entry from a live entry. 
*/ -#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ - memset((ddlwe), 0, sizeof (*ddlwe)); \ - (ddlwe)->ddlwe_key = (dde)->dde_key; \ - (ddlwe)->ddlwe_type = (dde)->dde_type; \ - (ddlwe)->ddlwe_class = (dde)->dde_class; \ - (ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \ - for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \ - (ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \ +#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \ + memset((ddlwe), 0, sizeof (*ddlwe)); \ + (ddlwe)->ddlwe_key = (dde)->dde_key; \ + (ddlwe)->ddlwe_type = (dde)->dde_type; \ + (ddlwe)->ddlwe_class = (dde)->dde_class; \ + memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \ } while (0) /* @@ -61,19 +59,19 @@ typedef struct { boolean_t prehash); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); int (*ddt_op_lookup)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + const ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_contains)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, const ddt_key_t *ddk); void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object); int (*ddt_op_update)(objset_t *os, uint64_t object, - const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, + const ddt_key_t *ddk, const void *phys, size_t psize, dmu_tx_t *tx); int (*ddt_op_remove)(objset_t *os, uint64_t object, const ddt_key_t *ddk, dmu_tx_t *tx); int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, - ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + ddt_key_t *ddk, void *phys, size_t psize); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index b91d7f4be88f..63734dbc176f 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); 
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx); void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, diff --git a/include/sys/spa.h b/include/sys/spa.h index 3998f5a6de73..a70912335b16 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -572,7 +572,7 @@ typedef struct blkptr { #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) -#define BP_ZERO(bp) \ +#define BP_ZERO_DVAS(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ @@ -580,6 +580,11 @@ typedef struct blkptr { (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ +} + +#define BP_ZERO(bp) \ +{ \ + BP_ZERO_DVAS(bp); \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 4699ad0fb3e1..08d1f6bf2cef 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -75,12 +75,19 @@ * fill the BP with the DVAs from the entry, increment the refcount and cause * the write IO to return immediately. * - * Each ddt_phys_t slot in the entry represents a separate dedup block for the - * same content/checksum. The slot is selected based on the zp_copies parameter - * the block is written with, that is, the number of DVAs in the block. The - * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" - * feature. These are no longer written, and will be freed if encountered on - * old pools. + * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup + * block for the same content/checksum. 
The slot is selected based on the + * zp_copies parameter the block is written with, that is, the number of DVAs + * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for the + * now-removed "dedupditto" feature. These are no longer written, and will be + * freed if encountered on old pools. + * + * If the "fast_dedup" feature is enabled, new dedup tables will be created + * with the "flat phys" option. In this mode, there is only one phys + * slot. If a write is issued for an entry that exists, but has fewer DVAs, + * then only as many new DVAs are allocated and written to make up the + * shortfall. The existing entry is then extended (ddt_phys_extend()) with the + * new DVAs. * * ## Lifetime of an entry * @@ -130,6 +137,16 @@ * from the alternate block. If the block is actually damaged, this will invoke * the pool's "self-healing" mechanism, and repair the block. * + * If the "fast_dedup" feature is enabled, the "flat phys" option will be in + * use, so there is only ever one phys slot. The repair process will + * still happen in this case, though it is unlikely to succeed as there will + * usually be no other equivalent blocks to fall back on (though there might + * be, if this was an early version of a dedup'd block that has since been + * extended). + * + * Note that this repair mechanism is in addition to and separate from the + * regular OpenZFS scrub and self-healing mechanisms. 
+ * * ## Scanning (scrub/resilver) * * If dedup is active, the scrub machinery will walk the dedup table first, and @@ -162,10 +179,15 @@ c == ZIO_CHECKSUM_BLAKE3) static kmem_cache_t *ddt_cache; -static kmem_cache_t *ddt_entry_cache; -#define DDT_ENTRY_SIZE \ - (sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX) +static kmem_cache_t *ddt_entry_flat_cache; +static kmem_cache_t *ddt_entry_trad_cache; + +#define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE) +#define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE) + +#define DDT_ENTRY_SIZE(ddt) \ + _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE) /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. @@ -195,7 +217,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { */ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, - [DDT_VERSION_FDT] = 0, + [DDT_VERSION_FDT] = DDT_FLAG_FLAT, }; /* Dummy version to signal that configure is still necessary */ @@ -346,7 +368,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, - dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt))); + dde->dde_phys, DDT_PHYS_SIZE(ddt))); } static int @@ -388,8 +410,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, - sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx)); + ddt->ddt_object[type][class], &dde->dde_key, + dde->dde_phys, DDT_PHYS_SIZE(ddt), tx)); } static int @@ -410,11 +432,10 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, - ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys)); + &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); if (error == 
0) { ddlwe->ddlwe_type = type; ddlwe->ddlwe_class = class; - ddlwe->ddlwe_nphys = DDT_NPHYS(ddt); return (0); } return (error); @@ -451,13 +472,25 @@ ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, } void -ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + blkptr_t *bp, uint64_t txg) { ASSERT3U(txg, !=, 0); + ASSERT3U(v, <, DDT_PHYS_NONE); + uint64_t phys_birth; + const dva_t *dvap; + + if (v == DDT_PHYS_FLAT) { + phys_birth = ddp->ddp_flat.ddp_phys_birth; + dvap = ddp->ddp_flat.ddp_dva; + } else { + phys_birth = ddp->ddp_trad[v].ddp_phys_birth; + dvap = ddp->ddp_trad[v].ddp_dva; + } for (int d = 0; d < SPA_DVAS_PER_BP; d++) - bp->blk_dva[d] = ddp->ddp_dva[d]; - BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); + bp->blk_dva[d] = dvap[d]; + BP_SET_BIRTH(bp, txg, phys_birth); } /* @@ -465,13 +498,13 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) * will be missing the salt / IV required to do a full decrypting read. */ void -ddt_bp_create(enum zio_checksum checksum, - const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) - ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); bp->blk_cksum = ddk->ddk_cksum; @@ -502,42 +535,101 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) } void -ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) { - ASSERT0(ddp->ddp_phys_birth); + ASSERT3U(v, <, DDT_PHYS_NONE); + int bp_ndvas = BP_GET_NDVAS(bp); + int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? + SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + int s = 0, d = 0; + while (s < bp_ndvas && d < ddp_max_dvas) { + if (DVA_IS_VALID(&dvas[d])) { + d++; + continue; + } + dvas[d] = bp->blk_dva[s]; + s++; d++; + } - for (int d = 0; d < SPA_DVAS_PER_BP; d++) - ddp->ddp_dva[d] = bp->blk_dva[d]; - ddp->ddp_phys_birth = BP_GET_BIRTH(bp); + /* + * If the caller offered us more DVAs than we can fit, something has + * gone wrong in their accounting. zio_ddt_write() should never ask for + * more than we need. + */ + ASSERT3U(s, ==, bp_ndvas); + + if (BP_IS_ENCRYPTED(bp)) + dvas[2] = bp->blk_dva[2]; + + if (ddt_phys_birth(ddp, v) == 0) { + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_phys_birth = BP_GET_BIRTH(bp); + else + ddp->ddp_trad[v].ddp_phys_birth = BP_GET_BIRTH(bp); + } } void -ddt_phys_clear(ddt_phys_t *ddp) +ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, + ddt_phys_variant_t v) { - memset(ddp, 0, sizeof (*ddp)); + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + dst->ddp_flat = src->ddp_flat; + else + dst->ddp_trad[v] = src->ddp_trad[v]; } void -ddt_phys_addref(ddt_phys_t *ddp) +ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - ddp->ddp_refcnt++; + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); + else + memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } void -ddt_phys_decref(ddt_phys_t *ddp) +ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { - if (ddp) { - ASSERT3U(ddp->ddp_refcnt, >, 0); - ddp->ddp_refcnt--; - } + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + ddp->ddp_flat.ddp_refcnt++; + else + ddp->ddp_trad[v].ddp_refcnt++; +} + +uint64_t +ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + uint64_t *refcntp; + + if (v == DDT_PHYS_FLAT) + refcntp = &ddp->ddp_flat.ddp_refcnt; + else + refcntp = &ddp->ddp_trad[v].ddp_refcnt; + + ASSERT3U(*refcntp, >, 0); + (*refcntp)--; 
+ return (*refcntp); } static void -ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, + ddt_phys_variant_t v, uint64_t txg) { blkptr_t blk; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); /* * We clear the dedup bit so that zio_free() will actually free the @@ -545,20 +637,67 @@ ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) */ BP_SET_DEDUP(&blk, 0); - ddt_phys_clear(ddp); + ddt_phys_clear(ddp, v); zio_free(ddt->ddt_spa, txg, &blk); } -ddt_phys_t * +uint64_t +ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_phys_birth); + else + return (ddp->ddp_trad[v].ddp_phys_birth); +} + +int +ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, + boolean_t encrypted) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + const dva_t *dvas = (v == DDT_PHYS_FLAT) ? 
+ ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; + + return (DVA_IS_VALID(&dvas[0]) + + DVA_IS_VALID(&dvas[1]) + + DVA_IS_VALID(&dvas[2]) * !encrypted); +} + +ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = (ddt_phys_t *)&dde->dde_phys[p]; - if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && - BP_GET_BIRTH(bp) == ddp->ddp_phys_birth) - return (ddp); + const ddt_univ_phys_t *ddp = dde->dde_phys; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && + BP_GET_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { + return (DDT_PHYS_FLAT); + } + } else /* traditional phys */ { + for (int p = 0; p < DDT_PHYS_MAX; p++) { + if (DVA_EQUAL(BP_IDENTITY(bp), + &ddp->ddp_trad[p].ddp_dva[0]) && + BP_GET_BIRTH(bp) == + ddp->ddp_trad[p].ddp_phys_birth) { + return (p); + } + } } - return (NULL); + return (DDT_PHYS_NONE); +} + +uint64_t +ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) +{ + ASSERT3U(v, <, DDT_PHYS_NONE); + + if (v == DDT_PHYS_FLAT) + return (ddp->ddp_flat.ddp_refcnt); + else + return (ddp->ddp_trad[v].ddp_refcnt); } uint64_t @@ -566,10 +705,11 @@ ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_entry_t *dde) { uint64_t refcnt = 0; - for (int p = 0; p < DDT_NPHYS(ddt); p++) { - if (DDT_PHYS_IS_DITTO(ddt, p)) - continue; - refcnt += dde->dde_phys[p].ddp_refcnt; + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + refcnt = dde->dde_phys->ddp_flat.ddp_refcnt; + } else { + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys->ddp_trad[p].ddp_refcnt; } return (refcnt); @@ -599,24 +739,33 @@ ddt_init(void) { ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - ddt_entry_cache = kmem_cache_create("ddt_entry_cache", - DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", 
+ DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", + DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); } void ddt_fini(void) { - kmem_cache_destroy(ddt_entry_cache); + kmem_cache_destroy(ddt_entry_trad_cache); + kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); } static ddt_entry_t * -ddt_alloc(const ddt_key_t *ddk) +ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) { ddt_entry_t *dde; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - memset(dde, 0, DDT_ENTRY_SIZE); + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_FLAT_SIZE); + } else { + dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); + memset(dde, 0, DDT_ENTRY_TRAD_SIZE); + } + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; @@ -647,7 +796,8 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde) } cv_destroy(&dde->dde_cv); - kmem_cache_free(ddt_entry_cache, dde); + kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? + ddt_entry_flat_cache : ddt_entry_trad_cache, dde); } void @@ -797,7 +947,12 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) return (NULL); /* Time to make a new entry. 
*/ - dde = ddt_alloc(&search); + dde = ddt_alloc(ddt, &search); + + /* Record the time this class was created (used by ddt prune) */ + if (ddt->ddt_flags & DDT_FLAG_FLAT) + dde->dde_phys->ddp_flat.ddp_class_start = gethrestime_sec(); + avl_insert(&ddt->ddt_tree, dde, where); /* @@ -1210,7 +1365,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) ddt_key_fill(&ddk, bp); - dde = ddt_alloc(&ddk); + dde = ddt_alloc(ddt, &ddk); ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { @@ -1226,7 +1381,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) } } - memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)); + memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); return (dde); } @@ -1269,13 +1424,26 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) ddt_repair_entry_done, rdde, rio->io_flags); for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - ddt_phys_t *rddp = &rdde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth != rddp->ddp_phys_birth || - memcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_univ_phys_t *rddp = rdde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(ddp, v); + const dva_t *dvas, *rdvas; + + if (ddt->ddt_flags & DDT_FLAG_FLAT) { + dvas = ddp->ddp_flat.ddp_dva; + rdvas = rddp->ddp_flat.ddp_dva; + } else { + dvas = ddp->ddp_trad[p].ddp_dva; + rdvas = rddp->ddp_trad[p].ddp_dva; + } + + if (phys_birth == 0 || + phys_birth != ddt_phys_birth(rddp, v) || + memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) continue; - ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, @@ -1301,7 +1469,8 @@ ddt_repair_table(ddt_t *ddt, zio_t *rio) rdde_next = 
AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); - ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, + DDT_PHYS_NONE, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); @@ -1326,9 +1495,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) for (int p = 0; p < DDT_NPHYS(ddt); p++) { ASSERT(dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL); - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0) { - ASSERT0(ddp->ddp_refcnt); + ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); + + if (ddt_phys_birth(ddp, v) == 0) { + ASSERT0(phys_refcnt); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { @@ -1336,12 +1508,12 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) * Note, we no longer create DDT-DITTO blocks, but we * don't want to leak any written by older software. 
*/ - ddt_phys_free(ddt, ddk, ddp, txg); + ddt_phys_free(ddt, ddk, ddp, v, txg); continue; } - if (ddp->ddp_refcnt == 0) - ddt_phys_free(ddt, ddk, ddp, txg); - total_refcnt += ddp->ddp_refcnt; + if (phys_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, v, txg); + total_refcnt += phys_refcnt; } if (total_refcnt > 1) @@ -1375,7 +1547,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); dsl_scan_ddt_entry(dp->dp_scan, - ddt->ddt_checksum, &ddlwe, tx); + ddt->ddt_checksum, ddt, &ddlwe, tx); } } } @@ -1540,12 +1712,10 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) } if (dde->dde_type < DDT_TYPES) { - ddt_phys_t *ddp; - ASSERT3S(dde->dde_class, <, DDT_CLASSES); int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); - ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); /* * This entry already existed (dde_type is real), so it must @@ -1557,9 +1727,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) * likely further action is required to fill out the DDT entry, * and this is a place that is likely to be missed in testing. */ - ASSERT3U(ddp->ddp_refcnt, >, 0); + ASSERT3U(ddt_phys_refcnt(dde->dde_phys, v), >, 0); - ddt_phys_addref(ddp); + ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 5449eca3afb1..6da77bbca5cb 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -43,18 +43,22 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) memset(dds, 0, sizeof (*dds)); for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; + const ddt_univ_phys_t *ddp = dde->dde_phys; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) + if (ddt_phys_birth(ddp, v) == 0) continue; - int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? 
- SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + int ndvas = ddt_phys_dva_count(ddp, v, + DDK_GET_CRYPT(&dde->dde_key)); + const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? + ddp->ddp_flat.ddp_dva : ddp->ddp_trad[p].ddp_dva; + + uint64_t dsize = 0; for (int d = 0; d < ndvas; d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + dsize += dva_get_dsize_sync(spa, &dvas[d]); + + uint64_t refcnt = ddt_phys_refcnt(ddp, v); dds->dds_blocks += 1; dds->dds_lsize += lsize; diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 8f1bbeeecd8d..4e01624f3684 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -109,7 +109,7 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) static int ddt_zap_lookup(objset_t *os, uint64_t object, - const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) + const ddt_key_t *ddk, void *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; @@ -156,7 +156,7 @@ ddt_zap_prefetch_all(objset_t *os, uint64_t object) static int ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, - const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) + const void *phys, size_t psize, dmu_tx_t *tx) { const size_t cbuf_size = psize + 1; @@ -182,7 +182,7 @@ ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, static int ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, - ddt_phys_t *phys, size_t psize) + void *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index dec0eb28dc5f..daf1bd5d637b 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -2929,7 +2929,7 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) + ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { (void) tx; const ddt_key_t *ddk = &ddlwe->ddlwe_key; @@ -2953,13 +2953,13 @@ dsl_scan_ddt_entry(dsl_scan_t 
*scn, enum zio_checksum checksum, if (scn->scn_done_txg != 0) return; - for (int p = 0; p < ddlwe->ddlwe_nphys; p++) { - ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p]; + for (int p = 0; p < DDT_NPHYS(ddt); p++) { + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(&ddlwe->ddlwe_phys, v); - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + if (phys_birth == 0 || phys_birth > scn->scn_phys.scn_max_txg) continue; - ddt_bp_create(checksum, ddk, ddp, &bp); + ddt_bp_create(checksum, ddk, &ddlwe->ddlwe_phys, v, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); @@ -3022,7 +3022,7 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); - dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &ddlwe, tx); + dsl_scan_ddt_entry(scn, ddb->ddb_checksum, ddt, &ddlwe, tx); n++; if (dsl_scan_check_suspend(scn, NULL)) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 39543becc7f6..ee3d8e296588 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3249,14 +3249,16 @@ zio_ddt_child_read_done(zio_t *zio) blkptr_t *bp = zio->io_bp; ddt_t *ddt; ddt_entry_t *dde = zio->io_private; - ddt_phys_t *ddp; zio_t *pio = zio_unique_parent(zio); mutex_enter(&pio->io_lock); ddt = ddt_select(zio->io_spa, bp); - ddp = ddt_phys_select(ddt, dde, bp); - if (zio->io_error == 0) - ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + + if (zio->io_error == 0) { + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + /* this phys variant doesn't need repair */ + ddt_phys_clear(dde->dde_phys, v); + } if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL) dde->dde_io->dde_repair_abd = zio->io_abd; @@ -3277,21 +3279,25 @@ zio_ddt_read_start(zio_t *zio) if (zio->io_child_error[ZIO_CHILD_DDT]) { ddt_t *ddt = ddt_select(zio->io_spa, bp); ddt_entry_t *dde = ddt_repair_start(ddt, bp); - ddt_phys_t *ddp_self = 
ddt_phys_select(ddt, dde, bp); + ddt_phys_variant_t v_self = ddt_phys_select(ddt, dde, bp); + ddt_univ_phys_t *ddp = dde->dde_phys; blkptr_t blk; ASSERT(zio->io_vsd == NULL); zio->io_vsd = dde; - if (ddp_self == NULL) + if (v_self == DDT_PHYS_NONE) return (zio); + /* issue I/O for the other copies */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (ddt_phys_birth(ddp, v) == 0 || v == v_self) continue; - ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, - &blk); + + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, + ddp, v, &blk); zio_nowait(zio_read(zio, zio->io_spa, &blk, abd_alloc_for_io(zio->io_size, B_TRUE), zio->io_size, zio_ddt_child_read_done, dde, @@ -3371,30 +3377,32 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) if (DDT_PHYS_IS_DITTO(ddt, p)) continue; + if (dde->dde_io == NULL) + continue; + zio_t *lio = dde->dde_io->dde_lead_zio[p]; + if (lio == NULL) + continue; - if (lio != NULL && do_raw) { + if (do_raw) return (lio->io_size != zio->io_size || abd_cmp(zio->io_abd, lio->io_abd) != 0); - } else if (lio != NULL) { - return (lio->io_orig_size != zio->io_orig_size || - abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); - } + + return (lio->io_orig_size != zio->io_orig_size || + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); } for (int p = 0; p < DDT_NPHYS(ddt); p++) { - if (DDT_PHYS_IS_DITTO(ddt, p)) - continue; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + uint64_t phys_birth = ddt_phys_birth(dde->dde_phys, v); - ddt_phys_t *ddp = &dde->dde_phys[p]; - - if (ddp->ddp_phys_birth != 0 && do_raw) { + if (phys_birth != 0 && do_raw) { blkptr_t blk = *zio->io_bp; uint64_t psize; abd_t *tmpabd; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); psize = BP_GET_PSIZE(&blk); if (psize != zio->io_size) @@ -3417,13 +3425,13 @@ 
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) abd_free(tmpabd); ddt_enter(ddt); return (error != 0); - } else if (ddp->ddp_phys_birth != 0) { + } else if (phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; blkptr_t blk = *zio->io_bp; int error; - ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + ddt_bp_fill(dde->dde_phys, v, &blk, phys_birth); if (BP_GET_LSIZE(&blk) != zio->io_orig_size) return (B_TRUE); @@ -3451,52 +3459,87 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) } static void -zio_ddt_child_write_ready(zio_t *zio) +zio_ddt_child_write_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; - zio_t *pio; - if (zio->io_error) - return; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; ddt_enter(ddt); - ASSERT(dde->dde_io->dde_lead_zio[p] == zio); + /* we're the lead, so once we're done there's no one else outstanding */ + if (dde->dde_io->dde_lead_zio[p] == zio) + dde->dde_io->dde_lead_zio[p] = NULL; - ddt_phys_fill(ddp, zio->io_bp); + ddt_univ_phys_t *orig = &dde->dde_io->dde_orig_phys; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(zio, &zl)) != NULL) - ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + if (zio->io_error != 0) { + /* + * The write failed, so we're about to abort the entire IO + * chain. We need to revert the entry back to what it was at + * the last time it was successfully extended. + */ + ddt_phys_copy(ddp, orig, v); + ddt_phys_clear(orig, v); + + ddt_exit(ddt); + return; + } + + /* + * We've successfully added new DVAs to the entry. Clear the saved + * state or, if there's still outstanding IO, remember it so we can + * revert to a known good state if that IO fails. 
+ */ + if (dde->dde_io->dde_lead_zio[p] == NULL) + ddt_phys_clear(orig, v); + else + ddt_phys_copy(orig, ddp, v); + + /* + * Add references for all dedup writes that were waiting on the + * physical one, skipping any other physical writes that are waiting. + */ + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_phys_addref(ddp, v); + } ddt_exit(ddt); } static void -zio_ddt_child_write_done(zio_t *zio) +zio_ddt_child_write_ready(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *dde = zio->io_private; + zio_link_t *zl = NULL; + ASSERT3P(zio_walk_parents(zio, &zl), !=, NULL); + int p = DDT_PHYS_FOR_COPIES(ddt, zio->io_prop.zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + + if (zio->io_error != 0) + return; ddt_enter(ddt); - ASSERT(ddp->ddp_refcnt == 0); - ASSERT(dde->dde_io->dde_lead_zio[p] == zio); - dde->dde_io->dde_lead_zio[p] = NULL; + ddt_phys_extend(dde->dde_phys, v, zio->io_bp); - if (zio->io_error == 0) { - zio_link_t *zl = NULL; - while (zio_walk_parents(zio, &zl) != NULL) - ddt_phys_addref(ddp); - } else { - ddt_phys_clear(ddp); + zio_t *pio; + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + if (!(pio->io_flags & ZIO_FLAG_DDT_CHILD)) + ddt_bp_fill(dde->dde_phys, v, pio->io_bp, zio->io_txg); } ddt_exit(ddt); @@ -3509,7 +3552,6 @@ zio_ddt_write(zio_t *zio) blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; zio_prop_t *zp = &zio->io_prop; - zio_t *cio = NULL; ddt_t *ddt = ddt_select(spa, bp); ddt_entry_t *dde; @@ -3530,9 +3572,6 @@ zio_ddt_write(zio_t *zio) return (zio); } - int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); - ddt_phys_t *ddp = &dde->dde_phys[p]; - if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { /* * If we're using a weak checksum, upgrade to a strong checksum @@ -3556,31 +3595,227 @@ zio_ddt_write(zio_t *zio) return (zio); } - 
ddt_alloc_entry_io(dde); + int p = DDT_PHYS_FOR_COPIES(ddt, zp->zp_copies); + ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); + ddt_univ_phys_t *ddp = dde->dde_phys; - if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) { - if (ddp->ddp_phys_birth != 0) - ddt_bp_fill(ddp, bp, txg); - if (dde->dde_io->dde_lead_zio[p] != NULL) - zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); - else - ddt_phys_addref(ddp); - } else if (zio->io_bp_override) { - ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); - ASSERT(BP_EQUAL(bp, zio->io_bp_override)); - ddt_phys_fill(ddp, bp); - ddt_phys_addref(ddp); + /* + * In the common cases, at this point we have a regular BP with no + * allocated DVAs, and the corresponding DDT entry for its checksum. + * Our goal is to fill the BP with enough DVAs to satisfy its copies= + * requirement. + * + * One of three things needs to happen to fulfill this: + * + * - if the DDT entry has enough DVAs to satisfy the BP, we just copy + * them out of the entry and return; + * + * - if the DDT entry has no DVAs (i.e. it's brand new), then we have to + * issue the write as normal so that DVAs can be allocated and the + * data land on disk. We then copy the DVAs into the DDT entry on + * return. + * + * - if the DDT entry has some DVAs, but too few, we have to issue the + * write, adjusted to allocate fewer copies. When it returns, we + * add the new DVAs to the DDT entry, and update the BP to have the + * full amount it originally requested. + * + * In all cases, if there's already a writing IO in flight, we need to + * defer the action until after the write is done. If our action is to + * write, we need to adjust our request for additional DVAs to match + * what will be in the DDT entry after it completes. In this way every + * IO can be guaranteed to receive enough DVAs simply by joining the + * end of the chain and letting the sequence play out. + */ + + /* + * Number of DVAs in the DDT entry.
If the BP is encrypted we ignore + * the third one as normal. + */ + int have_dvas = ddt_phys_dva_count(ddp, v, BP_IS_ENCRYPTED(bp)); + IMPLY(have_dvas == 0, ddt_phys_birth(ddp, v) == 0); + + /* Number of DVAs requested by the IO. */ + uint8_t need_dvas = zp->zp_copies; + + /* + * What we do next depends on whether or not there's IO outstanding that + * will update this entry. + */ + if (dde->dde_io == NULL || dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * No IO outstanding, so we only need to worry about ourselves. + */ + + /* + * Override BPs bring their own DVAs and their own problems. + */ + if (zio->io_bp_override) { + /* + * For a brand-new entry, all the work has been done + * for us, and we can just fill it out from the provided + * block and leave. + */ + if (have_dvas == 0) { + ASSERT(BP_GET_LOGICAL_BIRTH(bp) == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_extend(ddp, v, bp); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * If we already have this entry, then we want to treat + * it like a regular write. To do this we just wipe + * them out and proceed like a regular write. + * + * Even if there are some DVAs in the entry, we still + * have to clear them out. We can't use them to fill + * out the dedup entry, as they are all referenced + * together by a bp already on disk, and will be freed + * as a group. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + /* + * If there are enough DVAs in the entry to service our request, + * then we can just use them as-is. + */ + if (have_dvas >= need_dvas) { + ddt_bp_fill(ddp, v, bp, txg); + ddt_phys_addref(ddp, v); + ddt_exit(ddt); + return (zio); + } + + /* + * Otherwise, we have to issue IO to fill the entry up to the + * amount we need.
+ */ + need_dvas -= have_dvas; } else { - cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, - zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, - zio_ddt_child_write_done, dde, zio->io_priority, - ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + /* + * There's a write in-flight. If there are already enough DVAs on + * the entry, then either there were already enough to start + * with, or the in-flight IO is between READY and DONE, and so + * has extended the entry with new DVAs. Either way, we don't + * need to do anything, we can just slot in behind it. + */ + + if (zio->io_bp_override) { + /* + * If there's a write out, then we're soon going to + * have our own copies of this block, so clear out the + * override block and treat it as a regular dedup + * write. See comment above. + */ + BP_ZERO_DVAS(bp); + BP_SET_BIRTH(bp, 0, 0); + } + + if (have_dvas >= need_dvas) { + /* + * A minor point: there might already be enough + * committed DVAs in the entry to service our request, + * but we don't know which are completed and which are + * allocated but not yet written. In this case, should + * the IO for the new DVAs fail, we will be on the end + * of the IO chain and will also receive an error, even + * though our request could have been serviced. + * + * This is an extremely rare case, as it requires the + * original block to be copied with a request for a + * larger number of DVAs, then copied again requesting + * the same (or already fulfilled) number of DVAs while + * the first request is active, and then that first + * request errors. In return, the logic required to + * catch and handle it is complex. For now, I'm just + * not going to bother with it. + */ + + /* + * We always fill the bp here as we may have arrived + * after the in-flight write has passed READY, and so + * missed out.
+ */ + ddt_bp_fill(ddp, v, bp, txg); + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } + + /* + * There's not enough in the entry yet, so we need to look at + * the write in-flight and see how many DVAs it will have once + * it completes. + * + * The in-flight write has potentially had its copies request + * reduced (if we're filling out an existing entry), so we need + * to reach in and get the original write to find out what it is + * expecting. + * + * Note that the parent of the lead zio will always have the + * highest zp_copies of any zio in the chain, because ones that + * can be serviced without additional IO are always added to + * the back of the chain. + */ + zio_link_t *zl = NULL; + zio_t *pio = + zio_walk_parents(dde->dde_io->dde_lead_zio[p], &zl); + ASSERT(pio); + uint8_t parent_dvas = pio->io_prop.zp_copies; + + if (parent_dvas >= need_dvas) { + zio_add_child(zio, dde->dde_io->dde_lead_zio[p]); + ddt_exit(ddt); + return (zio); + } - zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); - dde->dde_io->dde_lead_zio[p] = cio; + /* + * Still not enough, so we will need to issue IO to get the + * shortfall. + */ + need_dvas -= parent_dvas; } + /* + * We need to write. We will create a new write with the copies + * property adjusted to match the number of DVAs we need to + * grow the DDT entry by to satisfy the request. + */ + zio_prop_t czp = *zp; + czp.zp_copies = need_dvas; + zio_t *cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, &czp, + zio_ddt_child_write_ready, NULL, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); + + /* + * We are the new lead zio, because our parent has the highest + * zp_copies that has been requested for this entry so far.
+ */ + ddt_alloc_entry_io(dde); + if (dde->dde_io->dde_lead_zio[p] == NULL) { + /* + * First time out, take a copy of the stable entry to revert + * to if there's an error (see zio_ddt_child_write_done()) + */ + ddt_phys_copy(&dde->dde_io->dde_orig_phys, dde->dde_phys, v); + } else { + /* + * Make the existing chain our child, because it cannot + * complete until we have. + */ + zio_add_child(cio, dde->dde_io->dde_lead_zio[p]); + } + dde->dde_io->dde_lead_zio[p] = cio; + ddt_exit(ddt); zio_nowait(cio); @@ -3596,8 +3831,7 @@ zio_ddt_free(zio_t *zio) spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); - ddt_entry_t *dde; - ddt_phys_t *ddp; + ddt_entry_t *dde = NULL; ASSERT(BP_GET_DEDUP(bp)); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); @@ -3605,9 +3839,9 @@ zio_ddt_free(zio_t *zio) ddt_enter(ddt); freedde = dde = ddt_lookup(ddt, bp, B_TRUE); if (dde) { - ddp = ddt_phys_select(ddt, dde, bp); - if (ddp) - ddt_phys_decref(ddp); + ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); + if (v != DDT_PHYS_NONE) + ddt_phys_decref(dde->dde_phys, v); } ddt_exit(ddt);