Skip to content

Commit

Permalink
ddt: add "flat phys" feature
Browse files Browse the repository at this point in the history
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, each value of the copies= parameter). Each of
these is tracked independently of the others, and has its own set of
DVAs. This leads to an (admittedly rare) situation where you can create
as many as six copies of the data, by changing the copies= parameter
between copying. This is a waste both of storage on disk and of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.

This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.

Because a single phys is no longer all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.

Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.

Signed-off-by: Rob Norris <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
  • Loading branch information
robn and don-brady committed Aug 11, 2024
1 parent dd33813 commit 3852876
Show file tree
Hide file tree
Showing 10 changed files with 733 additions and 234 deletions.
68 changes: 43 additions & 25 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1922,14 +1922,16 @@ dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
blkptr_t blk;
int p;

for (p = 0; p < ddlwe->ddlwe_nphys; p++) {
const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p];
if (ddp->ddp_phys_birth == 0)
for (p = 0; p < DDT_NPHYS(ddt); p++) {
const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

if (ddt_phys_birth(ddp, v) == 0)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("index %llx refcnt %llu phys %d %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
(u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
p, blkbuf);
}
}
Expand Down Expand Up @@ -3311,8 +3313,7 @@ zdb_ddt_cleanup(spa_t *spa)
ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next;
while (dde) {
next = AVL_NEXT(&ddt->ddt_tree, dde);
memset(&dde->dde_lead_zio, 0,
sizeof (dde->dde_lead_zio));
dde->dde_io = NULL;
ddt_remove(ddt, dde);
dde = next;
}
Expand Down Expand Up @@ -5686,6 +5687,7 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,

spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER);

blkptr_t tempbp;
if (BP_GET_DEDUP(bp)) {
/*
* Dedup'd blocks are special. We need to count them, so we can
Expand Down Expand Up @@ -5721,35 +5723,51 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
VERIFY3P(dde, !=, NULL);

/* Get the phys for this variant */
ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp);
VERIFY3P(ddp, !=, NULL);
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);

/*
* This entry may have multiple sets of DVAs. We must claim
* each set the first time we see them in a real block on disk,
* or count them on subsequent occurrences. We don't have a
* convenient way to track the first time we see each variant,
* so we repurpose dde_lead_zio[] as a per-phys "seen" flag. We
* can do this safely in zdb because it never writes, so it
* will never have a writing zio for this block in that
* pointer.
*/

/*
* Work out which dde_phys index was used, get the seen flag,
* and update it if necessary.
* so we repurpose dde_io as a set of "seen" flag bits. We can
* do this safely in zdb because it never writes, so it will
* never have a writing zio for this block in that pointer.
*/
uint_t idx =
((uint_t)((uintptr_t)ddp - (uintptr_t)dde->dde_phys)) /
sizeof (ddt_phys_t);
VERIFY3P(ddp, ==, &dde->dde_phys[idx]);
boolean_t seen = (boolean_t)(uintptr_t)dde->dde_lead_zio[idx];
boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v));
if (!seen)
dde->dde_lead_zio[idx] = (zio_t *)(uintptr_t)B_TRUE;
dde->dde_io =
(void *)(((uintptr_t)dde->dde_io) | (1 << v));

/* Consume a reference for this block. */
VERIFY3U(ddt_phys_total_refcnt(ddt, dde), >, 0);
ddt_phys_decref(ddp);
ddt_phys_decref(dde->dde_phys, v);

/*
* If this entry has a single flat phys, it may have been
* extended with additional DVAs at some time in its life.
* This block might be from before it was fully extended, and
* so have fewer DVAs.
*
* If this is the first time we've seen this block, and we
* claimed it as-is, then we would miss the claim on some
* number of DVAs, which would then be seen as leaked.
*
* In all cases, if we've had fewer DVAs, then the asize would
* be too small, and would lead to the pool apparently using
* more space than allocated.
*
* To handle this, we copy the canonical set of DVAs from the
* entry back to the block pointer before we claim it.
*/
if (v == DDT_PHYS_FLAT) {
ASSERT3U(BP_GET_BIRTH(bp), ==,
ddt_phys_birth(dde->dde_phys, v));
tempbp = *bp;
ddt_bp_fill(dde->dde_phys, v, &tempbp,
BP_GET_BIRTH(bp));
bp = &tempbp;
}

if (seen) {
/*
Expand Down
122 changes: 96 additions & 26 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ struct abd;
/*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/
/* No flags yet. */
#define DDT_FLAG_MASK (0)
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT)

/*
* DDT on-disk storage object types. Each one corresponds to specific
Expand Down Expand Up @@ -126,21 +126,80 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count.
*
* Note that an entry has an array of four ddt_phys_t, one for each number of
* DVAs (copies= property) and another for additional "ditto" copies. Most
* users of ddt_phys_t will handle indexing into or counting the phys they
* want.
* The "traditional" entry has an array of four, one for each number of DVAs
* (copies= property) and another for additional "ditto" copies. Users of the
* traditional struct will specify the variant (index) of the one they want.
*
* The newer "flat" entry has only a single form that is specified using the
* DDT_PHYS_FLAT variant.
*
* Since the value size varies, use one of the size macros when interfacing
* with the ddt zap.
*/
typedef struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddt_phys_t;

#define DDT_PHYS_MAX (4)
#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX)
#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0)
#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p))
#define DDT_PHYS_MAX (4)

/*
 * Universal phys payload: a union of the traditional layout (an array of
 * DDT_PHYS_MAX phys slots) and the flat layout (a single phys).
 *
 * Note - this can be used in a flexible array and allocated for
 * a specific size (ddp_trad or ddp_flat). So be careful not to
 * copy using "=" assignment but instead use ddt_phys_copy().
 * A plain "=" copies sizeof (ddt_univ_phys_t) bytes, which is the size
 * of the larger member and may exceed what was actually allocated.
 */
typedef union {
/*
 * Traditional physical payload value for DDT zap (256 bytes)
 * One slot per traditional ddt_phys_variant_t index.
 */
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddp_trad[DDT_PHYS_MAX];

/*
 * Flat physical payload value for DDT zap (72 bytes)
 * A single phys, addressed with the DDT_PHYS_FLAT variant.
 */
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth; /* txg based from BP */
uint64_t ddp_class_start; /* in realtime seconds */
} ddp_flat;
} ddt_univ_phys_t;

/*
* This enum denotes which variant of a ddt_univ_phys_t to target. For
* a traditional DDT entry, it represents the indexes into the ddp_trad
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
* is being targeted.
*
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks.
*/

typedef enum {
DDT_PHYS_DITTO = 0,
DDT_PHYS_SINGLE = 1,
DDT_PHYS_DOUBLE = 2,
DDT_PHYS_TRIPLE = 3,
DDT_PHYS_FLAT = 4,
DDT_PHYS_NONE = 5
} ddt_phys_variant_t;

/*
 * Map phys index `p` to the variant to use with `ddt`: DDT_PHYS_FLAT when
 * the table has DDT_FLAG_FLAT set, otherwise `p` itself. Asserts that `p`
 * is below DDT_PHYS_NONE. `p` appears twice in the expansion (once in the
 * assertion, once in the result), so do not pass an argument with side
 * effects.
 */
#define DDT_PHYS_VARIANT(ddt, p) \
(ASSERT((p) < DDT_PHYS_NONE), \
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))

/*
 * Byte sizes of the two DDT zap payload layouts, taken from the
 * corresponding members of ddt_univ_phys_t.
 */
#define DDT_TRAD_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_trad)
#define DDT_FLAT_PHYS_SIZE sizeof (((ddt_univ_phys_t *)0)->ddp_flat)

/*
 * Yield `flat` when the table has DDT_FLAG_FLAT set in ddt_flags, and
 * `trad` otherwise. The per-table phys macros below are built on this.
 */
#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
	(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

/* Size in bytes of the phys payload used by this table. */
#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
	DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)

/* Number of phys slots per entry: 1 for flat, DDT_PHYS_MAX for traditional. */
#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)

/* Phys index for a copies count `p`: always 0 for flat, `p` for traditional. */
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, p)

/*
 * True only for slot 0 of a traditional table; a flat table has no ditto
 * phys. The argument is parenthesized so that an expression argument such
 * as `x & 1` is compared as a whole rather than being re-associated with
 * the `== 0`.
 */
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, ((p) == 0))

/*
* A "live" entry, holding changes to an entry made this txg, and other data to
Expand All @@ -159,6 +218,9 @@ typedef struct {
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;

/* original phys contents before update, for error handling */
ddt_univ_phys_t dde_orig_phys;

/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;
Expand All @@ -178,7 +240,7 @@ typedef struct {

ddt_entry_io_t *dde_io; /* IO support, when required */

ddt_phys_t dde_phys[]; /* physical data */
ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
} ddt_entry_t;

/*
Expand All @@ -189,8 +251,7 @@ typedef struct {
ddt_key_t ddlwe_key;
ddt_type_t ddlwe_type;
ddt_class_t ddlwe_class;
uint8_t ddlwe_nphys;
ddt_phys_t ddlwe_phys[DDT_PHYS_MAX];
ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;

/*
Expand Down Expand Up @@ -236,17 +297,26 @@ typedef struct {
uint64_t ddb_cursor;
} ddt_bookmark_t;

extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
uint64_t txg);
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);
const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);

extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde,
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);

extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
Expand Down
20 changes: 9 additions & 11 deletions include/sys/ddt_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,12 @@ extern "C" {
#define DDT_DIR_FLAGS "flags"

/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
(ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \
for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \
(ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)

/*
Expand All @@ -61,19 +59,19 @@ typedef struct {
boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
const ddt_key_t *ddk, const void *phys, size_t psize,
dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;

Expand Down
2 changes: 1 addition & 1 deletion include/sys/dsl_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
Expand Down
7 changes: 6 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -572,14 +572,19 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp))

#define BP_ZERO(bp) \
#define BP_ZERO_DVAS(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
(bp)->blk_dva[1].dva_word[0] = 0; \
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
}

#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
Expand Down
Loading

0 comments on commit 3852876

Please sign in to comment.