Skip to content

Commit

Permalink
ddt: add "flat phys" feature
Browse files Browse the repository at this point in the history
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, each copies= value). Each of these is tracked
independently of each other, and have their own set of DVAs. This leads
to an (admittedly rare) situation where you can create as many as six
copies of the data, by changing the copies= parameter between copying.
This is both a waste of storage on disk and a waste of space in
the stored DDT entries, since there never needs to be more than three
DVAs to handle all possible values of copies=.

This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.

Because a single phys is no longer all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.

Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.

Signed-off-by: Rob Norris <[email protected]>
Co-authored-by: Don Brady <[email protected]>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
  • Loading branch information
robn and don-brady committed Jun 24, 2024
1 parent ab38754 commit 2540fe6
Show file tree
Hide file tree
Showing 10 changed files with 709 additions and 224 deletions.
34 changes: 19 additions & 15 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1900,14 +1900,16 @@ dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe,
blkptr_t blk;
int p;

for (p = 0; p < ddlwe->ddlwe_nphys; p++) {
const ddt_phys_t *ddp = &ddlwe->ddlwe_phys[p];
if (ddp->ddp_phys_birth == 0)
for (p = 0; p < DDT_NPHYS(ddt); p++) {
const ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

if (ddt_phys_birth(ddp, v) == 0)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk);
snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk);
(void) printf("index %llx refcnt %llu phys %d %s\n",
(u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
(u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v),
p, blkbuf);
}
}
Expand Down Expand Up @@ -5771,9 +5773,8 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
if (dde == NULL) {
refcnt = 0;
} else {
ddt_phys_t *ddp = ddt_phys_select(ddt, dde, bp);
ddt_phys_decref(ddp);
refcnt = ddp->ddp_refcnt;
ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp);
refcnt = ddt_phys_decref(dde->dde_phys, v);
if (ddt_phys_total_refcnt(ddt, dde) == 0)
ddt_remove(ddt, dde);
}
Expand Down Expand Up @@ -6112,18 +6113,21 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
VERIFY(ddt);

uint64_t refcnt = 0;
for (int p = 0; p < ddlwe.ddlwe_nphys; p++) {
ddt_phys_t *ddp = &ddlwe.ddlwe_phys[p];
if (ddp->ddp_phys_birth == 0)
for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ddt_univ_phys_t *ddp = &ddlwe.ddlwe_phys;
ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p);

if (ddt_phys_birth(ddp, v) == 0)
continue;
refcnt += ddp->ddp_refcnt;
refcnt += ddt_phys_refcnt(ddp, v);

ddt_bp_create(ddb.ddb_checksum,
&ddlwe.ddlwe_key, ddp, &blk);
&ddlwe.ddlwe_key, ddp, v, &blk);
if (DDT_PHYS_IS_DITTO(ddt, p)) {
zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
} else {
zcb->zcb_dedup_asize +=
BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
zcb->zcb_dedup_asize += BP_GET_ASIZE(&blk) *
(ddt_phys_refcnt(ddp, v) - 1);
zcb->zcb_dedup_blocks++;
}
}
Expand Down
122 changes: 96 additions & 26 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ struct abd;
/*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/
/* No flags yet. */
#define DDT_FLAG_MASK (0)
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT)

/*
* DDT on-disk storage object types. Each one corresponds to specific
Expand Down Expand Up @@ -126,21 +126,80 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count.
*
* Note that an entry has an array of four ddt_phys_t, one for each number of
* DVAs (copies= property) and another for additional "ditto" copies. Most
* users of ddt_phys_t will handle indexing into or counting the phys they
* want.
* The "traditional" entry has an array of four, one for each number of DVAs
* (copies= property) and another for additional "ditto" copies. Users of the
* traditional struct will specify the variant (index) of the one they want.
*
* The newer "flat" entry has only a single form that is specified using the
* DDT_PHYS_FLAT variant.
*
* Since the value size varies, use one of the size macros when interfacing
* with the ddt zap.
*/
typedef struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddt_phys_t;

#define DDT_PHYS_MAX (4)
#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX)
#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0)
#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p))
#define DDT_PHYS_MAX (4)

/*
* Note - this can be used in a flexible array and allocated for
* a specific size (ddp_trad or ddp_flat). So be careful not to
* copy using "=" assignment but instead use ddt_phys_copy().
*/
typedef union {
/*
* Traditional physical payload value for DDT zap (256 bytes)
*
* An array of DDT_PHYS_MAX phys; the ddt_phys_variant_t passed by the
* caller is the index of the element to operate on.
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP]; /* location(s) of the block on disk */
uint64_t ddp_refcnt; /* reference count */
uint64_t ddp_phys_birth; /* birth txg */
} ddp_trad[DDT_PHYS_MAX];

/*
* Flat physical payload value for DDT zap (72 bytes)
*
* A single phys, targeted with the DDT_PHYS_FLAT variant. Same leading
* fields as a traditional phys, plus ddp_class_start.
*/
struct {
dva_t ddp_dva[SPA_DVAS_PER_BP]; /* location(s) of the block on disk */
uint64_t ddp_refcnt; /* reference count */
uint64_t ddp_phys_birth; /* txg based from BP */
uint64_t ddp_class_start; /* in realtime seconds */
} ddp_flat;
} ddt_univ_phys_t;

/*
* This enum denotes which variant of a ddt_univ_phys_t to target. For
* a traditional DDT entry, it represents the indexes into the ddp_trad
* array. Any consumer of a ddt_univ_phys_t needs to know which variant
* is being targeted.
*
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks.
*/

typedef enum {
DDT_PHYS_DITTO = 0,
DDT_PHYS_SINGLE = 1,
DDT_PHYS_DOUBLE = 2,
DDT_PHYS_TRIPLE = 3,
DDT_PHYS_FLAT = 4,
DDT_PHYS_NONE = 5
} ddt_phys_variant_t;

/*
* Map phys index `p` of table `ddt` to the variant to target: DDT_PHYS_FLAT
* whenever the table has DDT_FLAG_FLAT set, otherwise `p` itself. Asserts
* that `p` is below DDT_PHYS_NONE. `p` appears twice in the expansion (once
* inside the ASSERT), so do not pass an expression with side effects.
*/
#define DDT_PHYS_VARIANT(ddt, p) \
(ASSERT((p) < DDT_PHYS_NONE), \
((ddt)->ddt_flags & DDT_FLAG_FLAT ? DDT_PHYS_FLAT : (p)))

/*
 * Sizes, in bytes, of the two payload layouts held by ddt_univ_phys_t.
 * Wrapped in parentheses so they compose safely in any expression.
 */
#define	DDT_TRAD_PHYS_SIZE	(sizeof (((ddt_univ_phys_t *)0)->ddp_trad))
#define	DDT_FLAT_PHYS_SIZE	(sizeof (((ddt_univ_phys_t *)0)->ddp_flat))

/*
 * Evaluate to `flat` when the table has DDT_FLAG_FLAT set, and to `trad`
 * otherwise. Both alternatives are parenthesised here, but callers must
 * still parenthesise macro parameters inside any sub-expression they build.
 */
#define	_DDT_PHYS_SWITCH(ddt, flat, trad)	\
	(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

/* Size of the phys payload used by this table. */
#define	DDT_PHYS_SIZE(ddt)	_DDT_PHYS_SWITCH(ddt, \
	DDT_FLAT_PHYS_SIZE, DDT_TRAD_PHYS_SIZE)

/* Number of phys per entry: 1 for flat, DDT_PHYS_MAX for traditional. */
#define	DDT_NPHYS(ddt)		_DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)

/* Phys index for a given value `p`: always 0 for flat, `p` for traditional. */
#define	DDT_PHYS_FOR_COPIES(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, (p))

/*
 * True only for index 0 of a traditional table; flat tables have no ditto
 * phys. `p` is parenthesised so that an argument such as `x & 1` is compared
 * as a whole rather than being parsed as `x & (1 == 0)`.
 */
#define	DDT_PHYS_IS_DITTO(ddt, p)	_DDT_PHYS_SWITCH(ddt, 0, ((p) == 0))

/*
* A "live" entry, holding changes to an entry made this txg, and other data to
Expand All @@ -159,6 +218,9 @@ typedef struct {
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;

/* original phys contents before update, for error handling */
ddt_univ_phys_t dde_orig_phys;

/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;
Expand All @@ -178,7 +240,7 @@ typedef struct {

ddt_entry_io_t *dde_io; /* IO support, when required */

ddt_phys_t dde_phys[]; /* physical data */
ddt_univ_phys_t dde_phys[]; /* flexible -- allocated size varies */
} ddt_entry_t;

/*
Expand All @@ -189,8 +251,7 @@ typedef struct {
ddt_key_t ddlwe_key;
ddt_type_t ddlwe_type;
ddt_class_t ddlwe_class;
uint8_t ddlwe_nphys;
ddt_phys_t ddlwe_phys[DDT_PHYS_MAX];
ddt_univ_phys_t ddlwe_phys;
} ddt_lightweight_entry_t;

/*
Expand Down Expand Up @@ -236,17 +297,26 @@ typedef struct {
uint64_t ddb_cursor;
} ddt_bookmark_t;

extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
uint64_t txg);
extern void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
blkptr_t *bp, uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);
const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp);

extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern ddt_phys_t *ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde,
extern void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
const blkptr_t *bp);
extern void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src,
ddt_phys_variant_t v);
extern void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v);
extern uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt,
const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp,
ddt_phys_variant_t v);
extern int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v,
boolean_t encrypted);

extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
Expand Down
20 changes: 9 additions & 11 deletions include/sys/ddt_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,12 @@ extern "C" {
#define DDT_DIR_FLAGS "flags"

/* Fill a lightweight entry from a live entry. */
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
(ddlwe)->ddlwe_nphys = DDT_NPHYS(ddt); \
for (int p = 0; p < (ddlwe)->ddlwe_nphys; p++) \
(ddlwe)->ddlwe_phys[p] = (dde)->dde_phys[p]; \
#define DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, ddlwe) do { \
memset((ddlwe), 0, sizeof (*ddlwe)); \
(ddlwe)->ddlwe_key = (dde)->dde_key; \
(ddlwe)->ddlwe_type = (dde)->dde_type; \
(ddlwe)->ddlwe_class = (dde)->dde_class; \
memcpy(&(ddlwe)->ddlwe_phys, (dde)->dde_phys, DDT_PHYS_SIZE(ddt)); \
} while (0)

/*
Expand All @@ -61,19 +59,19 @@ typedef struct {
boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
const ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_contains)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
const ddt_key_t *ddk);
void (*ddt_op_prefetch_all)(objset_t *os, uint64_t object);
int (*ddt_op_update)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize,
const ddt_key_t *ddk, const void *phys, size_t psize,
dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object,
const ddt_key_t *ddk, dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk,
ddt_key_t *ddk, ddt_phys_t *phys, size_t psize);
ddt_key_t *ddk, void *phys, size_t psize);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;

Expand Down
2 changes: 1 addition & 1 deletion include/sys/dsl_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx);
void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
Expand Down
7 changes: 6 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,14 +573,19 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp))

#define BP_ZERO(bp) \
#define BP_ZERO_DVAS(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
(bp)->blk_dva[1].dva_word[0] = 0; \
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
}

#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
Expand Down
Loading

0 comments on commit 2540fe6

Please sign in to comment.