Skip to content

Commit

Permalink
ddt: add "flat phys" feature
Browse files Browse the repository at this point in the history
Traditional dedup keeps a separate ddt_phys_t "type" for each possible
count of DVAs (that is, each value of the copies= parameter). Each of
these is tracked independently of the others, and has its own set of
DVAs. This leads to an (admittedly rare) situation where you can create
as many as six copies of the data, by changing the copies= parameter
between copies. This is both a waste of storage on disk and a waste of
space in the stored DDT entries, since there never needs to be more than
three DVAs to handle all possible values of copies=.

This commit adds a new FDT feature, DDT_FLAG_FLAT. When active, only the
first ddt_phys_t is used. Each time a block is written with the dedup
bit set, this single phys is checked to see if it has enough DVAs to
fulfill the request. If it does, the block is filled with the saved DVAs
as normal. If not, an adjusted write is issued to create as many extra
copies as are needed to fulfill the request, which are then saved into
the entry too.

Because a single phys is no longer all-or-nothing, but can be
transitioning from fewer to more DVAs, the write path now has to keep a
copy of the previous "known good" DVA set so we can revert to it in case
an error occurs. zio_ddt_write() has been restructured and heavily
commented to make it much easier to see what's happening.

Backwards compatibility is maintained simply by allocating four
ddt_phys_t when the DDT_FLAG_FLAT flag is not set, and updating the phys
selection macros to check the flag. In the old arrangement, each number
of copies gets a whole phys, so it will always have either zero or all
necessary DVAs filled, with no in-between, so the old behaviour
naturally falls out of the new code.

Signed-off-by: Rob Norris <[email protected]>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
  • Loading branch information
robn committed May 15, 2024
1 parent 640bf5b commit c8bdcc3
Show file tree
Hide file tree
Showing 4 changed files with 397 additions and 88 deletions.
41 changes: 32 additions & 9 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ struct abd;
/*
* DDT-wide feature flags. These are set in ddt_flags by ddt_configure().
*/
/* No flags yet. */
#define DDT_FLAG_MASK (0)
#define DDT_FLAG_FLAT (1 << 0) /* single extensible phys */
#define DDT_FLAG_MASK (DDT_FLAG_FLAT)

/*
* DDT on-disk storage object types. Each one corresponds to specific
Expand Down Expand Up @@ -126,21 +126,41 @@ typedef struct {
* characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count.
*
* Note that an entry has an array of four ddt_phys_t, one for each number of
* DVAs (copies= property) and another for additional "ditto" copies. Most
* The "traditional" entry has an array of four ddt_phys_t, one for each number
* of DVAs (copies= property) and another for additional "ditto" copies. Most
* users of ddt_phys_t will handle indexing into or counting the phys they
* want.
*
* The newer "flat" entry has only a single ddt_phys_t, but is represented as
* an array so it can be used by indexing into it, making it easier to support
* both traditional and flat entries in the same code.
*/

#define DDT_PHYS_MAX (4)

typedef struct {
/* on-disk locations (DVAs) of the stored block */
dva_t ddp_dva[SPA_DVAS_PER_BP];
/* reference count for the stored block */
uint64_t ddp_refcnt;
/* birth txg of the stored block; ddt_phys_extend() treats 0 as unset */
uint64_t ddp_phys_birth;
} ddt_phys_t;

#define DDT_PHYS_MAX (4)
#define DDT_NPHYS(ddt) ((ddt) ? DDT_PHYS_MAX : DDT_PHYS_MAX)
#define DDT_PHYS_IS_DITTO(ddt, p) ((ddt) && p == 0)
#define DDT_PHYS_FOR_COPIES(ddt, p) ((ddt) ? (p) : (p))
/*
 * Phys storage for a "flat" entry: a single ddt_phys_t, declared as a
 * one-element array so it can be indexed the same way as the traditional
 * form.
 */
typedef struct {
ddt_phys_t ddpf_phys[1];
} ddt_phys_flat_t;

/*
 * Phys storage for a "traditional" entry: DDT_PHYS_MAX separate ddt_phys_t
 * slots.
 */
typedef struct {
ddt_phys_t ddpt_phys[DDT_PHYS_MAX];
} ddt_phys_trad_t;

/*
 * Select between the "flat" and "traditional" variant of a value, depending
 * on whether DDT_FLAG_FLAT is set in the given ddt's ddt_flags. Both
 * alternatives are parenthesized here, so callers may pass any expression.
 */
#define _DDT_PHYS_SWITCH(ddt, flat, trad) \
	(((ddt)->ddt_flags & DDT_FLAG_FLAT) ? (flat) : (trad))

/* Size in bytes of the phys area belonging to an entry of this ddt. */
#define DDT_PHYS_SIZE(ddt) _DDT_PHYS_SWITCH(ddt, \
	sizeof (ddt_phys_flat_t), sizeof (ddt_phys_trad_t))

/* Number of ddt_phys_t slots in an entry: 1 when flat, else DDT_PHYS_MAX. */
#define DDT_NPHYS(ddt) _DDT_PHYS_SWITCH(ddt, 1, DDT_PHYS_MAX)

/* Slot index to use for copies count p: always 0 when flat, else p itself. */
#define DDT_PHYS_FOR_COPIES(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, (p))

/*
 * Whether slot index p is the "ditto" slot: never when flat, slot 0 otherwise.
 * The argument is parenthesized so that an expression with lower precedence
 * than == (for example "a & b") is compared as a whole.
 */
#define DDT_PHYS_IS_DITTO(ddt, p) _DDT_PHYS_SWITCH(ddt, 0, ((p) == 0))

/*
* A "live" entry, holding changes to an entry made this txg, and other data to
Expand All @@ -159,6 +179,9 @@ typedef struct {
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;

/* original phys contents before update, for error handling */
ddt_phys_t dde_orig_phys[DDT_PHYS_MAX];

/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;
Expand Down Expand Up @@ -241,7 +264,7 @@ extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);

extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_extend(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
Expand Down
7 changes: 6 additions & 1 deletion include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,14 +573,19 @@ typedef struct blkptr {
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
BP_GET_PSIZE(bp))

#define BP_ZERO(bp) \
#define BP_ZERO_DVAS(bp) \
{ \
(bp)->blk_dva[0].dva_word[0] = 0; \
(bp)->blk_dva[0].dva_word[1] = 0; \
(bp)->blk_dva[1].dva_word[0] = 0; \
(bp)->blk_dva[1].dva_word[1] = 0; \
(bp)->blk_dva[2].dva_word[0] = 0; \
(bp)->blk_dva[2].dva_word[1] = 0; \
}

#define BP_ZERO(bp) \
{ \
BP_ZERO_DVAS(bp); \
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
Expand Down
107 changes: 79 additions & 28 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,19 @@
* fill the BP with the DVAs from the entry, increment the refcount and cause
* the write IO to return immediately.
*
* Each ddt_phys_t slot in the entry represents a separate dedup block for the
* same content/checksum. The slot is selected based on the zp_copies parameter
* the block is written with, that is, the number of DVAs in the block. The
* "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto"
* feature. These are no longer written, and will be freed if encountered on
* old pools.
* Traditionally, each ddt_phys_t slot in the entry represents a separate dedup
* block for the same content/checksum. The slot is selected based on the
* zp_copies parameter the block is written with, that is, the number of DVAs
* in the block. The "ditto" slot (DDT_PHYS_DITTO) was used by the
* now-removed "dedupditto" feature. These are no longer written, and will be
* freed if encountered on old pools.
*
* If the "fast_dedup" feature is enabled, new dedup tables will be created
* with the "flat phys" option. In this mode, there is only one ddt_phys_t
* slot. If a write is issued for an entry that exists, but has fewer DVAs
* than the write requires, then only as many new DVAs as are needed to make
* up the shortfall are allocated and written. The existing entry is then
* extended (ddt_phys_extend()) with the new DVAs.
*
* ## Lifetime of an entry
*
Expand Down Expand Up @@ -130,6 +137,12 @@
* from the alternate block. If the block is actually damaged, this will invoke
* the pool's "self-healing" mechanism, and repair the block.
*
* If the "fast_dedup" feature is enabled, the "flat phys" option will be in
* use, so there is only ever one ddt_phys_t slot. Repair will not occur in
* this case, as there are no other equivalent blocks to fall back on. Note
* that this does not affect the regular OpenZFS scrub and self-healing
* mechanisms.
*
* ## Scanning (scrub/resilver)
*
* If dedup is active, the scrub machinery will walk the dedup table first, and
Expand Down Expand Up @@ -162,10 +175,17 @@
c == ZIO_CHECKSUM_BLAKE3)

static kmem_cache_t *ddt_cache;
static kmem_cache_t *ddt_entry_cache;

#define DDT_ENTRY_SIZE \
(sizeof (ddt_entry_t) + sizeof (ddt_phys_t) * DDT_PHYS_MAX)
/*
 * Entry allocations come from one of two caches, one per phys layout, since
 * flat and traditional entries have different sizes. ddt_init() creates both;
 * ddt_alloc() picks one according to DDT_FLAG_FLAT.
 */
static kmem_cache_t *ddt_entry_flat_cache;
static kmem_cache_t *ddt_entry_trad_cache;

/* Allocation size of an entry: the ddt_entry_t plus its phys area. */
#define DDT_ENTRY_FLAT_SIZE \
(sizeof (ddt_entry_t) + sizeof (ddt_phys_flat_t))
#define DDT_ENTRY_TRAD_SIZE \
(sizeof (ddt_entry_t) + sizeof (ddt_phys_trad_t))

/* Allocation size of an entry for the given ddt, chosen by its flags. */
#define DDT_ENTRY_SIZE(ddt) \
_DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE)

/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
Expand Down Expand Up @@ -195,7 +215,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = {
*/
static const uint64_t ddt_version_flags[] = {
[DDT_VERSION_LEGACY] = 0,
[DDT_VERSION_FDT] = 0,
[DDT_VERSION_FDT] = DDT_FLAG_FLAT,
};

/* Dummy version to signal that configure is still necessary */
Expand Down Expand Up @@ -346,7 +366,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,

return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key,
dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)));
dde->dde_phys, DDT_PHYS_SIZE(ddt)));
}

static int
Expand Down Expand Up @@ -388,8 +408,8 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,
ASSERT(ddt_object_exists(ddt, type, class));

return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx));
ddt->ddt_object[type][class], &dde->dde_key,
dde->dde_phys, DDT_PHYS_SIZE(ddt), tx));
}

static int
Expand All @@ -410,7 +430,7 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class,

int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key,
ddlwe->ddlwe_phys, sizeof (ddlwe->ddlwe_phys));
ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt));
if (error == 0) {
ddlwe->ddlwe_type = type;
ddlwe->ddlwe_class = class;
Expand Down Expand Up @@ -502,13 +522,34 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
}

void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
ddt_phys_extend(ddt_phys_t *ddp, const blkptr_t *bp)
{
ASSERT0(ddp->ddp_phys_birth);
int bp_ndvas = BP_GET_NDVAS(bp);
int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ?
SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP;

int s = 0, d = 0;
while (s < bp_ndvas && d < ddp_max_dvas) {
if (DVA_IS_VALID(&ddp->ddp_dva[d])) {
d++;
continue;
}
ddp->ddp_dva[d] = bp->blk_dva[s];
s++; d++;
}

for (int d = 0; d < SPA_DVAS_PER_BP; d++)
ddp->ddp_dva[d] = bp->blk_dva[d];
ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
/*
* If the caller offered us more DVAs than we can fit, something has
* gone wrong in their accounting. zio_ddt_write() should never ask for
* more than we need.
*/
ASSERT3U(s, ==, bp_ndvas);

if (BP_IS_ENCRYPTED(bp))
ddp->ddp_dva[2] = bp->blk_dva[2];

if (ddp->ddp_phys_birth == 0)
ddp->ddp_phys_birth = BP_GET_BIRTH(bp);
}

void
Expand Down Expand Up @@ -599,24 +640,33 @@ ddt_init(void)
{
ddt_cache = kmem_cache_create("ddt_cache",
sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache",
DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache",
DDT_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
ddt_fini(void)
{
kmem_cache_destroy(ddt_entry_cache);
kmem_cache_destroy(ddt_entry_trad_cache);
kmem_cache_destroy(ddt_entry_flat_cache);
kmem_cache_destroy(ddt_cache);
}

static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk)
ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk)
{
ddt_entry_t *dde;

dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
memset(dde, 0, DDT_ENTRY_SIZE);
if (ddt->ddt_flags & DDT_FLAG_FLAT) {
dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP);
memset(dde, 0, DDT_ENTRY_FLAT_SIZE);
} else {
dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP);
memset(dde, 0, DDT_ENTRY_TRAD_SIZE);
}

cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);

dde->dde_key = *ddk;
Expand Down Expand Up @@ -647,7 +697,8 @@ ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
}

cv_destroy(&dde->dde_cv);
kmem_cache_free(ddt_entry_cache, dde);
kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ?
ddt_entry_flat_cache : ddt_entry_trad_cache, dde);
}

void
Expand Down Expand Up @@ -797,7 +848,7 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
return (NULL);

/* Time to make a new entry. */
dde = ddt_alloc(&search);
dde = ddt_alloc(ddt, &search);
avl_insert(&ddt->ddt_tree, dde, where);

/*
Expand Down Expand Up @@ -1209,7 +1260,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)

ddt_key_fill(&ddk, bp);

dde = ddt_alloc(&ddk);
dde = ddt_alloc(ddt, &ddk);
ddt_alloc_entry_io(dde);

for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
Expand Down
Loading

0 comments on commit c8bdcc3

Please sign in to comment.