Skip to content

Commit

Permalink
ddt: slim down ddt_entry_t
Browse files Browse the repository at this point in the history
This slims down the in-memory entry to as small as it can be. The
IO-related parts are made into a separate entry, since they're
relatively rarely needed.

The variable allocation for dde_phys is to support the upcoming flat
format.

Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Rob Norris <[email protected]>
Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.
Closes #15893
  • Loading branch information
robn authored and behlendorf committed Aug 16, 2024
1 parent 4d686c3 commit 0ba5f50
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 31 deletions.
22 changes: 16 additions & 6 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,16 +151,22 @@ typedef struct {
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */
#define DDE_FLAG_OVERQUOTA (1 << 1) /* entry unusable, no space */

/*
* Additional data to support entry update or repair. This is fixed size
* because it's relatively rarely used.
*/
typedef struct {
/* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */
ddt_phys_t dde_phys[DDT_PHYS_MAX]; /* on-disk data */
/* copy of data after a repair read, to be rewritten */
abd_t *dde_repair_abd;

/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_MAX];
} ddt_entry_io_t;

/* copy of data after a repair read, to be rewritten */
struct abd *dde_repair_abd;
typedef struct {
/* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */
avl_node_t dde_node; /* ddt_tree_node */

/* storage type and class the entry was loaded from */
ddt_type_t dde_type;
Expand All @@ -170,7 +176,9 @@ typedef struct {
kcondvar_t dde_cv; /* signaled when load completes */
uint64_t dde_waiters; /* count of waiters on dde_cv */

avl_node_t dde_node; /* ddt_tree node */
ddt_entry_io_t *dde_io; /* IO support, when required */

ddt_phys_t dde_phys[]; /* physical data */
} ddt_entry_t;

/*
Expand Down Expand Up @@ -265,6 +273,8 @@ extern void ddt_prefetch_all(spa_t *spa);
extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
const blkptr_t *bp);

extern void ddt_alloc_entry_io(ddt_entry_t *dde);

extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);

Expand Down
46 changes: 33 additions & 13 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@
static kmem_cache_t *ddt_cache;
static kmem_cache_t *ddt_entry_cache;

/*
 * Number of bytes occupied by one entry from ddt_entry_cache: the fixed
 * ddt_entry_t header followed by DDT_PHYS_MAX ddt_phys_t slots for the
 * trailing dde_phys[] array.
 */
#define DDT_ENTRY_SIZE \
	(sizeof (ddt_entry_t) + DDT_PHYS_MAX * sizeof (ddt_phys_t))

/*
* Enable/disable prefetching of dedup-ed blocks which are going to be freed.
*/
Expand Down Expand Up @@ -343,7 +346,7 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class,

return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key,
dde->dde_phys, sizeof (dde->dde_phys)));
dde->dde_phys, sizeof (ddt_phys_t) * DDT_NPHYS(ddt)));
}

static int
Expand Down Expand Up @@ -386,7 +389,7 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class,

return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys,
sizeof (dde->dde_phys), tx));
sizeof (ddt_phys_t) * DDT_NPHYS(ddt), tx));
}

static int
Expand Down Expand Up @@ -597,7 +600,7 @@ ddt_init(void)
ddt_cache = kmem_cache_create("ddt_cache",
sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
DDT_ENTRY_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0);
}

void
Expand All @@ -613,22 +616,35 @@ ddt_alloc(const ddt_key_t *ddk)
ddt_entry_t *dde;

dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
memset(dde, 0, sizeof (ddt_entry_t));
memset(dde, 0, DDT_ENTRY_SIZE);
cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);

dde->dde_key = *ddk;

return (dde);
}

/*
 * Make sure the entry has its IO-support structure attached.
 *
 * When dde->dde_io is still NULL a zeroed ddt_entry_io_t is allocated and
 * hung off the entry. An entry that already carries one is left untouched,
 * so calling this more than once on the same entry is harmless.
 */
void
ddt_alloc_entry_io(ddt_entry_t *dde)
{
	if (dde->dde_io == NULL) {
		dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP);
	}
}

static void
ddt_free(const ddt_t *ddt, ddt_entry_t *dde)
{
for (int p = 0; p < DDT_NPHYS(ddt); p++)
ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
if (dde->dde_io != NULL) {
for (int p = 0; p < DDT_NPHYS(ddt); p++)
ASSERT3P(dde->dde_io->dde_lead_zio[p], ==, NULL);

if (dde->dde_repair_abd != NULL)
abd_free(dde->dde_repair_abd);
if (dde->dde_io->dde_repair_abd != NULL)
abd_free(dde->dde_io->dde_repair_abd);

kmem_free(dde->dde_io, sizeof (ddt_entry_io_t));
}

cv_destroy(&dde->dde_cv);
kmem_cache_free(ddt_entry_cache, dde);
Expand Down Expand Up @@ -1191,6 +1207,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
ddt_key_fill(&ddk, bp);

dde = ddt_alloc(&ddk);
ddt_alloc_entry_io(dde);

for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES; class++) {
Expand All @@ -1205,7 +1222,7 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
}
}

memset(dde->dde_phys, 0, sizeof (dde->dde_phys));
memset(dde->dde_phys, 0, sizeof (ddt_phys_t) * DDT_NPHYS(ddt));

return (dde);
}
Expand All @@ -1217,7 +1234,8 @@ ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)

ddt_enter(ddt);

if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
if (dde->dde_io->dde_repair_abd != NULL &&
spa_writeable(ddt->ddt_spa) &&
avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
avl_insert(&ddt->ddt_repair_tree, dde, where);
else
Expand Down Expand Up @@ -1255,8 +1273,9 @@ ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
continue;
ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk),
NULL, NULL, ZIO_PRIORITY_SYNC_WRITE,
ZIO_DDT_CHILD_FLAGS(zio), NULL));
}

zio_nowait(zio);
Expand Down Expand Up @@ -1301,7 +1320,8 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
ASSERT(dde->dde_flags & DDE_FLAG_LOADED);

for (int p = 0; p < DDT_NPHYS(ddt); p++) {
ASSERT3P(dde->dde_lead_zio[p], ==, NULL);
ASSERT(dde->dde_io == NULL ||
dde->dde_io->dde_lead_zio[p] == NULL);
ddt_phys_t *ddp = &dde->dde_phys[p];
if (ddp->ddp_phys_birth == 0) {
ASSERT0(ddp->ddp_refcnt);
Expand Down
26 changes: 14 additions & 12 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -3265,8 +3265,8 @@ zio_ddt_child_read_done(zio_t *zio)
if (zio->io_error == 0)
ddt_phys_clear(ddp); /* this ddp doesn't need repair */

if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
dde->dde_repair_abd = zio->io_abd;
if (zio->io_error == 0 && dde->dde_io->dde_repair_abd == NULL)
dde->dde_io->dde_repair_abd = zio->io_abd;
else
abd_free(zio->io_abd);
mutex_exit(&pio->io_lock);
Expand Down Expand Up @@ -3340,8 +3340,8 @@ zio_ddt_read_done(zio_t *zio)
zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
return (NULL);
}
if (dde->dde_repair_abd != NULL) {
abd_copy(zio->io_abd, dde->dde_repair_abd,
if (dde->dde_io->dde_repair_abd != NULL) {
abd_copy(zio->io_abd, dde->dde_io->dde_repair_abd,
zio->io_size);
zio->io_child_error[ZIO_CHILD_DDT] = 0;
}
Expand Down Expand Up @@ -3378,7 +3378,7 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
if (DDT_PHYS_IS_DITTO(ddt, p))
continue;

zio_t *lio = dde->dde_lead_zio[p];
zio_t *lio = dde->dde_io->dde_lead_zio[p];

if (lio != NULL && do_raw) {
return (lio->io_size != zio->io_size ||
Expand Down Expand Up @@ -3472,7 +3472,7 @@ zio_ddt_child_write_ready(zio_t *zio)

ddt_enter(ddt);

ASSERT(dde->dde_lead_zio[p] == zio);
ASSERT(dde->dde_io->dde_lead_zio[p] == zio);

ddt_phys_fill(ddp, zio->io_bp);

Expand All @@ -3495,8 +3495,8 @@ zio_ddt_child_write_done(zio_t *zio)
ddt_enter(ddt);

ASSERT(ddp->ddp_refcnt == 0);
ASSERT(dde->dde_lead_zio[p] == zio);
dde->dde_lead_zio[p] = NULL;
ASSERT(dde->dde_io->dde_lead_zio[p] == zio);
dde->dde_io->dde_lead_zio[p] = NULL;

if (zio->io_error == 0) {
zio_link_t *zl = NULL;
Expand Down Expand Up @@ -3563,11 +3563,13 @@ zio_ddt_write(zio_t *zio)
return (zio);
}

if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
ddt_alloc_entry_io(dde);

if (ddp->ddp_phys_birth != 0 || dde->dde_io->dde_lead_zio[p] != NULL) {
if (ddp->ddp_phys_birth != 0)
ddt_bp_fill(ddp, bp, txg);
if (dde->dde_lead_zio[p] != NULL)
zio_add_child(zio, dde->dde_lead_zio[p]);
if (dde->dde_io->dde_lead_zio[p] != NULL)
zio_add_child(zio, dde->dde_io->dde_lead_zio[p]);
else
ddt_phys_addref(ddp);
} else if (zio->io_bp_override) {
Expand All @@ -3583,7 +3585,7 @@ zio_ddt_write(zio_t *zio)
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
dde->dde_lead_zio[p] = cio;
dde->dde_io->dde_lead_zio[p] = cio;
}

ddt_exit(ddt);
Expand Down

0 comments on commit 0ba5f50

Please sign in to comment.