Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast Dedup: Cleanup and documentation ahead of integrating Fast Dedup #15887

Closed
wants to merge 15 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
#include <sys/arc.h>
#include <sys/arc_impl.h>
#include <sys/ddt.h>
#include <sys/ddt_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/blkptr.h>
Expand Down Expand Up @@ -1904,7 +1905,7 @@ dump_dedup_ratio(const ddt_stat_t *dds)
}

static void
dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class)
{
char name[DDT_NAMELEN];
ddt_entry_t dde;
Expand Down Expand Up @@ -1964,8 +1965,10 @@ dump_all_ddts(spa_t *spa)

for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
for (enum ddt_class class = 0; class < DDT_CLASSES;
if (!ddt)
continue;
for (ddt_type_t type = 0; type < DDT_TYPES; type++) {
for (ddt_class_t class = 0; class < DDT_CLASSES;
class++) {
dump_ddt(ddt, type, class);
}
Expand Down Expand Up @@ -6061,6 +6064,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
return;

ASSERT(ddt_phys_total_refcnt(&dde) > 1);
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
VERIFY(ddt);

for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
Expand All @@ -6075,7 +6080,7 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
zcb->zcb_dedup_blocks++;
}
}
ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];

ddt_enter(ddt);
VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
ddt_exit(ddt);
Expand Down Expand Up @@ -7121,6 +7126,7 @@ dump_block_stats(spa_t *spa)
}

typedef struct zdb_ddt_entry {
/* key must be first for ddt_key_compare */
ddt_key_t zdde_key;
uint64_t zdde_ref_blocks;
uint64_t zdde_ref_lsize;
Expand Down Expand Up @@ -7181,7 +7187,7 @@ dump_simulated_ddt(spa_t *spa)
ddt_histogram_t ddh_total = {{{0}}};
ddt_stat_t dds_total = {0};

avl_create(&t, ddt_entry_compare,
avl_create(&t, ddt_key_compare,
sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node));

spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
Expand Down Expand Up @@ -7947,6 +7953,8 @@ dump_mos_leaks(spa_t *spa)
for (uint64_t cksum = 0;
cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) {
ddt_t *ddt = spa->spa_ddt[cksum];
if (!ddt)
continue;
mos_obj_refd(ddt->ddt_object[type][class]);
}
}
Expand Down
1 change: 1 addition & 0 deletions include/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ COMMON_H = \
sys/dataset_kstats.h \
sys/dbuf.h \
sys/ddt.h \
sys/ddt_impl.h \
sys/dmu.h \
sys/dmu_impl.h \
sys/dmu_objset.h \
Expand Down
199 changes: 103 additions & 96 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016 by Delphix. All rights reserved.
* Copyright (c) 2023, Klara Inc.
*/

#ifndef _SYS_DDT_H
Expand All @@ -39,32 +40,50 @@ extern "C" {
struct abd;

/*
* On-disk DDT formats, in the desired search order (newest version first).
* DDT on-disk storage object types. Each one corresponds to a specific
* implementation, see ddt_ops_t. The value itself is not stored on disk.
*
* When searching for an entry, object types will be searched in this order.
*
* Note that DDT_TYPES is used as the "no type" for new entries that have not
* yet been written to a storage object.
*/
enum ddt_type {
DDT_TYPE_ZAP = 0,
typedef enum {
DDT_TYPE_ZAP = 0, /* ZAP storage object, ddt_zap */
DDT_TYPES
};
} ddt_type_t;

_Static_assert(DDT_TYPES <= UINT8_MAX,
"ddt_type_t must fit in a uint8_t");

/* New and updated entries receive this type, see ddt_sync_entry() */
#define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP)

/*
* DDT classes, in the desired search order (highest replication level first).
* DDT storage classes. Each class has a separate storage object for each type.
* The value itself is not stored on disk.
*
* When searching for an entry, object classes will be searched in this order.
*
* Note that DDT_CLASSES is used as the "no class" for new entries that have not
* yet been written to a storage object.
*/
enum ddt_class {
DDT_CLASS_DITTO = 0,
DDT_CLASS_DUPLICATE,
DDT_CLASS_UNIQUE,
typedef enum {
DDT_CLASS_DITTO = 0, /* entry has ditto blocks (obsolete) */
DDT_CLASS_DUPLICATE, /* entry has multiple references */
DDT_CLASS_UNIQUE, /* entry has a single reference */
DDT_CLASSES
};

#define DDT_TYPE_CURRENT 0
} ddt_class_t;

#define DDT_COMPRESS_BYTEORDER_MASK 0x80
#define DDT_COMPRESS_FUNCTION_MASK 0x7f
_Static_assert(DDT_CLASSES < UINT8_MAX,
"ddt_class_t must fit in a uint8_t");

/*
* On-disk ddt entry: key (name) and physical storage (value).
* The "key" part of an on-disk entry. This is the unique "name" for a block,
* that is, the parts of the block pointer that will always be the same for
* the same data.
*/
typedef struct ddt_key {
typedef struct {
zio_cksum_t ddk_cksum; /* 256-bit block checksum */
/*
* Encoded with logical & physical size, encryption, and compression,
Expand All @@ -76,6 +95,10 @@ typedef struct ddt_key {
uint64_t ddk_prop;
} ddt_key_t;

/*
* Macros for accessing parts of a ddt_key_t. These are similar to their BP_*
* counterparts.
*/
#define DDK_GET_LSIZE(ddk) \
BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
#define DDK_SET_LSIZE(ddk, x) \
Expand All @@ -92,18 +115,25 @@ typedef struct ddt_key {
#define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1)
#define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x)

#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t))

#define DDE_GET_NDVAS(dde) (DDK_GET_CRYPT(&dde->dde_key) \
? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP)

typedef struct ddt_phys {
/*
* The "value" part for an on-disk entry. These are the "physical"
* characteristics of the stored block, such as its location on disk (DVAs),
* birth txg and ref count.
*
* Note that an entry has an array of four ddt_phys_t, one for each number of
* DVAs (copies= property) and another for additional "ditto" copies. Most
* users of ddt_phys_t will handle indexing into or counting the phys they
* want.
*/
typedef struct {
dva_t ddp_dva[SPA_DVAS_PER_BP];
uint64_t ddp_refcnt;
uint64_t ddp_phys_birth;
} ddt_phys_t;

/*
* Named indexes into the ddt_phys_t array in each entry.
*
* Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However,
* we maintain the ability to free existing dedup-ditto blocks.
*/
Expand All @@ -116,99 +146,83 @@ enum ddt_phys_type {
};

/*
* In-core ddt entry
* A "live" entry, holding changes to an entry made this txg, and other data to
* support loading, updating and repairing the entry.
*/
struct ddt_entry {
ddt_key_t dde_key;
ddt_phys_t dde_phys[DDT_PHYS_TYPES];

/* State flags for dde_flags */
#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */

typedef struct {
/* key must be first for ddt_key_compare */
ddt_key_t dde_key; /* ddt_tree key */
ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */

/* in-flight update IOs */
zio_t *dde_lead_zio[DDT_PHYS_TYPES];

/* copy of data after a repair read, to be rewritten */
struct abd *dde_repair_abd;
enum ddt_type dde_type;
enum ddt_class dde_class;
uint8_t dde_loading;
uint8_t dde_loaded;
kcondvar_t dde_cv;
avl_node_t dde_node;
};

/* storage type and class the entry was loaded from */
ddt_type_t dde_type;
ddt_class_t dde_class;

uint8_t dde_flags; /* load state flags */
kcondvar_t dde_cv; /* signaled when load completes */

avl_node_t dde_node; /* ddt_tree node */
} ddt_entry_t;

/*
* In-core ddt
* In-core DDT object. This covers all entries and stats for the whole pool
* for a given checksum type.
*/
struct ddt {
kmutex_t ddt_lock;
avl_tree_t ddt_tree;
avl_tree_t ddt_repair_tree;
enum zio_checksum ddt_checksum;
spa_t *ddt_spa;
objset_t *ddt_os;
uint64_t ddt_stat_object;
typedef struct {
kmutex_t ddt_lock; /* protects changes to all fields */

avl_tree_t ddt_tree; /* "live" (changed) entries this txg */

avl_tree_t ddt_repair_tree; /* entries being repaired */

enum zio_checksum ddt_checksum; /* checksum algorithm in use */
spa_t *ddt_spa; /* pool this ddt is on */
objset_t *ddt_os; /* ddt objset (always MOS) */

/* per-type/per-class entry store objects */
uint64_t ddt_object[DDT_TYPES][DDT_CLASSES];

/* object ids for whole-ddt and per-type/per-class stats */
uint64_t ddt_stat_object;
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];

/* type/class stats by power-2-sized referenced blocks */
ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES];
ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES];
ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES];
avl_node_t ddt_node;
};
} ddt_t;

/*
* In-core and on-disk bookmark for DDT walks
* In-core and on-disk bookmark for DDT walks. This is a cursor for ddt_walk(),
* and is stable across calls, even if the DDT is updated, the pool is
* restarted or loaded on another system, or OpenZFS is upgraded.
*/
typedef struct ddt_bookmark {
typedef struct {
uint64_t ddb_class;
uint64_t ddb_type;
uint64_t ddb_checksum;
uint64_t ddb_cursor;
} ddt_bookmark_t;

/*
* Ops vector to access a specific DDT object type.
*/
typedef struct ddt_ops {
char ddt_op_name[32];
int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
boolean_t prehash);
int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx);
int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde);
void (*ddt_op_prefetch)(objset_t *os, uint64_t object,
ddt_entry_t *dde);
int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde,
dmu_tx_t *tx);
int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde,
dmu_tx_t *tx);
int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde,
uint64_t *walk);
int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count);
} ddt_ops_t;

#define DDT_NAMELEN 107

extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz, char *name);
extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz, uint64_t *walk, ddt_entry_t *dde);
extern int ddt_object_count(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz, uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz, dmu_object_info_t *);
extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz);

extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
uint64_t txg);
extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
const ddt_phys_t *ddp, blkptr_t *bp);

extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);

extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp);
extern void ddt_phys_clear(ddt_phys_t *ddp);
extern void ddt_phys_addref(ddt_phys_t *ddp);
extern void ddt_phys_decref(ddt_phys_t *ddp);
extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp,
uint64_t txg);
extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp);
extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);

extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg);

extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src);
extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh);
Expand All @@ -220,9 +234,6 @@ extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);

extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len);
extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);

extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
extern void ddt_enter(ddt_t *ddt);
extern void ddt_exit(ddt_t *ddt);
Expand All @@ -232,26 +243,22 @@ extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp);
extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);

extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class,
const blkptr_t *bp);

extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);

extern int ddt_entry_compare(const void *x1, const void *x2);
extern int ddt_key_compare(const void *x1, const void *x2);

extern void ddt_create(spa_t *spa);
extern int ddt_load(spa_t *spa);
extern void ddt_unload(spa_t *spa);
extern void ddt_sync(spa_t *spa, uint64_t txg);
extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx);

extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);

extern const ddt_ops_t ddt_zap_ops;

#ifdef __cplusplus
}
#endif
Expand Down
Loading
Loading