Skip to content

Commit

Permalink
Reduce blkptr logical and physical birth to 48 bits
Browse files Browse the repository at this point in the history
This PR reduces the blkptr's logical and physical birth times from
64-bit to 48-bits. The upper 16-bits are reserved for future work.

Consumers of the blkptr's logical or physical birth times must use the
newly provided macros when accessing those members of the blkptr
structure.

Signed-off-by: George Wilson <[email protected]>
  • Loading branch information
grwilson committed Mar 11, 2024
1 parent 8f2f6cd commit 2cecdbb
Show file tree
Hide file tree
Showing 32 changed files with 231 additions and 199 deletions.
19 changes: 10 additions & 9 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,8 @@ sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free,
break;
sublivelist_verify_block_t svb = {
.svb_dva = bp->blk_dva[i],
.svb_allocated_txg = bp->blk_birth
.svb_allocated_txg =
BP_GET_LOGICAL_BIRTH(bp)
};

if (zfs_btree_find(&sv->sv_leftover, &svb,
Expand Down Expand Up @@ -2340,7 +2341,7 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
(int)BPE_GET_ETYPE(bp),
(u_longlong_t)BPE_GET_LSIZE(bp),
(u_longlong_t)BPE_GET_PSIZE(bp),
(u_longlong_t)bp->blk_birth);
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
return;
}

Expand All @@ -2358,16 +2359,16 @@ snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp,
buflen - strlen(blkbuf),
"%llxL B=%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)bp->blk_birth);
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp));
} else {
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf),
"%llxL/%llxP F=%llu B=%llu/%llu",
(u_longlong_t)BP_GET_LSIZE(bp),
(u_longlong_t)BP_GET_PSIZE(bp),
(u_longlong_t)BP_GET_FILL(bp),
(u_longlong_t)bp->blk_birth,
(u_longlong_t)BP_PHYSICAL_BIRTH(bp));
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp),
(u_longlong_t)BP_GET_BIRTH(bp));
if (bp_freed)
(void) snprintf(blkbuf + strlen(blkbuf),
buflen - strlen(blkbuf), " %s", "FREE");
Expand Down Expand Up @@ -2417,7 +2418,7 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp,
{
int err = 0;

if (bp->blk_birth == 0)
if (BP_GET_LOGICAL_BIRTH(bp) == 0)
return (0);

print_indirect(spa, bp, zb, dnp);
Expand Down Expand Up @@ -2605,7 +2606,7 @@ dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
(void) arg, (void) tx;
char blkbuf[BP_SPRINTF_LEN];

if (bp->blk_birth != 0) {
if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("\t%s\n", blkbuf);
}
Expand Down Expand Up @@ -2646,7 +2647,7 @@ dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
(void) arg, (void) tx;
char blkbuf[BP_SPRINTF_LEN];

ASSERT(bp->blk_birth != 0);
ASSERT(BP_GET_LOGICAL_BIRTH(bp) != 0);
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed);
(void) printf("\t%s\n", blkbuf);
return (0);
Expand Down Expand Up @@ -5788,7 +5789,7 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
if (zb->zb_level == ZB_DNODE_LEVEL)
return (0);

if (dump_opt['b'] >= 5 && bp->blk_birth > 0) {
if (dump_opt['b'] >= 5 && BP_GET_LOGICAL_BIRTH(bp) > 0) {
char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), bp);
(void) printf("objset %llu object %llu "
Expand Down
12 changes: 6 additions & 6 deletions cmd/zdb/zdb_il.c
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)

if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);

Expand All @@ -186,7 +186,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg)
(void) printf("%s<hole>\n", tab_prefix);
return;
}
if (bp->blk_birth < zilog->zl_header->zh_claim_txg) {
if (BP_GET_LOGICAL_BIRTH(bp) < zilog->zl_header->zh_claim_txg) {
(void) printf("%s<block already committed>\n",
tab_prefix);
return;
Expand Down Expand Up @@ -237,8 +237,8 @@ zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg)

if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
(void) printf("%shas blkptr, %s\n", tab_prefix,
!BP_IS_HOLE(bp) &&
bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ?
!BP_IS_HOLE(bp) && BP_GET_LOGICAL_BIRTH(bp) >=
spa_min_claim_txg(zilog->zl_spa) ?
"will claim" : "won't claim");
print_log_bp(bp, tab_prefix);
}
Expand Down Expand Up @@ -473,7 +473,7 @@ print_log_block(zilog_t *zilog, const blkptr_t *bp, void *arg,

if (claim_txg != 0)
claim = "already claimed";
else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa))
else if (BP_GET_LOGICAL_BIRTH(bp) >= spa_min_claim_txg(zilog->zl_spa))
claim = "will claim";
else
claim = "won't claim";
Expand Down
4 changes: 2 additions & 2 deletions cmd/zhack.c
Original file line number Diff line number Diff line change
Expand Up @@ -612,8 +612,8 @@ zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l)
* Uberblock root block pointer has valid birth TXG.
* Copying it to the label NVlist
*/
if (ub->ub_rootbp.blk_birth != 0) {
const uint64_t txg = ub->ub_rootbp.blk_birth;
if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) {
const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp);
ub->ub_txg = txg;

if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) {
Expand Down
87 changes: 54 additions & 33 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,15 @@ typedef struct zio_cksum_salt {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | pad | vdev1 | GRID | ASIZE |
* 0 | pad | vdev1 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 1 |G| offset1 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 2 | pad | vdev2 | GRID | ASIZE |
* 2 | pad | vdev2 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 3 |G| offset2 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 4 | pad | vdev3 | GRID | ASIZE |
* 4 | pad | vdev3 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
Expand All @@ -143,9 +143,9 @@ typedef struct zio_cksum_salt {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 9 | physical birth txg |
* 9 | RSVD | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* a | logical birth txg |
* a | RSVD | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
Expand All @@ -165,7 +165,6 @@ typedef struct zio_cksum_salt {
* LSIZE logical size
* PSIZE physical size (after compression)
* ASIZE allocated size (including RAID-Z parity and gang block headers)
* GRID RAID-Z layout information (reserved for future use)
* cksum checksum function
* comp compression function
* G gang block indicator
Expand All @@ -175,6 +174,7 @@ typedef struct zio_cksum_salt {
* E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
* RSVD reserved for future use
* phys birth txg when dva[0] was written; zero if same as logical birth txg
* note that typically all the dva's would be written in this
* txg, but they could be different if they were moved by
Expand All @@ -190,11 +190,11 @@ typedef struct zio_cksum_salt {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 0 | vdev1 | GRID | ASIZE |
* 0 | vdev1 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 1 |G| offset1 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 2 | vdev2 | GRID | ASIZE |
* 2 | vdev2 | pad | ASIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 3 |G| offset2 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
Expand All @@ -208,9 +208,9 @@ typedef struct zio_cksum_salt {
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 8 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 9 | physical birth txg |
* 9 | RSVD | physical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* a | logical birth txg |
* a | RSVD | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | IV2 | fill count |
* +-------+-------+-------+-------+-------+-------+-------+-------+
Expand Down Expand Up @@ -283,7 +283,7 @@ typedef struct zio_cksum_salt {
* 8 | payload |
* 9 | payload |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* a | logical birth txg |
* a | RSVD | logical birth txg |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* b | payload |
* c | payload |
Expand All @@ -307,6 +307,7 @@ typedef struct zio_cksum_salt {
* LSIZE logical size of payload, in bytes
* note that 25 bits is enough to store the largest
* "normal" BP's LSIZE (2^16 * 2^9) in bytes
* RSVD reserved for future use
* log. birth transaction group in which the block was logically born
*
* Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
Expand Down Expand Up @@ -355,7 +356,7 @@ typedef enum bp_embedded_type {
#define BPE_NUM_WORDS 14
#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
#define BPE_IS_PAYLOADWORD(bp, wp) \
((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
((wp) != &(bp)->blk_prop && (wp) != (&(bp)->blk_birth_word[1]))

#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
Expand All @@ -374,8 +375,19 @@ typedef struct blkptr {
dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
uint64_t blk_prop; /* size, compression, type, etc */
uint64_t blk_pad[2]; /* Extra space for the future */
uint64_t blk_phys_birth; /* txg when block was allocated */
uint64_t blk_birth; /* transaction group at birth */

/*
* The following two words MUST NOT be accessed directly since
* only 48 bits of the word contain the corresponding birth txg.
* The BP_[GET|SET]_BIRTH macros should be used to manipulate the
* physical and logical birth times. Those macros will set/get
* the value in both of the words.
*
* To retrieve or manipulate one of the words, BP_GET_LOGICAL_BIRTH
* and BP_GET_PHYSICAL_BIRTH (and a corresponding SET macro) must be
* used.
*/
uint64_t blk_birth_word[2];
uint64_t blk_fill; /* fill count */
zio_cksum_t blk_cksum; /* 256-bit checksum */
} blkptr_t;
Expand All @@ -395,9 +407,6 @@ typedef struct blkptr {
BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
SPA_MINBLOCKSHIFT, 0, x)

#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)

#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
#define DVA_SET_VDEV(dva, x) \
BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
Expand Down Expand Up @@ -480,15 +489,27 @@ typedef struct blkptr {
#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1)
#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x)

#define BP_PHYSICAL_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
#define BP_GET_LOGICAL_BIRTH(bp) \
BF64_GET((bp)->blk_birth_word[1], 0, 48)
#define BP_SET_LOGICAL_BIRTH(bp, x) \
BF64_SET((bp)->blk_birth_word[1], 0, 48, x)

#define BP_GET_PHYSICAL_BIRTH(bp) \
BF64_GET((bp)->blk_birth_word[0], 0, 48)
#define BP_SET_PHYSICAL_BIRTH(bp, x) \
BF64_SET((bp)->blk_birth_word[0], 0, 48, x)

#define BP_GET_BIRTH(bp) \
(BP_IS_EMBEDDED(bp) ? 0 : \
BP_GET_PHYSICAL_BIRTH(bp) ? BP_GET_PHYSICAL_BIRTH(bp) : \
BP_GET_LOGICAL_BIRTH(bp))

#define BP_SET_BIRTH(bp, logical, physical) \
{ \
ASSERT(!BP_IS_EMBEDDED(bp)); \
(bp)->blk_birth = (logical); \
(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
BP_SET_LOGICAL_BIRTH(bp, logical); \
BP_SET_PHYSICAL_BIRTH(bp, \
((logical) == (physical) ? 0 : (physical))); \
}

#define BP_GET_FILL(bp) \
Expand Down Expand Up @@ -541,8 +562,8 @@ typedef struct blkptr {
(dva1)->dva_word[0] == (dva2)->dva_word[0])

#define BP_EQUAL(bp1, bp2) \
(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
(bp1)->blk_birth == (bp2)->blk_birth && \
(BP_GET_BIRTH(bp1) == BP_GET_BIRTH(bp2) && \
BP_GET_LOGICAL_BIRTH(bp1) == BP_GET_LOGICAL_BIRTH(bp2) && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
Expand Down Expand Up @@ -581,8 +602,8 @@ typedef struct blkptr {
(bp)->blk_prop = 0; \
(bp)->blk_pad[0] = 0; \
(bp)->blk_pad[1] = 0; \
(bp)->blk_phys_birth = 0; \
(bp)->blk_birth = 0; \
(bp)->blk_birth_word[0] = 0; \
(bp)->blk_birth_word[1] = 0; \
(bp)->blk_fill = 0; \
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
Expand Down Expand Up @@ -631,7 +652,7 @@ typedef struct blkptr {
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \
} else if (BP_IS_EMBEDDED(bp)) { \
len = func(buf + len, size - len, \
"EMBEDDED [L%llu %s] et=%u %s " \
Expand All @@ -642,14 +663,14 @@ typedef struct blkptr {
compress, \
(u_longlong_t)BPE_GET_LSIZE(bp), \
(u_longlong_t)BPE_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \
} else if (BP_IS_REDACTED(bp)) { \
len += func(buf + len, size - len, \
"REDACTED [L%llu %s] size=%llxL birth=%lluL", \
(u_longlong_t)BP_GET_LEVEL(bp), \
type, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)bp->blk_birth); \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); \
} else { \
for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \
Expand Down Expand Up @@ -691,8 +712,8 @@ typedef struct blkptr {
ws, \
(u_longlong_t)BP_GET_LSIZE(bp), \
(u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth, \
(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
(u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), \
(u_longlong_t)BP_GET_BIRTH(bp), \
(u_longlong_t)BP_GET_FILL(bp), \
ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \
Expand Down Expand Up @@ -1142,9 +1163,9 @@ extern const char *spa_state_to_name(spa_t *spa);
/* error handling */
struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb,
const uint64_t *birth);
const uint64_t birth);
extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb,
const uint64_t *birth);
const uint64_t birth);
extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
Expand Down
2 changes: 1 addition & 1 deletion include/sys/uberblock_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ struct uberblock {
* pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
* the value of the field is used to determine which ZIL blocks have
* been allocated according to the ms_sm when we are rewinding to a
* checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
* checkpoint. Specifically, if logical birth > ub_checkpoint_txg,then
* the ZIL block is not allocated [see uses of spa_min_claim_txg()].
*/
uint64_t ub_checkpoint_txg;
Expand Down
4 changes: 2 additions & 2 deletions lib/libzdb/libzdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,9 @@ livelist_compare(const void *larg, const void *rarg)
* Since we're storing blkptrs without cancelling FREE/ALLOC pairs,
* it's possible the offsets are equal. In that case, sort by txg
*/
if (l->blk_birth < r->blk_birth) {
if (BP_GET_LOGICAL_BIRTH(l) < BP_GET_LOGICAL_BIRTH(r)) {
return (-1);
} else if (l->blk_birth > r->blk_birth) {
} else if (BP_GET_LOGICAL_BIRTH(l) > BP_GET_LOGICAL_BIRTH(r)) {
return (+1);
}
return (0);
Expand Down
Loading

0 comments on commit 2cecdbb

Please sign in to comment.