Skip to content

Commit

Permalink
Updating based on PR Feedback(3)
Browse files Browse the repository at this point in the history
1. Unified the block cloning and Direct I/O code paths further. As part
   of this unification, it is important to outline that Direct I/O
   writes transition the db_state to DB_UNCACHED. This is used so that
   dbuf_unoverride() is called when dbuf_undirty() is called. This is
   needed to cleanup space accounting in a TXG. When a dbuf is redirtied
   through dbuf_redirty(), then dbuf_unoverride() is also called to
   clean up space accounting. This is a bit of a different approach that
   block cloning, which always calls dbuf_undirty().
2. As part of uniying the two, Direct I/O also performs the same check
   in dmu_buf_will_fill() so that on failure the previous contents of
   the dbuf are set correctly.
3. General just code cleanup removing checks that are no longer
   necessary.

Signed-off-by: Brian Atkinson <[email protected]>
  • Loading branch information
bwatkinson committed Jul 27, 2024
1 parent 615e4c2 commit b780505
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 94 deletions.
15 changes: 1 addition & 14 deletions include/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ typedef struct dbuf_dirty_record {
uint8_t dr_copies;
boolean_t dr_nopwrite;
boolean_t dr_brtwrite;
boolean_t dr_diowrite;
boolean_t dr_has_raw_params;

/*
Expand Down Expand Up @@ -453,20 +454,6 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
return (NULL);
}

static inline boolean_t
dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr)
{
boolean_t ret = B_FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));

if (dr != NULL && db->db_level == 0 && !dr->dt.dl.dr_brtwrite &&
dr->dt.dl.dr_override_state == DR_OVERRIDDEN &&
dr->dt.dl.dr_data == NULL) {
ret = B_TRUE;
}
return (ret);
}

#define DBUF_GET_BUFC_TYPE(_db) \
(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)

Expand Down
120 changes: 44 additions & 76 deletions module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -1255,16 +1255,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
ASSERT(buf != NULL);

db->db_buf = buf;

/*
* If there is a Direct I/O, set its data too. Then its state
* will be the same as if we did a ZIL dmu_sync().
*/
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
if (dbuf_dirty_is_direct_write(db, dr)) {
dr->dt.dl.dr_data = db->db_buf;
}

ASSERT(buf->b_data != NULL);
db->db.db_data = buf->b_data;
}
Expand Down Expand Up @@ -1843,7 +1833,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
/*
* If a block clone or Direct I/O write has occurred we will
* get the dirty records overridden BP so we get the most
* recent data..
* recent data.
*/
err = dmu_buf_get_bp_from_dbuf(db, &bp);

Expand Down Expand Up @@ -1948,13 +1938,14 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);

if (dr->dt.dl.dr_brtwrite) {
if (dr->dt.dl.dr_brtwrite || dr->dt.dl.dr_diowrite) {
ASSERT0P(dr->dt.dl.dr_data);
dr->dt.dl.dr_data = db->db_buf;
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
dr->dt.dl.dr_brtwrite = B_FALSE;
dr->dt.dl.dr_diowrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE;

/*
Expand Down Expand Up @@ -2161,26 +2152,11 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
*/
dbuf_unoverride(dr);
if (db->db.db_object != DMU_META_DNODE_OBJECT &&
db->db_state != DB_NOFILL && db->db_buf != NULL) {
/*
* Already released on initial dirty,
* so just thaw.
*/
db->db_state != DB_NOFILL) {
/* Already released on initial dirty, so just thaw. */
ASSERT(arc_released(db->db_buf));
arc_buf_thaw(db->db_buf);
}
/*
* If initial dirty was via Direct I/O, may not have a dr_data.
*
* If the dirty record was associated with cloned block then
* the call above to dbuf_unoverride() will have reset
* dr->dt.dl.dr_data and it will not be NULL here.
*/
if (dr->dt.dl.dr_data == NULL) {
ASSERT3B(dbuf_dirty_is_direct_write(db, dr), ==,
B_TRUE);
dr->dt.dl.dr_data = db->db_buf;
}
}
}

Expand Down Expand Up @@ -2564,6 +2540,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
boolean_t brtwrite;
boolean_t diowrite;

ASSERT(txg != 0);

Expand All @@ -2589,7 +2566,9 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(dr->dr_dbuf == db);

brtwrite = dr->dt.dl.dr_brtwrite;
diowrite = dr->dt.dl.dr_diowrite;
if (brtwrite) {
ASSERT3B(diowrite, ==, B_FALSE);
/*
* We are freeing a block that we cloned in the same
* transaction group.
Expand Down Expand Up @@ -2630,11 +2609,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);

/*
* In the Direct I/O case, the buffer is still dirty, but it
* may be UNCACHED, so we do not need to destroy an ARC buffer.
*/
if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) {
if (dr->dt.dl.dr_data != db->db_buf) {
ASSERT(db->db_buf != NULL);
ASSERT(dr->dt.dl.dr_data != NULL);
arc_buf_destroy(dr->dt.dl.dr_data, db);
Expand All @@ -2647,12 +2622,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_dirtycnt -= 1;

if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
/*
* In the Direct I/O case our db_buf will be NULL as we are not
* caching in the ARC.
*/
ASSERT(db->db_state == DB_NOFILL || brtwrite ||
db->db_buf == NULL || arc_released(db->db_buf));
ASSERT(db->db_state == DB_NOFILL || brtwrite || diowrite ||
arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
}
Expand Down Expand Up @@ -2711,8 +2682,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
* Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
* want to make sure dbuf_read() will read the pending cloned block and
* not the uderlying block that is being replaced. dbuf_undirty() will
* do dbuf_unoverride(), so we will end up with cloned block content,
* without overridden BP.
* do brt_pending_remove() before removing the dirty record.
*/
(void) dbuf_read(db, NULL, flags);
if (undirty) {
Expand Down Expand Up @@ -2761,19 +2731,16 @@ dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)

*bp = db->db_blkptr;
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
if (dr) {
if (db->db_state == DB_NOFILL) {
/* Block clone */
if (!dr->dt.dl.dr_brtwrite)
error = EIO;
else
*bp = &dr->dt.dl.dr_overridden_by;
} else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN &&
dr->dt.dl.dr_data == NULL) {
ASSERT(db->db_state == DB_UNCACHED);
/* Direct I/O write */
if (dr && db->db_state == DB_NOFILL) {
/* Block clone */
if (!dr->dt.dl.dr_brtwrite)
error = EIO;
else
*bp = &dr->dt.dl.dr_overridden_by;
} else if (dr && db->db_state == DB_UNCACHED) {
/* Direct I/O write */
if (dr->dt.dl.dr_diowrite)
*bp = &dr->dt.dl.dr_overridden_by;
}
}

return (error);
Expand Down Expand Up @@ -2926,21 +2893,28 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
dmu_tx_private_ok(tx));

mutex_enter(&db->db_mtx);
if (db->db_state == DB_NOFILL) {
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
if (db->db_state == DB_NOFILL ||
(db->db_state == DB_UNCACHED && dr && dr->dt.dl.dr_diowrite)) {
/*
* Block cloning: We will be completely overwriting a block
* cloned in this transaction group, so let's undirty the
* pending clone and mark the block as uncached. This will be
* as if the clone was never done. But if the fill can fail
* we should have a way to return back to the cloned data.
* If the fill can fail we should have a way to return back to
* the cloned or Direct I/O write data.
*/
if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
if (canfail && dr) {
mutex_exit(&db->db_mtx);
dmu_buf_will_dirty(db_fake, tx);
return;
}
VERIFY(!dbuf_undirty(db, tx));
db->db_state = DB_UNCACHED;
/*
* Block cloning: We will be completely overwriting a block
* cloned in this transaction group, so let's undirty the
* pending clone and mark the block as uncached. This will be
* as if the clone was never done.
*/
if (dr && dr->dt.dl.dr_brtwrite) {
VERIFY(!dbuf_undirty(db, tx));
db->db_state = DB_UNCACHED;
}
}
mutex_exit(&db->db_mtx);

Expand Down Expand Up @@ -5076,6 +5050,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
if (dr->dt.dl.dr_data != NULL &&
dr->dt.dl.dr_data != db->db_buf) {
ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE);
ASSERT3B(dr->dt.dl.dr_diowrite, ==, B_FALSE);
arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
Expand Down Expand Up @@ -5137,9 +5112,7 @@ dbuf_write_override_done(zio_t *zio)
if (!BP_EQUAL(zio->io_bp, obp)) {
if (!BP_IS_HOLE(obp))
dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);

if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf)
arc_release(dr->dt.dl.dr_data, db);
arc_release(dr->dt.dl.dr_data, db);
}
mutex_exit(&db->db_mtx);

Expand Down Expand Up @@ -5347,14 +5320,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
* (by dmu_sync(), dmu_write_direct(),
* or dmu_buf_write_embedded()).
*/
blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
abd_t *contents = NULL;
if (data) {
ASSERT(BP_IS_HOLE(bp) ||
arc_buf_lsize(data) == BP_GET_LSIZE(bp));
contents = abd_get_from_buf(data->b_data,
arc_buf_size(data));
}
abd_t *contents = (data != NULL) ?
abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;

dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
Expand All @@ -5363,8 +5330,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, bp, dr->dt.dl.dr_copies,
dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite);
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
} else if (data == NULL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
Expand Down
9 changes: 7 additions & 2 deletions module/zfs/dmu_direct.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,12 @@ dmu_write_direct_done(zio_t *zio)
ASSERT3U(zio->io_error, ==, EAGAIN);

/*
* In the event of an I/O error the metaslab cleanup is taken
* care of in zio_done().
* In the event of an I/O error this block has been freed in
* zio_done() through zio_dva_unallocate(). Calling
* dmu_sync_done() above set dr_override_state to
* DR_NOT_OVERRIDDEN. In this case when dbuf_undirty() calls
* dbuf_unoverride(), it will skip doing zio_free() to free
* this block as that was already taken care of.
*
* Since we are undirtying the record in open-context, we must
* have a hold on the db, so it should never be evicted after
Expand Down Expand Up @@ -155,6 +159,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)

dr_head = list_head(&db->db_dirty_records);
ASSERT3U(dr_head->dr_txg, ==, txg);
dr_head->dt.dl.dr_diowrite = B_TRUE;
dr_head->dr_accounted = db->db.db_size;

blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
Expand Down
5 changes: 3 additions & 2 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -1154,11 +1154,12 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf,
if (error == 0) {
zgd->zgd_db = dbp;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp;
boolean_t direct_write = B_FALSE;
mutex_enter(&db->db_mtx);
dbuf_dirty_record_t *dr =
dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg);
boolean_t direct_write =
dbuf_dirty_is_direct_write(db, dr);
if (dr != NULL && dr->dt.dl.dr_diowrite)
direct_write = B_TRUE;
mutex_exit(&db->db_mtx);

/*
Expand Down

0 comments on commit b780505

Please sign in to comment.