diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h
index 0be2c445ab76..e028865189cf 100644
--- a/include/os/linux/zfs/sys/zfs_znode_impl.h
+++ b/include/os/linux/zfs/sys/zfs_znode_impl.h
@@ -184,12 +184,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
 extern void zfs_inode_destroy(struct inode *);
 extern void zfs_mark_inode_dirty(struct inode *);
 extern boolean_t zfs_relatime_need_update(const struct inode *);
-
-#if defined(HAVE_UIO_RW)
-extern caddr_t zfs_map_page(page_t *, enum seg_rw);
-extern void zfs_unmap_page(page_t *, caddr_t);
-#endif /* HAVE_UIO_RW */
-
 extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
 
 #ifdef __cplusplus
diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 715731fc2f6f..33db3de5f06d 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -387,7 +387,7 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
     dmu_tx_t *tx);
 boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-blkptr_t *dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db);
+int dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp);
 int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
@@ -453,18 +453,6 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
 	return (NULL);
 }
 
-/*
- * All Direct I/O writes happen in open context so the first dirty record will
- * always be associated with the write. After a Direct I/O write completes the
- * dirty records dr_overriden state will bet DR_OVERRIDDEN and the dr_data will
- * get set to NULL.
- */
-static inline dbuf_dirty_record_t *
-dbuf_get_dirty_direct(dmu_buf_impl_t *db)
-{
-	return (list_head(&db->db_dirty_records));
-}
-
 static inline boolean_t
 dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr)
 {
diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h
index 8de71448e457..4fd952513811 100644
--- a/include/sys/zfs_vnops.h
+++ b/include/sys/zfs_vnops.h
@@ -29,12 +29,6 @@
 
 extern int zfs_bclone_enabled;
 
-typedef enum zfs_direct_enabled {
-	ZFS_DIRECT_IO_ERR,
-	ZFS_DIRECT_IO_DISABLED,
-	ZFS_DIRECT_IO_ENABLED
-} zfs_direct_enabled_t;
-
 extern int zfs_fsync(znode_t *, int, cred_t *);
 extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
 extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
@@ -52,7 +46,7 @@ extern int mappedread(znode_t *, int, zfs_uio_t *);
 extern int mappedread_sf(znode_t *, int, zfs_uio_t *);
 extern void update_pages(znode_t *, int64_t, int, objset_t *);
 
-extern zfs_direct_enabled_t zfs_check_direct_enabled(znode_t *, int, int *);
+extern int zfs_check_direct_enabled(znode_t *, int, boolean_t *);
 extern int zfs_setup_direct(znode_t *, zfs_uio_t *, zfs_uio_rw_t, int *);
 
 /*
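The header change above collapses the three-state zfs_direct_enabled_t into an (error, boolean) pair, and gives dmu_buf_get_bp_from_dbuf() the same error-first shape. A minimal userspace sketch of the new calling convention, using hypothetical stand-ins rather than the real znode/dbuf types:

#include <errno.h>
#include <stdbool.h>

/*
 * Stand-in for the reworked zfs_check_direct_enabled(): the old
 * ZFS_DIRECT_IO_ERR return becomes a nonzero errno, and the old
 * ENABLED/DISABLED states become the out parameter.
 */
static int
check_direct(int ioflags, bool *is_direct)
{
	*is_direct = (ioflags != 0);	/* placeholder policy */
	return (0);
}

static int
caller(int ioflags)
{
	bool is_direct;
	int error = check_direct(ioflags, &is_direct);

	if (error)
		return (error);	/* was: direct == ZFS_DIRECT_IO_ERR */
	if (is_direct)
		return (0);	/* was: ZFS_DIRECT_IO_ENABLED path */
	return (0);		/* was: ZFS_DIRECT_IO_DISABLED path */
}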
diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c
index 67150bc0b332..b4d73c3162eb 100644
--- a/module/os/freebsd/spl/spl_uio.c
+++ b/module/os/freebsd/spl/spl_uio.c
@@ -318,10 +318,7 @@ static int
 zfs_uio_get_dio_pages_impl(zfs_uio_t *uio)
 {
 	const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov;
-	size_t wanted;
-	size_t maxsize = zfs_uio_resid(uio);
-
-	wanted = maxsize;
+	size_t len = zfs_uio_resid(uio);
 
 	for (int i = 0; i < zfs_uio_iovcnt(uio); i++) {
 		struct iovec iov;
@@ -331,7 +328,7 @@ zfs_uio_get_dio_pages_impl(zfs_uio_t *uio)
 			iovp++;
 			continue;
 		}
-		iov.iov_len = MIN(maxsize, iovp->iov_len);
+		iov.iov_len = MIN(len, iovp->iov_len);
 		iov.iov_base = iovp->iov_base;
 		int error = zfs_uio_iov_step(iov, uio, &numpages);
@@ -339,12 +336,11 @@ zfs_uio_get_dio_pages_impl(zfs_uio_t *uio)
 			return (error);
 
 		uio->uio_dio.npages += numpages;
-		maxsize -= iov.iov_len;
-		wanted -= left;
+		len -= iov.iov_len;
 		iovp++;
 	}
 
-	ASSERT0(wanted);
+	ASSERT0(len);
 	return (0);
 }
diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c
index 05b8f26d0847..ef1c5c209f03 100644
--- a/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -4456,15 +4456,15 @@ zfs_freebsd_read(struct vop_read_args *ap)
 	int error = 0;
 	znode_t *zp = VTOZ(ap->a_vp);
 	int ioflag = ioflags(ap->a_ioflag);
+	boolean_t is_direct;
 
 	zfs_uio_init(&uio, ap->a_uio);
 
-	zfs_direct_enabled_t direct =
-	    zfs_check_direct_enabled(zp, ioflag, &error);
+	error = zfs_check_direct_enabled(zp, ioflag, &is_direct);
 
-	if (direct == ZFS_DIRECT_IO_ERR) {
+	if (error) {
 		return (error);
-	} else if (direct == ZFS_DIRECT_IO_ENABLED) {
+	} else if (is_direct) {
 		error = zfs_freebsd_read_direct(zp, &uio, UIO_READ, ioflag,
 		    ap->a_cred);
 
@@ -4505,9 +4505,6 @@ zfs_freebsd_read(struct vop_read_args *ap)
 
 	}
 
-	ASSERT(direct == ZFS_DIRECT_IO_DISABLED ||
-	    (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN));
-
 	error = zfs_read(zp, &uio, ioflag, ap->a_cred);
 
 	return (error);
@@ -4552,15 +4549,15 @@ zfs_freebsd_write(struct vop_write_args *ap)
 	int error = 0;
 	znode_t *zp = VTOZ(ap->a_vp);
 	int ioflag = ioflags(ap->a_ioflag);
+	boolean_t is_direct;
 
 	zfs_uio_init(&uio, ap->a_uio);
 
-	zfs_direct_enabled_t direct =
-	    zfs_check_direct_enabled(zp, ioflag, &error);
+	error = zfs_check_direct_enabled(zp, ioflag, &is_direct);
 
-	if (direct == ZFS_DIRECT_IO_ERR) {
+	if (error) {
 		return (error);
-	} else if (direct == ZFS_DIRECT_IO_ENABLED) {
+	} else if (is_direct) {
 		error = zfs_freebsd_write_direct(zp, &uio, UIO_WRITE, ioflag,
 		    ap->a_cred);
 
@@ -4576,9 +4573,6 @@ zfs_freebsd_write(struct vop_write_args *ap)
 
 	}
 
-	ASSERT(direct == ZFS_DIRECT_IO_DISABLED ||
-	    (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN));
-
 	error = zfs_write(zp, &uio, ioflag, ap->a_cred);
 
 	return (error);
diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c
index f79d2a9f8e11..d6d29bfbe06e 100644
--- a/module/os/linux/zfs/zfs_uio.c
+++ b/module/os/linux/zfs/zfs_uio.c
@@ -614,10 +614,9 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
 {
 	const struct iovec *iovp = uio->uio_iov;
 	size_t skip = uio->uio_skip;
-	size_t wanted, maxsize;
+	size_t len = uio->uio_resid - skip;
 
 	ASSERT(uio->uio_segflg != UIO_SYSSPACE);
-	wanted = maxsize = uio->uio_resid - skip;
 
 	for (int i = 0; i < uio->uio_iovcnt; i++) {
 		struct iovec iov;
@@ -628,7 +627,7 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
 			skip = 0;
 			continue;
 		}
-		iov.iov_len = MIN(maxsize, iovp->iov_len - skip);
+		iov.iov_len = MIN(len, iovp->iov_len - skip);
 		iov.iov_base = iovp->iov_base + skip;
 		int error = zfs_uio_iov_step(iov, rw, uio, &numpages);
@@ -636,13 +635,13 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
 			return (SET_ERROR(error));
 
 		uio->uio_dio.npages += numpages;
-		maxsize -= iov.iov_len;
-		wanted -= left;
+		len -= iov.iov_len;
 		skip = 0;
 		iovp++;
 	}
 
-	ASSERT0(wanted);
+	ASSERT0(len);
+
 	return (0);
 }
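Both hunks above fold the parallel wanted/maxsize counters into a single len that is decremented as iovecs are consumed, so the final ASSERT0(len) still checks that the walk covered exactly the residual byte count. A standalone model of that invariant (plain C; the pin_pages() step is a hypothetical placeholder):

#include <assert.h>
#include <stddef.h>
#include <sys/uio.h>

static size_t
min_size(size_t a, size_t b)
{
	return (a < b ? a : b);
}

/*
 * Walk iovecs that together cover exactly `resid` bytes, mirroring
 * zfs_uio_get_dio_pages_impl(): zero-length iovecs are skipped and
 * `len` must reach zero by the end of the walk.
 */
static void
walk_iovecs(const struct iovec *iovp, int iovcnt, size_t resid)
{
	size_t len = resid;

	for (int i = 0; i < iovcnt; i++) {
		if (iovp->iov_len == 0) {
			iovp++;
			continue;
		}
		size_t step = min_size(len, iovp->iov_len);
		/* pin_pages(iovp->iov_base, step) would go here */
		len -= step;
		iovp++;
	}

	assert(len == 0);	/* the ASSERT0(len) in the patch */
}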
diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c
index d8109698224e..745c5059323b 100644
--- a/module/os/linux/zfs/zfs_vnops_os.c
+++ b/module/os/linux/zfs/zfs_vnops_os.c
@@ -227,8 +227,7 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)
 
 #if defined(_KERNEL)
 
-static int zfs_fillpage(struct inode *ip, struct page *pp,
-    boolean_t rangelock_held);
+static int zfs_fillpage(struct inode *ip, struct page *pp);
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
@@ -303,7 +302,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 			 * In this case we must try and fill the page.
 			 */
 			if (unlikely(!PageUptodate(pp))) {
-				error = zfs_fillpage(ip, pp, B_TRUE);
+				error = zfs_fillpage(ip, pp);
 				if (error) {
 					unlock_page(pp);
 					put_page(pp);
@@ -3996,66 +3995,19 @@ zfs_inactive(struct inode *ip)
  * Fill pages with data from the disk.
  */
 static int
-zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held)
+zfs_fillpage(struct inode *ip, struct page *pp)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	loff_t i_size = i_size_read(ip);
 	u_offset_t io_off = page_offset(pp);
 	size_t io_len = PAGE_SIZE;
-	zfs_locked_range_t *lr = NULL;
 
 	ASSERT3U(io_off, <, i_size);
 
 	if (io_off + io_len > i_size)
 		io_len = i_size - io_off;
 
-	/*
-	 * It is important to hold the rangelock here because it is possible
-	 * a Direct I/O write might be taking place at the same time that a
-	 * page is being faulted in through filemap_fault(). With a Direct I/O
-	 * write, db->db_data will be set to NULL either in:
-	 * 1. dmu_write_direct() -> dmu_buf_will_not_fill() ->
-	 *    dmu_buf_will_fill() -> dbuf_noread() -> dbuf_clear_data()
-	 * 2. dmu_write_direct_done()
-	 * If the rangelock is not held, then there is a race between faulting
-	 * in a page and writing out a Direct I/O write. Without the rangelock
-	 * a NULL pointer dereference can occur in dmu_read_impl() for
-	 * db->db_data during the mempcy operation.
-	 *
-	 * Another important note here is we have to check to make sure the
-	 * rangelock is not already held from mappedread() -> zfs_fillpage().
-	 * filemap_fault() will first add the page to the inode address_space
-	 * mapping and then will drop the page lock. This leaves open a window
-	 * for mappedread() to begin. In this case he page lock and rangelock,
-	 * are both held and it might have to call here if the page is not
-	 * up to date. In this case the rangelock can not be held twice or a
-	 * deadlock can happen. So the rangelock only needs to be aquired if
-	 * zfs_fillpage() is being called by zfs_getpage().
-	 *
-	 * Finally it is also important to drop the page lock before grabbing
-	 * the rangelock to avoid another deadlock between here and
-	 * zfs_write() -> update_pages(). update_pages() holds both the
-	 * rangelock and the page lock.
-	 */
-	if (rangelock_held == B_FALSE) {
-		/*
-		 * First try grabbing the rangelock. If that can not be done
-		 * the page lock must be dropped before grabbing the rangelock
-		 * to avoid a deadlock with update_pages(). See comment above.
-		 */
-		lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len,
-		    RL_READER);
-		if (lr == NULL) {
-			get_page(pp);
-			unlock_page(pp);
-			lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
-			    io_len, RL_READER);
-			lock_page(pp);
-			put_page(pp);
-		}
-	}
-
 	void *va = kmap(pp);
 	int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
 	    io_len, va, DMU_READ_PREFETCH);
@@ -4075,10 +4027,6 @@ zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held)
 		SetPageUptodate(pp);
 	}
 
-	if (rangelock_held == B_FALSE)
-		zfs_rangelock_exit(lr);
-
 	return (error);
 }
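The rangelock_held parameter disappears because the lock is not reentrant: mappedread() already holds the range lock when it calls zfs_fillpage(), so re-acquiring it in the callee would self-deadlock. The locking is instead hoisted into zfs_getpage() (next hunk), the one caller that arrives without it. A toy model of that hoisting, with a plain mutex standing in for the range lock:

#include <pthread.h>

static pthread_mutex_t range_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Stand-in for zfs_fillpage(): does no locking of its own, so it is
 * safe to call with range_lock already held.
 */
static void
fillpage(void)
{
}

/* mappedread() path: the lock is already held around the call. */
static void
mappedread_path(void)
{
	pthread_mutex_lock(&range_lock);
	fillpage();
	pthread_mutex_unlock(&range_lock);
}

/* zfs_getpage() path: now the only caller that must take the lock. */
static void
getpage_path(void)
{
	pthread_mutex_lock(&range_lock);
	fillpage();
	pthread_mutex_unlock(&range_lock);
}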
@@ -4099,11 +4047,49 @@ zfs_getpage(struct inode *ip, struct page *pp)
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t *zp = ITOZ(ip);
 	int error;
+	loff_t i_size = i_size_read(ip);
+	u_offset_t io_off = page_offset(pp);
+	size_t io_len = PAGE_SIZE;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
-	error = zfs_fillpage(ip, pp, B_FALSE);
+	ASSERT3U(io_off, <, i_size);
+
+	if (io_off + io_len > i_size)
+		io_len = i_size - io_off;
+
+	/*
+	 * It is important to hold the rangelock here because it is possible
+	 * a Direct I/O write or block clone might be taking place at the same
+	 * time that a page is being faulted in through filemap_fault(). With
+	 * Direct I/O writes and block cloning db->db_data will be set to NULL
+	 * with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
+	 * rangelock is not held, then there is a race between faulting in a
+	 * page and writing out a Direct I/O write or block clone. Without
+	 * the rangelock a NULL pointer dereference can occur in
+	 * dmu_read_impl() for db->db_data during the memcpy operation when
+	 * zfs_fillpage() calls dmu_read().
+	 */
+	zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
+	    io_off, io_len, RL_READER);
+	if (lr == NULL) {
+		/*
+		 * It is important to drop the page lock before grabbing the
+		 * rangelock to avoid another deadlock between here and
+		 * zfs_write() -> update_pages(). update_pages() holds both the
+		 * rangelock and the page lock.
+		 */
+		get_page(pp);
+		unlock_page(pp);
+		lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
+		    io_len, RL_READER);
+		lock_page(pp);
+		put_page(pp);
+	}
+
+	error = zfs_fillpage(ip, pp);
+	zfs_rangelock_exit(lr);
 
 	if (error == 0)
 		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);
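The tryenter/drop/retake dance above exists because update_pages() takes the range lock and then the page lock, so blocking on the range lock while a page lock is held could deadlock. A condensed model of the ordering rule, with pthread mutexes standing in for the two locks (the real code also takes a page reference via get_page() so the page cannot vanish while unlocked):

#include <pthread.h>

/* Lock order elsewhere (update_pages()): range, then page. */
static void
fault_path(pthread_mutex_t *range, pthread_mutex_t *page)
{
	/* Entered with *page held, as in filemap_fault(). */
	if (pthread_mutex_trylock(range) != 0) {
		/* Contended: drop the page lock before blocking ... */
		pthread_mutex_unlock(page);
		pthread_mutex_lock(range);
		/* ... then retake it, now in the safe order. */
		pthread_mutex_lock(page);
	}

	/* Both locks held: fill the page, then release the range lock. */
	pthread_mutex_unlock(range);
}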
diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c
index 21bff54e9a94..8781e7184751 100644
--- a/module/os/linux/zfs/zpl_file.c
+++ b/module/os/linux/zfs/zpl_file.c
@@ -387,14 +387,13 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
 	struct inode *ip = kiocb->ki_filp->f_mapping->host;
 	struct file *filp = kiocb->ki_filp;
 	int flags = filp->f_flags | zfs_io_flags(kiocb);
-	int error = 0;
+	boolean_t is_direct;
 
-	zfs_direct_enabled_t direct =
-	    zfs_check_direct_enabled(ITOZ(ip), flags, &error);
+	int error = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct);
 
-	if (direct == ZFS_DIRECT_IO_ERR) {
+	if (error) {
 		return (-error);
-	} else if (direct == ZFS_DIRECT_IO_ENABLED) {
+	} else if (is_direct) {
 		ssize_t read = zpl_iter_read_direct(kiocb, to);
 
 		if (read >= 0 || read != -EAGAIN)
@@ -510,7 +509,7 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
 	struct file *filp = kiocb->ki_filp;
 	int flags = filp->f_flags | zfs_io_flags(kiocb);
 	size_t count = 0;
-	int error = 0;
+	boolean_t is_direct;
 
 	ssize_t ret = zpl_generic_write_checks(kiocb, from, &count);
 	if (ret)
@@ -518,12 +517,11 @@ zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
 
 	loff_t offset = kiocb->ki_pos;
 
-	zfs_direct_enabled_t direct =
-	    zfs_check_direct_enabled(ITOZ(ip), flags, &error);
+	ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct);
 
-	if (direct == ZFS_DIRECT_IO_ERR) {
-		return (-error);
-	} else if (direct == ZFS_DIRECT_IO_ENABLED) {
+	if (ret) {
+		return (-ret);
+	} else if (is_direct) {
 		ssize_t wrote = zpl_iter_write_direct(kiocb, from);
 
 		if (wrote >= 0 || wrote != -EAGAIN) {
@@ -638,18 +636,17 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
 	int flags = filp->f_flags | zfs_io_flags(kiocb);
 	size_t count;
 	ssize_t ret;
-	int error = 0;
+	boolean_t is_direct;
 
 	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (ret)
 		return (ret);
 
-	zfs_direct_enabled_t direct =
-	    zfs_check_direct_enabled(ITOZ(ip), flags, &error);
+	ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct);
 
-	if (direct == ZFS_DIRECT_IO_ERR) {
-		return (-error);
-	} else if (direct == ZFS_DIRECT_IO_ENABLED) {
+	if (ret) {
+		return (-ret);
+	} else if (is_direct) {
 		ssize_t read = zpl_aio_read_direct(kiocb, iov, nr_segs, pos);
 
 		if (read >= 0 || read != -EAGAIN)
@@ -754,7 +751,7 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
 	size_t ocount;
 	size_t count;
 	ssize_t ret;
-	int error = 0;
+	boolean_t is_direct;
 
 	ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
 	if (ret)
@@ -768,12 +765,11 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
 
 	kiocb->ki_pos = pos;
 
-	zfs_direct_enabled_t direct =
-	    zfs_check_direct_enabled(ITOZ(ip), flags, &error);
+	ret = zfs_check_direct_enabled(ITOZ(ip), flags, &is_direct);
 
-	if (direct == ZFS_DIRECT_IO_ERR) {
-		return (-error);
-	} else if (direct == ZFS_DIRECT_IO_ENABLED) {
+	if (ret) {
+		return (-ret);
+	} else if (is_direct) {
 		ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos);
 
 		if (wrote >= 0 || wrote != -EAGAIN) {
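All four zpl_* entry points above now share one dispatch shape: propagate a setup error as a negated errno, try the direct path when enabled, and fall back to buffered I/O only on -EAGAIN. A condensed model with stub stand-ins for the three steps:

#include <errno.h>
#include <sys/types.h>

static int
check_direct(int flags, int *is_direct)
{
	*is_direct = (flags != 0);	/* placeholder policy */
	return (0);			/* or an errno on failure */
}

static ssize_t
direct_io(void)
{
	return (-EAGAIN);	/* placeholder: punt to buffered I/O */
}

static ssize_t
buffered_io(void)
{
	return (0);
}

static ssize_t
dispatch(int flags)
{
	int is_direct;
	int error = check_direct(flags, &is_direct);

	if (error)
		return (-error);	/* Linux convention: negated errno */
	if (is_direct) {
		ssize_t done = direct_io();
		/*
		 * Anything but -EAGAIN is final; the patch's
		 * `done >= 0 || done != -EAGAIN` test is equivalent.
		 */
		if (done != -EAGAIN)
			return (done);
	}
	return (buffered_io());
}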
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 45fefed6fc60..6e0d15d62bba 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -1253,17 +1253,16 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(buf != NULL);
-	dbuf_dirty_record_t *dr_dio = NULL;
 
 	db->db_buf = buf;
-	dr_dio = dbuf_get_dirty_direct(db);
 
 	/*
 	 * If there is a Direct I/O, set its data too. Then its state
 	 * will be the same as if we did a ZIL dmu_sync().
 	 */
-	if (dbuf_dirty_is_direct_write(db, dr_dio)) {
-		dr_dio->dt.dl.dr_data = db->db_buf;
+	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+	if (dbuf_dirty_is_direct_write(db, dr)) {
+		dr->dt.dl.dr_data = db->db_buf;
 	}
 
 	ASSERT(buf->b_data != NULL);
@@ -1594,7 +1593,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_FLAG_NOWAIT;
 	int err, zio_flags;
-	blkptr_t *bpp = bp;
 
 	ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1608,37 +1606,18 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 		goto early_unlock;
 	}
 
-	/*
-	 * If we have a pending block clone, we don't want to read the
-	 * underlying block, but the content of the block being cloned,
-	 * pointed by the dirty record, so we have the most recent data.
-	 * If there is no dirty record, then we hit a race in a sync
-	 * process when the dirty record is already removed, while the
-	 * dbuf is not yet destroyed. Such case is equivalent to uncached.
-	 */
-	if (db->db_state == DB_NOFILL) {
-		dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
-		if (dr != NULL) {
-			if (!dr->dt.dl.dr_brtwrite) {
-				err = EIO;
-				goto early_unlock;
-			}
-			bpp = &dr->dt.dl.dr_overridden_by;
-		}
-	}
-
-	err = dbuf_read_hole(db, dn, bpp);
+	err = dbuf_read_hole(db, dn, bp);
 	if (err == 0)
 		goto early_unlock;
 
-	ASSERT(bpp != NULL);
+	ASSERT(bp != NULL);
 
 	/*
 	 * Any attempt to read a redacted block should result in an error. This
 	 * will never happen under normal conditions, but can be useful for
 	 * debugging purposes.
 	 */
-	if (BP_IS_REDACTED(bpp)) {
+	if (BP_IS_REDACTED(bp)) {
 		ASSERT(dsl_dataset_feature_is_active(
 		    db->db_objset->os_dsl_dataset,
 		    SPA_FEATURE_REDACTED_DATASETS));
@@ -1653,9 +1632,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	 * All bps of an encrypted os should have the encryption bit set.
 	 * If this is not true it indicates tampering and we report an error.
 	 */
-	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+	if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bp)) {
 		spa_log_error(db->db_objset->os_spa, &zb,
-		    BP_GET_LOGICAL_BIRTH(bpp));
+		    BP_GET_LOGICAL_BIRTH(bp));
 		err = SET_ERROR(EIO);
 		goto early_unlock;
 	}
@@ -1666,7 +1645,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 
 	if (!DBUF_IS_CACHEABLE(db))
 		aflags |= ARC_FLAG_UNCACHED;
-	else if (dbuf_is_l2cacheable(db, bpp))
+	else if (dbuf_is_l2cacheable(db, bp))
 		aflags |= ARC_FLAG_L2CACHE;
 
 	dbuf_add_ref(db, NULL);
@@ -1674,7 +1653,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	zio_flags = (flags & DB_RF_CANFAIL) ?
 	    ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
 
-	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bpp))
+	if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bp))
 		zio_flags |= ZIO_FLAG_RAW;
 
 	/*
@@ -1684,7 +1663,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
 	 * an l1 cache hit) we don't acquire the db_mtx while holding the
 	 * parent's rwlock, which would be a lock ordering violation.
 	 */
-	blkptr_t copy = *bpp;
+	blkptr_t copy = *bp;
 	dmu_buf_unlock_parent(db, dblt, tag);
 	return (arc_read(zio, db->db_objset->os_spa, &copy, dbuf_read_done, db,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb));
@@ -1856,24 +1835,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
-		blkptr_t *bp = NULL;
 		ASSERT(db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL);
 		db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+		blkptr_t *bp;
 
 		/*
-		 * If a Direct I/O write has occurred we will use the updated
-		 * block pointer.
+		 * If a block clone or Direct I/O write has occurred we will
+		 * get the dirty record's overridden BP so we get the most
+		 * recent data.
 		 */
-		bp = dmu_buf_get_bp_from_dbuf(db);
+		err = dmu_buf_get_bp_from_dbuf(db, &bp);
+
+		if (!err) {
+			if (pio == NULL && (db->db_state == DB_NOFILL ||
+			    (bp != NULL && !BP_IS_HOLE(bp)))) {
+				spa_t *spa = dn->dn_objset->os_spa;
+				pio =
+				    zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+				need_wait = B_TRUE;
+			}
 
-		if (pio == NULL && (db->db_state == DB_NOFILL ||
-		    (bp != NULL && !BP_IS_HOLE(bp)))) {
-			spa_t *spa = dn->dn_objset->os_spa;
-			pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-			need_wait = B_TRUE;
+			err =
+			    dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
+		} else {
+			mutex_exit(&db->db_mtx);
+			dmu_buf_unlock_parent(db, dblt, FTAG);
 		}
-		err = dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG);
+
 		/* dbuf_read_impl drops db_mtx and parent's rwlock. */
 		miss = (db->db_state != DB_CACHED);
 	}
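Both dbuf_read_impl() above (blkptr_t copy = *bp before dmu_buf_unlock_parent()) and the dmu_read_abd() hunk further down depend on the same rule: the BP may live in a dirty record that another thread can free once the lock drops, so it must be copied while the lock is still held. A minimal model of that snapshot-under-lock pattern:

#include <pthread.h>
#include <string.h>

typedef struct blk {
	char payload[16];
} blk_t;

/*
 * `src` may be freed by another thread as soon as `mtx` is released
 * (as a dirty record can be in dbuf_write_done()), so snapshot it
 * while the lock is held; only the copy is used afterwards.
 */
static void
snapshot_bp(pthread_mutex_t *mtx, const blk_t *src, blk_t *out)
{
	pthread_mutex_lock(mtx);
	memcpy(out, src, sizeof (*out));
	pthread_mutex_unlock(mtx);
}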
@@ -2756,31 +2744,39 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 
 /*
  * Normally the db_blkptr points to the most recent on-disk content for the
- * dbuf (and anything newer will be cached in the dbuf). However, a recent
- * Direct I/O write could leave newer content on disk and the dbuf uncached.
- * In this case we must return the (as yet unsynced) pointer to the lastest
- * on-disk content.
+ * dbuf (and anything newer will be cached in the dbuf). However, a pending
+ * block clone or not yet synced Direct I/O write will have a dirty record BP
+ * pointing to the most recent data.
  */
-blkptr_t *
-dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db)
+int
+dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
+	int error = 0;
 
-	if (db->db_level != 0)
-		return (db->db_blkptr);
-
-	blkptr_t *bp = db->db_blkptr;
+	if (db->db_level != 0) {
+		*bp = db->db_blkptr;
+		return (0);
+	}
 
-	dbuf_dirty_record_t *dr_dio = dbuf_get_dirty_direct(db);
-	if (dr_dio && dr_dio->dt.dl.dr_override_state == DR_OVERRIDDEN &&
-	    dr_dio->dt.dl.dr_data == NULL) {
-		ASSERT(db->db_state == DB_UNCACHED ||
-		    db->db_state == DB_NOFILL);
-		/* We have a Direct I/O write or cloned block, use it's BP */
-		bp = &dr_dio->dt.dl.dr_overridden_by;
+	*bp = db->db_blkptr;
+	dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+	if (dr) {
+		if (db->db_state == DB_NOFILL) {
+			/* Block clone */
+			if (!dr->dt.dl.dr_brtwrite)
+				error = EIO;
+			else
+				*bp = &dr->dt.dl.dr_overridden_by;
+		} else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN &&
+		    dr->dt.dl.dr_data == NULL) {
+			ASSERT(db->db_state == DB_UNCACHED);
+			/* Direct I/O write */
+			*bp = &dr->dt.dl.dr_overridden_by;
+		}
 	}
 
-	return (bp);
+	return (error);
 }
 
 /*
diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c
index 2c80cb24a927..3737b6a6c112 100644
--- a/module/zfs/dmu_direct.c
+++ b/module/zfs/dmu_direct.c
@@ -153,7 +153,7 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx)
 	ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa));
 	ASSERT3U(txg, >, spa_syncing_txg(os->os_spa));
 
-	dr_head = dbuf_get_dirty_direct(db);
+	dr_head = list_head(&db->db_dirty_records);
 	ASSERT3U(dr_head->dr_txg, ==, txg);
 	dr_head->dr_accounted = db->db.db_size;
 
@@ -261,6 +261,7 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 		abd_t *mbuf;
 		zbookmark_phys_t zb;
+		blkptr_t *bp;
 
 		mutex_enter(&db->db_mtx);
 
@@ -274,7 +275,11 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 		while (db->db_state == DB_READ)
 			cv_wait(&db->db_changed, &db->db_mtx);
 
-		blkptr_t *bp = dmu_buf_get_bp_from_dbuf(db);
+		err = dmu_buf_get_bp_from_dbuf(db, &bp);
+		if (err) {
+			mutex_exit(&db->db_mtx);
+			goto error;
+		}
 
 		/*
 		 * There is no need to read if this is a hole or the data is
@@ -311,13 +316,13 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 		/*
 		 * The dbuf mutex (db_mtx) must be held when creating the ZIO
 		 * for the read. The BP returned from
-		 * dmu_buf_get_bp_from_dbuf() could be from a previous Direct
-		 * I/O write that is in the dbuf's dirty record. When
-		 * zio_read() is called, zio_create() will make a copy of the
-		 * BP. However, if zio_read() is called without the mutex
-		 * being held then the dirty record from the dbuf could be
-		 * freed in dbuf_write_done() resulting in garbage being set
-		 * for the zio BP.
+		 * dmu_buf_get_bp_from_dbuf() could be from a pending block
+		 * clone or a yet to be synced Direct I/O write that is in the
+		 * dbuf's dirty record. When zio_read() is called, zio_create()
+		 * will make a copy of the BP. However, if zio_read() is called
+		 * without the mutex being held then the dirty record from the
+		 * dbuf could be freed in dbuf_write_done() resulting in
+		 * garbage being set for the zio BP.
		 */
 		zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size,
 		    dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ,
@@ -331,6 +336,11 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size,
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (zio_wait(rio));
+
+error:
+	dmu_buf_rele_array(dbp, numbufs, FTAG);
+	(void) zio_wait(rio);
+	return (err);
 }
 
 #ifdef _KERNEL
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 2460805582f2..5fd6996215ba 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -202,25 +202,26 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
 	return (error);
 }
 
-zfs_direct_enabled_t
-zfs_check_direct_enabled(znode_t *zp, int ioflags, int *error)
-{
-	zfs_direct_enabled_t is_direct = ZFS_DIRECT_IO_DISABLED;
+int
+zfs_check_direct_enabled(znode_t *zp, int ioflags, boolean_t *is_direct)
+{
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	*is_direct = B_FALSE;
+	int error;
 
-	if ((*error = zfs_enter(zfsvfs, FTAG)) != 0)
-		return (ZFS_DIRECT_IO_ERR);
+	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
+		return (error);
 
 	if (ioflags & O_DIRECT &&
 	    zfsvfs->z_os->os_direct != ZFS_DIRECT_DISABLED) {
-		is_direct = ZFS_DIRECT_IO_ENABLED;
+		*is_direct = B_TRUE;
 	} else if (zfsvfs->z_os->os_direct == ZFS_DIRECT_ALWAYS) {
-		is_direct = ZFS_DIRECT_IO_ENABLED;
+		*is_direct = B_TRUE;
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 
-	return (is_direct);
+	return (0);
 }
 
 /*
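The rewritten helper's two branches reduce to a single boolean expression over the open flags and the dataset's direct= property; a userspace restatement with stand-in constants (locking and the real property plumbing elided):

#include <fcntl.h>
#include <stdbool.h>

enum os_direct { DIRECT_DISABLED, DIRECT_STANDARD, DIRECT_ALWAYS };

/*
 * Direct I/O is used when the property is "always", or when the
 * caller passed O_DIRECT and the property is not "disabled".
 */
static bool
direct_io_used(int ioflags, enum os_direct prop)
{
	return (prop == DIRECT_ALWAYS ||
	    ((ioflags & O_DIRECT) && prop != DIRECT_DISABLED));
}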