
Updating based on PR Feedback (2)
Updating the code based on PR code comments. I adjusted the following
parts of the code based on those comments:
1. Updated zfs_check_direct_enabled() so it now just returns an error.
   This removed the need for the added enum and cleaned up the code
   (see the dispatch sketch after the changed-files summary below).
2. Moved acquiring the rangelock from zfs_fillpage() out to
   zfs_getpage(). This cleans up the code and gets rid of the need to
   pass a boolean into zfs_fillpage() to conditionally grab the
   rangelock.
3. Cleaned up the code in both zfs_uio_get_dio_pages() and
   zfs_uio_get_dio_pages_iov(). There was no need to have wanted and
   maxsize, as they were the same thing. Also, since the previous
   commit cleaned up the call to zfs_uio_iov_step(), the code is much
   cleaner overall.
4. Removed dbuf_get_dirty_direct() function.
5. Unified dbuf_read() to account for both block clones and Direct I/O
   writes. This removes redundant code from dbuf_read_impl() for
   grabbing the BP.
6. Removed zfs_map_page() and zfs_unmap_page() declarations from Linux
   headers as those were never called.

Signed-off-by: Brian Atkinson <[email protected]>
bwatkinson committed Jul 27, 2024
1 parent 6835421 commit 615e4c2
Showing 11 changed files with 165 additions and 211 deletions.
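For orientation, here is a hedged sketch of the calling convention this change adopts. read_dispatch_sketch() is a hypothetical wrapper invented for illustration; the callee names, types, and the EAGAIN fallback are taken from the FreeBSD diffs below, with the intervening mmap/page-sync logic elided:

/*
 * Sketch only, not verbatim ZFS code: errors now travel through the
 * int return value of zfs_check_direct_enabled(), and the Direct I/O
 * decision through a boolean_t out-parameter.
 */
static int
read_dispatch_sketch(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
{
	boolean_t is_direct;
	int error = zfs_check_direct_enabled(zp, ioflag, &is_direct);

	if (error)
		return (error);		/* hard failure, no fallback */
	if (is_direct) {
		error = zfs_freebsd_read_direct(zp, uio, UIO_READ,
		    ioflag, cr);
		if (error != EAGAIN)
			return (error);
		/* EAGAIN: retry through the buffered path below. */
	}
	return (zfs_read(zp, uio, ioflag, cr));
}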
6 changes: 0 additions & 6 deletions include/os/linux/zfs/sys/zfs_znode_impl.h
@@ -184,12 +184,6 @@ extern int zfs_inode_alloc(struct super_block *, struct inode **ip);
extern void zfs_inode_destroy(struct inode *);
extern void zfs_mark_inode_dirty(struct inode *);
extern boolean_t zfs_relatime_need_update(const struct inode *);

#if defined(HAVE_UIO_RW)
extern caddr_t zfs_map_page(page_t *, enum seg_rw);
extern void zfs_unmap_page(page_t *, caddr_t);
#endif /* HAVE_UIO_RW */

extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];

#ifdef __cplusplus
14 changes: 1 addition & 13 deletions include/sys/dbuf.h
@@ -387,7 +387,7 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
dmu_tx_t *tx);
boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
blkptr_t *dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db);
int dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db, blkptr_t **bp);
int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
@@ -453,18 +453,6 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg)
return (NULL);
}

/*
* All Direct I/O writes happen in open context so the first dirty record will
* always be associated with the write. After a Direct I/O write completes the
* dirty record's dr_override_state will be DR_OVERRIDDEN and the dr_data will
* get set to NULL.
*/
static inline dbuf_dirty_record_t *
dbuf_get_dirty_direct(dmu_buf_impl_t *db)
{
return (list_head(&db->db_dirty_records));
}

static inline boolean_t
dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr)
{
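The dmu_buf_get_bp_from_dbuf() prototype above moves to the same error-returning convention. A hedged caller-side sketch, assuming only the new prototype (the surrounding error handling is illustrative, not copied from the tree):

	blkptr_t *bp = NULL;
	int error = dmu_buf_get_bp_from_dbuf(db, &bp);

	if (error != 0)
		return (error);	/* resolving the BP can now fail cleanly */
	/* use bp; the old blkptr_t * return had no way to report errors */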
8 changes: 1 addition & 7 deletions include/sys/zfs_vnops.h
@@ -29,12 +29,6 @@

extern int zfs_bclone_enabled;

typedef enum zfs_direct_enabled {
ZFS_DIRECT_IO_ERR,
ZFS_DIRECT_IO_DISABLED,
ZFS_DIRECT_IO_ENABLED
} zfs_direct_enabled_t;

extern int zfs_fsync(znode_t *, int, cred_t *);
extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
@@ -52,7 +46,7 @@ extern int mappedread(znode_t *, int, zfs_uio_t *);
extern int mappedread_sf(znode_t *, int, zfs_uio_t *);
extern void update_pages(znode_t *, int64_t, int, objset_t *);

extern zfs_direct_enabled_t zfs_check_direct_enabled(znode_t *, int, int *);
extern int zfs_check_direct_enabled(znode_t *, int, boolean_t *);
extern int zfs_setup_direct(znode_t *, zfs_uio_t *, zfs_uio_rw_t, int *);

/*
12 changes: 4 additions & 8 deletions module/os/freebsd/spl/spl_uio.c
@@ -318,10 +318,7 @@ static int
zfs_uio_get_dio_pages_impl(zfs_uio_t *uio)
{
const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov;
size_t wanted;
size_t maxsize = zfs_uio_resid(uio);

wanted = maxsize;
size_t len = zfs_uio_resid(uio);

for (int i = 0; i < zfs_uio_iovcnt(uio); i++) {
struct iovec iov;
@@ -331,20 +328,19 @@ zfs_uio_get_dio_pages_impl(zfs_uio_t *uio)
iovp++;
continue;
}
iov.iov_len = MIN(maxsize, iovp->iov_len);
iov.iov_len = MIN(len, iovp->iov_len);
iov.iov_base = iovp->iov_base;
int error = zfs_uio_iov_step(iov, uio, &numpages);

if (error)
return (error);

uio->uio_dio.npages += numpages;
maxsize -= iov.iov_len;
wanted -= left;
len -= iov.iov_len;
iovp++;
}

ASSERT0(wanted);
ASSERT0(len);

return (0);
}
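The wanted/maxsize collapse in this hunk (and in the matching Linux hunk in module/os/linux/zfs/zfs_uio.c below) is easiest to see in isolation. Here is a standalone userspace sketch of the single-counter invariant, with walk_iovecs() a hypothetical stand-in for the ZFS loop: one len starts at the residual, each segment is clamped to it, and it must land on exactly zero, which is what ASSERT0(len) verifies:

#include <assert.h>
#include <stdio.h>
#include <sys/uio.h>

static void
walk_iovecs(const struct iovec *iovp, int iovcnt, size_t resid)
{
	size_t len = resid;	/* replaces the old wanted/maxsize pair */

	for (int i = 0; i < iovcnt; i++, iovp++) {
		if (iovp->iov_len == 0)
			continue;	/* skip empty segments */
		size_t seg = len < iovp->iov_len ? len : iovp->iov_len;
		printf("segment %d: %zu bytes\n", i, seg);
		len -= seg;
	}
	assert(len == 0);	/* mirrors ASSERT0(len) in the diff */
}

int
main(void)
{
	char a[16], b[8];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof (a) },
		{ .iov_base = b, .iov_len = sizeof (b) },
	};

	walk_iovecs(iov, 2, sizeof (a) + sizeof (b));
	return (0);
}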
22 changes: 8 additions & 14 deletions module/os/freebsd/zfs/zfs_vnops_os.c
@@ -4456,15 +4456,15 @@ zfs_freebsd_read(struct vop_read_args *ap)
int error = 0;
znode_t *zp = VTOZ(ap->a_vp);
int ioflag = ioflags(ap->a_ioflag);
boolean_t is_direct;

zfs_uio_init(&uio, ap->a_uio);

zfs_direct_enabled_t direct =
zfs_check_direct_enabled(zp, ioflag, &error);
error = zfs_check_direct_enabled(zp, ioflag, &is_direct);

if (direct == ZFS_DIRECT_IO_ERR) {
if (error) {
return (error);
} else if (direct == ZFS_DIRECT_IO_ENABLED) {
} else if (is_direct) {
error =
zfs_freebsd_read_direct(zp, &uio, UIO_READ, ioflag,
ap->a_cred);
@@ -4505,9 +4505,6 @@ }
}


ASSERT(direct == ZFS_DIRECT_IO_DISABLED ||
(direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN));

error = zfs_read(zp, &uio, ioflag, ap->a_cred);

return (error);
@@ -4552,15 +4549,15 @@ zfs_freebsd_write(struct vop_write_args *ap)
int error = 0;
znode_t *zp = VTOZ(ap->a_vp);
int ioflag = ioflags(ap->a_ioflag);
boolean_t is_direct;

zfs_uio_init(&uio, ap->a_uio);

zfs_direct_enabled_t direct =
zfs_check_direct_enabled(zp, ioflag, &error);
error = zfs_check_direct_enabled(zp, ioflag, &is_direct);

if (direct == ZFS_DIRECT_IO_ERR) {
if (error) {
return (error);
} else if (direct == ZFS_DIRECT_IO_ENABLED) {
} else if (is_direct) {
error =
zfs_freebsd_write_direct(zp, &uio, UIO_WRITE, ioflag,
ap->a_cred);
@@ -4576,9 +4573,6 @@

}

ASSERT(direct == ZFS_DIRECT_IO_DISABLED ||
(direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN));

error = zfs_write(zp, &uio, ioflag, ap->a_cred);

return (error);
11 changes: 5 additions & 6 deletions module/os/linux/zfs/zfs_uio.c
@@ -614,10 +614,9 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
{
const struct iovec *iovp = uio->uio_iov;
size_t skip = uio->uio_skip;
size_t wanted, maxsize;
size_t len = uio->uio_resid - skip;

ASSERT(uio->uio_segflg != UIO_SYSSPACE);
wanted = maxsize = uio->uio_resid - skip;

for (int i = 0; i < uio->uio_iovcnt; i++) {
struct iovec iov;
@@ -628,21 +627,21 @@ zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw)
skip = 0;
continue;
}
iov.iov_len = MIN(maxsize, iovp->iov_len - skip);
iov.iov_len = MIN(len, iovp->iov_len - skip);
iov.iov_base = iovp->iov_base + skip;
int error = zfs_uio_iov_step(iov, rw, uio, &numpages);

if (error)
return (SET_ERROR(error));

uio->uio_dio.npages += numpages;
maxsize -= iov.iov_len;
wanted -= left;
len -= iov.iov_len;
skip = 0;
iovp++;
}

ASSERT0(wanted);
ASSERT0(len);

return (0);
}

98 changes: 42 additions & 56 deletions module/os/linux/zfs/zfs_vnops_os.c
@@ -227,8 +227,7 @@ zfs_close(struct inode *ip, int flag, cred_t *cr)

#if defined(_KERNEL)

static int zfs_fillpage(struct inode *ip, struct page *pp,
boolean_t rangelock_held);
static int zfs_fillpage(struct inode *ip, struct page *pp);

/*
* When a file is memory mapped, we must keep the IO data synchronized
@@ -303,7 +302,7 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
* In this case we must try and fill the page.
*/
if (unlikely(!PageUptodate(pp))) {
error = zfs_fillpage(ip, pp, B_TRUE);
error = zfs_fillpage(ip, pp);
if (error) {
unlock_page(pp);
put_page(pp);
@@ -3996,66 +3995,19 @@ zfs_inactive(struct inode *ip)
* Fill pages with data from the disk.
*/
static int
zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held)
zfs_fillpage(struct inode *ip, struct page *pp)
{
znode_t *zp = ITOZ(ip);
zfsvfs_t *zfsvfs = ITOZSB(ip);
loff_t i_size = i_size_read(ip);
u_offset_t io_off = page_offset(pp);
size_t io_len = PAGE_SIZE;
zfs_locked_range_t *lr = NULL;

ASSERT3U(io_off, <, i_size);

if (io_off + io_len > i_size)
io_len = i_size - io_off;

/*
* It is important to hold the rangelock here because it is possible
* a Direct I/O write might be taking place at the same time that a
* page is being faulted in through filemap_fault(). With a Direct I/O
* write, db->db_data will be set to NULL either in:
* 1. dmu_write_direct() -> dmu_buf_will_not_fill() ->
* dmu_buf_will_fill() -> dbuf_noread() -> dbuf_clear_data()
* 2. dmu_write_direct_done()
* If the rangelock is not held, then there is a race between faulting
* in a page and writing out a Direct I/O write. Without the rangelock
* a NULL pointer dereference can occur in dmu_read_impl() for
* db->db_data during the memcpy operation.
*
* Another important note here is we have to check to make sure the
* rangelock is not already held from mappedread() -> zfs_fillpage().
* filemap_fault() will first add the page to the inode address_space
* mapping and then will drop the page lock. This leaves open a window
* for mappedread() to begin. In this case the page lock and rangelock
* are both held, and it might have to call here if the page is not
* up to date. In this case the rangelock cannot be held twice or a
* deadlock can happen. So the rangelock only needs to be acquired if
* zfs_fillpage() is being called by zfs_getpage().
*
* Finally it is also important to drop the page lock before grabbing
* the rangelock to avoid another deadlock between here and
* zfs_write() -> update_pages(). update_pages() holds both the
* rangelock and the page lock.
*/
if (rangelock_held == B_FALSE) {
/*
* First try grabbing the rangelock. If that can not be done
* the page lock must be dropped before grabbing the rangelock
* to avoid a deadlock with update_pages(). See comment above.
*/
lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len,
RL_READER);
if (lr == NULL) {
get_page(pp);
unlock_page(pp);
lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
io_len, RL_READER);
lock_page(pp);
put_page(pp);
}
}

void *va = kmap(pp);
int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off,
io_len, va, DMU_READ_PREFETCH);
@@ -4075,10 +4027,6 @@ zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held)
SetPageUptodate(pp);
}


if (rangelock_held == B_FALSE)
zfs_rangelock_exit(lr);

return (error);
}

@@ -4099,11 +4047,49 @@ zfs_getpage(struct inode *ip, struct page *pp)
zfsvfs_t *zfsvfs = ITOZSB(ip);
znode_t *zp = ITOZ(ip);
int error;
loff_t i_size = i_size_read(ip);
u_offset_t io_off = page_offset(pp);
size_t io_len = PAGE_SIZE;

if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
return (error);

error = zfs_fillpage(ip, pp, B_FALSE);
ASSERT3U(io_off, <, i_size);

if (io_off + io_len > i_size)
io_len = i_size - io_off;

/*
* It is important to hold the rangelock here because it is possible
* a Direct I/O write or block clone might be taking place at the same
* time that a page is being faulted in through filemap_fault(). With
* Direct I/O writes and block cloning db->db_data will be set to NULL
* with dbuf_clear_data() in dmu_buf_will_clone_or_dio(). If the
* rangelock is not held, then there is a race between faulting in a
* page and writing out a Direct I/O write or block clone. Without
* the rangelock a NULL pointer dereference can occur in
* dmu_read_impl() for db->db_data during the memcpy operation when
* zfs_fillpage() calls dmu_read().
*/
zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock,
io_off, io_len, RL_READER);
if (lr == NULL) {
/*
* It is important to drop the page lock before grabbing the
* rangelock to avoid another deadlock between here and
* zfs_write() -> update_pages(). update_pages() holds both the
* rangelock and the page lock.
*/
get_page(pp);
unlock_page(pp);
lr = zfs_rangelock_enter(&zp->z_rangelock, io_off,
io_len, RL_READER);
lock_page(pp);
put_page(pp);
}
error = zfs_fillpage(ip, pp);
zfs_rangelock_exit(lr);

if (error == 0)
dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE);

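The locking pattern zfs_getpage() adopts above reduces to a small standalone model. In this hedged pthreads sketch (illustrative only: a ZFS rangelock is not a mutex, and the real code also pins the page with get_page()/put_page() across the unlock, which is omitted here), the established order is rangelock before page lock, so a caller already holding the page lock must either trylock the rangelock or back out and reacquire both in order:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rangelock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pagelock = PTHREAD_MUTEX_INITIALIZER;

/* Entered holding pagelock, as zfs_getpage() is entered with the page lock. */
static void
fill_with_correct_order(void)
{
	if (pthread_mutex_trylock(&rangelock) != 0) {
		/*
		 * Contended: drop the page lock, block on the rangelock,
		 * then retake the page lock. Never block on the rangelock
		 * while the page lock is held, or a path like
		 * update_pages(), which takes rangelock then page lock,
		 * could deadlock with us.
		 */
		pthread_mutex_unlock(&pagelock);
		pthread_mutex_lock(&rangelock);
		pthread_mutex_lock(&pagelock);
	}
	/* ... zfs_fillpage() would run here ... */
	pthread_mutex_unlock(&pagelock);
	pthread_mutex_unlock(&rangelock);
}

int
main(void)
{
	pthread_mutex_lock(&pagelock);	/* simulate filemap_fault() entry */
	fill_with_correct_order();
	printf("no deadlock\n");
	return (0);
}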
