diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d670cd1afeb1..e4db53cdfb12 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -420,7 +420,7 @@ get_usage(zpool_help_t idx) "[ ...]\n")); case HELP_STATUS: return (gettext("\tstatus [--power] [-c [script1,script2,...]] " - "[-DegiLpPstvx] [-T d|u] [pool] ...\n" + "[-dDegiLpPstvx] [-T d|u] [pool] ...\n" "\t [interval [count]]\n")); case HELP_UPGRADE: return (gettext("\tupgrade\n" @@ -2207,6 +2207,7 @@ typedef struct status_cbdata { boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; + boolean_t cb_print_dio_verify; boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; @@ -2439,7 +2440,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, uint_t c, i, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6]; + char rbuf[6], wbuf[6], cbuf[6], dbuf[6]; char *vname; uint64_t notpresent; spare_cbdata_t spare_cb; @@ -2557,6 +2558,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, printf(" %5s", "-"); } } + if (VDEV_STAT_VALID(vs_dio_verify_errors, vsc) && + cb->cb_print_dio_verify) { + zfs_nicenum(vs->vs_dio_verify_errors, dbuf, + sizeof (dbuf)); + + if (cb->cb_literal) + printf(" %5llu", + (u_longlong_t)vs->vs_dio_verify_errors); + else + printf(" %5s", dbuf); + } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, @@ -9169,6 +9181,10 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, " %5s", gettext("POWER")); } + if (cbp->cb_print_dio_verify) { + printf_color(ANSI_BOLD, " %5s", gettext("DIO")); + } + if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); @@ -9217,10 +9233,11 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-c [script1,script2,...]] [-DegiLpPstvx] [--power] [-T d|u] ... 
- * [pool] [interval [count]] + * zpool status [-c [script1,script2,...]] [-dDegiLpPstvx] [--power] ... + * [-T d|u] [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -d Display Direct I/O write verify errors * -D Display dedup status (undocumented) * -e Display only unhealthy vdevs * -g Display guid for individual vdev name. @@ -9253,7 +9270,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:DegiLpPstT:vx", long_options, + while ((c = getopt_long(argc, argv, "c:dDegiLpPstT:vx", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9280,6 +9297,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'd': + cb.cb_print_dio_verify = B_TRUE; + break; case 'D': cb.cb_dedup_stats = B_TRUE; break; diff --git a/cmd/ztest.c b/cmd/ztest.c index 684ab586bb93..ada0e866207f 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -2264,6 +2264,13 @@ ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) if (ztest_random(4) != 0) { int prefetch = ztest_random(2) ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. + */ + if (ztest_random(4) == 0) + prefetch |= DMU_DIRECTIO; + ztest_block_tag_t rbt; VERIFY(dmu_read(os, lr->lr_foid, offset, @@ -2815,6 +2822,13 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) enum ztest_io_type io_type; uint64_t blocksize; void *data; + uint32_t dmu_read_flags = DMU_READ_NO_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. 
+ */ + if (ztest_random(4) == 0) + dmu_read_flags |= DMU_DIRECTIO; VERIFY0(dmu_object_info(zd->zd_os, object, &doi)); blocksize = doi.doi_data_block_size; @@ -2880,7 +2894,7 @@ ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) (void) pthread_rwlock_unlock(&ztest_name_lock); VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, - DMU_READ_NO_PREFETCH)); + dmu_read_flags)); (void) ztest_write(zd, object, offset, blocksize, data); break; @@ -5045,6 +5059,13 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) uint64_t stride = 123456789ULL; uint64_t width = 40; int free_percent = 5; + uint32_t dmu_read_flags = DMU_READ_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. + */ + if (ztest_random(4) == 0) + dmu_read_flags |= DMU_DIRECTIO; /* * This test uses two objects, packobj and bigobj, that are always @@ -5123,10 +5144,10 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) * Read the current contents of our objects. */ error = dmu_read(os, packobj, packoff, packsize, packbuf, - DMU_READ_PREFETCH); + dmu_read_flags); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, - DMU_READ_PREFETCH); + dmu_read_flags); ASSERT0(error); /* @@ -5244,9 +5265,9 @@ ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, - packsize, packcheck, DMU_READ_PREFETCH)); + packsize, packcheck, dmu_read_flags)); VERIFY0(dmu_read(os, bigobj, bigoff, - bigsize, bigcheck, DMU_READ_PREFETCH)); + bigsize, bigcheck, dmu_read_flags)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); @@ -5336,6 +5357,13 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) dmu_buf_t *bonus_db; arc_buf_t **bigbuf_arcbufs; dmu_object_info_t doi; + uint32_t dmu_read_flags = DMU_READ_PREFETCH; + + /* + * We will randomly set when to do O_DIRECT on a read. 
+ */ + if (ztest_random(4) == 0) + dmu_read_flags |= DMU_DIRECTIO; size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; od = umem_alloc(size, UMEM_NOFAIL); @@ -5466,10 +5494,10 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) */ if (i != 0 || ztest_random(2) != 0) { error = dmu_read(os, packobj, packoff, - packsize, packbuf, DMU_READ_PREFETCH); + packsize, packbuf, dmu_read_flags); ASSERT0(error); error = dmu_read(os, bigobj, bigoff, bigsize, - bigbuf, DMU_READ_PREFETCH); + bigbuf, dmu_read_flags); ASSERT0(error); } compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, @@ -5529,9 +5557,9 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, packobj, packoff, - packsize, packcheck, DMU_READ_PREFETCH)); + packsize, packcheck, dmu_read_flags)); VERIFY0(dmu_read(os, bigobj, bigoff, - bigsize, bigcheck, DMU_READ_PREFETCH)); + bigsize, bigcheck, dmu_read_flags)); ASSERT0(memcmp(packbuf, packcheck, packsize)); ASSERT0(memcmp(bigbuf, bigcheck, bigsize)); diff --git a/config/kernel-get-user-pages.m4 b/config/kernel-get-user-pages.m4 new file mode 100644 index 000000000000..f9d02b66a178 --- /dev/null +++ b/config/kernel-get-user-pages.m4 @@ -0,0 +1,179 @@ +dnl # +dnl # get_user_pages_unlocked() function was not available till 4.0. +dnl # In earlier kernels (< 4.0) get_user_pages() is available(). +dnl # +dnl # 4.0 API change, +dnl # long get_user_pages_unlocked(struct task_struct *tsk, +dnl # struct mm_struct *mm, unsigned long start, unsigned long nr_pages, +dnl # int write, int force, struct page **pages) +dnl # +dnl # 4.8 API change, +dnl # long get_user_pages_unlocked(unsigned long start, +dnl # unsigned long nr_pages, int write, int force, struct page **page) +dnl # +dnl # 4.9 API change, +dnl # long get_user_pages_unlocked(usigned long start, int nr_pages, +dnl # struct page **pages, unsigned int gup_flags) +dnl # + +dnl# +dnl# Check available get_user_pages/_unlocked interfaces. 
+dnl# +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_USER_PAGES], [ + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_gup_flags], [ + #include + ], [ + unsigned long start = 0; + unsigned long nr_pages = 1; + unsigned int gup_flags = 0; + struct page **pages = NULL; + long ret __attribute__ ((unused)); + + ret = get_user_pages_unlocked(start, nr_pages, pages, + gup_flags); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_write_flag], [ + #include + ], [ + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + long ret __attribute__ ((unused)); + struct page **pages = NULL; + + ret = get_user_pages_unlocked(start, nr_pages, write, force, + pages); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + struct page **pages = NULL; + long ret __attribute__ ((unused)); + + ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write, + force, pages); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_unlocked_task_struct_gup_flags], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + struct page **pages = NULL; + unsigned int gup_flags = 0; + long ret __attribute__ ((unused)); + + ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, + pages, gup_flags); + ]) + + ZFS_LINUX_TEST_SRC([get_user_pages_task_struct], [ + #include + ], [ + struct task_struct *tsk = NULL; + struct mm_struct *mm = NULL; + struct vm_area_struct **vmas = NULL; + unsigned long start = 0; + unsigned long nr_pages = 1; + int write = 0; + int force = 0; + struct page **pages = NULL; + int ret __attribute__ ((unused)); + + ret = get_user_pages(tsk, mm, start, nr_pages, write, + force, pages, vmas); + ]) +]) + +dnl # +dnl # Supported get_user_pages/_unlocked interfaces checked newest to oldest. 
+dnl # We first check for get_user_pages_unlocked as that is available in +dnl # newer kernels. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_GET_USER_PAGES], [ + dnl # + dnl # Current API (as of 4.9) of get_user_pages_unlocked + dnl # + AC_MSG_CHECKING([whether get_user_pages_unlocked() takes gup flags]) + ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_gup_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS, 1, + [get_user_pages_unlocked() takes gup flags]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.8 API change, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING( + [whether get_user_pages_unlocked() takes write flag]) + ZFS_LINUX_TEST_RESULT([get_user_pages_unlocked_write_flag], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG, 1, + [get_user_pages_unlocked() takes write flag]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.0-4.3, 4.5-4.7 API, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING( + [whether get_user_pages_unlocked() takes task_struct]) + ZFS_LINUX_TEST_RESULT( + [get_user_pages_unlocked_task_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE( + HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT, 1, + [get_user_pages_unlocked() takes task_struct]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.4 API, get_user_pages_unlocked + dnl # + AC_MSG_CHECKING( + [whether get_user_pages_unlocked() takes task_struct, gup_flags]) + ZFS_LINUX_TEST_RESULT( + [get_user_pages_unlocked_task_struct_gup_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE( + HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT_GUP_FLAGS, 1, + [get_user_pages_unlocked() takes task_struct, gup_flags]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # get_user_pages + dnl # + AC_MSG_CHECKING( + [whether get_user_pages() takes struct task_struct]) + ZFS_LINUX_TEST_RESULT( + [get_user_pages_task_struct], [ + AC_MSG_RESULT(yes) + AC_DEFINE( + HAVE_GET_USER_PAGES_TASK_STRUCT, 1, + [get_user_pages() takes task_struct]) + ], [ + dnl # + dnl # If we cannot map the user's + dnl # pages in then we 
cannot do + dnl # Direct I/O + dnl # + ZFS_LINUX_TEST_ERROR([Direct I/O]) + ]) + ]) + ]) + ]) + ]) +]) diff --git a/config/kernel-vfs-direct_IO.m4 b/config/kernel-vfs-direct_IO.m4 index 7b7b91f979f9..715e824b7a04 100644 --- a/config/kernel-vfs-direct_IO.m4 +++ b/config/kernel-vfs-direct_IO.m4 @@ -1,5 +1,5 @@ dnl # -dnl # Check for direct IO interfaces. +dnl # Check for Direct I/O interfaces. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ ZFS_LINUX_TEST_SRC([direct_io_iter], [ @@ -100,7 +100,7 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO], [ AC_DEFINE(HAVE_VFS_DIRECT_IO_IOVEC, 1, [aops->direct_IO() uses iovec]) ],[ - ZFS_LINUX_TEST_ERROR([direct IO]) + ZFS_LINUX_TEST_ERROR([Direct I/O]) AC_MSG_RESULT([no]) ]) ]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index ff560ff3eef0..e7ced0297014 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -85,6 +85,34 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ bytes = copy_from_iter((void *)&buf, size, &iter); ]) + ZFS_LINUX_TEST_SRC([iov_iter_get_pages2], [ + #include + ], [ + struct iov_iter iter = { 0 }; + struct page **pages = NULL; + size_t maxsize = 4096; + unsigned maxpages = 1; + size_t start; + size_t ret __attribute__ ((unused)); + + ret = iov_iter_get_pages2(&iter, pages, maxsize, maxpages, + &start); + ]) + + ZFS_LINUX_TEST_SRC([iov_iter_get_pages], [ + #include + ], [ + struct iov_iter iter = { 0 }; + struct page **pages = NULL; + size_t maxsize = 4096; + unsigned maxpages = 1; + size_t start; + size_t ret __attribute__ ((unused)); + + ret = iov_iter_get_pages(&iter, pages, maxsize, maxpages, + &start); + ]) + ZFS_LINUX_TEST_SRC([iov_iter_type], [ #include #include @@ -184,6 +212,27 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ enable_vfs_iov_iter="no" ]) + dnl # + dnl # Kernel 6.0 changed iov_iter_get_pages() to iov_iter_get_pages2(). 
+ dnl # + AC_MSG_CHECKING([whether iov_iter_get_pages2() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_get_pages2], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_GET_PAGES2, 1, + [iov_iter_get_pages2() is available]) + ], [ + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether iov_iter_get_pages() is available]) + ZFS_LINUX_TEST_RESULT([iov_iter_get_pages], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOV_ITER_GET_PAGES, 1, + [iov_iter_get_pages() is available]) + ], [ + AC_MSG_RESULT(no) + enable_vfs_iov_iter="no" + ]) + ]) + dnl # dnl # This checks for iov_iter_type() in linux/uio.h. It is not dnl # required, however, and the module will compiled without it diff --git a/config/kernel.m4 b/config/kernel.m4 index 548905ccd04d..f4f074b41b97 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -78,6 +78,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_FILE_INODE ZFS_AC_KERNEL_SRC_FILE_DENTRY + ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_FSYNC ZFS_AC_KERNEL_SRC_AIO_FSYNC ZFS_AC_KERNEL_SRC_EVICT_INODE @@ -110,6 +111,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_VFS_GETATTR ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_ITERATE + ZFS_AC_KERNEL_SRC_GET_USER_PAGES ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS @@ -228,6 +230,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY + ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_FSYNC ZFS_AC_KERNEL_AIO_FSYNC ZFS_AC_KERNEL_EVICT_INODE @@ -260,6 +263,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_VFS_GETATTR ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE + ZFS_AC_KERNEL_GET_USER_PAGES ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index df7be6fc13f6..01a660434f59 100644 --- 
a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -100,6 +100,9 @@ #define spa_taskq_write_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" +#define param_set_direct_write_verify_pct_args(var) \ + CTLTYPE_UINT, NULL, 0, param_set_direct_write_verify_pct, "IU" + #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h index 8cfe56c75309..bbff9fe80389 100644 --- a/include/os/freebsd/spl/sys/mutex.h +++ b/include/os/freebsd/spl/sys/mutex.h @@ -70,4 +70,5 @@ typedef enum { #define mutex_exit(lock) sx_xunlock(lock) #define mutex_owned(lock) sx_xlocked(lock) #define mutex_owner(lock) sx_xholder(lock) + #endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/freebsd/spl/sys/param.h b/include/os/freebsd/spl/sys/param.h index 92724e332d68..96440dce03bb 100644 --- a/include/os/freebsd/spl/sys/param.h +++ b/include/os/freebsd/spl/sys/param.h @@ -33,6 +33,7 @@ #include #include_next #define PAGESIZE PAGE_SIZE +#define PAGESHIFT PAGE_SHIFT #define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) #ifdef _KERNEL #include diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h index b9d41903ea63..2bd5bdb80d98 100644 --- a/include/os/freebsd/spl/sys/uio.h +++ b/include/os/freebsd/spl/sys/uio.h @@ -34,13 +34,30 @@ #include_next #include #include +#include + +/* + * uio_extflg: extended flags + */ +#define UIO_DIRECT 0x0001 /* Direct I/O request */ typedef struct iovec iovec_t; typedef enum uio_seg zfs_uio_seg_t; typedef enum uio_rw zfs_uio_rw_t; +/* + * This structure is used when doing Direct I/O. 
+ */ +typedef struct { + vm_page_t *pages; + int npages; +} zfs_uio_dio_t; + typedef struct zfs_uio { struct uio *uio; + offset_t uio_soffset; + uint16_t uio_extflg; + zfs_uio_dio_t uio_dio; } zfs_uio_t; #define GET_UIO_STRUCT(u) (u)->uio @@ -52,6 +69,7 @@ typedef struct zfs_uio { #define zfs_uio_iovbase(u, idx) GET_UIO_STRUCT(u)->uio_iov[(idx)].iov_base #define zfs_uio_td(u) GET_UIO_STRUCT(u)->uio_td #define zfs_uio_rw(u) GET_UIO_STRUCT(u)->uio_rw +#define zfs_uio_soffset(u) (u)->uio_soffset #define zfs_uio_fault_disable(u, set) #define zfs_uio_prefaultpages(size, u) (0) @@ -61,6 +79,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) zfs_uio_offset(uio) = off; } +static inline void +zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off) +{ + ASSERT3U(zfs_uio_offset(uio), ==, off); + zfs_uio_soffset(uio) = off; +} + static inline void zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { @@ -71,7 +96,11 @@ zfs_uio_advance(zfs_uio_t *uio, ssize_t size) static __inline void zfs_uio_init(zfs_uio_t *uio, struct uio *uio_s) { - GET_UIO_STRUCT(uio) = uio_s; + memset(uio, 0, sizeof (zfs_uio_t)); + if (uio_s != NULL) { + GET_UIO_STRUCT(uio) = uio_s; + zfs_uio_soffset(uio) = uio_s->uio_offset; + } } int zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio); diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h index 7b3830be8a57..045fb019044f 100644 --- a/include/os/freebsd/spl/sys/vm.h +++ b/include/os/freebsd/spl/sys/vm.h @@ -57,6 +57,15 @@ void zfs_vmobject_wunlock(vm_object_t object); #define vm_page_grab_valid_unlocked(m, obj, idx, flags) \ vm_page_grab_valid((m), (obj), (idx), (flags)) #endif + +#if __FreeBSD_version >= 1300047 +#define vm_page_wire_lock(pp) +#define vm_page_wire_unlock(pp) +#else +#define vm_page_wire_lock(pp) vm_page_lock(pp) +#define vm_page_wire_unlock(pp) vm_page_unlock(pp) +#endif + static inline caddr_t zfs_map_page(vm_page_t pp, struct sf_buf **sfp) { @@ -70,4 +79,16 @@ zfs_unmap_page(struct sf_buf *sf) 
sf_buf_free(sf); } +static inline void +page_unhold(vm_page_t pp) +{ + vm_page_wire_lock(pp); +#if __FreeBSD_version >= 1300035 + vm_page_unwire(pp, PQ_ACTIVE); +#else + vm_page_unhold(pp); +#endif + vm_page_wire_unlock(pp); +} + #endif /* _OPENSOLARIS_SYS_VM_H_ */ diff --git a/include/os/linux/kernel/linux/kmap_compat.h b/include/os/linux/kernel/linux/kmap_compat.h index 7f9c00af802b..3bfc481bc6bf 100644 --- a/include/os/linux/kernel/linux/kmap_compat.h +++ b/include/os/linux/kernel/linux/kmap_compat.h @@ -32,6 +32,8 @@ /* 2.6.37 API change */ #define zfs_kmap_atomic(page) kmap_atomic(page) #define zfs_kunmap_atomic(addr) kunmap_atomic(addr) +#define zfs_kmap(page) kmap(page) +#define zfs_kunmap(page) kunmap(page) /* 5.0 API change - no more 'type' argument for access_ok() */ #ifdef HAVE_ACCESS_OK_TYPE @@ -40,4 +42,49 @@ #define zfs_access_ok(type, addr, size) access_ok(addr, size) #endif +/* + * read returning FOLL_WRITE is due to the fact that we are stating + * that the kernel will have write access to the user pages. So, when + * a Direct I/O read request is issued, the kernel must write to the user + * pages. + * + * get_user_pages_unlocked was not available to 4.0, so we also check + * for get_user_pages on older kernels. + */ +/* 4.9 API change - for and read flag is passed as gup flags */ +#if defined(HAVE_GET_USER_PAGES_UNLOCKED_GUP_FLAGS) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, pages, read ? 
FOLL_WRITE : 0) +/* 4.8 API change - no longer takes struct task_struct as argument */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_WRITE_FLAG) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(addr, numpages, read, 0, pages) + +/* 4.0-4.3, 4.5-4.7 API */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(current, current->mm, addr, numpages, read, 0, \ + pages) + +/* 4.4 API */ +#elif defined(HAVE_GET_USER_PAGES_UNLOCKED_TASK_STRUCT_GUP_FLAGS) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages_unlocked(current, current->mm, addr, numpages, pages, \ + read ? FOLL_WRITE : 0) + +/* Using get_user_pages if kernel is < 4.0 */ +#elif defined(HAVE_GET_USER_PAGES_TASK_STRUCT) +#define zfs_get_user_pages(addr, numpages, read, pages) \ + get_user_pages(current, current->mm, addr, numpages, read, 0, pages, \ + NULL) +#else +/* + * This case is unreachable. We must be able to use either + * get_user_pages_unlocked() or get_user_pages() to map user pages into + * the kernel. + */ +#error "Unknown Direct I/O interface" +#endif + #endif /* _ZFS_KMAP_H */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 5e6ea8d3c221..1f0a7fa68dfb 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -33,6 +33,12 @@ #include #include #include +#include + +/* + * uio_extflg: extended flags + */ +#define UIO_DIRECT 0x0001 /* Direct I/O request */ #if defined(HAVE_VFS_IOV_ITER) && defined(HAVE_FAULT_IN_IOV_ITER_READABLE) #define iov_iter_fault_in_readable(a, b) fault_in_iov_iter_readable(a, b) #endif @@ -54,6 +60,14 @@ typedef enum zfs_uio_seg { #endif } zfs_uio_seg_t; +/* + * This structure is used when doing Direct I/O. 
+ */ +typedef struct { + struct page **pages; /* Mapped pages */ + int npages; /* Number of mapped pages */ +} zfs_uio_dio_t; + typedef struct zfs_uio { union { const struct iovec *uio_iov; @@ -62,15 +76,16 @@ typedef struct zfs_uio { struct iov_iter *uio_iter; #endif }; - int uio_iovcnt; - offset_t uio_loffset; - zfs_uio_seg_t uio_segflg; + int uio_iovcnt; /* Number of iovecs */ + offset_t uio_soffset; /* Starting logical offset */ + offset_t uio_loffset; /* Current logical offset */ + zfs_uio_seg_t uio_segflg; /* Segment type */ boolean_t uio_fault_disable; - uint16_t uio_fmode; - uint16_t uio_extflg; - ssize_t uio_resid; - - size_t uio_skip; + uint16_t uio_fmode; /* Access mode (unused) */ + uint16_t uio_extflg; /* Extra flags (UIO_DIRECT) */ + ssize_t uio_resid; /* Residual unprocessed bytes */ + size_t uio_skip; /* Skipped bytes in current iovec */ + zfs_uio_dio_t uio_dio; /* Direct I/O user pages */ struct request *rq; } zfs_uio_t; @@ -83,6 +98,7 @@ typedef struct zfs_uio { #define zfs_uio_iovlen(u, idx) (u)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(u, idx) (u)->uio_iov[(idx)].iov_base #define zfs_uio_fault_disable(u, set) (u)->uio_fault_disable = set +#define zfs_uio_soffset(u) (u)->uio_soffset #define zfs_uio_rlimit_fsize(z, u) (0) #define zfs_uio_fault_move(p, n, rw, u) zfs_uiomove((p), (n), (rw), (u)) @@ -94,6 +110,13 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) uio->uio_loffset = off; } +static inline void +zfs_uio_setsoffset(zfs_uio_t *uio, offset_t off) +{ + ASSERT3U(zfs_uio_offset(uio), ==, off); + zfs_uio_soffset(uio) = off; +} + static inline void zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { @@ -117,6 +140,8 @@ zfs_uio_iovec_init(zfs_uio_t *uio, const struct iovec *iov, uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } static inline void @@ -146,6 +171,8 @@ zfs_uio_bvec_init(zfs_uio_t *uio, struct bio *bio, struct request *rq) 
} uio->rq = rq; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } #if defined(HAVE_VFS_IOV_ITER) @@ -162,8 +189,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, uio->uio_extflg = 0; uio->uio_resid = resid; uio->uio_skip = skip; + uio->uio_soffset = uio->uio_loffset; + memset(&uio->uio_dio, 0, sizeof (zfs_uio_dio_t)); } -#endif +#endif /* HAVE_VFS_IOV_ITER */ #if defined(HAVE_ITER_IOV) #define zfs_uio_iter_iov(iter) iter_iov((iter)) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 91a4751fffb0..c8eefe4fe5da 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/include/sys/abd.h b/include/sys/abd.h index 19fe96292d5f..af938a6284de 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -29,12 +29,13 @@ #include #include #include -#include #ifdef __cplusplus extern "C" { #endif +struct sf_buf; + typedef enum abd_flags { ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? 
*/ @@ -46,6 +47,7 @@ typedef enum abd_flags { ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ ABD_FLAG_ALLOCD = 1 << 9, /* we allocated the abd_t */ + ABD_FLAG_FROM_PAGES = 1 << 10, /* does not own pages */ } abd_flags_t; typedef struct abd { @@ -69,7 +71,8 @@ typedef struct abd { } abd_scatter; struct abd_linear { void *abd_buf; - struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ + struct scatterlist *abd_sgl; /* for LINEAR_PAGE Linux */ + struct sf_buf *sf; /* for LINEAR_PAGE FreeBSD */ } abd_linear; struct abd_gang { list_t abd_gang_chain; @@ -100,6 +103,14 @@ abd_t *abd_alloc_for_io(size_t, boolean_t); __attribute__((malloc)) abd_t *abd_alloc_sametype(abd_t *, size_t); boolean_t abd_size_alloc_linear(size_t); +#if defined(_KERNEL) +__attribute__((malloc)) +#if defined(__linux__) +abd_t *abd_alloc_from_pages(struct page **, unsigned long, uint64_t size); +#elif defined(__FreeBSD__) +abd_t *abd_alloc_from_pages(vm_page_t *, unsigned long, uint64_t size); +#endif +#endif /* _KERNEL */ void abd_gang_add(abd_t *, abd_t *, boolean_t); void abd_free(abd_t *); abd_t *abd_get_offset(abd_t *, size_t); @@ -210,6 +221,12 @@ abd_get_size(abd_t *abd) return (abd->abd_size); } +static inline boolean_t +abd_is_from_pages(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_FROM_PAGES) ? 
B_TRUE : B_FALSE); +} + /* * Module lifecycle * Defined in each specific OS's abd_os.c diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index f88ea25e245d..1b2f002ea060 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -42,6 +42,9 @@ typedef enum abd_stats_op { /* forward declarations */ struct scatterlist; struct page; +#if defined(__FreeBSD__) && defined(_KERNEL) +struct sf_buf; +#endif struct abd_iter { /* public interface */ @@ -70,6 +73,9 @@ struct abd_iter { size_t iter_offset; /* offset in current sg/abd_buf, */ /* abd_offset included */ struct scatterlist *iter_sg; /* current sg */ +#if defined(__FreeBSD__) && defined(_KERNEL) + struct sf_buf *sf; /* used to map in vm_page_t FreeBSD */ +#endif }; extern abd_t *abd_zero_scatter; @@ -77,6 +83,7 @@ extern abd_t *abd_zero_scatter; abd_t *abd_gang_get_offset(abd_t *, size_t *); abd_t *abd_alloc_struct(size_t); void abd_free_struct(abd_t *); +void abd_init_struct(abd_t *); /* * OS specific functions @@ -107,9 +114,9 @@ void abd_iter_page(struct abd_iter *); #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) #define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) -#define ABD_GANG(abd) (abd->abd_u.abd_gang) +#define ABD_SCATTER(abd) ((abd)->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) ((abd)->abd_u.abd_linear.abd_buf) +#define ABD_GANG(abd) ((abd)->abd_u.abd_gang) #if defined(_KERNEL) #if defined(__FreeBSD__) diff --git a/include/sys/arc.h b/include/sys/arc.h index 05307aab99e3..0b37e444ed94 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -120,7 +120,7 @@ typedef enum arc_flags /* * Private ARC flags. These flags are private ARC only flags that - * will show up in b_flags in the arc_hdr_buf_t. These flags should + * will show up in b_flags in the arc_buf_hdr_t. These flags should * only be set by ARC code. 
*/ ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ @@ -179,7 +179,6 @@ typedef enum arc_flags ARC_FLAG_COMPRESS_4 = 1 << 28, ARC_FLAG_COMPRESS_5 = 1 << 29, ARC_FLAG_COMPRESS_6 = 1 << 30 - } arc_flags_t; typedef enum arc_buf_flags { diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 3808a04cba80..c79652f9d834 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -61,17 +61,17 @@ extern "C" { /* * The simplified state transition diagram for dbufs looks like: * - * +--> READ --+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * ^ | ^ ^ - * | | | | - * | +--> FILL --+ | - * | | | - * | | | - * | +------> NOFILL -----+ - * | | + * +-------> READ ------+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * ^ | ^ ^ + * | | | | + * | +-------> FILL ------+ | + * | | | | + * | | | | + * | +------> NOFILL -----+-----> UNCACHED + * | | (Direct I/O) * +---------------+ * * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range @@ -330,6 +330,14 @@ typedef struct dmu_buf_impl { /* The buffer was partially read. More reads may follow. */ uint8_t db_partial_read; + + /* + * This block is being held under a writer rangelock of a Direct I/O + * write that is waiting for previous buffered writes to synced out + * due to mixed buffered and O_DIRECT operations. This is needed to + * check whether to grab the rangelock in zfs_get_data(). 
+ */ + uint8_t db_mixed_io_dio_wait; } dmu_buf_impl_t; #define DBUF_HASH_MUTEX(h, idx) \ @@ -387,6 +395,11 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, + boolean_t read); +void dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +blkptr_t *dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db); +int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, @@ -451,6 +464,32 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) return (NULL); } +/* + * All Direct I/O writes happen in open context so the first dirty record will + * always be associated with the write. After a Direct I/O write completes the + * dirty record's dr_override_state will be set to DR_OVERRIDDEN and the + * dr_data will get set to NULL. + */ +static inline dbuf_dirty_record_t * +dbuf_get_dirty_direct(dmu_buf_impl_t *db) +{ + return (list_head(&db->db_dirty_records)); +} + +static inline boolean_t +dbuf_dirty_is_direct_write(dmu_buf_impl_t *db, dbuf_dirty_record_t *dr) +{ + boolean_t ret = B_FALSE; + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (dr != NULL && db->db_level == 0 && !dr->dt.dl.dr_brtwrite && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN && + dr->dt.dl.dr_data == NULL) { + ret = B_TRUE; + } + return (ret); +} + #define DBUF_GET_BUFC_TYPE(_db) \ (dbuf_is_metadata(_db) ? 
ARC_BUFC_METADATA : ARC_BUFC_DATA) @@ -459,7 +498,7 @@ dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) -boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db); +boolean_t dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *db_bp); #ifdef ZFS_DEBUG diff --git a/include/sys/dmu.h b/include/sys/dmu.h index b5fed64da4ad..1c89322ed9ef 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -517,6 +517,7 @@ void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 +#define WP_DIRECT_WR 0x8 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); @@ -567,6 +568,8 @@ int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); * * The object number must be a valid, allocated object number. */ +int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + const void *tag, dmu_buf_t **dbp); int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, @@ -581,6 +584,7 @@ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, dmu_buf_t ***dbpp, uint32_t flags); int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp); + /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. 
@@ -865,16 +869,20 @@ int dmu_free_long_object(objset_t *os, uint64_t object); #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ +#define DMU_DIRECTIO 4 /* use Direct I/O */ + int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags); + void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx); -void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); +int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +int dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx, uint32_t flags); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - dmu_tx_t *tx); + dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); @@ -1070,6 +1078,7 @@ typedef struct zgd { struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; + boolean_t zgd_grabbed_rangelock; void *zgd_private; } zgd_t; diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index 83ae2b76ba1f..8317072f6264 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -35,6 +35,10 @@ #include #include #include +#include +#include +#include +#include #ifdef __cplusplus extern "C" { @@ -134,7 +138,7 @@ extern "C" { * db_data_pending * db_dirtied * db_link - * db_dirty_node (??) 
+ * dbuf_dirty_records * db_dirtycnt * db_d.* * db.* @@ -150,8 +154,10 @@ extern "C" { * dbuf_find: none (db_holds) * dbuf_hash_insert: none (db_holds) * dmu_buf_read_array_impl: none (db_state, db_changed) - * dmu_sync: none (db_dirty_node, db_d) + * dmu_sync: none (db_dirty_records, db_d) * dnode_reallocate: none (db) + * dmu_write_direct: none (db_dirty_records, db_d) + * dmu_write_direct_done: none (db_dirty_records, db_d) * * dn_mtx (leaf) * protects: @@ -234,8 +240,9 @@ extern "C" { * dnode_new_blkid */ -struct objset; struct dmu_pool; +struct dmu_buf; +struct zgd; typedef struct dmu_sendstatus { list_node_t dss_link; @@ -245,9 +252,30 @@ typedef struct dmu_sendstatus { uint64_t dss_blocks; /* blocks visited during the sending process */ } dmu_sendstatus_t; +/* + * dmu_sync_{ready/done} args + */ +typedef struct { + dbuf_dirty_record_t *dsa_dr; + void (*dsa_done)(struct zgd *, int); + struct zgd *dsa_zgd; + dmu_tx_t *dsa_tx; +} dmu_sync_arg_t; + +void dmu_sync_done(zio_t *, arc_buf_t *buf, void *varg); +void dmu_sync_ready(zio_t *, arc_buf_t *buf, void *varg); + void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); +int dmu_write_direct(zio_t *, dmu_buf_impl_t *, abd_t *, dmu_tx_t *); +int dmu_read_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t flags); +int dmu_write_abd(dnode_t *, uint64_t, uint64_t, abd_t *, uint32_t, dmu_tx_t *); +#if defined(_KERNEL) +int dmu_read_uio_direct(dnode_t *, zfs_uio_t *, uint64_t); +int dmu_write_uio_direct(dnode_t *, zfs_uio_t *, uint64_t, dmu_tx_t *); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index a9123e862af7..587dac738bae 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -134,6 +134,7 @@ struct objset { zfs_cache_type_t os_secondary_cache; zfs_prefetch_type_t os_prefetch; zfs_sync_type_t os_sync; + zfs_direct_t os_direct; 
zfs_redundant_metadata_type_t os_redundant_metadata; uint64_t os_recordsize; /* diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h index c746600cd2d5..55b150c044ee 100644 --- a/include/sys/fm/fs/zfs.h +++ b/include/sys/fm/fs/zfs.h @@ -42,6 +42,7 @@ extern "C" { #define FM_EREPORT_ZFS_DATA "data" #define FM_EREPORT_ZFS_DELAY "delay" #define FM_EREPORT_ZFS_DEADMAN "deadman" +#define FM_EREPORT_ZFS_DIO_VERIFY "dio_verify" #define FM_EREPORT_ZFS_POOL "zpool" #define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" #define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" @@ -84,6 +85,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T "vdev_io_t" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N "vdev_slow_io_n" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T "vdev_slow_io_t" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS "dio_verify_errors" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index e191420f2d2d..91aaca06c420 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -193,6 +193,7 @@ typedef enum { ZFS_PROP_SNAPSHOTS_CHANGED, ZFS_PROP_PREFETCH, ZFS_PROP_VOLTHREADING, + ZFS_PROP_DIRECT, ZFS_NUM_PROPS } zfs_prop_t; @@ -527,6 +528,12 @@ typedef enum { ZFS_VOLMODE_NONE = 3 } zfs_volmode_t; +typedef enum { + ZFS_DIRECT_DISABLED = 0, + ZFS_DIRECT_STANDARD, + ZFS_DIRECT_ALWAYS +} zfs_direct_t; + typedef enum zfs_keystatus { ZFS_KEYSTATUS_NONE = 0, ZFS_KEYSTATUS_UNAVAILABLE, @@ -784,6 +791,9 @@ typedef struct zpool_load_policy { /* Number of slow IOs */ #define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" +/* Number of Direct I/O write verify errors */ +#define ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS "vdev_dio_verify_errors" + /* vdev enclosure sysfs path */ #define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" @@ -1256,6 +1266,7 @@ typedef struct vdev_stat { 
uint64_t vs_physical_ashift; /* vdev_physical_ashift */ uint64_t vs_noalloc; /* allocations halted? */ uint64_t vs_pspace; /* physical capacity */ + uint64_t vs_dio_verify_errors; /* DIO write verify errors */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ diff --git a/include/sys/spa.h b/include/sys/spa.h index b969f05afe48..2cea30cfab17 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -941,6 +941,14 @@ typedef struct spa_iostats { kstat_named_t simple_trim_bytes_skipped; kstat_named_t simple_trim_extents_failed; kstat_named_t simple_trim_bytes_failed; + kstat_named_t arc_read_count; + kstat_named_t arc_read_bytes; + kstat_named_t arc_write_count; + kstat_named_t arc_write_bytes; + kstat_named_t direct_read_count; + kstat_named_t direct_read_bytes; + kstat_named_t direct_write_count; + kstat_named_t direct_write_bytes; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); @@ -964,6 +972,10 @@ extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, uint64_t extents_written, uint64_t bytes_written, uint64_t extents_skipped, uint64_t bytes_skipped, uint64_t extents_failed, uint64_t bytes_failed); +extern void spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, + uint32_t flags); +extern void spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, + uint32_t flags); extern void spa_import_progress_add(spa_t *spa); extern void spa_import_progress_remove(uint64_t spa_guid); extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h index aa34edda5f6a..9911645ad2b0 100644 --- a/include/sys/uio_impl.h +++ b/include/sys/uio_impl.h @@ -40,10 +40,47 @@ #define _SYS_UIO_IMPL_H #include +#include extern int zfs_uiomove(void *, size_t, zfs_uio_rw_t, zfs_uio_t *); extern int zfs_uiocopy(void *, size_t, zfs_uio_rw_t, zfs_uio_t *, size_t *); extern void zfs_uioskip(zfs_uio_t *, size_t); +extern void zfs_uio_free_dio_pages(zfs_uio_t *, zfs_uio_rw_t); 
+extern int zfs_uio_get_dio_pages_alloc(zfs_uio_t *, zfs_uio_rw_t); +extern boolean_t zfs_uio_page_aligned(zfs_uio_t *); + +static inline boolean_t +zfs_dio_page_aligned(void *buf) +{ + return ((((unsigned long)(buf) & (PAGESIZE - 1)) == 0) ? + B_TRUE : B_FALSE); +} + +static inline boolean_t +zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) +{ + return (IS_P2ALIGNED(offset, blksz)); +} + +static inline boolean_t +zfs_dio_size_aligned(uint64_t size, uint64_t blksz) +{ + return (IS_P2ALIGNED(size, blksz)); +} + +static inline boolean_t +zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz) +{ + return (zfs_dio_offset_aligned(offset, blksz) && + zfs_dio_size_aligned(size, blksz)); +} + +static inline boolean_t +zfs_uio_aligned(zfs_uio_t *uio, uint64_t blksz) +{ + return (zfs_dio_aligned(zfs_uio_offset(uio), zfs_uio_resid(uio), + blksz)); +} static inline void zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 95164c4546bb..855126acbd9a 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -448,9 +448,14 @@ struct vdev { /* * We rate limit ZIO delay, deadman, and checksum events, since they * can flood ZED with tons of events when a drive is acting up. + * + * We also rate limit Direct I/O write verify errors, since a user might + * be continually manipulating a buffer that can flood ZED with tons of + * events. 
*/ zfs_ratelimit_t vdev_delay_rl; zfs_ratelimit_t vdev_deadman_rl; + zfs_ratelimit_t vdev_dio_verify_rl; zfs_ratelimit_t vdev_checksum_rl; /* @@ -649,6 +654,12 @@ extern uint_t zfs_vdev_max_auto_ashift; int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); +/* + * VDEV checksum verification precentage for Direct I/O writes + */ +extern uint_t zfs_vdev_direct_write_verify_pct; +int param_set_direct_write_verify_pct(ZFS_MODULE_PARAM_ARGS); + #ifdef __cplusplus } #endif diff --git a/include/sys/zfs_racct.h b/include/sys/zfs_racct.h index 0e8bd04c1a13..ff84cccb09a1 100644 --- a/include/sys/zfs_racct.h +++ b/include/sys/zfs_racct.h @@ -26,12 +26,13 @@ #ifndef _SYS_ZFS_RACCT_H #define _SYS_ZFS_RACCT_H -#include +#include +#include /* * Platform-dependent resource accounting hooks */ -void zfs_racct_read(uint64_t size, uint64_t iops); -void zfs_racct_write(uint64_t size, uint64_t iops); +void zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); +void zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags); #endif /* _SYS_ZFS_RACCT_H */ diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index e60b99bed192..8de71448e457 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -29,6 +29,12 @@ extern int zfs_bclone_enabled; +typedef enum zfs_direct_enabled { + ZFS_DIRECT_IO_ERR, + ZFS_DIRECT_IO_DISABLED, + ZFS_DIRECT_IO_ENABLED +} zfs_direct_enabled_t; + extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); @@ -46,6 +52,9 @@ extern int mappedread(znode_t *, int, zfs_uio_t *); extern int mappedread_sf(znode_t *, int, zfs_uio_t *); extern void update_pages(znode_t *, int64_t, int, objset_t *); +extern zfs_direct_enabled_t zfs_check_direct_enabled(znode_t *, int, int *); +extern int zfs_setup_direct(znode_t *, zfs_uio_t *, zfs_uio_rw_t, int *); + /* 
* Platform code that asynchronously drops zp's inode / vnode_t. * diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index d71144807f47..c852c4758a91 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -308,7 +308,7 @@ extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, const char *dname, znode_t *szp, znode_t *wzp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t len, boolean_t commit, - zil_callback_t callback, void *callback_data); + boolean_t o_direct, zil_callback_t callback, void *callback_data); extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t off, uint64_t len); extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zio.h b/include/sys/zio.h index 545b9cf0c3c5..afbfb1ad1c4b 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -222,6 +222,7 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_NOPWRITE (1ULL << 28) #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) +#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 31) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -352,6 +353,7 @@ typedef struct zio_prop { boolean_t zp_brtwrite; boolean_t zp_encrypt; boolean_t zp_byteorder; + boolean_t zp_direct_write; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h index 2b026d48675a..d6549d2aced1 100644 --- a/include/sys/zio_impl.h +++ b/include/sys/zio_impl.h @@ -157,8 +157,9 @@ enum zio_stage { ZIO_STAGE_VDEV_IO_ASSESS = 1 << 23, /* RW--XT */ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 24, /* R----- */ + ZIO_STAGE_DIO_CHECKSUM_VERIFY = 1 << 25, /* -W---- */ - ZIO_STAGE_DONE = 1 << 25 /* RWFCXT */ + ZIO_STAGE_DONE = 1 << 26 /* RWFCXT */ }; #define ZIO_ROOT_PIPELINE \ @@ -224,6 +225,10 @@ enum zio_stage { 
ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) +#define ZIO_DIRECT_WRITE_PIPELINE \ + ZIO_WRITE_PIPELINE & \ + (~ZIO_STAGE_ISSUE_ASYNC) + #define ZIO_DDT_CHILD_WRITE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_VDEV_IO_STAGES | \ diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index 665bfc42301b..b107333d6f92 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -82,6 +82,32 @@ typedef struct zfs_uio { #define zfs_uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len #define zfs_uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base +static inline boolean_t +zfs_dio_page_aligned(void *buf) +{ + return ((((unsigned long)(buf) & (PAGESIZE - 1)) == 0) ? + B_TRUE : B_FALSE); +} + +static inline boolean_t +zfs_dio_offset_aligned(uint64_t offset, uint64_t blksz) +{ + return (IS_P2ALIGNED(offset, blksz)); +} + +static inline boolean_t +zfs_dio_size_aligned(uint64_t size, uint64_t blksz) +{ + return (IS_P2ALIGNED(size, blksz)); +} + +static inline boolean_t +zfs_dio_aligned(uint64_t offset, uint64_t size, uint64_t blksz) +{ + return (zfs_dio_offset_aligned(offset, blksz) && + zfs_dio_size_aligned(size, blksz)); +} + static inline void zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) { diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 2bbaae6345ab..84de8e027c13 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -2,13 +2,17 @@ + + - + + + @@ -614,61 +618,192 @@ - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + + + + + + + + + + + + + + + + + + - - + + + - - - - - + + + + + - - - - - - + + + + + - - - - - - - - - + + + - - - - + + + + - - - + + + + + + + + + + + + + + + + + + + + + @@ -713,35 +848,11 @@ - + - - - - - - - - - - - - - - - - - - - - - - - - @@ -753,11 +864,6 @@ - - - - - @@ -768,29 +874,40 @@ 
- - - - - - - + + + + + + + + + + + + + + + + + + @@ -843,6 +960,11 @@ + + + + + @@ -863,6 +985,11 @@ + + + + + @@ -1065,6 +1192,11 @@ + + + + + @@ -1088,6 +1220,9 @@ + + + @@ -1096,10 +1231,6 @@ - - - - @@ -1210,96 +1341,292 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - - - - - - - + + + - - - + + + - - + + - - + + - - - + + + + + + - - + + - - + + + + + + + + + + + + + + - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - + - + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + @@ -1314,10 +1641,10 @@ - + - + @@ -1335,17 +1662,36 @@ - - - + + + - - - + + + + + - + - + + + + + + + + + + + + + + + + + + @@ -1361,185 +1707,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1593,12 +1760,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -1615,33 +1831,20 @@ - - - - - - - - - - - - - - + @@ -1718,157 +1921,88 @@ - + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + + + + + + + + + + + + + + + + + + + + - + + + + + + + + + + - + - + + + + - + - + + + + + + + - - - - - - - - - - - - - - 
- - - - + + + + @@ -1917,61 +2051,6 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -2101,5962 +2180,1954 @@ + - - - - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - + + + - - - - + + + + - - - + + + - - - + + + + + - - - - - - + + + + + - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - - - - - - - + + + - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + - - - - + + + + + + + + - - - - - - - - - + + + + + - - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - - + + - - - - - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - + - + - + - + - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - + + - - - - + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - + - + - - - - - - - - - + + + + + + - - - - - - - - - + + + + + + + + + - - - - + + + + - - - - + + + - - - - - + + + - - - - - + + + - - + + + + + + + + + + + + - - - - - + + + - - - - + + + + + + + + + + + + + - - - - + + + + + + - - - - + + + + - - - + + + + - - + + + + + + + + + - - - - - - - + + + + + - - + + + + - - - - - + + + + + - - - + + + + + - - - - + + + + + - - - - + + + + + + + - - + + - - - + + + + + 
+ + + + - - - + + + + - - - - + + + + + + + - - - - + + + + - - - - + + + + + + + + + + + + + - - + + + + + + - - - - - + + + - - - - + + + + + + + + + + + - - - - + + + + - - - + + + + + - - - - + + + + - - - - + + + + + - - - + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - - - - - - - - - - - - - - - - - - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - - - - - - - - - - - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + - - - - + + - - 
+ + - - - - - - + + - - - - - - - - - - - - - - - - - - - - - + - - - - + + - - - - - - + - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - - - - - - + + + + + - - - - - - - - - - - - - - - - - - - + + + + - - - - - - + + + - - - - - - - - - - - + + + - - - + + + + + + + - - - - - - - - - - - - - - + + + + + + - - - + + + + - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + - - - - + + + + + - - - - - - - - - - - + + + + - - - - + + + + - - + + - - - - - - - - + + + - - - + + - - - - - + + + - - + + + + + + - - - - - + + + - - - - - - - + + - - - - + + - - - - - + + + + - - - - + + + + - - - - - + + + - + - - + + + + - + - - - + + + - - - - - - + + + - - - - - - - - - + + + + + + - - - - - - + + + + - - - - - - - + + + + - - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + - + + + - - + + + + + - - - + + + + + - - - + + + + + - - + + + + + + + + - - + + + + - - + + + + + + + - - + + + + - - + + + - - + + + + + - - + + + + + - - + + + + - - - - - - + + - - - + + + - - - - - + + + - - - - + + + + + + - - - - - 
+ + + + - - - - - + + + + - - - - + + + + + + + + + + - - + + + + + + - - + + + + + + - - - - + + + + + + + + + - - + + + + + + + + + + + + + + - - + + + - - - - + + + + + + - - - - + + + + - - - + + + + - - - - + + + + + + - - - - + + + + + + + + + + + + + + + + + + + - - - + + + + - - - + + + + + - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - + + + + - - - + + + + + - - - - + + + + + - - - - - - - - - - - - - - - - - + + + + + + + + + - - - - - + + + + + + - - - - - + + + + + + + + - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -8069,6 +4140,12 @@ + + + + + + @@ -8101,6 +4178,10 @@ + + + + @@ -8108,6 +4189,18 @@ + + + + + + + + + + + + @@ -8124,6 +4217,12 @@ + + + + + + @@ -8152,6 +4251,13 @@ + + + + + + + @@ -8189,25 +4295,14 @@ + + + + + - - - - - - - - - - - - - - - - @@ -8227,379 +4322,330 @@ + + + + + + + + + - - + + + + + + + + + + - - + + - - + + - - + + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - - + + - - + + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + - - - + + - - + + + + - - - + + + + + + - - - + + + - - + + - - - - - + + + - - - + + + - - + + + - - - + + + + + + + + + + + + + + + + + + @@ -8612,6 +4658,11 @@ + + + + + @@ -8622,55 +4673,49 @@ - - - - - - - - - - - + + - - - - - - - - - + + + + + + + - - - - - - - - - + + + + + + + + + + + + + + + + + - - - @@ -8722,88 +4767,19 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + @@ -8816,19 +4792,8 @@ - - - - - - - - - - - @@ -8837,15 +4802,6 @@ - - - - - - - - - @@ -8857,6 +4813,12 @@ + + + + + + @@ -8869,6 +4831,12 @@ + + + + + + @@ -8912,18 +4880,20 @@ - - - - - - - + + + + + + + + + @@ -8935,6 +4905,19 @@ + + + + + + + + + + + + + @@ -8945,6 +4928,18 @@ + + + + + + + + + + + + @@ -8952,6 +4947,16 @@ + + + + + + + + + + @@ -8972,10 +4977,30 @@ + + + + + + + + + + + + + + + + + + + + @@ -8990,6 +5015,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9032,24 +5104,6 @@ - - - - - - - - - - - - - - - - - - @@ -9062,35 +5116,26 @@ - - - + - - - - - - - - - + + + - - - - - + + + - - - - + + + + - + - + + @@ -9103,24 +5148,32 @@ + + + + + + + + + + + + + + + + + + - - - - - - + - - - - - @@ -9130,6 +5183,11 @@ + + + + + @@ -9144,9 +5202,6 @@ - - - @@ -9212,16 +5267,6 @@ - - - - - - - - - - @@ -9239,6 +5284,9 @@ + + + @@ -9254,6 +5302,12 @@ + + + + + + @@ -9334,21 +5388,12 @@ - - + - - - - - - - - @@ -9390,6 +5435,12 @@ + + + + + + @@ -9397,6 +5448,24 @@ + + + + + + + + + + + + + + + + + + @@ -9482,6 +5551,24 @@ + + + + + + + + + + + + + + + + + + @@ -9493,6 +5580,22 @@ + + + + + + + + + + + + + + + + @@ -9511,100 +5614,124 @@ + + + + + + + + + + + - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - + - - - - - - - - - - - + + + - - - - - - - - - - + + + - - - - - - + + + - - - - - + + + - - - + + + @@ -9612,15 +5739,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9639,12 +5815,60 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9665,12 +5889,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -9690,8 +5955,122 @@ - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 42f3404db5a9..26870b8bf686 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -83,6 +83,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ module/zfs/dmu_diff.c \ + module/zfs/dmu_direct.c \ module/zfs/dmu_object.c \ module/zfs/dmu_objset.c \ module/zfs/dmu_recv.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 6088ebc7ef35..da2f0879d2cc 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -399,6 +399,28 @@ May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . +.It Sy zfs_vdev_direct_write_verify_pct Ns = Ns Sy Linux 2 | FreeBSD 0 Pq uint +If non-zero, then a Direct I/O write's checksum will be verified every +percentage (pct) of Direct I/O writes that are issued to a top-level VDEV +before it is committed and the block pointer is updated. +In the event the checksum is not valid then the I/O operation will be +redirected through the ARC. 
+This module parameter can be used to detect if the +contents of the users buffer have changed in the process of doing a Direct I/O +write. +It can also help to identify if reported checksum errors are tied to Direct I/O +writes. +Each verify error causes a +.Sy dio_verify +zevent. +Direct Write I/O checkum verify errors can be seen with +.Nm zpool Cm status Fl d . +The default value for this is 2 percent on Linux, but is 0 for +.Fx +because user pages can be placed under write protection in +.Fx +before the Direct I/O write is issued. +. .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint Minimum ashift used when creating new top-level vdevs. . @@ -976,6 +998,9 @@ This will smoothly handle between ten times and a tenth of this number. .Pp .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . . +.It Sy zfs_dio_write_verify_events_per_second Ns = Ns Sy 20 Ns /s Pq uint +Rate limit Direct I/O write verify events to this many per second. +. .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 9ff0236f4d74..7a7678842dba 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -1039,6 +1039,44 @@ See the section of .Xr zfsconcepts 7 . .It Xo +.Sy direct Ns = Ns Sy disabled Ns | Ns Sy standard Ns | Ns Sy always +.Xc +Controls the behavior of Direct I/O requests +.Pq e.g. Dv O_DIRECT . +The +.Sy standard +behavior for Direct I/O requests is to bypass the ARC when possible. +These requests will not be cached and performance will be limited by the +raw speed of the underlying disks +.Pq Dv this is the default . +.Sy always +causes every properly aligned read or write to be treated as a direct request. +.Sy disabled +causes the O_DIRECT flag to be silently ignored and all direct requests will +be handled by the ARC. 
+This is the default behavior for OpenZFS 2.1 and prior releases. +.Pp +Bypassing the ARC requires that a direct request be correctly aligned. +For write requests the starting offset and size of the request must be +.Sy recordsize Ns +-aligned, if not then the unaligned portion of the request will be silently +redirected through the ARC. +For read requests there is no +.Sy recordsize +alignment restriction on either the starting offset or size. +All direct requests must use a page-aligned memory buffer and the request +size must be a multiple of the page size or an error is returned. +.Pp +Concurrently mixing buffered and direct requests to overlapping regions of +a file can decrease performance. +However, the resulting file will always be coherent. +For example, a direct read after a buffered write will return the data +from the buffered write. +Furthermore, if an application uses +.Xr mmap 2 +based file access then in order to maintain coherency all direct requests +are converted to buffered requests while the file is mapped. +.It Xo .Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns .Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k .Xc diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index ef20ef4e003c..77d44bd8ad21 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -98,6 +98,17 @@ This can be an indicator of problems with the underlying storage device. The number of delay events is ratelimited by the .Sy zfs_slow_io_events_per_second module parameter. +.It Sy dio_verify +Issued when there was a checksum verify error after a Direct I/O write has been +issued and is redirected through the ARC. +This event can only take place if the module parameter +.Sy zfs_vdev_direct_write_verify_pct +is not set to zero. +See +.Xr zfs 4 +for more details on the +.Sy zfs_vdev_direct_write_verify_pct +module paramter. .It Sy config Issued every time a vdev change have been done to the pool. 
.It Sy zpool @@ -408,8 +419,9 @@ ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--XT ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--XT ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R----- +ZIO_STAGE_DIO_CHECKSUM_VERIFY:0x02000000:-W---- -ZIO_STAGE_DONE:0x02000000:RWFCXT +ZIO_STAGE_DONE:0x04000000:RWFCXT .TE . .Sh I/O FLAGS diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index bbe7a45aa0c6..765204118d01 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DegiLpPstvx +.Op Fl dDegiLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,6 +69,15 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl d +Display the number of Direct I/O write checksum verify errors that have occured +on a top-level VDEV. +See +.Sx zfs_vdev_direct_write_verify_pct +in +.Xr zfs 4 +for details about the conditions that can cause Direct I/O write checksum +verify failures to occur. 
.It Fl D Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk diff --git a/module/Kbuild.in b/module/Kbuild.in index 7e08374fa2b9..bf26a79a0de2 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -329,6 +329,7 @@ ZFS_OBJS := \ ddt_stats.o \ ddt_zap.o \ dmu.o \ + dmu_direct.o \ dmu_diff.o \ dmu_object.o \ dmu_objset.o \ @@ -448,6 +449,7 @@ ZFS_OBJS_OS := \ vdev_disk.o \ vdev_file.o \ vdev_label_os.o \ + vdev_os.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index d9d31564d090..06e06afa2240 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -255,6 +255,7 @@ SRCS+= abd.c \ ddt_stats.c \ ddt_zap.c \ dmu.c \ + dmu_direct.c \ dmu_diff.c \ dmu_object.c \ dmu_objset.c \ diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c index ffccb6f2594e..98c45b709ba1 100644 --- a/module/os/freebsd/spl/spl_uio.c +++ b/module/os/freebsd/spl/spl_uio.c @@ -44,6 +44,10 @@ #include #include #include +#include +#include +#include +#include int zfs_uiomove(void *cp, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) @@ -105,3 +109,271 @@ zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio) ASSERT3U(zfs_uio_rw(uio), ==, dir); return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio))); } + +/* + * Check if the uio is page-aligned in memory. 
+ */ +boolean_t +zfs_uio_page_aligned(zfs_uio_t *uio) +{ + const struct iovec *iov = GET_UIO_STRUCT(uio)->uio_iov; + + for (int i = zfs_uio_iovcnt(uio); i > 0; iov++, i--) { + unsigned long addr = (unsigned long)iov->iov_base; + size_t size = iov->iov_len; + if ((addr & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +#if __FreeBSD_version < 1300050 +static void +zfs_uio_set_pages_to_stable(zfs_uio_t *uio) +{ + vm_object_t obj; + + ASSERT3P(uio->uio_dio.pages, !=, NULL); + ASSERT3U(uio->uio_dio.npages, >, 0); + + obj = uio->uio_dio.pages[0]->object; + zfs_vmobject_wlock(obj); + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + + ASSERT3P(page, !=, NULL); + vm_page_sbusy(page); + MPASS(page == PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(page))); + if (page->object != obj) { + zfs_vmobject_wunlock(obj); + obj = page->object; + zfs_vmobject_wlock(obj); + } + pmap_remove_write(page); + } + zfs_vmobject_wunlock(obj); +} + +static void +zfs_uio_release_stable_pages(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + + ASSERT3P(page, !=, NULL); + ASSERT(vm_page_sbusied(page)); + vm_page_sunbusy(page); + } +} + +#else + +static void +zfs_uio_set_pages_to_stable(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + ASSERT3U(uio->uio_dio.npages, >, 0); + + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + ASSERT3P(page, !=, NULL); + + MPASS(page == PHYS_TO_VM_PAGE(VM_PAGE_TO_PHYS(page))); + vm_page_busy_acquire(page, VM_ALLOC_SBUSY); + pmap_remove_write(page); + } +} + +static void +zfs_uio_release_stable_pages(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + for (int i = 0; i < uio->uio_dio.npages; i++) { + vm_page_t page = uio->uio_dio.pages[i]; + + ASSERT3P(page, !=, NULL); + vm_page_sunbusy(page); + } +} + +#endif + +/* + * 
If the operation is marked as read, then we are stating the pages will be + * written to and must be given write access. + */ +static int +zfs_uio_hold_pages(unsigned long start, size_t len, unsigned long nr_pages, + zfs_uio_rw_t rw, vm_page_t *pages) +{ + vm_map_t map; + vm_prot_t prot; + int count; + + map = &curthread->td_proc->p_vmspace->vm_map; + ASSERT3S(len, >, 0); + + prot = rw == UIO_READ ? (VM_PROT_READ | VM_PROT_WRITE) : VM_PROT_READ; + count = vm_fault_quick_hold_pages(map, start, len, prot, pages, + nr_pages); + + return (count); +} + +static void +zfs_uio_unhold_pages(vm_page_t *m, int count) +{ +#if __FreeBSD_version < 1300050 + for (int i = 0; i < count; i++) { + vm_page_t page = m[i]; + ASSERT3P(page, !=, NULL); + vm_page_lock(page); + vm_page_unwire_noq(page); + vm_page_unlock(page); + } +#else + vm_page_unhold_pages(m, count); +#endif +} + +void +zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3P(uio->uio_dio.pages, !=, NULL); + ASSERT(zfs_uio_rw(uio) == rw); + + if (rw == UIO_WRITE) + zfs_uio_release_stable_pages(uio); + + zfs_uio_unhold_pages(&uio->uio_dio.pages[0], + uio->uio_dio.npages); + + kmem_free(uio->uio_dio.pages, + uio->uio_dio.npages * sizeof (vm_page_t)); +} + +static long +zfs_uio_get_user_pages(unsigned long start, unsigned long nr_pages, + size_t len, zfs_uio_rw_t rw, vm_page_t *pages) +{ + int count; + + count = zfs_uio_hold_pages(start, len, nr_pages, rw, pages); + + if (count != nr_pages) { + if (count > 0) + vm_page_unhold_pages(pages, count); + return (count); + } + + ASSERT3U(count, ==, nr_pages); + +#if __FreeBSD_version < 1300050 + for (int i = 0; i < count; i++) { + vm_page_t page = pages[i]; + vm_page_lock(page); + vm_page_wire(page); + vm_page_unhold(page); + vm_page_unlock(page); + } +#endif + + return (count); +} + +static size_t +zfs_uio_iov_step(struct iovec v, zfs_uio_t *uio, int *numpages) +{ + unsigned long addr = (unsigned long)(v.iov_base); + size_t 
len = v.iov_len; + int n = DIV_ROUND_UP(len, PAGE_SIZE); + + int res = zfs_uio_get_user_pages(P2ALIGN(addr, PAGE_SIZE), n, len, + zfs_uio_rw(uio), &uio->uio_dio.pages[uio->uio_dio.npages]); + if (res != n) { + *numpages = -1; + return (SET_ERROR(EFAULT)); + } + + ASSERT3S(len, ==, res * PAGE_SIZE); + *numpages = res; + return (len); +} + +static int +zfs_uio_get_dio_pages_impl(zfs_uio_t *uio) +{ + const struct iovec *iovp = GET_UIO_STRUCT(uio)->uio_iov; + size_t wanted; + size_t maxsize = zfs_uio_resid(uio); + + wanted = maxsize; + + for (int i = 0; i < zfs_uio_iovcnt(uio); i++) { + struct iovec iov; + int numpages = 0; + + if (iovp->iov_len == 0) { + iovp++; + continue; + } + iov.iov_len = MIN(maxsize, iovp->iov_len); + iov.iov_base = iovp->iov_base; + size_t left = zfs_uio_iov_step(iov, uio, &numpages); + + if (numpages == -1) + return (left); + + ASSERT3U(left, ==, iov.iov_len); + uio->uio_dio.npages += numpages; + maxsize -= iov.iov_len; + wanted -= left; + iovp++; + } + + ASSERT0(wanted); + + return (0); +} + +/* + * This function maps user pages into the kernel. In the event that the user + * pages were not mapped successfully an error value is reutrned. + * + * On success, 0 is returned. + */ +int +zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + int error = 0; + size_t npages = DIV_ROUND_UP(zfs_uio_resid(uio), PAGE_SIZE); + size_t size = npages * sizeof (vm_page_t); + + ASSERT(zfs_uio_rw(uio) == rw); + + uio->uio_dio.pages = kmem_alloc(size, KM_SLEEP); + + error = zfs_uio_get_dio_pages_impl(uio); + + if (error) { + kmem_free(uio->uio_dio.pages, size); + return (error); + } + + /* + * Since we will be writing the user pages we must make sure that + * they are stable. That way the contents of the pages can not change + * while we are doing: compression, checksumming, encryption, parity + * calculations or deduplication. 
+ */ + if (zfs_uio_rw(uio) == UIO_WRITE) + zfs_uio_set_pages_to_stable(uio); + + uio->uio_extflg |= UIO_DIRECT; + + return (0); +} diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 3b812271f98b..faef26cf1bbb 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -33,6 +33,11 @@ #include #include +#ifdef _KERNEL +#include +#include +#endif + typedef struct abd_stats { kstat_named_t abdstat_struct_size; kstat_named_t abdstat_scatter_cnt; @@ -137,9 +142,17 @@ abd_size_alloc_linear(size_t size) void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) { - uint_t n = abd_scatter_chunkcnt(abd); + uint_t n; + + if (abd_is_from_pages(abd)) + n = abd_chunkcnt_for_bytes(abd->abd_size); + else + n = abd_scatter_chunkcnt(abd); ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); int waste = (n << PAGE_SHIFT) - abd->abd_size; + ASSERT3U(n, >, 0); + ASSERT3S(waste, >=, 0); + IMPLY(abd_is_linear_page(abd), waste < PAGE_SIZE); if (op == ABDSTAT_INCR) { ABDSTAT_BUMP(abdstat_scatter_cnt); ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); @@ -200,10 +213,16 @@ abd_free_chunks(abd_t *abd) { uint_t i, n; - n = abd_scatter_chunkcnt(abd); - for (i = 0; i < n; i++) { - kmem_cache_free(abd_chunk_cache, - ABD_SCATTER(abd).abd_chunks[i]); + /* + * Scatter ABDs may be constructed by abd_alloc_from_pages() from + * an array of pages. In which case they should not be freed. + */ + if (!abd_is_from_pages(abd)) { + n = abd_scatter_chunkcnt(abd); + for (i = 0; i < n; i++) { + kmem_cache_free(abd_chunk_cache, + ABD_SCATTER(abd).abd_chunks[i]); + } } } @@ -344,11 +363,20 @@ abd_fini(void) void abd_free_linear_page(abd_t *abd) { +#if defined(_KERNEL) + ASSERT3P(abd->abd_u.abd_linear.sf, !=, NULL); + zfs_unmap_page(abd->abd_u.abd_linear.sf); + + abd_update_scatter_stats(abd, ABDSTAT_DECR); +#else /* - * FreeBSD does not have scatter linear pages - * so there is an error. 
+ * The ABD flag ABD_FLAG_LINEAR_PAGE should only be set in + * abd_alloc_from_pages(), which is strictly in kernel space. + * So if we have gotten here outside of kernel space we have + * an issue. */ VERIFY(0); +#endif } /* @@ -367,6 +395,26 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) return (abd_alloc_linear(size, is_metadata)); } +static abd_t * +abd_get_offset_from_pages(abd_t *abd, abd_t *sabd, size_t chunkcnt, + size_t new_offset) +{ + ASSERT(abd_is_from_pages(sabd)); + + /* + * Set the child child chunks to point at the parent chunks as + * the chunks are just pages and we don't want to copy them. + */ + size_t parent_offset = new_offset / PAGE_SIZE; + ASSERT3U(parent_offset, <, abd_scatter_chunkcnt(sabd)); + for (int i = 0; i < chunkcnt; i++) + ABD_SCATTER(abd).abd_chunks[i] = + ABD_SCATTER(sabd).abd_chunks[parent_offset + i]; + + abd->abd_flags |= ABD_FLAG_FROM_PAGES; + return (abd); +} + abd_t * abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, size_t size) @@ -401,6 +449,11 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, ABD_SCATTER(abd).abd_offset = new_offset & PAGE_MASK; + if (abd_is_from_pages(sabd)) { + return (abd_get_offset_from_pages(abd, sabd, chunkcnt, + new_offset)); + } + /* Copy the scatterlist starting at the correct offset */ (void) memcpy(&ABD_SCATTER(abd).abd_chunks, &ABD_SCATTER(sabd).abd_chunks[new_offset >> PAGE_SHIFT], @@ -409,6 +462,50 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, return (abd); } +#ifdef _KERNEL +/* + * Allocate a scatter ABD structure from user pages. + */ +abd_t * +abd_alloc_from_pages(vm_page_t *pages, unsigned long offset, uint64_t size) +{ + VERIFY3U(size, <=, DMU_MAX_ACCESS); + ASSERT3U(offset, <, PAGE_SIZE); + ASSERT3P(pages, !=, NULL); + + abd_t *abd = abd_alloc_struct(size); + abd->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_FROM_PAGES; + abd->abd_size = size; + + if (size < PAGE_SIZE) { + /* + * We do not have a full page so we will just use a linear ABD. 
+ * We have to make sure to take into account the offset though. + * In all other cases our offset will be 0 as we are always + * PAGE_SIZE aligned. + */ + ASSERT3U(offset + size, <=, PAGE_SIZE); + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; + ABD_LINEAR_BUF(abd) = (char *)zfs_map_page(pages[0], + &abd->abd_u.abd_linear.sf) + offset; + } else { + ABD_SCATTER(abd).abd_offset = offset; + ASSERT0(ABD_SCATTER(abd).abd_offset); + + /* + * Setting the ABD's abd_chunks to point to the user pages. + */ + for (int i = 0; i < abd_chunkcnt_for_bytes(size); i++) + ABD_SCATTER(abd).abd_chunks[i] = pages[i]; + } + + abd_update_scatter_stats(abd, ABDSTAT_INCR); + + return (abd); +} + +#endif /* _KERNEL */ + /* * Initialize the abd_iter. */ @@ -470,6 +567,18 @@ abd_iter_map(struct abd_iter *aiter) if (abd_is_linear(abd)) { aiter->iter_mapsize = abd->abd_size - offset; paddr = ABD_LINEAR_BUF(abd); +#if defined(_KERNEL) + } else if (abd_is_from_pages(abd)) { + aiter->sf = NULL; + offset += ABD_SCATTER(abd).abd_offset; + size_t index = offset / PAGE_SIZE; + offset &= PAGE_MASK; + aiter->iter_mapsize = MIN(PAGE_SIZE - offset, + abd->abd_size - aiter->iter_pos); + paddr = zfs_map_page( + ABD_SCATTER(aiter->iter_abd).abd_chunks[index], + &aiter->sf); +#endif } else { offset += ABD_SCATTER(abd).abd_offset; paddr = ABD_SCATTER(abd).abd_chunks[offset >> PAGE_SHIFT]; @@ -492,6 +601,14 @@ abd_iter_unmap(struct abd_iter *aiter) ASSERT3U(aiter->iter_mapsize, >, 0); } +#if defined(_KERNEL) + if (abd_is_from_pages(aiter->iter_abd) && + !abd_is_linear_page(aiter->iter_abd)) { + ASSERT3P(aiter->sf, !=, NULL); + zfs_unmap_page(aiter->sf); + } +#endif + aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } @@ -501,3 +618,68 @@ abd_cache_reap_now(void) { kmem_cache_reap_soon(abd_chunk_cache); } + +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will alloate a raw buffer + * whose contents are undefined. 
To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + + if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + +#ifdef ZFS_DEBUG + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + if (!abd_is_linear(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will + * no change the contents of the ABD and will ASSERT that you didn't modify + * the buffer since it was borrowed. If you want any changes you made to buf to + * be copied back to abd, use abd_return_buf_copy() instead. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index 30983b13f7d1..fb2b4bcb01fc 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -832,6 +832,35 @@ SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, " new top-level vdevs. 
(LEGACY)"); /* END CSTYLED */ +int +param_set_direct_write_verify_pct(SYSCTL_HANDLER_ARGS) +{ + int val; + int err; + + val = zfs_vdev_direct_write_verify_pct; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val > 100 || val < 0) + return (SET_ERROR(EINVAL)); + + zfs_vdev_direct_write_verify_pct = val; + + return (0); +} + +/* BEGIN CSTYLED */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, direct_write_verify_pct, + CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, + &zfs_vdev_direct_write_verify_pct, + sizeof (zfs_vdev_direct_write_verify_pct), + param_set_direct_write_verify_pct, "IU", + "Percentage of Direct I/O writes per top-level VDEV for checksum" + " verification to be performed"); +/* END CSTYLED */ + /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. diff --git a/module/os/freebsd/zfs/zfs_racct.c b/module/os/freebsd/zfs/zfs_racct.c index 883255bc1901..2989a9af9235 100644 --- a/module/os/freebsd/zfs/zfs_racct.c +++ b/module/os/freebsd/zfs/zfs_racct.c @@ -27,7 +27,7 @@ #include void -zfs_racct_read(uint64_t size, uint64_t iops) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { curthread->td_ru.ru_inblock += iops; #ifdef RACCT @@ -40,10 +40,12 @@ zfs_racct_read(uint64_t size, uint64_t iops) #else (void) size; #endif /* RACCT */ + + spa_iostats_read_add(spa, size, iops, flags); } void -zfs_racct_write(uint64_t size, uint64_t iops) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { curthread->td_ru.ru_oublock += iops; #ifdef RACCT @@ -56,4 +58,6 @@ zfs_racct_write(uint64_t size, uint64_t iops) #else (void) size; #endif /* RACCT */ + + spa_iostats_write_add(spa, size, iops, flags); } diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index d9a8c8a0d769..68d56d5d014d 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ 
b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -104,14 +104,6 @@ VFS_SMR_DECLARE; #define NDFREE_PNBUF(ndp) NDFREE((ndp), NDF_ONLY_PNBUF) #endif -#if __FreeBSD_version >= 1300047 -#define vm_page_wire_lock(pp) -#define vm_page_wire_unlock(pp) -#else -#define vm_page_wire_lock(pp) vm_page_lock(pp) -#define vm_page_wire_unlock(pp) vm_page_unlock(pp) -#endif - #ifdef DEBUG_VFS_LOCKS #define VNCHECKREF(vp) \ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ @@ -450,19 +442,6 @@ page_hold(vnode_t *vp, int64_t start) } #endif -static void -page_unhold(vm_page_t pp) -{ - - vm_page_wire_lock(pp); -#if __FreeBSD_version >= 1300035 - vm_page_unwire(pp, PQ_ACTIVE); -#else - vm_page_unhold(pp); -#endif - vm_page_wire_unlock(pp); -} - /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: @@ -4265,7 +4244,7 @@ zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, * but that would make the locking messier */ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, - len, commit, NULL, NULL); + len, commit, B_FALSE, NULL, NULL); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { @@ -4400,12 +4379,37 @@ ioflags(int ioflags) flags |= O_APPEND; if (ioflags & IO_NDELAY) flags |= O_NONBLOCK; + if (ioflags & IO_DIRECT) + flags |= O_DIRECT; if (ioflags & IO_SYNC) flags |= O_SYNC; return (flags); } +static int +zfs_freebsd_read_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, + int ioflag, cred_t *cr) +{ + int ret; + int flags = ioflag; + + ASSERT3U(rw, ==, UIO_READ); + + /* On error, return to fallback to the buffred path */ + ret = zfs_setup_direct(zp, uio, rw, &flags); + if (ret) + return (ret); + + ASSERT(uio->uio_extflg & UIO_DIRECT); + + ret = zfs_read(zp, uio, flags, cr); + + zfs_uio_free_dio_pages(uio, rw); + + return (ret); +} + #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; @@ -4419,9 +4423,87 @@ static int zfs_freebsd_read(struct vop_read_args 
*ap) { zfs_uio_t uio; + int error = 0; + znode_t *zp = VTOZ(ap->a_vp); + int ioflag = ioflags(ap->a_ioflag); + zfs_uio_init(&uio, ap->a_uio); - return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), - ap->a_cred)); + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(zp, ioflag, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + error = + zfs_freebsd_read_direct(zp, &uio, UIO_READ, ioflag, + ap->a_cred); + /* + * XXX We occasionally get an EFAULT for Direct I/O reads on + * FreeBSD 13. This still needs to be resolved. The EFAULT comes + * from: + * zfs_uio_get__dio_pages_alloc() -> + * zfs_uio_get_dio_pages_impl() -> + * zfs_uio_iov_step() -> + * zfs_uio_get_user_pages(). + * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O + * read fails to map in the user pages (returning EFAULT) the + * Direct I/O request is broken up into two separate IO requests + * and issued separately using Direct I/O. + */ +#ifdef ZFS_DEBUG + if (error == EFAULT) { +#if 0 + printf("%s(%d): Direct I/O read returning EFAULT " + "uio = %p, zfs_uio_offset(uio) = %lu " + "zfs_uio_resid(uio) = %lu\n", + __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), + zfs_uio_resid(&uio)); +#endif + } + +#endif + + /* + * On error we will return unless the error is EAGAIN, which + * just tells us to fallback to the buffered path. 
+ */ + if (error != EAGAIN) + return (error); + else + ioflag &= ~O_DIRECT; + } + + + ASSERT(direct == ZFS_DIRECT_IO_DISABLED || + (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN)); + + error = zfs_read(zp, &uio, ioflag, ap->a_cred); + + return (error); +} + +static int +zfs_freebsd_write_direct(znode_t *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, + int ioflag, cred_t *cr) +{ + int ret; + int flags = ioflag; + + ASSERT3U(rw, ==, UIO_WRITE); + + /* On error, return to fallback to the buffred path */ + ret = zfs_setup_direct(zp, uio, rw, &flags); + if (ret) + return (ret); + + ASSERT(uio->uio_extflg & UIO_DIRECT); + + ret = zfs_write(zp, uio, flags, cr); + + zfs_uio_free_dio_pages(uio, rw); + + return (ret); } #ifndef _SYS_SYSPROTO_H_ @@ -4437,9 +4519,39 @@ static int zfs_freebsd_write(struct vop_write_args *ap) { zfs_uio_t uio; + int error = 0; + znode_t *zp = VTOZ(ap->a_vp); + int ioflag = ioflags(ap->a_ioflag); + zfs_uio_init(&uio, ap->a_uio); - return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), - ap->a_cred)); + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(zp, ioflag, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + error = + zfs_freebsd_write_direct(zp, &uio, UIO_WRITE, ioflag, + ap->a_cred); + + /* + * On error we will return unless the error is EAGAIN, which + * just tells us to fallback to the buffered path. 
+ */ + if (error != EAGAIN) + return (error); + else + ioflag &= ~O_DIRECT; + + } + + ASSERT(direct == ZFS_DIRECT_IO_DISABLED || + (direct == ZFS_DIRECT_IO_ENABLED && error == EAGAIN)); + + error = zfs_write(zp, &uio, ioflag, ap->a_cred); + + return (error); } #if __FreeBSD_version >= 1300102 diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 6a7c2d2811b1..c2460e6f56a8 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -915,6 +915,7 @@ zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) if (commit) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); + return (error); } diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index cee7410c8833..f57dd286ad5e 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -192,10 +192,11 @@ static int zfs_abd_scatter_min_size = 512 * 3; abd_t *abd_zero_scatter = NULL; struct page; + /* - * _KERNEL - Will point to ZERO_PAGE if it is available or it will be - * an allocated zero'd PAGESIZE buffer. - * Userspace - Will be an allocated zero'ed PAGESIZE buffer. + * _KERNEL - Will point to ZERO_PAGE if it is available or it will be + * an allocated zero'd PAGESIZE buffer. + * Userspace -> Will be an allocated zero'ed PAGESIZE buffer. * * abd_zero_page is assigned to each of the pages of abd_zero_scatter. */ @@ -462,14 +463,21 @@ abd_free_chunks(abd_t *abd) if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - abd_for_each_sg(abd, sg, nr_pages, i) { - page = sg_page(sg); - abd_unmark_zfs_page(page); - order = compound_order(page); - __free_pages(page, order); - ASSERT3U(sg->length, <=, PAGE_SIZE << order); - ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + /* + * Scatter ABDs may be constructed by abd_alloc_from_pages() from + * an array of pages. In which case they should not be freed. 
+ */ + if (!abd_is_from_pages(abd)) { + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } } + abd_free_sg_table(abd); } @@ -528,6 +536,8 @@ abd_alloc_zero_scatter(void) #define zfs_kmap_atomic(chunk) ((void *)chunk) #define zfs_kunmap_atomic(addr) do { (void)(addr); } while (0) +#define zfs_kmap(chunk) ((void *)chunk) +#define zfs_kunmap(chunk) ((void *)chunk) #define local_irq_save(flags) do { (void)(flags); } while (0) #define local_irq_restore(flags) do { (void)(flags); } while (0) #define nth_page(pg, i) \ @@ -688,17 +698,19 @@ abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) void abd_verify_scatter(abd_t *abd) { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); ASSERT3U(ABD_SCATTER(abd).abd_offset, <, ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; + +#ifdef ZFS_DEBUG + struct scatterlist *sg = NULL; + size_t n = ABD_SCATTER(abd).abd_nents; + int i = 0; + abd_for_each_sg(abd, sg, n, i) { ASSERT3P(sg_page(sg), !=, NULL); } +#endif } static void @@ -828,6 +840,11 @@ abd_free_linear_page(abd_t *abd) { /* Transform it back into a scatter ABD for freeing */ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + + /* When backed by user page unmap it */ + if (abd_is_from_pages(abd)) + zfs_kunmap(sg_page(sg)); + abd->abd_flags &= ~ABD_FLAG_LINEAR; abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; ABD_SCATTER(abd).abd_nents = 1; @@ -838,6 +855,72 @@ abd_free_linear_page(abd_t *abd) abd_update_scatter_stats(abd, ABDSTAT_DECR); } +#ifdef _KERNEL +/* + * Allocate a scatter ABD structure from user pages. The pages must be + * pinned with get_user_pages, or similiar, but need not be mapped via + * the kmap interfaces. 
+ */ +abd_t * +abd_alloc_from_pages(struct page **pages, unsigned long offset, uint64_t size) +{ + uint_t npages = DIV_ROUND_UP(size, PAGE_SIZE); + struct sg_table table; + + VERIFY3U(size, <=, DMU_MAX_ACCESS); + ASSERT3U(offset, <, PAGE_SIZE); + ASSERT3P(pages, !=, NULL); + + /* + * Even if this buf is filesystem metadata, we only track that we + * own the underlying data buffer, which is not true in this case. + * Therefore, we don't ever use ABD_FLAG_META here. + */ + abd_t *abd = abd_alloc_struct(0); + abd->abd_flags |= ABD_FLAG_FROM_PAGES | ABD_FLAG_OWNER; + abd->abd_size = size; + + while (sg_alloc_table_from_pages(&table, pages, npages, offset, + size, __GFP_NOWARN | GFP_NOIO) != 0) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + if (size < PAGE_SIZE) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's constructed + * from a user page can be represented this way as long as the + * page is mapped to a virtual address. This allows us to + * apply an offset in to the mapped page. + * + * Note that kmap() must be used, not kmap_atomic(), because + * the mapping needs to bet set up on all CPUs. Using kmap() + * also enables the user of highmem pages when required. 
+ */ + ASSERT3U(offset + size, <=, PAGE_SIZE); + abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + zfs_kmap(sg_page(table.sgl)); + ABD_LINEAR_BUF(abd) = sg_virt(table.sgl); + } else { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + ABD_SCATTER(abd).abd_offset = offset; + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + + ASSERT0(ABD_SCATTER(abd).abd_offset); + } + + abd_update_scatter_stats(abd, ABDSTAT_INCR); + + return (abd); +} + +#endif /* _KERNEL */ + /* * If we're going to use this ABD for doing I/O using the block layer, the * consumer of the ABD data doesn't care if it's scattered or not, and we don't @@ -887,6 +970,9 @@ abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off, ABD_SCATTER(abd).abd_offset = new_offset; ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + if (abd_is_from_pages(sabd)) + abd->abd_flags |= ABD_FLAG_FROM_PAGES; + return (abd); } @@ -1014,6 +1100,115 @@ abd_cache_reap_now(void) { } +/* + * Borrow a raw buffer from an ABD without copying the contents of the ABD + * into the buffer. If the ABD is scattered, this will allocate a raw buffer + * whose contents are undefined. To copy over the existing data in the ABD, use + * abd_borrow_buf_copy() instead. + */ +void * +abd_borrow_buf(abd_t *abd, size_t n) +{ + void *buf; + abd_verify(abd); + + /* + * In the event the ABD is composed of a single user page from Direct + * I/O we can not direclty return the raw buffer. This is a consequence + * of not being able to write protect the page and the contents of the + * page can be changed at any time by the user. 
+ */ + if (abd_is_from_pages(abd)) { + buf = zio_buf_alloc(n); + } else if (abd_is_linear(abd)) { + buf = abd_to_buf(abd); + } else { + buf = zio_buf_alloc(n); + } + +#ifdef ZFS_DEBUG + (void) zfs_refcount_add_many(&abd->abd_children, n, buf); +#endif + return (buf); +} + +void * +abd_borrow_buf_copy(abd_t *abd, size_t n) +{ + void *buf = abd_borrow_buf(abd, n); + + /* + * In the event the ABD is composed of a single user page from Direct + * I/O we must make sure copy the data over into the newly allocated + * buffer. This is a consequence of the fact that we can not write + * protect the user page and there is a risk the contents of the page + * could be changed by the user at any moment. + */ + if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { + abd_copy_to_buf(buf, abd, n); + } + return (buf); +} + +/* + * Return a borrowed raw buffer to an ABD. If the ABD is scatterd, this will + * not change the contents of the ABD. If you want any changes you made to + * buf to be copied back to abd, use abd_return_buf_copy() instead. If the + * ABD is not constructed from user pages for Direct I/O then an ASSERT + * checks to make sure the contents of buffer have not changed since it was + * borrowed. We can not ASSERT that the contents of the buffer have not changed + * if it is composed of user pages because the pages can not be placed under + * write protection and the user could have possibly changed the contents in + * the pages at any time. + */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); +#ifdef ZFS_DEBUG + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +#endif + if (abd_is_from_pages(abd)) { + zio_buf_free(buf, n); + } else if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else if (abd_is_gang(abd)) { +#ifdef ZFS_DEBUG + /* + * We have to be careful with gang ABD's that we do not ASSERT0 + * for any ABD's that contain user pages from Direct I/O. 
In + * order to handle this, we just iterate through the gang ABD + * and only verify ABDs that are not from user pages. + */ + void *cmp_buf = buf; + + for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + if (!abd_is_from_pages(cabd)) { + ASSERT0(abd_cmp_buf(cabd, cmp_buf, + cabd->abd_size)); + } + cmp_buf = (char *)cmp_buf + cabd->abd_size; + } +#endif + zio_buf_free(buf, n); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd) || abd_is_from_pages(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + #if defined(_KERNEL) /* diff --git a/module/os/linux/zfs/vdev_os.c b/module/os/linux/zfs/vdev_os.c new file mode 100644 index 000000000000..3bd7296da97e --- /dev/null +++ b/module/os/linux/zfs/vdev_os.c @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2022 by Triad National Security, LLC. 
+ */ + +#include + +#ifdef _KERNEL + +int +param_set_direct_write_verify_pct(const char *buf, zfs_kernel_param_t *kp) +{ + uint_t val; + int error; + + error = kstrtouint(buf, 0, &val); + if (error < 0) + return (SET_ERROR(error)); + + if (val > 100) + return (SET_ERROR(-EINVAL)); + + error = param_set_uint(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_racct.c b/module/os/linux/zfs/zfs_racct.c index ce623ef9d185..ce197caa45f0 100644 --- a/module/os/linux/zfs/zfs_racct.c +++ b/module/os/linux/zfs/zfs_racct.c @@ -25,14 +25,35 @@ #include +#ifdef _KERNEL +#include + +void +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + task_io_account_read(size); + spa_iostats_read_add(spa, size, iops, flags); +} + void -zfs_racct_read(uint64_t size, uint64_t iops) +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { - (void) size, (void) iops; + task_io_account_write(size); + spa_iostats_write_add(spa, size, iops, flags); } +#else + void -zfs_racct_write(uint64_t size, uint64_t iops) +zfs_racct_read(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) { - (void) size, (void) iops; + (void) spa, (void) size, (void) iops, (void) flags; } + +void +zfs_racct_write(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + (void) spa, (void) size, (void) iops, (void) flags; +} + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_uio.c b/module/os/linux/zfs/zfs_uio.c index c2ed67c438c6..272bf41e2ac3 100644 --- a/module/os/linux/zfs/zfs_uio.c +++ b/module/os/linux/zfs/zfs_uio.c @@ -41,12 +41,19 @@ #ifdef _KERNEL +#include +#include +#include #include #include #include #include +#include +#include #include #include +#include +#include /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -327,8 +334,13 @@ EXPORT_SYMBOL(zfs_uiomove); int zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio) { - if (uio->uio_segflg == 
UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) { - /* There's never a need to fault in kernel pages */ + if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC || + (uio->uio_extflg & UIO_DIRECT)) { + /* + * There's never a need to fault in kernel pages or Direct I/O + * write pages. Direct I/O write pages have been pinned in so + * there is never a time for these pages a fault will occur. + */ return (0); #if defined(HAVE_VFS_IOV_ITER) } else if (uio->uio_segflg == UIO_ITER) { @@ -437,9 +449,283 @@ zfs_uioskip(zfs_uio_t *uio, size_t n) uio->uio_iovcnt--; } } + uio->uio_loffset += n; uio->uio_resid -= n; } EXPORT_SYMBOL(zfs_uioskip); +/* + * Check if the uio is page-aligned in memory. + */ +boolean_t +zfs_uio_page_aligned(zfs_uio_t *uio) +{ + boolean_t aligned = B_TRUE; + + if (uio->uio_segflg == UIO_USERSPACE || + uio->uio_segflg == UIO_SYSSPACE) { + const struct iovec *iov = uio->uio_iov; + size_t skip = uio->uio_skip; + + for (int i = uio->uio_iovcnt; i > 0; iov++, i--) { + unsigned long addr = + (unsigned long)(iov->iov_base + skip); + size_t size = iov->iov_len - skip; + if ((addr & (PAGE_SIZE - 1)) || + (size & (PAGE_SIZE - 1))) { + aligned = B_FALSE; + break; + } + skip = 0; + } +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + unsigned long alignment = + iov_iter_alignment(uio->uio_iter); + aligned = IS_P2ALIGNED(alignment, PAGE_SIZE); +#endif + } else { + /* Currently not supported */ + aligned = B_FALSE; + } + + return (aligned); +} + + +#if defined(HAVE_ZERO_PAGE_GPL_ONLY) || !defined(_LP64) +#define ZFS_MARKEED_PAGE 0x0 +#define IS_ZFS_MARKED_PAGE(_p) 0 +#define zfs_mark_page(_p) +#define zfs_unmark_page(_p) +#define IS_ZERO_PAGE(_p) 0 + +#else +/* + * Mark pages to know if they were allocated to replace ZERO_PAGE() for + * Direct I/O writes. 
+ */ +#define ZFS_MARKED_PAGE 0x5a465350414745 /* ASCII: ZFSPAGE */ +#define IS_ZFS_MARKED_PAGE(_p) \ + (page_private(_p) == (unsigned long)ZFS_MARKED_PAGE) +#define IS_ZERO_PAGE(_p) ((_p) == ZERO_PAGE(0)) + +static inline void +zfs_mark_page(struct page *page) +{ + ASSERT3P(page, !=, NULL); + get_page(page); + SetPagePrivate(page); + set_page_private(page, ZFS_MARKED_PAGE); +} + +static inline void +zfs_unmark_page(struct page *page) +{ + ASSERT3P(page, !=, NULL); + set_page_private(page, 0UL); + ClearPagePrivate(page); + put_page(page); +} +#endif /* HAVE_ZERO_PAGE_GPL_ONLY || !_LP64 */ + +static void +zfs_uio_dio_check_for_zero_page(zfs_uio_t *uio) +{ + ASSERT3P(uio->uio_dio.pages, !=, NULL); + + for (int i = 0; i < uio->uio_dio.npages; i++) { + struct page *p = uio->uio_dio.pages[i]; + lock_page(p); + + if (IS_ZERO_PAGE(p)) { + /* + * If the user page points the kernels ZERO_PAGE() a + * new zero filled page will just be allocated so the + * contents of the page can not be changed by the user + * while a Direct I/O write is taking place. + */ + gfp_t gfp_zero_page = __GFP_NOWARN | GFP_NOIO | + __GFP_ZERO | GFP_KERNEL; + + ASSERT0(IS_ZFS_MARKED_PAGE(p)); + unlock_page(p); + put_page(p); + + p = __page_cache_alloc(gfp_zero_page); + zfs_mark_page(p); + } else { + unlock_page(p); + } + } +} + +void +zfs_uio_free_dio_pages(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3P(uio->uio_dio.pages, !=, NULL); + + for (int i = 0; i < uio->uio_dio.npages; i++) { + struct page *p = uio->uio_dio.pages[i]; + + if (IS_ZFS_MARKED_PAGE(p)) { + zfs_unmark_page(p); + __free_page(p); + continue; + } + + put_page(p); + } + + vmem_free(uio->uio_dio.pages, + uio->uio_dio.npages * sizeof (struct page *)); +} + +/* + * zfs_uio_iov_step() is just a modified version of the STEP function of Linux's + * iov_iter_get_pages(). 
+ */ +static size_t +zfs_uio_iov_step(struct iovec v, zfs_uio_rw_t rw, zfs_uio_t *uio, int *numpages) +{ + unsigned long addr = (unsigned long)(v.iov_base); + size_t len = v.iov_len; + int n = DIV_ROUND_UP(len, PAGE_SIZE); + + int res = zfs_get_user_pages(P2ALIGN(addr, PAGE_SIZE), n, + rw == UIO_READ, &uio->uio_dio.pages[uio->uio_dio.npages]); + if (res < 0) { + *numpages = -1; + return (-res); + } else if (len != (res * PAGE_SIZE)) { + *numpages = -1; + return (len); + } + + ASSERT3S(len, ==, res * PAGE_SIZE); + *numpages = res; + return (len); +} + +static int +zfs_uio_get_dio_pages_iov(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + const struct iovec *iovp = uio->uio_iov; + size_t skip = uio->uio_skip; + size_t wanted, maxsize; + + ASSERT(uio->uio_segflg != UIO_SYSSPACE); + wanted = maxsize = uio->uio_resid - skip; + + for (int i = 0; i < uio->uio_iovcnt; i++) { + struct iovec iov; + int numpages = 0; + + if (iovp->iov_len == 0) { + iovp++; + skip = 0; + continue; + } + iov.iov_len = MIN(maxsize, iovp->iov_len - skip); + iov.iov_base = iovp->iov_base + skip; + ssize_t left = zfs_uio_iov_step(iov, rw, uio, &numpages); + + if (numpages == -1) { + return (left); + } + + ASSERT3U(left, ==, iov.iov_len); + uio->uio_dio.npages += numpages; + maxsize -= iov.iov_len; + wanted -= left; + skip = 0; + iovp++; + } + + ASSERT0(wanted); + return (0); +} + +#if defined(HAVE_VFS_IOV_ITER) +static int +zfs_uio_get_dio_pages_iov_iter(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + size_t skip = uio->uio_skip; + size_t wanted = uio->uio_resid - uio->uio_skip; + size_t rollback = 0; + size_t cnt; + size_t maxpages = DIV_ROUND_UP(wanted, PAGE_SIZE); + + while (wanted) { +#if defined(HAVE_IOV_ITER_GET_PAGES2) + cnt = iov_iter_get_pages2(uio->uio_iter, + &uio->uio_dio.pages[uio->uio_dio.npages], + wanted, maxpages, &skip); +#else + cnt = iov_iter_get_pages(uio->uio_iter, + &uio->uio_dio.pages[uio->uio_dio.npages], + wanted, maxpages, &skip); +#endif + if (cnt < 0) { + iov_iter_revert(uio->uio_iter, 
rollback); + return (SET_ERROR(-cnt)); + } + uio->uio_dio.npages += DIV_ROUND_UP(cnt, PAGE_SIZE); + rollback += cnt; + wanted -= cnt; + skip = 0; + iov_iter_advance(uio->uio_iter, cnt); + + } + ASSERT3U(rollback, ==, uio->uio_resid - uio->uio_skip); + iov_iter_revert(uio->uio_iter, rollback); + + return (0); +} +#endif /* HAVE_VFS_IOV_ITER */ + +/* + * This function maps user pages into the kernel. In the event that the user + * pages were not mapped successfully an error value is returned. + * + * On success, 0 is returned. + */ +int +zfs_uio_get_dio_pages_alloc(zfs_uio_t *uio, zfs_uio_rw_t rw) +{ + int error = 0; + size_t npages = DIV_ROUND_UP(uio->uio_resid, PAGE_SIZE); + size_t size = npages * sizeof (struct page *); + + if (uio->uio_segflg == UIO_USERSPACE) { + uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); + error = zfs_uio_get_dio_pages_iov(uio, rw); + ASSERT3S(uio->uio_dio.npages, ==, npages); +#if defined(HAVE_VFS_IOV_ITER) + } else if (uio->uio_segflg == UIO_ITER) { + uio->uio_dio.pages = vmem_alloc(size, KM_SLEEP); + error = zfs_uio_get_dio_pages_iov_iter(uio, rw); + ASSERT3S(uio->uio_dio.npages, ==, npages); +#endif + } else { + return (SET_ERROR(EOPNOTSUPP)); + } + + if (error) { + vmem_free(uio->uio_dio.pages, size); + return (error); + } + + if (rw == UIO_WRITE) { + zfs_uio_dio_check_for_zero_page(uio); + } + + uio->uio_extflg |= UIO_DIRECT; + + return (0); +} + #endif /* _KERNEL */ diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 2015c20d7340..d5e22c00bcf2 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "zfs_comutil.h" enum { diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 1cecad9f7755..d8109698224e 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -227,7 +227,8 @@ zfs_close(struct inode *ip, int flag, cred_t *cr) #if 
defined(_KERNEL) -static int zfs_fillpage(struct inode *ip, struct page *pp); +static int zfs_fillpage(struct inode *ip, struct page *pp, + boolean_t rangelock_held); /* * When a file is memory mapped, we must keep the IO data synchronized @@ -295,13 +296,14 @@ mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { + /* * If filemap_fault() retries there exists a window * where the page will be unlocked and not up to date. * In this case we must try and fill the page. */ if (unlikely(!PageUptodate(pp))) { - error = zfs_fillpage(ip, pp); + error = zfs_fillpage(ip, pp, B_TRUE); if (error) { unlock_page(pp); put_page(pp); @@ -3853,7 +3855,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, } zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, - for_sync ? zfs_putpage_sync_commit_cb : + B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); @@ -3994,20 +3996,68 @@ zfs_inactive(struct inode *ip) * Fill pages with data from the disk. */ static int -zfs_fillpage(struct inode *ip, struct page *pp) +zfs_fillpage(struct inode *ip, struct page *pp, boolean_t rangelock_held) { + znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; + zfs_locked_range_t *lr = NULL; ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; + /* + * It is important to hold the rangelock here because it is possible + * a Direct I/O write might be taking place at the same time that a + * page is being faulted in through filemap_fault(). With a Direct I/O + * write, db->db_data will be set to NULL either in: + * 1. dmu_write_direct() -> dmu_buf_will_not_fill() -> + * dmu_buf_will_fill() -> dbuf_noread() -> dbuf_clear_data() + * 2. 
dmu_write_direct_done() + * If the rangelock is not held, then there is a race between faulting + * in a page and writing out a Direct I/O write. Without the rangelock + * a NULL pointer dereference can occur in dmu_read_impl() for + * db->db_data during the mempcy operation. + * + * Another important note here is we have to check to make sure the + * rangelock is not already held from mappedread() -> zfs_fillpage(). + * filemap_fault() will first add the page to the inode address_space + * mapping and then will drop the page lock. This leaves open a window + * for mappedread() to begin. In this case he page lock and rangelock, + * are both held and it might have to call here if the page is not + * up to date. In this case the rangelock can not be held twice or a + * deadlock can happen. So the rangelock only needs to be aquired if + * zfs_fillpage() is being called by zfs_getpage(). + * + * Finally it is also important to drop the page lock before grabbing + * the rangelock to avoid another deadlock between here and + * zfs_write() -> update_pages(). update_pages() holds both the + * rangelock and the page lock. + */ + if (rangelock_held == B_FALSE) { + /* + * First try grabbing the rangelock. If that can not be done + * the page lock must be dropped before grabbing the rangelock + * to avoid a deadlock with update_pages(). See comment above. 
+ */ + lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len, + RL_READER); + if (lr == NULL) { + get_page(pp); + unlock_page(pp); + lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, + io_len, RL_READER); + lock_page(pp); + put_page(pp); + } + } + void *va = kmap(pp); - int error = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, io_off, + int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); @@ -4025,6 +4075,10 @@ zfs_fillpage(struct inode *ip, struct page *pp) SetPageUptodate(pp); } + + if (rangelock_held == B_FALSE) + zfs_rangelock_exit(lr); + return (error); } @@ -4049,7 +4103,7 @@ zfs_getpage(struct inode *ip, struct page *pp) if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - error = zfs_fillpage(ip, pp); + error = zfs_fillpage(ip, pp, B_FALSE); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 9dec52215c7c..d3fd4340e7ae 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -309,7 +309,7 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, } static ssize_t -zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +zpl_iter_read_buffered(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; @@ -322,12 +322,54 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) crhold(cr); cookie = spl_fstrans_mark(); + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); + + return (read); +} + +static ssize_t +zpl_iter_read_direct(struct kiocb 
*kiocb, struct iov_iter *to) +{ + cred_t *cr = CRED(); + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + ssize_t count = iov_iter_count(to); + int flags = filp->f_flags | zfs_io_flags(kiocb); + zfs_uio_t uio; + ssize_t ret; + + zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); + + /* On error, return to fallback to the buffered path. */ + ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + + crhold(cr); + fstrans_cookie_t cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(ip), &uio, flags, cr); spl_fstrans_unmark(cookie); crfree(cr); + zfs_uio_free_dio_pages(&uio, UIO_READ); + if (error < 0) return (error); @@ -339,6 +381,31 @@ zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) return (read); } +static ssize_t +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +{ + struct inode *ip = kiocb->ki_filp->f_mapping->host; + struct file *filp = kiocb->ki_filp; + int flags = filp->f_flags | zfs_io_flags(kiocb); + int error = 0; + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t read = zpl_iter_read_direct(kiocb, to); + + if (read >= 0 || read != -EAGAIN) + return (read); + + /* Otherwise fallback to buffered read */ + } + + return (zpl_iter_read_buffered(kiocb, to)); +} + static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) @@ -365,44 +432,118 @@ zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, } static ssize_t -zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) +zpl_iter_write_buffered(struct kiocb *kiocb, struct iov_iter *from) { cred_t *cr = CRED(); - fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; + size_t wrote; + size_t count = iov_iter_count(from); 
+ zfs_uio_t uio; - size_t count = 0; - ssize_t ret; + zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); - ret = zpl_generic_write_checks(kiocb, from, &count); - if (ret) - return (ret); + crhold(cr); + fstrans_cookie_t cookie = spl_fstrans_mark(); + + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + if (wrote > 0) + iov_iter_advance(from, wrote); + + return (wrote); +} +static ssize_t +zpl_iter_write_direct(struct kiocb *kiocb, struct iov_iter *from) +{ + cred_t *cr = CRED(); + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + size_t wrote; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count = iov_iter_count(from); + + zfs_uio_t uio; zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); + /* On error, return to fallback to the buffered path. 
*/ + ssize_t ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + crhold(cr); - cookie = spl_fstrans_mark(); + fstrans_cookie_t cookie = spl_fstrans_mark(); - int error = -zfs_write(ITOZ(ip), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); spl_fstrans_unmark(cookie); crfree(cr); + zfs_uio_free_dio_pages(&uio, UIO_WRITE); + if (error < 0) return (error); - ssize_t wrote = count - uio.uio_resid; + wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } +static ssize_t +zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) +{ + struct inode *ip = kiocb->ki_filp->f_mapping->host; + struct file *filp = kiocb->ki_filp; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count = 0; + int error = 0; + + ssize_t ret = zpl_generic_write_checks(kiocb, from, &count); + if (ret) + return (ret); + + loff_t offset = kiocb->ki_pos; + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t wrote = zpl_iter_write_direct(kiocb, from); + + if (wrote >= 0 || wrote != -EAGAIN) { + return (wrote); + } + + /* + * If we are falling back to a buffered write, then the + * file position should not be updated at this point. 
+ */ + ASSERT3U(offset, ==, kiocb->ki_pos); + } + + return (zpl_iter_write_buffered(kiocb, from)); +} + #else /* !HAVE_VFS_RW_ITERATE */ static ssize_t -zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, +zpl_aio_read_buffered(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); @@ -422,8 +563,9 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, crhold(cr); cookie = spl_fstrans_mark(); + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + flags, cr); spl_fstrans_unmark(cookie); crfree(cr); @@ -440,39 +582,159 @@ zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, } static ssize_t -zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, +zpl_aio_read_direct(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; + int flags = filp->f_flags | zfs_io_flags(kiocb); size_t count; ssize_t ret; - ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); - ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + /* On error, return to fallback to the buffered path */ + ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_READ, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int error = -zfs_read(ITOZ(ip), &uio, flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + zfs_uio_free_dio_pages(&uio, UIO_READ); + + if (error < 0) + return (error); + + ssize_t read = count - uio.uio_resid; + kiocb->ki_pos += read; + + zpl_file_accessed(filp); + + 
return (read); +} + +static ssize_t +zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct inode *ip = kiocb->ki_filp->f_mapping->host; + struct file *filp = kiocb->ki_filp; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t count; + ssize_t ret; + int error = 0; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); - kiocb->ki_pos = pos; + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t read = zpl_aio_read_direct(kiocb, iov, nr_segs, pos); + + if (read >= 0 || read != -EAGAIN) + return (read); + + /* Otherwise fallback to buffered read */ + } + + return (zpl_aio_read_buffered(kiocb, iov, nr_segs, pos)); +} + +static ssize_t +zpl_aio_write_buffered(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + size_t count; + ssize_t ret; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (ret) + return (ret); + + zfs_uio_t uio; + zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, + count, 0); + + crhold(cr); + cookie = spl_fstrans_mark(); + + int flags = (filp->f_flags | zfs_io_flags(kiocb)) & ~O_DIRECT; + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); + + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error < 0) + return (error); + + ssize_t wrote = count - uio.uio_resid; + kiocb->ki_pos += wrote; + + return (wrote); +} + +static ssize_t +zpl_aio_write_direct(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + int flags = filp->f_flags | 
zfs_io_flags(kiocb); + size_t count; + ssize_t ret; + + ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (ret) + return (ret); zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); + /* On error, return to fallback to the buffered path. */ + ret = zfs_setup_direct(ITOZ(ip), &uio, UIO_WRITE, &flags); + if (ret) + return (-ret); + + ASSERT(uio.uio_extflg & UIO_DIRECT); + crhold(cr); cookie = spl_fstrans_mark(); - int error = -zfs_write(ITOZ(ip), &uio, - filp->f_flags | zfs_io_flags(kiocb), cr); + int error = -zfs_write(ITOZ(ip), &uio, flags, cr); spl_fstrans_unmark(cookie); crfree(cr); + zfs_uio_free_dio_pages(&uio, UIO_WRITE); + if (error < 0) return (error); @@ -481,39 +743,89 @@ zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, return (wrote); } + +static ssize_t +zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + int flags = filp->f_flags | zfs_io_flags(kiocb); + size_t ocount; + size_t count; + ssize_t ret; + int error = 0; + + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); + if (ret) + return (ret); + + count = ocount; + + ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); + if (ret) + return (ret); + + kiocb->ki_pos = pos; + + zfs_direct_enabled_t direct = + zfs_check_direct_enabled(ITOZ(ip), flags, &error); + + if (direct == ZFS_DIRECT_IO_ERR) { + return (-error); + } else if (direct == ZFS_DIRECT_IO_ENABLED) { + ssize_t wrote = zpl_aio_write_direct(kiocb, iov, nr_segs, pos); + + if (wrote >= 0 || wrote != -EAGAIN) { + return (wrote); + } + + /* + * If we are falling back to a buffered write, then the + * file position should not be updated at this point. 
+ */ + ASSERT3U(pos, ==, kiocb->ki_pos); + } + + return (zpl_aio_write_buffered(kiocb, iov, nr_segs, pos)); +} + #endif /* HAVE_VFS_RW_ITERATE */ -#if defined(HAVE_VFS_RW_ITERATE) static ssize_t -zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter) +zpl_direct_IO_impl(void) { - if (rw == WRITE) - return (zpl_iter_write(kiocb, iter)); - else - return (zpl_iter_read(kiocb, iter)); + /* + * All O_DIRECT requests should be handled by + * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code + * should call the direct_IO address_space_operations function. We set + * this code path to be fatal if it is executed. + */ + VERIFY(0); + return (0); } + +#if defined(HAVE_VFS_RW_ITERATE) #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { - return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { - ASSERT3S(pos, ==, kiocb->ki_pos); - return (zpl_direct_IO_impl(rw, kiocb, iter)); + return (zpl_direct_IO_impl()); } #else -#error "Unknown direct IO interface" +#error "Unknown Direct I/O interface" #endif #else /* HAVE_VFS_RW_ITERATE */ @@ -523,26 +835,16 @@ static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - if (rw == WRITE) - return (zpl_aio_write(kiocb, iov, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iov, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct 
iov_iter *iter, loff_t pos) { - const struct iovec *iovp = iov_iter_iovec(iter); - unsigned long nr_segs = iter->nr_segs; - - ASSERT3S(pos, ==, kiocb->ki_pos); - if (rw == WRITE) - return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); - else - return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); + return (zpl_direct_IO_impl()); } #else -#error "Unknown direct IO interface" +#error "Unknown Direct I/O interface" #endif #endif /* HAVE_VFS_RW_ITERATE */ @@ -627,6 +929,7 @@ zpl_mmap(struct file *filp, struct vm_area_struct *vma) error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); + if (error) return (error); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 764993b45e7c..10ac13a898ce 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -395,6 +395,13 @@ zfs_prop_init(void) { NULL } }; + static const zprop_index_t direct_table[] = { + { "disabled", ZFS_DIRECT_DISABLED }, + { "standard", ZFS_DIRECT_STANDARD }, + { "always", ZFS_DIRECT_ALWAYS }, + { NULL } + }; + struct zfs_mod_supported_features *sfeatures = zfs_mod_list_supported(ZFS_SYSFS_DATASET_PROPERTIES); @@ -479,6 +486,10 @@ zfs_prop_init(void) ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "default | full | geom | dev | none", "VOLMODE", volmode_table, sfeatures); + zprop_register_index(ZFS_PROP_DIRECT, "direct", + ZFS_DIRECT_STANDARD, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "disabled | standard | always", "DIRECT", direct_table, + sfeatures); /* inherit index (boolean) properties */ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 2c0cda25dbc6..d5f9964fce82 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -89,8 +89,8 @@ * functions. * * As an additional feature, linear and scatter ABD's can be stitched together - * by using the gang ABD type (abd_alloc_gang_abd()). 
This allows for - * multiple ABDs to be viewed as a singular ABD. + * by using the gang ABD type (abd_alloc_gang()). This allows for multiple ABDs + * to be viewed as a singular ABD. * * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to * B_FALSE. @@ -109,11 +109,16 @@ void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + if (abd_is_from_pages(abd)) { + ASSERT3U(abd->abd_size, <=, DMU_MAX_ACCESS); + } else { + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + } ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | - ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD)); + ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD | + ABD_FLAG_FROM_PAGES)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { @@ -136,7 +141,7 @@ abd_verify(abd_t *abd) #endif } -static void +void abd_init_struct(abd_t *abd) { list_link_init(&abd->abd_gang_link); @@ -238,6 +243,7 @@ abd_free_linear(abd_t *abd) abd_free_linear_page(abd); return; } + if (abd->abd_flags & ABD_FLAG_META) { zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { @@ -520,6 +526,21 @@ abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size) */ abd->abd_flags |= ABD_FLAG_LINEAR; + /* + * User pages from Direct I/O requests may be in a single page + * (ABD_FLAG_LINEAR_PAGE), and we must make sure to still flag + * that here for abd. This is required because we have to be + * careful when borrowing the buffer from the ABD because we + * can not place user pages under write protection on Linux. + * See the comments in abd_os.c for abd_borrow_buf(), + * abd_borrow_buf_copy(), abd_return_buf() and + * abd_return_buf_copy(). 
+ */ + if (abd_is_from_pages(sabd)) { + abd->abd_flags |= ABD_FLAG_FROM_PAGES | + ABD_FLAG_LINEAR_PAGE; + } + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else if (abd_is_gang(sabd)) { size_t left = size; @@ -636,70 +657,6 @@ abd_to_buf(abd_t *abd) return (ABD_LINEAR_BUF(abd)); } -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } -#ifdef ZFS_DEBUG - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); -#endif - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. 
- */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); -#ifdef ZFS_DEBUG - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -#endif - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - void abd_release_ownership_of_buf(abd_t *abd) { diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6954051b1d19..4191e0eae6ee 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5872,7 +5872,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); - zfs_racct_read(size, 1); + zfs_racct_read(spa, size, 1, 0); } /* Check if the spa even has l2 configured */ diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 2ac058fd2c93..3ef5e324e5a7 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -211,8 +211,7 @@ dataset_kstats_rename(dataset_kstats_t *dk, const char *name) } void -dataset_kstats_update_write_kstats(dataset_kstats_t *dk, - int64_t nwritten) +dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) { ASSERT3S(nwritten, >=, 0); @@ -224,8 +223,7 @@ dataset_kstats_update_write_kstats(dataset_kstats_t *dk, } void -dataset_kstats_update_read_kstats(dataset_kstats_t *dk, - int64_t nread) +dataset_kstats_update_read_kstats(dataset_kstats_t *dk, int64_t nread) { ASSERT3S(nread, >=, 0); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5f3643f573f7..eab28647ae17 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -82,6 +82,13 @@ typedef struct dbuf_stats { */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; + 
/* + * Statistics for Direct I/O. + */ + kstat_named_t direct_mixed_io_read_wait; + kstat_named_t direct_mixed_io_write_wait; + kstat_named_t direct_sync_wait; + kstat_named_t direct_undirty; /* * Statistics about the dbuf hash table. */ @@ -130,6 +137,10 @@ dbuf_stats_t dbuf_stats = { { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, + { "direct_mixed_io_read_wait", KSTAT_DATA_UINT64 }, + { "direct_mixed_io_write_wait", KSTAT_DATA_UINT64 }, + { "direct_sync_wait", KSTAT_DATA_UINT64 }, + { "direct_undirty", KSTAT_DATA_UINT64 }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -151,6 +162,10 @@ struct { wmsum_t cache_total_evicts; wmsum_t cache_levels[DN_MAX_LEVELS]; wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; + wmsum_t direct_mixed_io_read_wait; + wmsum_t direct_mixed_io_write_wait; + wmsum_t direct_sync_wait; + wmsum_t direct_undirty; wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; @@ -629,7 +644,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db) * L2ARC. */ boolean_t -dbuf_is_l2cacheable(dmu_buf_impl_t *db) +dbuf_is_l2cacheable(dmu_buf_impl_t *db, blkptr_t *bp) { if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL || (db->db_objset->os_secondary_cache == @@ -637,10 +652,17 @@ dbuf_is_l2cacheable(dmu_buf_impl_t *db) if (l2arc_exclude_special == 0) return (B_TRUE); - blkptr_t *bp = db->db_blkptr; - if (bp == NULL || BP_IS_HOLE(bp)) + /* + * bp must be checked in the event it was passed from + * dbuf_read_impl() as the result of a the BP being set from + * a Direct I/O write in dbuf_read(). See comments in + * dbuf_read(). + */ + blkptr_t *db_bp = bp == NULL ? 
db->db_blkptr : bp; + + if (db_bp == NULL || BP_IS_HOLE(db_bp)) return (B_FALSE); - uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); + uint64_t vdev = DVA_GET_VDEV(db_bp->blk_dva); vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev; vdev_t *vd = NULL; @@ -887,6 +909,14 @@ dbuf_kstat_update(kstat_t *ksp, int rw) ds->cache_levels_bytes[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels_bytes[i]); } + ds->direct_mixed_io_read_wait.value.ui64 = + wmsum_value(&dbuf_sums.direct_mixed_io_read_wait); + ds->direct_mixed_io_write_wait.value.ui64 = + wmsum_value(&dbuf_sums.direct_mixed_io_write_wait); + ds->direct_sync_wait.value.ui64 = + wmsum_value(&dbuf_sums.direct_sync_wait); + ds->direct_undirty.value.ui64 = + wmsum_value(&dbuf_sums.direct_undirty); ds->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits); ds->hash_misses.value.ui64 = @@ -989,6 +1019,10 @@ dbuf_init(void) wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } + wmsum_init(&dbuf_sums.direct_mixed_io_read_wait, 0); + wmsum_init(&dbuf_sums.direct_mixed_io_write_wait, 0); + wmsum_init(&dbuf_sums.direct_sync_wait, 0); + wmsum_init(&dbuf_sums.direct_undirty, 0); wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); @@ -1061,6 +1095,10 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } + wmsum_fini(&dbuf_sums.direct_mixed_io_read_wait); + wmsum_fini(&dbuf_sums.direct_mixed_io_write_wait); + wmsum_fini(&dbuf_sums.direct_sync_wait); + wmsum_fini(&dbuf_sums.direct_undirty); wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); @@ -1231,8 +1269,9 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - ASSERT3P(db->db_buf, ==, NULL); - db->db.db_data = NULL; + /* Direct I/O writes may have data */ + if (db->db_buf == NULL) + db->db.db_data = NULL; if (db->db_state 
!= DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); @@ -1244,8 +1283,19 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(buf != NULL); + dbuf_dirty_record_t *dr_dio = NULL; db->db_buf = buf; + dr_dio = dbuf_get_dirty_direct(db); + + /* + * If there is a Direct I/O, set its data too. Then its state + * will be the same as if we did a ZIL dmu_sync(). + */ + if (dbuf_dirty_is_direct_write(db, dr_dio)) { + dr_dio->dt.dl.dr_data = db->db_buf; + } + ASSERT(buf->b_data != NULL); db->db.db_data = buf->b_data; } @@ -1378,6 +1428,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); + /* * All reads are synchronous, so we must have a hold on the dbuf */ @@ -1558,12 +1609,12 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) */ static int dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, - db_lock_type_t dblt, const void *tag) + db_lock_type_t dblt, blkptr_t *bp, const void *tag) { zbookmark_phys_t zb; uint32_t aflags = ARC_FLAG_NOWAIT; int err, zio_flags; - blkptr_t bp, *bpp = NULL; + blkptr_t *bpp = bp; ASSERT(!zfs_refcount_is_zero(&db->db_holds)); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1592,16 +1643,10 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, err = EIO; goto early_unlock; } - bp = dr->dt.dl.dr_overridden_by; - bpp = &bp; + bpp = &dr->dt.dl.dr_overridden_by; } } - if (bpp == NULL && db->db_blkptr != NULL) { - bp = *db->db_blkptr; - bpp = &bp; - } - err = dbuf_read_hole(db, dn, bpp); if (err == 0) goto early_unlock; @@ -1645,7 +1690,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, if (!DBUF_IS_CACHEABLE(db)) aflags |= ARC_FLAG_UNCACHED; - else if (dbuf_is_l2cacheable(db)) + else if (dbuf_is_l2cacheable(db, bpp)) aflags |= ARC_FLAG_L2CACHE; dbuf_add_ref(db, NULL); @@ -1653,17 +1698,19 @@ 
dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags, zio_flags = (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; - if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(bpp)) zio_flags |= ZIO_FLAG_RAW; + /* - * The zio layer will copy the provided blkptr later, but we have our - * own copy so that we can release the parent's rwlock. We have to - * do that so that if dbuf_read_done is called synchronously (on + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on * an l1 cache hit) we don't acquire the db_mtx while holding the * parent's rwlock, which would be a lock ordering violation. */ + blkptr_t copy = *bpp; dmu_buf_unlock_parent(db, dblt, tag); - return (arc_read(zio, db->db_objset->os_spa, bpp, + return (arc_read(zio, db->db_objset->os_spa, ©, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb)); @@ -1812,16 +1859,27 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags) DBUF_STAT_BUMP(hash_hits); } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) { boolean_t need_wait = B_FALSE; + blkptr_t *bp = NULL; + /* + * We have to be careful to only grab the dbuf block pointer + * after we have locked the parent. + */ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + /* + * If a Direct I/O write has occurred we will use the updated + * block pointer. 
+ */ + bp = dmu_buf_get_bp_from_dbuf(db); + if (pio == NULL && (db->db_state == DB_NOFILL || - (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) { + (bp != NULL && !BP_IS_HOLE(bp)))) { spa_t *spa = dn->dn_objset->os_spa; pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG); + err = dbuf_read_impl(db, dn, pio, flags, dblt, bp, FTAG); /* * dbuf_read_impl has dropped db_mtx and our parent's rwlock * for us @@ -1921,6 +1979,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); + /* * This assert is valid because dmu_sync() expects to be called by * a zilog's get_data while holding a range lock. This call only @@ -1949,6 +2008,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dr->dt.dl.dr_has_raw_params = B_FALSE; /* + * In the event that Direct I/O was used, we do not + * need to release the buffer from the ARC. + * * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do @@ -2087,6 +2149,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) */ dmu_buf_will_dirty(&db->db, tx); + VERIFY3P(db->db_buf, !=, NULL); + /* create the data buffer for the new block */ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); @@ -2147,11 +2211,26 @@ dbuf_redirty(dbuf_dirty_record_t *dr) */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && - db->db_state != DB_NOFILL) { - /* Already released on initial dirty, so just thaw. */ + db->db_state != DB_NOFILL && db->db_buf != NULL) { + /* + * Already released on initial dirty, + * so just thaw. + */ ASSERT(arc_released(db->db_buf)); arc_buf_thaw(db->db_buf); } + /* + * If initial dirty was via Direct I/O, may not have a dr_data. 
+ * + * If the dirty record was associated with cloned block then + * the call above to dbuf_unoverride() will have reset + * dr->dt.dl.dr_data and it will not be NULL here. + */ + if (dr->dt.dl.dr_data == NULL) { + ASSERT3B(dbuf_dirty_is_direct_write(db, dr), ==, + B_TRUE); + dr->dt.dl.dr_data = db->db_buf; + } } } @@ -2528,13 +2607,17 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) /* * Undirty a buffer in the transaction group referenced by the given - * transaction. Return whether this evicted the dbuf. + * transaction. Return whether this evicted the dbuf. */ boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - uint64_t txg = tx->tx_txg; + uint64_t txg; boolean_t brtwrite; + dbuf_dirty_record_t *dr; + + txg = tx->tx_txg; + dr = dbuf_find_dirty_eq(db, txg); ASSERT(txg != 0); @@ -2554,7 +2637,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) return (B_FALSE); ASSERT(dr->dr_dbuf == db); @@ -2601,10 +2683,15 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (db->db_state != DB_NOFILL && !brtwrite) { dbuf_unoverride(dr); - ASSERT(db->db_buf != NULL); - ASSERT(dr->dt.dl.dr_data != NULL); - if (dr->dt.dl.dr_data != db->db_buf) + /* + * In the Direct I/O case, the buffer is still dirty, but it + * may be UNCACHED, so we do not need to destroy an ARC buffer. + */ + if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) { + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); arc_buf_destroy(dr->dt.dl.dr_data, db); + } } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -2613,8 +2700,12 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_dirtycnt -= 1; if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { + /* + * In the Direct I/O case our db_buf will be NULL as we are not + * caching in the ARC. 
+ */ ASSERT(db->db_state == DB_NOFILL || brtwrite || - arc_released(db->db_buf)); + db->db_buf == NULL || arc_released(db->db_buf)); dbuf_destroy(db); return (B_TRUE); } @@ -2704,6 +2795,166 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } +void +dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, boolean_t read) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (read == B_TRUE) { + /* + * If a buffered read is in process, a Direct I/O read will + * wait for the buffered I/O to complete. + */ + ASSERT3U(txg, ==, 0); + while (db->db_state == DB_READ) { + DBUF_STAT_BUMP(direct_mixed_io_read_wait); + cv_wait(&db->db_changed, &db->db_mtx); + } + } else { + /* + * There must be an ARC buf associated with this Direct I/O + * write otherwise there is no reason to wait for previous + * dirty records to sync out. + * + * The db_state will temporarily be set to DB_CACHED so that + * any synchronous writes issued through the ZIL will + * still be handled properly. In particular, the call to + * dbuf_read() in dmu_sync_late_arrival() must account for the + * data still being in the ARC. After waiting here for previous + * TXGs to sync out, dmu_write_direct_done() will update the + * db_state. + */ + ASSERT3P(db->db_buf, !=, NULL); + ASSERT3U(txg, >, 0); + db->db_mixed_io_dio_wait = TRUE; + db->db_state = DB_CACHED; + while (dbuf_find_dirty_lte(db, txg) != NULL) { + DBUF_STAT_BUMP(direct_mixed_io_write_wait); + cv_wait(&db->db_changed, &db->db_mtx); + } + db->db_mixed_io_dio_wait = FALSE; + } +} + +/* + * Direct I/O writes may need to undirty the open-context dirty record + * associated with it in the event of an I/O error. + */ +void +dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + /* + * Direct I/O writes always happen in open-context. 
+ */ + ASSERT(!dmu_tx_is_syncing(tx)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_NOFILL || db->db_state == DB_UNCACHED); + + + /* + * In the event of an I/O error we will handle the metaslab clean up in + * zio_done(). Also, the dirty record's dr_overridden_by BP is not + * currently set as that is done in dmu_sync_done(). Since the db_state + * is still set to DB_NOFILL, dbuf_unoverride() will not be called in + * dbuf_undirty() and the dirty record's BP will not be added to the SPA's + * spa_free_bplist via zio_free(). + * + * This function can also be called in the event that a Direct I/O + * write is overwriting a previous Direct I/O to the same block for + * this TXG. It is important to go ahead and free up the space + * accounting in this case through dbuf_undirty() -> dbuf_unoverride() + * -> zio_free(). This is necessary because the space accounting for + * determining if a write can occur in zfs_write() happens through + * dmu_tx_assign(). This can cause an issue with Direct I/O writes in + * the case of overwrites, because all DVA allocations are being done + * in open-context. Constantly allowing Direct I/O overwrites to the + * same blocks can exhaust the pool's available space leading to ENOSPC + * errors at the DVA allocation part of the ZIO pipeline, which will + * eventually suspend the pool. By cleaning up space accounting now + * the ENOSPC pool suspend can be avoided. + * + * Since we are undirtying the record for the Direct I/O in + * open-context we must have a hold on the db, so it should never be + * evicted after calling dbuf_undirty(). + */ + VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE); + + DBUF_STAT_BUMP(direct_undirty); +} + +/* + * Normally the db_blkptr points to the most recent on-disk content for the + * dbuf (and anything newer will be cached in the dbuf). However, a recent + * Direct I/O write could leave newer content on disk and the dbuf uncached. 
+ * In this case we must return the (as yet unsynced) pointer to the latest + * on-disk content. + */ +blkptr_t * +dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level != 0) + return (db->db_blkptr); + + blkptr_t *bp = db->db_blkptr; + + dbuf_dirty_record_t *dr_dio = dbuf_get_dirty_direct(db); + if (dr_dio && dr_dio->dt.dl.dr_override_state == DR_OVERRIDDEN && + dr_dio->dt.dl.dr_data == NULL) { + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); + /* We have a Direct I/O write or cloned block, use its BP */ + bp = &dr_dio->dt.dl.dr_overridden_by; + } + + return (bp); +} + +/* + * Direct I/O reads can read directly from the ARC, but the data has + * to be untransformed in order to copy it over into user pages. + */ +int +dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa) +{ + int err = 0; + + DB_DNODE_ENTER(db); + ASSERT3S(db->db_state, ==, DB_CACHED); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* + * Ensure that this block's dnode has been decrypted if + * the caller has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, 0); + + /* + * If the arc buf is compressed or encrypted and the caller + * requested uncompressed data, we need to untransform it + * before returning. We also call arc_untransform() on any + * unauthenticated blocks, which will verify their MAC if + * the key is now available. 
+ */ + if (err == 0 && db->db_buf != NULL && + (arc_is_encrypted(db->db_buf) || + arc_is_unauthenticated(db->db_buf) || + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + dbuf_fix_old_data(db, spa_syncing_txg(spa)); + err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); + dbuf_set_data(db, db->db_buf); + } + DB_DNODE_EXIT(db); + DBUF_STAT_BUMP(hash_hits); + + return (err); +} + void dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -3263,6 +3514,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; + db->db_mixed_io_dio_wait = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -4053,7 +4305,6 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting) } else { mutex_exit(&db->db_mtx); } - } #pragma weak dmu_buf_refcount = dbuf_refcount @@ -4512,13 +4763,32 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); + /* - * To be synced, we must be dirtied. But we - * might have been freed after the dirty. + * It is possible a buffered read has come in after a Direct I/O + * write and is currently transitioning the db_state from DB_READ + * in dbuf_read_impl() to another state in dbuf_read_done(). We + * have to wait in order for the dbuf state to change from DB_READ + * before syncing the dirty record of the Direct I/O write. 
+ */ + if (db->db_state == DB_READ && !dr->dt.dl.dr_brtwrite) { + ASSERT3P(*datap, ==, NULL); + ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); + while (db->db_state == DB_READ) { + DBUF_STAT_BUMP(direct_sync_wait); + cv_wait(&db->db_changed, &db->db_mtx); + } + } + + /* + * To be synced, we must be dirtied. But we might have been freed + * after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ - ASSERT(db->db.db_data == NULL); + ASSERT3P(db->db.db_data, ==, NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); @@ -4581,8 +4851,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dbuf_check_blkptr(dn, db); /* - * If this buffer is in the middle of an immediate write, - * wait for the synchronous IO to complete. + * If this buffer is in the middle of an immediate write, wait for the + * synchronous IO to complete. + * + * This is also valid even with Direct I/O writes setting a dirty + * records override state into DR_IN_DMU_SYNC, because all + * Direct I/O writes happen in open-context. 
*/ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); @@ -4888,8 +5162,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + + /* no dr_data if this is a NO_FILL or Direct I/O */ if (dr->dt.dl.dr_data != NULL && dr->dt.dl.dr_data != db->db_buf) { + ASSERT3B(dr->dt.dl.dr_brtwrite, ==, B_FALSE); arc_buf_destroy(dr->dt.dl.dr_data, db); } } else { @@ -4951,7 +5228,9 @@ dbuf_write_override_done(zio_t *zio) if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); - arc_release(dr->dt.dl.dr_data, db); + + if (dr->dt.dl.dr_data && dr->dt.dl.dr_data != db->db_buf) + arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); @@ -5156,10 +5435,17 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context - * (by dmu_sync() or dmu_buf_write_embedded()). + * (by dmu_sync(), dmu_write_direct(), + * or dmu_buf_write_embedded()). */ - abd_t *contents = (data != NULL) ? 
- abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + abd_t *contents = NULL; + if (data) { + ASSERT(BP_IS_HOLE(bp) || + arc_buf_lsize(data) == BP_GET_LSIZE(bp)); + contents = abd_get_from_buf(data->b_data, + arc_buf_size(data)); + } dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, @@ -5168,9 +5454,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, - dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite, - dr->dt.dl.dr_brtwrite); + zio_write_override(dr->dr_zio, bp, dr->dt.dl.dr_copies, + dr->dt.dl.dr_nopwrite, dr->dt.dl.dr_brtwrite); mutex_exit(&db->db_mtx); } else if (data == NULL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || @@ -5195,7 +5480,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), - dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, + dbuf_is_l2cacheable(db, NULL), &zp, dbuf_write_ready, children_ready_cb, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 6ef149aab9a6..85d32329cb66 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -604,8 +604,16 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbp[i] = &db->db; } - if (!read) - zfs_racct_write(length, nblks); + /* + * If we are doing O_DIRECT we still hold the dbufs, even for reads, + * but we do not issue any reads here. We do not want to account for + * writes in this case. + * + * O_DIRECT write/read accounting takes place in + * dmu_{write/read}_abd(). 
+ */ + if (!read && ((flags & DMU_DIRECTIO) == 0)) + zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags); if (zs) dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE); @@ -794,7 +802,7 @@ dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) /* * Get the next "chunk" of file data to free. We traverse the file from - * the end so that the file gets shorter over time (if we crashes in the + * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * @@ -1058,7 +1066,7 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, /* * Deal with odd block sizes, where there can't be data past the first - * block. If we ever do the tail block optimization, we will need to + * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { @@ -1068,6 +1076,18 @@ dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, size = newsz; } + if (size == 0) + return (0); + + /* Allow Direct I/O when requested and properly aligned */ + if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) && + zfs_dio_aligned(offset, size, SPA_MINBLOCKSIZE)) { + abd_t *data = abd_get_from_buf(buf, size); + err = dmu_read_abd(dn, offset, size, data, flags); + abd_free(data); + return (err); + } + while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; @@ -1176,22 +1196,41 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } /* - * Note: Lustre is an external consumer of this interface. + * This interface is not used internally by ZFS but is provided for + * use by Lustre which is built on the DMU interfaces. 
*/ -void -dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +int +dmu_write_by_dnode_flags(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx, uint32_t flags) { dmu_buf_t **dbp; int numbufs; + int error; if (size == 0) - return; + return (0); + + /* Allow Direct I/O when requested and properly aligned */ + if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) && + zfs_dio_aligned(offset, size, dn->dn_datablksz)) { + abd_t *data = abd_get_from_buf((void *)buf, size); + error = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); + abd_free(data); + return (error); + } VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} + +int +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + return (dmu_write_by_dnode_flags(dn, offset, size, buf, tx, 0)); } void @@ -1255,6 +1294,9 @@ dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) dmu_buf_t **dbp; int numbufs, i, err; + if (uio->uio_extflg & UIO_DIRECT) + return (dmu_read_uio_direct(dn, uio, size)); + /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. @@ -1345,23 +1387,52 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) dmu_buf_t **dbp; int numbufs; int err = 0; - int i; + uint64_t write_size; - err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, +top: + write_size = size; + + /* + * We only allow Direct I/O writes to happen if we are block + * sized aligned. Otherwise, we pass the write off to the ARC. 
+ */ + if ((uio->uio_extflg & UIO_DIRECT) && + (write_size >= dn->dn_datablksz)) { + if (zfs_dio_aligned(zfs_uio_offset(uio), write_size, + dn->dn_datablksz)) { + return (dmu_write_uio_direct(dn, uio, size, tx)); + } else if (write_size > dn->dn_datablksz && + zfs_dio_offset_aligned(zfs_uio_offset(uio), + dn->dn_datablksz)) { + err = dmu_write_uio_direct(dn, uio, dn->dn_datablksz, + tx); + if (err == 0) { + size -= dn->dn_datablksz; + goto top; + } else { + return (err); + } + } else { + write_size = + P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz); + } + } + + err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); - for (i = 0; i < numbufs; i++) { + for (int i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; - ASSERT(size > 0); + ASSERT(write_size > 0); offset_t off = zfs_uio_offset(uio); bufoff = off - db->db_offset; - tocpy = MIN(db->db_size - bufoff, size); + tocpy = MIN(db->db_size - bufoff, write_size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); @@ -1381,10 +1452,18 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) if (err) break; + write_size -= tocpy; size -= tocpy; } + IMPLY(err == 0, write_size == 0); + dmu_buf_rele_array(dbp, numbufs, FTAG); + + if ((uio->uio_extflg & UIO_DIRECT) && size > 0) { + goto top; + } + return (err); } @@ -1517,7 +1596,7 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, * same size as the dbuf. 
*/ if (offset == db->db.db_offset && blksz == db->db.db_size) { - zfs_racct_write(blksz, 1); + zfs_racct_write(os->os_spa, blksz, 1, 0); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { @@ -1547,23 +1626,22 @@ dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, return (err); } -typedef struct { - dbuf_dirty_record_t *dsa_dr; - dmu_sync_cb_t *dsa_done; - zgd_t *dsa_zgd; - dmu_tx_t *dsa_tx; -} dmu_sync_arg_t; - -static void +void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; - dmu_buf_t *db = dsa->dsa_zgd->zgd_db; - blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { + dbuf_dirty_record_t *dr = dsa->dsa_dr; + blkptr_t *bp = zio->io_bp; + if (BP_IS_HOLE(bp)) { + dmu_buf_t *db = NULL; + if (dr) + db = &(dr->dr_dbuf->db); + else + db = dsa->dsa_zgd->zgd_db; /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. @@ -1582,7 +1660,7 @@ dmu_sync_late_arrival_ready(zio_t *zio) dmu_sync_ready(zio, NULL, zio->io_private); } -static void +void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; @@ -1595,7 +1673,7 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. 
*/ - if (zio->io_error == 0) { + if (zgd && zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } @@ -1634,10 +1712,12 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } + cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); - dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + if (dsa->dsa_done) + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } @@ -1905,9 +1985,10 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, - dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), + dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL, + dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, + &zb)); return (0); } @@ -2148,6 +2229,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; + zp->zp_direct_write = (wp & WP_DIRECT_WR) ? 
B_TRUE : B_FALSE; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); @@ -2581,8 +2663,15 @@ EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); +EXPORT_SYMBOL(dmu_read_uio); +EXPORT_SYMBOL(dmu_read_uio_dbuf); +EXPORT_SYMBOL(dmu_read_uio_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); +EXPORT_SYMBOL(dmu_write_by_dnode_flags); +EXPORT_SYMBOL(dmu_write_uio); +EXPORT_SYMBOL(dmu_write_uio_dbuf); +EXPORT_SYMBOL(dmu_write_uio_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c new file mode 100644 index 000000000000..8aabf88fee03 --- /dev/null +++ b/module/zfs/dmu_direct.c @@ -0,0 +1,438 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +static abd_t * +make_abd_for_dbuf(dmu_buf_impl_t *db, abd_t *data, uint64_t offset, + uint64_t size) +{ + size_t buf_size = db->db.db_size; + abd_t *pre_buf = NULL, *post_buf = NULL, *mbuf = NULL; + size_t buf_off = 0; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (offset > db->db.db_offset) { + size_t pre_size = offset - db->db.db_offset; + pre_buf = abd_alloc_for_io(pre_size, B_TRUE); + buf_size -= pre_size; + buf_off = 0; + } else { + buf_off = db->db.db_offset - offset; + size -= buf_off; + } + + if (size < buf_size) { + size_t post_size = buf_size - size; + post_buf = abd_alloc_for_io(post_size, B_TRUE); + buf_size -= post_size; + } + + ASSERT3U(buf_size, >, 0); + abd_t *buf = abd_get_offset_size(data, buf_off, buf_size); + + if (pre_buf || post_buf) { + mbuf = abd_alloc_gang(); + if (pre_buf) + abd_gang_add(mbuf, pre_buf, B_TRUE); + abd_gang_add(mbuf, buf, B_TRUE); + if (post_buf) + abd_gang_add(mbuf, post_buf, B_TRUE); + } else { + mbuf = buf; + } + + return (mbuf); +} + +static void +dmu_read_abd_done(zio_t *zio) +{ + abd_free(zio->io_abd); +} + +static void +dmu_write_direct_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + +static void +dmu_write_direct_done(zio_t *zio) +{ + dmu_sync_arg_t *dsa = zio->io_private; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dsa->dsa_tx->tx_txg; + + abd_free(zio->io_abd); + mutex_enter(&db->db_mtx); + + if (zio->io_error == 0) { + /* + * After a successful Direct I/O write any stale contents in + * the ARC must be cleaned up in order to force all future + * reads down to the VDEVs. 
+	 * + * If a previous write operation to this dbuf was buffered + * (in the ARC) we have to wait for the previous dirty records + * associated with this dbuf to be synced out if they are in + * the quiesce or sync phase for their TXG. This is done to + * guarantee we are not racing to destroy the ARC buf that + * is associated with the dbuf between this done callback and + * spa_sync(). Outside of using a heavy handed approach of + * locking down the spa_syncing_txg while it is being updated, + * there is no way to synchronize when a dirty record's TXG + * has moved over to the sync phase. + * + * In order to make sure all TXG's are consistent we must + * do this stall if there is an associated ARC buf with this + * dbuf. It is because of this that a user should not really + * be mixing buffered and Direct I/O writes. If they choose to + * do so, there is an associated performance penalty for that + * as we will not give up consistency with a TXG over + * performance. + */ + if (db->db_buf) { + dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE); + ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data); + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + dr->dt.dl.dr_data = NULL; + db->db.db_data = NULL; + ASSERT3U(db->db_dirtycnt, ==, 1); + } + + /* + * The current contents of the dbuf are now stale. + */ + ASSERT3P(dr->dt.dl.dr_data, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + db->db_state = DB_UNCACHED; + } else { + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) + ASSERT3U(zio->io_error, ==, EAGAIN); + + /* + * If there is a valid ARC buffer associated with this dirty + * record we will stall just like on a successful Direct I/O + * write to make sure all TXG's are consistent. See comment + * above.
+	 */ + if (db->db_buf) { + ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data); + dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE); + dmu_buf_undirty(db, dsa->dsa_tx); + db->db_state = DB_CACHED; + } else { + ASSERT3P(dr->dt.dl.dr_data, ==, NULL); + dmu_buf_undirty(db, dsa->dsa_tx); + db->db_state = DB_UNCACHED; + } + + ASSERT0(db->db_dirtycnt); + } + + mutex_exit(&db->db_mtx); + dmu_sync_done(zio, NULL, zio->io_private); + kmem_free(zio->io_bp, sizeof (blkptr_t)); + zio->io_bp = NULL; +} + +int +dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) +{ + objset_t *os = db->db_objset; + dsl_dataset_t *ds = dmu_objset_ds(os); + zbookmark_phys_t zb; + dbuf_dirty_record_t *dr_head; + + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + DB_DNODE_ENTER(db); + dnode_t *dn = DB_DNODE(db); + zio_prop_t zp; + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC | WP_DIRECT_WR, &zp); + + DB_DNODE_EXIT(db); + + /* + * If we are going to overwrite a previous Direct I/O write that is part of + * the current TXG, then we can go ahead and undirty it now. Part + * of it being undirtied will be allowing for previously allocated + * space in the dr_overridden_bp BP's DVAs to be freed. This avoids + * ENOSPC errors from possibly occurring when trying to allocate new + * metaslabs in open-context for Direct I/O writes. + */ + mutex_enter(&db->db_mtx); + dr_head = dbuf_find_dirty_eq(db, dmu_tx_get_txg(tx)); + if (dbuf_dirty_is_direct_write(db, dr_head)) { + dmu_buf_undirty(db, tx); + } + mutex_exit(&db->db_mtx); + + /* + * Dirty this dbuf with DB_NOFILL since we will not have any data + * associated with the dbuf.
+ */ + dmu_buf_will_not_fill(&db->db, tx); + + mutex_enter(&db->db_mtx); + + uint64_t txg = dmu_tx_get_txg(tx); + ASSERT3U(txg, >, spa_last_synced_txg(os->os_spa)); + ASSERT3U(txg, >, spa_syncing_txg(os->os_spa)); + + dr_head = dbuf_get_dirty_direct(db); + ASSERT3U(dr_head->dr_txg, ==, txg); + dr_head->dr_accounted = db->db.db_size; + + blkptr_t *bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + if (db->db_blkptr != NULL) { + /* + * Fill in bp with the current block pointer so that + * the nopwrite code can check if we're writing the same + * data that's already on disk. + */ + *bp = *db->db_blkptr; + } else { + memset(bp, 0, sizeof (blkptr_t)); + } + + /* + * Disable nopwrite if the current block pointer could change + * before this TXG syncs. + */ + if (list_next(&db->db_dirty_records, dr_head) != NULL) + zp.zp_nopwrite = B_FALSE; + + ASSERT3S(dr_head->dt.dl.dr_override_state, ==, DR_NOT_OVERRIDDEN); + dr_head->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + + mutex_exit(&db->db_mtx); + + dmu_objset_willuse_space(os, dr_head->dr_accounted, tx); + + dmu_sync_arg_t *dsa = kmem_zalloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr_head; + dsa->dsa_tx = tx; + + zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data, + db->db.db_size, db->db.db_size, &zp, + dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb); + + if (pio == NULL) + return (zio_wait(zio)); + + zio_nowait(zio); + + return (0); +} + +int +dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size, + abd_t *data, uint32_t flags, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + spa_t *spa = dn->dn_objset->os_spa; + int numbufs, err; + + ASSERT(flags & DMU_DIRECTIO); + + err = dmu_buf_hold_array_by_dnode(dn, offset, + size, B_FALSE, FTAG, &numbufs, &dbp, flags); + if (err) + return (err); + + zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + for (int i = 0; i < numbufs && err == 0; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + + abd_t *abd 
= abd_get_offset_size(data, + db->db.db_offset - offset, dn->dn_datablksz); + + zfs_racct_write(spa, db->db.db_size, 1, flags); + err = dmu_write_direct(pio, db, abd, tx); + ASSERT0(err); + } + + err = zio_wait(pio); + + /* + * The dbuf must be held until the Direct I/O write has completed in + * the event there were any errors and dmu_buf_undirty() was called. + */ + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +int +dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, + abd_t *data, uint32_t flags) +{ + objset_t *os = dn->dn_objset; + spa_t *spa = os->os_spa; + dmu_buf_t **dbp; + int numbufs, err; + + ASSERT(flags & DMU_DIRECTIO); + + err = dmu_buf_hold_array_by_dnode(dn, offset, + size, B_FALSE, FTAG, &numbufs, &dbp, flags); + if (err) + return (err); + + zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + + for (int i = 0; i < numbufs; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + abd_t *mbuf; + zbookmark_phys_t zb; + + mutex_enter(&db->db_mtx); + + SET_BOOKMARK(&zb, dmu_objset_ds(os)->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + /* + * If there is another buffered read for this dbuf, we will + * wait for that to complete first. + */ + dmu_buf_direct_mixed_io_wait(db, 0, B_TRUE); + + blkptr_t *bp = dmu_buf_get_bp_from_dbuf(db); + + /* + * There is no need to read if this is a hole or the data is + * cached. This will not be considered a direct read for IO + * accounting in the same way that an ARC hit is not counted. + */ + if (bp == NULL || BP_IS_HOLE(bp) || db->db_state == DB_CACHED) { + size_t aoff = offset < db->db.db_offset ? + db->db.db_offset - offset : 0; + size_t boff = offset > db->db.db_offset ? + offset - db->db.db_offset : 0; + size_t len = MIN(size - aoff, db->db.db_size - boff); + + if (db->db_state == DB_CACHED) { + /* + * We need to untransform the ARC buf data + * before we copy it over.
+ */ + err = dmu_buf_untransform_direct(db, spa); + ASSERT0(err); + abd_copy_from_buf_off(data, + (char *)db->db.db_data + boff, aoff, len); + } else { + abd_zero_off(data, aoff, len); + } + + mutex_exit(&db->db_mtx); + continue; + } + + mbuf = make_abd_for_dbuf(db, data, offset, size); + ASSERT3P(mbuf, !=, NULL); + + /* + * The dbuf mutex (db_mtx) must be held when creating the ZIO + * for the read. The BP returned from + * dmu_buf_get_bp_from_dbuf() could be from a previous Direct + * I/O write that is in the dbuf's dirty record. When + * zio_read() is called, zio_create() will make a copy of the + * BP. However, if zio_read() is called without the mutex + * being held then the dirty record from the dbuf could be + * freed in dbuf_write_done() resulting in garbage being set + * for the zio BP. + */ + zio_t *cio = zio_read(rio, spa, bp, mbuf, db->db.db_size, + dmu_read_abd_done, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL, &zb); + mutex_exit(&db->db_mtx); + + zfs_racct_read(spa, db->db.db_size, 1, flags); + zio_nowait(cio); + } + + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (zio_wait(rio)); +} + +#ifdef _KERNEL +int +dmu_read_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size) +{ + offset_t offset = zfs_uio_offset(uio); + offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; + int err; + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3U(page_index, <, uio->uio_dio.npages); + + abd_t *data = abd_alloc_from_pages(&uio->uio_dio.pages[page_index], + offset & (PAGESIZE - 1), size); + err = dmu_read_abd(dn, offset, size, data, DMU_DIRECTIO); + abd_free(data); + + if (err == 0) + zfs_uioskip(uio, size); + + return (err); +} + +int +dmu_write_uio_direct(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) +{ + offset_t offset = zfs_uio_offset(uio); + offset_t page_index = (offset - zfs_uio_soffset(uio)) >> PAGESHIFT; + int err; + + ASSERT(uio->uio_extflg & UIO_DIRECT); + ASSERT3U(page_index, <, uio->uio_dio.npages); + + abd_t *data = 
abd_alloc_from_pages(&uio->uio_dio.pages[page_index], + offset & (PAGESIZE - 1), size); + err = dmu_write_abd(dn, offset, size, data, DMU_DIRECTIO, tx); + abd_free(data); + + if (err == 0) + zfs_uioskip(uio, size); + + return (err); +} +#endif /* _KERNEL */ + +EXPORT_SYMBOL(dmu_read_uio_direct); +EXPORT_SYMBOL(dmu_write_uio_direct); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 2ba26f68e398..36961f01a02f 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -350,6 +350,20 @@ smallblk_changed_cb(void *arg, uint64_t newval) os->os_zpl_special_smallblock = newval; } +static void +direct_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD || + newval == ZFS_DIRECT_ALWAYS); + + os->os_direct = newval; +} + static void logbias_changed_cb(void *arg, uint64_t newval) { @@ -633,6 +647,11 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ZFS_PROP_SPECIAL_SMALL_BLOCKS), smallblk_changed_cb, os); } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DIRECT), + direct_changed_cb, os); + } } if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index 17ed2a620b1e..45a2f06263a0 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -895,6 +895,14 @@ static const spa_iostats_t spa_iostats_template = { { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, + { "arc_read_count", KSTAT_DATA_UINT64 }, + { "arc_read_bytes", KSTAT_DATA_UINT64 }, + { "arc_write_count", KSTAT_DATA_UINT64 }, + { "arc_write_bytes", KSTAT_DATA_UINT64 }, + { "direct_read_count", KSTAT_DATA_UINT64 }, + { "direct_read_bytes", KSTAT_DATA_UINT64 }, + { "direct_write_count", 
KSTAT_DATA_UINT64 }, + { "direct_write_bytes", KSTAT_DATA_UINT64 }, }; #define SPA_IOSTATS_ADD(stat, val) \ @@ -938,6 +946,44 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type, } } +void +spa_iostats_read_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + + if (ksp == NULL) + return; + + spa_iostats_t *iostats = ksp->ks_data; + if (flags & DMU_DIRECTIO) { + SPA_IOSTATS_ADD(direct_read_count, iops); + SPA_IOSTATS_ADD(direct_read_bytes, size); + } else { + SPA_IOSTATS_ADD(arc_read_count, iops); + SPA_IOSTATS_ADD(arc_read_bytes, size); + } +} + +void +spa_iostats_write_add(spa_t *spa, uint64_t size, uint64_t iops, uint32_t flags) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + + if (ksp == NULL) + return; + + spa_iostats_t *iostats = ksp->ks_data; + if (flags & DMU_DIRECTIO) { + SPA_IOSTATS_ADD(direct_write_count, iops); + SPA_IOSTATS_ADD(direct_write_bytes, size); + } else { + SPA_IOSTATS_ADD(arc_write_count, iops); + SPA_IOSTATS_ADD(arc_write_bytes, size); + } +} + static int spa_iostats_update(kstat_t *ksp, int rw) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index d97d0a8100c2..418fa1e67b97 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -112,6 +112,11 @@ int zfs_vdev_dtl_sm_blksz = (1 << 12); */ static unsigned int zfs_slow_io_events_per_second = 20; +/* + * Rate limit direct write IO verify failures to this many per second. + */ +static unsigned int zfs_dio_write_verify_events_per_second = 20; + /* * Rate limit checksum events after this many checksum errors per second. */ @@ -148,6 +153,17 @@ int zfs_nocacheflush = 0; uint_t zfs_vdev_max_auto_ashift = 14; uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; +/* + * VDEV checksum verification percentage for Direct I/O writes. This is + * necessary for Linux, because user pages can not be placed under write + * protection during Direct I/O writes.
+ */ +#if !defined(__FreeBSD__) +uint_t zfs_vdev_direct_write_verify_pct = 2; +#else +uint_t zfs_vdev_direct_write_verify_pct = 0; +#endif + void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) { @@ -667,6 +683,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_slow_io_events_per_second, 1); + zfs_ratelimit_init(&vd->vdev_dio_verify_rl, + &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); @@ -1176,6 +1194,7 @@ vdev_free(vdev_t *vd) zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); + zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) @@ -4464,6 +4483,7 @@ vdev_clear(spa_t *spa, vdev_t *vd) vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) @@ -6471,6 +6491,9 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); +ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, + "Rate Direct I/O write verify events to this many per second"); + /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " @@ -6498,4 +6521,9 @@ ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, direct_write_verify_pct, + param_set_direct_write_verify_pct, param_get_uint, ZMOD_RW, + "Percentage of Direct I/O 
writes per top-level VDEV for checksum " + "verification to be performed"); /* END CSTYLED */ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index c31f48028bbc..89c2addf75ba 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -387,6 +387,10 @@ vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv) /* IO delays */ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + /* Direct I/O write verify errors */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_DIO_VERIFY_ERRORS, + vs->vs_dio_verify_errors); + /* Add extended stats nvlist to main nvlist */ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 2f43c4aa41b8..6c79da9e9896 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -595,6 +595,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, vs->vs_checksum_errors, FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, DATA_TYPE_UINT64, vs->vs_slow_ios, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS, + DATA_TYPE_UINT64, vs->vs_dio_verify_errors, NULL); } diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 433a653e5500..84f7e52dae66 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -607,7 +607,7 @@ static int64_t zfs_immediate_write_sz = 32768; void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, boolean_t commit, - zil_callback_t callback, void *callback_data) + boolean_t o_direct, zil_callback_t callback, void *callback_data) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); uint32_t blocksize = zp->z_blksz; @@ -622,7 +622,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, return; } - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT || o_direct) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz) diff --git a/module/zfs/zfs_vnops.c 
b/module/zfs/zfs_vnops.c index babb07ca25a9..7dfa9ba7e995 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -202,6 +202,99 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } +zfs_direct_enabled_t +zfs_check_direct_enabled(znode_t *zp, int ioflags, int *error) +{ + zfs_direct_enabled_t is_direct = ZFS_DIRECT_IO_DISABLED; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + if ((*error = zfs_enter(zfsvfs, FTAG)) != 0) + return (ZFS_DIRECT_IO_ERR); + + if (ioflags & O_DIRECT && + zfsvfs->z_os->os_direct != ZFS_DIRECT_DISABLED) { + is_direct = ZFS_DIRECT_IO_ENABLED; + } else if (zfsvfs->z_os->os_direct == ZFS_DIRECT_ALWAYS) { + is_direct = ZFS_DIRECT_IO_ENABLED; + } + + zfs_exit(zfsvfs, FTAG); + + return (is_direct); +} + +/* + * Determine if Direct I/O has been requested (either via the O_DIRECT flag or + * the "direct" dataset property). When inherited by the property only apply + * the O_DIRECT flag to correctly aligned IO requests. The rationale for this + * is it allows the property to be safely set on a dataset without forcing + * all of the applications to be aware of the alignment restrictions. When + * O_DIRECT is explicitly requested by an application return EINVAL if the + * request is unaligned. In all cases, if the range for this request has + * been mmap'ed then we will perform buffered I/O to keep the mapped region + * synchronized with the ARC. + * + * It is possible that a file's pages could be mmap'ed after it is checked + * here. If so, that is handled accordingly in zfs_read() and zfs_write().
See + * comments in the following two areas for how this is handled: + * zfs_read() -> mappedread() + * zfs_write() -> update_pages() + */ +int +zfs_setup_direct(struct znode *zp, zfs_uio_t *uio, zfs_uio_rw_t rw, + int *ioflagp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + int ioflag = *ioflagp; + int error = 0; + + if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) + return (error); + + if (os->os_direct == ZFS_DIRECT_DISABLED) { + error = EAGAIN; + goto out; + + } else if (os->os_direct == ZFS_DIRECT_ALWAYS && + zfs_uio_page_aligned(uio) && + zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) { + if ((rw == UIO_WRITE && zfs_uio_resid(uio) >= zp->z_blksz) || + (rw == UIO_READ)) { + ioflag |= O_DIRECT; + } + } + + if (ioflag & O_DIRECT) { + if (!zfs_uio_page_aligned(uio) || + !zfs_uio_aligned(uio, SPA_MINBLOCKSIZE)) { + error = SET_ERROR(EINVAL); + goto out; + } + + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + zfs_uio_resid(uio) - 1)) { + error = SET_ERROR(EAGAIN); + goto out; + } + + error = zfs_uio_get_dio_pages_alloc(uio, rw); + if (error) + goto out; + } else { + error = EAGAIN; + goto out; + } + + IMPLY(ioflag & O_DIRECT, uio->uio_extflg & UIO_DIRECT); + ASSERT0(error); + + *ioflagp = ioflag; +out: + zfs_exit(zfsvfs, FTAG); + return (error); +} + /* * Read bytes from specified file into supplied buffer. * @@ -291,20 +384,61 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) #if defined(__linux__) ssize_t start_offset = zfs_uio_offset(uio); #endif + ssize_t chunk_size = zfs_vnops_read_chunk_size; ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; + ssize_t dio_remaining_resid = 0; + + if (uio->uio_extflg & UIO_DIRECT) { + /* + * All pages for an O_DIRECT request have already been mapped + * so there's no compelling reason to handle this uio in + * smaller chunks.
+	 */ + chunk_size = DMU_MAX_ACCESS; + + /* + * In the event that the O_DIRECT request is reading the entire + * file, it is possible the file's length is not page sized + * aligned. However, lower layers expect that the Direct I/O + * request is page-aligned. In this case, as much of the file + * that can be read using Direct I/O happens and the remaining + * amount will be read through the ARC. + * + * This is still consistent with the semantics of Direct I/O in + * ZFS as at a minimum the I/O request must be page-aligned. + */ + dio_remaining_resid = n - P2ALIGN(n, PAGE_SIZE); + if (dio_remaining_resid != 0) + n -= dio_remaining_resid; + } while (n > 0) { - ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); + ssize_t nbytes = MIN(n, chunk_size - + P2PHASE(zfs_uio_offset(uio), chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), - zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { + zfs_uio_offset(uio) + nbytes - 1)) { + /* + * It is possible that a file's pages have been mmap'ed + * since our check for Direct I/O reads and the read + * being issued. In this case, we will use the ARC to + * keep it synchronized with the page cache. In order + * to do this we temporarily remove the UIO_DIRECT + * flag.
+ */ + boolean_t uio_direct_mmap = B_FALSE; + if (uio->uio_extflg & UIO_DIRECT) { + uio->uio_extflg &= ~UIO_DIRECT; + uio_direct_mmap = B_TRUE; + } error = mappedread(zp, nbytes, uio); + if (uio_direct_mmap) + uio->uio_extflg |= UIO_DIRECT; } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); @@ -332,9 +466,30 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) n -= nbytes; } - int64_t nread = start_resid - n; + int64_t nread = start_resid; + if (error == 0 && (uio->uio_extflg & UIO_DIRECT) && + dio_remaining_resid != 0) { + /* + * Temporarily remove the UIO_DIRECT flag from the UIO so the + * remainder of the file can be read using the ARC. + */ + uio->uio_extflg &= ~UIO_DIRECT; + + if (zn_has_cached_data(zp, zfs_uio_offset(uio), + zfs_uio_offset(uio) + dio_remaining_resid - 1)) { + error = mappedread(zp, dio_remaining_resid, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, + dio_remaining_resid); + } + uio->uio_extflg |= UIO_DIRECT; + + if (error != 0) + n -= dio_remaining_resid; + } + nread -= n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); - task_io_account_read(nread); out: zfs_rangelock_exit(lr); @@ -422,6 +577,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); uint64_t clear_setid_bits_txg = 0; + boolean_t o_direct_defer = B_FALSE; /* * Fasttrack empty write @@ -504,6 +660,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) woff = zp->z_size; } zfs_uio_setoffset(uio, woff); + /* + * We need to update the starting offset as well because it is + * set previously in the ZPL (Linux) and VNOPS (FreeBSD) + * layers. 
+ */ + zfs_uio_setsoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of @@ -539,6 +701,33 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; + /* + * In the event we are increasing the file block size + * (lr_length == UINT64_MAX), we will direct the write to the ARC. + * Because zfs_grow_blocksize() will read from the ARC in order to + * grow the dbuf, we avoid doing Direct I/O here as that would cause + * data written to disk to be overwritten by data in the ARC during + * the sync phase. Besides writing data twice to disk, we also + * want to avoid consistency concerns between data in the ARC and + * on disk while growing the file's blocksize. + * + * We will only temporarily remove Direct I/O and put it back after + * we have grown the blocksize. We do this in the event a request + * is larger than max_blksz, so further requests to + * dmu_write_uio_dbuf() will still issue the requests using Direct + * IO. + * + * As an example: + * The first block to file is being written as a 4k request with + * a recordsize of 1K. The first 1K issued in the loop below will go + * through the ARC; however, the following 3 1K requests will + * use Direct I/O. + */ + if (uio->uio_extflg & UIO_DIRECT && lr->lr_length == UINT64_MAX) { + uio->uio_extflg &= ~UIO_DIRECT; + o_direct_defer = B_TRUE; + } + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small @@ -580,6 +769,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) ssize_t nbytes = n; if (n >= blksz && woff >= zp->z_size && P2PHASE(woff, blksz) == 0 && + !(uio->uio_extflg & UIO_DIRECT) && (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. 
"Borrow" a buffer @@ -705,9 +895,18 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } + + /* + * There is a window where a file's pages can be mmap'ed after + * the Direct I/O write has started. In this case we will still + * call update_pages() to make sure there is consistency + * between the ARC and the page cache. This is an unfortunate + * situation as the data will be read back into the ARC after + * the Direct I/O write has completed, but this is the penalty + * for writing to a mmap'ed region of the file using O_DIRECT. + */ if (tx_bytes && - zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && - !(ioflag & O_DIRECT)) { + zn_has_cached_data(zp, woff, woff + tx_bytes - 1)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } @@ -756,10 +955,21 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * the TX_WRITE records logged here. */ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, - NULL, NULL); + uio->uio_extflg & UIO_DIRECT ? B_TRUE : B_FALSE, NULL, + NULL); dmu_tx_commit(tx); + /* + * Direct I/O was deferred in order to grow the first block. + * At this point it can be re-enabled for subsequent writes. 
+ */ + if (o_direct_defer) { + ASSERT(ioflag & O_DIRECT); + uio->uio_extflg |= UIO_DIRECT; + o_direct_defer = B_FALSE; + } + if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); @@ -767,6 +977,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) pfbytes -= nbytes; } + if (o_direct_defer) { + ASSERT(ioflag & O_DIRECT); + uio->uio_extflg |= UIO_DIRECT; + o_direct_defer = B_FALSE; + } + zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); @@ -784,9 +1000,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (commit) zil_commit(zilog, zp->z_id); - const int64_t nwritten = start_resid - zfs_uio_resid(uio); + int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); - task_io_account_write(nwritten); zfs_exit(zfsvfs, FTAG); return (0); @@ -846,7 +1061,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; - dmu_buf_t *db; zgd_t *zgd; int error = 0; uint64_t zp_gen; @@ -882,6 +1096,32 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zgd->zgd_lwb = lwb; zgd->zgd_private = zp; + dmu_buf_t *dbp; + error = dmu_buf_hold_noread(os, object, offset, zgd, &dbp); + zgd->zgd_db = dbp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; + + if (error) { + zfs_get_done(zgd, error); + return (error); + } + + /* + * If a Direct I/O write is waiting for previous dirty records to sync + * out in dmu_buf_direct_mixed_io_wait(), then the rangelock is already + * held across the entire block by the O_DIRECT write. + * + * The dirty record for this TXG will also be used to identify if this + * log record is associated with a Direct I/O write. 
+ */ + mutex_enter(&db->db_mtx); + boolean_t rangelock_held = db->db_mixed_io_dio_wait; + zgd->zgd_grabbed_rangelock = !(rangelock_held); + dbuf_dirty_record_t *dr = + dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); + boolean_t direct_write = dbuf_dirty_is_direct_write(db, dr); + mutex_exit(&db->db_mtx); + /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the @@ -890,8 +1130,10 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); + if (zgd->zgd_grabbed_rangelock) { + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + } /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); @@ -908,18 +1150,29 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); + if (zgd->zgd_grabbed_rangelock) { + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : + offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter( + &zp->z_rangelock, offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); + } + ASSERT3U(dbp->db_size, ==, size); + ASSERT3U(dbp->db_offset, ==, offset); + } else { + /* + * A Direct I/O write always covers an entire block. 
+ */ + ASSERT3U(dbp->db_size, ==, zp->z_blksz); } + /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); @@ -929,44 +1182,48 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zil_fault_io = 0; } #endif - if (error == 0) - error = dmu_buf_hold_noread(os, object, offset, zgd, - &db); + if (error) { + zfs_get_done(zgd, error); + return (error); + } - if (error == 0) { - blkptr_t *bp = &lr->lr_blkptr; + /* + * All Direct I/O writes will have already completed and the + * block pointer can be immediately stored in the log record. + */ + if (direct_write) { + lr->lr_blkptr = dr->dt.dl.dr_overridden_by; + zfs_get_done(zgd, 0); + return (0); + } - zgd->zgd_db = db; - zgd->zgd_bp = bp; + blkptr_t *bp = &lr->lr_blkptr; + zgd->zgd_bp = bp; - ASSERT(db->db_offset == offset); - ASSERT(db->db_size == size); + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. */ - if (error == 0) - return (0); - - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; - /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. 
We zero out the BP because - * it is the old, currently-on-disk BP. - */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; - } + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; } } @@ -982,10 +1239,11 @@ zfs_get_done(zgd_t *zgd, int error) (void) error; znode_t *zp = zgd->zgd_private; - if (zgd->zgd_db) - dmu_buf_rele(zgd->zgd_db, zgd); + ASSERT3P(zgd->zgd_db, !=, NULL); + dmu_buf_rele(zgd->zgd_db, zgd); - zfs_rangelock_exit(zgd->zgd_lr); + if (zgd->zgd_grabbed_rangelock) + zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1ba99f4d4624..9e091f1a8294 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -792,6 +792,12 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, pio->io_reexecute |= zio->io_reexecute; ASSERT3U(*countp, >, 0); + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + ASSERT3U(*errorp, ==, EAGAIN); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + pio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + } + (*countp)--; if (*countp == 0 && pio->io_stall == countp) { @@ -1264,20 +1270,14 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; + enum zio_stage pipeline = zp->zp_direct_write == B_TRUE ? + ZIO_DIRECT_WRITE_PIPELINE : (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE; - ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && - zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && - zp->zp_compress >= ZIO_COMPRESS_OFF && - zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && - DMU_OT_IS_VALID(zp->zp_type) && - zp->zp_level < 32 && - zp->zp_copies > 0 && - zp->zp_copies <= spa_max_replication(spa)); zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
- ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); + ZIO_STAGE_OPEN, pipeline); zio->io_ready = ready; zio->io_children_ready = children_ready; @@ -1554,6 +1554,19 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, */ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; + } else if (type == ZIO_TYPE_WRITE && + pio->io_prop.zp_direct_write == B_TRUE) { + /* + * By default we only will verify checksums for Direct I/O + * writes for Linux. FreeBSD is able to place user pages under + * write protection before issuing them to the ZIO pipeline. + * + * Checksum validation errors will only be reported through + * the top-level VDEV, which is set by this child ZIO. + */ + ASSERT3P(bp, !=, NULL); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + pipeline |= ZIO_STAGE_DIO_CHECKSUM_VERIFY; } if (vd->vdev_ops->vdev_op_leaf) { @@ -3080,6 +3093,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zp.zp_nopwrite = B_FALSE; zp.zp_encrypt = gio->io_prop.zp_encrypt; zp.zp_byteorder = gio->io_prop.zp_byteorder; + zp.zp_direct_write = B_FALSE; memset(zp.zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp.zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp.zp_mac, 0, ZIO_DATA_MAC_LEN); @@ -4203,6 +4217,20 @@ zio_vdev_io_assess(zio_t *zio) zio->io_vsd = NULL; } + /* + * If a Direct I/O write checksum verify error has occurred then this + * I/O should not attempt to be issued again. Instead the EAGAIN will + * be returned and this write will attempt to be issued through the + * ARC instead. 
+ */ + if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) { + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_LOGICAL); + ASSERT3U(zio->io_error, ==, EAGAIN); + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + return (zio); + } + + if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); @@ -4516,6 +4544,58 @@ zio_checksum_verify(zio_t *zio) return (zio); } +static zio_t * +zio_dio_checksum_verify(zio_t *zio) +{ + zio_t *pio = zio_unique_parent(zio); + + ASSERT3P(zio->io_vd, !=, NULL); + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3B(pio->io_prop.zp_direct_write, ==, B_TRUE); + ASSERT3U(pio->io_child_type, ==, ZIO_CHILD_LOGICAL); + + if (zfs_vdev_direct_write_verify_pct == 0 || zio->io_error != 0) + goto out; + + /* + * A Direct I/O write checksum verification will only be + * performed based on the top-level VDEV percentage for checks. + */ + uint32_t rand = random_in_range(100); + int error; + + if (rand < zfs_vdev_direct_write_verify_pct) { + if ((error = zio_checksum_error(zio, NULL)) != 0) { + zio->io_error = error; + if (error == ECKSUM) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_dio_verify_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + zio->io_error = SET_ERROR(EAGAIN); + zio->io_flags |= ZIO_FLAG_DIO_CHKSUM_ERR; + + /* + * The EAGAIN error must be propagated up to the + * logical parent ZIO in zio_notify_parent() so + * it can be returned to dmu_write_abd(). + */ + zio->io_flags &= ~ZIO_FLAG_DONT_PROPAGATE; + + (void) zfs_ereport_post( + FM_EREPORT_ZFS_DIO_VERIFY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0); + } + } + } + +out: + return (zio); +} + + /* * Called by RAID-Z to ensure we don't compute the checksum twice. */ @@ -4846,7 +4926,8 @@ zio_done(zio_t *zio) * device is currently unavailable. 
*/ if (zio->io_error != ECKSUM && zio->io_vd != NULL && - !vdev_is_dead(zio->io_vd)) { + !vdev_is_dead(zio->io_vd) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0); if (ret != EALREADY) { @@ -4860,7 +4941,8 @@ zio_done(zio_t *zio) } if ((zio->io_error == EIO || !(zio->io_flags & - (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DIO_CHKSUM_ERR))) && zio == zio->io_logical) { /* * For logical I/O requests, tell the SPA to log the @@ -4882,7 +4964,8 @@ zio_done(zio_t *zio) ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); if (IO_IS_ALLOCATING(zio) && - !(zio->io_flags & ZIO_FLAG_CANFAIL)) { + !(zio->io_flags & ZIO_FLAG_CANFAIL) && + !(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else @@ -4932,6 +5015,14 @@ zio_done(zio_t *zio) zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; if (zio->io_reexecute) { + /* + * A Direct I/O write that has a checksum verify error should + * not attempt to reexecute. Instead, EAGAIN should just be + * propagated back up so the write can attempt to be issued + * through the ARC. + */ + ASSERT(!(zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR)); + /* + * This is a logical I/O that wants to reexecute. 
* @@ -5093,6 +5184,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_vdev_io_done, zio_vdev_io_assess, zio_checksum_verify, + zio_dio_checksum_verify, zio_done }; diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 558cd425afd8..f1f5e0954df4 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -666,6 +666,14 @@ tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] tags = ['functional', 'delegate'] +[tests/functional/direct] +tests = ['dio_aligned_block', 'dio_async_always', 'dio_async_fio_ioengines', + 'dio_compression', 'dio_dedup', 'dio_encryption', 'dio_grow_block', + 'dio_max_recordsize', 'dio_mixed', 'dio_mmap', 'dio_overwrites', + 'dio_property', 'dio_random', 'dio_recordsize', 'dio_unaligned_block', + 'dio_unaligned_filesize'] +tags = ['functional', 'direct'] + [tests/functional/exec] tests = ['exec_001_pos', 'exec_002_neg'] tags = ['functional', 'exec'] @@ -708,7 +716,7 @@ pre = tags = ['functional', 'inheritance'] [tests/functional/io] -tests = ['sync', 'psync', 'posixaio', 'mmap'] +tests = ['mmap', 'posixaio', 'psync', 'sync'] tags = ['functional', 'io'] [tests/functional/inuse] diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run index 13696d645850..e1ae0c6b7721 100644 --- a/tests/runfiles/freebsd.run +++ b/tests/runfiles/freebsd.run @@ -30,3 +30,7 @@ tags = ['functional', 'cli_root', 'zfs_jail'] tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = ['functional', 'pam'] + +[tests/functional/direct:FreeBSD] +tests = ['dio_write_stable_pages'] +tags = ['functional', 'direct'] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index a0b74ef4a8c6..8ef1bf6b7021 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -102,6 +102,10 @@ tags = ['functional', 'compression'] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = 
['functional', 'devices'] +[tests/functional/direct:Linux] +tests = ['dio_write_verify'] +tags = ['functional', 'direct'] + [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 0ed0a69eb013..e9e3b8f73e42 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -16,6 +16,7 @@ /getversion /largest_file /libzfs_input_check +/manipulate_user_buffer /mkbusy /mkfile /mkfiles diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 23848a82ffbd..2017294fc203 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -60,6 +60,8 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/libzfs_input_check libzfs_core.la \ libnvpair.la +scripts_zfs_tests_bin_PROGRAMS += %D%/manipulate_user_buffer +%C%_manipulate_user_buffer_LDADD = -lpthread scripts_zfs_tests_bin_PROGRAMS += %D%/mkbusy %D%/mkfile %D%/mkfiles %D%/mktree %C%_mkfile_LDADD = $(LTLIBINTL) diff --git a/tests/zfs-tests/cmd/manipulate_user_buffer.c b/tests/zfs-tests/cmd/manipulate_user_buffer.c new file mode 100644 index 000000000000..c195a197aded --- /dev/null +++ b/tests/zfs-tests/cmd/manipulate_user_buffer.c @@ -0,0 +1,260 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 by Triad National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a) < (b)) ? (a) : (b) +#endif + +static char *outputfile = NULL; +static int blocksize = 131072; /* 128K */ +static int numblocks = 100; +static char *execname = NULL; +static int print_usage = 0; +static int randompattern = 0; +static int ofd; +char *buf = NULL; + +typedef struct { + int entire_file_written; +} pthread_args_t; + +static void +usage(void) +{ + (void) fprintf(stderr, + "usage %s -o outputfile [-b blocksize] [-n numblocks]\n" + " [-p randpattern] [-h help]\n" + "\n" + "Testing whether checksum verify works correctly for O_DIRECT.\n" + "when manipulating the contents of a userspace buffer.\n" + "\n" + " outputfile: File to write to.\n" + " blocksize: Size of each block to write (must be at \n" + " least >= 512).\n" + " numblocks: Total number of blocksized blocks to write.\n" + " randpattern: Fill data buffer with random data. 
Default \n" + " behavior is to fill the buffer with the \n" + " known data pattern (0xdeadbeef).\n" + " help: Print usage information and exit.\n" + "\n" + " Required parameters:\n" + " outputfile\n" + "\n" + " Default Values:\n" + " blocksize -> 131072\n" + " numblocks -> 100\n" + " randpattern -> false\n", + execname); + (void) exit(1); +} + +static void +parse_options(int argc, char *argv[]) +{ + int c; + int errflag = 0; + extern char *optarg; + extern int optind, optopt; + execname = argv[0]; + + while ((c = getopt(argc, argv, "b:hn:o:p")) != -1) { + switch (c) { + case 'b': + blocksize = atoi(optarg); + break; + + case 'h': + print_usage = 1; + break; + + case 'n': + numblocks = atoi(optarg); + break; + + case 'o': + outputfile = optarg; + break; + + case 'p': + randompattern = 1; + break; + + case ':': + (void) fprintf(stderr, + "Option -%c requires an opertand\n", + optopt); + errflag++; + break; + case '?': + default: + (void) fprintf(stderr, + "Unrecognized option: -%c\n", optopt); + errflag++; + break; + } + } + + if (errflag || print_usage == 1) + (void) usage(); + + if (blocksize < 512 || outputfile == NULL || numblocks <= 0) { + (void) fprintf(stderr, + "Required paramater(s) missing or invalid.\n"); + (void) usage(); + } +} + +/* + * Write blocksize * numblocks to the file using O_DIRECT. + */ +static void * +write_thread(void *arg) +{ + size_t offset = 0; + int total_data = blocksize * numblocks; + int left = total_data; + ssize_t wrote = 0; + pthread_args_t *args = (pthread_args_t *)arg; + + while (!args->entire_file_written) { + wrote = pwrite(ofd, buf, blocksize, offset); + if (wrote != blocksize) { + perror("write"); + exit(2); + } + + offset = ((offset + blocksize) % total_data); + left -= blocksize; + + if (left == 0) + args->entire_file_written = 1; + } + + pthread_exit(NULL); +} + +/* + * Update the buffers contents with random data. 
+ */ +static void * +manipulate_buf_thread(void *arg) +{ + size_t rand_offset; + char rand_char; + pthread_args_t *args = (pthread_args_t *)arg; + + while (!args->entire_file_written) { + rand_offset = (rand() % blocksize); + rand_char = (rand() % (126 - 33) + 33); + buf[rand_offset] = rand_char; + } + + pthread_exit(NULL); +} + +int +main(int argc, char *argv[]) +{ + const char *datapattern = "0xdeadbeef"; + int ofd_flags = O_WRONLY | O_CREAT | O_DIRECT; + mode_t mode = S_IRUSR | S_IWUSR; + pthread_t write_thr; + pthread_t manipul_thr; + int left = blocksize; + int offset = 0; + int rc; + pthread_args_t args = { 0 }; + + parse_options(argc, argv); + + ofd = open(outputfile, ofd_flags, mode); + if (ofd == -1) { + (void) fprintf(stderr, "%s, %s\n", execname, outputfile); + perror("open"); + exit(2); + } + + int err = posix_memalign((void **)&buf, sysconf(_SC_PAGE_SIZE), + blocksize); + if (err != 0) { + (void) fprintf(stderr, + "%s: %s\n", execname, strerror(err)); + exit(2); + } + + if (!randompattern) { + /* Putting known data pattern in buffer */ + while (left) { + size_t amt = MIN(strlen(datapattern), left); + memcpy(&buf[offset], datapattern, amt); + offset += amt; + left -= amt; + } + } else { + /* Putting random data in buffer */ + for (int i = 0; i < blocksize; i++) + buf[i] = rand(); + } + + /* + * Writing using O_DIRECT while manipulating the buffer conntents until + * the entire file is written. 
+ */ + if ((rc = pthread_create(&manipul_thr, NULL, manipulate_buf_thread, + &args))) { + fprintf(stderr, "error: pthreads_create, manipul_thr, " + "rc: %d\n", rc); + exit(2); + } + + if ((rc = pthread_create(&write_thr, NULL, write_thread, &args))) { + fprintf(stderr, "error: pthreads_create, write_thr, " + "rc: %d\n", rc); + exit(2); + } + + pthread_join(write_thr, NULL); + pthread_join(manipul_thr, NULL); + + assert(args.entire_file_written == 1); + + (void) close(ofd); + + free(buf); + + return (0); +} diff --git a/tests/zfs-tests/cmd/stride_dd.c b/tests/zfs-tests/cmd/stride_dd.c index a20b26131650..19aab1c97f0c 100644 --- a/tests/zfs-tests/cmd/stride_dd.c +++ b/tests/zfs-tests/cmd/stride_dd.c @@ -21,12 +21,19 @@ #include #include +static int alignment = 0; static int bsize = 0; static int count = 0; static char *ifile = NULL; static char *ofile = NULL; -static off_t stride = 0; +static off_t stride = 1; static off_t seek = 0; +static int seekbytes = 0; +static int if_o_direct = 0; +static int of_o_direct = 0; +static int skip = 0; +static int skipbytes = 0; +static int entire_file = 0; static const char *execname = "stride_dd"; static void usage(void); @@ -36,8 +43,10 @@ static void usage(void) { (void) fprintf(stderr, - "usage: %s -i inputfile -o outputfile -b blocksize -c count \n" - " -s stride [ -k seekblocks]\n" + "usage: %s -i inputfile -o outputfile -b blocksize [-c count]\n" + " [-s stride] [-k seekblocks] [-K seekbytes]\n" + " [-a alignment] [-d if_o_direct] [-D of_o_direct]\n" + " [-p skipblocks] [-P skipbytes] [-e entire_file]\n" "\n" "Simplified version of dd that supports the stride option.\n" "A stride of n means that for each block written, n - 1 blocks\n" @@ -45,16 +54,47 @@ usage(void) "means that blocks are read and written consecutively.\n" "All numeric parameters must be integers.\n" "\n" - " inputfile: File to read from\n" - " outputfile: File to write to\n" - " blocksize: Size of each block to read/write\n" - " count: Number of blocks to 
read/write\n" - " stride: Read/write a block then skip (stride - 1) blocks\n" - " seekblocks: Number of blocks to skip at start of output\n", + " inputfile: File to read from\n" + " outputfile: File to write to\n" + " blocksize: Size of each block to read/write\n" + " count: Number of blocks to read/write (Required" + " unless -e is used)\n" + " stride: Read/write a block then skip (stride - 1) blocks" + "\n" + " seekblocks: Number of blocks to skip at start of output\n" + " seekbytes: Treat seekblocks as byte count\n" + " alignment: Alignment passed to posix_memalign() (default" + " PAGE_SIZE)\n" + " if_o_direct: Use O_DIRECT with inputfile (default no O_DIRECT)" + "\n" + " of_o_direct: Use O_DIRECT with outputfile (default no " + " O_DIRECT)\n" + " skipblocks: Number of blocks to skip at start of input " + " (default 0)\n" + " skipbytes: Treat skipblocks as byte count\n" + " entire_file: When used the entire inputfile will be read and" + " count will be ignored\n", execname); (void) exit(1); } +/* + * posix_memalign() only allows for alignments which are postive, powers of two + * and a multiple of sizeof (void *). 
+ */ +static int +invalid_alignment(int alignment) +{ + if ((alignment < 0) || (alignment & (alignment - 1)) || + ((alignment % sizeof (void *)))) { + (void) fprintf(stderr, + "Alignment must be a postive, power of two, and multiple " + "of sizeof (void *).\n"); + return (1); + } + return (0); +} + static void parse_options(int argc, char *argv[]) { @@ -62,12 +102,17 @@ parse_options(int argc, char *argv[]) int errflag = 0; execname = argv[0]; + alignment = sysconf(_SC_PAGE_SIZE); extern char *optarg; extern int optind, optopt; - while ((c = getopt(argc, argv, ":b:c:i:o:s:k:")) != -1) { + while ((c = getopt(argc, argv, "a:b:c:deDi:o:s:k:Kp:P")) != -1) { switch (c) { + case 'a': + alignment = atoi(optarg); + break; + case 'b': bsize = atoi(optarg); break; @@ -76,6 +121,18 @@ parse_options(int argc, char *argv[]) count = atoi(optarg); break; + case 'd': + if_o_direct = 1; + break; + + case 'e': + entire_file = 1; + break; + + case 'D': + of_o_direct = 1; + break; + case 'i': ifile = optarg; break; @@ -92,6 +149,18 @@ parse_options(int argc, char *argv[]) seek = atoi(optarg); break; + case 'K': + seekbytes = 1; + break; + + case 'p': + skip = atoi(optarg); + break; + + case 'P': + skipbytes = 1; + break; + case ':': (void) fprintf(stderr, "Option -%c requires an operand\n", optopt); @@ -111,64 +180,60 @@ parse_options(int argc, char *argv[]) } } - if (bsize <= 0 || count <= 0 || stride <= 0 || ifile == NULL || - ofile == NULL || seek < 0) { + if (bsize <= 0 || stride <= 0 || ifile == NULL || ofile == NULL || + seek < 0 || invalid_alignment(alignment) || skip < 0) { + (void) fprintf(stderr, + "Required parameter(s) missing or invalid.\n"); + (void) usage(); + } + + if (count <= 0 && entire_file == 0) { (void) fprintf(stderr, "Required parameter(s) missing or invalid.\n"); (void) usage(); } } -int -main(int argc, char *argv[]) +static void +read_entire_file(int ifd, int ofd, void *buf) { - int i; - int ifd; - int ofd; - void *buf; int c; - parse_options(argc, argv); - - 
ifd = open(ifile, O_RDONLY); - if (ifd == -1) { - (void) fprintf(stderr, "%s: %s: ", execname, ifile); - perror("open"); - exit(2); - } - - ofd = open(ofile, O_WRONLY | O_CREAT, 0666); - if (ofd == -1) { - (void) fprintf(stderr, "%s: %s: ", execname, ofile); - perror("open"); - exit(2); - } + do { + c = read(ifd, buf, bsize); + if (c < 0) { + perror("read"); + exit(2); + } else if (c != 0) { + c = write(ofd, buf, bsize); + if (c < 0) { + perror("write"); + exit(2); + } - /* - * We use valloc because some character block devices expect a - * page-aligned buffer. - */ - int err = posix_memalign(&buf, 4096, bsize); - if (err != 0) { - (void) fprintf(stderr, - "%s: %s\n", execname, strerror(err)); - exit(2); - } + } - if (seek > 0) { - if (lseek(ofd, seek * bsize, SEEK_CUR) == -1) { - perror("output lseek"); - exit(2); + if (stride > 1) { + if (lseek(ifd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("input lseek"); + exit(2); + } + if (lseek(ofd, (stride - 1) * bsize, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } } - } + } while (c != 0); +} + +static void +read_on_count(int ifd, int ofd, void *buf) +{ + int i; + int c; for (i = 0; i < count; i++) { c = read(ifd, buf, bsize); - if (c != bsize) { - - perror("read"); - exit(2); - } if (c != bsize) { if (c < 0) { perror("read"); @@ -205,6 +270,71 @@ main(int argc, char *argv[]) } } } +} + +int +main(int argc, char *argv[]) +{ + int ifd; + int ofd; + int ifd_flags = O_RDONLY; + int ofd_flags = O_WRONLY | O_CREAT; + void *buf; + + parse_options(argc, argv); + + if (if_o_direct) + ifd_flags |= O_DIRECT; + + if (of_o_direct) + ofd_flags |= O_DIRECT; + + ifd = open(ifile, ifd_flags); + if (ifd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ifile); + perror("open"); + exit(2); + } + + ofd = open(ofile, ofd_flags, 0666); + if (ofd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ofile); + perror("open"); + exit(2); + } + + /* + * We use valloc because some character block devices expect a + * 
page-aligned buffer. + */ + int err = posix_memalign(&buf, alignment, bsize); + if (err != 0) { + (void) fprintf(stderr, + "%s: %s\n", execname, strerror(err)); + exit(2); + } + + if (skip > 0) { + int skipamt = skipbytes == 1 ? skip : skip * bsize; + if (lseek(ifd, skipamt, SEEK_CUR) == -1) { + perror("input lseek"); + exit(2); + } + } + + if (seek > 0) { + int seekamt = seekbytes == 1 ? seek : seek * bsize; + if (lseek(ofd, seekamt, SEEK_CUR) == -1) { + perror("output lseek"); + exit(2); + } + } + + if (entire_file == 1) + read_entire_file(ifd, ofd, buf); + else + read_on_count(ifd, ofd, buf); + free(buf); (void) close(ofd); diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index daa794551682..4b7759daf88f 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -199,6 +199,7 @@ export ZFSTEST_FILES='badsend getversion largest_file libzfs_input_check + manipulate_user_buffer mkbusy mkfile mkfiles diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index dfab48d2cdaf..dfeb4db01215 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3446,6 +3446,18 @@ function md5digest esac } +# +# Compare the MD5 digest of two files. +# +function cmp_md5s { + typeset file1=$1 + typeset file2=$2 + + typeset sum1=$(md5digest $file1) + typeset sum2=$(md5digest $file2) + test "$sum1" = "$sum2" +} + # # Compute SHA256 digest for given file or stdin if no file given. 
# Note: file path must not contain spaces diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index a619b846dd11..b995513ccc81 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -88,6 +88,7 @@ VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count +VDEV_DIRECT_WR_VERIFY_PCT vdev.direct_write_verify_pct zfs_vdev_direct_write_verify_pct VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index f182a2825cd6..a6a0c023ed13 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -265,6 +265,8 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ + functional/direct/dio.cfg \ + functional/direct/dio.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/fault/fault.cfg \ @@ -1437,6 +1439,26 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ + functional/direct/dio_aligned_block.ksh \ + functional/direct/dio_async_always.ksh \ + functional/direct/dio_async_fio_ioengines.ksh \ + functional/direct/dio_compression.ksh \ + functional/direct/dio_dedup.ksh \ + functional/direct/dio_encryption.ksh \ + functional/direct/dio_grow_block.ksh \ + functional/direct/dio_max_recordsize.ksh \ + functional/direct/dio_mixed.ksh \ + functional/direct/dio_mmap.ksh \ + functional/direct/dio_overwrites.ksh \ + 
functional/direct/dio_property.ksh \ + functional/direct/dio_random.ksh \ + functional/direct/dio_recordsize.ksh \ + functional/direct/dio_unaligned_block.ksh \ + functional/direct/dio_unaligned_filesize.ksh \ + functional/direct/dio_write_verify.ksh \ + functional/direct/dio_write_stable_pages.ksh \ + functional/direct/setup.ksh \ + functional/direct/cleanup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh index 945db71bf113..20498440bea7 100755 --- a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -75,7 +75,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) )) log_must set_tunable32 L2ARC_WRITE_MAX $(( $VCACHE_SZ * 2 )) diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh index 57f6b6a0242b..1d3cbfc79ee6 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh @@ -36,7 +36,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 # # DESCRIPTION: diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh index f7b8a4b950d5..460c95bb6051 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh @@ -37,7 +37,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 
-export DIRECT=1 +export DIRECT=0 # # DESCRIPTION: diff --git a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh index 0838b2c93e68..2f352e2af5d4 100755 --- a/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh +++ b/tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh @@ -37,7 +37,7 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 # # DESCRIPTION: diff --git a/tests/zfs-tests/tests/functional/direct/cleanup.ksh b/tests/zfs-tests/tests/functional/direct/cleanup.ksh new file mode 100755 index 000000000000..382e9b1734b0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/cleanup.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. 
$STF_SUITE/include/libtest.shlib + +verify_runnable "global" + +default_cleanup diff --git a/tests/zfs-tests/tests/functional/direct/dio.cfg b/tests/zfs-tests/tests/functional/direct/dio.cfg new file mode 100644 index 000000000000..6472610d7b41 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio.cfg @@ -0,0 +1,26 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# + +DIO_VDEV1=$TEST_BASE_DIR/file1 +DIO_VDEV2=$TEST_BASE_DIR/file2 +DIO_VDEV3=$TEST_BASE_DIR/file3 +DIO_VDEVS="$DIO_VDEV1 $DIO_VDEV2 $DIO_VDEV3" + +DIO_FILESIZE=4M +DIO_BS=128K diff --git a/tests/zfs-tests/tests/functional/direct/dio.kshlib b/tests/zfs-tests/tests/functional/direct/dio.kshlib new file mode 100644 index 000000000000..3a70cf293967 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio.kshlib @@ -0,0 +1,331 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/direct/dio.cfg
+
+function dio_cleanup
+{
+	if poolexists $TESTPOOL1; then
+		destroy_pool $TESTPOOL1
+	fi
+
+	rm -f $DIO_VDEVS
+}
+
+#
+# Generate an IO workload using fio and then verify the resulting data.
+#
+function dio_and_verify # mode file-size block-size directory ioengine extra-args
+{
+	typeset mode=$1
+	typeset size=$2
+	typeset bs=$3
+	typeset mntpnt=$4
+	typeset ioengine=$5
+	typeset extra_args=$6
+
+	# Invoke an fio workload via Direct I/O and verify with Direct I/O.
+	log_must fio --directory=$mntpnt --name=direct-$mode \
+	    --rw=$mode --size=$size --bs=$bs --direct=1 --numjobs=1 \
+	    --verify=sha1 --ioengine=$ioengine --fallocate=none \
+	    --group_reporting --minimal --do_verify=1 $extra_args
+
+	# Now just read back the file without Direct I/O into the ARC as an
+	# additional verification step.
+	log_must fio --directory=$mntpnt --name=direct-$mode \
+	    --rw=read --size=$size --bs=$bs --direct=0 --numjobs=1 \
+	    --ioengine=$ioengine --group_reporting --minimal
+
+	log_must rm -f "$mntpnt/direct-*"
+}
+
+#
+# Get zpool status -d checksum verify failures
+#
+function get_zpool_status_chksum_verify_failures # pool_name vdev_type
+{
+	typeset pool=$1
+	typeset vdev_type=$2
+
+	if [[ "$vdev_type" == "stripe" ]]; then
+		val=$(zpool status -dp $pool | \
+		    awk '{s+=$6} END {print s}' )
+	elif [[ "$vdev_type" == "mirror" || "$vdev_type" == "raidz" ||
+	    "$vdev_type" == "draid" ]]; then
+		val=$(zpool status -dp $pool | \
+		    awk -v d="$vdev_type" '$0 ~ d {print $6}' )
+	else
+		log_fail "Unsupported VDEV type in \
+		    get_zpool_status_chksum_verify_failures(): $vdev_type"
+	fi
+	echo "$val"
+}
+
+#
+# Get ZED dio_verify events
+#
+function get_zed_dio_verify_events # pool
+{
+	typeset pool=$1
+
+	val=$(zpool events $pool | grep -c dio_verify)
+
+	echo "$val"
+}
+
+#
+# Check for checksum verify write failures with:
+#	zpool status -d
+#	zpool events
+# After reading the counts, clear out the pool's errors and events.
+#
+function
check_dio_write_chksum_verify_failures # pool vdev_type expect_errors +{ + typeset pool=$1 + typeset vdev_type=$2 + typeset expect_errors=$3 + typeset note_str="expecting none" + + if [[ $expect_errors -ne 0 ]]; then + note_str="expecting some" + fi + + log_note "Checking for Direct I/O write checksum verify errors \ + $note_str on ZPool: $pool" + + status_failures=$(get_zpool_status_chksum_verify_failures $pool $vdev_type) + zed_dio_verify_events=$(get_zed_dio_verify_events $pool) + + if [[ $expect_errors -ne 0 ]]; then + if [[ $status_failures -eq 0 || + $zed_dio_verify_events -eq 0 ]]; then + zpool status -dp $pool + zpool events $pool + log_fail "Checksum verifies in zpool status -d \ + $status_failures. ZED dio_verify events \ + $zed_dio_verify_events. Neither should be 0." + fi + else + if [[ $status_failures -ne 0 || + $zed_dio_verify_events -ne 0 ]]; then + zpool status -dp $pool + zpool events $pool + log_fail "Checksum verifies in zpool status -d \ + $status_failures. ZED dio_verify events \ + $zed_dio_verify_events. Both should be zero." + fi + fi + + log_must zpool clear $pool + log_must zpool events -c + +} + +# +# Get the value of a counter from +# Linux: /proc/spl/kstat/zfs/$pool/iostats file. +# FreeBSD: kstat.zfs.$pool.msic.iostats.$stat +# +function get_iostats_stat # pool stat +{ + typeset pool=$1 + typeset stat=$2 + + if is_linux; then + iostats_file=/proc/spl/kstat/zfs/$pool/iostats + val=$(grep -m1 "$stat" $iostats_file | awk '{ print $3 }') + else + val=$(sysctl -n kstat.zfs.$pool.misc.iostats.$stat) + fi + if [[ -z "$val" ]]; then + log_fail "Unable to read $stat counter" + fi + + echo "$val" +} + +# +# Evict any buffered blocks by overwritting them using an O_DIRECT request. +# +function evict_blocks +{ + typeset pool=$1 + typeset file=$2 + typeset size=$3 + + log_must stride_dd -i /dev/urandom -o $file -b $size -c 1 -D +} + +# +# Perform FIO Direct I/O writes to a file with the given arguments. 
+# Then verify the minimum expected number of blocks were written as
+# Direct I/O.
+#
+function verify_dio_write_count #pool bs size mntpnt
+{
+	typeset pool=$1
+	typeset bs=$2
+	typeset size=$3
+	typeset mntpnt=$4
+	typeset dio_wr_expected=$(((size / bs) -1))
+
+	log_note "Checking for $dio_wr_expected Direct I/O writes"
+
+	prev_dio_wr=$(get_iostats_stat $pool direct_write_count)
+	dio_and_verify write $size $bs $mntpnt "sync"
+	curr_dio_wr=$(get_iostats_stat $pool direct_write_count)
+	dio_wr_actual=$((curr_dio_wr - prev_dio_wr))
+
+	if [[ $dio_wr_actual -lt $dio_wr_expected ]]; then
+		if is_linux; then
+			cat /proc/spl/kstat/zfs/$pool/iostats
+		else
+			sysctl kstat.zfs.$pool.misc.iostats
+		fi
+		log_fail "Direct writes $dio_wr_actual of $dio_wr_expected"
+	fi
+}
+
+#
+# Perform a stride_dd write command to the file with the given arguments.
+# Then verify the minimum expected number of blocks were written as either
+# buffered IO (by the ARC), or Direct I/O to the application (dd).
+#
+function check_write # pool file bs count seek flags buf_wr dio_wr
+{
+	typeset pool=$1
+	typeset file=$2
+	typeset bs=$3
+	typeset count=$4
+	typeset seek=$5
+	typeset flags=$6
+	typeset buf_wr_expect=$7
+	typeset dio_wr_expect=$8
+
+	log_note "Checking $count * $bs write(s) at offset $seek, $flags"
+
+	prev_buf_wr=$(get_iostats_stat $pool arc_write_count)
+	prev_dio_wr=$(get_iostats_stat $pool direct_write_count)
+
+	log_must stride_dd -i /dev/urandom -o $file -b $bs -c $count \
+	    -k $seek $flags
+
+	curr_buf_wr=$(get_iostats_stat $pool arc_write_count)
+	buf_wr_actual=$((curr_buf_wr - prev_buf_wr))
+
+	curr_dio_wr=$(get_iostats_stat $pool direct_write_count)
+	dio_wr_actual=$((curr_dio_wr - prev_dio_wr))
+
+	if [[ $buf_wr_actual -lt $buf_wr_expect ]]; then
+		if is_linux; then
+			cat /proc/spl/kstat/zfs/$pool/iostats
+		else
+			sysctl kstat.zfs.$pool.misc.iostats
+		fi
+		log_fail "Buffered writes $buf_wr_actual of $buf_wr_expect"
+	fi
+
+	if [[ $dio_wr_actual -lt $dio_wr_expect ]];
then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct writes $dio_wr_actual of $dio_wr_expect" + fi +} + +# +# Perform a stride_dd read command to the file with the given arguments. +# Then verify the minimum expected number of blocks were read as either +# buffered IO (by the ARC), or Direct I/O to the application (dd). +# +function check_read # pool file bs count skip flags buf_rd dio_rd +{ + typeset pool=$1 + typeset file=$2 + typeset bs=$3 + typeset count=$4 + typeset skip=$5 + typeset flags=$6 + typeset buf_rd_expect=$7 + typeset dio_rd_expect=$8 + + log_note "Checking $count * $bs read(s) at offset $skip, $flags" + + prev_buf_rd=$(get_iostats_stat $pool arc_read_count) + prev_dio_rd=$(get_iostats_stat $pool direct_read_count) + + log_must stride_dd -i $file -o /dev/null -b $bs -c $count \ + -p $skip $flags + + curr_buf_rd=$(get_iostats_stat $pool arc_read_count) + buf_rd_actual=$((curr_buf_rd - prev_buf_rd)) + + curr_dio_rd=$(get_iostats_stat $pool direct_read_count) + dio_rd_actual=$((curr_dio_rd - prev_dio_rd)) + + if [[ $buf_rd_actual -lt $buf_rd_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Buffered reads $buf_rd_actual of $buf_rd_expect" + fi + + if [[ $dio_rd_actual -lt $dio_rd_expect ]]; then + if is_linux; then + cat /proc/spl/kstat/zfs/$pool/iostats + else + sysctl kstat.zfs.$pool.misc.iostats + fi + log_fail "Direct reads $dio_rd_actual of $dio_rd_expect" + fi +} + +function get_file_size +{ + typeset filename="$1" + + if is_linux; then + filesize=$(stat -c %s $filename) + else + filesize=$(stat -s $filename | awk '{print $8}' | grep -o '[0-9]\+') + fi + + echo $filesize +} + +function do_truncate_reduce +{ + typeset filename=$1 + typeset size=$2 + + filesize=$(get_file_size $filename) + eval "echo original filesize: $filesize" + if is_linux; then + truncate $filename -s 
$((filesize - size)) + else + truncate -s -$size $filename + fi + filesize=$(get_file_size $filename) + eval "echo new filesize after truncate: $filesize" +} diff --git a/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh new file mode 100755 index 000000000000..4aac5edd8e0d --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_aligned_block.ksh @@ -0,0 +1,116 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify the number direct/buffered requests for (un)aligned access +# +# STRATEGY: +# 1. Create a multi-block file +# 2. Perform various (un)aligned accesses and verify the result. 
+# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_onexit cleanup + +log_assert "Verify the number direct/buffered requests for unaligned access" + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +tmp_file=$mntpnt/tmp_file +file_size=$((rs * 8)) + +log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1 + +# N recordsize aligned writes which do not span blocks +check_write $TESTPOOL $tmp_file $rs 1 0 "-D" 0 1 +check_write $TESTPOOL $tmp_file $rs 2 0 "-D" 0 2 +check_write $TESTPOOL $tmp_file $rs 4 0 "-D" 0 4 +check_write $TESTPOOL $tmp_file $rs 8 0 "-D" 0 8 + +# 1 recordsize aligned write which spans multiple blocks at various offsets +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 0 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 1 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 2 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 3 "-D" 0 2 +check_write $TESTPOOL $tmp_file $((rs * 4)) 1 0 "-D" 0 4 +check_write $TESTPOOL $tmp_file $((rs * 4)) 1 1 "-D" 0 4 +check_write $TESTPOOL $tmp_file $((rs * 8)) 1 0 "-D" 0 8 + +# sub-blocksize unaligned writes which do not span blocks. 
+check_write $TESTPOOL $tmp_file $((rs / 2)) 1 0 "-D" 1 0 +check_write $TESTPOOL $tmp_file $((rs / 2)) 1 1 "-D" 1 0 +check_write $TESTPOOL $tmp_file $((rs / 2)) 1 2 "-D" 1 0 +check_write $TESTPOOL $tmp_file $((rs / 2)) 1 3 "-D" 1 0 + +# large unaligned writes which span multiple blocks +check_write $TESTPOOL $tmp_file $((rs * 2)) 1 $((rs / 2)) "-D -K" 2 1 +check_write $TESTPOOL $tmp_file $((rs * 4)) 2 $((rs / 4)) "-D -K" 4 6 + +# evict any cached blocks by overwriting with O_DIRECT +evict_blocks $TESTPOOL $tmp_file $file_size + +# recordsize aligned reads which do not span blocks +check_read $TESTPOOL $tmp_file $rs 1 0 "-d" 0 1 +check_read $TESTPOOL $tmp_file $rs 2 0 "-d" 0 2 +check_read $TESTPOOL $tmp_file $rs 4 0 "-d" 0 4 +check_read $TESTPOOL $tmp_file $rs 8 0 "-d" 0 8 + +# 1 recordsize aligned read which spans multiple blocks at various offsets +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 0 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 1 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 2 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 2)) 1 3 "-d" 0 2 +check_read $TESTPOOL $tmp_file $((rs * 4)) 1 0 "-d" 0 4 +check_read $TESTPOOL $tmp_file $((rs * 4)) 1 1 "-d" 0 4 +check_read $TESTPOOL $tmp_file $((rs * 8)) 1 0 "-d" 0 8 + +# sub-blocksize unaligned reads which do not span blocks. 
+check_read $TESTPOOL $tmp_file $((rs / 2)) 1 0 "-d" 0 1
+check_read $TESTPOOL $tmp_file $((rs / 2)) 1 1 "-d" 0 1
+check_read $TESTPOOL $tmp_file $((rs / 2)) 1 2 "-d" 0 1
+check_read $TESTPOOL $tmp_file $((rs / 2)) 1 3 "-d" 0 1
+
+# large unaligned reads which span multiple blocks
+check_read $TESTPOOL $tmp_file $((rs * 2)) 1 $((rs / 2)) "-d -P" 0 3
+check_read $TESTPOOL $tmp_file $((rs * 4)) 1 $((rs / 4)) "-d -P" 0 5
+
+log_pass "Verify the number direct/buffered requests for (un)aligned access"
diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh
new file mode 100755
index 000000000000..3f26715fc338
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/direct/dio_async_always.ksh
@@ -0,0 +1,69 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/direct/dio.cfg
+. $STF_SUITE/tests/functional/direct/dio.kshlib
+
+#
+# DESCRIPTION:
+#	Verify small async Direct I/O requests
+#
+# STRATEGY:
+#	1. Use fio to issue small read/write requests.
Writes are +# smaller than the block size and thus will be buffered, +# reads satisfy the minimum alignment and will be direct. +# + +verify_runnable "global" + +function cleanup +{ + zfs set direct=standard $TESTPOOL/$TESTFS + rm $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify direct=always mixed small async requests" + +log_onexit cleanup + +log_must zfs set direct=always $TESTPOOL/$TESTFS + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/tmp_file +page_size=$(getconf PAGESIZE) +file_size=1G +runtime=10 + +log_must truncate -s $file_size $tmp_file + +log_must fio --filename=$tmp_file --name=always-randrw \ + --rw=randwrite --bs=$page_size --size=$file_size --numjobs=1 \ + --ioengine=posixaio --fallocate=none --iodepth=4 --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based + +log_pass "Verify direct=always mixed small async requests" diff --git a/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh new file mode 100755 index 000000000000..b90bad48ad8c --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_async_fio_ioengines.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify FIO async engines work using Direct I/O. +# +# STRATEGY: +# 1. Select a FIO async ioengine +# 2. Start sequntial Direct I/O and verify with buffered I/O +# 3. Start mixed Direct I/O and verify with buffered I/O +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$mntpnt/direct-*" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +function check_fio_ioengine +{ + fio --ioengine=io_uring --parse-only > /dev/null 2>&1 + return $? +} + +log_assert "Verify FIO async ioengines work using Direct I/O." 
+ +log_onexit cleanup + +typeset -a async_ioengine_args=("--iodepth=4" "--iodepth=4 --thread") + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +fio_async_ioengines="posixaio" + +if is_linux; then + fio_async_ioengines+=" libaio" + if $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then + if [ -e /etc/os-release ] ; then + source /etc/os-release + if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && + ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then + log_note "io_uring disabled on CentOS 9, fails " \ + "with 'Operation not permitted'" + elif $(check_fio_ioengine -eq 0); then + fio_async_ioengines+=" io_uring" + else + log_note "io_uring not supported by fio and " \ + "will not be tested" + fi + else + if $(check_fio_ioengine); then + fio_async_ioengines+=" io_uring" + + else + log_note "io_uring not supported by fio and " \ + "will not be tested" + fi + fi + else + log_note "io_uring not supported by kernel will not " \ + "be tested" + + fi +fi + +for ioengine in $fio_async_ioengines; do + for ioengine_args in "${async_ioengine_args[@]}"; do + for op in "rw" "randrw" "write"; do + log_note "Checking Direct I/O with FIO async ioengine" \ + " $ioengine with args $ioengine_args --rw=$op" + dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "$ioengine" \ + "$ioengine_args" + done + done +done + +log_pass "Verfied FIO async ioengines work using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_compression.ksh b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh new file mode 100755 index 000000000000..5be93d104d8c --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_compression.ksh @@ -0,0 +1,66 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/properties.shlib
+. $STF_SUITE/tests/functional/direct/dio.cfg
+. $STF_SUITE/tests/functional/direct/dio.kshlib
+
+#
+# DESCRIPTION:
+#	Verify compression works using Direct I/O.
+#
+# STRATEGY:
+#	1. Select a random compression algorithm
+#	2. Start sequential Direct I/O and verify with buffered I/O
+#	3. Start mixed Direct I/O and verify with buffered I/O
+#	4. Repeat from 2 for all compression algorithms
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	log_must rm -f "$mntpnt/direct-*"
+	log_must zfs set compression=off $TESTPOOL/$TESTFS
+	check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0
+}
+
+log_assert "Verify compression works using Direct I/O."
+
+log_onexit cleanup
+
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+compress_args="--buffer_compress_percentage=50"
+
+for comp in "${compress_prop_vals[@]:1}"; do
+	log_must zfs set compression=$comp $TESTPOOL/$TESTFS
+	for op in "rw" "randrw" "write"; do
+		dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "sync" $compress_args
+	done
+done
+
+log_pass "Verified compression works using Direct I/O"
diff --git a/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh
new file mode 100755
index 000000000000..c703fcc05f67
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/direct/dio_dedup.ksh
@@ -0,0 +1,63 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/properties.shlib
+. $STF_SUITE/tests/functional/direct/dio.cfg
+. $STF_SUITE/tests/functional/direct/dio.kshlib
+
+#
+# DESCRIPTION:
+#	Verify deduplication works using Direct I/O.
+#
+# STRATEGY:
+#	1. Enable dedup
+#	2. Start sequential Direct I/O and verify with buffered I/O
+#	3.
Start mixed Direct I/O and verify with buffered I/O
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+	log_must rm -f "$mntpnt/direct-*"
+	log_must zfs set dedup=off $TESTPOOL/$TESTFS
+	check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0
+}
+
+log_assert "Verify deduplication works using Direct I/O."
+
+log_onexit cleanup
+
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+dedup_args="--dedupe_percentage=50"
+
+log_must zfs set dedup=on $TESTPOOL/$TESTFS
+for op in "rw" "randrw" "write"; do
+	dio_and_verify $op $DIO_FILESIZE $DIO_BS $mntpnt "sync" $dedup_args
+done
+
+log_pass "Verified deduplication works using Direct I/O"
diff --git a/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh
new file mode 100755
index 000000000000..843b570d2dd5
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/direct/dio_encryption.ksh
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/direct/dio.cfg
+.
$STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify encryption works using Direct I/O. +# +# STRATEGY: +# 1. Create multidisk pool. +# 2. Start some mixed readwrite Direct I/O. +# 3. Verify the results are as expected using buffered I/O. +# + +verify_runnable "global" + +log_assert "Verify encryption works using Direct I/O." + +log_onexit dio_cleanup + +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS + +create_pool $TESTPOOL1 $DIO_VDEVS +log_must eval "echo 'password' | zfs create -o encryption=on \ + -o keyformat=passphrase -o keylocation=prompt -o compression=off \ + $TESTPOOL1/$TESTFS1" + +mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + +for bs in "4k" "128k" "1m"; do + for op in "rw" "randrw" "write"; do + dio_and_verify $op $DIO_FILESIZE $bs $mntpnt "sync" + done +done + +check_dio_write_chksum_verify_failures $TESTPOOL1 "stripe" 0 + +log_pass "Verified encryption works using Direct I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh new file mode 100755 index 000000000000..c54d07936625 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_grow_block.ksh @@ -0,0 +1,87 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify the number direct/buffered requests when growing a file +# +# STRATEGY: +# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify the number direct/buffered requests when growing a file" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +tmp_file=$mntpnt/tmp_file + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +# +# Verify the expected number of buffered and Direct I/O's when growing +# the first block of a file up to the maximum recordsize. +# +for bs in "8192" "16384" "32768" "65536" "131072"; do + + # When O_DIRECT is set the first write to a new file, or when the + # block size needs to be grown, it will be done as a buffered write. + check_write $TESTPOOL $tmp_file $bs 1 0 "-D" 1 0 + + # Overwriting the first block of an existing file with O_DIRECT will + # be a buffered write if less than the block size. + check_write $TESTPOOL $tmp_file 4096 1 0 "-D" 1 0 + check_write $TESTPOOL $tmp_file 4096 1 1 "-D" 1 0 + + # Overwriting the first block of an existing file with O_DIRECT will + # be a direct write as long as the block size matches. + check_write $TESTPOOL $tmp_file $bs 1 0 "-D" 0 1 + + # Evict any blocks which may be buffered before the read tests. 
+ evict_blocks $TESTPOOL $tmp_file $bs + + # Reading the first block of an existing file with O_DIRECT will + # be a direct read for part or all of the block size. + check_read $TESTPOOL $tmp_file $bs 1 0 "-d" 0 1 + check_read $TESTPOOL $tmp_file 4096 1 0 "-d" 0 1 + check_read $TESTPOOL $tmp_file 4096 1 1 "-d" 0 1 +done + +log_pass "Verify the number direct/buffered requests when growing a file" diff --git a/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh new file mode 100755 index 000000000000..87900443ed1a --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_max_recordsize.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify max recordsizes are supported for Direct I/O. +# +# STRATEGY: +# 1. Create a pool from each vdev type with varying recordsizes. +# 2. Start sequential Direct I/O and verify with buffered I/O. 
+# + +verify_runnable "global" + +log_assert "Verify max recordsizes are supported for Direct I/O." + +log_onexit dio_cleanup + +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS + +for type in "" "mirror" "raidz" "draid"; do + for recsize in "2097152" "8388608" "16777216"; do + create_pool $TESTPOOL1 $type $DIO_VDEVS + log_must eval "zfs create \ + -o recordsize=$recsize -o compression=off \ + $TESTPOOL1/$TESTFS1" + + mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + + verify_dio_write_count $TESTPOOL1 $recsize $((4 * recsize)) \ + $mntpnt + + if [[ "$type" == "" ]]; then + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "stripe" 0 + else + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "$type" 0 + fi + + destroy_pool $TESTPOOL1 + done +done + +log_pass "Verified max recordsizes are supported for Direct I/O." diff --git a/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh new file mode 100755 index 000000000000..38c61595371b --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_mixed.ksh @@ -0,0 +1,108 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify mixed buffered and Direct I/O are coherent. +# +# STRATEGY: +# 1. Verify interleaved buffered and Direct I/O +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f $src_file $new_file $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify mixed buffered and Direct I/O are coherent." + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +src_file=$mntpnt/src_file +new_file=$mntpnt/new_file +tmp_file=$mntpnt/tmp_file +page_size=$(getconf PAGESIZE) +file_size=1048576 + +log_must stride_dd -i /dev/urandom -o $src_file -b $file_size -c 1 + +# +# Using mixed input and output block sizes verify that buffered and +# Direct I/O can be interleaved and the result with always be coherent. +# +for ibs in "512" "$page_size" "131072"; do + for obs in "512" "$page_size" "131072"; do + iblocks=$(($file_size / $ibs)) + oblocks=$(($file_size / $obs)) + iflags="" + oflags="" + + # Only allow Direct I/O when it is at least page sized. + if [[ $ibs -ge $page_size ]]; then + iflags="-d" + fi + + if [[ $obs -ge $page_size ]]; then + oflags="-D" + fi + + # Verify buffered write followed by a direct read. + log_must stride_dd -i $src_file -o $new_file -b $obs \ + -c $oblocks + log_must stride_dd -i $new_file -o $tmp_file -b $ibs \ + -c $iblocks $iflags + log_must cmp_md5s $new_file $tmp_file + log_must rm -f $new_file $tmp_file + + # Verify direct write followed by a buffered read. 
+ log_must stride_dd -i $src_file -o $new_file -b $obs \ + -c $oblocks $oflags + log_must stride_dd -i $new_file -o $tmp_file -b $ibs \ + -c $iblocks + log_must cmp_md5s $new_file $tmp_file + log_must rm -f $new_file $tmp_file + + # Verify direct write followed by a direct read. + log_must stride_dd -i $src_file -o $new_file -b $obs \ + -c $oblocks $oflags + log_must stride_dd -i $new_file -o $tmp_file -b $ibs \ + -c $iblocks $iflags + log_must cmp_md5s $new_file $tmp_file + log_must rm -f $new_file $tmp_file + done +done + +log_pass "Verify mixed buffered and Direct I/O are coherent." diff --git a/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh new file mode 100755 index 000000000000..27d03e04125e --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_mmap.ksh @@ -0,0 +1,93 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify mixed Direct I/O and mmap I/O. +# +# STRATEGY: +# 1. 
Create an empty file. +# 2. Start a background Direct I/O random read/write fio to the +# file. +# 3. Start a background mmap random read/write fio to the file. +# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f "$tmp_file" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify mixed Direct I/O and mmap I/O" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((128 * 1024)) +blocks=64 +size=$((bs * blocks)) +runtime=60 + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks + +# Direct I/O writes +log_must eval "fio --filename=$tmp_file --name=direct-write \ + --rw=randwrite --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# Direct I/O reads +log_must eval "fio --filename=$tmp_file --name=direct-read \ + --rw=randread --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap I/O writes +log_must eval "fio --filename=$tmp_file --name=mmap-write \ + --rw=randwrite --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +# mmap I/O reads +log_must eval "fio --filename=$tmp_file --name=mmap-read \ + --rw=randread --size=$size --bs=$bs --numjobs=1 \ + --ioengine=mmap --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap &" + +wait + +log_pass "Verfied mixed Direct I/O and mmap I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh new file mode 100755 index 
000000000000..3854766ed873 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_overwrites.ksh @@ -0,0 +1,71 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify Direct I/O overwrite. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a Direct I/O random write fio to the file. 
+# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + log_must rm -f "$tmp_file" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify Direct I/O overwrites" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((128 * 1024)) +blocks=64 +size=$((bs * blocks)) +runtime=60 + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks + +# Direct I/O overwrites +log_must eval "fio --filename=$tmp_file --name=direct-write \ + --rw=randwrite --size=$size --bs=$bs --direct=1 --numjobs=1 \ + --ioengine=sync --fallocate=none --group_reporting --minimal \ + --runtime=$runtime --time_based --norandommap" + +log_pass "Verfied Direct I/O overwrites" diff --git a/tests/zfs-tests/tests/functional/direct/dio_property.ksh b/tests/zfs-tests/tests/functional/direct/dio_property.ksh new file mode 100755 index 000000000000..4fbcfec06810 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_property.ksh @@ -0,0 +1,126 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify the direct=always|disabled|standard property +# +# STRATEGY: +# 1. Verify direct=always behavior +# 2. Verify direct=disabled behavior +# 3. Verify direct=standard behavior +# + +verify_runnable "global" + +function cleanup +{ + zfs set direct=standard $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify the direct=always|disabled|standard property" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) + +tmp_file=$mntpnt/tmp_file +page_size=$(getconf PAGESIZE) +file_size=1048576 +count=8 + +# +# Check when "direct=always" any aligned IO is done as direct. +# Note that "flag=direct" is not set in the following calls to dd(1). 
+# +log_must zfs set direct=always $TESTPOOL/$TESTFS + +log_note "Aligned writes (buffered, then all direct)" +check_write $TESTPOOL $tmp_file $rs $count 0 "" 1 $((count - 1)) + +log_note "Aligned overwrites" +check_write $TESTPOOL $tmp_file $rs $count 0 "" 0 $count + +log_note "Sub-recordsize unaligned overwrites" +check_write $TESTPOOL $tmp_file $((rs / 2)) $((2 * count)) 0 "" $((2 * count)) 0 + +log_note "Sub-page size aligned overwrites" +check_write $TESTPOOL $tmp_file 512 $count 0 "" $count 0 +evict_blocks $TESTPOOL $tmp_file $file_size + +log_note "Aligned reads" +check_read $TESTPOOL $tmp_file $rs $count 0 "" 0 $count + +log_note "Sub-recordsize unaligned reads" +check_read $TESTPOOL $tmp_file $((rs / 2)) $((count * 2)) 0 "" 0 $((2 * count)) + +log_note "Sub-page size aligned reads (one read then ARC hits)" +check_read $TESTPOOL $tmp_file 512 $count 0 "" 1 0 + +log_must rm -f $tmp_file + + +# +# Check when "direct=disabled" there are never any direct requests. +# Note that "flag=direct" is always set in the following calls to dd(1). +# +log_must zfs set direct=disabled $TESTPOOL/$TESTFS + +log_note "Aligned writes (all buffered with an extra for create)" +check_write $TESTPOOL $tmp_file $rs $count 0 "-D" $count 0 + +log_note "Aligned overwrites" +check_write $TESTPOOL $tmp_file $rs $count 0 "-D" $count 0 + +log_note "Aligned reads (all ARC hits)" +check_read $TESTPOOL $tmp_file $rs $count 0 "-d" 0 0 + +log_must rm -f $tmp_file + + +# +# Check when "direct=standard" only requested Direct I/O occur. 
+# +log_must zfs set direct=standard $TESTPOOL/$TESTFS + +log_note "Aligned writes/overwrites (buffered / direct)" +check_write $TESTPOOL $tmp_file $rs $count 0 "" $count 0 +check_write $TESTPOOL $tmp_file $rs $count 0 "-D" 0 $count + +log_note "Aligned reads (buffered / direct)" +evict_blocks $TESTPOOL $tmp_file $file_size +check_read $TESTPOOL $tmp_file $rs $count 0 "" $count 0 +evict_blocks $TESTPOOL $tmp_file $file_size +check_read $TESTPOOL $tmp_file $rs $count 0 "-d" 0 $count + +log_pass "Verify the direct=always|disabled|standard property" diff --git a/tests/zfs-tests/tests/functional/direct/dio_random.ksh b/tests/zfs-tests/tests/functional/direct/dio_random.ksh new file mode 100755 index 000000000000..42c18d426121 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_random.ksh @@ -0,0 +1,83 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify mixed Direct I/O and buffered I/O. 
A workload of random +# but correctly aligned direct read/writes is mixed with a +# concurrent workload of entirely unaligned buffered read/writes. +# +# STRATEGY: +# 1. Create an empty file. +# 2. Start a background fio randomly issuing direct read/writes. +# 3. Start a background fio randomly issuing buffered read/writes. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$tmp_file" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify randomly sized mixed Direct I/O and buffered I/O" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +tmp_file=$mntpnt/file +bs=$((1024 * 1024)) +blocks=32 +size=$((bs * blocks)) +runtime=10 +page_size=$(getconf PAGESIZE) + +log_must stride_dd -i /dev/zero -o $tmp_file -b $bs -c $blocks + +# Direct random read/write page-aligned IO of varying sizes with +# occasional calls to fsync(2), mixed with... +log_must eval "fio --filename=$tmp_file --name=direct-rwrand \ + --rw=randrw --size=$size --offset_align=$(getconf PAGESIZE) \ + --bsrange=$page_size-1m --direct=1 --fsync=32 --numjobs=2 \ + --ioengine=sync --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +# Buffered random read/write entirely unaligned IO of varying sizes +# occasional calls to fsync(2). 
+log_must eval "fio --filename=$tmp_file --name=buffered-write \ + --rw=randrw --size=$size --offset_align=512 --bs_unaligned=1 \ + --bsrange=$page_size-1m --direct=0 --fsync=32 --numjobs=2 \ + --ioengine=sync --fallocate=none --verify=sha1 \ + --group_reporting --minimal --runtime=$runtime --time_based &" + +wait + +log_pass "Verfied randomly sized mixed Direct I/O and buffered I/O" diff --git a/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh new file mode 100755 index 000000000000..e1087e5ac3fc --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_recordsize.ksh @@ -0,0 +1,76 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify different recordsizes are supported for Direct I/O. +# +# STRATEGY: +# 1. Create a pool from each vdev type with varying recordsizes. +# 2. Start sequential Direct I/O and verify with buffered I/O. +# 3. 
Start mixed Direct I/O and verify with buffered I/O. +# + +verify_runnable "global" + +log_assert "Verify different recordsizes are supported for Direct I/O." + +log_onexit dio_cleanup + +log_must truncate -s $MINVDEVSIZE $DIO_VDEVS + +for type in "" "mirror" "raidz" "draid"; do + for recsize in "1024" "4096" "128k"; do + create_pool $TESTPOOL1 $type $DIO_VDEVS + log_must eval "zfs create \ + -o recordsize=$recsize -o compression=off \ + $TESTPOOL1/$TESTFS1" + + mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS1) + + for bs in "4k" "128k"; do + for op in "rw" "randrw" "write"; do + dio_and_verify $op $DIO_FILESIZE $bs $mntpnt "sync" + done + done + + if [[ "$type" == "" ]]; then + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "stripe" 0 + else + check_dio_write_chksum_verify_failures $TESTPOOL1 \ + "$type" 0 + fi + + destroy_pool $TESTPOOL1 + done +done + +log_pass "Verified different recordsizes are supported for Direct I/O." diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh new file mode 100755 index 000000000000..9f50187149de --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_block.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify failure for (un)aligned O_DIRECT +# +# STRATEGY: +# 1. Create a multi-block file +# 2. Perform (un)aligned write/read verify the result. +# + +verify_runnable "global" + +function cleanup +{ + zfs set recordsize=$rs $TESTPOOL/$TESTFS + zfs set direct=standard $TESTPOOL/$TESTFS + log_must rm -f $tmp_file + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_onexit cleanup + +log_assert "Verify direct requests for (un)aligned access" + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +tmp_file=$mntpnt/tmp_file +file_size=$((rs * 8)) + +log_must stride_dd -i /dev/urandom -o $tmp_file -b $file_size -c 1 + +log_must zfs set direct=standard $TESTPOOL/$TESTFS +# sub-pagesize direct writes/read will always fail if direct=standard. +log_mustnot stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D +log_mustnot stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d + +log_must zfs set direct=always $TESTPOOL/$TESTFS +# sub-pagesize direct writes/read will always pass if direct=always. +log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 +log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 + +log_must zfs set direct=disabled $TESTPOOL/$TESTFS +# sub-pagesize direct writes/read will always pass if direct=disabled. 
+log_must stride_dd -i /dev/urandom -o $tmp_file -b 512 -c 8 -D +log_must stride_dd -i $tmp_file -o /dev/null -b 512 -c 8 -d + +log_pass "Verify direct requests for (un)aligned access" diff --git a/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh new file mode 100755 index 000000000000..571767d3b1c9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_unaligned_filesize.ksh @@ -0,0 +1,92 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify Direct I/O reads can read an entire file that is not +# page-aligned in length. When a file is not page-aligned in total +# length, as much as can be read using O_DIRECT is done so and +# the rest is read using the ARC. O_DIRECT requires page-size alignment.
# +# STRATEGY: +# 1. Write a file that is page-aligned (buffered) +# 2. Truncate the file to be 512 bytes less +# 3. Export then import the Zpool flushing out the ARC +# 4. 
Read back the file using O_DIRECT +# 5. Verify the file is read back with both Direct I/O and buffered I/O +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$filename" + log_must zfs set recordsize=$rs $TESTPOOL/$TESTFS + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify Direct I/O reads can read an entire file that is not \ + page-aligned" + +log_onexit cleanup + +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +rs=$(get_prop recordsize $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +bs=$((128 * 1024)) # bs=recordsize (128k) +filename="$mntpnt/testfile.iso" + +log_must stride_dd -i /dev/urandom -o $filename -b $bs -c 2 +# Truncating file so the total length is no longer page-size aligned +log_must do_truncate_reduce $filename 512 + +# Exporting the Zpool to make sure all future reads happen from the ARC +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL + +# Reading the file back using Direct I/O +prev_dio_read=$(get_iostats_stat $TESTPOOL direct_read_count) +prev_arc_read=$(get_iostats_stat $TESTPOOL arc_read_count) +log_must stride_dd -i $filename -o /dev/null -b $bs -e -d +curr_dio_read=$(get_iostats_stat $TESTPOOL direct_read_count) +curr_arc_read=$(get_iostats_stat $TESTPOOL arc_read_count) +total_dio_read=$((curr_dio_read - prev_dio_read)) +total_arc_read=$((curr_arc_read - prev_arc_read)) + +# We should see both Direct I/O reads and an ARC read to read the entire file +# that is not page-size aligned +if [[ $total_dio_read -lt 2 ]] || [[ $total_arc_read -lt 1 ]]; then + log_fail "Expect 2 reads from Direct I/O and 1 from the ARC but \ + Direct I/O: $total_dio_read ARC: $total_arc_read" +fi + +log_pass "Verified Direct I/O read can read a non-page-aligned length file" diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh new file mode 100755 index 000000000000..5a5a5cf7ad29 
--- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_write_stable_pages.ksh @@ -0,0 +1,103 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify stable pages work for O_DIRECT writes. +# +# STRATEGY: +# 1. Start a Direct I/O write workload while manipulating the user +# buffer. +# 2. Verify we can Read the contents of the file using buffered reads. +# 3. Verify there is no checksum errors reported from zpool status. +# 4. Repeat steps 1 and 2 for 3 iterations. +# 5. Repeat 1-3 but with compression disabled. +# + +verify_runnable "global" + +function cleanup +{ + log_must rm -f "$mntpnt/direct-write.iso" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 0 +} + +log_assert "Verify stable pages work for Direct I/O writes." 
+ +if is_linux; then + log_unsupported "Linux does not support stable pages for O_DIRECT \ + writes" +fi + +log_onexit cleanup + +ITERATIONS=3 +NUMBLOCKS=300 +BS=$((128 * 1024)) #128k +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS + +for compress in "on" "off"; +do + log_must zfs set compression=$compress $TESTPOOL/$TESTFS + + for i in $(seq 1 $ITERATIONS); do + log_note "Verifying stable pages for Direct I/O writes \ + iteration $i of $ITERATIONS" + + prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + + # Manipulate the user's buffer while running O_DIRECT write + # workload with the buffer. + log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS + + # Reading back the contents of the file + log_must stride_dd -i $mntpnt/direct-write.iso -o /dev/null \ + -b $BS -c $NUMBLOCKS + + curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + + log_note "Making sure we have Direct I/O writes logged" + if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" + fi + + # Making sure there are no data errors for the zpool + log_note "Making sure there are no checksum errors with the ZPool" + log_must check_pool_status $TESTPOOL "errors" \ + "No known data errors" + + log_must rm -f "$mntpnt/direct-write.iso" + done +done + +log_pass "Verified stable pages work for Direct I/O writes." diff --git a/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh new file mode 100755 index 000000000000..a7e9dc0cde7b --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/dio_write_verify.ksh @@ -0,0 +1,222 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/direct/dio.cfg +. $STF_SUITE/tests/functional/direct/dio.kshlib + +# +# DESCRIPTION: +# Verify checksum verify works for Direct I/O writes. +# +# STRATEGY: +# 1. Set the module parameter zfs_vdev_direct_write_verify_pct to 30. +# 2. Check that manipulating the user buffer while Direct I/O writes are +# taking place does not cause any panics with compression turned on. +# 3. Start a Direct I/O write workload while manipulating the user buffer +# without compression. +# 4. Verify there are Direct I/O write verify failures using +# zpool status -d and checking for zevents. We also make sure there +# are reported data errors when reading the file back. +# 5. Repeat steps 3 and 4 for 3 iterations. +# 6. Set zfs_vdev_direct_write_verify_pct to 100 and repeat 3. +# 7. Verify there are Direct I/O write verify failures using +# zpool status -d and checking for zevents. We also make sure +# there are no reported data errors when reading the file back because +# with us checking every Direct I/O write and on checksum validation +# failure those writes will not be committed to a VDEV. 
+# + +verify_runnable "global" + +function cleanup +{ + # Clearing out DIO counts for Zpool + log_must zpool clear $TESTPOOL + # Clearing out dio_verify from event logs + log_must zpool events -c + log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT 2 +} + +log_assert "Verify checksum verify works for Direct I/O writes." + +if is_freebsd; then + log_unsupported "FreeBSD is capable of stable pages for O_DIRECT writes" +fi + +log_onexit cleanup + +ITERATIONS=3 +NUMBLOCKS=300 +VERIFY_PCT=30 +BS=$((128 * 1024)) # 128k +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) + +# Get a list of vdevs in our pool +set -A array $(get_disklist_fullpath $TESTPOOL) + +# Get the first vdev +firstvdev=${array[0]} + +log_must zfs set recordsize=128k $TESTPOOL/$TESTFS +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT + +# First we will verify there are no panics while manipulating the contents of +# the user buffer during Direct I/O writes with compression. The contents +# will always be copied out of the ABD and there should never be any ABD ASSERT +# failures +log_note "Verifying no panics for Direct I/O writes with compression" +log_must zfs set compression=on $TESTPOOL/$TESTFS +prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) +log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" -n $NUMBLOCKS \ + -b $BS +curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) +total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + +log_note "Making sure we have Direct I/O writes logged" +if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" +fi + +log_must rm -f "$mntpnt/direct-write.iso" +# Clearing out DIO counts for Zpool +log_must zpool clear $TESTPOOL +# Clearing out dio_verify from event logs +log_must zpool events -c + + + +# Next we will verify there are checksum errors for Direct I/O writes while +# manipulating the contents of the user pages. 
+log_must zfs set compression=off $TESTPOOL/$TESTFS + +for i in $(seq 1 $ITERATIONS); do + log_note "Verifying 30% of Direct I/O write checksums iteration \ + $i of $ITERATIONS with \ + zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + + prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS + + # Reading file back to verify checksum errors + filesize=$(get_file_size "$mntpnt/direct-write.iso") + num_blocks=$((filesize / BS)) + log_mustnot stride_dd -i "$mntpnt/direct-write.iso" -o /dev/null -b $BS \ + -c $num_blocks + + # Getting new Direct I/O and ARC write counts. + curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + total_arc_wr=$((curr_arc_wr - prev_arc_wr)) + + # Verifying there are checksum errors + log_note "Making sure there are checksum errors for the ZPool" + cksum=$(zpool status -P -v $TESTPOOL | awk -v v="$firstvdev" '$0 ~ v \ + {print $5}') + if [[ $cksum -eq 0 ]]; then + zpool status -P -v $TESTPOOL + log_fail "No checksum failures for ZPool $TESTPOOL" + fi + + # Getting checksum verify failures + verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") + + log_note "Making sure we have Direct I/O writes logged" + if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" + fi + log_note "Making sure we have Direct I/O write checksum verifies with ZPool" + check_dio_write_chksum_verify_failures $TESTPOOL "raidz" 1 + + # In the event of checksum verify error, the write will be redirected + # through the ARC. We check here that we have ARC writes. 
+ log_note "Making sure ARC writes have taken place in the event \ + Direct I/O checksum verify failures occurred" + if [[ $total_arc_wr -lt $verify_failures ]]; then + log_fail "ARC writes $total_arc_wr < $verify_failures" + fi + + log_must rm -f "$mntpnt/direct-write.iso" +done + +log_must zpool status -v $TESTPOOL +log_must zpool sync $TESTPOOL + +# Finally we will verify that with checking every Direct I/O write we have no +# errors at all. +VERIFY_PCT=100 +log_must set_tunable32 VDEV_DIRECT_WR_VERIFY_PCT $VERIFY_PCT + +for i in $(seq 1 $ITERATIONS); do + log_note "Verifying every Direct I/O write checksums iteration $i of \ + $ITERATIONS with zfs_vdev_direct_write_verify_pct=$VERIFY_PCT" + + prev_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + prev_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + log_must manipulate_user_buffer -o "$mntpnt/direct-write.iso" \ + -n $NUMBLOCKS -b $BS + + # Reading file back to verify there are no checksum errors + filesize=$(get_file_size "$mntpnt/direct-write.iso") + num_blocks=$((filesize / BS)) + log_must stride_dd -i "$mntpnt/direct-write.iso" -o /dev/null -b $BS \ + -c $num_blocks + + # Getting new Direct I/O and ARC Write counts. 
+ curr_dio_wr=$(get_iostats_stat $TESTPOOL direct_write_count) + curr_arc_wr=$(get_iostats_stat $TESTPOOL arc_write_count) + total_dio_wr=$((curr_dio_wr - prev_dio_wr)) + total_arc_wr=$((curr_arc_wr - prev_arc_wr)) + + log_note "Making sure there are no checksum errors with the ZPool" + log_must check_pool_status $TESTPOOL "errors" "No known data errors" + + # Getting checksum verify failures + verify_failures=$(get_zpool_status_chksum_verify_failures $TESTPOOL "raidz") + + log_note "Making sure we have Direct I/O writes logged" + if [[ $total_dio_wr -lt 1 ]]; then + log_fail "No Direct I/O writes $total_dio_wr" + fi + + log_note "Making sure we have Direct I/O write checksum verifies with ZPool" + check_dio_write_chksum_verify_failures "$TESTPOOL" "raidz" 1 + + # In the event of checksum verify error, the write will be redirected + # through the ARC. We check here that we have ARC writes. + log_note "Making sure ARC writes have taken place in the event \ + Direct I/O checksum verify failures occurred" + if [[ $total_arc_wr -lt $verify_failures ]]; then + log_fail "ARC writes $total_arc_wr < $verify_failures" + fi + + log_must rm -f "$mntpnt/direct-write.iso" +done + +log_pass "Verified checksum verify works for Direct I/O writes." diff --git a/tests/zfs-tests/tests/functional/direct/setup.ksh b/tests/zfs-tests/tests/functional/direct/setup.ksh new file mode 100755 index 000000000000..5ce95dddf401 --- /dev/null +++ b/tests/zfs-tests/tests/functional/direct/setup.ksh @@ -0,0 +1,32 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +verify_runnable "global" + +default_raidz_setup_noexit "$DISKS" +log_must zfs set compression=off $TESTPOOL/$TESTFS +log_pass diff --git a/tests/zfs-tests/tests/functional/io/setup.ksh b/tests/zfs-tests/tests/functional/io/setup.ksh index 82aaf5bc91b5..29d267115891 100755 --- a/tests/zfs-tests/tests/functional/io/setup.ksh +++ b/tests/zfs-tests/tests/functional/io/setup.ksh @@ -27,5 +27,5 @@ . $STF_SUITE/include/libtest.shlib verify_runnable "global" -default_setup "$DISKS" +default_raidz_setup "$DISKS" log_must zfs set compression=on $TESTPOOL/$TESTFS diff --git a/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg b/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg index 0302392f4c7f..f79123e5b2e1 100644 --- a/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg +++ b/tests/zfs-tests/tests/functional/l2arc/l2arc.cfg @@ -35,4 +35,4 @@ export PERF_COMPPERCENT=66 export PERF_COMPCHUNK=0 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib index 26e7c2cc25bc..80badd27331a 100644 --- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib +++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib @@ -155,13 +155,6 @@ function cleanup_pool fi } -function cmp_md5s { - typeset file1=$1 - typeset file2=$2 - - [ "$(md5digest $file1)" = "$(md5digest $file2)" ] -} - # # Detect if the given two filesystems have same sub-datasets # diff --git 
a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh index 8f3585a5997f..deb963f25894 100755 --- a/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh +++ b/tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh @@ -77,6 +77,14 @@ log_must zfs create $TESTPOOL/$TESTFS log_must dd if=/dev/zero of=/$TESTPOOL/$TESTFS/sync \ conv=fdatasync,fsync bs=1 count=1 +# +# Create a small file for the O_DIRECT test before freezing the pool. This +# allows us to overwrite it after the pool is frozen and avoid the case +# where O_DIRECT is disabled because the first block must be grown. +# +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/direct \ + oflag=sync,direct bs=4k count=1 + # # 2. Freeze TESTFS # @@ -140,6 +148,10 @@ log_must truncate -s 0 /$TESTPOOL/$TESTFS/truncated_file log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/large \ oflag=sync bs=128k count=64 +# TX_WRITE (O_DIRECT) +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/direct \ + oflag=sync,direct bs=4k count=1 + # Write zeros, which compress to holes, in the middle of a file log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/holes.1 \ oflag=sync bs=128k count=8 diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh index a93d0b3cc803..62563e0dd4cb 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -77,7 +77,7 @@ export PERF_COMPCHUNK=0 export RUNTIME=30 export BLOCKSIZE=128K export SYNC_TYPE=0 -export DIRECT=1 +export DIRECT=0 # Write to the pool. log_must fio $FIO_SCRIPTS/mkfiles.fio