diff --git a/Makefile.am b/Makefile.am index 11e45dae8255..919f20c13ca5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -57,6 +57,8 @@ dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2 dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash.descrip +dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.zia +dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.zia.descrip @CODE_COVERAGE_RULES@ diff --git a/config/Rules.am b/config/Rules.am index 30c5f353cd23..0a73eb4bf337 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -43,6 +43,7 @@ AM_CPPFLAGS += -DPKGDATADIR=\"$(pkgdatadir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-@ac_system_l@-user\" +AM_CPPFLAGS += $(ZIA_CPPFLAGS) if ASAN_ENABLED AM_CPPFLAGS += -DZFS_ASAN_ENABLED diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 368684e1c512..e122dfe85e57 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -263,6 +263,8 @@ AC_DEFUN([ZFS_AC_CONFIG], [ AC_SUBST(TEST_JOBS) ]) + ZFS_AC_ZIA + ZFS_INIT_SYSV= ZFS_INIT_SYSTEMD= ZFS_WANT_MODULES_LOAD_D= @@ -294,7 +296,8 @@ AC_DEFUN([ZFS_AC_CONFIG], [ [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) - AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) + AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes ]) + AM_CONDITIONAL([ZIA_ENABLED], [test "x$enable_zia" = xyes ]) ]) dnl # @@ -342,6 +345,10 @@ AC_DEFUN([ZFS_AC_RPM], [ RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "__strip /bin/true"' ]) + AS_IF([test "x$enable_zia" = xyes], [ + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(WITH_ZIA) 1" --define "DPUSM_ROOT $(DPUSM_ROOT)"' + ]) + RPM_DEFINE_UTIL=' --define "_initconfdir $(initconfdir)"' dnl # Make the next three RPM_DEFINE_UTIL additions conditional, since diff --git a/config/zia.m4 b/config/zia.m4 new file mode 100644 index 000000000000..eca0bd4a82e6 --- /dev/null +++ b/config/zia.m4 @@ -0,0 +1,45 @@ +dnl # Adds --with-zia=PATH to configuration options +dnl # The path provided should point to the DPUSM +dnl # root and contain Module.symvers. +AC_DEFUN([ZFS_AC_ZIA], [ + AC_ARG_WITH([zia], + AS_HELP_STRING([--with-zia=PATH], + [Path to Data Processing Services Module]), + [ + DPUSM_ROOT="$withval" + AS_IF([test "x$DPUSM_ROOT" != "xno"], + [enable_zia=yes], + [enable_zia=no]) + ], + [enable_zia=no] + ) + + AS_IF([test "x$enable_zia" == "xyes"], + AS_IF([! 
test -d "$DPUSM_ROOT"], + [AC_MSG_ERROR([--with-zia=PATH requires the DPUSM root directory])] + ) + + DPUSM_SYMBOLS="$DPUSM_ROOT/Module.symvers" + + AS_IF([test -r $DPUSM_SYMBOLS], + [ + AC_MSG_RESULT([$DPUSM_SYMBOLS]) + ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include" + KERNEL_ZIA_CPPFLAGS="-DZIA=1 -I$DPUSM_ROOT/include" + WITH_ZIA="_with_zia" + + AC_SUBST(WITH_ZIA) + AC_SUBST(KERNEL_ZIA_CPPFLAGS) + AC_SUBST(ZIA_CPPFLAGS) + AC_SUBST(DPUSM_SYMBOLS) + AC_SUBST(DPUSM_ROOT) + ], + [ + AC_MSG_ERROR([ + *** Failed to find Module.symvers in: + $DPUSM_SYMBOLS + ]) + ] + ) + ) +]) diff --git a/include/Makefile.am b/include/Makefile.am index fa725c2e7a5f..3cfbf2f6c99e 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -142,6 +142,9 @@ COMMON_H = \ sys/zfs_vfsops.h \ sys/zfs_vnops.h \ sys/zfs_znode.h \ + sys/zia.h \ + sys/zia_cddl.h \ + sys/zia_private.h sys/zil.h \ sys/zil_impl.h \ sys/zio.h \ diff --git a/include/sys/abd.h b/include/sys/abd.h index 19fe96292d5f..2faf0a879c28 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -75,6 +75,7 @@ typedef struct abd { list_t abd_gang_chain; } abd_gang; } abd_u; + void *abd_zia_handle; } abd_t; typedef int abd_iter_func_t(void *buf, size_t len, void *priv); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index e191420f2d2d..bc9a9165d9c4 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -258,6 +258,18 @@ typedef enum { ZPOOL_PROP_BCLONEUSED, ZPOOL_PROP_BCLONESAVED, ZPOOL_PROP_BCLONERATIO, + ZPOOL_PROP_ZIA_PROVIDER, + ZPOOL_PROP_ZIA_COMPRESS, + ZPOOL_PROP_ZIA_DECOMPRESS, + ZPOOL_PROP_ZIA_CHECKSUM, + ZPOOL_PROP_ZIA_RAIDZ1_GEN, + ZPOOL_PROP_ZIA_RAIDZ2_GEN, + ZPOOL_PROP_ZIA_RAIDZ3_GEN, + ZPOOL_PROP_ZIA_RAIDZ1_REC, + ZPOOL_PROP_ZIA_RAIDZ2_REC, + ZPOOL_PROP_ZIA_RAIDZ3_REC, + ZPOOL_PROP_ZIA_FILE_WRITE, + ZPOOL_PROP_ZIA_DISK_WRITE, ZPOOL_NUM_PROPS } zpool_prop_t; diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 5605a35b8641..e894a63f4bde 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -52,6 +52,9 @@ #include #include #include +#ifdef __linux__ +#include +#endif #include #ifdef __cplusplus @@ -474,6 +477,10 @@ struct spa { */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ zfs_refcount_t spa_refcount; /* number of opens */ + +#ifdef __linux__ + zia_props_t spa_zia_props; +#endif }; extern char *spa_config_path; diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h index 02c583777ebc..9877be498c74 100644 --- a/include/sys/vdev_disk.h +++ b/include/sys/vdev_disk.h @@ -40,7 +40,15 @@ #define NEW_START_BLOCK 2048 #define PARTITION_END_ALIGNMENT 2048 +#ifdef __linux__ #ifdef _KERNEL #include + +int __vdev_classic_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags); +int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio); +void vdev_disk_error(zio_t *zio); + #endif /* _KERNEL */ +#endif /* __linux__ */ #endif /* _SYS_VDEV_DISK_H */ diff --git a/include/sys/vdev_file.h b/include/sys/vdev_file.h index fddecbfe1ab5..6c6c79511809 100644 --- a/include/sys/vdev_file.h +++ b/include/sys/vdev_file.h @@ -40,6 +40,10 @@ typedef struct vdev_file { extern void vdev_file_init(void); extern void vdev_file_fini(void); +#ifdef __linux__ +extern mode_t vdev_file_open_mode(spa_mode_t spa_mode); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 57ff31e89eb9..6bd015158815 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -462,6 +462,8 @@ struct vdev { 
uint64_t vdev_io_t; uint64_t vdev_slow_io_n; uint64_t vdev_slow_io_t; + + void *vdev_zia_handle; }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index a34bc00ca4df..50c7e50b8c36 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -169,6 +169,11 @@ extern int vdev_raidz_load(vdev_t *); #define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1 6 #define RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2 7 +void vdev_raidz_generate_parity_p(struct raidz_row *); +void vdev_raidz_generate_parity_pq(struct raidz_row *); +void vdev_raidz_generate_parity_pqr(struct raidz_row *); +void vdev_raidz_reconstruct_general(struct raidz_row *, int *, int); + #ifdef __cplusplus } #endif diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h index 45cb5864a22b..f081f7809c47 100644 --- a/include/sys/vdev_raidz_impl.h +++ b/include/sys/vdev_raidz_impl.h @@ -136,6 +136,7 @@ typedef struct raidz_row { uint64_t rr_offset; /* Logical offset for *_io_verify() */ uint64_t rr_size; /* Physical size for *_io_verify() */ #endif + void *rr_zia_handle; raidz_col_t rr_col[]; /* Flexible array of I/O columns */ } raidz_row_t; diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h index 2959aa9b2ca4..33d06db62ec0 100644 --- a/include/sys/zap_impl.h +++ b/include/sys/zap_impl.h @@ -61,7 +61,7 @@ typedef struct mzap_phys { uint64_t mz_salt; uint64_t mz_normflags; uint64_t mz_pad[5]; - mzap_ent_phys_t mz_chunk[1]; + mzap_ent_phys_t mz_chunk[]; /* actually variable size depending on block size */ } mzap_phys_t; diff --git a/include/sys/zia.h b/include/sys/zia.h new file mode 100644 index 000000000000..8880c12de269 --- /dev/null +++ b/include/sys/zia.h @@ -0,0 +1,227 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _ZIA_H +#define _ZIA_H + +#include +#include /* VDEV_RAIDZ_MAXPARITY */ +#include +#include +#include +#include +#include + +/* ******************************************************** */ +/* return values */ +#define ZIA_OK 1000 + +/* something bad happened not related to missing functionality */ +#define ZIA_ERROR 1001 + +/* error, fallback to zfs implementation */ +#define ZIA_FALLBACK 1002 + +/* ran, but result is bad */ +#define ZIA_BAD_RESULT 1003 + +/* expected provider and actual provider do not match */ +#define ZIA_PROVIDER_MISMATCH 1004 + +/* + * error, returned when the provider can no longer + * communicate with the accelerator (providers are + * software, and are not expected to randomly go + * down) + */ +#define ZIA_ACCELERATOR_DOWN 1005 +/* ******************************************************** */ + +/* DPUSM was not found by configure */ +#define ZIA_DISABLED 1006 + +/* + * This struct is normally set with + * zpool set zia_=on/off/ + * and passed around in spa_t. 
+ */ +typedef struct zia_props { + /* global state */ + boolean_t can_offload; + void *provider; + + /* minimum size allowed to offload - set by ashift */ + size_t min_offload_size; + + int compress; + int decompress; + + int checksum; + + struct { + int gen[VDEV_RAIDZ_MAXPARITY + 1]; + int rec[VDEV_RAIDZ_MAXPARITY + 1]; + } raidz; + + int file_write; + int disk_write; +} zia_props_t; + +zia_props_t *zia_get_props(spa_t *spa); +void zia_prop_warn(boolean_t val, const char *name); + +int zia_init(void); +int zia_fini(void); + +void *zia_get_provider(const char *name, vdev_t *vdev); +const char *zia_get_provider_name(void *provider); +int zia_put_provider(void **provider, vdev_t *vdev); + +/* + * turn off offloading for this zio as well as + * all new zios created with the same spa + */ +int zia_disable_offloading(zio_t *zio, boolean_t reexecute); + +/* check if offloading can occur */ +boolean_t zia_is_used(zio_t *zio); + +/* + * check if a handle is associated with this pointer + * + * not exposing functions for different handles because + * only abd handles are checked outside of zia.c + */ +boolean_t zia_is_offloaded(abd_t *abd); + +int zia_worst_error(const int lhs, const int rhs); + +/* create a new offloader handle without copying data */ +void *zia_alloc(void *provider, size_t size, size_t min_offload_size); + +/* deallocate handle without onloading */ +int zia_free(void **handle); + +/* move linear data between from the offloader to memory */ +int zia_onload(void **handle, void *buf, size_t size); + +/* calls abd_iterate_func on the abd to copy abd data back and forth */ +int zia_offload_abd(void *provider, abd_t *abd, + size_t size, size_t min_offload_size, + boolean_t *local_offload, boolean_t lock); +int zia_onload_abd(abd_t *abd, size_t size, + boolean_t keep_handle); +/* move a handle into an abd */ +void zia_move_into_abd(abd_t *dst, void **src); +int zia_free_abd(abd_t *abd, boolean_t lock); + +/* + * if offloaded locally, just free the handle + * if not, onload the data and free the handle + */ +int zia_cleanup_abd(abd_t *abd, size_t size, + boolean_t local_offload, boolean_t lock); + +/* if the accelerator failed, restart the zio */ +void zia_restart_before_vdev(zio_t *zio); + +/* fill a buffer with zeros */ +int zia_zero_fill(abd_t *abd, size_t offset, size_t size); + +int +zia_compress(zia_props_t *props, enum zio_compress c, + abd_t *src, size_t s_len, + void **cbuf_handle, uint64_t *c_len, + uint8_t level, boolean_t *local_offload); + +int +zia_decompress(zia_props_t *props, enum zio_compress c, + abd_t *src, size_t s_len, abd_t *dst, size_t d_len, + uint8_t *level); + +int zia_checksum_compute(void *provider, zio_cksum_t *dst, + enum zio_checksum alg, zio_t *zio, uint64_t size, + boolean_t *local_offload); +int zia_checksum_error(enum zio_checksum alg, abd_t *abd, + uint64_t size, int byteswap, zio_cksum_t *actual_cksum); + +/* raidz */ +int zia_raidz_alloc(zio_t *zio, raidz_row_t *rr, boolean_t rec, + uint_t cksum, boolean_t *local_offload); +int zia_raidz_free(raidz_row_t *rr, boolean_t onload_parity); +int zia_raidz_gen(raidz_row_t *rr); +int zia_raidz_gen_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload); +int zia_raidz_new_parity(zio_t *zio, raidz_row_t *rr, uint64_t c); +/* compare the contents of offloaded abds (only used in resilver) */ +int zia_raidz_cmp(abd_t *lhs, abd_t *rhs, int *diff); +int zia_raidz_rec(raidz_row_t *rr, int *t, int nt); +int zia_raidz_rec_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, boolean_t 
onload_parity); + +/* file I/O */ +int zia_file_open(vdev_t *vdev, const char *path, + int flags, int mode); +int zia_file_write(vdev_t *vdev, abd_t *abd, ssize_t size, + loff_t offset, ssize_t *resid, int *err); +int zia_file_close(vdev_t *vdev); + +#ifdef __linux__ +#ifdef _KERNEL +#include + +/* disk I/O */ +int zia_disk_open(vdev_t *vdev, const char *path, + struct block_device *bdev); +int zia_disk_invalidate(vdev_t *vdev); +int zia_disk_write(vdev_t *vdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int flags); +int zia_disk_flush(vdev_t *vdev, zio_t *zio); +int zia_disk_close(vdev_t *vdev); +#endif +#endif + +#endif diff --git a/include/sys/zia_cddl.h b/include/sys/zia_cddl.h new file mode 100644 index 000000000000..5d4136640a56 --- /dev/null +++ b/include/sys/zia_cddl.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _ZIA_CDDL_H +#define _ZIA_CDDL_H + +#include +#include +#include + +#ifdef ZIA +#include +int +zia_compress_impl(const dpusm_uf_t *dpusm, zia_props_t *props, + enum zio_compress c, abd_t *src, size_t s_len, + void **cbuf_handle, uint64_t *c_len, + uint8_t level, boolean_t *local_offload); + +int +zia_raidz_rec_impl(const dpusm_uf_t *dpusm, + raidz_row_t *rr, int *t, int nt); + +#ifdef _KERNEL +void +zia_disk_write_completion(void *zio_ptr, int error); + +void +zia_disk_flush_completion(void *zio_ptr, int error); +#endif /* _KERNEL */ + +#endif /* ZIA */ + +#endif /* _ZIA_CDDL_H */ diff --git a/include/sys/zia_private.h b/include/sys/zia_private.h new file mode 100644 index 000000000000..b173c5c29d0c --- /dev/null +++ b/include/sys/zia_private.h @@ -0,0 +1,75 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _ZIA_PRIVATE_H +#define _ZIA_PRIVATE_H + +/* + * needed by both zia.h and zia_cddl.h + */ + +#include +#include +#include + +#define ABD_HANDLE(abd) (abd)->abd_zia_handle + +#define VDEV_HANDLE(vdev) (vdev)->vdev_zia_handle + +int +dpusm_to_ret(const int dpusm_ret); + +#ifdef ZIA +#include + +dpusm_compress_t +compress_to_dpusm(enum zio_compress c); + +int zia_get_capabilities(void *provider, dpusm_pc_t **caps); + +#endif /* ZIA */ + +#endif /* _ZIA_PRIVATE_H */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 77c70b9b481c..b8cd60515065 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -223,6 +223,8 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_REEXECUTED (1ULL << 29) #define ZIO_FLAG_DELEGATED (1ULL << 30) +#define ZIO_FLAG_ZIA_REEXECUTE (1ULL << 32) + #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) @@ -528,6 +530,8 @@ struct zio { /* Taskq dispatching state */ taskq_ent_t io_tqent; + + boolean_t io_can_offload; }; enum blk_verify_flag { @@ -618,6 +622,7 @@ extern void zio_data_buf_free(void *buf, size_t size); extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); +extern zio_transform_t *zio_pop_transform(zio_t *zio); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h index 691d7b624488..7c12b139f243 100644 --- a/include/sys/zio_compress.h +++ b/include/sys/zio_compress.h @@ -154,6 +154,8 @@ typedef const struct zio_compress_info { extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; +extern int zio_compress_zeroed_cb(void *data, size_t len, void *private); + /* * lz4 compression init & free */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 42f3404db5a9..031bb7691ccd 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -180,6 +180,8 @@ nodist_libzpool_la_SOURCES = \ module/zfs/zfs_rlock.c \ module/zfs/zfs_sa.c \ module/zfs/zil.c \ + module/zfs/zia.c \ + module/zfs/zia_cddl.c \ module/zfs/zio.c \ 
module/zfs/zio_checksum.c \ module/zfs/zio_compress.c \ diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 5428ab8d3076..a219d4018f3e 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -437,6 +437,42 @@ command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. +.It Sy zia_checksum Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload checksum computations. +Does not have any effect if the checksum stage is disabled. +Embedded checksums are onloaded, and will suffer a data movement penalty. +.It Sy zia_compress Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload compression. +Does not have any effect if the compression stage is disabled. +Embedded data is onloaded, and will suffer a data movement penalty. +.It Sy zia_decompress Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload decompression. +.It Sy zia_disk_write Ns = Ns Sy on Ns | Ns Sy off +Controls whether a pool should offload write I/Os to disks. +.It Sy zia_file_write Ns = Ns Sy on Ns | Ns Sy off +Controls whether a pool should offload write I/Os to files. +.It Sy zia_provider Ns = Ns Sy (unset) | Ns Sy Z.I.A. Provider Name +Selects an accelerator registered in the Data Processing Unit Services +Module to offload data to. +Only one accelerator can be used by a pool at a time. +.It Sy zia_raidz1_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ1 parity generation. +Does not have any effect if RAIDZ1 is disabled. +.It Sy zia_raidz1_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ1 reconstruction. +Does not have any effect if RAIDZ1 is disabled. +.It Sy zia_raidz2_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ2 parity generation. +Does not have any effect if RAIDZ2 is disabled. +.It Sy zia_raidz2_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ2 reconstruction. +Does not have any effect if RAIDZ2 is disabled. +.It Sy zia_raidz3_gen Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ3 parity generation. +Does not have any effect if RAIDZ3 is disabled. +.It Sy zia_raidz3_rec Ns = Ns Sy on Ns | Ns Sy off +Controls whether the pool should offload RAIDZ3 reconstruction. +Does not have any effect if RAIDZ3 is disabled. .El . .Ss User Properties diff --git a/module/Kbuild.in b/module/Kbuild.in index 9e44364b7584..30643733df80 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -27,6 +27,7 @@ ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ +ZFS_MODULE_CPPFLAGS += @KERNEL_ZIA_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our build. 
@@ -422,6 +423,8 @@ ZFS_OBJS := \ zfs_sa.o \ zfs_vnops.o \ zil.o \ + zia.o \ + zia_cddl.o \ zio.o \ zio_checksum.o \ zio_compress.o \ @@ -499,3 +502,19 @@ OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif + +ifneq ("@DPUSM_SYMBOLS@","") +obj-$(CONFIG_ZFS) += zia-software-provider.o + +ZIA_SOFTWARE_PROVIDER_OBJS := \ + provider.o \ + kernel_offloader.o + +zia-software-provider-objs += $(addprefix zia-software-provider/,$(ZIA_SOFTWARE_PROVIDER_OBJS)) +# zfs_file_os does not have any dependencies, so just link to it directly +zia-software-provider-objs += os/linux/zfs/zfs_file_os.o + +$(addprefix $(obj)/zia-software-provider/,$(ZIA_SOFTWARE_PROVIDER_OBJS)) : ccflags-y += -I@abs_top_builddir@ $(ZFS_MODULE_CFLAGS) -I@abs_srcdir@/zia-software-provider/ -I@DPUSM_ROOT@/include + +@ZIA_ENABLED_TRUE@KBUILD_EXTRA_SYMBOLS += @DPUSM_SYMBOLS@ +endif \ No newline at end of file diff --git a/module/Makefile.in b/module/Makefile.in index 9b34b3dfaec7..c780c491de39 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -80,7 +80,7 @@ clean: clean-@ac_system@ .PHONY: modules_uninstall-Linux-legacy modules_uninstall-Linux-legacy: - $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/) + $(RM) -r $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,spl/ avl/ icp/ lua/ nvpair/ unicode/ zcommon/ zfs/ zstd/ zia-software-provider) KMODDIR := $(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ modules_install-Linux: modules_uninstall-Linux-legacy @@ -123,7 +123,7 @@ data_install: data_install-@ac_system@ modules_uninstall-Linux: modules_uninstall-Linux-legacy @# Uninstall the kernel modules - $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko) + $(RM) $(addprefix $(KMODDIR)/$(INSTALL_MOD_DIR)/,zfs.ko spl.ko zia-software-provider) modules_uninstall-FreeBSD: @false @@ -153,7 +153,7 @@ cppcheck-Linux: -I @top_srcdir@/include/os/linux/spl \ -I @top_srcdir@/include/os/linux/zfs \ -I @top_srcdir@/include \ - avl icp lua nvpair unicode zcommon zfs zstd os/linux + avl icp lua nvpair unicode zcommon zfs zstd os/linux zia-software-provider cppcheck-FreeBSD: @true diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 7284b922b3bf..4d8bd3f0b583 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -220,7 +221,7 @@ bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) return (psize); } -static void +void vdev_disk_error(zio_t *zio) { /* @@ -333,6 +334,7 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } + zia_disk_close(v); vdev_blkdev_put(bdh, smode, zfs_vdev_holder); } @@ -457,6 +459,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; + zia_get_props(v->vdev_spa)->min_offload_size = 2 << *physical_ashift; + + /* open disk; ignore errors - will fall back to ZFS */ + zia_disk_open(v, v->vdev_path, vd->vd_bdh); + return (0); } @@ -468,9 +475,11 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdh != NULL) + if (vd->vd_bdh != NULL) { + zia_disk_close(v); vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); + } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); @@ -1100,17 +1109,10 @@ vdev_classic_bio_max_segs(zio_t 
*zio, int bio_size, uint64_t abd_offset) #endif } -static int -vdev_classic_physio(zio_t *zio) +int +__vdev_classic_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; - struct block_device *bdev = BDH_BDEV(vd->vd_bdh); - size_t io_size = zio->io_size; - uint64_t io_offset = zio->io_offset; - int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; - int flags = 0; - dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; @@ -1217,6 +1219,23 @@ vdev_classic_physio(zio_t *zio) return (error); } +EXPORT_SYMBOL(__vdev_classic_physio); + +static int +vdev_classic_physio(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + size_t io_size = zio->io_size; + uint64_t io_offset = zio->io_offset; + int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE; + int flags = 0; + + return __vdev_classic_physio(bdev, zio, + io_size, io_offset, rw, flags); +} + /* ========== */ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) @@ -1238,7 +1257,7 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) zio_interrupt(zio); } -static int +int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; @@ -1261,6 +1280,8 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +EXPORT_SYMBOL(vdev_disk_io_flush); + BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) { zio_t *zio = bio->bi_private; @@ -1419,6 +1440,17 @@ vdev_disk_io_start(zio_t *zio) * Issue the flush. If successful, the response will * be handled in the completion callback, so we're done. */ + error = zia_disk_flush(v, zio); + + /* + * have to return here in order to not dispatch + * this zio to multiple task queues + */ + if (error == 0) { + rw_exit(&vd->vd_lock); + return; + } + error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); if (error == 0) { rw_exit(&vd->vd_lock); @@ -1442,8 +1474,46 @@ vdev_disk_io_start(zio_t *zio) return; case ZIO_TYPE_READ: + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = vdev_disk_io_rw_fn(zio); + rw_exit(&vd->vd_lock); + if (error) { + zio->io_error = error; + zio_interrupt(zio); + } + return; + case ZIO_TYPE_WRITE: zio->io_target_timestamp = zio_handle_io_delay(zio); + error = EIO; + + boolean_t local_offload = B_FALSE; + zia_props_t *zia_props = zia_get_props(zio->io_spa); + if ((zia_props->disk_write == 1) && + (zio->io_can_offload == B_TRUE)) { + if (zia_offload_abd(zia_props->provider, zio->io_abd, + zio->io_size, zia_props->min_offload_size, + &local_offload, B_TRUE) == ZIA_OK) { + error = zia_disk_write(v, zio, zio->io_size, + zio->io_offset, 0); + } + } + + if (error == 0) { + rw_exit(&vd->vd_lock); + return; + } + + error = zia_cleanup_abd(zio->io_abd, zio->io_size, + local_offload, B_TRUE); + + if (error == ZIA_ACCELERATOR_DOWN) { + zia_disable_offloading(zio, B_TRUE); + rw_exit(&vd->vd_lock); + zio_interrupt(zio); + return; + } + error = vdev_disk_io_rw_fn(zio); rw_exit(&vd->vd_lock); if (error) { @@ -1484,6 +1554,7 @@ vdev_disk_io_done(zio_t *zio) vdev_disk_t *vd = v->vdev_tsd; if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { + zia_disk_invalidate(v); invalidate_bdev(BDH_BDEV(vd->vd_bdh)); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c index ac41a2615f16..fcb3d7af9461 100644 --- a/module/os/linux/zfs/vdev_file.c +++ 
b/module/os/linux/zfs/vdev_file.c @@ -36,6 +36,7 @@ #include #include #include +#include #ifdef _KERNEL #include #endif @@ -68,7 +69,7 @@ vdev_file_rele(vdev_t *vd) ASSERT(vd->vdev_path != NULL); } -static mode_t +mode_t vdev_file_open_mode(spa_mode_t spa_mode) { mode_t mode = 0; @@ -161,6 +162,12 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, } #endif + zia_get_props(vd->vdev_spa)->min_offload_size = 2 << *physical_ashift; + + /* try to open the file; ignore errors - will fall back to ZFS */ + zia_file_open(vd, vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0); + skip_open: error = zfs_file_getattr(vf->vf_file, &zfa); @@ -184,6 +191,8 @@ vdev_file_close(vdev_t *vd) if (vd->vdev_reopening || vf == NULL) return; + zia_file_close(vd); + if (vf->vf_file != NULL) { (void) zfs_file_close(vf->vf_file); } @@ -203,18 +212,53 @@ vdev_file_io_strategy(void *arg) void *buf; loff_t off; ssize_t size; - int err; + int err = 0; off = zio->io_offset; size = zio->io_size; resid = 0; if (zio->io_type == ZIO_TYPE_READ) { - buf = abd_borrow_buf(zio->io_abd, zio->io_size); + buf = abd_borrow_buf(zio->io_abd, size); err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); abd_return_buf_copy(zio->io_abd, buf, size); } else { - buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + err = EIO; + + boolean_t local_offload = B_FALSE; + zia_props_t *zia_props = zia_get_props(zio->io_spa); + + if ((zia_props->file_write == 1) && + (zio->io_can_offload == B_TRUE)) { + if (zia_offload_abd(zia_props->provider, zio->io_abd, + size, zia_props->min_offload_size, + &local_offload, B_TRUE) == ZIA_OK) { + err = zia_file_write(vd, zio->io_abd, size, off, + &resid, &err); + } + } + + /* if offload and write succeeded, return here */ + if (err == 0) { + zio->io_error = err; + if (resid != 0 && zio->io_error == 0) + zio->io_error = SET_ERROR(ENOSPC); + + zio_delay_interrupt(zio); + return; + } + + /* if offload or write failed, bring data back into memory */ + err = zia_cleanup_abd(zio->io_abd, size, local_offload, B_TRUE); + + /* if onload failed, restart the zio with offloading disabled */ + if (err == ZIA_ACCELERATOR_DOWN) { + zia_disable_offloading(zio, B_TRUE); + zio_delay_interrupt(zio); + return; + } + + buf = abd_borrow_buf_copy(zio->io_abd, size); err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); abd_return_buf(zio->io_abd, buf, size); } diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e2e3bf5be69e..8517b3644d00 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -178,6 +178,43 @@ zpool_prop_init(void) PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO", B_FALSE, sfeatures); + zprop_register_string(ZPOOL_PROP_ZIA_PROVIDER, "zia_provider", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, "", "PROVIDER", + sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_COMPRESS, "zia_compress", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_compress", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_DECOMPRESS, "zia_decompress", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_decompress", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_CHECKSUM, + "zia_checksum", 1, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "zia_checksum", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ1_GEN, "zia_raidz1_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz1_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ2_GEN, "zia_raidz2_gen", + 1, PROP_DEFAULT, 
ZFS_TYPE_POOL, "on | off", + "zia_raidz2_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ3_GEN, "zia_raidz3_gen", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz3_gen", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ1_REC, "zia_raidz1_rec", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz1_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ2_REC, "zia_raidz2_rec", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz2_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_RAIDZ3_REC, "zia_raidz3_rec", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_raidz3_rec", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_FILE_WRITE, "zia_file_write", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_file_write", boolean_table, sfeatures); + zprop_register_index(ZPOOL_PROP_ZIA_DISK_WRITE, "zia_disk_write", + 1, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", + "zia_disk_write", boolean_table, sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/THIRDPARTYLICENSE.zia b/module/zfs/THIRDPARTYLICENSE.zia new file mode 100644 index 000000000000..9f81923f051d --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.zia @@ -0,0 +1,42 @@ +© 2021. Triad National Security, LLC. All rights reserved. + +This program was produced under U.S. Government contract +89233218CNA000001 for Los Alamos National Laboratory (LANL), which +is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All +rights in the program are reserved by Triad National Security, LLC, +and the U.S. Department of Energy/National Nuclear Security +Administration. The Government is granted for itself and others +acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +license in this material to reproduce, prepare derivative works, +distribute copies to the public, perform publicly and display +publicly, and to permit others to do so. + +---- + +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from this +software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
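
For reference, below is a minimal standalone C sketch (not part of the patch) of the offload gating that the write paths earlier in this diff (vdev_disk_io_start, vdev_file_io_strategy) perform: an offload is attempted only when a provider is bound, the matching zia_* pool property is on, and — per the min_offload_size field of zia_props_t, which the vdev open paths set from the ashift — the buffer is large enough. The struct and helper here are simplified, hypothetical stand-ins, not the real zia_props_t or any function added by this patch.

    /*
     * Illustrative sketch only -- not part of the patch.  Simplified
     * stand-ins for zia_props_t and the offload decision made by the
     * Z.I.A. write paths (assumption: size is compared against
     * min_offload_size, per the field's comment in sys/zia.h).
     */
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct zia_props_sketch {            /* simplified zia_props_t */
            bool   can_offload;          /* provider bound and usable */
            int    disk_write;           /* zpool zia_disk_write=on|off */
            size_t min_offload_size;     /* set from 2 << ashift */
    };

    static bool
    should_offload_write(const struct zia_props_sketch *p, size_t io_size)
    {
            return (p->can_offload && p->disk_write == 1 &&
                io_size >= p->min_offload_size);
    }

    int
    main(void)
    {
            struct zia_props_sketch props = {
                    .can_offload = true,
                    .disk_write = 1,
                    .min_offload_size = 2 << 12,    /* ashift=12 */
            };

            /* small I/Os stay on the in-memory software path */
            printf("4K write offload candidate? %d\n",
                should_offload_write(&props, 4096));
            /* large I/Os are candidates for the provider */
            printf("128K write offload candidate? %d\n",
                should_offload_write(&props, 128 * 1024));
            return (0);
    }

With ashift=12 the minimum is 2 << 12 = 8 KiB, so the 4 KiB write stays in memory while the 128 KiB write is a candidate for the provider; when the attempt fails for any reason, the patch falls back to the existing ZFS code path.
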
diff --git a/module/zfs/THIRDPARTYLICENSE.zia.descrip b/module/zfs/THIRDPARTYLICENSE.zia.descrip new file mode 100644 index 000000000000..4be64904acc6 --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.zia.descrip @@ -0,0 +1 @@ +Z.I.A. FUNCTIONALITY IN ZFS \ No newline at end of file diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 2c0cda25dbc6..32f75266706f 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -101,6 +101,7 @@ #include #include #include +#include /* see block comment above for description */ int zfs_abd_scatter_enabled = B_TRUE; @@ -147,11 +148,15 @@ abd_init_struct(abd_t *abd) abd->abd_parent = NULL; #endif abd->abd_size = 0; + + abd->abd_zia_handle = NULL; } static void abd_fini_struct(abd_t *abd) { + zia_free_abd(abd, B_TRUE); + mutex_destroy(&abd->abd_mtx); ASSERT(!list_link_active(&abd->abd_gang_link)); #ifdef ZFS_DEBUG @@ -321,6 +326,8 @@ abd_free(abd_t *abd) abd_free_struct_impl(abd); } +EXPORT_SYMBOL(abd_free); + /* * Allocate an ABD of the same format (same metadata flag, same scatterize * setting) as another ABD. @@ -597,9 +604,15 @@ abd_get_offset_size(abd_t *sabd, size_t off, size_t size) abd_t * abd_get_zeros(size_t size) { + abd_t *abd = NULL; + ASSERT3P(abd_zero_scatter, !=, NULL); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); - return (abd_get_offset_size(abd_zero_scatter, 0, size)); + + abd = abd_get_offset_size(abd_zero_scatter, 0, size); + abd->abd_flags |= ABD_FLAG_ZEROS; + + return (abd); } /* @@ -625,6 +638,8 @@ abd_get_from_buf(void *buf, size_t size) return (abd); } +EXPORT_SYMBOL(abd_get_from_buf); + /* * Get the raw buffer associated with a linear ABD. */ @@ -724,7 +739,6 @@ abd_release_ownership_of_buf(abd_t *abd) abd_update_linear_stats(abd, ABDSTAT_DECR); } - /* * Give this ABD ownership of the buffer that it's storing. 
Can only be used on * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8b440aafba43..6c5828fa053a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -57,6 +57,7 @@ #include #include #include +#include #ifdef _KERNEL #include #include @@ -2543,6 +2544,7 @@ byteswap_uint8_array(void *vbuf, size_t size) void dmu_init(void) { + zia_init(); abd_init(); zfs_dbgmsg_init(); sa_cache_init(); @@ -2558,6 +2560,7 @@ dmu_init(void) void dmu_fini(void) { + zia_fini(); arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); diff --git a/module/zfs/lz4_zfs.c b/module/zfs/lz4_zfs.c index e28215cf3501..cb633833d22d 100644 --- a/module/zfs/lz4_zfs.c +++ b/module/zfs/lz4_zfs.c @@ -80,6 +80,8 @@ lz4_compress_zfs(void *s_start, void *d_start, size_t s_len, return (bufsiz + sizeof (bufsiz)); } +EXPORT_SYMBOL(lz4_compress_zfs); + int lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) @@ -100,6 +102,8 @@ lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len, d_start, bufsiz, d_len) < 0); } +EXPORT_SYMBOL(lz4_decompress_zfs); + /* * LZ4 API Description: * diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 638572996c3a..105551af8978 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #ifdef _KERNEL @@ -486,6 +487,46 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) dp->scd_path, 0, ZPROP_SRC_LOCAL); } } + + zia_props_t *zia_props = zia_get_props(spa); + if (zia_props->provider != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_PROVIDER, + (char *)zia_get_provider_name(zia_props->provider), + 0, ZPROP_SRC_LOCAL); + } + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_COMPRESS, + NULL, zia_props->compress, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_DECOMPRESS, + NULL, zia_props->decompress, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_CHECKSUM, + NULL, zia_props->checksum, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ1_GEN, + NULL, zia_props->raidz.gen[1], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ2_GEN, + NULL, zia_props->raidz.gen[2], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ3_GEN, + NULL, zia_props->raidz.gen[3], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ1_REC, + NULL, zia_props->raidz.rec[1], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ2_REC, + NULL, zia_props->raidz.rec[2], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_RAIDZ3_REC, + NULL, zia_props->raidz.rec[3], ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_FILE_WRITE, + NULL, zia_props->file_write, ZPROP_SRC_LOCAL); + + spa_prop_add_list(*nvp, ZPOOL_PROP_ZIA_DISK_WRITE, + NULL, zia_props->disk_write, ZPROP_SRC_LOCAL); } /* @@ -800,6 +841,20 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) error = SET_ERROR(E2BIG); break; + case ZPOOL_PROP_ZIA_PROVIDER: + case ZPOOL_PROP_ZIA_COMPRESS: + case ZPOOL_PROP_ZIA_DECOMPRESS: + case ZPOOL_PROP_ZIA_CHECKSUM: + case ZPOOL_PROP_ZIA_RAIDZ1_GEN: + case ZPOOL_PROP_ZIA_RAIDZ2_GEN: + case ZPOOL_PROP_ZIA_RAIDZ3_GEN: + case ZPOOL_PROP_ZIA_RAIDZ1_REC: + case ZPOOL_PROP_ZIA_RAIDZ2_REC: + case ZPOOL_PROP_ZIA_RAIDZ3_REC: + case ZPOOL_PROP_ZIA_FILE_WRITE: + case ZPOOL_PROP_ZIA_DISK_WRITE: + break; + default: break; } @@ -2127,6 +2182,11 @@ spa_unload(spa_t *spa) spa->spa_raidz_expand = NULL; + if (zia_get_props(spa)->provider != NULL) { 
+ zia_put_provider(&zia_get_props(spa)->provider, + spa->spa_root_vdev); + } + spa_config_exit(spa, SCL_ALL, spa); } @@ -6618,6 +6678,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_import_os(spa); + zia_get_props(spa)->can_offload = B_FALSE; + mutex_exit(&spa_namespace_lock); return (0); @@ -9476,6 +9538,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; + zia_props_t *zia_props = zia_get_props(spa); mutex_enter(&spa->spa_props_lock); @@ -9549,7 +9612,142 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; + case ZPOOL_PROP_ZIA_PROVIDER: + strval = fnvpair_value_string(elem); + if (zia_props->provider != NULL) + zia_put_provider(&zia_props->provider, + spa->spa_root_vdev); + zia_props->provider = zia_get_provider(strval, + spa->spa_root_vdev); + zia_props->can_offload = !!zia_props->provider; + /* + * Dirty the configuration on vdevs as above. + */ + if (tx->tx_txg != TXG_INITIAL) { + vdev_config_dirty(spa->spa_root_vdev); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + /* + * reopen devices so that provider is used + * copied from zfs_ioc_pool_reopen + */ + spa_vdev_state_enter(spa, SCL_NONE); + vdev_close(spa->spa_root_vdev); + (void) vdev_open(spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); + break; + case ZPOOL_PROP_ZIA_COMPRESS: + zia_props->compress = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_props->compress, + "Compression"); + break; + case ZPOOL_PROP_ZIA_DECOMPRESS: + zia_props->decompress = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_props->decompress, + "Decompression"); + break; + case ZPOOL_PROP_ZIA_CHECKSUM: + zia_props->checksum = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_props->checksum, + "Checksum"); + break; + case ZPOOL_PROP_ZIA_RAIDZ1_GEN: + zia_props->raidz.gen[1] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_props->raidz.gen[1], + "RAIDZ 1 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ2_GEN: + zia_props->raidz.gen[2] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_props->raidz.gen[2], + "RAIDZ 2 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ3_GEN: + zia_props->raidz.gen[3] = + fnvpair_value_uint64(elem); + zia_prop_warn(zia_props->raidz.gen[3], + "RAIDZ 3 Generation"); + break; + case ZPOOL_PROP_ZIA_RAIDZ1_REC: + zia_props->raidz.rec[1] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_props->raidz.rec[1]) { + if (!zia_props->checksum) { + zia_props->checksum = 1; + zia_prop_warn( + zia_props->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_props->raidz.rec[1], + "RAIDZ 1 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_RAIDZ2_REC: + zia_props->raidz.rec[2] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_props->raidz.rec[2]) { + if (!zia_props->checksum) { + zia_props->checksum = 1; + zia_prop_warn( + zia_props->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_props->raidz.rec[2], + "RAIDZ 2 Reconstruction"); + break; + case ZPOOL_PROP_ZIA_RAIDZ3_REC: + zia_props->raidz.rec[3] = + fnvpair_value_uint64(elem); + /* need checksum */ + if (zia_props->raidz.rec[3]) { + if (!zia_props->checksum) { + zia_props->checksum = 1; + zia_prop_warn( + zia_props->checksum, + "Checksum"); + } + } + zia_prop_warn(zia_props->raidz.rec[3], + "RAIDZ 3 Reconstruction"); + break; + case 
ZPOOL_PROP_ZIA_FILE_WRITE: + zia_props->file_write = + fnvpair_value_uint64(elem); + + /* reopen devices so that provider is used */ + spa_vdev_state_enter(spa, SCL_NONE); + vdev_close(spa->spa_root_vdev); + (void) vdev_open(spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + + zia_prop_warn(zia_props->file_write, + "File Write"); + break; + case ZPOOL_PROP_ZIA_DISK_WRITE: + zia_props->disk_write = + fnvpair_value_uint64(elem); + + /* reopen devices so that provider is used */ + spa_vdev_state_enter(spa, SCL_NONE); + vdev_close(spa->spa_root_vdev); + (void) vdev_open(spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + + zia_prop_warn(zia_props->disk_write, + "Disk Write"); + break; case ZPOOL_PROP_INVAL: if (zpool_prop_feature(elemname)) { fname = strchr(elemname, '@') + 1; diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c74f72159dc9..b8bfbeeb259a 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -725,6 +725,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); + vd->vdev_zia_handle = NULL; + return (vd); } @@ -1067,6 +1069,8 @@ vdev_free(vdev_t *vd) */ vdev_close(vd); + ASSERT3P(vd->vdev_zia_handle, ==, NULL); + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index 13bb33cc6871..10eaee5f26da 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -1033,6 +1033,7 @@ vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset, rr->rr_offset = io_offset; rr->rr_size = io_size; #endif + rr->rr_zia_handle = NULL; *rrp = rr; uint8_t *base; diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 15c8b8ca6016..4c690d760e22 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -43,6 +43,7 @@ #include #include #include +#include #ifdef ZFS_DEBUG #include /* For vdev_xlate() in vdev_raidz_io_verify() */ @@ -376,6 +377,8 @@ static int zfs_scrub_after_expand = 1; static void vdev_raidz_row_free(raidz_row_t *rr) { + zia_raidz_free(rr, B_FALSE); + for (int c = 0; c < rr->rr_cols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -628,6 +631,7 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, rr->rr_offset = zio->io_offset; rr->rr_size = zio->io_size; #endif + rr->rr_zia_handle = NULL; uint64_t asize = 0; @@ -1094,7 +1098,7 @@ vdev_raidz_pqr_func(void *buf, size_t size, void *private) return (0); } -static void +void vdev_raidz_generate_parity_p(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -1112,7 +1116,9 @@ vdev_raidz_generate_parity_p(raidz_row_t *rr) } } -static void +EXPORT_SYMBOL(vdev_raidz_generate_parity_p); + +void vdev_raidz_generate_parity_pq(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -1154,7 +1160,9 @@ vdev_raidz_generate_parity_pq(raidz_row_t *rr) } } -static void +EXPORT_SYMBOL(vdev_raidz_generate_parity_pq); + +void vdev_raidz_generate_parity_pqr(raidz_row_t *rr) { uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); @@ -1202,6 +1210,8 @@ vdev_raidz_generate_parity_pqr(raidz_row_t *rr) } } +EXPORT_SYMBOL(vdev_raidz_generate_parity_pqr); + /* * Generate RAID parity in the first virtual columns according to the number of * parity columns available. 
@@ -1888,7 +1898,7 @@ vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, kmem_free(p, psize); } -static void +void vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) { int i, c, t, tt; @@ -2029,6 +2039,8 @@ vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) } } +EXPORT_SYMBOL(vdev_raidz_reconstruct_general); + static void vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, const int *t, int nt) @@ -2333,7 +2345,24 @@ vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) vdev_t *vd = zio->io_vd; raidz_map_t *rm = zio->io_vsd; - vdev_raidz_generate_parity_row(rm, rr); + /* + * here instead of vdev_raidz_map_alloc or + * vdev_raidz_generate_parity_row to not have to + * store local_offload and be able to use zio + */ + boolean_t local_offload = B_FALSE; + if ((zia_raidz_alloc(zio, rr, B_FALSE, 0, &local_offload) != ZIA_OK) || + (zia_raidz_gen(rr) != ZIA_OK)) { + if (zia_raidz_gen_cleanup(zio, rr, + local_offload) == ZIA_ACCELERATOR_DOWN) { + zia_disable_offloading(zio, B_TRUE); + zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; + return; + } + vdev_raidz_generate_parity_row(rm, rr); + } else { + zio->io_flags |= ZIO_FLAG_DONT_AGGREGATE; + } for (int c = 0; c < rr->rr_scols; c++) { raidz_col_t *rc = &rr->rr_col[c]; @@ -2631,14 +2660,69 @@ raidz_checksum_verify(zio_t *zio) { zio_bad_cksum_t zbc = {0}; raidz_map_t *rm = zio->io_vsd; + /* + * if the zio entered this function offloaded, + * need to onload the parity columns on error + */ + const boolean_t entered_offloaded = zia_is_offloaded(zio->io_abd); int ret = zio_checksum_error(zio, &zbc); if (ret != 0 && zbc.zbc_injected != 0) rm->rm_ecksuminjected = 1; + /* + * zio_checksum_error does not get access to + * rm, so only the abd is freed on error - + * clean up rm here + */ + if (zia_is_offloaded(zio->io_abd) != B_TRUE) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + + /* + * force onload, since data was modified + * + * ignore return value - will always return ZIA_ERROR + */ + zia_raidz_rec_cleanup(zio, rr, B_TRUE, + entered_offloaded); + } + } + return (ret); } +static void +raidz_move_orig_parity(zio_t *zio, raidz_row_t *rr, abd_t **orig) +{ + (void) zio; + + for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + + orig[c] = rc->rc_abd; + ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); + rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); + zia_raidz_new_parity(zio, rr, c); + } +} + +static void +raidz_restore_orig_parity(raidz_row_t *rr, abd_t **orig) +{ + for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + + abd_free(rc->rc_abd); + rc->rc_abd = orig[c]; + orig[c] = NULL; + } +} + /* * Generate the parity from the data columns. If we tried and were able to * read the parity without error, verify that the generated parity matches the @@ -2646,10 +2730,10 @@ raidz_checksum_verify(zio_t *zio) * number of such failures. */ static int -raidz_parity_verify(zio_t *zio, raidz_row_t *rr) +raidz_parity_verify(zio_t *zio, raidz_row_t *rr, int *unexpected_errors) { - abd_t *orig[VDEV_RAIDZ_MAXPARITY]; - int c, ret = 0; + abd_t *orig[VDEV_RAIDZ_MAXPARITY] = { NULL }; + int c; raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; @@ -2658,31 +2742,45 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) (BP_IS_GANG(bp) ? 
ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); if (checksum == ZIO_CHECKSUM_NOPARITY) - return (ret); - - for (c = 0; c < rr->rr_firstdatacol; c++) { - rc = &rr->rr_col[c]; - if (!rc->rc_tried || rc->rc_error != 0) - continue; + return (0); - orig[c] = rc->rc_abd; - ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); - rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); - } + raidz_move_orig_parity(zio, rr, orig); /* * Verify any empty sectors are zero filled to ensure the parity * is calculated correctly even if these non-data sectors are damaged. */ if (rr->rr_nempty && rr->rr_abd_empty != NULL) - ret += vdev_draid_map_verify_empty(zio, rr); + *unexpected_errors += vdev_draid_map_verify_empty(zio, rr); /* * Regenerates parity even for !tried||rc_error!=0 columns. This * isn't harmful but it does have the side effect of fixing stuff * we didn't realize was necessary (i.e. even if we return 0). */ - vdev_raidz_generate_parity_row(rm, rr); + if (zia_raidz_gen(rr) != ZIA_OK) { + /* + * restore original parity columns so + * that the reconstructed parity can + * be brought back with the data columns + */ + raidz_restore_orig_parity(rr, orig); + + /* return reconstructed columns to memory */ + const int ret = zia_raidz_rec_cleanup(zio, rr, + B_FALSE, B_TRUE); + + if (ret == ZIA_ACCELERATOR_DOWN) { + return (ret); + } + + /* + * continue to software, so redo the + * original moving of parity columns + */ + raidz_move_orig_parity(zio, rr, orig); + vdev_raidz_generate_parity_row(rm, rr); + } for (c = 0; c < rr->rr_firstdatacol; c++) { rc = &rr->rr_col[c]; @@ -2690,17 +2788,80 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) if (!rc->rc_tried || rc->rc_error != 0) continue; - if (abd_cmp(orig[c], rc->rc_abd) != 0) { + int cmp = 0; + if (zia_raidz_cmp(orig[c], rc->rc_abd, &cmp) != ZIA_OK) { + if (zia_is_offloaded(zio->io_abd) || + rr->rr_zia_handle) { + /* + * should only need to onload orig[c] and + * rc but onloading everything to not create + * inconsistent rr state + */ + int ret = zia_raidz_rec_cleanup(zio, rr, + B_FALSE, B_TRUE); + + for (uint64_t i = 0; i < rr->rr_firstdatacol; + i++) { + if (orig[i]) { + ret = zia_worst_error(ret, + zia_onload_abd(orig[i], + orig[i]->abd_size, + B_FALSE)); + } + } + + if (ret == ZIA_ACCELERATOR_DOWN) { + /* + * get original parity columns back to + * get the original in-memory data + */ + raidz_restore_orig_parity(rr, orig); + return (ret); + } + } + cmp = abd_cmp(orig[c], rc->rc_abd); + } + if (cmp != 0) { + if (zia_is_offloaded(zio->io_abd) || + rr->rr_zia_handle) { + /* + * should only need to onload orig[c] and + * rc but onloading everything to not create + * inconsistent rr state + */ + int ret = zia_raidz_rec_cleanup(zio, rr, + B_FALSE, B_TRUE); + + for (uint64_t i = 0; i < rr->rr_firstdatacol; + i++) { + if (orig[i]) { + ret = zia_worst_error(ret, + zia_onload_abd(orig[i], + orig[i]->abd_size, + B_FALSE)); + } + } + + if (ret == ZIA_ACCELERATOR_DOWN) { + /* + * get original parity columns back to + * get the original in-memory data + */ + raidz_restore_orig_parity(rr, orig); + return (ret); + } + } + zfs_dbgmsg("found error on col=%u devidx=%u off %llx", c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); - ret++; + (*unexpected_errors)++; } abd_free(orig[c]); } - return (ret); + return (0); } static int @@ -2716,7 +2877,7 @@ vdev_raidz_worst_error(raidz_row_t *rr) return (error); } -static void +static int vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) 
{ int unexpected_errors = 0; @@ -2756,8 +2917,11 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) if (parity_errors + parity_untried < rr->rr_firstdatacol - data_errors || (zio->io_flags & ZIO_FLAG_RESILVER)) { - int n = raidz_parity_verify(zio, rr); - unexpected_errors += n; + int ret = + raidz_parity_verify(zio, rr, &unexpected_errors); + if (ret != 0) { + return (ret); + } } if (zio->io_error == 0 && spa_writeable(zio->io_spa) && @@ -2826,6 +2990,7 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) zio_nowait(cio); } } + return (0); } static void @@ -2970,15 +3135,43 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) zfs_dbgmsg("reconstruction not possible; " "too many failures"); } + + /* drop offloaded data */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); + /* no data movement, so errors don't matter */ + } raidz_restore_orig_data(rm); return (EINVAL); } - if (dead_data > 0) - vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); + + if (dead_data > 0) { + /* + * here instead of vdev_raidz_reconstruct_row + * to be able to use zio + */ + if (zia_raidz_rec(rr, my_tgts, t) != ZIA_OK) { + int ret = ZIA_OK; + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + ret = zia_worst_error(ret, + zia_raidz_rec_cleanup(zio, rr, + B_FALSE, B_TRUE)); + } + + if ((ret != ZIA_OK) && + (ret != ZIA_ACCELERATOR_DOWN)) { + vdev_raidz_reconstruct_row(rm, rr, + my_tgts, t); + } + } + } } /* Check for success */ if (raidz_checksum_verify(zio) == 0) { + int ret = 0; /* Reconstruction succeeded - report errors */ for (int i = 0; i < rm->rm_nrows; i++) { @@ -3008,19 +3201,32 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) } } - vdev_raidz_io_done_verified(zio, rr); + const int rc = + vdev_raidz_io_done_verified(zio, rr); + ret = zia_worst_error(ret, rc); } zio_checksum_verified(zio); + if (ret != ZIA_ACCELERATOR_DOWN) { + ret = 0; + } + if (dbgmsg) { zfs_dbgmsg("reconstruction successful " "(checksum verified)"); } - return (0); + + return (ret); } /* Reconstruction failed - restore original data */ + /* drop offloaded data */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); + /* no data movement, so errors don't matter */ + } raidz_restore_orig_data(rm); if (dbgmsg) { zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " @@ -3128,6 +3334,9 @@ vdev_raidz_combrec(zio_t *zio) for (;;) { int err = raidz_reconstruct(zio, ltgts, num_failures, nparity); + if (err == ZIA_ACCELERATOR_DOWN) { + return (err); + } if (err == EINVAL) { /* * Reconstruction not possible with this # @@ -3316,6 +3525,18 @@ vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, ASSERT(rr->rr_firstdatacol >= n); + if (zia_raidz_rec(rr, tgts, n) == ZIA_OK) { + return; + } + + /* + * drop handles instead of onloading + * + * return value doesn't matter because + * the data hasn't changed yet + */ + zia_raidz_rec_cleanup(zio, rr, + B_TRUE, B_FALSE); vdev_raidz_reconstruct_row(rm, rr, tgts, n); } } @@ -3437,16 +3658,88 @@ vdev_raidz_io_done(zio_t *zio) } } + /* the raidz rows should never enter here already offloaded */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + ASSERT(rr->rr_zia_handle == NULL); + } + + /* offload once at beginning */ + blkptr_t *bp = zio->io_bp; + if (bp && !BP_IS_METADATA(bp)) { + uint_t checksum = (BP_IS_GANG(bp) ? 
+ ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)); + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + if (!(ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + /* + * Allow unchecked failure since failure + * to offload means the software path + * will be taken. Whether or not the + * provider/offloader is valid + * becomes irrelevant. + */ + zia_raidz_alloc(zio, rr, + B_TRUE, checksum, NULL); + } + } + } + for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; vdev_raidz_io_done_reconstruct_known_missing(zio, rm, rr); + /* + * Restarting here is unnecessary. If the offloader + * failed, the offloaded data is still in sync with + * the in-memory data, and falling back reconstructed + * using the correct data. + */ } - if (raidz_checksum_verify(zio) == 0) { + int ret = raidz_checksum_verify(zio); + + /* ZIA_ACCELERATOR_DOWN is a completely orthogonal error */ + if (ret == ZIA_ACCELERATOR_DOWN) { + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, B_TRUE, B_FALSE); + } + + zio->io_can_offload = B_FALSE; + zio_vdev_io_redone(zio); + return; + } + + if (ret == 0) { for (int i = 0; i < rm->rm_nrows; i++) { raidz_row_t *rr = rm->rm_row[i]; - vdev_raidz_io_done_verified(zio, rr); + ret = + vdev_raidz_io_done_verified(zio, rr); + if (ret == ZIA_ACCELERATOR_DOWN) { + for (int j = 0; j < rm->rm_nrows; j++) { + rr = rm->rm_row[j]; + + /* + * vdev_raidz_io_done_verified + * will have already attempted + * to load reconstructed data + * back into memory, so this + * line should just drop any + * remaining handles + * + * not sure why onload_parity + * has to be set to B_TRUE + */ + zia_raidz_rec_cleanup(zio, rr, + B_TRUE, B_TRUE); + } + + zio->io_can_offload = B_FALSE; + zio_vdev_io_redone(zio); + return; + } } zio_checksum_verified(zio); } else { @@ -3473,6 +3766,12 @@ vdev_raidz_io_done(zio_t *zio) rm->rm_row[i]); } if (nread != 0) { + /* drop handles */ + for (int i = 0; i < rm->rm_nrows; i++) { + raidz_row_t *rr = rm->rm_row[i]; + zia_raidz_rec_cleanup(zio, rr, + B_TRUE, B_FALSE); + } /* * Normally our stage is VDEV_IO_DONE, but if * we've already called redone(), it will have @@ -3532,6 +3831,14 @@ vdev_raidz_io_done(zio_t *zio) * that is also a known failure, that's fine. */ zio->io_error = vdev_raidz_combrec(zio); + + if (zio->io_error == ZIA_ACCELERATOR_DOWN) { + zio->io_error = 0; + zio->io_can_offload = B_FALSE; + zio_vdev_io_redone(zio); + return; + } + if (zio->io_error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { vdev_raidz_io_done_unrecoverable(zio); diff --git a/module/zfs/zia.c b/module/zfs/zia.c new file mode 100644 index 000000000000..9f026877d6ff --- /dev/null +++ b/module/zfs/zia.c @@ -0,0 +1,1726 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. 
The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef ZIA +#include +#else +typedef void * dpusm_uf_t; +#endif + +/* ************************************************************* */ +/* global offloader functions initialized with ZFS */ +static const dpusm_uf_t *dpusm = NULL; +/* ************************************************************* */ + +zia_props_t * +zia_get_props(spa_t *spa) +{ + return (spa?&spa->spa_zia_props:NULL); +} + +void +zia_prop_warn(boolean_t val, const char *name) +{ +#ifdef _KERNEL + if (val == B_TRUE) { + printk("Z.I.A. %s enabled. 
Encryption and " + "Dedup for this spa will be disabled.\n", + name); + } +#else + (void) val; (void) name; +#endif +} + +int +dpusm_to_ret(const int dpusm_ret) +{ +#ifdef ZIA + int zia_ret = ZIA_FALLBACK; + switch (dpusm_ret) { + case DPUSM_OK: + zia_ret = ZIA_OK; + break; + case DPUSM_ERROR: + zia_ret = ZIA_ERROR; + break; + case DPUSM_PROVIDER_MISMATCH: + zia_ret = ZIA_PROVIDER_MISMATCH; + break; + case DPUSM_NOT_IMPLEMENTED: + case DPUSM_NOT_SUPPORTED: + zia_ret = ZIA_FALLBACK; + break; + case DPUSM_BAD_RESULT: + zia_ret = ZIA_BAD_RESULT; + break; + case DPUSM_PROVIDER_NOT_EXISTS: + case DPUSM_PROVIDER_INVALIDATED: + case DPUSM_PROVIDER_UNREGISTERED: + default: + zia_ret = ZIA_ACCELERATOR_DOWN; + break; + } + return (zia_ret); +#else + (void) dpusm_ret; + return (ZIA_DISABLED); +#endif +} + +#ifdef ZIA +dpusm_compress_t +compress_to_dpusm(enum zio_compress c) +{ + dpusm_compress_t dpusm_c = 0; + + switch (c) { + case ZIO_COMPRESS_GZIP_1: + dpusm_c = DPUSM_COMPRESS_GZIP_1; + break; + case ZIO_COMPRESS_GZIP_2: + dpusm_c = DPUSM_COMPRESS_GZIP_2; + break; + case ZIO_COMPRESS_GZIP_3: + dpusm_c = DPUSM_COMPRESS_GZIP_3; + break; + case ZIO_COMPRESS_GZIP_4: + dpusm_c = DPUSM_COMPRESS_GZIP_4; + break; + case ZIO_COMPRESS_GZIP_5: + dpusm_c = DPUSM_COMPRESS_GZIP_5; + break; + case ZIO_COMPRESS_GZIP_6: + dpusm_c = DPUSM_COMPRESS_GZIP_6; + break; + case ZIO_COMPRESS_GZIP_7: + dpusm_c = DPUSM_COMPRESS_GZIP_7; + break; + case ZIO_COMPRESS_GZIP_8: + dpusm_c = DPUSM_COMPRESS_GZIP_8; + break; + case ZIO_COMPRESS_GZIP_9: + dpusm_c = DPUSM_COMPRESS_GZIP_9; + break; + case ZIO_COMPRESS_LZ4: + dpusm_c = DPUSM_COMPRESS_LZ4; + break; + case ZIO_COMPRESS_INHERIT: + case ZIO_COMPRESS_ON: + case ZIO_COMPRESS_OFF: + case ZIO_COMPRESS_LZJB: + case ZIO_COMPRESS_EMPTY: + case ZIO_COMPRESS_ZLE: + case ZIO_COMPRESS_ZSTD: + case ZIO_COMPRESS_FUNCTIONS: + default: + break; + } + + return (dpusm_c); +} + +static dpusm_checksum_t +checksum_to_dpusm(enum zio_checksum c) +{ + dpusm_checksum_t dpusm_c = 0; + switch (c) { + case ZIO_CHECKSUM_FLETCHER_2: + dpusm_c = DPUSM_CHECKSUM_FLETCHER_2; + break; + case ZIO_CHECKSUM_FLETCHER_4: + dpusm_c = DPUSM_CHECKSUM_FLETCHER_4; + break; + case ZIO_CHECKSUM_INHERIT: + case ZIO_CHECKSUM_ON: + case ZIO_CHECKSUM_OFF: + case ZIO_CHECKSUM_LABEL: + case ZIO_CHECKSUM_GANG_HEADER: + case ZIO_CHECKSUM_ZILOG: + case ZIO_CHECKSUM_SHA256: + case ZIO_CHECKSUM_ZILOG2: + case ZIO_CHECKSUM_NOPARITY: + case ZIO_CHECKSUM_SHA512: + case ZIO_CHECKSUM_SKEIN: + default: + break; + } + + return (dpusm_c); +} + +static dpusm_checksum_byteorder_t +byteorder_to_dpusm(zio_byteorder_t bo) +{ + dpusm_checksum_byteorder_t dpusm_bo = DPUSM_BYTEORDER_MAX; + switch (bo) { + case ZIO_CHECKSUM_NATIVE: + dpusm_bo = DPUSM_BYTEORDER_NATIVE; + break; + case ZIO_CHECKSUM_BYTESWAP: + dpusm_bo = DPUSM_BYTEORDER_BYTESWAP; + break; + default: + break; + } + + return (dpusm_bo); +} +#endif + +#ifdef ZIA +int +zia_get_capabilities(void *provider, dpusm_pc_t **caps) +{ + /* dpusm is checked by the caller */ + /* provider and caps are checked by the dpusm */ + return (dpusm_to_ret(dpusm->capabilities(provider, caps))); +} +#endif + +int +zia_init(void) +{ +#ifdef ZIA + if (dpusm) { + return (ZIA_OK); + } + + if (dpusm_initialize) { + dpusm = dpusm_initialize(); + } + + if (!dpusm) { +#ifdef _KERNEL + printk("Warning: Z.I.A. not initialized\n"); +#endif + return (ZIA_ERROR); + } + +#ifdef _KERNEL + printk("Z.I.A. 
initialized (%p)\n", dpusm); +#endif + return (ZIA_OK); +#else + return (ZIA_DISABLED); +#endif +} + +int +zia_fini(void) +{ + if (!dpusm) { +#ifdef _KERNEL + printk("Warning: Z.I.A. not initialized. " + "Not uninitializing.\n"); +#endif + return (ZIA_ERROR); + } + +#ifdef ZIA + if (dpusm_finalize) { + dpusm_finalize(); +#ifdef _KERNEL + printk("Z.I.A. finalized\n"); +#endif + } else { +#ifdef _KERNEL + if (dpusm) { + printk("Z.I.A. incomplete finalize\n"); + } +#endif + } +#endif + + dpusm = NULL; + return (ZIA_OK); +} + +#ifdef ZIA +/* recursively find all leaf vdevs and open them */ +static void zia_open_vdevs(vdev_t *vd) { + vdev_ops_t *ops = vd->vdev_ops; + if (ops->vdev_op_leaf) { + ASSERT(!vd->vdev_zia_handle); + + const size_t len = strlen(ops->vdev_op_type); + if (len == 4) { + if (memcmp(ops->vdev_op_type, "file", 4) == 0) { + zia_file_open(vd, vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), + 0); + } +#ifdef _KERNEL + else if (memcmp(ops->vdev_op_type, "disk", 4) == 0) { + /* first member is struct block_device * */ + void *disk = vd->vdev_tsd; + zia_disk_open(vd, vd->vdev_path, disk); + } +#endif + } + } else { + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_t *child = vd->vdev_child[i]; + zia_open_vdevs(child); + } + } +} +#endif + +void * +zia_get_provider(const char *name, vdev_t *vdev) +{ +#ifdef ZIA + if (!dpusm) { + return (NULL); + } + + void *provider = NULL; + provider = dpusm->get(name); +#ifdef _KERNEL + printk("Z.I.A. obtained handle to provider \"%s\" (%p)", + name, provider); +#endif + + /* set up Z.I.A. for existing vdevs */ + if (vdev) { + zia_open_vdevs(vdev); + } + return (provider); +#else + (void) name; (void) vdev; + return (NULL); +#endif +} + +const char * +zia_get_provider_name(void *provider) +{ + if (!dpusm || !provider) { + return (NULL); + } + +#ifdef ZIA + return (dpusm->get_name(provider)); +#else + return (NULL); +#endif +} + +#ifdef ZIA +/* recursively find all leaf vdevs and close them */ +static void zia_close_vdevs(vdev_t *vd) { + vdev_ops_t *ops = vd->vdev_ops; + if (ops->vdev_op_leaf) { + const size_t len = strlen(ops->vdev_op_type); + if (len == 4) { + if (memcmp(ops->vdev_op_type, "file", 4) == 0) { + zia_file_close(vd); + } +#ifdef _KERNEL + else if (memcmp(ops->vdev_op_type, "disk", 4) == 0) { + zia_disk_close(vd); + } +#endif + } + } else { + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_t *child = vd->vdev_child[i]; + zia_close_vdevs(child); + } + } +} +#endif + +int +zia_put_provider(void **provider, vdev_t *vdev) +{ +#ifdef ZIA + if (!dpusm || !provider || !*provider) { + return (ZIA_FALLBACK); + } + + /* + * if the zpool is not going down, but the provider is going away, + * make sure the vdevs don't keep pointing to the invalid provider + */ + if (vdev) { + zia_close_vdevs(vdev); + } + +#ifdef _KERNEL + const char *name = zia_get_provider_name(*provider); +#endif + + const int ret = dpusm->put(*provider); + +#ifdef _KERNEL + printk("Z.I.A. 
returned provider handle \"%s\" " + "(%p) and got return value %d", + name, *provider, ret); +#endif + + *provider = NULL; + + return (dpusm_to_ret(ret)); +#else + (void) provider; (void) vdev; + return (ZIA_DISABLED); +#endif +} + +int +zia_disable_offloading(zio_t *zio, boolean_t reexecute) +{ + if (!zio) { + return (ZIA_ERROR); + } + + /* stop all future zios from offloading */ + spa_t *spa = zio->io_spa; + zia_props_t *zia_props = zia_get_props(spa); + mutex_enter(&spa->spa_props_lock); + zia_props->can_offload = B_FALSE; + mutex_exit(&spa->spa_props_lock); + + /* stop this zio from offloading again */ + zio->io_can_offload = B_FALSE; + + if (reexecute == B_TRUE) { + zio->io_flags |= ZIO_FLAG_ZIA_REEXECUTE; + } + + return (ZIA_OK); +} + +boolean_t +zia_is_used(zio_t *zio) +{ + if (!zio) { + return (B_FALSE); + } + + zia_props_t *props = zia_get_props(zio->io_spa); + + /* provider + at least 1 operation */ + if (props->provider && + (props->compress || + props->decompress || + props->checksum || + props->raidz.gen[1] || + props->raidz.gen[2] || + props->raidz.gen[3] || + props->raidz.rec[1] || + props->raidz.rec[2] || + props->raidz.rec[3] || + props->file_write || + props->disk_write)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +zia_is_offloaded(abd_t *abd) +{ + if (!abd) { + return (B_FALSE); + } + + return (ABD_HANDLE(abd)?B_TRUE:B_FALSE); +} + +int +zia_worst_error(const int lhs, const int rhs) +{ + if (lhs == ZIA_ACCELERATOR_DOWN) { + return (lhs); + } + + if (rhs == ZIA_ACCELERATOR_DOWN) { + return (rhs); + } + + if (lhs == ZIA_OK) { + return (rhs); + } + + if (rhs == ZIA_OK) { + return (lhs); + } + + return (ZIA_ERROR); +} + +/* create a provider handle/offloader buffer without copying data */ +void * +zia_alloc(void *provider, size_t size, size_t min_offload_size) +{ +#ifdef ZIA + if (size < min_offload_size) { + return (NULL); + } + + return ((dpusm && provider)?dpusm->alloc(provider, size):NULL); +#else + (void) provider; (void) size; (void) min_offload_size; + return (NULL); +#endif +} + +/* free the offloader handle without onloading the data */ +int +zia_free(void **handle) +{ + ASSERT(handle); + +#ifdef ZIA + int ret = DPUSM_OK; + if (dpusm) { + ret = dpusm->free(*handle); + *handle = NULL; + } + return (dpusm_to_ret(ret)); +#else + return (ZIA_DISABLED); +#endif +} + +/* move data from the offloader into a linear abd and unregister the mapping */ +int +zia_onload(void **handle, void *buf, size_t size) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!handle || !*handle || !buf) { + return (ZIA_ERROR); + } + + void *provider = dpusm->extract(*handle); + if (!provider) { + return (ZIA_ERROR); + } + + dpusm_pc_t *caps = NULL; + if (zia_get_capabilities(provider, &caps) != ZIA_OK) { + return (ZIA_ERROR); + } + + dpusm_mv_t mv = { .handle = *handle, .offset = 0 }; + int ret = ZIA_ERROR; + + if (caps->optional & DPUSM_OPTIONAL_COPY_TO_PTR) { + ret = dpusm_to_ret(dpusm->copy.to.ptr(&mv, buf, size)); + } else { + ret = dpusm_to_ret(dpusm->copy.to.generic(&mv, buf, size)); + } + + /* + * if success, no more need for handle + * if failure, can't do anything with + * handle in any case, so destroy it + */ + zia_free(handle); + + return (dpusm_to_ret(ret)); +#else + (void) handle; (void) buf; (void) size; + return (ZIA_DISABLED); +#endif +} + +#ifdef ZIA +static int +zia_offload_generic_cb(void *buf, size_t len, void *priv) +{ + dpusm_mv_t *mv = (dpusm_mv_t *)priv; + + const int ret = dpusm->copy.from.generic(mv, buf, len); + if 
(dpusm_to_ret(ret) != ZIA_OK) { + return (ZIA_ERROR); + } + + mv->offset += len; + return (0); +} +#endif + +/* offload abd + offset to handle + 0 */ +static int +zia_offload_abd_offset(void *provider, abd_t *abd, + size_t offset, size_t size, + size_t min_offload_size, boolean_t *local_offload) +{ +#ifdef ZIA + /* already offloaded */ + if (ABD_HANDLE(abd)) { + if (local_offload) { + *local_offload = B_FALSE; + } + + if (!provider) { + return (ZIA_OK); + } + + void *abd_provider = dpusm->extract(ABD_HANDLE(abd)); + return ((provider == abd_provider)? + ZIA_OK:ZIA_PROVIDER_MISMATCH); + } + + dpusm_pc_t *caps = NULL; + if (zia_get_capabilities(provider, &caps) != ZIA_OK) { + return (ZIA_ERROR); + } + + if (local_offload) { + *local_offload = B_TRUE; + } + + /* provider is checked by dpusm */ + void *handle = zia_alloc(provider, size, min_offload_size); + if (!handle) { + return (ZIA_ERROR); + } + + dpusm_mv_t mv = { + .handle = handle, + .offset = 0, + }; + + int ret = ZIA_FALLBACK; + if (abd_is_linear(abd) == B_TRUE) { + ret = dpusm->copy.from.generic(&mv, + ABD_LINEAR_BUF(abd), size); + ret = dpusm_to_ret(ret); + } else { + ret = abd_iterate_func(abd, offset, size, + zia_offload_generic_cb, &mv); + + if (ret == 0) { + ret = ZIA_OK; + } + } + + if (ret == ZIA_OK) { + ABD_HANDLE(abd) = handle; + } else { + zia_free(&handle); + } + + return (ret); +#else + (void) provider; (void) abd; (void) offset; + (void) size; (void) min_offload_size; + (void) local_offload; + return (ZIA_DISABLED); +#endif +} + +int +zia_offload_abd(void *provider, abd_t *abd, + size_t size, size_t min_offload_size, + boolean_t *local_offload, boolean_t lock) +{ + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + if (!abd) { + return (ZIA_ERROR); + } + + if (lock) { + mutex_enter(&abd->abd_mtx); + } + + const int rc = zia_offload_abd_offset(provider, + abd, 0, size, min_offload_size, local_offload); + + if (lock) { + mutex_exit(&abd->abd_mtx); + } + + return (rc); +} + +#ifdef ZIA +static int +zia_onload_generic_cb(void *buf, size_t len, void *priv) +{ + dpusm_mv_t *mv = (dpusm_mv_t *)priv; + + const int ret = dpusm->copy.to.generic(mv, buf, len); + if (dpusm_to_ret(ret) != ZIA_OK) { + return (ZIA_ERROR); + } + + mv->offset += len; + return (0); +} +#endif + +/* onload handle + 0 into abd + offset */ +static int +zia_onload_abd_offset(abd_t *abd, size_t offset, + size_t size, boolean_t keep_handle) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd) { + return (ZIA_ERROR); + } + + void *handle = ABD_HANDLE(abd); + if (!handle) { + return (ZIA_ERROR); + } + + void *provider = dpusm->extract(handle); + if (!provider) { + mutex_exit(&abd->abd_mtx); + return (ZIA_ERROR); + } + + dpusm_pc_t *caps = NULL; + if (zia_get_capabilities(provider, &caps) != ZIA_OK) { + mutex_exit(&abd->abd_mtx); + return (ZIA_ERROR); + } + + dpusm_mv_t mv = { + .handle = handle, + .offset = 0, + }; + + int ret = ZIA_FALLBACK; + if (abd_is_linear(abd) == B_TRUE) { + ret = dpusm->copy.to.generic(&mv, + ABD_LINEAR_BUF(abd), size); + ret = dpusm_to_ret(ret); + } else { + ret = abd_iterate_func(abd, offset, size, + zia_onload_generic_cb, &mv); + + if (ret == 0) { + ret = ZIA_OK; + } + } + + if (keep_handle != B_TRUE) { + zia_free_abd(abd, B_FALSE); + } + + return (ret); +#else + (void) abd; (void) offset; (void) size; (void) keep_handle; + return (ZIA_DISABLED); +#endif +} + +int +zia_onload_abd(abd_t *abd, size_t size, boolean_t keep_handle) +{ + if (abd_is_gang(abd)) { + /* + * the only gangs that show up are from raidz 
+ * + * get leading data size, stopping at first zero page + * which should always be the second child + */ + const size_t original_size = size; + size = 0; + for (abd_t *child = list_head(&ABD_GANG(abd).abd_gang_chain); + child != NULL; + child = list_next(&ABD_GANG(abd).abd_gang_chain, child)) { + if (child->abd_flags & ABD_FLAG_ZEROS) { + break; + } + + size += child->abd_size; + } + + ASSERT(size <= original_size); + } + + return (zia_onload_abd_offset(abd, 0, size, keep_handle)); +} + +void +zia_move_into_abd(abd_t *dst, void **src_handle) +{ + ABD_HANDLE(dst) = *src_handle; + *src_handle = NULL; +} + +int +zia_free_abd(abd_t *abd, boolean_t lock) +{ + if (lock == B_TRUE) { + mutex_enter(&abd->abd_mtx); + } + + const int ret = zia_free(&ABD_HANDLE(abd)); + + if (lock == B_TRUE) { + mutex_exit(&abd->abd_mtx); + } + return (ret); +} + +/* + * if offloaded locally, just free the handle + * if not, onload the data and free the handle + */ +int +zia_cleanup_abd(abd_t *abd, size_t size, + boolean_t local_offload, boolean_t lock) +{ + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd) { + return (ZIA_ERROR); + } + + int ret = ZIA_OK; + + if (lock == B_TRUE) { + mutex_enter(&abd->abd_mtx); + } + + if (local_offload == B_TRUE) { + /* in-memory copy is still valid */ + /* lock just in case mirrors clean up at the same time */ + ret = zia_free_abd(abd, B_FALSE); + } else { + /* have to copy data into memory */ + ret = zia_onload_abd(abd, size, B_FALSE); + } + + if (lock == B_TRUE) { + mutex_exit(&abd->abd_mtx); + } + + return (ret); +} + +void +zia_restart_before_vdev(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (BP_IS_ENCRYPTED(bp) && + (zio->io_stage != ZIO_STAGE_ENCRYPT)) { + zio_pop_transform(zio); + } + + if ((BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) && + (zio->io_stage != ZIO_STAGE_WRITE_COMPRESS)) { + zio_pop_transform(zio); + BP_SET_PSIZE(bp, zio->io_size); + BP_SET_LSIZE(bp, zio->io_size); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + } + + zia_disable_offloading(zio, B_TRUE); + + /* only keep trace up to issue async */ + zio->io_pipeline_trace &= + ZIO_STAGE_OPEN | + ZIO_STAGE_READ_BP_INIT | + ZIO_STAGE_WRITE_BP_INIT | + ZIO_STAGE_FREE_BP_INIT | + ZIO_STAGE_ISSUE_ASYNC; + + /* let zio_execute find the stage after issue async */ + zio->io_stage = ZIO_STAGE_ISSUE_ASYNC; +} + +int +zia_zero_fill(abd_t *abd, size_t offset, size_t size) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd || !ABD_HANDLE(abd)) { + return (ZIA_ERROR); + } + + return (dpusm_to_ret(dpusm->zero_fill(ABD_HANDLE(abd), offset, size))); +#else + (void) abd; (void) offset; (void) size; + return (ZIA_DISABLED); +#endif +} + +int +zia_compress(zia_props_t *props, enum zio_compress c, + abd_t *src, size_t s_len, + void **cbuf_handle, uint64_t *c_len, + uint8_t level, boolean_t *local_offload) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + return (zia_compress_impl(dpusm, props, c, src, s_len, + cbuf_handle, c_len, level, local_offload)); +#else + (void) props; (void) c; (void) src; (void) s_len; + (void) cbuf_handle; (void) c_len; (void) level; + (void) local_offload; + return (ZIA_DISABLED); +#endif +} + +int +zia_decompress(zia_props_t *props, enum zio_compress c, + abd_t *src, size_t s_len, abd_t *dst, size_t d_len, + uint8_t *level) +{ +#ifdef ZIA + if (!props) { + return (ZIA_ERROR); + } + + if (!dpusm || !props->provider) { + return (ZIA_FALLBACK); + } + + /* check that decompression can be done before offloading src */ + dpusm_pc_t *caps = NULL; + if 
((zia_get_capabilities(props->provider, &caps) != ZIA_OK) || + !(caps->decompress & compress_to_dpusm(c))) { + return (ZIA_FALLBACK); + } + + int ret = zia_offload_abd(props->provider, src, + s_len, props->min_offload_size, NULL, B_FALSE); + if (ret != ZIA_OK) { + return (ret); + } + + /* + * allocate space for decompressed data + * + * a lot of these will fail because d_len tends to be small + */ + ABD_HANDLE(dst) = zia_alloc(props->provider, d_len, + props->min_offload_size); + if (!ABD_HANDLE(dst)) { + /* let abd_free clean up zio->io_abd */ + return (ZIA_ERROR); + } + + /* + * d_len pulled from accelerator is not used, so + * passing in address of local variable is fine + */ + int cmp_level = *level; + ret = dpusm->decompress(compress_to_dpusm(c), &cmp_level, + ABD_HANDLE(src), s_len, ABD_HANDLE(dst), &d_len); + *level = cmp_level; + + if (ret != DPUSM_OK) { + zia_free_abd(dst, B_FALSE); + /* let abd_free clean up zio->io_abd */ + } + return (dpusm_to_ret(ret)); +#else + (void) props; (void) c; (void) src; (void) s_len; + (void) dst; (void) d_len; (void) level; + return (ZIA_FALLBACK); +#endif +} + +int +zia_checksum_compute(void *provider, zio_cksum_t *dst, enum zio_checksum alg, + zio_t *zio, uint64_t size, boolean_t *local_offload) +{ +#ifdef ZIA + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + const dpusm_checksum_byteorder_t byteorder = + byteorder_to_dpusm(BP_SHOULD_BYTESWAP(zio->io_bp)); + + if (!ABD_HANDLE(zio->io_abd)) { + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(provider, &caps) != ZIA_OK) || + !(caps->checksum & checksum_to_dpusm(alg)) || + !(caps->checksum_byteorder & byteorder)) { + return (ZIA_FALLBACK); + } + + if (zia_offload_abd(provider, zio->io_abd, size, + zia_get_props(zio->io_spa)->min_offload_size, + local_offload, B_FALSE) != ZIA_OK) { + return (ZIA_ERROR); + } + } else { + void *old_provider = dpusm->extract(ABD_HANDLE(zio->io_abd)); + if (old_provider != provider) { + return (ZIA_PROVIDER_MISMATCH); + } + + /* skip checks because dpusm will do them */ + } + + return (dpusm_to_ret(dpusm->checksum(checksum_to_dpusm(alg), + byteorder, ABD_HANDLE(zio->io_abd), size, dst->zc_word, + sizeof (dst->zc_word)))); +#else + (void) provider; (void) dst; (void) alg; + (void) zio; (void) size; (void) local_offload; + return (ZIA_FALLBACK); +#endif +} + +int +zia_checksum_error(enum zio_checksum alg, abd_t *abd, + uint64_t size, int byteswap, zio_cksum_t *actual_cksum) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!abd || !actual_cksum) { + return (ZIA_ERROR); + } + + if (!ABD_HANDLE(abd)) { + return (ZIA_FALLBACK); + } + + const dpusm_checksum_byteorder_t byteorder = + byteorder_to_dpusm(byteswap); + + return (dpusm_to_ret(dpusm->checksum(checksum_to_dpusm(alg), + byteorder, ABD_HANDLE(abd), size, actual_cksum->zc_word, + sizeof (actual_cksum->zc_word)))); +#else + (void) alg; (void) abd; (void) size; + (void) byteswap; (void) actual_cksum; + return (ZIA_FALLBACK); +#endif +} + +#ifdef ZIA +static boolean_t +zia_can_raidz(raidz_row_t *rr, zia_props_t *props, uint64_t raidn, + boolean_t rec, uint_t cksum, size_t *col_sizes) +{ + /* + * generation is needed for both + * generation and reconstruction + */ + int good = ( + /* raidz generation is turned on */ + (props->raidz.gen[raidn] == 1) && + + /* + * the provider knows whether or not + * raidz functions are available + */ + (dpusm->raid.can_compute(props->provider, raidn, + rr->rr_cols - rr->rr_firstdatacol, + col_sizes, rec == B_TRUE) == DPUSM_OK)); + + if (good && (rec == 
B_TRUE)) { + dpusm_pc_t *caps = NULL; + if (zia_get_capabilities(props->provider, &caps) != ZIA_OK) { + return (B_FALSE); + } + + good &= ( + /* raidz reconstruction is turned on */ + (props->raidz.rec[raidn] == 1) && + + /* need checksum */ + (props->checksum == 1) && + + /* raidz reconstruction support was checked earlier */ + + /* make sure the checksum is supported by the provider */ + (caps->checksum & checksum_to_dpusm(cksum))); + } + return (good?B_TRUE:B_FALSE); +} +#endif + +/* onload abd and delete raidz_row_t stuff */ +static int +zia_raidz_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, boolean_t onload_parity) +{ + /* + * bring data back to zio->io_abd, which should + * place data into parent automatically + */ + + const int ret = zia_worst_error( + zia_raidz_free(rr, onload_parity), + zia_cleanup_abd(zio->io_abd, zio->io_size, local_offload, B_TRUE)); + + return (ret); +} + +int +zia_raidz_alloc(zio_t *zio, raidz_row_t *rr, boolean_t rec, + uint_t cksum, boolean_t *local_offload) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!zio || !rr) { + return (ZIA_ERROR); + } + + /* do not offload in the middle of resilvering */ + if (zio->io_flags & ZIO_FLAG_RESILVER) { + if (!ABD_HANDLE(zio->io_abd)) { + return (ZIA_FALLBACK); + } + } + + /* + * existence of row handle implies existence + * of data and column handles + */ + if (rr->rr_zia_handle) { + return (ZIA_OK); + } + + if (zio->io_can_offload != B_TRUE) { + return (ZIA_ACCELERATOR_DOWN); + } + + const uint64_t raidn = rr->rr_firstdatacol; + if ((1 > raidn) || (raidn > 3)) { + return (ZIA_ERROR); + } + + /* need at least raidn + 2 columns */ + if (raidn + 2 > rr->rr_cols) { + return (ZIA_ERROR); + } + + zia_props_t *props = zia_get_props(zio->io_spa); + + /* get column sizes */ + const size_t column_sizes_size = sizeof (size_t) * rr->rr_cols; + size_t *column_sizes = kmem_alloc(column_sizes_size, KM_SLEEP); + for (uint64_t c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* this is tied to the ashift, not to the accelerator */ + if (rc->rc_abd->abd_size < props->min_offload_size) { + kmem_free(column_sizes, column_sizes_size); + return (ZIA_FALLBACK); + } + + column_sizes[c] = rc->rc_size; + } + + if (zia_can_raidz(rr, props, raidn, rec, + cksum, column_sizes) != B_TRUE) { + kmem_free(column_sizes, column_sizes_size); + return (ZIA_FALLBACK); + } + kmem_free(column_sizes, column_sizes_size); + + void *provider = props->provider; + if (!provider) { + return (ZIA_FALLBACK); + } + + /* + * offload the source data if it hasn't already been offloaded + * + * need to lock here since offloading normally doesn't lock, but + * abds hitting raidz might have been mirrored + */ + const int ret = zia_offload_abd(provider, zio->io_abd, + zio->io_size, props->min_offload_size, local_offload, B_TRUE); + if (ret != ZIA_OK) { + return (ret); + } + + /* mirrored abds generate their own references to the columns */ + + /* set up raid context */ + rr->rr_zia_handle = dpusm->raid.alloc(provider, + raidn, rr->rr_cols - raidn); + + if (!rr->rr_zia_handle) { + return (ZIA_ERROR); + } + + /* fill in raid context */ + + /* create parity column handles */ + for (uint64_t c = 0; c < raidn; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + ASSERT(!ABD_HANDLE(rc->rc_abd)); + + void *handle = NULL; + + /* allocate rc->rc_abd->abd_size, mark as rc->rc_size */ + if (rec == B_TRUE) { + /* + * reconstructing - parity columns are not + * in zio->io_abd - offload rc->rc_abd + */ + zia_offload_abd(provider, 
rc->rc_abd, + rc->rc_abd->abd_size, props->min_offload_size, + NULL, B_FALSE); + handle = ABD_HANDLE(rc->rc_abd); + } else { + /* generating - create new columns */ + handle = + dpusm->alloc(provider, rc->rc_abd->abd_size); + } + + if (!handle) { + goto error; + } + + if (dpusm->raid.set_column(rr->rr_zia_handle, + c, handle, rc->rc_size) != DPUSM_OK) { + goto error; + } + + ABD_HANDLE(rc->rc_abd) = handle; + } + + /* + * recalculate data column offsets and + * create references for each column + */ + uint64_t offset = 0; + for (uint64_t c = raidn; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + /* + * if the column is a gang abd, the handle + * will point to the first child + */ + void *handle = dpusm->alloc_ref(ABD_HANDLE(zio->io_abd), + offset, rc->rc_size); + + if (!handle) { + goto error; + } + + if (dpusm->raid.set_column(rr->rr_zia_handle, + c, handle, rc->rc_size) != DPUSM_OK) { + goto error; + } + + ABD_HANDLE(rc->rc_abd) = handle; + offset += rc->rc_size; + } + + for (uint64_t c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT(zia_is_offloaded(rc->rc_abd) == B_TRUE); + } + ASSERT(rr->rr_zia_handle); + ASSERT(zia_is_offloaded(zio->io_abd) == B_TRUE); + + return (ZIA_OK); + +error: + zia_raidz_cleanup(zio, rr, local_offload?*local_offload:B_FALSE, + B_FALSE); + + for (uint64_t c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ASSERT(zia_is_offloaded(rc->rc_abd) == B_FALSE); + } + ASSERT(rr->rr_zia_handle == NULL); + ASSERT(zia_is_offloaded(zio->io_abd) == B_FALSE); + + return (ZIA_ERROR); +#else + (void) zio; (void) rr; (void) rec; + (void) cksum; (void) local_offload; + return (ZIA_FALLBACK); +#endif + +} + +/* + * only frees the raidz data + * onload the data separately if it is needed + */ +int +zia_raidz_free(raidz_row_t *rr, boolean_t onload_parity) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + if (!rr) { + return (ZIA_ERROR); + } + + if (!rr->rr_zia_handle) { + return (ZIA_FALLBACK); + } + + int ret = ZIA_OK; + uint64_t c = 0; + + if (onload_parity == B_TRUE) { + for (; c < rr->rr_firstdatacol; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ret = zia_worst_error(ret, + zia_onload_abd(rc->rc_abd, + rc->rc_size, B_FALSE)); + } + } + + for (; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + ret = zia_worst_error(ret, + zia_free_abd(rc->rc_abd, B_FALSE)); + } + + ret = zia_worst_error(ret, + dpusm_to_ret(dpusm->raid.free( + rr->rr_zia_handle))); + rr->rr_zia_handle = NULL; + + return (ret); +#else + (void) rr; (void) onload_parity; + return (ZIA_FALLBACK); +#endif +} + +int +zia_raidz_gen(raidz_row_t *rr) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* can only pass if raidz_alloc succeeded */ + if (!rr->rr_zia_handle) { + return (ZIA_ERROR); + } + + return (dpusm_to_ret(dpusm->raid.gen(rr->rr_zia_handle))); +#else + (void) rr; + return (ZIA_FALLBACK); +#endif +} + +int +zia_raidz_gen_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload) +{ + /* + * RAIDZ generation only calls cleanup + * on failure, so parity does not need + * to be brought back. 
+ */ + return (zia_raidz_cleanup(zio, rr, + local_offload, B_FALSE)); +} + +/* + * allocate new parity columns for this row + * and assign them to the raidz struct + * + * orig takes ownership of the original handles + */ +int +zia_raidz_new_parity(zio_t *zio, raidz_row_t *rr, uint64_t c) +{ +#ifdef ZIA + if (!zio || !rr || (c >= rr->rr_firstdatacol)) { + return (ZIA_ERROR); + } + + if (!ABD_HANDLE(zio->io_abd) || !rr->rr_zia_handle) { + return (ZIA_FALLBACK); + } + + zia_props_t *props = zia_get_props(zio->io_spa); + void *provider = props->provider; + if (!provider) { + return (ZIA_FALLBACK); + } + + raidz_col_t *rc = &rr->rr_col[c]; + if (ABD_HANDLE(rc->rc_abd)) { + return (ZIA_ERROR); + } + + void *new_parity_handle = zia_alloc(provider, + rc->rc_abd->abd_size, props->min_offload_size); + if (!new_parity_handle) { + return (ZIA_ERROR); + } + + const int ret = dpusm->raid.set_column(rr->rr_zia_handle, + c, new_parity_handle, rc->rc_size); + if (ret == DPUSM_OK) { + ABD_HANDLE(rc->rc_abd) = new_parity_handle; + } else { + zia_free(&new_parity_handle); + } + + return (dpusm_to_ret(ret)); +#else + (void) zio; (void) rr; (void) c; + return (ZIA_FALLBACK); +#endif +} + +int +zia_raidz_cmp(abd_t *lhs, abd_t *rhs, int *diff) +{ +#ifdef ZIA + if (!lhs || !rhs || !diff) { + return (ZIA_ERROR); + } + + if (lhs == rhs) { + *diff = 0; + return (ZIA_OK); + } + + void *lhs_handle = ABD_HANDLE(lhs); + void *rhs_handle = ABD_HANDLE(rhs); + if (!lhs_handle || !rhs_handle) { + return (ZIA_ERROR); + } + + return (dpusm_to_ret(dpusm->raid.cmp(lhs_handle, rhs_handle, diff))); +#else + (void) lhs; (void) rhs; (void) diff; + return (ZIA_FALLBACK); +#endif +} + +int +zia_raidz_rec(raidz_row_t *rr, int *t, int nt) +{ +#ifdef ZIA + if (!dpusm) { + return (ZIA_FALLBACK); + } + + /* can only pass if raidz_alloc succeeded */ + if (!rr->rr_zia_handle) { + return (ZIA_FALLBACK); + } + + return (dpusm_to_ret(zia_raidz_rec_impl(dpusm, rr, t, nt))); +#else + (void) rr; (void) t; (void) nt; + return (ZIA_FALLBACK); +#endif +} + +int +zia_raidz_rec_cleanup(zio_t *zio, raidz_row_t *rr, + boolean_t local_offload, boolean_t onload_parity) +{ + return (zia_raidz_cleanup(zio, rr, + local_offload, onload_parity)); +} + +int +zia_file_open(vdev_t *vdev, const char *path, + int flags, int mode) +{ + if (!vdev || !vdev->vdev_spa) { + return (ZIA_ERROR); + } + + zia_props_t *zia_props = zia_get_props(vdev->vdev_spa); + if (zia_props->file_write != 1) { + return (ZIA_FALLBACK); + } + +#ifdef ZIA + void *provider = zia_props->provider; + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + if (!VDEV_HANDLE(vdev)) { + VDEV_HANDLE(vdev) = dpusm->file.open(provider, + path, flags, mode); + } + + return (VDEV_HANDLE(vdev)?ZIA_OK:ZIA_ERROR); +#else + (void) path; (void) flags; (void) mode; + return (ZIA_FALLBACK); +#endif +} + +int +zia_file_write(vdev_t *vdev, abd_t *abd, ssize_t size, + loff_t offset, ssize_t *resid, int *err) +{ +#ifdef ZIA + if (!vdev || !abd) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev) || !ABD_HANDLE(abd)) { + return (ZIA_FALLBACK); + } + + if (!abd_is_linear(abd)) { + return (EIO); + } + + /* + * this was intended to handle gang abds, but breaking + * at first zero child abd was not correct + */ + size_t data_size = size; + size_t trailing_zeros = 0; + + return (dpusm->file.write(VDEV_HANDLE(vdev), + ABD_HANDLE(abd), data_size, trailing_zeros, offset, resid, err)); +#else + (void) vdev; (void) abd; (void) size; + (void) offset; (void) resid; (void) err; + return (ZIA_FALLBACK); +#endif +} 
+ +int +zia_file_close(vdev_t *vdev) +{ +#ifdef ZIA + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + dpusm->file.close(VDEV_HANDLE(vdev)); + VDEV_HANDLE(vdev) = NULL; + zia_get_props(vdev->vdev_spa)->min_offload_size = 0; + + return (ZIA_OK); +#else + (void) vdev; + return (ZIA_FALLBACK); +#endif +} + +#ifdef __linux__ +#ifdef _KERNEL +int +zia_disk_open(vdev_t *vdev, const char *path, + struct block_device *bdev) +{ +#ifdef ZIA + if (!vdev || !vdev->vdev_spa) { + return (ZIA_ERROR); + } + + void *provider = zia_get_props(vdev->vdev_spa)->provider; + if (!dpusm || !provider) { + return (ZIA_FALLBACK); + } + + if (!VDEV_HANDLE(vdev)) { + VDEV_HANDLE(vdev) = dpusm->disk.open(provider, + path, bdev); + } + + return (VDEV_HANDLE(vdev)?ZIA_OK:ZIA_ERROR); +#else + (void) vdev; (void) path; (void) bdev; + return (ZIA_FALLBACK); +#endif +} + +int +zia_disk_invalidate(vdev_t *vdev) +{ +#ifdef ZIA + if (!vdev) { + return (ZIA_ERROR); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (ZIA_FALLBACK); + } + + return (dpusm_to_ret(dpusm->disk.invalidate(VDEV_HANDLE(vdev)))); +#else + (void) vdev; + return (ZIA_FALLBACK); +#endif +} + +int +zia_disk_write(vdev_t *vdev, zio_t *zio, size_t io_size, + uint64_t io_offset, int flags) +{ +#ifdef ZIA + if (!vdev || !zio || !zio->io_abd) { + return (EIO); + } + + if (!dpusm || !VDEV_HANDLE(vdev) || !ABD_HANDLE(zio->io_abd)) { + return (EIO); + } + + if (!abd_is_linear(zio->io_abd)) { + return (EIO); + } + + /* + * this was intended to handle gang abds, but breaking + * at first zero child abd was not correct + */ + size_t data_size = io_size; + size_t trailing_zeros = 0; + + /* returns E errors */ + return (dpusm->disk.write(VDEV_HANDLE(vdev), ABD_HANDLE(zio->io_abd), + data_size, trailing_zeros, io_offset, flags, + zia_disk_write_completion, zio)); +#else + (void) vdev; (void) zio; (void) io_size; + (void) io_offset; (void) flags; + return (ZIA_FALLBACK); +#endif +} + +int +zia_disk_flush(vdev_t *vdev, zio_t *zio) +{ +#ifdef ZIA + if (!vdev || !zio) { + return (EIO); + } + + if (!dpusm || !VDEV_HANDLE(vdev)) { + return (EIO); + } + + return (dpusm->disk.flush(VDEV_HANDLE(vdev), + zia_disk_flush_completion, zio)); +#else + (void) vdev; (void) zio; + return (EIO); +#endif +} + +int +zia_disk_close(vdev_t *vdev) +{ +#ifdef ZIA + if (!vdev) { + return (ZIA_ERROR); + } + + void *handle = VDEV_HANDLE(vdev); + VDEV_HANDLE(vdev) = NULL; + + zia_get_props(vdev->vdev_spa)->min_offload_size = 0; + + if (!dpusm || !handle) { + return (ZIA_FALLBACK); + } + + /* trust that ZFS handles closing disks once */ + dpusm->disk.close(handle); + + return (ZIA_OK); +#else + (void) vdev; + return (ZIA_FALLBACK); +#endif +} +#endif +#endif diff --git a/module/zfs/zia_cddl.c b/module/zfs/zia_cddl.c new file mode 100644 index 000000000000..557f240050aa --- /dev/null +++ b/module/zfs/zia_cddl.c @@ -0,0 +1,209 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifdef ZIA + +#include +#include +#include +#include +#include +#include +#include + +/* basically a duplicate of zio_compress_data */ +int +zia_compress_impl(const dpusm_uf_t *dpusm, zia_props_t *props, + enum zio_compress c, abd_t *src, size_t s_len, + void **cbuf_handle, uint64_t *c_len, + uint8_t level, boolean_t *local_offload) +{ + size_t d_len; + uint8_t complevel; + zio_compress_info_t *ci = &zio_compress_table[c]; + int ret = ZIA_OK; + + ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + + /* + * If the data is all zeros, we don't even need to allocate + * a block for it. We indicate this by returning zero size. + */ + if (!ABD_HANDLE(src)) { + /* check in-memory buffer for zeros */ + if (abd_iterate_func(src, 0, s_len, + zio_compress_zeroed_cb, NULL) == 0) { + *c_len = 0; + return (ZIA_OK); + } + + if (c == ZIO_COMPRESS_EMPTY) { + *c_len = s_len; + return (ZIA_OK); + } + + /* check that compression can be done before offloading */ + dpusm_pc_t *caps = NULL; + if ((zia_get_capabilities(props->provider, &caps) != ZIA_OK) || + !(caps->compress & compress_to_dpusm(c))) { + return (ZIA_FALLBACK); + } + + ret = zia_offload_abd(props->provider, src, s_len, + props->min_offload_size, local_offload, B_FALSE); + if (ret != ZIA_OK) { + return (ret); + } + } else { + /* came in offloaded */ + void *old_provider = dpusm->extract(ABD_HANDLE(src)); + if (old_provider != props->provider) { + return (ZIA_PROVIDER_MISMATCH); + } + + /* use provider to check for zero buffer */ + ret = dpusm->all_zeros(ABD_HANDLE(src), 0, s_len); + if (ret == DPUSM_OK) { + *c_len = 0; + return (ZIA_OK); + } else if (ret != DPUSM_BAD_RESULT) { + return (dpusm_to_ret(ret)); + } + + if (c == ZIO_COMPRESS_EMPTY) { + *c_len = s_len; + return (ZIA_OK); + } + + dpusm_pc_t *caps = NULL; + ret = zia_get_capabilities(props->provider, &caps); + if (ret != ZIA_OK) { + return (ret); + } + + if (!(caps->compress & compress_to_dpusm(c))) { + return (ZIA_FALLBACK); + } + } + + /* Compress at least 12.5% */ + d_len = s_len - (s_len >> 3); + + complevel = ci->ci_level; + + if (c == ZIO_COMPRESS_ZSTD) { + /* If we don't know the level, we can't compress it */ + if (level == ZIO_COMPLEVEL_INHERIT) { + *c_len = s_len; + return (ZIA_OK); + } + + if (level == ZIO_COMPLEVEL_DEFAULT) + complevel = ZIO_ZSTD_LEVEL_DEFAULT; + else + complevel = level; + + ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); + } + + /* nothing to offload, so just allocate space */ + *cbuf_handle = zia_alloc(props->provider, + s_len, props->min_offload_size); + if (!*cbuf_handle) { + return (ZIA_ERROR); + } + + /* DPUSM interface takes in a size_t, not a uint64_t */ + size_t zia_c_len = (size_t)s_len; + ret = dpusm->compress(compress_to_dpusm(c), (int8_t)level, + ABD_HANDLE(src), s_len, *cbuf_handle, &zia_c_len); + if (ret != DPUSM_OK) { + zia_free(cbuf_handle); + return (dpusm_to_ret(ret)); + } + + *c_len = zia_c_len; + + /* + * Return ZIA_OK because this is not an error - it just didn't + * compress well. The data will be dropped later on (instead of + * onloaded) because c_len is too big. 
+ */ + if (*c_len > d_len) { + *c_len = s_len; + } + + return (ZIA_OK); +} + +int +zia_raidz_rec_impl(const dpusm_uf_t *dpusm, + raidz_row_t *rr, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY]; + int ntgts = 0; + for (int i = 0, c = 0; c < rr->rr_cols; c++) { + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rr->rr_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } + } + + ASSERT(ntgts >= nt); + + return (dpusm->raid.rec(rr->rr_zia_handle, + tgts, ntgts)); +} + +#ifdef _KERNEL +/* called by provider */ +void +zia_disk_write_completion(void *zio_ptr, int error) +{ + zio_t *zio = (zio_t *)zio_ptr; + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); +} + +/* called by provider */ +void +zia_disk_flush_completion(void *zio_ptr, int error) +{ + zio_t *zio = (zio_t *)zio_ptr; + + if (zio->io_error && (zio->io_error == EOPNOTSUPP)) + zio->io_vd->vdev_nowritecache = B_TRUE; + + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} +#endif /* _KERNEL */ + +#endif /* ZIA */ diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d68d5ababe79..64d421586d15 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -51,6 +51,7 @@ #include #include #include +#include #include /* @@ -436,12 +437,12 @@ zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, zio->io_size = size; } -void -zio_pop_transforms(zio_t *zio) +zio_transform_t * +zio_pop_transform(zio_t *zio) { - zio_transform_t *zt; + zio_transform_t *zt = zio->io_transform_stack; - while ((zt = zio->io_transform_stack) != NULL) { + if (zt != NULL) { if (zt->zt_transform != NULL) zt->zt_transform(zio, zt->zt_orig_abd, zt->zt_orig_size); @@ -455,6 +456,15 @@ zio_pop_transforms(zio_t *zio) kmem_free(zt, sizeof (zio_transform_t)); } + + return (zt); +} + +void +zio_pop_transforms(zio_t *zio) +{ + while (zio_pop_transform(zio)) { + } } /* @@ -475,11 +485,42 @@ static void zio_decompress(zio_t *zio, abd_t *data, uint64_t size) { if (zio->io_error == 0) { - void *tmp = abd_borrow_buf(data, size); - int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_abd, tmp, zio->io_size, size, - &zio->io_prop.zp_complevel); - abd_return_buf_copy(data, tmp, size); + int ret = ZIA_FALLBACK; + zia_props_t *zia_props = zia_get_props(zio->io_spa); + if ((zia_props->decompress == 1) && + (zio->io_can_offload == B_TRUE)) { + ret = zia_decompress(zia_props, + BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, zio->io_size, + data, size, + &zio->io_prop.zp_complevel); + } + + if (ret == ZIA_OK) { + ASSERT(zia_is_offloaded(zio->io_abd) == B_TRUE); + /* + * bring data back into memory since there + * are no subsequent offloaded stages + */ + ret = zia_onload_abd(data, size, B_FALSE); + } + + ASSERT(zia_is_offloaded(data) != B_TRUE); + /* let abd_free clean up zio->io_abd */ + + if (ret == ZIA_OK) { + ret = 0; + } else { + if (ret == ZIA_ACCELERATOR_DOWN) { + zia_disable_offloading(zio, B_FALSE); + } + + void *tmp = abd_borrow_buf(data, size); + ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, tmp, zio->io_size, size, + &zio->io_prop.zp_complevel); + abd_return_buf_copy(data, tmp, size); + } if (zio_injection_enabled && ret == 0) ret = zio_handle_fault_injection(zio, EINVAL); @@ -790,6 +831,11 @@ zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) *errorp = zio_worst_error(*errorp, zio->io_error); 
pio->io_reexecute |= zio->io_reexecute; + if ((zio->io_flags & ZIO_FLAG_ZIA_REEXECUTE) && + (zio->io_can_offload != B_TRUE)) { + pio->io_flags |= ZIO_FLAG_ZIA_REEXECUTE; + pio->io_can_offload = B_FALSE; + } ASSERT3U(*countp, >, 0); (*countp)--; @@ -842,6 +888,10 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) { if (zio->io_child_error[c] != 0 && zio->io_error == 0) zio->io_error = zio->io_child_error[c]; + + if (zio->io_flags & ZIO_FLAG_ZIA_REEXECUTE) { + zio->io_can_offload = B_FALSE; + } } int @@ -963,7 +1013,13 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, if (zb != NULL) zio->io_bookmark = *zb; + zio->io_can_offload = zia_get_props(spa)->can_offload; + if (pio != NULL) { + if ((pio->io_flags & ZIO_FLAG_ZIA_REEXECUTE) || + (pio->io_can_offload != B_TRUE)) { + zio->io_can_offload = B_FALSE; + } zio->io_metaslab_class = pio->io_metaslab_class; if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; @@ -972,6 +1028,13 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_add_child_first(pio, zio); } + /* turn off encryption and dedup if Z.I.A. is used */ + if (zia_is_used(zio) == B_TRUE) { + zio->io_prop.zp_dedup = B_FALSE; + zio->io_prop.zp_dedup_verify = B_FALSE; + zio->io_prop.zp_encrypt = B_FALSE; + } + taskq_init_ent(&zio->io_tqent); return (zio); @@ -1850,18 +1913,85 @@ zio_write_compress(zio_t *zio) if (compress != ZIO_COMPRESS_OFF && !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = NULL; - psize = zio_compress_data(compress, zio->io_abd, &cbuf, lsize, - zp->zp_complevel); + int zia_rc = ZIA_FALLBACK; + void *cbuf_handle = NULL; /* only valid if zia_rc == ZIA_OK */ + zia_props_t *zia_props = zia_get_props(spa); + boolean_t local_offload = B_FALSE; + if ((zia_props->compress == 1) && + (zio->io_can_offload == B_TRUE)) { + zia_rc = zia_compress(zia_props, compress, + zio->io_abd, lsize, &cbuf_handle, &psize, + zp->zp_complevel, &local_offload); + } + + if (zia_rc != ZIA_OK) { + ASSERT(cbuf_handle == NULL); + zia_rc = zia_cleanup_abd(zio->io_abd, + lsize, local_offload, B_FALSE); + + /* + * if data has to be brought back for cpu compression, + * but could not, restart the pipeline for this zio + * (not necessary in this case, but still doing it here + * in case a previous stage is offloaded) + */ + if (zia_rc == ZIA_ACCELERATOR_DOWN) { + zia_restart_before_vdev(zio); + return (zio); + } + psize = zio_compress_data(compress, zio->io_abd, &cbuf, + lsize, zp->zp_complevel); + } + if (psize == 0) { compress = ZIO_COMPRESS_OFF; + ASSERT0(cbuf_handle); } else if (psize >= lsize) { compress = ZIO_COMPRESS_OFF; if (cbuf != NULL) zio_buf_free(cbuf, lsize); + /* + * no need for offloaded + * compressed buffer any more + * + * catch accelerator failures elsewhere + */ + zia_free(&cbuf_handle); + + /* source abd is still offloaded */ } else if (!zp->zp_dedup && !zp->zp_encrypt && psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { + + /* + * compressed enough, but not handling embedded + * data, so move compressed data back into memory + */ + if (cbuf_handle) { + cbuf = zio_buf_alloc(lsize); + } + + if (zia_onload(&cbuf_handle, cbuf, + psize) == ZIA_ACCELERATOR_DOWN) { + zia_free_abd(zio->io_abd, B_FALSE); + zio_buf_free(cbuf, lsize); + zia_restart_before_vdev(zio); + return (zio); + } + + /* + * remove offloaded source abd + * + * in-memory copy should still be valid, but calling + * zia_cleanup_abd just in case + */ + if 
(zia_cleanup_abd(zio->io_abd, lsize, + local_offload, B_FALSE) == ZIA_ACCELERATOR_DOWN) { + zio_buf_free(cbuf, lsize); + zia_restart_before_vdev(zio); + return (zio); + } encode_embedded_bp_compressed(bp, cbuf, compress, lsize, psize); BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); @@ -1886,15 +2016,68 @@ zio_write_compress(zio_t *zio) psize); if (rounded >= lsize) { compress = ZIO_COMPRESS_OFF; - zio_buf_free(cbuf, lsize); + if (cbuf_handle) { + /* + * catch accelerator + * down elsewhere + */ + zia_free(&cbuf_handle); + } else { + zio_buf_free(cbuf, lsize); + } psize = lsize; } else { + /* abd_get_from_buf must not get a NULL */ + if (cbuf_handle) { + cbuf = zio_buf_alloc(lsize); + } abd_t *cdata = abd_get_from_buf(cbuf, lsize); abd_take_ownership_of_buf(cdata, B_TRUE); + if (cbuf_handle) { + /* + * zio->io_abd offload no longer needed + * but don't free here - let abd_free + * handle it + */ + + /* + * compressed enough, so associate the + * compressed buffer with the abd + */ + zia_move_into_abd(cdata, &cbuf_handle); + zia_rc = zia_zero_fill(cdata, psize, + rounded - psize); + if (zia_rc != ZIA_OK) { + /* + * if setting cdata's handle + * fails, onload the compressed + * buffer (automatically placing + * it into cdata) and continue + * using zfs + * + * if cbuf is not offloaded, + * nothing happens + */ + zia_rc = zia_onload( + &cbuf_handle, cbuf, lsize); + } + + if (zia_rc == ZIA_ACCELERATOR_DOWN) { + zia_free(&cbuf_handle); + zia_free_abd(zio->io_abd, + B_FALSE); + zia_restart_before_vdev(zio); + return (zio); + } + } abd_zero_off(cdata, psize, rounded - psize); psize = rounded; zio_push_transform(zio, cdata, psize, lsize, NULL); + if (zia_is_offloaded(zio->io_abd)) { + zio->io_flags |= + ZIO_FLAG_DONT_AGGREGATE; + } } } @@ -3971,6 +4154,13 @@ zio_vdev_io_start(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) { abd_copy(abuf, zio->io_abd, zio->io_size); abd_zero_off(abuf, zio->io_size, asize - zio->io_size); + /* + * The Z.I.A. handles of the abds that come here + * were not modified and do not get associated with + * abuf during the transform. Instead of dropping + * the handle and delaying here, let abd_free clean + * it up later. + */ } zio_push_transform(zio, abuf, asize, asize, zio_subblock); } @@ -4175,6 +4365,8 @@ zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr) { void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); + zia_onload_abd(zio->io_abd, zio->io_size, B_FALSE); + abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; @@ -4209,7 +4401,9 @@ zio_vdev_io_assess(zio_t *zio) * On retry, we cut in line in the issue queue, since we don't want * compression/checksumming/etc. work to prevent our (cheap) IO reissue. */ - if (zio->io_error && vd == NULL && + if (zio->io_error && + !(zio->io_flags & ZIO_FLAG_ZIA_REEXECUTE) && + vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ @@ -4836,6 +5030,8 @@ zio_done(zio_t *zio) } if (zio->io_error) { + ASSERT(!(zio->io_flags & ZIO_FLAG_ZIA_REEXECUTE)); + /* * If this I/O is attached to a particular vdev, * generate an error message describing the I/O failure @@ -4870,7 +5066,10 @@ zio_done(zio_t *zio) } } - if (zio->io_error && zio == zio->io_logical) { + if ((zio->io_error || + (zio->io_flags & ZIO_FLAG_ZIA_REEXECUTE) || + 0) && + zio == zio->io_logical) { /* * Determine whether zio should be reexecuted. 
This will * propagate all the way to the root via zio_notify_parent(). @@ -5241,6 +5440,7 @@ EXPORT_SYMBOL(zio_buf_alloc); EXPORT_SYMBOL(zio_data_buf_alloc); EXPORT_SYMBOL(zio_buf_free); EXPORT_SYMBOL(zio_data_buf_free); +EXPORT_SYMBOL(zio_push_transform); ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, "Max I/O completion time (milliseconds) before marking it as slow"); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index ce6772a40c8b..73196b0c24e3 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -31,6 +31,7 @@ #include #include #include +#include #include /* @@ -357,6 +358,13 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zio_eck_t eck; size_t eck_offset; + /* not handling embedded checksums, so bring back data */ + const int zia_rc = zia_cleanup_abd(abd, size, B_FALSE, B_FALSE); + if (zia_rc == ZIA_ACCELERATOR_DOWN) { + zia_restart_before_vdev(zio); + return; + } + memset(&saved, 0, sizeof (zio_cksum_t)); if (checksum == ZIO_CHECKSUM_ZILOG2) { @@ -403,8 +411,31 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, sizeof (zio_cksum_t)); } else { saved = bp->blk_cksum; + + int zia_rc = ZIA_FALLBACK; + + /* only offload non-embedded checksums */ + boolean_t local_offload = B_FALSE; + zia_props_t *zia_props = zia_get_props(spa); + if ((zia_props->checksum == 1) && + (zio->io_can_offload == B_TRUE)) { + zia_rc = zia_checksum_compute(zia_props->provider, + &cksum, checksum, zio, size, &local_offload); + } + + /* fall back to ZFS implementation */ + if (zia_rc != ZIA_OK) { + zia_rc = zia_cleanup_abd(abd, size, local_offload, + B_FALSE); + if (zia_rc == ZIA_ACCELERATOR_DOWN) { + zia_restart_before_vdev(zio); + return; + } ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); + } else { + zio->io_flags |= ZIO_FLAG_DONT_AGGREGATE; + } if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_checksum_handle_crypt(&cksum, &saved, insecure); bp->blk_cksum = cksum; @@ -433,6 +464,12 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, zio_cksum_t verifier; size_t eck_offset; + /* not handling embedded checksums, so bring back data */ + const int zia_rc = zia_cleanup_abd(abd, size, B_FALSE, B_FALSE); + if (zia_rc == ZIA_ACCELERATOR_DOWN) { + return (zia_rc); + } + if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; uint64_t nused; @@ -494,8 +531,25 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; - ci->ci_func[byteswap](abd, size, - spa->spa_cksum_tmpls[checksum], &actual_cksum); + + zia_props_t *zia_props = zia_get_props(spa); + int error = ZIA_FALLBACK; + if ((zia_props->can_offload == B_TRUE) && + (zia_props->checksum == 1)) { + error = zia_checksum_error(checksum, abd, size, + byteswap, &actual_cksum); + } + + /* fall back to ZFS implementation */ + if ((error != ZIA_OK) && (error != ECKSUM)) { + /* data was modified by reconstruction */ + error = zia_onload_abd(abd, size, B_FALSE); + if (error == ZIA_ACCELERATOR_DOWN) { + return (error); + } + ci->ci_func[byteswap](abd, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); + } } /* diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index c8a10db7483b..e09a2afd513a 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -111,7 +111,7 @@ zio_compress_select(spa_t *spa, enum zio_compress child, return (result); } -static int +int zio_compress_zeroed_cb(void *data, size_t len, void *private) { (void) private; diff --git 
a/module/zia-software-provider/kernel_offloader.c b/module/zia-software-provider/kernel_offloader.c new file mode 100644 index 000000000000..eab9b1598fcf --- /dev/null +++ b/module/zia-software-provider/kernel_offloader.c @@ -0,0 +1,921 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernel_offloader.h" + +static const char NAME[] = "Kernel Offloader"; +static const size_t NAME_LEN = sizeof (NAME); + +typedef enum kernel_offloader_handle_type { + KOH_REAL, /* default type - convert all data into a single blob */ + KOH_REFERENCE, + + KOH_INVALID, +} koht_t; + +/* offloaded data (not defined outside of "hardware") */ +typedef struct kernel_offloader_handle { + koht_t type; + void *ptr; + size_t size; +} koh_t; + +/* **************************************** */ +/* memory bookkeeping */ +rwlock_t rwlock; /* atomic ints are not big enough */ + +/* never decreases */ +static size_t total_count; /* number of times alloc/alloc_ref was called */ +static size_t total_size; /* buffer size */ +static size_t total_actual; /* buffer size + any extra memory */ + +/* currently active */ +static size_t active_count; /* number of times alloc/alloc_ref was called */ +static size_t active_size; /* buffer size */ +static size_t active_actual; /* buffer size + any extra memory */ +/* **************************************** */ + +/* **************************************** */ +/* set kernel offloader to DOWN state */ +typedef struct kernel_offloader_down { + rwlock_t rwlock; + int count; + int max; + int printed; +} kod_t; + +#define kod_init(name, max_val) \ + do { \ + rwlock_init(&name.rwlock); \ + name.count = 0; \ + name.max = max_val; \ + name.printed = 0; \ + } while (0) + +#define kod_inc(name) \ + do { \ + write_lock(&name.rwlock); \ + name.count++; \ + write_unlock(&name.rwlock); \ + } while (0) + +#define kod_ret(name) \ + do { \ + if (name.max) { \ + write_lock(&name.rwlock); \ + if (name.count > name.max) { \ + if (!name.printed) { \ + printk("%s\n", #name); \ + name.printed = 1; \ + } \ + write_unlock(&name.rwlock); \ + return (DPUSM_PROVIDER_INVALIDATED); \ + } \ + write_unlock(&name.rwlock); \ + } \ + } while (0) + +#define kod_run(name) \ + do { \ + kod_inc(name); \ + kod_ret(name); \ + } while (0) + +/* can probably do with macros */ +static kod_t copy_from_generic_down; static int copy_from_generic_down_max = 0; +module_param(copy_from_generic_down_max, int, 0660); + +static kod_t copy_to_generic_down; static int copy_to_generic_down_max = 0; +module_param(copy_to_generic_down_max, int, 0660); + +static kod_t cmp_down; static int cmp_down_max = 0; +module_param(cmp_down_max, int, 0660); + +static kod_t compress_down; static int compress_down_max = 0; +module_param(compress_down_max, int, 0660); + +static kod_t checksum_down; static int checksum_down_max = 0; +module_param(checksum_down_max, int, 0660); + +static kod_t raidz_gen_down; static int raidz_gen_down_max = 0; +module_param(raidz_gen_down_max, int, 0660); + +static kod_t raidz_rec_down; static int raidz_rec_down_max = 0; +module_param(raidz_rec_down_max, int, 0660); + +static kod_t disk_write_down; static int disk_write_down_max = 0; +module_param(disk_write_down_max, int, 0660); +/* **************************************** */ + +/* + * value used to swizzle the pointer so that + * dereferencing the handle will fail + */ +static void *mask = NULL; +void +kernel_offloader_init(void) +{ + get_random_bytes(&mask, sizeof (mask)); + rwlock_init(&rwlock); + total_count = 0; + total_size = 0; + total_actual = 0; + active_count = 0; + active_size = 0; + active_actual = 0; + + kod_init(copy_from_generic_down, copy_from_generic_down_max); + kod_init(copy_to_generic_down, 
copy_to_generic_down_max); + kod_init(cmp_down, cmp_down_max); + kod_init(compress_down, compress_down_max); + kod_init(checksum_down, checksum_down_max); + kod_init(raidz_gen_down, raidz_gen_down_max); + kod_init(raidz_rec_down, raidz_rec_down_max); + kod_init(disk_write_down, disk_write_down_max); + + printk("kernel offloader init: %p\n", mask); +} + +void +kernel_offloader_fini(void) +{ + mask = NULL; + + printk("kernel offloader fini with " + "%zu/%zu (actual %zu/%zu) bytes " + "in %zu/%zu allocations remaining\n", + active_size, total_size, + active_actual, total_actual, + active_count, total_count); +} + +/* get a starting address of a linear koh_t */ +static void * +ptr_start(koh_t *koh, size_t offset) +{ + return (void *)(((uintptr_t)koh->ptr) + offset); +} + +/* + * convert the actual pointer to a handle (pretend + * the data is not accessible from the Z.I.A. base) + */ +static void * +swizzle(void *ptr) +{ + return (ptr?((void *)(((uintptr_t)ptr) ^ ((uintptr_t)mask))):NULL); +} + +/* convert the handle to a usable pointer */ +static void * +unswizzle(void *handle) +{ + return (swizzle(handle)); +} + +static koh_t * +koh_alloc(size_t size) +{ + koh_t *koh = kmalloc(sizeof (koh_t), GFP_KERNEL); + if (koh) { + koh->type = KOH_REAL; + koh->ptr = kmalloc(size, GFP_KERNEL); + koh->size = size; + + write_lock(&rwlock); + total_count++; + active_count++; + + /* the allocation itself */ + total_size += size; + active_size += size; + total_actual += size; + active_actual += size; + + /* the wrapper struct */ + total_actual += sizeof (koh_t); + active_actual += sizeof (koh_t); + + write_unlock(&rwlock); + } + + return (koh); +} + +static koh_t * +koh_alloc_ref(koh_t *src, size_t offset, size_t size) +{ + koh_t *ref = NULL; + if (src) { + koh_t *src_koh = (koh_t *)src; + + if ((offset + size) > src_koh->size) { + printk("Error: Cannot reference handle of size %zu " + "starting at offset %zu with size %zu\n", + src_koh->size, offset, size); + return (NULL); + } + + ref = kmalloc(sizeof (koh_t), GFP_KERNEL); + if (ref) { + ref->type = KOH_REFERENCE; + ref->ptr = ptr_start(src, offset); + ref->size = size; + + write_lock(&rwlock); + total_count++; + active_count++; + + /* no new requested space */ + + /* the wrapper struct */ + total_actual += sizeof (koh_t); + active_actual += sizeof (koh_t); + write_unlock(&rwlock); + } + } + + return (ref); +} + +int +kernel_offloader_get_size(void *handle, size_t *size, size_t *actual) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + + if (size) { + *size = koh->size; + } + + if (actual) { + *actual = koh->size; + } + + return (KERNEL_OFFLOADER_OK); +} + +static int +koh_free(koh_t *koh) +{ + if (koh) { + write_lock(&rwlock); + switch (koh->type) { + case KOH_REAL: + /* the allocation itself */ + active_size -= koh->size; + active_actual -= koh->size; + kfree(koh->ptr); + break; + case KOH_REFERENCE: + case KOH_INVALID: + default: + break; + } + + /* the wrapper struct */ + active_actual -= sizeof (koh_t); + + active_count--; + write_unlock(&rwlock); + + kfree(koh); + } + + return (KERNEL_OFFLOADER_OK); +} + +void * +kernel_offloader_alloc(size_t size) +{ + return (swizzle(koh_alloc(size))); +} + +void * +kernel_offloader_alloc_ref(void *src_handle, size_t offset, size_t size) +{ + return swizzle(koh_alloc_ref(unswizzle(src_handle), + offset, size)); +} + +int +kernel_offloader_free(void *handle) +{ + koh_free(unswizzle(handle)); + return (DPUSM_OK); +} + +int +kernel_offloader_copy_from_generic(void *handle, size_t offset, + const void *src, size_t size) +{ 
+ koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((offset + size) > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + kod_run(copy_from_generic_down); + + void *dst = ptr_start(koh, offset); + if (memcpy(dst, src, size) != dst) { + return (KERNEL_OFFLOADER_ERROR); + } + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_copy_to_generic(void *handle, size_t offset, + void *dst, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + if ((offset + size) > koh->size) { + return (KERNEL_OFFLOADER_ERROR); + } + + kod_run(copy_to_generic_down); + + if (memcpy(dst, ptr_start(koh, offset), size) != dst) { + return (KERNEL_OFFLOADER_ERROR); + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_cmp(void *lhs_handle, void *rhs_handle, int *diff) +{ + koh_t *lhs = (koh_t *)unswizzle(lhs_handle); + koh_t *rhs = (koh_t *)unswizzle(rhs_handle); + + if (!lhs || !rhs || !diff) { + return (KERNEL_OFFLOADER_ERROR); + } + + kod_run(cmp_down); + + size_t len = rhs->size; + if (lhs->size != rhs->size) { + len = + (lhs->size < rhs->size)?lhs->size:rhs->size; + } + + *diff = memcmp(ptr_start(lhs, 0), + ptr_start(rhs, 0), len); + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_zero_fill(void *handle, size_t offset, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + memset(ptr_start(koh, offset), 0, size); + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_all_zeros(void *handle, size_t offset, size_t size) +{ + koh_t *koh = (koh_t *)unswizzle(handle); + if (koh->size - offset < size) { + return (KERNEL_OFFLOADER_ERROR); + } + + uint64_t *array = ptr_start(koh, offset); + size_t i; + for (i = 0; i < size / sizeof (uint64_t); i++) { + if (array[i]) { + return (KERNEL_OFFLOADER_BAD_RESULT); + } + } + + char *remaining = ptr_start(koh, offset); + for (i *= sizeof (uint64_t); i < size; i++) { + if (remaining[i]) { + return (KERNEL_OFFLOADER_BAD_RESULT); + } + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_mem_stats( + void *t_count_handle, void *t_size_handle, void *t_actual_handle, + void *a_count_handle, void *a_size_handle, void *a_actual_handle) +{ + read_lock(&rwlock); + + if (t_count_handle) { + *(size_t *)ptr_start(t_count_handle, 0) = + total_count; + } + + if (t_size_handle) { + *(size_t *)ptr_start(t_size_handle, 0) = + total_size; + } + + if (t_actual_handle) { + *(size_t *)ptr_start(t_actual_handle, 0) = + total_actual; + } + + if (a_count_handle) { + *(size_t *)ptr_start(a_count_handle, 0) = + active_count; + } + + if (a_size_handle) { + *(size_t *)ptr_start(a_size_handle, 0) = + active_size; + } + + if (a_actual_handle) { + *(size_t *)ptr_start(a_actual_handle, 0) = + active_actual; + } + + read_unlock(&rwlock); + + return (KERNEL_OFFLOADER_OK); +} + +/* specific implementation */ +static int +kernel_offloader_gzip_compress(koh_t *src, size_t s_len, + koh_t *dst, size_t *d_len, int level) +{ + if (z_compress_level(ptr_start(dst, 0), d_len, + ptr_start(src, 0), s_len, level) != Z_OK) { + if (*d_len != src->size) { + return (KERNEL_OFFLOADER_ERROR); + } + return (KERNEL_OFFLOADER_OK); + } + + return (KERNEL_OFFLOADER_OK); +} + +static int +kernel_offloader_lz4_compress(koh_t *src, koh_t *dst, + size_t s_len, int level, size_t *c_len) +{ + *c_len = dst->size; + + if (lz4_compress_zfs(ptr_start(src, 0), ptr_start(dst, 0), + s_len, *c_len, level) == s_len) { + return (KERNEL_OFFLOADER_ERROR); + } + + return 
(KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_compress(dpusm_compress_t alg, int level, + void *src, size_t s_len, void *dst, void *d_len) +{ + int status = KERNEL_OFFLOADER_UNAVAILABLE; + koh_t *src_koh = NULL; + koh_t *dst_koh = NULL; + koh_t *d_len_koh = NULL; + if (!src || !dst || !d_len) { + return (KERNEL_OFFLOADER_ERROR); + } + + kod_run(compress_down); + + src_koh = (koh_t *)unswizzle(src); + dst_koh = (koh_t *)unswizzle(dst); + d_len_koh = (koh_t *)unswizzle(d_len); + + if ((DPUSM_COMPRESS_GZIP_1 <= alg) && + (alg <= DPUSM_COMPRESS_GZIP_9)) { + status = kernel_offloader_gzip_compress(src_koh, s_len, + dst_koh, (size_t *)ptr_start(d_len_koh, 0), level); + } else if (alg == DPUSM_COMPRESS_LZ4) { + status = kernel_offloader_lz4_compress(src_koh, dst_koh, s_len, + level, (size_t *)ptr_start(d_len_koh, 0)); + } + + return (status); +} + +/* specific implementation */ +static int +kernel_offloader_gzip_decompress(koh_t *src, size_t s_len, + koh_t *dst, size_t *d_len, int level) +{ + if (z_uncompress(ptr_start(dst, 0), d_len, + ptr_start(src, 0), s_len) != Z_OK) { + return (KERNEL_OFFLOADER_ERROR); + } + + return (KERNEL_OFFLOADER_OK); +} + +static int +kernel_offloader_lz4_decompress(koh_t *src, size_t s_len, + koh_t *dst, size_t *d_len, int level) +{ + if (lz4_decompress_zfs(ptr_start(src, 0), ptr_start(dst, 0), + s_len, *d_len, level) != 0) { + return (KERNEL_OFFLOADER_ERROR); + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_decompress(dpusm_decompress_t alg, void *level, + void *src, size_t s_len, void *dst, void *d_len) +{ + int status = KERNEL_OFFLOADER_UNAVAILABLE; + koh_t *level_koh = NULL; + koh_t *src_koh = NULL; + koh_t *dst_koh = NULL; + koh_t *d_len_koh = NULL; + if (!level || !src || !dst || !d_len) { + return (KERNEL_OFFLOADER_ERROR); + } + + level_koh = (koh_t *)unswizzle(level); + src_koh = (koh_t *)unswizzle(src); + dst_koh = (koh_t *)unswizzle(dst); + d_len_koh = (koh_t *)unswizzle(d_len); + + if ((DPUSM_COMPRESS_GZIP_1 <= alg) && + (alg <= DPUSM_COMPRESS_GZIP_9)) { + status = kernel_offloader_gzip_decompress(src_koh, s_len, + dst_koh, (size_t *)ptr_start(d_len_koh, 0), + *(int *)ptr_start(level_koh, 0)); + } else if (alg == DPUSM_COMPRESS_LZ4) { + status = kernel_offloader_lz4_decompress(src_koh, s_len, + dst_koh, (size_t *)ptr_start(d_len_koh, 0), + *(int *)ptr_start(level_koh, 0)); + } + + return (status); +} + +int +kernel_offloader_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size) +{ + koh_t *data_koh = (koh_t *)unswizzle(data); + if (!data_koh) { + return (KERNEL_OFFLOADER_ERROR); + } + + zio_cksum_t zcp; + if (cksum_size < sizeof (zcp.zc_word)) { + return (KERNEL_OFFLOADER_ERROR); + } + + kod_run(checksum_down); + + /* compute checksum */ + + void *buf = ptr_start(data_koh, 0); + + if (alg == DPUSM_CHECKSUM_FLETCHER_2) { + fletcher_init(&zcp); + if (order == DPUSM_BYTEORDER_NATIVE) { + fletcher_2_native(buf, size, NULL, &zcp); + } else { + fletcher_2_byteswap(buf, size, NULL, &zcp); + } + } else if (alg == DPUSM_CHECKSUM_FLETCHER_4) { + fletcher_init(&zcp); + if (order == DPUSM_BYTEORDER_NATIVE) { + fletcher_4_native(buf, size, NULL, &zcp); + } else { + fletcher_4_byteswap(buf, size, NULL, &zcp); + } + } else { + return (DPUSM_NOT_SUPPORTED); + } + + memcpy(cksum, zcp.zc_word, sizeof (zcp.zc_word)); + + return (DPUSM_OK); +} + +void * +kernel_offloader_raidz_alloc(size_t nparity, size_t ndata) +{ + const size_t ncols = nparity + ndata; + + const size_t rr_size = 
offsetof(raidz_row_t, rr_col[ncols]); + raidz_row_t *rr = kzalloc(rr_size, GFP_KERNEL); + if (!rr) { + return (NULL); + } + rr->rr_cols = ncols; + rr->rr_firstdatacol = nparity; + + write_lock(&rwlock); + total_count++; + active_count++; + + /* the op struct does not contribute to buffer allocations */ + total_actual += rr_size; + active_actual += rr_size; + + write_unlock(&rwlock); + + return (swizzle(rr)); +} + +/* attaches a column to the raidz struct */ +int +kernel_offloader_raidz_set_column(void *raidz, uint64_t c, + void *col, size_t size) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + koh_t *koh = (koh_t *)unswizzle(col); + + if (!rr || !koh) { + return (DPUSM_ERROR); + } + + /* c is too big */ + if (c >= rr->rr_cols) { + return (DPUSM_ERROR); + } + + /* "active" size is larger than allocated size */ + if (size > koh->size) { + return (DPUSM_ERROR); + } + + raidz_col_t *rc = &rr->rr_col[c]; + + /* clean up old column */ + abd_free(rc->rc_abd); + + /* + * rc->rc_abd does not take ownership of koh->ptr, + * so don't need to release ownership + */ + rc->rc_abd = abd_get_from_buf(koh->ptr, size); + rc->rc_size = size; + + return (DPUSM_OK); +} + +int +kernel_offloader_raidz_free(void *raidz) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + if (!rr) { + return (DPUSM_ERROR); + } + + for (int c = 0; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + abd_free(rc->rc_abd); + } + + /* compute the size before rr is freed */ + const size_t rr_size = offsetof(raidz_row_t, rr_col[rr->rr_cols]); + + kfree(rr); + + write_lock(&rwlock); + active_count--; + active_actual -= rr_size; + write_unlock(&rwlock); + + return (DPUSM_OK); +} + +int +kernel_offloader_raidz_gen(void *raidz) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + if (!rr) { + return (KERNEL_OFFLOADER_ERROR); + } + + kod_run(raidz_gen_down); + + switch (rr->rr_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rr); + break; + case 2: + vdev_raidz_generate_parity_pq(rr); + break; + case 3: + vdev_raidz_generate_parity_pqr(rr); + break; + } + + return (KERNEL_OFFLOADER_OK); +} + +int +kernel_offloader_raidz_rec(void *raidz, int *tgts, int ntgts) +{ + raidz_row_t *rr = (raidz_row_t *)unswizzle(raidz); + if (!rr) { + return (KERNEL_OFFLOADER_ERROR); + } + + kod_run(raidz_rec_down); + + vdev_raidz_reconstruct_general(rr, tgts, ntgts); + + return (KERNEL_OFFLOADER_OK); +} + +void * +kernel_offloader_file_open(const char *path, int flags, int mode) +{ + zfs_file_t *fp = NULL; + /* on error, fp should still be NULL */ + zfs_file_open(path, flags, mode, &fp); + return (swizzle(fp)); +} + +int +kernel_offloader_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err) +{ + zfs_file_t *fp = (zfs_file_t *)unswizzle(fp_handle); + if (!fp) { + return (ENODEV); + } + + koh_t *koh = (koh_t *)unswizzle(handle); + if (!koh) { + return (EIO); + } + + if (!err) { + return (EIO); + } + + *err = zfs_file_pwrite(fp, ptr_start(koh, 0), + count, offset, resid); + + if (*err == 0) { + void *zeros = kzalloc(trailing_zeros, GFP_KERNEL); + *err = zfs_file_pwrite(fp, zeros, + trailing_zeros, offset + count, resid); + kfree(zeros); + } + + return (*err); +} + +void +kernel_offloader_file_close(void *fp_handle) +{ + zfs_file_close(unswizzle(fp_handle)); +} + +void * +kernel_offloader_disk_open(dpusm_dd_t *disk_data) +{ + return (swizzle(disk_data->bdev)); +} + +int +kernel_offloader_disk_invalidate(void *disk_handle) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + invalidate_bdev(bdev); + return 
(DPUSM_OK); +} + +int +kernel_offloader_disk_write(void *disk_handle, void *handle, size_t data_size, + size_t trailing_zeros, uint64_t io_offset, int flags, + dpusm_disk_write_completion_t write_completion, void *wc_args) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + koh_t *koh = (koh_t *)unswizzle(handle); + + const size_t io_size = data_size + trailing_zeros; + + kod_run(disk_write_down); + + if (trailing_zeros) { + /* create a copy of the data with the trailing zeros attached */ + void *copy = kzalloc(io_size, GFP_KERNEL); + memcpy(copy, ptr_start(koh, 0), data_size); + + write_lock(&rwlock); + /* need to keep copy alive, so replace koh->ptr */ + if (koh->type == KOH_REAL) { + /* subtract size of original koh->ptr */ + active_size -= koh->size; + active_actual -= koh->size; + + kfree(koh->ptr); + } + + koh->type = KOH_REAL; + koh->ptr = copy; + koh->size = io_size; + + total_size += io_size; + active_size += io_size; + total_actual += io_size; + active_actual += io_size; + + /* wrapper struct size was not modified */ + write_unlock(&rwlock); + } + + abd_t *abd = abd_get_from_buf(koh->ptr, io_size); + zio_push_transform(wc_args, abd, io_size, io_size, NULL); + + /* __vdev_disk_physio already adds write_completion */ + (void) write_completion; + + return (__vdev_classic_physio(bdev, wc_args, + io_size, io_offset, WRITE, flags)); +} + +int +kernel_offloader_disk_flush(void *disk_handle, + dpusm_disk_flush_completion_t flush_completion, void *fc_args) +{ + struct block_device *bdev = + (struct block_device *)unswizzle(disk_handle); + + /* vdev_disk_io_flush already adds flush completion */ + (void) flush_completion; + + return (vdev_disk_io_flush(bdev, fc_args)); +} + +void +kernel_offloader_disk_close(void *disk_handle) +{} diff --git a/module/zia-software-provider/kernel_offloader.h b/module/zia-software-provider/kernel_offloader.h new file mode 100644 index 000000000000..53abbfaaf759 --- /dev/null +++ b/module/zia-software-provider/kernel_offloader.h @@ -0,0 +1,152 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _KERNEL_OFFLOADER_H +#define _KERNEL_OFFLOADER_H + +#include +#include + +#include + +/* + * This file represents the API provided by a vendor to access their + * offloader. The API can be anything the implementor chooses to + * expose. There are no limitations on the function signature or + * name. They just have to be called correctly in the Z.I.A. provider. + * ZFS and Z.I.A. will not need direct access to any data located on + * the offloader. Some raw pointers from Z.I.A. will be used directly, + * but those will always contain information located in memory. + * + * ------------------------------------------------------------------- + * + * The kernel offloader fakes offloads by copying data into memory + * regions distinct from the calling process's memory space. The + * corresponding C file conflates the driver and the "physical" device + * since both memory spaces are in kernel space and run on the + * CPU. This offloader provides opaque pointers to the provider to + * simulate handles to inaccessible memory locations. In order to + * prevent the handle from being dereferenced and used successfully by + * ZFS or Z.I.A., the handle pointer is masked with a random value + * generated at load-time. Other offloaders may choose to present + * non-void handles. + */ + +/* return values */ +#define KERNEL_OFFLOADER_OK 0 + +/* function is implemented, but the chosen operation is not implemented */ +#define KERNEL_OFFLOADER_UNAVAILABLE 1 + +/* ran, but could not complete */ +#define KERNEL_OFFLOADER_ERROR 2 + +/* ran, but failed a check on a result */ +#define KERNEL_OFFLOADER_BAD_RESULT 3 + +/* "hardware" went down for some reason (overheated, unplugged, etc.) 
*/ +#define KERNEL_OFFLOADER_DOWN 4 + +/* + * init function - this should be the kernel module init, but + * kernel offloader is not compiled as a separate kernel module + */ +void kernel_offloader_init(void); +void kernel_offloader_fini(void); + +/* offloader handle access */ +void *kernel_offloader_alloc(size_t size); +void *kernel_offloader_alloc_ref(void *src, size_t offset, size_t size); +int kernel_offloader_get_size(void *handle, size_t *size, size_t *actual); +int kernel_offloader_free(void *handle); +int kernel_offloader_copy_from_generic(void *handle, size_t offset, + const void *src, size_t size); +int kernel_offloader_copy_to_generic(void *handle, size_t offset, + void *dst, size_t size); +/* status check */ +int kernel_offloader_mem_stats( + void *t_count_handle, void *t_size_handle, void *t_actual_handle, + void *a_count_handle, void *a_size_handle, void *a_actual_handle); +int kernel_offloader_cmp(void *lhs_handle, void *rhs_handle, int *diff); +int kernel_offloader_zero_fill(void *handle, size_t offset, size_t size); +int kernel_offloader_all_zeros(void *handle, size_t offset, size_t size); + +/* ZIO Pipeline Stages */ + +int kernel_offloader_compress(dpusm_compress_t alg, int level, + void *src, size_t s_len, void *dst, void *d_len); + +int kernel_offloader_decompress(dpusm_compress_t alg, void *level, + void *src, size_t s_len, void *dst, void *d_len); + +int kernel_offloader_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size); + +void *kernel_offloader_raidz_alloc(size_t nparity, size_t ndata); +int kernel_offloader_raidz_set_column(void *raidz, uint64_t c, + void *col, size_t size); +int kernel_offloader_raidz_free(void *raidz); +int kernel_offloader_raidz_gen(void *raidz); +int kernel_offloader_raidz_rec(void *raidz, int *tgts, int ntgts); + +/* io */ +void *kernel_offloader_file_open(const char *path, int flags, int mode); +int kernel_offloader_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err); +void kernel_offloader_file_close(void *fp_handle); + +void *kernel_offloader_disk_open(dpusm_dd_t *disk_data); +int kernel_offloader_disk_reread_part(void *disk_handle); +int kernel_offloader_disk_invalidate(void *disk_handle); +int kernel_offloader_disk_write(void *disk_handle, void *handle, + size_t data_size, size_t trailing_zeros, uint64_t io_offset, int flags, + dpusm_disk_write_completion_t write_completion, void *wc_args); +int kernel_offloader_disk_flush(void *disk_handle, + dpusm_disk_flush_completion_t flush_completion, void *fc_args); +void kernel_offloader_disk_close(void *disk_handle); + +#endif diff --git a/module/zia-software-provider/provider.c b/module/zia-software-provider/provider.c new file mode 100644 index 000000000000..ce9353b19f0b --- /dev/null +++ b/module/zia-software-provider/provider.c @@ -0,0 +1,454 @@ +/* + * © 2021. Triad National Security, LLC. All rights reserved. + * + * This program was produced under U.S. Government contract + * 89233218CNA000001 for Los Alamos National Laboratory (LANL), which + * is operated by Triad National Security, LLC for the U.S. + * Department of Energy/National Nuclear Security Administration. All + * rights in the program are reserved by Triad National Security, LLC, + * and the U.S. Department of Energy/National Nuclear Security + * Administration. 
The Government is granted for itself and others + * acting on its behalf a nonexclusive, paid-up, irrevocable worldwide + * license in this material to reproduce, prepare derivative works, + * distribute copies to the public, perform publicly and display + * publicly, and to permit others to do so. + * + * ---- + * + * This program is open source under the BSD-3 License. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This provider communicates with the "kernel offloader", which is + * actually just software running on the local kernel. + * + * Providers and offloaders are usually separate entities. However, to + * keep things simple, the kernel offloader is compiled into this + * provider. + * + * Providers run at the same location as ZFS. They are intended to be + * small shims that translate between the DPUSM provider API and an + * offloader's API (probably a header file analogous to + * kernel_offloader.h). + * + * The method used to communicate between the provider and offloader + * is not prescribed by the DPUSM. This allows for vendors to place + * their offloaders locally or remotely, and use whatever method they + * wish to use to communicate with their offloaders e.g. NVMeOF. The + * kernel offloader is local and the communication method to access + * the kernel offloader is calling local functions. + * + * Offloaders are normally expected to be hardware with its own memory + * space. In order to simulate copying data to an offloader's memory + * space, the kernel offloader allocates new buffers and copies ZFS + * data into them, rather than using ZFS data directly. In order to + * simulate handles that the provider does not know how to manipulate + * or have access to, pointers returned from the kernel offloader are + * masked with a random value. + * + * Note that this provider has to be loaded after ZFS because it + * depends on ZFS for its "offload" functionality. + * + * Usage: + * 1. Reconfigure ZFS with --with-zia= + * + * 2. Create a zpool + * + * 3. 
Select this provider with + * zpool set zia_provider=zia-software-provider + * + * 4. Enable "offloading" of operations with + * zpool set zia_compress=on + * zpool set zia_decompress=on + * zpool set zia_checksum=on + * zpool set zia_raidz1_gen=on + * zpool set zia_raidz2_gen=on + * zpool set zia_raidz3_gen=on + * zpool set zia_raidz1_rec=on + * zpool set zia_raidz2_rec=on + * zpool set zia_raidz3_rec=on + * zpool set zia_file_write=on + * zpool set zia_disk_write=on + * + * 5. Use the zpool as you would normally + * + * Notes: + * If a ZFS IO stage is not run, enabling a Z.I.A. offload + * will have no effect. + * + * Resilvering requires both zia_checksum and zia_raidz*_rec + * to be enabled. Not enabling checksums would cause offloaded + * resilvering to fail, and perform the remaining operations + * in memory. To avoid the cost of offloading data only to + * fail, a check has been inserted to prevent offloading + * altogether if zia_checksum is not enabled. + */ + +#include +#include +#include + +#include /* the DPUSM provider API */ +#include /* provides access to the offloader */ + +/* translate from offloader values to DPUSM values */ +static int +translate_rc(const int offloader_rc) +{ + int dpusm_rc = DPUSM_NOT_IMPLEMENTED; + switch (offloader_rc) { + case KERNEL_OFFLOADER_OK: + dpusm_rc = DPUSM_OK; + break; + case KERNEL_OFFLOADER_ERROR: + dpusm_rc = DPUSM_ERROR; + break; + case KERNEL_OFFLOADER_UNAVAILABLE: + dpusm_rc = DPUSM_NOT_IMPLEMENTED; + break; + case KERNEL_OFFLOADER_BAD_RESULT: + dpusm_rc = DPUSM_BAD_RESULT; + break; + case KERNEL_OFFLOADER_DOWN: + dpusm_rc = DPUSM_PROVIDER_INVALIDATED; + break; + default: + /* only translate recognized values */ + dpusm_rc = offloader_rc; + break; + } + return (dpusm_rc); +} + +static int +sw_provider_algorithms(int *compress, int *decompress, + int *checksum, int *checksum_byteorder, int *raid) +{ + *compress = + DPUSM_COMPRESS_GZIP_1 | + DPUSM_COMPRESS_GZIP_2 | + DPUSM_COMPRESS_GZIP_3 | + DPUSM_COMPRESS_GZIP_4 | + DPUSM_COMPRESS_GZIP_5 | + DPUSM_COMPRESS_GZIP_6 | + DPUSM_COMPRESS_GZIP_7 | + DPUSM_COMPRESS_GZIP_8 | + DPUSM_COMPRESS_GZIP_9 | + DPUSM_COMPRESS_LZ4; + + *decompress = *compress; + + *checksum = DPUSM_CHECKSUM_FLETCHER_2 | DPUSM_CHECKSUM_FLETCHER_4; + + *checksum_byteorder = DPUSM_BYTEORDER_NATIVE | DPUSM_BYTEORDER_BYTESWAP; + + *raid = + DPUSM_RAID_1_GEN | + DPUSM_RAID_2_GEN | + DPUSM_RAID_3_GEN | + DPUSM_RAID_1_REC | + DPUSM_RAID_2_REC | + DPUSM_RAID_3_REC; + + return (DPUSM_OK); +} + +static int +sw_provider_get_size(void *handle, size_t *size, size_t *actual) +{ + return (translate_rc(kernel_offloader_get_size(handle, + size, actual))); +} + +static int +sw_provider_copy_from_generic(dpusm_mv_t *mv, const void *buf, size_t size) +{ + return (translate_rc(kernel_offloader_copy_from_generic(mv->handle, + mv->offset, buf, size))); +} + +static int +sw_provider_copy_to_generic(dpusm_mv_t *mv, void *buf, size_t size) +{ + return (translate_rc(kernel_offloader_copy_to_generic(mv->handle, + mv->offset, buf, size))); +} + +static int +sw_provider_mem_stats(size_t *t_count, size_t *t_size, size_t *t_actual, + size_t *a_count, size_t *a_size, size_t *a_actual) +{ + void *t_count_handle = NULL; + void *t_size_handle = NULL; + void *t_actual_handle = NULL; + void *a_size_handle = NULL; + void *a_count_handle = NULL; + void *a_actual_handle = NULL; + + if (t_count) { + t_count_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (t_size) { + t_size_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (t_actual) { + 
t_actual_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (a_count) { + a_count_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (a_size) { + a_size_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + if (a_actual) { + a_actual_handle = kernel_offloader_alloc(sizeof (size_t)); + } + + const int rc = kernel_offloader_mem_stats(t_count, t_size, t_actual, + a_count, a_size, a_actual); + if (rc == KERNEL_OFFLOADER_OK) { + /* should probably check for errors */ + kernel_offloader_copy_to_generic(t_count_handle, 0, + t_count, sizeof (*t_count)); + kernel_offloader_copy_to_generic(t_size_handle, 0, + t_size, sizeof (*t_size)); + kernel_offloader_copy_to_generic(t_actual_handle, 0, + t_actual, sizeof (*t_actual)); + kernel_offloader_copy_to_generic(a_count_handle, 0, + a_count, sizeof (*a_count)); + kernel_offloader_copy_to_generic(a_size_handle, 0, + a_size, sizeof (*a_size)); + kernel_offloader_copy_to_generic(a_actual_handle, 0, + a_actual, sizeof (*a_actual)); + } + + kernel_offloader_free(t_size_handle); + kernel_offloader_free(t_count_handle); + kernel_offloader_free(t_actual_handle); + kernel_offloader_free(a_size_handle); + kernel_offloader_free(a_count_handle); + kernel_offloader_free(a_actual_handle); + + return (translate_rc(rc)); +} + +static int +sw_provider_zero_fill(void *handle, size_t offset, size_t size) +{ + return (translate_rc(kernel_offloader_zero_fill(handle, offset, size))); +} + +static int +sw_provider_all_zeros(void *handle, size_t offset, size_t size) +{ + return (translate_rc(kernel_offloader_all_zeros(handle, offset, size))); +} + +static int +sw_provider_compress(dpusm_compress_t alg, int level, + void *src, size_t s_len, void *dst, size_t *d_len) +{ + /* buffer that offloader fills out */ + void *d_len_handle = kernel_offloader_alloc(sizeof (size_t)); + + /* send original d_len to offloader */ + kernel_offloader_copy_from_generic(d_len_handle, 0, + d_len, sizeof (*d_len)); + + const int kz_rc = kernel_offloader_compress(alg, level, + src, s_len, dst, d_len_handle); + if (kz_rc == KERNEL_OFFLOADER_OK) { + /* get updated d_len back from offloader */ + kernel_offloader_copy_to_generic(d_len_handle, 0, + d_len, sizeof (*d_len)); + } + + kernel_offloader_free(d_len_handle); + + return (translate_rc(kz_rc)); +} + +static int +sw_provider_decompress(dpusm_compress_t alg, int *level, + void *src, size_t s_len, void *dst, size_t *d_len) +{ + /* buffers that offloader fills out */ + void *level_handle = kernel_offloader_alloc(sizeof (*level)); + void *d_len_handle = kernel_offloader_alloc(sizeof (*d_len)); + + /* send original d_len to offloader */ + kernel_offloader_copy_from_generic(d_len_handle, 0, + d_len, sizeof (*d_len)); + + const int kz_rc = kernel_offloader_decompress(alg, level_handle, + src, s_len, dst, d_len_handle); + if (kz_rc == KERNEL_OFFLOADER_OK) { + /* get updated d_len back from offloader */ + kernel_offloader_copy_to_generic(d_len_handle, 0, + d_len, sizeof (*d_len)); + kernel_offloader_copy_to_generic(level_handle, 0, + level, sizeof (*level)); + } + + kernel_offloader_free(d_len_handle); + kernel_offloader_free(level_handle); + + return (translate_rc(kz_rc)); +} + +static int +sw_provider_checksum(dpusm_checksum_t alg, + dpusm_checksum_byteorder_t order, void *data, size_t size, + void *cksum, size_t cksum_size) +{ + /* maybe translate alg and order */ + + /* trigger offloader to do actual calculation */ + return (translate_rc(kernel_offloader_checksum(alg, + order, data, size, cksum, cksum_size))); +} + +static int 
+sw_provider_raid_can_compute(size_t nparity, size_t ndata, + size_t *col_sizes, int rec) +{ + if ((nparity < 1) || (nparity > 3)) { + return (DPUSM_NOT_SUPPORTED); + } + + return (DPUSM_OK); +} + +static int +sw_provider_raid_gen(void *raid) +{ + return (translate_rc(kernel_offloader_raidz_gen(raid))); +} + +static int +sw_provider_raid_cmp(void *lhs_handle, void *rhs_handle, int *diff) +{ + return (translate_rc(kernel_offloader_cmp(lhs_handle, + rhs_handle, diff))); +} + +static int +sw_provider_raid_rec(void *raid, int *tgts, int ntgts) +{ + return (translate_rc(kernel_offloader_raidz_rec(raid, + tgts, ntgts))); +} + +static int +sw_provider_file_write(void *fp_handle, void *handle, size_t count, + size_t trailing_zeros, loff_t offset, ssize_t *resid, int *err) +{ + return (translate_rc(kernel_offloader_file_write(fp_handle, + handle, count, trailing_zeros, offset, resid, err))); +} + +/* BEGIN CSTYLED */ +static const char name[] = "zia-software-provider"; +static const dpusm_pf_t sw_provider_functions = { + .algorithms = sw_provider_algorithms, + .alloc = kernel_offloader_alloc, + .alloc_ref = kernel_offloader_alloc_ref, + .get_size = sw_provider_get_size, + .free = kernel_offloader_free, + .copy = { + .from = { + .generic = sw_provider_copy_from_generic, + .ptr = NULL, + .scatterlist = NULL, + }, + .to = { + .generic = sw_provider_copy_to_generic, + .ptr = NULL, + .scatterlist = NULL, + }, + }, + .mem_stats = sw_provider_mem_stats, + .zero_fill = sw_provider_zero_fill, + .all_zeros = sw_provider_all_zeros, + .compress = sw_provider_compress, + .decompress = sw_provider_decompress, + .checksum = sw_provider_checksum, + .raid = { + .can_compute = sw_provider_raid_can_compute, + .alloc = kernel_offloader_raidz_alloc, + .set_column = kernel_offloader_raidz_set_column, + .free = kernel_offloader_raidz_free, + .gen = sw_provider_raid_gen, + .cmp = sw_provider_raid_cmp, + .rec = sw_provider_raid_rec, + }, + .file = { + .open = kernel_offloader_file_open, + .write = sw_provider_file_write, + .close = kernel_offloader_file_close, + }, + .disk = { + .open = kernel_offloader_disk_open, + .invalidate = kernel_offloader_disk_invalidate, + .write = kernel_offloader_disk_write, + .flush = kernel_offloader_disk_flush, + .close = kernel_offloader_disk_close, + }, +}; +/* END CSTYLED */ + +static int __init +sw_provider_init(void) +{ + /* + * this should be a separate kernel module, + * but is here for simplicity + */ + kernel_offloader_init(); + + return (dpusm_register_bsd(name, &sw_provider_functions)); +} + +static void __exit +sw_provider_exit(void) +{ + dpusm_unregister_bsd(name); + + kernel_offloader_fini(); +} + +module_init(sw_provider_init); +module_exit(sw_provider_exit); + +MODULE_LICENSE("CDDL"); diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 4cc075585d4b..358720ed471e 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -38,6 +38,7 @@ %bcond_with debug %bcond_with debuginfo +%bcond_with zia Name: %{module}-kmod @@ -124,6 +125,12 @@ bash %{SOURCE10} --target %{_target_cpu} %{?repo:--repo %{?repo}} --kmodname %{ %define debuginfo --disable-debuginfo %endif +%if %{with zia} + %define zia --with-zia="%{?DPUSM_ROOT}" +%else + %define zia --without-zia +%endif + # Leverage VPATH from configure to avoid making multiple copies. 
%define _configure ../%{module}-%{version}/configure @@ -144,7 +151,8 @@ for kernel_version in %{?kernel_versions}; do %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{zia} make %{?_smp_mflags} cd .. done diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in index 2e89abd0edfd..db560254d91d 100644 --- a/rpm/generic/zfs.spec.in +++ b/rpm/generic/zfs.spec.in @@ -68,6 +68,7 @@ %bcond_with systemd %bcond_with pam %bcond_without pyzfs +%bcond_with zia # Generic enable switch for systemd %if %{with systemd} @@ -390,6 +391,12 @@ support for unlocking datasets on user login. %define pam --disable-pam %endif +%if %{with zia} + %define zia --with-zia="%{DPUSM_ROOT}" +%else + %define zia --without-zia +%endif + %setup -q %build @@ -409,7 +416,8 @@ support for unlocking datasets on user login. %{ubsan} \ %{systemd} \ %{pam} \ - %{pyzfs} + %{pyzfs} \ + %{zia} make %{?_smp_mflags} %install diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index 876c198c64de..9bb7dcac5578 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -59,6 +59,12 @@ fi %define debuginfo --disable-debuginfo %endif +%if %{with zia} +%define zia --with-zia="%{?DPUSM_ROOT}" +%else +%define zia --without-zia +%endif + %setup -n %{kmod_name}-%{version} %build %configure \ @@ -69,7 +75,8 @@ fi %{debuginfo} \ %{?kernel_cc} \ %{?kernel_ld} \ - %{?kernel_llvm} + %{?kernel_llvm} \ + %{zia} make %{?_smp_mflags} # Module signing (modsign) diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 92ce09ec6fcb..cb82970f5c13 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -222,3 +222,7 @@ tags = ['functional', 'zvol', 'zvol_misc'] tests = ['idmap_mount_001', 'idmap_mount_002', 'idmap_mount_003', 'idmap_mount_004', 'idmap_mount_005'] tags = ['functional', 'idmap_mount'] + +[tests/functional/zia:Linux] +tests = ['zia_props', 'zia_write_pipeline', 'zia_raidz_resilver'] +tags = ['functional', 'zia'] diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index daa794551682..e702d57d1893 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -138,6 +138,7 @@ export SYSTEM_FILES_LINUX='attr groupdel groupmod hostid + insmod logger losetup lsattr @@ -153,6 +154,7 @@ export SYSTEM_FILES_LINUX='attr nsenter parted perf + rmmod setfattr setpriv sha256sum diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 44eedcf6fae5..95baa340eee6 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -387,7 +387,9 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/zvol/zvol_misc/zvol_misc_common.kshlib \ functional/zvol/zvol_swap/zvol_swap.cfg \ functional/idmap_mount/idmap_mount.cfg \ - functional/idmap_mount/idmap_mount_common.kshlib + functional/idmap_mount/idmap_mount_common.kshlib \ + functional/zia/zia.cfg \ + functional/zia/zia.kshlib nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/acl/off/cleanup.ksh \ @@ -2123,4 +2125,9 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/idmap_mount/idmap_mount_002.ksh \ functional/idmap_mount/idmap_mount_003.ksh \ functional/idmap_mount/idmap_mount_004.ksh \ - functional/idmap_mount/idmap_mount_005.ksh + functional/idmap_mount/idmap_mount_005.ksh \ + functional/zia/cleanup.ksh \ + functional/zia/setup.ksh \ + functional/zia/zia_props.ksh \ + functional/zia/zia_raidz_resilver.ksh \ + functional/zia/zia_write_pipeline.ksh 
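The runfile and Makefile.am entries above expose the new test cases under the 'zia' tag, and setup.ksh skips them unless the dpusm module is already loaded. A minimal sketch of a local run follows; the dpusm.ko path is site-specific and illustrative only, and the -T tag filter is assumed to be the standard zfs-tests.sh option rather than something added by this patch.

# the DPUSM must be loaded before the ZFS modules (path is site-specific)
sudo insmod /path/to/dpusm/src/dpusm.ko
# run only the Z.I.A. functional tests
./scripts/zfs-tests.sh -T zia
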
diff --git a/tests/zfs-tests/tests/functional/zia/cleanup.ksh b/tests/zfs-tests/tests/functional/zia/cleanup.ksh new file mode 100755 index 000000000000..23b7b3739fe4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zia/cleanup.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/zia/zia.kshlib + +verify_runnable "global" + +lsmod | grep dpusm > /dev/null +ret="$?" +(( "${ret}" != "0" )) && log_unsupported "dpusm not loaded" + +log_must unload_provider +default_cleanup diff --git a/tests/zfs-tests/tests/functional/zia/setup.ksh b/tests/zfs-tests/tests/functional/zia/setup.ksh new file mode 100755 index 000000000000..e2f8df68628c --- /dev/null +++ b/tests/zfs-tests/tests/functional/zia/setup.ksh @@ -0,0 +1,44 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/zia/zia.kshlib +verify_runnable "global" + +# dpusm needs to be loaded before ZFS +lsmod | grep dpusm > /dev/null +ret="$?" +(( "${ret}" != "0" )) && log_unsupported "dpusm not loaded" + +# unload the software provider if the test starts with it loaded +lsmod | grep "${PROVIDER_MODULE}" > /dev/null +ret="$?" +(( "${ret}" == "0" )) && log_must rmmod "${PROVIDER_MODULE}" + +log_must default_zpool +log_must load_provider + +log_pass diff --git a/tests/zfs-tests/tests/functional/zia/zia.cfg b/tests/zfs-tests/tests/functional/zia/zia.cfg new file mode 100644 index 000000000000..c680d19702e4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/zia/zia.cfg @@ -0,0 +1,37 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +export PROVIDER_MODULE="zia_software_provider" +export PROVIDER="zia-software-provider" +export BLOCKSZ=8192 +export NUM_WRITES=65536 +export DATA="R" +export FILENAME="${TESTDIR}/file" +export RESILVER_REPLACEMENT="${TEST_BASE_DIR}/replacement" diff --git a/tests/zfs-tests/tests/functional/zia/zia.kshlib b/tests/zfs-tests/tests/functional/zia/zia.kshlib new file mode 100644 index 000000000000..f0b532ee7e2e --- /dev/null +++ b/tests/zfs-tests/tests/functional/zia/zia.kshlib @@ -0,0 +1,112 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2021 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib +. $STF_SUITE/tests/functional/zia/zia.cfg + +function default_zpool +{ + default_raidz_setup_noexit "${DISKS}" + log_must zfs set compression=on "${TESTPOOL}" + log_must zfs set checksum=on "${TESTPOOL}" +} + +# providers can be loaded at any time after the dpusm +# +# the software provider must be loaded after ZFS since +# it uses ZFS symbols +function load_provider +{ + log_must insmod "${SBIN_DIR}/module/${PROVIDER}.ko" + log_must zpool set zia_provider="${PROVIDER}" "${TESTPOOL}" +} + +function unload_provider +{ + log_must zpool set zia_provider="" "${TESTPOOL}" + log_must rmmod "${PROVIDER_MODULE}" +} + +function offload_all +{ + log_must zpool set zia_compress="on" "${TESTPOOL}" + log_must zpool set zia_checksum="on" "${TESTPOOL}" + log_must zpool set zia_raidz1_gen="on" "${TESTPOOL}" + log_must zpool set zia_raidz2_gen="on" "${TESTPOOL}" + log_must zpool set zia_raidz3_gen="on" "${TESTPOOL}" + log_must zpool set zia_raidz1_rec="on" "${TESTPOOL}" + log_must zpool set zia_raidz2_rec="on" "${TESTPOOL}" + log_must zpool set zia_raidz3_rec="on" "${TESTPOOL}" + log_must zpool set zia_disk_write="on" "${TESTPOOL}" + log_must zpool set zia_file_write="on" "${TESTPOOL}" +} + +# +# loop through each combination of Z.I.A. 
+# and make sure writing works
+#
+function loop_offloads_and_write
+{
+	for comp in on off
+	do
+		log_must zpool set zia_compress="${comp}" "${TESTPOOL}"
+
+		for cksum in on off
+		do
+			log_must zpool set zia_checksum="${cksum}" "${TESTPOOL}"
+
+			for raidz in on off
+			do
+				log_must zpool set zia_raidz1_gen="${raidz}" "${TESTPOOL}"
+				log_must zpool set zia_raidz2_gen="${raidz}" "${TESTPOOL}"
+				log_must zpool set zia_raidz3_gen="${raidz}" "${TESTPOOL}"
+
+				for diskfile in on off
+				do
+					log_must zpool set zia_disk_write="${diskfile}" "${TESTPOOL}"
+					log_must zpool set zia_file_write="${diskfile}" "${TESTPOOL}"
+
+					log_must file_write -o create -f "${FILENAME}" -b "${BLOCKSZ}" -c "${NUM_WRITES}" -d "${DATA}"
+					log_must ls -l "${FILENAME}"
+					log_must verify_pool "${TESTPOOL}"
+					log_must check_pool_status "${TESTPOOL}" "errors" "No known data errors"
+					log_must rm "${FILENAME}"
+				done
+			done
+		done
+	done
+}
+
+# pick a random disk; the disk list parsing is copied from default_raidz_setup_noexit
+function random_disk
+{
+	typeset disklist="$*"
+	disks=(${disklist[*]})
+	count="${#disks[*]}"
+	idx="$(($(random 1 ${count}) - 1))"
+	echo "${disks[${idx}]}"
+}
diff --git a/tests/zfs-tests/tests/functional/zia/zia_props.ksh b/tests/zfs-tests/tests/functional/zia/zia_props.ksh
new file mode 100755
index 000000000000..a7a6dfbceebf
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zia/zia_props.ksh
@@ -0,0 +1,53 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/zia/zia.kshlib
+
+#
+# DESCRIPTION:
+#	Z.I.A. zpool properties can be set and queried
+#
+# STRATEGY:
+#	1. Turn on all offloads
+#	2. Run zpool get on each property
+#
+
+log_must offload_all
+
+log_must zpool get zia_provider "${TESTPOOL}"
+log_must zpool get zia_compress "${TESTPOOL}"
+log_must zpool get zia_checksum "${TESTPOOL}"
+log_must zpool get zia_raidz1_gen "${TESTPOOL}"
+log_must zpool get zia_raidz2_gen "${TESTPOOL}"
+log_must zpool get zia_raidz3_gen "${TESTPOOL}"
+log_must zpool get zia_raidz1_rec "${TESTPOOL}"
+log_must zpool get zia_raidz2_rec "${TESTPOOL}"
+log_must zpool get zia_raidz3_rec "${TESTPOOL}"
+log_must zpool get zia_disk_write "${TESTPOOL}"
+log_must zpool get zia_file_write "${TESTPOOL}"
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/zia/zia_raidz_resilver.ksh b/tests/zfs-tests/tests/functional/zia/zia_raidz_resilver.ksh
new file mode 100755
index 000000000000..31e0e0940110
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zia/zia_raidz_resilver.ksh
@@ -0,0 +1,65 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/zia/zia.kshlib
+
+#
+# DESCRIPTION:
+#	Z.I.A. RAIDZ Resilver works
+#
+# STRATEGY:
+#	1. Turn on all offloads
+#	2. Write data to the zpool
+#	3. Replace a drive
+#	4. Resilver the zpool with Z.I.A.
+#	5. Check for errors
+#
+
+log_must truncate -s 4G "${RESILVER_REPLACEMENT}"
+
+function cleanup
+{
+	log_must rm "${RESILVER_REPLACEMENT}"
+}
+log_onexit cleanup
+
+log_must offload_all
+
+# write a file
+log_must file_write -o create -f "${FILENAME}" -b "${BLOCKSZ}" -c "${NUM_WRITES}" -d "${DATA}"
+log_must ls -l "${FILENAME}"
+
+# pick a random backing device to offline and replace it
+bad="$(random_disk ${DISKS})"
+log_must zpool offline "${TESTPOOL}" "${bad}"
+log_must zpool replace "${TESTPOOL}" "${bad}" "${RESILVER_REPLACEMENT}"
+log_must wait_replacing "${TESTPOOL}"
+
+log_must verify_pool "${TESTPOOL}"
+log_must check_pool_status "${TESTPOOL}" "errors" "No known data errors"
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/zia/zia_write_pipeline.ksh b/tests/zfs-tests/tests/functional/zia/zia_write_pipeline.ksh
new file mode 100755
index 000000000000..b31abfa9ab41
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/zia/zia_write_pipeline.ksh
@@ -0,0 +1,46 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2021 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/zia/zia.kshlib
+
+#
+# DESCRIPTION:
+#	Z.I.A. Write Pipeline works
+#
+# STRATEGY:
+#	1. Turn each of the offloaded stages on and off
+#	1.1. Write data to the zpool
+#	1.2. Delete the file
+#	2. Disable the provider for the pool and unload the provider
+#	3. Do 1. again, but without a provider to make sure Z.I.A. falls back to ZFS properly
+#
+
+log_must loop_offloads_and_write
+log_must unload_provider
+log_must loop_offloads_and_write
+log_must load_provider
+log_pass
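
Testing notes: the kshlib above wires a provider into a pool in three steps: load_provider insmods the software provider and points the zia_provider pool property at it, offload_all turns on the individual offload properties, and unload_provider clears zia_provider before rmmod. A minimal manual smoke test of that sequence outside the STF harness might look like the sketch below; the pool name "tank", the file path, and the module path are assumed placeholders, not values taken from this change.

	# Hypothetical manual walkthrough of the steps the test scripts above automate.
	sudo insmod ./module/zia-software-provider.ko        # software provider; must load after the ZFS modules
	sudo zpool set zia_provider=zia-software-provider tank
	sudo zpool set zia_compress=on tank                  # offload compression
	sudo zpool set zia_checksum=on tank                  # offload checksums
	dd if=/dev/urandom of=/tank/zia_smoke bs=8192 count=1024   # drive writes through the offloaded pipeline
	sudo zpool scrub tank                                # then inspect "zpool status -v tank" for errors
	sudo zpool set zia_provider= tank                    # detach the provider before unloading it
	sudo rmmod zia_software_provider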
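
Assuming the new directory is also registered in the test-suite runfiles under a "zia" tag (that wiring is not visible in this hunk, so the tag name is a guess), the whole group could presumably be driven through the usual runner roughly as follows:

	# Hypothetical invocation; requires the dpusm module and a runfile entry for tests/functional/zia.
	sudo modprobe dpusm              # dpusm must be loaded before the ZFS modules (insmod the built dpusm.ko if it is not installed)
	sudo ./scripts/zfs.sh            # load the ZFS modules built from this tree
	./scripts/zfs-tests.sh -v -T zia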