Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vdev_disk: rewrite BIO filling machinery to avoid split pages #15588

Closed
wants to merge 9 commits into from
17 changes: 17 additions & 0 deletions config/kernel-mm-page-size.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
ZFS_LINUX_TEST_SRC([page_size], [
#include <linux/mm.h>
],[
unsigned long s;
s = page_size(NULL);
])
])
AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
AC_MSG_CHECKING([whether page_size() is available])
ZFS_LINUX_TEST_RESULT([page_size], [
AC_MSG_RESULT(yes)
AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
],[
AC_MSG_RESULT(no)
])
])
2 changes: 2 additions & 0 deletions config/kernel.m4
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
ZFS_AC_KERNEL_SRC_SYNC_BDEV
ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
Expand Down Expand Up @@ -316,6 +317,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
ZFS_AC_KERNEL_COPY_SPLICE_READ
ZFS_AC_KERNEL_SYNC_BDEV
ZFS_AC_KERNEL_MM_PAGE_SIZE
case "$host_cpu" in
powerpc*)
ZFS_AC_KERNEL_CPU_HAS_FEATURE
Expand Down
1 change: 1 addition & 0 deletions include/os/linux/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ kernel_linux_HEADERS = \
%D%/kernel/linux/compiler_compat.h \
%D%/kernel/linux/dcache_compat.h \
%D%/kernel/linux/kmap_compat.h \
%D%/kernel/linux/mm_compat.h \
%D%/kernel/linux/mod_compat.h \
%D%/kernel/linux/page_compat.h \
%D%/kernel/linux/percpu_compat.h \
Expand Down
36 changes: 36 additions & 0 deletions include/os/linux/kernel/linux/mm_compat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2023, 2024, Klara Inc.
*/

#ifndef _ZFS_MM_COMPAT_H
#define _ZFS_MM_COMPAT_H

#include <linux/mm.h>

/* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
#ifndef HAVE_MM_PAGE_SIZE
#define page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
#endif

#endif /* _ZFS_MM_COMPAT_H */
1 change: 1 addition & 0 deletions include/os/linux/kernel/linux/mod_compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ enum scope_prefix_types {
zfs_trim,
zfs_txg,
zfs_vdev,
zfs_vdev_disk,
zfs_vdev_file,
zfs_vdev_mirror,
zfs_vnops,
Expand Down
9 changes: 9 additions & 0 deletions include/sys/abd.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ typedef struct abd {

typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif

extern int zfs_abd_scatter_enabled;

Expand Down Expand Up @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
Expand Down Expand Up @@ -213,6 +220,8 @@ void abd_fini(void);

/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
Expand Down
26 changes: 23 additions & 3 deletions include/sys/abd_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/

#ifndef _ABD_IMPL_H
Expand All @@ -38,12 +39,30 @@ typedef enum abd_stats_op {
ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;

struct scatterlist; /* forward declaration */
/* forward declarations */
struct scatterlist;
struct page;

struct abd_iter {
/* public interface */
void *iter_mapaddr; /* addr corresponding to iter_pos */
size_t iter_mapsize; /* length of data valid at mapaddr */
union {
/* for abd_iter_map()/abd_iter_unmap() */
struct {
/* addr corresponding to iter_pos */
void *iter_mapaddr;
/* length of data valid at mapaddr */
size_t iter_mapsize;
};
/* for abd_iter_page() */
struct {
/* current page */
struct page *iter_page;
robn marked this conversation as resolved.
Show resolved Hide resolved
/* offset of data in page */
size_t iter_page_doff;
robn marked this conversation as resolved.
Show resolved Hide resolved
/* size of data in page */
size_t iter_page_dsize;
};
};

/* private */
abd_t *iter_abd; /* ABD being iterated through */
Expand Down Expand Up @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
void abd_iter_page(struct abd_iter *);

/*
* Helper macros
Expand Down
26 changes: 25 additions & 1 deletion man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
.\" Copyright (c) 2013 by Turbo Fredriksson <[email protected]>. All rights reserved.
.\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
.\" Copyright (c) 2019 Datto Inc.
.\" Copyright (c) 2023, 2024 Klara, Inc.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
Expand All @@ -15,7 +16,7 @@
.\" own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.\"
.Dd July 21, 2023
.Dd January 9, 2024
.Dt ZFS 4
.Os
.
Expand Down Expand Up @@ -1362,6 +1363,29 @@ _
4 Driver No driver retries on driver errors.
.TE
.
.It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
Maximum number of segments to add to a BIO (min 4).
If this is higher than the maximum allowed by the device queue or the kernel
itself, it will be clamped.
Setting it to zero will cause the kernel's ideal size to be used.
This parameter only applies on Linux.
This parameter is ignored if
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
.
.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
and earlier.
This "classic" method has known issues with highly fragmented IO requests and
is slower on many workloads, but it has been in use for many years and is known
to be very stable.
If you set this parameter, please also open a bug report why you did so,
including the workload involved and any error messages.
.Pp
This parameter and the classic submission method will be removed once we have
total confidence in the new method.
.Pp
This parameter only applies on Linux, and can only be set at module load time.
.
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
Time before expiring
.Pa .zfs/snapshot .
Expand Down
4 changes: 1 addition & 3 deletions module/os/freebsd/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_pos = 0;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}

/*
Expand Down
123 changes: 115 additions & 8 deletions module/os/linux/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/

/*
Expand Down Expand Up @@ -59,7 +60,9 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#include <linux/version.h>
#endif

#ifdef _KERNEL
Expand Down Expand Up @@ -895,14 +898,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
aiter->iter_pos = 0;
if (abd_is_linear(abd)) {
aiter->iter_offset = 0;
aiter->iter_sg = NULL;
} else {
if (!abd_is_linear(abd)) {
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
Expand All @@ -915,6 +913,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

Expand All @@ -926,8 +925,15 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
/*
* Ensure that last chunk is not in use. abd_iterate_*() must clear
* this state (directly or abd_iter_unmap()) before advancing.
*/
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
ASSERT3P(aiter->iter_page, ==, NULL);
ASSERT0(aiter->iter_page_doff);
ASSERT0(aiter->iter_page_dsize);

/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
Expand Down Expand Up @@ -1009,6 +1015,106 @@ abd_cache_reap_now(void)
}

#if defined(_KERNEL)
/*
* Yield the next page struct and data offset and size within it, without
* mapping it into the address space.
*/
void
abd_iter_page(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter)) {
aiter->iter_page = NULL;
aiter->iter_page_doff = 0;
aiter->iter_page_dsize = 0;
return;
}

struct page *page;
size_t doff, dsize;

if (abd_is_linear(aiter->iter_abd)) {
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

/* memory address at iter_pos */
void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;

/* struct page for address */
page = is_vmalloc_addr(paddr) ?
vmalloc_to_page(paddr) : virt_to_page(paddr);

/* offset of address within the page */
doff = offset_in_page(paddr);

/* total data remaining in abd from this position */
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
} else {
ASSERT(!abd_is_gang(aiter->iter_abd));

/* current scatter page */
page = sg_page(aiter->iter_sg);

/* position within page */
doff = aiter->iter_offset;

/* remaining data in scatterlist */
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
}
ASSERT(page);

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
if (PageTail(page)) {
/*
* This page is part of a "compound page", which is a group of
* pages that can be referenced from a single struct page *.
* Its organised as a "head" page, followed by a series of
* "tail" pages.
*
* In OpenZFS, compound pages are allocated using the
* __GFP_COMP flag, which we get from scatter ABDs and SPL
* vmalloc slabs (ie >16K allocations). So a great many of the
* IO buffers we get are going to be of this type.
*
* The tail pages are just regular PAGE_SIZE pages, and can be
* safely used as-is. However, the head page has length
* covering itself and all the tail pages. If this ABD chunk
* spans multiple pages, then we can use the head page and a
* >PAGE_SIZE length, which is far more efficient.
*
* To do this, we need to adjust the offset to be counted from
* the head page. struct page for compound pages are stored
* contiguously, so we can just adjust by a simple offset.
*
* Before kernel 4.5, compound page heads were refcounted
* separately, such that moving back to the head page would
* require us to take a reference to it and releasing it once
* we're completely finished with it. In practice, that means
* when our caller is done with the ABD, which we have no
* insight into from here. Rather than contort this API to
* track head page references on such ancient kernels, we just
* compile this block out and use the tail pages directly. This
* is slightly less efficient, but makes everything far
* simpler.
*/
struct page *head = compound_head(page);
doff += ((page - head) * PAGESIZE);
page = head;
}
#endif

/* final page and position within it */
aiter->iter_page = page;
aiter->iter_page_doff = doff;

/* amount of data in the chunk, up to the end of the page */
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}

/*
* Note: ABD BIO functions only needed to support vdev_classic. See comments in
* vdev_disk.c.
*/

/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
Expand Down Expand Up @@ -1163,4 +1269,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif

#endif /* _KERNEL */
Loading
Loading