Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hierarchical bandwidth and operations rate limits. #16205

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions cmd/zfs/zfs_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2341,15 +2341,25 @@ zfs_do_inherit(int argc, char **argv)
if (!zfs_prop_inheritable(prop) && !received) {
(void) fprintf(stderr, gettext("'%s' property cannot "
"be inherited\n"), propname);
if (prop == ZFS_PROP_QUOTA ||
prop == ZFS_PROP_RESERVATION ||
prop == ZFS_PROP_REFQUOTA ||
prop == ZFS_PROP_REFRESERVATION) {
switch (prop) {
case ZFS_PROP_QUOTA:
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFQUOTA:
case ZFS_PROP_REFRESERVATION:
case ZFS_PROP_RATELIMIT_BW_READ:
case ZFS_PROP_RATELIMIT_BW_WRITE:
case ZFS_PROP_RATELIMIT_BW_TOTAL:
case ZFS_PROP_RATELIMIT_OP_READ:
case ZFS_PROP_RATELIMIT_OP_WRITE:
case ZFS_PROP_RATELIMIT_OP_TOTAL:
(void) fprintf(stderr, gettext("use 'zfs set "
"%s=none' to clear\n"), propname);
(void) fprintf(stderr, gettext("use 'zfs "
"inherit -S %s' to revert to received "
"value\n"), propname);
break;
default:
break;
}
return (1);
}
Expand Down
2 changes: 1 addition & 1 deletion include/os/freebsd/spl/sys/sdt.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ SDT_PROBE_DECLARE(sdt, , , set__error);
#define SET_ERROR(err) \
((sdt_sdt___set__error->id ? \
(*sdt_probe_func)(sdt_sdt___set__error->id, \
(uintptr_t)err, 0, 0, 0, 0) : 0), err)
(uintptr_t)err, 0, 0, 0, 0, 0) : 0), err)
#else
#define SET_ERROR(err) (err)
#endif
Expand Down
1 change: 1 addition & 0 deletions include/os/freebsd/spl/sys/systm.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,6 @@
#define PAGEMASK (~PAGEOFFSET)

#define delay(x) pause("soldelay", (x))
/*
 * Sleep for x via pause_sig() and evaluate to non-zero when pause_sig()
 * returned anything other than EAGAIN, zero otherwise.
 * NOTE(review): presumably EAGAIN means the full timeout elapsed, so a
 * non-zero result means the sleep ended early (e.g. a signal) - confirm
 * against pause(9).
 */
#define delay_sig(x) (pause_sig("soldelay", (x)) != EAGAIN)

#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */
9 changes: 6 additions & 3 deletions include/os/freebsd/zfs/sys/zfs_znode_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,12 @@ zfs_exit(zfsvfs_t *zfsvfs, const char *tag)
(tp)->tv_sec = (time_t)(stmp)[0]; \
(tp)->tv_nsec = (long)(stmp)[1]; \
}
#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \
if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE);
/*
 * Record an access on zp via zfs_tstamp_update_setup_ext(..., ACCESSED, ...),
 * but only when (zfsvfs)->z_atime is set and VFS_RDONLY is clear in the
 * vfs flags.  vfs_ratelimit_metadata_write() is called on the objset before
 * the timestamp update.  The do { } while (0) wrapper makes the expansion a
 * single statement, so it is safe inside an unbraced if/else.
 *
 * NOTE(review): vfs_ratelimit_metadata_write() is declared as returning
 * int; the result is discarded here - confirm that is intentional.
 */
#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) do { \
if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) { \
vfs_ratelimit_metadata_write((zfsvfs)->z_os); \
zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE);\
} \
} while (0)

extern void zfs_tstamp_update_setup_ext(struct znode *,
uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx);
Expand Down
1 change: 1 addition & 0 deletions include/os/linux/spl/sys/timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a)

#define delay(ticks) schedule_timeout_uninterruptible(ticks)
/*
 * Sleep for ticks via schedule_timeout_interruptible() and evaluate to
 * non-zero when it returned a positive value, zero otherwise.
 * NOTE(review): presumably a positive return is the time remaining, i.e.
 * the sleep was cut short (e.g. by a signal) - confirm against the kernel
 * documentation.
 */
#define delay_sig(ticks) (schedule_timeout_interruptible(ticks) > 0)

#define SEC_TO_TICK(sec) ((sec) * HZ)
#define MSEC_TO_TICK(ms) msecs_to_jiffies(ms)
Expand Down
6 changes: 6 additions & 0 deletions include/sys/dsl_dir.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ extern "C" {
#endif

struct dsl_dataset;
struct vfs_ratelimit;
struct zthr;
/*
* DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
Expand Down Expand Up @@ -127,6 +128,10 @@ struct dsl_dir {
boolean_t dd_activity_cancelled;
uint64_t dd_activity_waiters;

/* protected by spa_ratelimit_lock */
struct vfs_ratelimit *dd_ratelimit;
dsl_dir_t *dd_ratelimit_root;

/* protected by dd_lock; keep at end of struct for better locality */
char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
};
Expand Down Expand Up @@ -182,6 +187,7 @@ int dsl_dir_set_quota(const char *ddname, zprop_source_t source,
uint64_t quota);
int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
uint64_t reservation);
int dsl_dir_set_ratelimit(const char *dsname, zfs_prop_t prop, uint64_t value);
int dsl_dir_activate_fs_ss_limit(const char *);
int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
cred_t *, proc_t *);
Expand Down
6 changes: 6 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,12 @@ typedef enum {
ZFS_PROP_SNAPSHOTS_CHANGED,
ZFS_PROP_PREFETCH,
ZFS_PROP_VOLTHREADING,
ZFS_PROP_RATELIMIT_BW_READ,
ZFS_PROP_RATELIMIT_BW_WRITE,
ZFS_PROP_RATELIMIT_BW_TOTAL,
ZFS_PROP_RATELIMIT_OP_READ,
ZFS_PROP_RATELIMIT_OP_WRITE,
ZFS_PROP_RATELIMIT_OP_TOTAL,
ZFS_NUM_PROPS
} zfs_prop_t;

Expand Down
2 changes: 2 additions & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,8 @@ struct spa {
uint64_t spa_leaf_list_gen; /* track leaf_list changes */
uint32_t spa_hostid; /* cached system hostid */

rrmlock_t spa_ratelimit_lock;

/* synchronization for threads in spa_wait */
kmutex_t spa_activities_lock;
kcondvar_t spa_activities_cv;
Expand Down
72 changes: 72 additions & 0 deletions include/sys/vfs_ratelimit.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2024 The FreeBSD Foundation
*
* This software was developed by Pawel Dawidek <[email protected]>
* under sponsorship from the FreeBSD Foundation.
*/

#ifndef _SYS_VFS_RATELIMIT_H
#define _SYS_VFS_RATELIMIT_H

#include <sys/dmu_objset.h>

#ifdef __cplusplus
extern "C" {
#endif

struct vfs_ratelimit;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This barely matters, but we don't have anything else really called vfs_. Should this be zfs_ratelimit, or zfs_vfs_ratelimit, or something like that?

(and change the function names etc to match, of course)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FOLLOWUP: Having just read the zvol changes, I'm even more convinced this should definitely just be zfs_ratelimit, not vfs or anything else. vfs is sort of a strange term to see in the zvol code.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Of course I wanted to use zfs_ratelimit, but it is already taken:) See module/zfs/zfs_ratelimit.c. I'm open to changing the name.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about zfs_ioratelimit or zfs_reqlimit or zfs_iolimit or something else? The naming conflicts, with VFS on one side, ZVOL on another, and zfs_ratelimit on a third, are indeed annoying.


/*
 * Rate limit type indices.  The values are zero-based and contiguous, so
 * ZFS_RATELIMIT_NTYPES (LAST + 1) is the number of types, and
 * ZFS_RATELIMIT_FIRST..ZFS_RATELIMIT_LAST bounds the full range.
 * BW_* and OP_* each come in read, write and total variants.
 */
#define ZFS_RATELIMIT_BW_READ 0
#define ZFS_RATELIMIT_BW_WRITE 1
#define ZFS_RATELIMIT_BW_TOTAL 2
#define ZFS_RATELIMIT_OP_READ 3
#define ZFS_RATELIMIT_OP_WRITE 4
#define ZFS_RATELIMIT_OP_TOTAL 5
#define ZFS_RATELIMIT_FIRST ZFS_RATELIMIT_BW_READ
#define ZFS_RATELIMIT_LAST ZFS_RATELIMIT_OP_TOTAL
#define ZFS_RATELIMIT_NTYPES (ZFS_RATELIMIT_LAST + 1)

int vfs_ratelimit_prop_to_type(zfs_prop_t prop);
zfs_prop_t vfs_ratelimit_type_to_prop(int type);

struct vfs_ratelimit *vfs_ratelimit_alloc(const uint64_t *limits);
void vfs_ratelimit_free(struct vfs_ratelimit *rl);
struct vfs_ratelimit *vfs_ratelimit_set(struct vfs_ratelimit *rl,
zfs_prop_t prop, uint64_t limit);

int vfs_ratelimit_data_read(objset_t *os, size_t blocksize, size_t bytes);
int vfs_ratelimit_data_write(objset_t *os, size_t blocksize, size_t bytes);
int vfs_ratelimit_data_copy(objset_t *srcos, objset_t *dstos, size_t blocksize,
size_t bytes);
int vfs_ratelimit_metadata_read(objset_t *os);
int vfs_ratelimit_metadata_write(objset_t *os);

void vfs_ratelimit_data_read_spin(objset_t *os, size_t blocksize, size_t bytes);
void vfs_ratelimit_data_write_spin(objset_t *os, size_t blocksize, size_t bytes);

#ifdef __cplusplus
}
#endif

#endif /* _SYS_VFS_RATELIMIT_H */
8 changes: 7 additions & 1 deletion lib/libzfs/libzfs.abi
Original file line number Diff line number Diff line change
Expand Up @@ -1847,7 +1847,13 @@
<enumerator name='ZFS_PROP_SNAPSHOTS_CHANGED' value='95'/>
<enumerator name='ZFS_PROP_PREFETCH' value='96'/>
<enumerator name='ZFS_PROP_VOLTHREADING' value='97'/>
<enumerator name='ZFS_NUM_PROPS' value='98'/>
<enumerator name='ZFS_PROP_RATELIMIT_BW_READ' value='98'/>
<enumerator name='ZFS_PROP_RATELIMIT_BW_WRITE' value='99'/>
<enumerator name='ZFS_PROP_RATELIMIT_BW_TOTAL' value='100'/>
<enumerator name='ZFS_PROP_RATELIMIT_OP_READ' value='101'/>
<enumerator name='ZFS_PROP_RATELIMIT_OP_WRITE' value='102'/>
<enumerator name='ZFS_PROP_RATELIMIT_OP_TOTAL' value='103'/>
<enumerator name='ZFS_NUM_PROPS' value='104'/>
</enum-decl>
<typedef-decl name='zfs_prop_t' type-id='4b000d60' id='58603c44'/>
<enum-decl name='zprop_source_t' naming-typedef-id='a2256d42' id='5903f80e'>
Expand Down
33 changes: 31 additions & 2 deletions lib/libzfs/libzfs_dataset.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
#include <sys/spa.h>
#include <sys/zap.h>
#include <sys/dsl_crypt.h>
#include <sys/vfs_ratelimit.h>
#include <libzfs.h>
#include <libzutil.h>

Expand Down Expand Up @@ -2287,6 +2288,12 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src,
case ZFS_PROP_SNAPSHOT_LIMIT:
case ZFS_PROP_FILESYSTEM_COUNT:
case ZFS_PROP_SNAPSHOT_COUNT:
case ZFS_PROP_RATELIMIT_BW_READ:
case ZFS_PROP_RATELIMIT_BW_WRITE:
case ZFS_PROP_RATELIMIT_BW_TOTAL:
case ZFS_PROP_RATELIMIT_OP_READ:
case ZFS_PROP_RATELIMIT_OP_WRITE:
case ZFS_PROP_RATELIMIT_OP_TOTAL:
*val = getprop_uint64(zhp, prop, source);

if (*source == NULL) {
Expand Down Expand Up @@ -2811,12 +2818,15 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
case ZFS_PROP_REFQUOTA:
case ZFS_PROP_RESERVATION:
case ZFS_PROP_REFRESERVATION:
case ZFS_PROP_RATELIMIT_BW_READ:
case ZFS_PROP_RATELIMIT_BW_WRITE:
case ZFS_PROP_RATELIMIT_BW_TOTAL:

if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
/*
* If quota or reservation is 0, we translate this into 'none'
* (unless literal is set), and indicate that it's the default
* If the value is 0, we translate this into 'none' (unless
* literal is set), and indicate that it's the default
* value. Otherwise, we print the number nicely and indicate
* that it's set locally.
*/
Expand All @@ -2835,6 +2845,25 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
zcp_check(zhp, prop, val, NULL);
break;

case ZFS_PROP_RATELIMIT_OP_READ:
case ZFS_PROP_RATELIMIT_OP_WRITE:
case ZFS_PROP_RATELIMIT_OP_TOTAL:

if (get_numeric_property(zhp, prop, src, &source, &val) != 0)
return (-1);
/*
* If the value is 0, we translate this into 'none', unless
* literal is set.
*/
if (val == 0 && !literal) {
(void) strlcpy(propbuf, "none", proplen);
} else {
(void) snprintf(propbuf, proplen, "%llu",
(u_longlong_t)val);
}
zcp_check(zhp, prop, val, NULL);
break;

case ZFS_PROP_FILESYSTEM_LIMIT:
case ZFS_PROP_SNAPSHOT_LIMIT:
case ZFS_PROP_FILESYSTEM_COUNT:
Expand Down
1 change: 1 addition & 0 deletions lib/libzpool/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/vdev_removal.c \
module/zfs/vdev_root.c \
module/zfs/vdev_trim.c \
module/zfs/vfs_ratelimit.c \
module/zfs/zap.c \
module/zfs/zap_leaf.c \
module/zfs/zap_micro.c \
Expand Down
111 changes: 111 additions & 0 deletions man/man7/zfsprops.7
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,117 @@ and the minimum is
.Sy 100000 .
This property may be changed with
.Nm zfs Cm change-key .
.It Sy limit_bw_read Ns = Ns Ar size Ns | Ns Sy none
.It Sy limit_bw_write Ns = Ns Ar size Ns | Ns Sy none
.It Sy limit_bw_total Ns = Ns Ar size Ns | Ns Sy none
Limits the read, write, or combined bandwidth, respectively, that a dataset and
its descendants can consume.
Limits are applied to file systems, volumes and their snapshots.
Bandwidth limits are in bytes per second.
.Pp
The configured limits are hierarchical, just like quotas; i.e., even if a
higher limit is configured on the child dataset, the parent's lower limit will
be enforced.
.Pp
The limits are applied at the VFS level, not at the disk level.
The dataset is charged for each operation even if no disk access is required
(e.g., due to caching, compression, deduplication, or NOP writes) or if the
operation will cause more traffic (due to the copies property, mirroring,
or RAIDZ).
.Pp
Read bandwidth consumption is based on:
.Bl -bullet
.It
read-like syscalls, e.g.,
.Xr aio_read 2 ,
.Xr copy_file_range 2 ,
.Xr pread 2 ,
.Xr preadv 2 ,
.Xr read 2 ,
.Xr readv 2 ,
.Xr sendfile 2
.It
syscalls like
.Xr getdents 2
and
.Xr getdirentries 2
.It
reading via mmaped files
.It
.Nm zfs Cm send
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similar to above, maybe mention snapshot mounts?

.El
.Pp
Write bandwidth consumption is based on:
.Bl -bullet
.It
write-like syscalls, e.g.,
.Xr aio_write 2 ,
.Xr copy_file_range 2 ,
.Xr pwrite 2 ,
.Xr pwritev 2 ,
.Xr write 2 ,
.Xr writev 2
.It
writing via mmaped files
.It
.Nm zfs Cm receive
.El
.It Sy limit_op_read Ns = Ns Ar count Ns | Ns Sy none
.It Sy limit_op_write Ns = Ns Ar count Ns | Ns Sy none
.It Sy limit_op_total Ns = Ns Ar count Ns | Ns Sy none
Limits the read, write, or combined metadata operations, respectively, that a
dataset and its descendants can generate.
Limits are expressed as the number of operations per second.
.Pp
Read operations consumption is based on:
.Bl -bullet
.It
read-like syscalls where the number of operations is equal to the number of
blocks being read (never less than 1)
.It
reading via mmaped files, where the number of operations is equal to the
number of pages being read (never less than 1)
.It
syscalls accessing metadata:
.Xr readlink 2 ,
.Xr stat 2
.El
.Pp
Write operations consumption is based on:
.Bl -bullet
.It
write-like syscalls where the number of operations is equal to the number of
blocks being written (never less than 1)
.It
writing via mmaped files, where the number of operations is equal to the
number of pages being written (never less than 1)
.It
syscalls modifying a directory's content:
.Xr bind 2 (UNIX-domain sockets) ,
.Xr link 2 ,
.Xr mkdir 2 ,
.Xr mkfifo 2 ,
.Xr mknod 2 ,
.Xr open 2 (file creation) ,
.Xr rename 2 ,
.Xr rmdir 2 ,
.Xr symlink 2 ,
.Xr unlink 2
.It
syscalls modifying metadata:
.Xr chflags 2 ,
.Xr chmod 2 ,
.Xr chown 2 ,
.Xr utimes 2
.It
updating the access time of a file when reading it
.El
.Pp
Just like
.Sy limit_bw
limits, the
.Sy limit_op
limits are also hierarchical and applied at the VFS level.
.It Sy exec Ns = Ns Sy on Ns | Ns Sy off
Controls whether processes can be executed from within this file system.
The default value is
Expand Down
Loading