Skip to content

Commit

Permalink
Implement parallel ARC eviction
Browse files Browse the repository at this point in the history
Read and write performance can become limited by the arc_evict
process being single threaded. Additional data cannot be added
to the ARC until sufficient existing data is evicted.

On many-core systems with TBs of RAM, a single thread becomes
a significant bottleneck.

With this change, we see a 25% increase in read and write throughput.

Sponsored-by: Expensify, Inc.
Sponsored-by: Klara, Inc.
Co-authored-by: Allan Jude <[email protected]>
Co-authored-by: Mateusz Piotrowski <[email protected]>
Signed-off-by: Alexander Stetsenko <[email protected]>
Signed-off-by: Allan Jude <[email protected]>
Signed-off-by: Mateusz Piotrowski <[email protected]>
  • Loading branch information
3 people committed Oct 23, 2024
1 parent e0bf43d commit b6a65a2
Show file tree
Hide file tree
Showing 2 changed files with 182 additions and 8 deletions.
18 changes: 18 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,24 @@ Number ARC headers to evict per sub-list before proceeding to another sub-list.
This batch-style operation prevents entire sub-lists from being evicted at once
but comes at a cost of additional unlocking and locking.
.
.It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq uint
Sets the maximum number of ARC eviction threads to be used.
.Pp
When set to 0,
ZFS chooses the number of threads based on the number of CPU cores.
The minimum number of threads is 1 and applies to systems from 1 to 5 CPU cores.
Systems with 6 CPU cores get 2 eviction threads.
Larger systems use log2 of the CPU count
plus the CPU count shifted right by 6 bits (that is, divided by 64).
This way the number of eviction threads scales up more on high CPU counts.
Currently, ZFS will not scale automatically beyond 16 threads.
.Pp
More threads may improve the responsiveness of ZFS to memory pressure.
This can be important for performance when eviction from the ARC becomes
a bottleneck for reads and writes.
.Pp
Note that the thread count cannot be changed during runtime.
.
.It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
If set to a non zero value, it will replace the
.Sy arc_grow_retry
Expand Down
172 changes: 164 additions & 8 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,19 @@ static uint_t zfs_arc_lotsfree_percent = 10;
*/
static int zfs_arc_prune_task_threads = 1;

/*
 * Number of arc_evict threads.
 *
 * zfs_arc_evict_threads is the value exposed as a module parameter; when
 * it is 0, arc_init() derives a thread count from max_ncpus instead.
 * zfs_arc_evict_threads_live holds the count actually in effect and is
 * the value consulted when sizing the eviction taskq and when deciding
 * whether eviction is dispatched to that taskq.
 */
static uint_t zfs_arc_evict_threads = 0;
static uint_t zfs_arc_evict_threads_live = 0;

/*
 * The minimum number of bytes we can evict at once is a block size.
 * So, SPA_MAXBLOCKSIZE is a reasonable minimum amount of work per
 * eviction task.  arc_evict_state() splits an eviction request into
 * chunks of (1 << MIN_EVICT_PERTASK_SHIFT) bytes and spreads those
 * chunks across the eviction tasks.
 */
#define MIN_EVICT_PERTASK_SHIFT (SPA_MAXBLOCKSHIFT)

/* The 7 states: */
arc_state_t ARC_anon;
arc_state_t ARC_mru;
Expand Down Expand Up @@ -3890,7 +3903,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* specifically implemented to ensure this is the case
* (only 'marker' will be removed and re-inserted).
*/
multilist_sublist_move_forward(mls, marker);

/*
* The only case where the b_spa field should ever be
Expand All @@ -3900,11 +3912,14 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* dsl_pool_close() and zio_inject_fault()), so we must
* skip any markers we see from these other threads.
*/
if (hdr->b_spa == 0)
if (hdr->b_spa == 0) {
multilist_sublist_move_forward(mls, marker);
continue;
}

/* we're only interested in evicting buffers of a certain spa */
if (spa != 0 && hdr->b_spa != spa) {
multilist_sublist_move_forward(mls, marker);
ARCSTAT_BUMP(arcstat_evict_skip);
continue;
}
Expand Down Expand Up @@ -3939,6 +3954,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
evict_count--;

} else {
multilist_sublist_move_forward(mls, marker);
ARCSTAT_BUMP(arcstat_mutex_miss);
}
}
Expand Down Expand Up @@ -4026,6 +4042,35 @@ arc_state_free_markers(arc_buf_hdr_t **markers, int count)
kmem_free(markers, sizeof (*markers) * count);
}

/* Taskq on which arc_evict_task() work items are dispatched. */
taskq_t *arc_evict_taskq;

/*
 * Per-task argument block handed to arc_evict_task().  One entry is filled
 * in for every sublist a parallel eviction pass dispatches work for.
 */
typedef struct evict_arg {
	taskq_ent_t tqe;	/* entry passed to taskq_dispatch_ent() */
	multilist_t *ml;	/* multilist being evicted from */
	int idx;		/* index of the sublist this task handles */
	arc_buf_hdr_t *marker;	/* marker header for that sublist */
	uint64_t spa;		/* spa argument for arc_evict_state_impl() */
	uint64_t bytes;		/* number of bytes this task should evict */
	/* shared counter; each task atomically adds its evicted bytes */
	volatile uint64_t *evicted_ptr;
} evict_arg_t;

/*
 * Taskq callback for parallel eviction.
 *
 * 'arg' is an evict_arg_t describing one sublist.  The sublist is evicted
 * via arc_evict_state_impl() and the number of bytes it reports is added
 * to the counter the dispatcher supplied in evicted_ptr.
 */
static void
arc_evict_task(void *arg)
{
	evict_arg_t *eva = arg;
	uint64_t freed = arc_evict_state_impl(eva->ml, eva->idx, eva->marker,
	    eva->spa, eva->bytes);

	/* The counter is shared between tasks, hence the atomic add. */
	atomic_add_64(eva->evicted_ptr, freed);
}

/*
* Evict buffers from the given arc state, until we've removed the
* specified number of bytes. Move the removed buffers to the
Expand All @@ -4045,10 +4090,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
{
uint64_t total_evicted = 0;
multilist_t *ml = &state->arcs_list[type];
int num_sublists;
arc_buf_hdr_t **markers;
unsigned num_sublists = multilist_get_num_sublists(ml);

num_sublists = multilist_get_num_sublists(ml);
if (bytes == 0)
return (total_evicted);

/*
* If we've tried to evict from each sublist, made some
Expand All @@ -4071,25 +4117,108 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
multilist_sublist_unlock(mls);
}

evict_arg_t *evarg = kmem_alloc(sizeof (*evarg) * num_sublists,
KM_SLEEP);
/*
* While we haven't hit our target number of bytes to evict, or
* we're evicting all available buffers.
*/
while (total_evicted < bytes) {
int sublist_idx = multilist_get_random_index(ml);
boolean_t usetskq = zfs_arc_evict_threads_live > 1;
uint64_t scan_evicted = 0;

uint64_t left = (bytes == ARC_EVICT_ALL ? bytes :
bytes - total_evicted);

/*
* How we scale
*
* Example 1, # of chunks less than # of tasks.
* We have:
* - 4 tasks
* - 3 chunks
* - 3 full col
* - 0 low cols.
*
* The first low col index is 3.
* The tasks #0-#2 evict 1 chunk each.
*
* 0 | 1 | 2 | 3 |
* +===+===+===+===+
* | x | x | x | |
* +---+---+---+---+
*
* Example 2, # of chunks more than # of tasks.
* We have:
* - 4 tasks
* - 9 chunks
* - 1 full col
* - 3 low cols
*
* The first low col index is 1.
* The task #0 evicts 3 chunks, the others evict 2 chunks each.
*
* 0 | 1 | 2 | 3 |
* +===+===+===+===+
* | x | x | x | x |
* +---+---+---+---+
* | x | x | x | x |
* +---+---+---+---+
* | x | | | |
* +---+---+---+---+
*/

/*
* Compute number of tasks to run (n), low col index (k)
* and normal and low bytes per task.
*/
uint64_t nchunks = ((left - 1) >> MIN_EVICT_PERTASK_SHIFT) + 1;
unsigned n = nchunks < num_sublists ? nchunks : num_sublists;
uint64_t fullrows = nchunks / n;
unsigned lastrowcols = nchunks % n;
unsigned k = (lastrowcols ? lastrowcols : n);

uint64_t bytes_pertask_low =
fullrows << MIN_EVICT_PERTASK_SHIFT;
uint64_t bytes_pertask = bytes_pertask_low + (lastrowcols ?
(1 << MIN_EVICT_PERTASK_SHIFT) : 0);

/*
* Start eviction using a randomly selected sublist,
* this is to try and evenly balance eviction across all
* sublists. Always starting at the same sublist
* (e.g. index 0) would cause evictions to favor certain
* sublists over others.
*/
for (int i = 0; i < num_sublists; i++) {
for (unsigned i = 0; i < n; i++, sublist_idx++) {
uint64_t bytes_remaining;
uint64_t bytes_evicted;

/* we've reached the end, wrap to the beginning */
if (sublist_idx >= num_sublists)
sublist_idx = 0;

if (usetskq) {
uint64_t evict = i < k ? bytes_pertask :
bytes_pertask_low;

ASSERT3S(n, <=, num_sublists);

memset(&evarg[i].tqe, 0, sizeof (evarg[i].tqe));
evarg[i].ml = ml;
evarg[i].marker = markers[sublist_idx];
evarg[i].spa = spa;
evarg[i].evicted_ptr = &scan_evicted;
evarg[i].idx = sublist_idx;
evarg[i].bytes = evict;

taskq_dispatch_ent(arc_evict_taskq,
arc_evict_task,
&evarg[i], 0, &evarg[i].tqe);
continue;
}

if (total_evicted < bytes)
bytes_remaining = bytes - total_evicted;
else
Expand All @@ -4100,10 +4229,11 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,

scan_evicted += bytes_evicted;
total_evicted += bytes_evicted;
}

/* we've reached the end, wrap to the beginning */
if (++sublist_idx >= num_sublists)
sublist_idx = 0;
if (usetskq) {
taskq_wait(arc_evict_taskq);
total_evicted += scan_evicted;
}

/*
Expand All @@ -4130,11 +4260,14 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
}
}

kmem_free(evarg, sizeof (*evarg) * num_sublists);

for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
}

if (markers != arc_state_evict_markers)
arc_state_free_markers(markers, num_sublists);

Expand Down Expand Up @@ -7673,6 +7806,13 @@ arc_set_limits(uint64_t allmem)
/* How to set default max varies by platform. */
arc_c_max = arc_default_max(arc_c_min, allmem);
}

/*
 * Integer base-2 logarithm of 'a', rounded down.
 * Any value not greater than 1 (including zero and negatives) yields 0.
 */
static inline size_t
arc_ilog2(int a)
{
	size_t log = 0;

	/* Count how many times 'a' can be halved before it drops to 1. */
	while (a > 1) {
		a >>= 1;
		log++;
	}

	return (log);
}

void
arc_init(void)
{
Expand Down Expand Up @@ -7743,12 +7883,22 @@ arc_init(void)

buf_init();

if (zfs_arc_evict_threads == 0)
zfs_arc_evict_threads_live = MIN(MAX(max_ncpus > 6 ? 2 : 1,
arc_ilog2(max_ncpus) + (max_ncpus >> 6)), 16);
else
zfs_arc_evict_threads_live = zfs_arc_evict_threads;

list_create(&arc_prune_list, sizeof (arc_prune_t),
offsetof(arc_prune_t, p_node));
mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);

arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
arc_evict_taskq = taskq_create("arc_evict",
MIN(zfs_arc_evict_threads_live, max_ncpus), defclsyspri,
MIN(zfs_arc_evict_threads_live, max_ncpus), max_ncpus,
TASKQ_PREPOPULATE);

arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
Expand Down Expand Up @@ -7823,6 +7973,9 @@ arc_fini(void)
arc_ksp = NULL;
}

taskq_wait(arc_evict_taskq);
taskq_destroy(arc_evict_taskq);

taskq_wait(arc_prune_taskq);
taskq_destroy(arc_prune_taskq);

Expand Down Expand Up @@ -10849,3 +11002,6 @@ ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
"Number of arc_prune threads");

/*
 * Read-only: the eviction taskq is sized once from this value in
 * arc_init(), so a write after module load would have no effect.
 */
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD,
	"Maximum number of arc_evict threads");

0 comments on commit b6a65a2

Please sign in to comment.