Skip to content

Commit

Permalink
Split deep vmem_alloc()/vmem_xalloc() stacks
Browse files Browse the repository at this point in the history
In #90
a user reported panics on an M1 with the message

"Invalid kernel stack pointer (probable overflow)."

In at least several of these cases, a deep multi-arena allocation
was in progress (several nested vmem_alloc()/vmem_xalloc() calls
reaching all the way down through vmem_bucket_alloc,
xnu_alloc_throttled, and ultimately to osif_malloc).

The stack frames above the first vmem_alloc were also fairly large.

This commit introduces a dynamically tunable (via sysctl) threshold
(8 KiB by default) for the remaining stack size as reported by xnu.
If we do not have more bytes than that when vmem_alloc()
is called, then the actual allocation will be done in a
separate worker thread which will start with a nearly
empty stack that is much more likely to hold the various
frames all the way through our code boundary with the
kernel and beyond.

The xnu / mach thread_call API (osfmk/kern/thread_call.h)
is used to avoid circular dependencies with taskq. The
mechanism is per-arena, costing a quick stack-depth check
per vmem_alloc() call while allowing for wildly varying stack
depths above the first vmem_alloc() call.

Vmem arenas now have two further kstats: the lowest amount
of available stack space seen at a vmem_alloc() into the arena,
and the number of times the allocation work has been done
in a thread_call worker.

* some spl_vmem.c functions are given inline hints

These are small functions with no or very few automatic
variables that were good candidates for clang/llvm's
inlining heuristics before we switched to building
the kext with -finline-hint-functions.

* remove some (unrelated) unused variables that escaped
previous commits, eliminating a couple of compile-time
warnings.
  • Loading branch information
rottegift authored and lundman committed Jul 30, 2021
1 parent 8e2081c commit ec6c19e
Show file tree
Hide file tree
Showing 3 changed files with 252 additions and 30 deletions.
9 changes: 9 additions & 0 deletions include/os/macos/spl/sys/vmem_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,14 @@ typedef struct vmem_kstat {
kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem */
/* allocator function */
kstat_named_t vk_excess; /* count of retained excess imports */
kstat_named_t vk_lowest_stack; /* least remaining stack seen */
kstat_named_t vk_async_stack_calls; /* times allocated off-thread */
} vmem_kstat_t;


/* forward declaration of opaque xnu struct */
typedef struct thread_call *thread_call_t;

struct vmem {
char vm_name[VMEM_NAMELEN]; /* arena name */
kcondvar_t vm_cv; /* cv for blocking allocations */
Expand Down Expand Up @@ -146,6 +152,9 @@ struct vmem {
void *vm_qcache[VMEM_NQCACHE_MAX]; /* quantum caches */
vmem_freelist_t vm_freelist[VMEM_FREELISTS + 1]; /* power-of-2 flists */
vmem_kstat_t vm_kstat; /* kstat data */
thread_call_t vm_stack_call_thread;
kmutex_t vm_stack_lock;
kcondvar_t vm_stack_cv;
};

#ifdef __cplusplus
Expand Down
23 changes: 19 additions & 4 deletions module/os/macos/spl/spl-kmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
* Copyright (C) 2008 MacZFS
* Copyright (C) 2013, 2020 Jorgen Lundman <[email protected]>
* Copyright (C) 2014 Brendon Humphrey <[email protected]>
* Copyright (C) 2017 Sean Doran <[email protected]>
* Copyright (C) 2017, 2021 Sean Doran <[email protected]>
* Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*
*/
Expand Down Expand Up @@ -510,6 +510,9 @@ uint64_t spl_arc_reclaim_avoided = 0;

uint64_t kmem_free_to_slab_when_fragmented = 0;

extern _Atomic unsigned int spl_lowest_stack_remaining;
extern unsigned int spl_vmem_split_stack_below;

typedef struct spl_stats {
kstat_named_t spl_os_alloc;
kstat_named_t spl_active_threads;
Expand Down Expand Up @@ -574,6 +577,8 @@ typedef struct spl_stats {
kstat_named_t spl_vm_pages_reclaimed;
kstat_named_t spl_vm_pages_wanted;
kstat_named_t spl_vm_pressure_level;
kstat_named_t spl_lowest_stack_remaining;
kstat_named_t spl_vmem_split_stack_below;
} spl_stats_t;

static spl_stats_t spl_stats = {
Expand Down Expand Up @@ -640,6 +645,8 @@ static spl_stats_t spl_stats = {
{"spl_vm_pages_reclaimed", KSTAT_DATA_UINT64},
{"spl_vm_pages_wanted", KSTAT_DATA_UINT64},
{"spl_vm_pressure_level", KSTAT_DATA_UINT64},
{"lowest_stack_remaining", KSTAT_DATA_UINT64},
{"split_stack_below", KSTAT_DATA_UINT64},
};

static kstat_t *spl_ksp = 0;
Expand Down Expand Up @@ -4422,7 +4429,6 @@ static void
spl_free_thread()
{
callb_cpr_t cpr;
uint64_t last_update = zfs_lbolt();
int64_t last_spl_free;

CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG);
Expand Down Expand Up @@ -4827,8 +4833,6 @@ spl_free_thread()
new_spl_free = -1024LL;
}

double delta = (double)new_spl_free - (double)last_spl_free;

boolean_t spl_free_is_negative = false;

if (new_spl_free < 0LL) {
Expand Down Expand Up @@ -4948,6 +4952,13 @@ spl_kstat_update(kstat_t *ksp, int rw)
ks->kmem_free_to_slab_when_fragmented.value.ui64;
}

if ((unsigned int) ks->spl_vmem_split_stack_below.value.ui64 !=
spl_vmem_split_stack_below) {
spl_vmem_split_stack_below =
(unsigned int)
ks->spl_vmem_split_stack_below.value.ui64;
}

} else {
ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated;
ks->spl_active_threads.value.ui64 = zfs_threads;
Expand Down Expand Up @@ -5036,6 +5047,10 @@ spl_kstat_update(kstat_t *ksp, int rw)
ks->spl_vm_pressure_level.value.ui64 =
spl_vm_pressure_level;

ks->spl_lowest_stack_remaining.value.ui64 =
spl_lowest_stack_remaining;
ks->spl_vmem_split_stack_below.value.ui64 =
spl_vmem_split_stack_below;
}

return (0);
Expand Down
Loading

0 comments on commit ec6c19e

Please sign in to comment.