diff --git a/include/os/macos/spl/sys/vmem_impl.h b/include/os/macos/spl/sys/vmem_impl.h index 233a6b33a..840c7a746 100644 --- a/include/os/macos/spl/sys/vmem_impl.h +++ b/include/os/macos/spl/sys/vmem_impl.h @@ -115,8 +115,14 @@ typedef struct vmem_kstat { kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem */ /* allocator function */ kstat_named_t vk_excess; /* count of retained excess imports */ + kstat_named_t vk_lowest_stack; /* least remaining stack seen */ + kstat_named_t vk_async_stack_calls; /* times allocated off-thread */ } vmem_kstat_t; + +/* forward declaration of opaque xnu struct */ +typedef struct thread_call *thread_call_t; + struct vmem { char vm_name[VMEM_NAMELEN]; /* arena name */ kcondvar_t vm_cv; /* cv for blocking allocations */ @@ -146,6 +152,9 @@ struct vmem { void *vm_qcache[VMEM_NQCACHE_MAX]; /* quantum caches */ vmem_freelist_t vm_freelist[VMEM_FREELISTS + 1]; /* power-of-2 flists */ vmem_kstat_t vm_kstat; /* kstat data */ + thread_call_t vm_stack_call_thread; + kmutex_t vm_stack_lock; + kcondvar_t vm_stack_cv; }; #ifdef __cplusplus diff --git a/module/os/macos/spl/spl-kmem.c b/module/os/macos/spl/spl-kmem.c index 1400b3fbc..9512c00c1 100644 --- a/module/os/macos/spl/spl-kmem.c +++ b/module/os/macos/spl/spl-kmem.c @@ -24,7 +24,7 @@ * Copyright (C) 2008 MacZFS * Copyright (C) 2013, 2020 Jorgen Lundman * Copyright (C) 2014 Brendon Humphrey - * Copyright (C) 2017 Sean Doran + * Copyright (C) 2017, 2021 Sean Doran * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
* */ @@ -510,6 +510,9 @@ uint64_t spl_arc_reclaim_avoided = 0; uint64_t kmem_free_to_slab_when_fragmented = 0; +extern _Atomic unsigned int spl_lowest_stack_remaining; +extern unsigned int spl_vmem_split_stack_below; + typedef struct spl_stats { kstat_named_t spl_os_alloc; kstat_named_t spl_active_threads; @@ -574,6 +577,8 @@ typedef struct spl_stats { kstat_named_t spl_vm_pages_reclaimed; kstat_named_t spl_vm_pages_wanted; kstat_named_t spl_vm_pressure_level; + kstat_named_t spl_lowest_stack_remaining; + kstat_named_t spl_vmem_split_stack_below; } spl_stats_t; static spl_stats_t spl_stats = { @@ -640,6 +645,8 @@ static spl_stats_t spl_stats = { {"spl_vm_pages_reclaimed", KSTAT_DATA_UINT64}, {"spl_vm_pages_wanted", KSTAT_DATA_UINT64}, {"spl_vm_pressure_level", KSTAT_DATA_UINT64}, + {"lowest_stack_remaining", KSTAT_DATA_UINT64}, + {"split_stack_below", KSTAT_DATA_UINT64}, }; static kstat_t *spl_ksp = 0; @@ -4422,7 +4429,6 @@ static void spl_free_thread() { callb_cpr_t cpr; - uint64_t last_update = zfs_lbolt(); int64_t last_spl_free; CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG); @@ -4827,8 +4833,6 @@ spl_free_thread() new_spl_free = -1024LL; } - double delta = (double)new_spl_free - (double)last_spl_free; - boolean_t spl_free_is_negative = false; if (new_spl_free < 0LL) { @@ -4948,6 +4952,13 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->kmem_free_to_slab_when_fragmented.value.ui64; } + if ((unsigned int) ks->spl_vmem_split_stack_below.value.ui64 != + spl_vmem_split_stack_below) { + spl_vmem_split_stack_below = + (unsigned int) + ks->spl_vmem_split_stack_below.value.ui64; + } + } else { ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated; ks->spl_active_threads.value.ui64 = zfs_threads; @@ -5036,6 +5047,10 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->spl_vm_pressure_level.value.ui64 = spl_vm_pressure_level; + ks->spl_lowest_stack_remaining.value.ui64 = + spl_lowest_stack_remaining; + ks->spl_vmem_split_stack_below.value.ui64 = + 
spl_vmem_split_stack_below; } return (0); diff --git a/module/os/macos/spl/spl-vmem.c b/module/os/macos/spl/spl-vmem.c index ecaf5b23f..505d78b10 100644 --- a/module/os/macos/spl/spl-vmem.c +++ b/module/os/macos/spl/spl-vmem.c @@ -26,7 +26,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2017 Sean Doran + * Copyright (c) 2017, 2021 by Sean Doran */ /* @@ -221,6 +221,9 @@ #include #include #include +#include +#include +#include #define VMEM_INITIAL 21 /* early vmem arenas */ #define VMEM_SEG_INITIAL 800 @@ -360,7 +363,9 @@ static vmem_kstat_t vmem_kstat_template = { { "parent_alloc", KSTAT_DATA_UINT64 }, { "parent_free", KSTAT_DATA_UINT64 }, { "threads_waiting", KSTAT_DATA_UINT64 }, - { "excess", KSTAT_DATA_UINT64 }, + { "excess", KSTAT_DATA_UINT64 }, + { "lowest_stack", KSTAT_DATA_UINT64 }, + { "async_stack_calls", KSTAT_DATA_UINT64 }, }; @@ -449,7 +454,7 @@ extern void IOSleep(unsigned milliseconds); /* * Get a vmem_seg_t from the global segfree list. */ -static vmem_seg_t * +static inline vmem_seg_t * vmem_getseg_global(void) { vmem_seg_t *vsp; @@ -468,7 +473,7 @@ vmem_getseg_global(void) /* * Put a vmem_seg_t on the global segfree list. */ -static void +static inline void vmem_putseg_global(vmem_seg_t *vsp) { mutex_enter(&vmem_segfree_lock); @@ -480,7 +485,7 @@ vmem_putseg_global(vmem_seg_t *vsp) /* * Get a vmem_seg_t from vmp's segfree list. */ -static vmem_seg_t * +static inline vmem_seg_t * vmem_getseg(vmem_t *vmp) { vmem_seg_t *vsp; @@ -497,7 +502,7 @@ vmem_getseg(vmem_t *vmp) /* * Put a vmem_seg_t on vmp's segfree list. */ -static void +static inline void vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp) { vsp->vs_knext = vmp->vm_segfree; @@ -823,7 +828,7 @@ vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) /* * Remove segment vsp from the arena. 
*/ -static void +static inline void vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) { ASSERT(vsp->vs_type != VMEM_ROTOR); @@ -957,7 +962,7 @@ vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size) * Returns 1 if we are populating, 0 otherwise. * Call it if we want to prevent recursion from HAT. */ -int +inline int vmem_is_populator() { return (mutex_owner(&vmem_sleep_lock) == curthread || @@ -1238,7 +1243,7 @@ vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag) * Used to decide if a newly imported span is superfluous after re-acquiring * the arena lock. */ -static int +static inline int vmem_canalloc(vmem_t *vmp, size_t size) { int hb; @@ -1257,7 +1262,7 @@ vmem_canalloc(vmem_t *vmp, size_t size) // allocation ability when not holding the lock. // These are unreliable because vmp->vm_freemap is // liable to change immediately after being examined. -int +inline int vmem_canalloc_lock(vmem_t *vmp, size_t size) { mutex_enter(&vmp->vm_lock); @@ -1284,7 +1289,7 @@ vmem_canalloc_atomic(vmem_t *vmp, size_t size) return (flist); } -static inline uint64_t +uint64_t spl_vmem_xnu_useful_bytes_free(void) { extern _Atomic uint32_t spl_vm_pages_reclaimed; @@ -1312,7 +1317,7 @@ vmem_xnu_useful_bytes_free(void) } -static void * +static inline void * spl_vmem_malloc_unconditionally_unlocked(size_t size) { extern void *osif_malloc(uint64_t); @@ -1321,7 +1326,7 @@ spl_vmem_malloc_unconditionally_unlocked(size_t size) return (osif_malloc(size)); } -static void * +static inline void * spl_vmem_malloc_unconditionally(size_t size) { mutex_enter(&vmem_xnu_alloc_lock); @@ -1330,7 +1335,7 @@ spl_vmem_malloc_unconditionally(size_t size) return (m); } -static void * +static inline void * spl_vmem_malloc_if_no_pressure(size_t size) { // The mutex serializes concurrent callers, providing time for @@ -1358,7 +1363,7 @@ spl_vmem_malloc_if_no_pressure(size_t size) * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) * that does not straddle a 
nocross-aligned boundary. */ -void * +inline void * vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, size_t nocross, void *minaddr, void *maxaddr, int vmflag) { @@ -1726,14 +1731,177 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) } /* + * vmem_alloc() and auxiliary functions : + * * Allocate size bytes from arena vmp. Returns the allocated address * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP, * and may also specify best-fit, first-fit, or next-fit allocation policy * instead of the default instant-fit policy. VM_SLEEP allocations are * guaranteed to succeed. */ + +/* + * If there is less space on the kernel stack than + * (dynamically tunable) spl_vmem_split_stack_below + * then perform the vmem_alloc in the thread_call + * function + */ +unsigned int spl_vmem_split_stack_below = 8192; + +/* kstat tracking the global minimum free stack space */ +_Atomic unsigned int spl_lowest_stack_remaining = UINT_MAX; + +/* forward decls */ +static inline void *wrapped_vmem_alloc(vmem_t *, size_t, int); +static void *vmem_alloc_in_worker_thread(vmem_t *, size_t, int); + +/* + * unwrapped vmem_alloc() : + * Examine stack remaining; if it is less than our split stack below + * threshold, or (for code coverage early near kext load time) is less than + * the lowest we have seen call out to a worker thread that will + * perform the wrapped_vmem_alloc() and update stat counters. + */ void * vmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + const vm_offset_t r = OSKernelStackRemaining(); + + if (vmp->vm_kstat.vk_lowest_stack.value.ui64 == 0) { + vmp->vm_kstat.vk_lowest_stack.value.ui64 = r; + } else if (vmp->vm_kstat.vk_lowest_stack.value.ui64 > r) { + vmp->vm_kstat.vk_lowest_stack.value.ui64 = r; + } + + if (vmem_is_populator()) { + /* + * Current thread holds one of the vmem locks and the worker + * thread invoked in vmem_alloc_in_worker_thread() would + * therefore deadlock. 
vmem_populate on a vmem cache is an + * early (and rare) operation and typically does not descend below + * the vmem source. + */ + return (wrapped_vmem_alloc(vmp, size, vmflag)); + } + + if (r < spl_lowest_stack_remaining || + r < spl_vmem_split_stack_below) { + return (vmem_alloc_in_worker_thread(vmp, size, vmflag)); + } + + return (wrapped_vmem_alloc(vmp, size, vmflag)); +} + +/* parameters passed between thread_call threads */ +typedef struct cb_params { + vmem_t *vmp; + size_t size; + int vmflag; + vm_offset_t r_parent; + vm_offset_t r_cb; + void *r_alloc; +} cb_params_t; + +/* + * Executes in a kernel worker thread, which will start with an essentially + * empty stack. The stack above the immediate client of the vmem_alloc() that + * has thread_call_enter1()-ed this function is already over a depth threshold. + */ +static void +vmem_alloc_update_lowest_cb(thread_call_param_t param0, + thread_call_param_t param1) +{ + + /* param 0 is a vmp, set in vmem_create() */ + /* param 1 is the in-params */ + + vmem_t *vmp0 = (vmem_t *)param0; + + /* synchronize param1 and make sure vmp identity */ + mutex_enter(&vmp0->vm_stack_lock); + cb_params_t *cbp = (cb_params_t *)param1; + vmem_t *vmp = cbp->vmp; + mutex_exit(&vmp0->vm_stack_lock); + + VERIFY3P(vmp0, ==, vmp); + + dprintf("SPL: %s:%d got vm_name %s, alloc size %lu, " + "parent depth %lu, our depth %lu\n", + __func__, __LINE__, vmp->vm_name, cbp->size, + cbp->r_parent, OSKernelStackRemaining()); + + atomic_inc_64(&vmp->vm_kstat.vk_async_stack_calls.value.ui64); + + spl_lowest_stack_remaining = cbp->r_parent; + + cbp->r_alloc = wrapped_vmem_alloc(cbp->vmp, + cbp->size, cbp->vmflag); + + mutex_enter(&vmp->vm_stack_lock); + /* + * There can be other cv_broadcast() callers + * and other cv_waiters() in different threads intercepting + * them, so the (arbitrary) nonzero value MUST be visible + * in our cbp->r_cb with immediate sequential consistency, + * or our calling thread may hang. 
+ */ + __atomic_store_n(&cbp->r_cb, + MAX(OSKernelStackRemaining(), 1), + __ATOMIC_SEQ_CST); + cv_broadcast(&vmp->vm_stack_cv); + mutex_exit(&vmp->vm_stack_lock); +} + +/* + * Set up parameters and thread_call_enter1() to send them to a worker thread + * executing vmem_alloc_update_lowest_cb(). Wait for the worker thread + * to set r_cb to nonzero. + */ +void * +vmem_alloc_in_worker_thread(vmem_t *vmp, size_t size, int vmflag) +{ + cb_params_t cb = { 0 }; + cb.vmp = vmp; + cb.size = size; + cb.vmflag = vmflag; + cb.r_parent = OSKernelStackRemaining(); + + mutex_enter(&vmp->vm_stack_lock); + + /* + * send a pointer to our parameter struct to the worker thread's + * vmem_alloc_update_lowest_cb()'s param1. + */ + boolean_t tc_already_pending = + thread_call_enter1(vmp->vm_stack_call_thread, &cb); + + /* in DEBUG, bleat if worker thread was already working */ + ASSERT0(tc_already_pending); + + /* + * wait until the worker thread sets a nonzero in our + * cb.r_cb. Other threads doing this vmem_alloc() on this + * vmem arena may also be causing the worker function + * to emit cv_broadcasts, but we must not progress from + * here until *our* work has been done. 
+ */ + for (;;) { + cv_wait(&vmp->vm_stack_cv, &vmp->vm_stack_lock); + if (cb.r_cb != 0) + break; + } + mutex_exit(&vmp->vm_stack_lock); + + ASSERT3P(cb.r_alloc, !=, NULL); + + return (cb.r_alloc); +} + +/* + * The guts of vmem_alloc() + */ +static inline void * +wrapped_vmem_alloc(vmem_t *vmp, size_t size, int vmflag) { vmem_seg_t *vsp; uintptr_t addr; @@ -2095,6 +2263,19 @@ vmem_create_common(const char *name, void *base, size_t size, size_t quantum, return (NULL); } + /* set up thread call */ + mutex_init(&vmp->vm_stack_lock, "lock for thread call", + MUTEX_DEFAULT, NULL); + cv_init(&vmp->vm_stack_cv, NULL, CV_DEFAULT, NULL); + vmp->vm_stack_call_thread = thread_call_allocate_with_options( + (thread_call_func_t)vmem_alloc_update_lowest_cb, + (thread_call_param_t)vmp, + THREAD_CALL_PRIORITY_KERNEL, + 0); + + printf("SPL: %s:%d: setup of %s done\n", + __func__, __LINE__, vmp->vm_name); + return (vmp); } @@ -2134,6 +2315,21 @@ vmem_destroy(vmem_t *vmp) vmem_seg_t *vsp, *anext; size_t leaked; + /* check for possible async stack calls */ + + const boolean_t ret_thread_call_cancel = + thread_call_cancel(vmp->vm_stack_call_thread); + ASSERT0(ret_thread_call_cancel); + + /* tear down async stack call mechanisms */ + + const boolean_t ret_thread_call_free = + thread_call_free(vmp->vm_stack_call_thread); + ASSERT0(!ret_thread_call_free); + + mutex_destroy(&vmp->vm_stack_lock); + cv_destroy(&vmp->vm_stack_cv); + /* * set vm_nsegfree to zero because vmem_free_span_list * would have already freed vm_segfree. 
@@ -2369,7 +2565,7 @@ vmem_bucket_arena_by_size(size_t size) return (vmem_bucket_arena[bucket]); } -vmem_t * +inline vmem_t * spl_vmem_bucket_arena_by_size(size_t size) { return (vmem_bucket_arena_by_size(size)); @@ -3120,7 +3316,7 @@ vmem_bucket_arena_used(int bucket) } -int64_t +inline int64_t vmem_buckets_size(int typemask) { int64_t total_size = 0; @@ -3139,7 +3335,7 @@ vmem_buckets_size(int typemask) return ((size_t)total_size); } -static uint64_t +static inline uint64_t spl_validate_bucket_span_size(uint64_t val) { if (!ISP2(val)) { @@ -3244,14 +3440,14 @@ spl_set_bucket_tunable_small_span(uint64_t size) spl_printf_bucket_span_sizes(); } -static void * +static inline void * spl_vmem_default_alloc(vmem_t *vmp, size_t size, int vmflags) { extern void *osif_malloc(uint64_t); return (osif_malloc(size)); } -static void +static inline void spl_vmem_default_free(vmem_t *vmp, void *vaddr, size_t size) { extern void osif_free(void *, uint64_t);