diff --git a/include/os/macos/spl/sys/vmem_impl.h b/include/os/macos/spl/sys/vmem_impl.h index 233a6b33a..840c7a746 100644 --- a/include/os/macos/spl/sys/vmem_impl.h +++ b/include/os/macos/spl/sys/vmem_impl.h @@ -115,8 +115,14 @@ typedef struct vmem_kstat { kstat_named_t vk_threads_waiting; /* threads in cv_wait in vmem */ /* allocator function */ kstat_named_t vk_excess; /* count of retained excess imports */ + kstat_named_t vk_lowest_stack; /* least remaining stack seen */ + kstat_named_t vk_async_stack_calls; /* times allocated off-thread */ } vmem_kstat_t; + +/* forward declaration of opaque xnu struct */ +typedef struct thread_call *thread_call_t; + struct vmem { char vm_name[VMEM_NAMELEN]; /* arena name */ kcondvar_t vm_cv; /* cv for blocking allocations */ @@ -146,6 +152,9 @@ struct vmem { void *vm_qcache[VMEM_NQCACHE_MAX]; /* quantum caches */ vmem_freelist_t vm_freelist[VMEM_FREELISTS + 1]; /* power-of-2 flists */ vmem_kstat_t vm_kstat; /* kstat data */ + thread_call_t vm_stack_call_thread; + kmutex_t vm_stack_lock; + kcondvar_t vm_stack_cv; }; #ifdef __cplusplus diff --git a/module/os/macos/spl/spl-kmem.c b/module/os/macos/spl/spl-kmem.c index 1400b3fbc..9512c00c1 100644 --- a/module/os/macos/spl/spl-kmem.c +++ b/module/os/macos/spl/spl-kmem.c @@ -24,7 +24,7 @@ * Copyright (C) 2008 MacZFS * Copyright (C) 2013, 2020 Jorgen Lundman * Copyright (C) 2014 Brendon Humphrey - * Copyright (C) 2017 Sean Doran + * Copyright (C) 2017, 2021 Sean Doran * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 
* */ @@ -510,6 +510,9 @@ uint64_t spl_arc_reclaim_avoided = 0; uint64_t kmem_free_to_slab_when_fragmented = 0; +extern _Atomic unsigned int spl_lowest_stack_remaining; +extern unsigned int spl_vmem_split_stack_below; + typedef struct spl_stats { kstat_named_t spl_os_alloc; kstat_named_t spl_active_threads; @@ -574,6 +577,8 @@ typedef struct spl_stats { kstat_named_t spl_vm_pages_reclaimed; kstat_named_t spl_vm_pages_wanted; kstat_named_t spl_vm_pressure_level; + kstat_named_t spl_lowest_stack_remaining; + kstat_named_t spl_vmem_split_stack_below; } spl_stats_t; static spl_stats_t spl_stats = { @@ -640,6 +645,8 @@ static spl_stats_t spl_stats = { {"spl_vm_pages_reclaimed", KSTAT_DATA_UINT64}, {"spl_vm_pages_wanted", KSTAT_DATA_UINT64}, {"spl_vm_pressure_level", KSTAT_DATA_UINT64}, + {"lowest_stack_remaining", KSTAT_DATA_UINT64}, + {"split_stack_below", KSTAT_DATA_UINT64}, }; static kstat_t *spl_ksp = 0; @@ -4422,7 +4429,6 @@ static void spl_free_thread() { callb_cpr_t cpr; - uint64_t last_update = zfs_lbolt(); int64_t last_spl_free; CALLB_CPR_INIT(&cpr, &spl_free_thread_lock, callb_generic_cpr, FTAG); @@ -4827,8 +4833,6 @@ spl_free_thread() new_spl_free = -1024LL; } - double delta = (double)new_spl_free - (double)last_spl_free; - boolean_t spl_free_is_negative = false; if (new_spl_free < 0LL) { @@ -4948,6 +4952,13 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->kmem_free_to_slab_when_fragmented.value.ui64; } + if ((unsigned int) ks->spl_vmem_split_stack_below.value.ui64 != + spl_vmem_split_stack_below) { + spl_vmem_split_stack_below = + (unsigned int) + ks->spl_vmem_split_stack_below.value.ui64; + } + } else { ks->spl_os_alloc.value.ui64 = segkmem_total_mem_allocated; ks->spl_active_threads.value.ui64 = zfs_threads; @@ -5036,6 +5047,10 @@ spl_kstat_update(kstat_t *ksp, int rw) ks->spl_vm_pressure_level.value.ui64 = spl_vm_pressure_level; + ks->spl_lowest_stack_remaining.value.ui64 = + spl_lowest_stack_remaining; + ks->spl_vmem_split_stack_below.value.ui64 = + 
spl_vmem_split_stack_below; } return (0); diff --git a/module/os/macos/spl/spl-vmem.c b/module/os/macos/spl/spl-vmem.c index ecaf5b23f..505d78b10 100644 --- a/module/os/macos/spl/spl-vmem.c +++ b/module/os/macos/spl/spl-vmem.c @@ -26,7 +26,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2017 Sean Doran + * Copyright (c) 2017, 2021 by Sean Doran */ /* @@ -221,6 +221,9 @@ #include #include #include +#include +#include +#include #define VMEM_INITIAL 21 /* early vmem arenas */ #define VMEM_SEG_INITIAL 800 @@ -360,7 +363,9 @@ static vmem_kstat_t vmem_kstat_template = { { "parent_alloc", KSTAT_DATA_UINT64 }, { "parent_free", KSTAT_DATA_UINT64 }, { "threads_waiting", KSTAT_DATA_UINT64 }, - { "excess", KSTAT_DATA_UINT64 }, + { "excess", KSTAT_DATA_UINT64 }, + { "lowest_stack", KSTAT_DATA_UINT64 }, + { "async_stack_calls", KSTAT_DATA_UINT64 }, }; @@ -449,7 +454,7 @@ extern void IOSleep(unsigned milliseconds); /* * Get a vmem_seg_t from the global segfree list. */ -static vmem_seg_t * +static inline vmem_seg_t * vmem_getseg_global(void) { vmem_seg_t *vsp; @@ -468,7 +473,7 @@ vmem_getseg_global(void) /* * Put a vmem_seg_t on the global segfree list. */ -static void +static inline void vmem_putseg_global(vmem_seg_t *vsp) { mutex_enter(&vmem_segfree_lock); @@ -480,7 +485,7 @@ vmem_putseg_global(vmem_seg_t *vsp) /* * Get a vmem_seg_t from vmp's segfree list. */ -static vmem_seg_t * +static inline vmem_seg_t * vmem_getseg(vmem_t *vmp) { vmem_seg_t *vsp; @@ -497,7 +502,7 @@ vmem_getseg(vmem_t *vmp) /* * Put a vmem_seg_t on vmp's segfree list. */ -static void +static inline void vmem_putseg(vmem_t *vmp, vmem_seg_t *vsp) { vsp->vs_knext = vmp->vm_segfree; @@ -823,7 +828,7 @@ vmem_seg_create(vmem_t *vmp, vmem_seg_t *vprev, uintptr_t start, uintptr_t end) /* * Remove segment vsp from the arena. 
*/ -static void +static inline void vmem_seg_destroy(vmem_t *vmp, vmem_seg_t *vsp) { ASSERT(vsp->vs_type != VMEM_ROTOR); @@ -957,7 +962,7 @@ vmem_seg_alloc(vmem_t *vmp, vmem_seg_t *vsp, uintptr_t addr, size_t size) * Returns 1 if we are populating, 0 otherwise. * Call it if we want to prevent recursion from HAT. */ -int +inline int vmem_is_populator() { return (mutex_owner(&vmem_sleep_lock) == curthread || @@ -1238,7 +1243,7 @@ vmem_nextfit_alloc(vmem_t *vmp, size_t size, int vmflag) * Used to decide if a newly imported span is superfluous after re-acquiring * the arena lock. */ -static int +static inline int vmem_canalloc(vmem_t *vmp, size_t size) { int hb; @@ -1257,7 +1262,7 @@ vmem_canalloc(vmem_t *vmp, size_t size) // allocation ability when not holding the lock. // These are unreliable because vmp->vm_freemap is // liable to change immediately after being examined. -int +inline int vmem_canalloc_lock(vmem_t *vmp, size_t size) { mutex_enter(&vmp->vm_lock); @@ -1284,7 +1289,7 @@ vmem_canalloc_atomic(vmem_t *vmp, size_t size) return (flist); } -static inline uint64_t +uint64_t spl_vmem_xnu_useful_bytes_free(void) { extern _Atomic uint32_t spl_vm_pages_reclaimed; @@ -1312,7 +1317,7 @@ vmem_xnu_useful_bytes_free(void) } -static void * +static inline void * spl_vmem_malloc_unconditionally_unlocked(size_t size) { extern void *osif_malloc(uint64_t); @@ -1321,7 +1326,7 @@ spl_vmem_malloc_unconditionally_unlocked(size_t size) return (osif_malloc(size)); } -static void * +static inline void * spl_vmem_malloc_unconditionally(size_t size) { mutex_enter(&vmem_xnu_alloc_lock); @@ -1330,7 +1335,7 @@ spl_vmem_malloc_unconditionally(size_t size) return (m); } -static void * +static inline void * spl_vmem_malloc_if_no_pressure(size_t size) { // The mutex serializes concurrent callers, providing time for @@ -1358,7 +1363,7 @@ spl_vmem_malloc_if_no_pressure(size_t size) * resulting segment [addr, addr + size) is a subset of [minaddr, maxaddr) * that does not straddle a 
nocross-aligned boundary. */ -void * +inline void * vmem_xalloc(vmem_t *vmp, size_t size, size_t align_arg, size_t phase, size_t nocross, void *minaddr, void *maxaddr, int vmflag) { @@ -1726,14 +1731,177 @@ vmem_xfree(vmem_t *vmp, void *vaddr, size_t size) } /* + * vmem_alloc() and auxiliary functions : + * * Allocate size bytes from arena vmp. Returns the allocated address * on success, NULL on failure. vmflag specifies VM_SLEEP or VM_NOSLEEP, * and may also specify best-fit, first-fit, or next-fit allocation policy * instead of the default instant-fit policy. VM_SLEEP allocations are * guaranteed to succeed. */ + +/* + * If there is less space on the kernel stack than + * (dynamically tunable) spl_vmem_split_stack_below + * then perform the vmem_alloc in the thread_call + * function + */ +unsigned int spl_vmem_split_stack_below = 8192; + +/* kstat tracking the global minimum free stack space */ +_Atomic unsigned int spl_lowest_stack_remaining = UINT_MAX; + +/* forward decls */ +static inline void *wrapped_vmem_alloc(vmem_t *, size_t, int); +static void *vmem_alloc_in_worker_thread(vmem_t *, size_t, int); + +/* + * unwrapped vmem_alloc() : + * Examine stack remaining; if it is less than our split stack below + * threshold, or (for code coverage early near kext load time) is less than + * the lowest we have seen call out to a worker thread that will + * perform the wrapped_vmem_alloc() and update stat counters. + */ void * vmem_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + const vm_offset_t r = OSKernelStackRemaining(); + + if (vmp->vm_kstat.vk_lowest_stack.value.ui64 == 0) { + vmp->vm_kstat.vk_lowest_stack.value.ui64 = r; + } else if (vmp->vm_kstat.vk_lowest_stack.value.ui64 > r) { + vmp->vm_kstat.vk_lowest_stack.value.ui64 = r; + } + + if (vmem_is_populator()) { + /* + * Current thread holds one of the vmem locks and the worker + * thread invoked in vmem_alloc_in_worker_thread() would + * therefore deadlock. 
vmem_populate on a vmem cache is an + * early (and rare) operation and typically does not descend below + * the vmem source. + */ + return (wrapped_vmem_alloc(vmp, size, vmflag)); + } + + if (r < spl_lowest_stack_remaining || + r < spl_vmem_split_stack_below) { + return (vmem_alloc_in_worker_thread(vmp, size, vmflag)); + } + + return (wrapped_vmem_alloc(vmp, size, vmflag)); +} + +/* parameters passed between thread_call threads */ +typedef struct cb_params { + vmem_t *vmp; + size_t size; + int vmflag; + vm_offset_t r_parent; + vm_offset_t r_cb; + void *r_alloc; +} cb_params_t; + +/* + * Executes in a kernel worker thread, which will start with an essentially + * empty stack. The stack above the immediate client of the vmem_alloc() that + * has thread_call_enter1()-ed this function is already over a depth threshold. + */ +static void +vmem_alloc_update_lowest_cb(thread_call_param_t param0, + thread_call_param_t param1) +{ + + /* param 0 is a vmp, set in vmem_create() */ + /* param 1 is the in-params */ + + vmem_t *vmp0 = (vmem_t *)param0; + + /* synchronize param1 and make sure vmp identity */ + mutex_enter(&vmp0->vm_stack_lock); + cb_params_t *cbp = (cb_params_t *)param1; + vmem_t *vmp = cbp->vmp; + mutex_exit(&vmp0->vm_stack_lock); + + VERIFY3P(vmp0, ==, vmp); + + dprintf("SPL: %s:%d got vm_name %s, alloc size %lu, " + "parent depth %lu, our depth %lu\n", + __func__, __LINE__, vmp->vm_name, cbp->size, + cbp->r_parent, OSKernelStackRemaining()); + + atomic_inc_64(&vmp->vm_kstat.vk_async_stack_calls.value.ui64); + + spl_lowest_stack_remaining = cbp->r_parent; + + cbp->r_alloc = wrapped_vmem_alloc(cbp->vmp, + cbp->size, cbp->vmflag); + + mutex_enter(&vmp->vm_stack_lock); + /* + * There can be other cv_broadcast() callers + * and other cv_waiters() in different threads intercepting + * them, so the (arbitrary) nonzero value MUST be visible + * in our cbp->r_cb with immediate sequential consistency, + * or our calling thread may hang. 
+ */ + __atomic_store_n(&cbp->r_cb, + MAX(OSKernelStackRemaining(), 1), + __ATOMIC_SEQ_CST); + cv_broadcast(&vmp->vm_stack_cv); + mutex_exit(&vmp->vm_stack_lock); +} + +/* + * Set up parameters and thread_call_enter1() to send them to a worker thread + * executing vmem_alloc_update_lowest_cb(). Wait for the worker thread + * to set r_cb to nonzero. + */ +void * +vmem_alloc_in_worker_thread(vmem_t *vmp, size_t size, int vmflag) +{ + cb_params_t cb = { 0 }; + cb.vmp = vmp; + cb.size = size; + cb.vmflag = vmflag; + cb.r_parent = OSKernelStackRemaining(); + + mutex_enter(&vmp->vm_stack_lock); + + /* + * send a pointer to our parameter struct to the worker thread's + * vmem_alloc_update_lowest_cb()'s param1. + */ + boolean_t tc_already_pending = + thread_call_enter1(vmp->vm_stack_call_thread, &cb); + + /* in DEBUG, bleat if worker thread was already working */ + ASSERT0(tc_already_pending); + + /* + * wait until the worker thread sets a nonzero in our + * cb.r_cb. Other threads doing this vmem_alloc() on this + * vmem arena may also be causing the worker function + * to emit cv_broadcasts, but we must not progress from + * here until *our* work has been done. 
+ */ + for (;;) { + cv_wait(&vmp->vm_stack_cv, &vmp->vm_stack_lock); + if (cb.r_cb != 0) + break; + } + mutex_exit(&vmp->vm_stack_lock); + + ASSERT3P(cb.r_alloc, !=, NULL); + + return (cb.r_alloc); +} + +/* + * The guts of vmem_alloc() + */ +static inline void * +wrapped_vmem_alloc(vmem_t *vmp, size_t size, int vmflag) { vmem_seg_t *vsp; uintptr_t addr; @@ -2095,6 +2263,19 @@ vmem_create_common(const char *name, void *base, size_t size, size_t quantum, return (NULL); } + /* set up thread call */ + mutex_init(&vmp->vm_stack_lock, "lock for thread call", + MUTEX_DEFAULT, NULL); + cv_init(&vmp->vm_stack_cv, NULL, CV_DEFAULT, NULL); + vmp->vm_stack_call_thread = thread_call_allocate_with_options( + (thread_call_func_t)vmem_alloc_update_lowest_cb, + (thread_call_param_t)vmp, + THREAD_CALL_PRIORITY_KERNEL, + 0); + + printf("SPL: %s:%d: setup of %s done\n", + __func__, __LINE__, vmp->vm_name); + return (vmp); } @@ -2134,6 +2315,21 @@ vmem_destroy(vmem_t *vmp) vmem_seg_t *vsp, *anext; size_t leaked; + /* check for possible async stack calls */ + + const boolean_t ret_thread_call_cancel = + thread_call_cancel(vmp->vm_stack_call_thread); + ASSERT0(ret_thread_call_cancel); + + /* tear down async stack call mechanisms */ + + const boolean_t ret_thread_call_free = + thread_call_free(vmp->vm_stack_call_thread); + ASSERT0(!ret_thread_call_free); + + mutex_destroy(&vmp->vm_stack_lock); + cv_destroy(&vmp->vm_stack_cv); + /* * set vm_nsegfree to zero because vmem_free_span_list * would have already freed vm_segfree. 
@@ -2369,7 +2565,7 @@ vmem_bucket_arena_by_size(size_t size) return (vmem_bucket_arena[bucket]); } -vmem_t * +inline vmem_t * spl_vmem_bucket_arena_by_size(size_t size) { return (vmem_bucket_arena_by_size(size)); @@ -3120,7 +3316,7 @@ vmem_bucket_arena_used(int bucket) } -int64_t +inline int64_t vmem_buckets_size(int typemask) { int64_t total_size = 0; @@ -3139,7 +3335,7 @@ vmem_buckets_size(int typemask) return ((size_t)total_size); } -static uint64_t +static inline uint64_t spl_validate_bucket_span_size(uint64_t val) { if (!ISP2(val)) { @@ -3244,14 +3440,14 @@ spl_set_bucket_tunable_small_span(uint64_t size) spl_printf_bucket_span_sizes(); } -static void * +static inline void * spl_vmem_default_alloc(vmem_t *vmp, size_t size, int vmflags) { extern void *osif_malloc(uint64_t); return (osif_malloc(size)); } -static void +static inline void spl_vmem_default_free(vmem_t *vmp, void *vaddr, size_t size) { extern void osif_free(void *, uint64_t);