Skip to content

Commit

Permalink
Linux 6.9: Call add_disk() from workqueue to fix zfs_allow_010_pos
Browse files Browse the repository at this point in the history
The 6.9 kernel behaves differently in how it releases block devices.  In
the common case it will async release the device only after the return
to userspace.  This is different from the 6.8 and older kernels which
release the block devices synchronously.  To get around this, call
add_disk() from a workqueue so that the kernel uses a different
codepath to release our zvols in the way we expect.  This stops
zfs_allow_010_pos from hanging.

Fixes: #16089
Signed-off-by: Tony Hutter <[email protected]>
  • Loading branch information
tonyhutter committed Jun 18, 2024
1 parent c98295e commit 3ea02f3
Showing 1 changed file with 97 additions and 5 deletions.
102 changes: 97 additions & 5 deletions module/os/linux/zfs/zvol_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>

#ifdef HAVE_BLK_MQ
#include <linux/blk-mq.h>
Expand Down Expand Up @@ -1338,6 +1339,101 @@ zvol_wait_close(zvol_state_t *zv)
{
}

/*
 * Context for running add_disk() from a workqueue.  The submitter fills in
 * the disk, queues the embedded work item, and reads back the result once
 * the work item has run.
 */
struct add_disk_work {
/* Embedded work item; the handler recovers this struct via container_of() */
struct delayed_work work;
/* Disk to be passed to add_disk() by the work handler */
struct gendisk *disk;
/* Result of __zvol_os_add_disk(), written by the work handler */
int error;
};

/*
 * Thin compatibility wrapper around add_disk().
 *
 * When HAVE_ADD_DISK_RET is defined, add_disk() returns a status and that
 * status is passed straight back to the caller.  Otherwise add_disk()'s
 * result is not consumed and this wrapper always reports success (0).
 */
static int
__zvol_os_add_disk(struct gendisk *disk)
{
#ifdef HAVE_ADD_DISK_RET
	return (add_disk(disk));
#else
	add_disk(disk);
	return (0);
#endif
}

#if defined(HAVE_BDEV_OPEN_BY_PATH)
/*
 * Work handler used by zvol_os_add_disk().
 *
 * 'work' is the work_struct embedded (via the delayed_work) in a
 * struct add_disk_work.  Recover the enclosing context, call add_disk() on
 * the disk it names, and stash the result in the context's 'error' field
 * for the submitter to pick up.
 */
static void
zvol_os_add_disk_work(struct work_struct *work)
{
	struct add_disk_work *ctx =
	    container_of(work, struct add_disk_work, work.work);

	ctx->error = __zvol_os_add_disk(ctx->disk);
}
#endif

/*
* SPECIAL CASE:
*
* This function basically calls add_disk() from a workqueue. You may be
* thinking: why not just call add_disk() directly?
*
* When you call add_disk(), the zvol appears to the world. When this happens,
* the kernel calls disk_scan_partitions() on the zvol, which behaves
* differently on the 6.9+ kernels:
*
* - 6.8 and older kernels -
* disk_scan_partitions()
* handle = bdev_open_by_dev(
* zvol_open()
* bdev_release(handle);
* zvol_release()
*
*
* - 6.9+ kernels -
* disk_scan_partitions()
* file = bdev_file_open_by_dev()
* zvol_open()
* fput(file)
* < wait for return to userspace >
* zvol_release()
*
* The difference is that the bdev_release() from the 6.8 kernel is synchronous
* while the fput() from the 6.9 kernel is async. Or more specifically it's
* async that has to wait until we return to userspace (since it adds the fput
* into the caller's work queue with the TWA_RESUME flag set). This is not the
* behavior we want, since we want to do things like create+destroy a zvol
* within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to
* release the reference to the zvol while we're in the IOCTL, which can't
* wait until we return to userspace.
*
* We can get around this since fput() has a special codepath for when it's
* running in a kernel thread or interrupt. In those cases, it just puts the
* fput into the system workqueue, which we can force to run with
* __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
* runs from a kernel thread and "tricks" the fput() codepaths.
*
* Note that __flush_workqueue() is slowly getting deprecated. This may be ok
* though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
* fput) to happen, which it eventually, naturally, will from the system_wq
* without us explicitly calling __flush_workqueue().
*/
/*
 * Register 'disk' with the kernel and return the add_disk() status.
 *
 * Without HAVE_BDEV_OPEN_BY_PATH (<= 6.8 kernels) this simply calls
 * __zvol_os_add_disk() in the caller's context.
 *
 * With HAVE_BDEV_OPEN_BY_PATH (6.9+ kernels) the call is handed to a
 * work item so that add_disk() executes on a workqueue rather than in the
 * calling task.  The function waits for that work item to finish and then
 * flushes system_wq before returning the status recorded by the handler.
 */
static int
zvol_os_add_disk(struct gendisk *disk)
{
#if !defined(HAVE_BDEV_OPEN_BY_PATH)	/* <= 6.8 kernel */
	return (__zvol_os_add_disk(disk));
#else	/* 6.9+ kernel */
	struct add_disk_work ctx;

	ctx.disk = disk;
	ctx.error = 0;
	INIT_DELAYED_WORK(&ctx.work, zvol_os_add_disk_work);

	/*
	 * Use *_delayed_work functions since they're not GPL'd.  A delay of
	 * zero is requested, and the flush blocks until the handler has run,
	 * so 'ctx' (which lives on this stack frame) stays valid throughout.
	 */
	schedule_delayed_work(&ctx.work, 0);
	flush_delayed_work(&ctx.work);

	/* Push through anything the handler left queued on system_wq */
	__flush_workqueue(system_wq);

	return (ctx.error);
#endif
}

/*
* Create a block device minor node and setup the linkage between it
* and the specified volume. Once this function returns the block
Expand Down Expand Up @@ -1549,11 +1645,7 @@ zvol_os_create_minor(const char *name)
rw_enter(&zvol_state_lock, RW_WRITER);
zvol_insert(zv);
rw_exit(&zvol_state_lock);
#ifdef HAVE_ADD_DISK_RET
error = add_disk(zv->zv_zso->zvo_disk);
#else
add_disk(zv->zv_zso->zvo_disk);
#endif
error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
} else {
ida_simple_remove(&zvol_ida, idx);
}
Expand Down

0 comments on commit 3ea02f3

Please sign in to comment.