From 26e6057674b670bf18a21585d053acb017acc90c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:49 +0200 Subject: [PATCH 01/18] arm/bL_switcher: Use kthread_run_on_cpu() Use the proper API instead of open coding it. Reviewed-by: Dave Martin Acked-by: Nicolas Pitre Signed-off-by: Frederic Weisbecker --- arch/arm/common/bL_switcher.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 9a9aa53547a6b0..d1e82a318e3b22 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -307,13 +307,11 @@ static struct task_struct *bL_switcher_thread_create(int cpu, void *arg) { struct task_struct *task; - task = kthread_create_on_node(bL_switcher_thread, arg, - cpu_to_node(cpu), "kswitcher_%d", cpu); - if (!IS_ERR(task)) { - kthread_bind(task, cpu); - wake_up_process(task); - } else + task = kthread_run_on_cpu(bL_switcher_thread, arg, + cpu, "kswitcher_%d"); + if (IS_ERR(task)) pr_err("%s failed for CPU %d\n", __func__, cpu); + return task; } From e5c10da9b19c70efd4462bd4181fe84ff528730b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:52 +0200 Subject: [PATCH 02/18] scsi: bnx2fc: Use kthread_create_on_cpu() Use the proper API instead of open coding it. However it looks like bnx2fc_percpu_io_thread() kthread could be replaced by the use of a high prio workqueue instead. Signed-off-by: Frederic Weisbecker --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index f49783b89d04f4..36126030e76d58 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -2610,14 +2610,11 @@ static int bnx2fc_cpu_online(unsigned int cpu) p = &per_cpu(bnx2fc_percpu, cpu); - thread = kthread_create_on_node(bnx2fc_percpu_io_thread, - (void *)p, cpu_to_node(cpu), - "bnx2fc_thread/%d", cpu); + thread = kthread_create_on_cpu(bnx2fc_percpu_io_thread, + (void *)p, cpu, "bnx2fc_thread/%d"); if (IS_ERR(thread)) return PTR_ERR(thread); - /* bind thread to the cpu */ - kthread_bind(thread, cpu); p->iothread = thread; wake_up_process(thread); return 0; From 00f6810a81a374b4acbbd40fbc69f01ec6bf9941 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:53 +0200 Subject: [PATCH 03/18] scsi: bnx2i: Use kthread_create_on_cpu() Use the proper API instead of open coding it. However it looks like bnx2i_percpu_io_thread() kthread could be replaced by the use of a high prio workqueue instead. Signed-off-by: Frederic Weisbecker --- drivers/scsi/bnx2i/bnx2i_init.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/bnx2i/bnx2i_init.c b/drivers/scsi/bnx2i/bnx2i_init.c index 872ad37e2a6eaf..cecc3a0267629c 100644 --- a/drivers/scsi/bnx2i/bnx2i_init.c +++ b/drivers/scsi/bnx2i/bnx2i_init.c @@ -415,14 +415,11 @@ static int bnx2i_cpu_online(unsigned int cpu) p = &per_cpu(bnx2i_percpu, cpu); - thread = kthread_create_on_node(bnx2i_percpu_io_thread, (void *)p, - cpu_to_node(cpu), - "bnx2i_thread/%d", cpu); + thread = kthread_create_on_cpu(bnx2i_percpu_io_thread, (void *)p, + cpu, "bnx2i_thread/%d"); if (IS_ERR(thread)) return PTR_ERR(thread); - /* bind thread to the cpu */ - kthread_bind(thread, cpu); p->iothread = thread; wake_up_process(thread); return 0; From 4f770f93002861a5ab13c4278f1a85f3a15c815e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:54 +0200 Subject: [PATCH 04/18] scsi: qedi: Use kthread_create_on_cpu() Use the proper API instead of open coding it. However it looks like qedi_percpu_io_thread() kthread could be replaced by the use of a high prio workqueue instead. Signed-off-by: Frederic Weisbecker --- drivers/scsi/qedi/qedi_main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index 628d59dda20cc4..f2bb490196170d 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -1961,13 +1961,11 @@ static int qedi_cpu_online(unsigned int cpu) struct qedi_percpu_s *p = this_cpu_ptr(&qedi_percpu); struct task_struct *thread; - thread = kthread_create_on_node(qedi_percpu_io_thread, (void *)p, - cpu_to_node(cpu), - "qedi_thread/%d", cpu); + thread = kthread_create_on_cpu(qedi_percpu_io_thread, (void *)p, + cpu, "qedi_thread/%d"); if (IS_ERR(thread)) return PTR_ERR(thread); - kthread_bind(thread, cpu); p->iothread = thread; wake_up_process(thread); return 0; From b61ac4b022157563092d24ec9936682f6b1fd3c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:55 +0200 Subject: [PATCH 05/18] soc/qman: test: Use kthread_run_on_cpu() Use the proper API instead of open coding it. However it looks like kthreads here could be replaced by the use of a per-cpu workqueue instead. Signed-off-by: Frederic Weisbecker --- drivers/soc/fsl/qbman/qman_test_stash.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/soc/fsl/qbman/qman_test_stash.c b/drivers/soc/fsl/qbman/qman_test_stash.c index b7e8e5ec884c6c..f4d3c2146f4f0f 100644 --- a/drivers/soc/fsl/qbman/qman_test_stash.c +++ b/drivers/soc/fsl/qbman/qman_test_stash.c @@ -108,14 +108,12 @@ static int on_all_cpus(int (*fn)(void)) .fn = fn, .started = ATOMIC_INIT(0) }; - struct task_struct *k = kthread_create(bstrap_fn, &bstrap, - "hotpotato%d", cpu); + struct task_struct *k = kthread_run_on_cpu(bstrap_fn, &bstrap, + cpu, "hotpotato%d"); int ret; if (IS_ERR(k)) return -ENOMEM; - kthread_bind(k, cpu); - wake_up_process(k); /* * If we call kthread_stop() before the "wake up" has had an * effect, then the thread may exit with -EINTR without ever From 1e47d952f01b4d33c3f3ac037004238b39d7686a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:56 +0200 Subject: [PATCH 06/18] kallsyms: Use kthread_run_on_cpu() Use the proper API instead of open coding it. Signed-off-by: Frederic Weisbecker --- kernel/kallsyms_selftest.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c index 873f7c445488c1..cf4af5728307b3 100644 --- a/kernel/kallsyms_selftest.c +++ b/kernel/kallsyms_selftest.c @@ -435,13 +435,11 @@ static int __init kallsyms_test_init(void) { struct task_struct *t; - t = kthread_create(test_entry, NULL, "kallsyms_test"); + t = kthread_run_on_cpu(test_entry, NULL, 0, "kallsyms_test"); if (IS_ERR(t)) { pr_info("Create kallsyms selftest task failed\n"); return PTR_ERR(t); } - kthread_bind(t, 0); - wake_up_process(t); return 0; } From be70893bff33a8914f3fc93c034efe1828073b99 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:57 +0200 Subject: [PATCH 07/18] lib: test_objpool: Use kthread_run_on_cpu() Use the proper API instead of open coding it. Reviewed-by: Matt Wu Signed-off-by: Frederic Weisbecker --- lib/test_objpool.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/lib/test_objpool.c b/lib/test_objpool.c index 5a3f6961a70fac..896c0131c9a8c3 100644 --- a/lib/test_objpool.c +++ b/lib/test_objpool.c @@ -371,14 +371,10 @@ static int ot_start_sync(struct ot_test *test) if (!cpu_online(cpu)) continue; - work = kthread_create_on_node(ot_thread_worker, item, - cpu_to_node(cpu), "ot_worker_%d", cpu); - if (IS_ERR(work)) { + work = kthread_run_on_cpu(ot_thread_worker, item, + cpu, "ot_worker_%d"); + if (IS_ERR(work)) pr_err("failed to create thread for cpu %d\n", cpu); - } else { - kthread_bind(work, cpu); - wake_up_process(work); - } } /* wait a while to make sure all threads waiting at start line */ @@ -562,14 +558,9 @@ static int ot_start_async(struct ot_test *test) if (!cpu_online(cpu)) continue; - work = kthread_create_on_node(ot_thread_worker, item, - cpu_to_node(cpu), "ot_worker_%d", cpu); - if (IS_ERR(work)) { + work = kthread_run_on_cpu(ot_thread_worker, item, cpu, "ot_worker_%d"); + if (IS_ERR(work)) pr_err("failed to create thread for cpu %d\n", cpu); - } else { - kthread_bind(work, cpu); - wake_up_process(work); - } } /* wait a while to make sure all threads waiting at start line */ From 9b0379cd587a1977fe4b0a6589562ecd02fee87b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 25 Oct 2024 00:12:59 +0200 Subject: [PATCH 08/18] arm64: Exclude nohz_full CPUs from 32bits el0 support Nohz full CPUs are not a desirable fallback target to run 32bits el0 applications. If present, prefer a set of housekeeping CPUs that can do the job instead. Otherwise just don't support el0 32 bits. Should the need arise, appropriate support can be introduced in the future. Suggested-by: Will Deacon Signed-off-by: Frederic Weisbecker --- Documentation/arch/arm64/asymmetric-32bit.rst | 9 +++++++++ arch/arm64/kernel/cpufeature.c | 6 +++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/asymmetric-32bit.rst b/Documentation/arch/arm64/asymmetric-32bit.rst index 64a0b505da7d80..1b8df970693042 100644 --- a/Documentation/arch/arm64/asymmetric-32bit.rst +++ b/Documentation/arch/arm64/asymmetric-32bit.rst @@ -153,3 +153,12 @@ asymmetric system, a broken guest at EL1 could still attempt to execute mode will return to host userspace with an ``exit_reason`` of ``KVM_EXIT_FAIL_ENTRY`` and will remain non-runnable until successfully re-initialised by a subsequent ``KVM_ARM_VCPU_INIT`` operation. + +NOHZ FULL +--------- + +Nohz full CPUs are not a desirable fallback target to run 32bits el0 +applications. If present, a set of housekeeping CPUs that can do +the job instead is preferred. Otherwise 32-bit EL0 is not supported. +Should the need arise, appropriate support can be introduced in the +future. diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 6ce71f444ed84f..7ce1b8ab417f08 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -3742,7 +3743,10 @@ static int enable_mismatched_32bit_el0(unsigned int cpu) static int lucky_winner = -1; struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); - bool cpu_32bit = id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0); + bool cpu_32bit = false; + + if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0) && housekeeping_cpu(cpu, HK_TYPE_TICK)) + cpu_32bit = true; if (cpu_32bit) { cpumask_set_cpu(cpu, cpu_32bit_el0_mask); From 8baab05e0de8d0c8b9944f0527b96ec338606529 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:48:59 +0200 Subject: [PATCH 09/18] sched,arm64: Handle CPU isolation on last resort fallback rq selection When a kthread or any other task has an affinity mask that is fully offline or unallowed, the scheduler reaffines the task to all possible CPUs as a last resort. This default decision doesn't mix up very well with nohz_full CPUs that are part of the possible cpumask but don't want to be disturbed by unbound kthreads or even detached pinned user tasks. Make the fallback affinity setting aware of nohz_full. Suggested-by: Michal Hocko Signed-off-by: Frederic Weisbecker --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/include/asm/mmu_context.h | 2 ++ arch/arm64/kernel/cpufeature.c | 11 +++++++++++ include/linux/mmu_context.h | 1 + kernel/sched/core.c | 2 +- 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 8b4e5a3cd24c80..cac5efc836c0a1 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -671,6 +671,7 @@ static inline bool supports_clearbhb(int scope) } const struct cpumask *system_32bit_el0_cpumask(void); +const struct cpumask *fallback_32bit_el0_cpumask(void); DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0); static inline bool system_supports_32bit_el0(void) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 48b3d9553b675c..7883abd6b29a0a 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -283,6 +283,8 @@ task_cpu_possible_mask(struct task_struct *p) } #define task_cpu_possible_mask task_cpu_possible_mask +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p); + void verify_cpu_asid_bits(void); void post_ttbr_update_workaround(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7ce1b8ab417f08..2b7aa32bf4365a 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1642,6 +1642,17 @@ const struct cpumask *system_32bit_el0_cpumask(void) return cpu_possible_mask; } +const struct cpumask *task_cpu_fallback_mask(struct task_struct *p) +{ + if (!static_branch_unlikely(&arm64_mismatched_32bit_el0)) + return housekeeping_cpumask(HK_TYPE_TICK); + + if (!is_compat_thread(task_thread_info(p))) + return housekeeping_cpumask(HK_TYPE_TICK); + + return system_32bit_el0_cpumask(); +} + static int __init parse_32bit_el0_param(char *str) { allow_mismatched_32bit_el0 = true; diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index bbaec80c78c505..ac01dc4eb2cecd 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -24,6 +24,7 @@ static inline void leave_mm(void) { } #ifndef task_cpu_possible_mask # define task_cpu_possible_mask(p) cpu_possible_mask # define task_cpu_possible(cpu, p) true +# define task_cpu_fallback_mask(p) housekeeping_cpumask(HK_TYPE_TICK) #else # define task_cpu_possible(cpu, p) cpumask_test_cpu((cpu), task_cpu_possible_mask(p)) #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95e40895a51909..233b50b0e123ee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3534,7 +3534,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * * More yuck to audit. */ - do_set_cpus_allowed(p, task_cpu_possible_mask(p)); + do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: From 73382d8781e53c5093609ec7b3d4f413a8a5edce Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:00 +0200 Subject: [PATCH 10/18] kthread: Make sure kthread hasn't started while binding it Make sure the kthread is sleeping in the schedule_preempt_disabled() call before calling its handler when kthread_bind[_mask]() is called on it. This provides a sanity check verifying that the task is not randomly blocked later at some point within its function handler, in which case it could be just concurrently awaken, leaving the call to do_set_cpus_allowed() without any effect until the next voluntary sleep. Rely on the wake-up ordering to ensure that the newly introduced "started" field returns the expected value: TASK A TASK B ------ ------ READ kthread->started wake_up_process(B) rq_lock() ... rq_unlock() // RELEASE schedule() rq_lock() // ACQUIRE // schedule task B rq_unlock() WRITE kthread->started Similarly, writing kthread->started before subsequent voluntary sleeps will be visible after calling wait_task_inactive() in __kthread_bind_mask(), reporting potential misuse of the API. Upcoming patches will make further use of this facility. Acked-by: Vlastimil Babka Signed-off-by: Frederic Weisbecker --- kernel/kthread.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/kthread.c b/kernel/kthread.c index a5ac612b160925..b6f9ce475a4fcd 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -53,6 +53,7 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + int started; int result; int (*threadfn)(void *); void *data; @@ -382,6 +383,8 @@ static int kthread(void *_create) schedule_preempt_disabled(); preempt_enable(); + self->started = 1; + ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); @@ -540,7 +543,9 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { + struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); + WARN_ON_ONCE(kthread->started); } /** @@ -554,7 +559,9 @@ void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { + struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); + WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); From 0fea1a6b7afa537a1b6e45ac5898dfa58836b102 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:01 +0200 Subject: [PATCH 11/18] kthread: Default affine kthread to its preferred NUMA node Kthreads attached to a preferred NUMA node for their task structure allocation can also be assumed to run preferrably within that same node. A more precise affinity is usually notified by calling kthread_create_on_cpu() or kthread_bind[_mask]() before the first wakeup. For the others, a default affinity to the node is desired and sometimes implemented with more or less success when it comes to deal with hotplug events and nohz_full / CPU Isolation interactions: - kcompactd is affine to its node and handles hotplug but not CPU Isolation - kswapd is affine to its node and ignores hotplug and CPU Isolation - A bunch of drivers create their kthreads on a specific node and don't take care about affining further. Handle that default node affinity preference at the generic level instead, provided a kthread is created on an actual node and doesn't apply any specific affinity such as a given CPU or a custom cpumask to bind to before its first wake-up. This generic handling is aware of CPU hotplug events and CPU isolation such that: * When a housekeeping CPU goes up that is part of the node of a given kthread, the related task is re-affined to that own node if it was previously running on the default last resort online housekeeping set from other nodes. * When a housekeeping CPU goes down while it was part of the node of a kthread, the running task is migrated (or the sleeping task is woken up) automatically by the scheduler to other housekeepers within the same node or, as a last resort, to all housekeepers from other nodes. Acked-by: Vlastimil Babka Signed-off-by: Frederic Weisbecker --- include/linux/cpuhotplug.h | 1 + kernel/kthread.c | 106 ++++++++++++++++++++++++++++++++++++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index a04b73c40173de..6cc5e484547c1a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -240,6 +240,7 @@ enum cpuhp_state { CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RANDOM_ONLINE, CPUHP_AP_RCUTREE_ONLINE, + CPUHP_AP_KTHREADS_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 40, diff --git a/kernel/kthread.c b/kernel/kthread.c index b6f9ce475a4fcd..26a839288a6028 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; +static LIST_HEAD(kthreads_hotplug); +static DEFINE_MUTEX(kthreads_hotplug_lock); + struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ @@ -53,6 +56,7 @@ struct kthread_create_info struct kthread { unsigned long flags; unsigned int cpu; + unsigned int node; int started; int result; int (*threadfn)(void *); @@ -64,6 +68,8 @@ struct kthread { #endif /* To store the full name if task comm is truncated. */ char *full_name; + struct task_struct *task; + struct list_head hotplug_node; }; enum KTHREAD_BITS { @@ -122,8 +128,11 @@ bool set_kthread_struct(struct task_struct *p) init_completion(&kthread->exited); init_completion(&kthread->parked); + INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; + kthread->task = p; + kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } @@ -314,6 +323,11 @@ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; + if (!list_empty(&kthread->hotplug_node)) { + mutex_lock(&kthreads_hotplug_lock); + list_del(&kthread->hotplug_node); + mutex_unlock(&kthreads_hotplug_lock); + } do_exit(0); } EXPORT_SYMBOL(kthread_exit); @@ -339,6 +353,48 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) } EXPORT_SYMBOL(kthread_complete_and_exit); +static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) +{ + cpumask_and(cpumask, cpumask_of_node(kthread->node), + housekeeping_cpumask(HK_TYPE_KTHREAD)); + + if (cpumask_empty(cpumask)) + cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); +} + +static void kthread_affine_node(void) +{ + struct kthread *kthread = to_kthread(current); + cpumask_var_t affinity; + + WARN_ON_ONCE(kthread_is_per_cpu(current)); + + if (kthread->node == NUMA_NO_NODE) { + housekeeping_affine(current, HK_TYPE_KTHREAD); + } else { + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { + WARN_ON_ONCE(1); + return; + } + + mutex_lock(&kthreads_hotplug_lock); + WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); + list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); + /* + * The node cpumask is racy when read from kthread() but: + * - a racing CPU going down will either fail on the subsequent + * call to set_cpus_allowed_ptr() or be migrated to housekeepers + * afterwards by the scheduler. + * - a racing CPU going up will be handled by kthreads_online_cpu() + */ + kthread_fetch_affinity(kthread, affinity); + set_cpus_allowed_ptr(current, affinity); + mutex_unlock(&kthreads_hotplug_lock); + + free_cpumask_var(affinity); + } +} + static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; @@ -369,7 +425,6 @@ static int kthread(void *_create) * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); @@ -385,6 +440,9 @@ static int kthread(void *_create) self->started = 1; + if (!(current->flags & PF_NO_SETAFFINITY)) + kthread_affine_node(); + ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); @@ -781,6 +839,52 @@ int kthreadd(void *unused) return 0; } +/* + * Re-affine kthreads according to their preferences + * and the newly online CPU. The CPU down part is handled + * by select_fallback_rq() which default re-affines to + * housekeepers in case the preferred affinity doesn't + * apply anymore. + */ +static int kthreads_online_cpu(unsigned int cpu) +{ + cpumask_var_t affinity; + struct kthread *k; + int ret; + + guard(mutex)(&kthreads_hotplug_lock); + + if (list_empty(&kthreads_hotplug)) + return 0; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return -ENOMEM; + + ret = 0; + + list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { + if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || + kthread_is_per_cpu(k->task) || + k->node == NUMA_NO_NODE)) { + ret = -EINVAL; + continue; + } + kthread_fetch_affinity(k, affinity); + set_cpus_allowed_ptr(k->task, affinity); + } + + free_cpumask_var(affinity); + + return ret; +} + +static int kthreads_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", + kthreads_online_cpu, NULL); +} +early_initcall(kthreads_init); + void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) From 33c87b1153742c7439070b0376ca558d8ddb801e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:02 +0200 Subject: [PATCH 12/18] mm: Create/affine kcompactd to its preferred node Kcompactd is dedicated to a specific node. As such it wants to be preferrably affine to it, memory and CPUs-wise. Use the proper kthread API to achieve that. As a bonus it takes care of CPU-hotplug events and CPU-isolation on its behalf. Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Frederic Weisbecker --- mm/compaction.c | 43 +++---------------------------------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a2b16b08cbbff7..a31c0f5758cfa0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -3154,15 +3154,9 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx) static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t *)p; - struct task_struct *tsk = current; long default_timeout = msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC); long timeout = default_timeout; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); - set_freezable(); pgdat->kcompactd_max_order = 0; @@ -3233,10 +3227,12 @@ void __meminit kcompactd_run(int nid) if (pgdat->kcompactd) return; - pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + pgdat->kcompactd = kthread_create_on_node(kcompactd, pgdat, nid, "kcompactd%d", nid); if (IS_ERR(pgdat->kcompactd)) { pr_err("Failed to start kcompactd on node %d\n", nid); pgdat->kcompactd = NULL; + } else { + wake_up_process(pgdat->kcompactd); } } @@ -3254,30 +3250,6 @@ void __meminit kcompactd_stop(int nid) } } -/* - * It's optimal to keep kcompactd on the same CPUs as their memory, but - * not required for correctness. So if the last cpu in a node goes - * away, we get changed to run anywhere: as the first one comes back, - * restore their cpu bindings. - */ -static int kcompactd_cpu_online(unsigned int cpu) -{ - int nid; - - for_each_node_state(nid, N_MEMORY) { - pg_data_t *pgdat = NODE_DATA(nid); - const struct cpumask *mask; - - mask = cpumask_of_node(pgdat->node_id); - - if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) - /* One of our CPUs online: restore mask */ - if (pgdat->kcompactd) - set_cpus_allowed_ptr(pgdat->kcompactd, mask); - } - return 0; -} - static int proc_dointvec_minmax_warn_RT_change(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -3337,15 +3309,6 @@ static struct ctl_table vm_compaction[] = { static int __init kcompactd_init(void) { int nid; - int ret; - - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "mm/compaction:online", - kcompactd_cpu_online, NULL); - if (ret < 0) { - pr_err("kcompactd: failed to register hotplug callbacks.\n"); - return ret; - } for_each_node_state(nid, N_MEMORY) kcompactd_run(nid); From 3956f5ecddc91be58f3fc2f93d788bf0ee0ed591 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:03 +0200 Subject: [PATCH 13/18] mm: Create/affine kswapd to its preferred node kswapd is dedicated to a specific node. As such it wants to be preferrably affine to it, memory and CPUs-wise. Use the proper kthread API to achieve that. As a bonus it takes care of CPU-hotplug events and CPU-isolation on its behalf. Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Frederic Weisbecker --- mm/vmscan.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 76378bc257e383..ec4eab23fb191b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7182,10 +7182,6 @@ static int kswapd(void *p) unsigned int highest_zoneidx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t *)p; struct task_struct *tsk = current; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); - - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(tsk, cpumask); /* * Tell the memory management that we're a "memory allocator", @@ -7354,13 +7350,15 @@ void __meminit kswapd_run(int nid) pgdat_kswapd_lock(pgdat); if (!pgdat->kswapd) { - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); + pgdat->kswapd = kthread_create_on_node(kswapd, pgdat, nid, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ pr_err("Failed to start kswapd on node %d,ret=%ld\n", nid, PTR_ERR(pgdat->kswapd)); BUG_ON(system_state < SYSTEM_RUNNING); pgdat->kswapd = NULL; + } else { + wake_up_process(pgdat->kswapd); } } pgdat_kswapd_unlock(pgdat); From 85e6bdc71aa034baa302857c32dfdb4fa9908177 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:04 +0200 Subject: [PATCH 14/18] kthread: Implement preferred affinity Affining kthreads follow either of four existing different patterns: 1) Per-CPU kthreads must stay affine to a single CPU and never execute relevant code on any other CPU. This is currently handled by smpboot code which takes care of CPU-hotplug operations. 2) Kthreads that _have_ to be affine to a specific set of CPUs and can't run anywhere else. The affinity is set through kthread_bind_mask() and the subsystem takes care by itself to handle CPU-hotplug operations. 3) Kthreads that prefer to be affine to a specific NUMA node. That preferred affinity is applied by default when an actual node ID is passed on kthread creation, provided the kthread is not per-CPU and no call to kthread_bind_mask() has been issued before the first wake-up. 4) Similar to the previous point but kthreads have a preferred affinity different than a node. It is set manually like any other task and CPU-hotplug is supposed to be handled by the relevant subsystem so that the task is properly reaffined whenever a given CPU from the preferred affinity comes up. Also care must be taken so that the preferred affinity doesn't cross housekeeping cpumask boundaries. Provide a function to handle the last usecase, mostly reusing the current node default affinity infrastructure. kthread_affine_preferred() is introduced, to be used just like kthread_bind_mask(), right after kthread creation and before the first wake up. The kthread is then affine right away to the cpumask passed through the API if it has online housekeeping CPUs. Otherwise it will be affine to all online housekeeping CPUs as a last resort. As with node affinity, it is aware of CPU hotplug events such that: * When a housekeeping CPU goes up that is part of the preferred affinity of a given kthread, the related task is re-affined to that preferred affinity if it was previously running on the default last resort online housekeeping set. * When a housekeeping CPU goes down while it was part of the preferred affinity of a kthread, the running task is migrated (or the sleeping task is woken up) automatically by the scheduler to other housekeepers within the preferred affinity or, as a last resort, to all housekeepers from other nodes. Acked-by: Vlastimil Babka Signed-off-by: Frederic Weisbecker --- include/linux/kthread.h | 1 + kernel/kthread.c | 68 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 7 deletions(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index b11f53c1ba2e6f..30209bdf83a228 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -85,6 +85,7 @@ kthread_run_on_cpu(int (*threadfn)(void *data), void *data, void free_kthread_struct(struct task_struct *k); void kthread_bind(struct task_struct *k, unsigned int cpu); void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask); +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask); int kthread_stop(struct task_struct *k); int kthread_stop_put(struct task_struct *k); bool kthread_should_stop(void); diff --git a/kernel/kthread.c b/kernel/kthread.c index 26a839288a6028..32d620a99f11ad 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -70,6 +70,7 @@ struct kthread { char *full_name; struct task_struct *task; struct list_head hotplug_node; + struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { @@ -327,6 +328,11 @@ void __noreturn kthread_exit(long result) mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); + + if (kthread->preferred_affinity) { + kfree(kthread->preferred_affinity); + kthread->preferred_affinity = NULL; + } } do_exit(0); } @@ -355,9 +361,17 @@ EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { - cpumask_and(cpumask, cpumask_of_node(kthread->node), - housekeeping_cpumask(HK_TYPE_KTHREAD)); + const struct cpumask *pref; + if (kthread->preferred_affinity) { + pref = kthread->preferred_affinity; + } else { + if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) + return; + pref = cpumask_of_node(kthread->node); + } + + cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } @@ -440,7 +454,7 @@ static int kthread(void *_create) self->started = 1; - if (!(current->flags & PF_NO_SETAFFINITY)) + if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; @@ -839,12 +853,53 @@ int kthreadd(void *unused) return 0; } +int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) +{ + struct kthread *kthread = to_kthread(p); + cpumask_var_t affinity; + unsigned long flags; + int ret; + + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { + WARN_ON(1); + return -EINVAL; + } + + WARN_ON_ONCE(kthread->preferred_affinity); + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return -ENOMEM; + + kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); + if (!kthread->preferred_affinity) { + ret = -ENOMEM; + goto out; + } + + mutex_lock(&kthreads_hotplug_lock); + cpumask_copy(kthread->preferred_affinity, mask); + WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); + list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); + kthread_fetch_affinity(kthread, affinity); + + /* It's safe because the task is inactive. */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + do_set_cpus_allowed(p, affinity); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + mutex_unlock(&kthreads_hotplug_lock); +out: + free_cpumask_var(affinity); + + return 0; +} + /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to - * housekeepers in case the preferred affinity doesn't - * apply anymore. + * housekeepers from other nodes in case the preferred + * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { @@ -864,8 +919,7 @@ static int kthreads_online_cpu(unsigned int cpu) list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || - kthread_is_per_cpu(k->task) || - k->node == NUMA_NO_NODE)) { + kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } From 89ef6bb01704870abea6cfb2cb06238e30764529 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:05 +0200 Subject: [PATCH 15/18] rcu: Use kthread preferred affinity for RCU boost Now that kthreads have an infrastructure to handle preferred affinity against CPU hotplug and housekeeping cpumask, convert RCU boost to use it instead of handling all the constraints by itself. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 27 +++++++++++++++++++-------- kernel/rcu/tree_plugin.h | 11 ++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ff98233d4aa59f..4a4c498210588b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -149,7 +149,6 @@ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); -static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); @@ -5011,6 +5010,22 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + /* * Update kthreads affinity during CPU-hotplug changes. * @@ -5030,19 +5045,18 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp; - struct task_struct *task_boost, *task_exp; + struct task_struct *task_exp; rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; - task_boost = rcu_boost_task(rnp); task_exp = rcu_exp_par_gp_task(rnp); /* - * If CPU is the boot one, those tasks are created later from early + * If CPU is the boot one, this task is created later from early * initcall since kthreadd must be created first. */ - if (!task_boost && !task_exp) + if (!task_exp) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) @@ -5064,9 +5078,6 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) if (task_exp) set_cpus_allowed_ptr(task_exp, cm); - if (task_boost) - set_cpus_allowed_ptr(task_boost, cm); - mutex_unlock(&rnp->kthread_mutex); free_cpumask_var(cm); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3927ea5f7955c0..b141b8cc4f41b5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1217,16 +1217,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + rcu_thread_affine_rnp(t, rnp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return READ_ONCE(rnp->boost_kthread_task); -} - #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1243,10 +1240,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { } -static struct task_struct *rcu_boost_task(struct rcu_node *rnp) -{ - return NULL; -} #endif /* #else #ifdef CONFIG_RCU_BOOST */ /* From 2ff8f9a3b11ff0f9ae9d8713754e077a2231de09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:06 +0200 Subject: [PATCH 16/18] kthread: Unify kthread_create_on_cpu() and kthread_create_worker_on_cpu() automatic format kthread_create_on_cpu() uses the CPU argument as an implicit and unique printf argument to add to the format whereas kthread_create_worker_on_cpu() still relies on explicitly passing the printf arguments. This difference in behaviour is error prone and doesn't help standardizing per-CPU kthread names. Unify the behaviours and convert kthread_create_worker_on_cpu() to use the printf behaviour of kthread_create_on_cpu(). Signed-off-by: Frederic Weisbecker --- fs/erofs/zdata.c | 2 +- include/linux/kthread.h | 21 +++++++++++---- kernel/kthread.c | 59 ++++++++++++++++++++++++----------------- 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 01f14750548745..a23392327ce2b7 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -320,7 +320,7 @@ static void erofs_destroy_percpu_workers(void) static struct kthread_worker *erofs_init_percpu_worker(int cpu) { struct kthread_worker *worker = - kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u"); if (IS_ERR(worker)) return worker; diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 30209bdf83a228..0c66e7c1092a92 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -187,13 +187,24 @@ extern void __kthread_init_worker(struct kthread_worker *worker, int kthread_worker_fn(void *worker_ptr); -__printf(2, 3) -struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...); +__printf(3, 4) +struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, + int node, + const char namefmt[], ...); -__printf(3, 4) struct kthread_worker * +#define kthread_create_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ + namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ +}) + +struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...); + const char namefmt[]); bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); diff --git a/kernel/kthread.c b/kernel/kthread.c index 32d620a99f11ad..2d52126d5da0a8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1030,12 +1030,11 @@ int kthread_worker_fn(void *worker_ptr) EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * -__kthread_create_worker(int cpu, unsigned int flags, - const char namefmt[], va_list args) +__kthread_create_worker_on_node(unsigned int flags, int node, + const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; - int node = NUMA_NO_NODE; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) @@ -1043,20 +1042,14 @@ __kthread_create_worker(int cpu, unsigned int flags, kthread_init_worker(worker); - if (cpu >= 0) - node = cpu_to_node(cpu); - task = __kthread_create_on_node(kthread_worker_fn, worker, - node, namefmt, args); + node, namefmt, args); if (IS_ERR(task)) goto fail_task; - if (cpu >= 0) - kthread_bind(task, cpu); - worker->flags = flags; worker->task = task; - wake_up_process(task); + return worker; fail_task: @@ -1067,6 +1060,7 @@ __kthread_create_worker(int cpu, unsigned int flags, /** * kthread_create_worker - create a kthread worker * @flags: flags modifying the default behavior of the worker + * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) @@ -1074,25 +1068,49 @@ __kthread_create_worker(int cpu, unsigned int flags, * when the caller was killed by a fatal signal. */ struct kthread_worker * -kthread_create_worker(unsigned int flags, const char namefmt[], ...) +kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) +{ + struct kthread_worker *worker; + va_list args; + + va_start(args, namefmt); + worker = __kthread_create_worker_on_node(flags, node, namefmt, args); + va_end(args); + + if (worker) + wake_up_process(worker->task); + + return worker; +} +EXPORT_SYMBOL(kthread_create_worker_on_node); + +static __printf(3, 4) struct kthread_worker * +__kthread_create_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); - worker = __kthread_create_worker(-1, flags, namefmt, args); + worker = __kthread_create_worker_on_node(flags, cpu_to_node(cpu), + namefmt, args); va_end(args); + if (worker) { + kthread_bind(worker->task, cpu); + wake_up_process(worker->task); + } + return worker; } -EXPORT_SYMBOL(kthread_create_worker); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker - * @namefmt: printf-style name for the kthread worker (task). + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. @@ -1124,16 +1142,9 @@ EXPORT_SYMBOL(kthread_create_worker); */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...) + const char namefmt[]) { - struct kthread_worker *worker; - va_list args; - - va_start(args, namefmt); - worker = __kthread_create_worker(cpu, flags, namefmt, args); - va_end(args); - - return worker; + return __kthread_create_worker_on_cpu(cpu, flags, namefmt, cpu); } EXPORT_SYMBOL(kthread_create_worker_on_cpu); From 0936fadc0114b5e7c4e9e6a07c98945018bcfc34 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:07 +0200 Subject: [PATCH 17/18] treewide: Introduce kthread_run_worker[_on_cpu]() kthread_create() creates a kthread without running it yet. kthread_run() creates a kthread and runs it. On the other hand, kthread_create_worker() creates a kthread worker and runs it. This difference in behaviours is confusing. Also there is no way to create a kthread worker and affine it using kthread_bind_mask() or kthread_affine_preferred() before starting it. Consolidate the behaviours and introduce kthread_run_worker[_on_cpu]() that behaves just like kthread_run(). kthread_create_worker[_on_cpu]() will now only create a kthread worker without starting it. Signed-off-by: Frederic Weisbecker Signed-off-by: Dan Carpenter --- arch/x86/kvm/i8254.c | 2 +- crypto/crypto_engine.c | 2 +- drivers/cpufreq/cppc_cpufreq.c | 2 +- drivers/gpu/drm/drm_vblank_work.c | 2 +- .../drm/i915/gem/selftests/i915_gem_context.c | 2 +- drivers/gpu/drm/i915/gt/selftest_execlists.c | 2 +- drivers/gpu/drm/i915/gt/selftest_hangcheck.c | 2 +- drivers/gpu/drm/i915/gt/selftest_slpc.c | 2 +- drivers/gpu/drm/i915/selftests/i915_request.c | 8 ++-- drivers/gpu/drm/msm/disp/msm_disp_snapshot.c | 2 +- drivers/gpu/drm/msm/msm_atomic.c | 2 +- drivers/gpu/drm/msm/msm_gpu.c | 2 +- drivers/gpu/drm/msm/msm_kms.c | 2 +- .../platform/chips-media/wave5/wave5-vpu.c | 2 +- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- drivers/net/ethernet/intel/ice/ice_dpll.c | 2 +- drivers/net/ethernet/intel/ice/ice_gnss.c | 2 +- drivers/net/ethernet/intel/ice/ice_ptp.c | 2 +- drivers/platform/chrome/cros_ec_spi.c | 2 +- drivers/ptp/ptp_clock.c | 2 +- drivers/spi/spi.c | 2 +- drivers/usb/typec/tcpm/tcpm.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/watchdog/watchdog_dev.c | 2 +- fs/erofs/zdata.c | 2 +- include/linux/kthread.h | 48 ++++++++++++++++--- kernel/kthread.c | 31 +++--------- kernel/rcu/tree.c | 4 +- kernel/sched/ext.c | 2 +- kernel/workqueue.c | 2 +- net/dsa/tag_ksz.c | 2 +- net/dsa/tag_ocelot_8021q.c | 2 +- net/dsa/tag_sja1105.c | 2 +- 33 files changed, 83 insertions(+), 66 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index cd57a517d04a73..d7ab8780ab9e12 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -681,7 +681,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + pit->worker = kthread_run_worker(0, "kvm-pit/%d", pid_nr); if (IS_ERR(pit->worker)) goto fail_kthread; diff --git a/crypto/crypto_engine.c b/crypto/crypto_engine.c index e60a0eb628e8a0..c7c16da5e6496c 100644 --- a/crypto/crypto_engine.c +++ b/crypto/crypto_engine.c @@ -517,7 +517,7 @@ struct crypto_engine *crypto_engine_alloc_init_and_set(struct device *dev, crypto_init_queue(&engine->queue, qlen); spin_lock_init(&engine->queue_lock); - engine->kworker = kthread_create_worker(0, "%s", engine->name); + engine->kworker = kthread_run_worker(0, "%s", engine->name); if (IS_ERR(engine->kworker)) { dev_err(dev, "failed to create crypto request pump task\n"); return NULL; diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index bd8f75accfa01e..2486a6c5256ae9 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -225,7 +225,7 @@ static void __init cppc_freq_invariance_init(void) if (fie_disabled) return; - kworker_fie = kthread_create_worker(0, "cppc_fie"); + kworker_fie = kthread_run_worker(0, "cppc_fie"); if (IS_ERR(kworker_fie)) { pr_warn("%s: failed to create kworker_fie: %ld\n", __func__, PTR_ERR(kworker_fie)); diff --git a/drivers/gpu/drm/drm_vblank_work.c b/drivers/gpu/drm/drm_vblank_work.c index 1752ffb44e1db7..9cc71120246ff2 100644 --- a/drivers/gpu/drm/drm_vblank_work.c +++ b/drivers/gpu/drm/drm_vblank_work.c @@ -277,7 +277,7 @@ int drm_vblank_worker_init(struct drm_vblank_crtc *vblank) INIT_LIST_HEAD(&vblank->pending_work); init_waitqueue_head(&vblank->work_wait_queue); - worker = kthread_create_worker(0, "card%d-crtc%d", + worker = kthread_run_worker(0, "card%d-crtc%d", vblank->dev->primary->index, vblank->pipe); if (IS_ERR(worker)) diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c index 89d4dc8b60c6a2..eb0158e4341782 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_context.c @@ -369,7 +369,7 @@ static int live_parallel_switch(void *arg) if (!data[n].ce[0]) continue; - worker = kthread_create_worker(0, "igt/parallel:%s", + worker = kthread_run_worker(0, "igt/parallel:%s", data[n].ce[0]->engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c index 222ca7c4495196..81c31396ecebc9 100644 --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c @@ -3574,7 +3574,7 @@ static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags) arg[id].batch = NULL; arg[id].count = 0; - worker[id] = kthread_create_worker(0, "igt/smoke:%d", id); + worker[id] = kthread_run_worker(0, "igt/smoke:%d", id); if (IS_ERR(worker[id])) { err = PTR_ERR(worker[id]); break; diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c index 9ce8ff1c04fe5e..9d3aeb23729563 100644 --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -1025,7 +1025,7 @@ static int __igt_reset_engines(struct intel_gt *gt, threads[tmp].engine = other; threads[tmp].flags = flags; - worker = kthread_create_worker(0, "igt/%s", + worker = kthread_run_worker(0, "igt/%s", other->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/i915/gt/selftest_slpc.c b/drivers/gpu/drm/i915/gt/selftest_slpc.c index 4ecc4ae74a54c4..e218b229681ff9 100644 --- a/drivers/gpu/drm/i915/gt/selftest_slpc.c +++ b/drivers/gpu/drm/i915/gt/selftest_slpc.c @@ -489,7 +489,7 @@ static int live_slpc_tile_interaction(void *arg) return -ENOMEM; for_each_gt(gt, i915, i) { - threads[i].worker = kthread_create_worker(0, "igt/slpc_parallel:%d", gt->info.id); + threads[i].worker = kthread_run_worker(0, "igt/slpc_parallel:%d", gt->info.id); if (IS_ERR(threads[i].worker)) { ret = PTR_ERR(threads[i].worker); diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c index acae30a04a947c..88870844b5bd90 100644 --- a/drivers/gpu/drm/i915/selftests/i915_request.c +++ b/drivers/gpu/drm/i915/selftests/i915_request.c @@ -492,7 +492,7 @@ static int mock_breadcrumbs_smoketest(void *arg) for (n = 0; n < ncpus; n++) { struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/%d", n); + worker = kthread_run_worker(0, "igt/%d", n); if (IS_ERR(worker)) { ret = PTR_ERR(worker); ncpus = n; @@ -1645,7 +1645,7 @@ static int live_parallel_engines(void *arg) for_each_uabi_engine(engine, i915) { struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/parallel:%s", + worker = kthread_run_worker(0, "igt/parallel:%s", engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); @@ -1806,7 +1806,7 @@ static int live_breadcrumbs_smoketest(void *arg) unsigned int i = idx * ncpus + n; struct kthread_worker *worker; - worker = kthread_create_worker(0, "igt/%d.%d", idx, n); + worker = kthread_run_worker(0, "igt/%d.%d", idx, n); if (IS_ERR(worker)) { ret = PTR_ERR(worker); goto out_flush; @@ -3219,7 +3219,7 @@ static int perf_parallel_engines(void *arg) memset(&engines[idx].p, 0, sizeof(engines[idx].p)); - worker = kthread_create_worker(0, "igt:%s", + worker = kthread_run_worker(0, "igt:%s", engine->name); if (IS_ERR(worker)) { err = PTR_ERR(worker); diff --git a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c index e75b97127c0d18..2be00b11e5573a 100644 --- a/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c +++ b/drivers/gpu/drm/msm/disp/msm_disp_snapshot.c @@ -109,7 +109,7 @@ int msm_disp_snapshot_init(struct drm_device *drm_dev) mutex_init(&kms->dump_mutex); - kms->dump_worker = kthread_create_worker(0, "%s", "disp_snapshot"); + kms->dump_worker = kthread_run_worker(0, "%s", "disp_snapshot"); if (IS_ERR(kms->dump_worker)) DRM_ERROR("failed to create disp state task\n"); diff --git a/drivers/gpu/drm/msm/msm_atomic.c b/drivers/gpu/drm/msm/msm_atomic.c index 9c45d641b5212c..a7a2384044ffdb 100644 --- a/drivers/gpu/drm/msm/msm_atomic.c +++ b/drivers/gpu/drm/msm/msm_atomic.c @@ -115,7 +115,7 @@ int msm_atomic_init_pending_timer(struct msm_pending_timer *timer, timer->kms = kms; timer->crtc_idx = crtc_idx; - timer->worker = kthread_create_worker(0, "atomic-worker-%d", crtc_idx); + timer->worker = kthread_run_worker(0, "atomic-worker-%d", crtc_idx); if (IS_ERR(timer->worker)) { int ret = PTR_ERR(timer->worker); timer->worker = NULL; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 0d4a3744cfcbd2..8557998e0c92ff 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -859,7 +859,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, gpu->funcs = funcs; gpu->name = name; - gpu->worker = kthread_create_worker(0, "gpu-worker"); + gpu->worker = kthread_run_worker(0, "gpu-worker"); if (IS_ERR(gpu->worker)) { ret = PTR_ERR(gpu->worker); gpu->worker = NULL; diff --git a/drivers/gpu/drm/msm/msm_kms.c b/drivers/gpu/drm/msm/msm_kms.c index f3326d09bdbce1..dac831ba6219d1 100644 --- a/drivers/gpu/drm/msm/msm_kms.c +++ b/drivers/gpu/drm/msm/msm_kms.c @@ -269,7 +269,7 @@ int msm_drm_kms_init(struct device *dev, const struct drm_driver *drv) /* initialize event thread */ ev_thread = &priv->event_thread[drm_crtc_index(crtc)]; ev_thread->dev = ddev; - ev_thread->worker = kthread_create_worker(0, "crtc_event:%d", crtc->base.id); + ev_thread->worker = kthread_run_worker(0, "crtc_event:%d", crtc->base.id); if (IS_ERR(ev_thread->worker)) { ret = PTR_ERR(ev_thread->worker); DRM_DEV_ERROR(dev, "failed to create crtc_event kthread\n"); diff --git a/drivers/media/platform/chips-media/wave5/wave5-vpu.c b/drivers/media/platform/chips-media/wave5/wave5-vpu.c index 6b294a2d671781..d1320298a0f767 100644 --- a/drivers/media/platform/chips-media/wave5/wave5-vpu.c +++ b/drivers/media/platform/chips-media/wave5/wave5-vpu.c @@ -271,7 +271,7 @@ static int wave5_vpu_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to get irq resource, falling back to polling\n"); hrtimer_init(&dev->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); dev->hrtimer.function = &wave5_vpu_timer_callback; - dev->worker = kthread_create_worker(0, "vpu_irq_thread"); + dev->worker = kthread_run_worker(0, "vpu_irq_thread"); if (IS_ERR(dev->worker)) { dev_err(&pdev->dev, "failed to create vpu irq worker\n"); ret = PTR_ERR(dev->worker); diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 3a792f79270d99..377e66cf7a4824 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -394,7 +394,7 @@ static int mv88e6xxx_irq_poll_setup(struct mv88e6xxx_chip *chip) kthread_init_delayed_work(&chip->irq_poll_work, mv88e6xxx_irq_poll); - chip->kworker = kthread_create_worker(0, "%s", dev_name(chip->dev)); + chip->kworker = kthread_run_worker(0, "%s", dev_name(chip->dev)); if (IS_ERR(chip->kworker)) return PTR_ERR(chip->kworker); diff --git a/drivers/net/ethernet/intel/ice/ice_dpll.c b/drivers/net/ethernet/intel/ice/ice_dpll.c index d5ad6d84007c21..75570be61fef2b 100644 --- a/drivers/net/ethernet/intel/ice/ice_dpll.c +++ b/drivers/net/ethernet/intel/ice/ice_dpll.c @@ -2053,7 +2053,7 @@ static int ice_dpll_init_worker(struct ice_pf *pf) struct kthread_worker *kworker; kthread_init_delayed_work(&d->work, ice_dpll_periodic_work); - kworker = kthread_create_worker(0, "ice-dplls-%s", + kworker = kthread_run_worker(0, "ice-dplls-%s", dev_name(ice_pf_to_dev(pf))); if (IS_ERR(kworker)) return PTR_ERR(kworker); diff --git a/drivers/net/ethernet/intel/ice/ice_gnss.c b/drivers/net/ethernet/intel/ice/ice_gnss.c index f02e8ca5537507..b2148dbe49b284 100644 --- a/drivers/net/ethernet/intel/ice/ice_gnss.c +++ b/drivers/net/ethernet/intel/ice/ice_gnss.c @@ -182,7 +182,7 @@ static struct gnss_serial *ice_gnss_struct_init(struct ice_pf *pf) pf->gnss_serial = gnss; kthread_init_delayed_work(&gnss->read_work, ice_gnss_read); - kworker = kthread_create_worker(0, "ice-gnss-%s", dev_name(dev)); + kworker = kthread_run_worker(0, "ice-gnss-%s", dev_name(dev)); if (IS_ERR(kworker)) { kfree(gnss); return NULL; diff --git a/drivers/net/ethernet/intel/ice/ice_ptp.c b/drivers/net/ethernet/intel/ice/ice_ptp.c index a999fface27281..3154bb674dd3dd 100644 --- a/drivers/net/ethernet/intel/ice/ice_ptp.c +++ b/drivers/net/ethernet/intel/ice/ice_ptp.c @@ -3080,7 +3080,7 @@ static int ice_ptp_init_work(struct ice_pf *pf, struct ice_ptp *ptp) /* Allocate a kworker for handling work required for the ports * connected to the PTP hardware clock. */ - kworker = kthread_create_worker(0, "ice-ptp-%s", + kworker = kthread_run_worker(0, "ice-ptp-%s", dev_name(ice_pf_to_dev(pf))); if (IS_ERR(kworker)) return PTR_ERR(kworker); diff --git a/drivers/platform/chrome/cros_ec_spi.c b/drivers/platform/chrome/cros_ec_spi.c index 86a3d32a776322..08f566cc148005 100644 --- a/drivers/platform/chrome/cros_ec_spi.c +++ b/drivers/platform/chrome/cros_ec_spi.c @@ -715,7 +715,7 @@ static int cros_ec_spi_devm_high_pri_alloc(struct device *dev, int err; ec_spi->high_pri_worker = - kthread_create_worker(0, "cros_ec_spi_high_pri"); + kthread_run_worker(0, "cros_ec_spi_high_pri"); if (IS_ERR(ec_spi->high_pri_worker)) { err = PTR_ERR(ec_spi->high_pri_worker); diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index 77a36e7bddd54e..b932425ddc6a37 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -296,7 +296,7 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, if (ptp->info->do_aux_work) { kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); - ptp->kworker = kthread_create_worker(0, "ptp%d", ptp->index); + ptp->kworker = kthread_run_worker(0, "ptp%d", ptp->index); if (IS_ERR(ptp->kworker)) { err = PTR_ERR(ptp->kworker); pr_err("failed to create ptp aux_worker %d\n", err); diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index ff1add2ecb91f1..e4aa8f8389344b 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -2060,7 +2060,7 @@ static int spi_init_queue(struct spi_controller *ctlr) ctlr->busy = false; ctlr->queue_empty = true; - ctlr->kworker = kthread_create_worker(0, dev_name(&ctlr->dev)); + ctlr->kworker = kthread_run_worker(0, dev_name(&ctlr->dev)); if (IS_ERR(ctlr->kworker)) { dev_err(&ctlr->dev, "failed to create message pump kworker\n"); return PTR_ERR(ctlr->kworker); diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 6021eeb903fec5..95c0c63119ac23 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -7635,7 +7635,7 @@ struct tcpm_port *tcpm_register_port(struct device *dev, struct tcpc_dev *tcpc) mutex_init(&port->lock); mutex_init(&port->swap_lock); - port->wq = kthread_create_worker(0, dev_name(dev)); + port->wq = kthread_run_worker(0, dev_name(dev)); if (IS_ERR(port->wq)) return ERR_CAST(port->wq); sched_set_fifo(port->wq->task); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 8ffea8430f95fe..c204fc8e471a74 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -229,7 +229,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr, dev = &vdpasim->vdpa.dev; kthread_init_work(&vdpasim->work, vdpasim_work_fn); - vdpasim->worker = kthread_create_worker(0, "vDPA sim worker: %s", + vdpasim->worker = kthread_run_worker(0, "vDPA sim worker: %s", dev_attr->name); if (IS_ERR(vdpasim->worker)) goto err_iommu; diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 4190cb800cc45d..19698d87dc5728 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -1229,7 +1229,7 @@ int __init watchdog_dev_init(void) { int err; - watchdog_kworker = kthread_create_worker(0, "watchdogd"); + watchdog_kworker = kthread_run_worker(0, "watchdogd"); if (IS_ERR(watchdog_kworker)) { pr_err("Failed to create watchdog kworker\n"); return PTR_ERR(watchdog_kworker); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index a23392327ce2b7..35381c00ee0998 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -320,7 +320,7 @@ static void erofs_destroy_percpu_workers(void) static struct kthread_worker *erofs_init_percpu_worker(int cpu) { struct kthread_worker *worker = - kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u"); + kthread_run_worker_on_cpu(cpu, 0, "erofs_worker/%u"); if (IS_ERR(worker)) return worker; diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 0c66e7c1092a92..8d27403888ce93 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -193,19 +193,53 @@ struct kthread_worker *kthread_create_worker_on_node(unsigned int flags, const char namefmt[], ...); #define kthread_create_worker(flags, namefmt, ...) \ -({ \ - struct kthread_worker *__kw \ - = kthread_create_worker_on_node(flags, NUMA_NO_NODE, \ - namefmt, ## __VA_ARGS__); \ - if (!IS_ERR(__kw)) \ - wake_up_process(__kw->task); \ - __kw; \ + kthread_create_worker_on_node(flags, NUMA_NO_NODE, namefmt, ## __VA_ARGS__); + +/** + * kthread_run_worker - create and wake a kthread worker. + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. + * + * Description: Convenient wrapper for kthread_create_worker() followed by + * wake_up_process(). Returns the kthread_worker or ERR_PTR(-ENOMEM). + */ +#define kthread_run_worker(flags, namefmt, ...) \ +({ \ + struct kthread_worker *__kw \ + = kthread_create_worker(flags, namefmt, ## __VA_ARGS__); \ + if (!IS_ERR(__kw)) \ + wake_up_process(__kw->task); \ + __kw; \ }) struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]); +/** + * kthread_run_worker_on_cpu - create and wake a cpu bound kthread worker. + * @cpu: CPU number + * @flags: flags modifying the default behavior of the worker + * @namefmt: printf-style name for the thread. Format is restricted + * to "name.*%u". Code fills in cpu number. + * + * Description: Convenient wrapper for kthread_create_worker_on_cpu() + * followed by wake_up_process(). Returns the kthread_worker or + * ERR_PTR(-ENOMEM). + */ +static inline struct kthread_worker * +kthread_run_worker_on_cpu(int cpu, unsigned int flags, + const char namefmt[]) +{ + struct kthread_worker *kw; + + kw = kthread_create_worker_on_cpu(cpu, flags, namefmt); + if (!IS_ERR(kw)) + wake_up_process(kw->task); + + return kw; +} + bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2d52126d5da0a8..922f15762ec3d2 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1077,33 +1077,10 @@ kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[] worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); - if (worker) - wake_up_process(worker->task); - return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); -static __printf(3, 4) struct kthread_worker * -__kthread_create_worker_on_cpu(int cpu, unsigned int flags, - const char namefmt[], ...) -{ - struct kthread_worker *worker; - va_list args; - - va_start(args, namefmt); - worker = __kthread_create_worker_on_node(flags, cpu_to_node(cpu), - namefmt, args); - va_end(args); - - if (worker) { - kthread_bind(worker->task, cpu); - wake_up_process(worker->task); - } - - return worker; -} - /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. @@ -1144,7 +1121,13 @@ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { - return __kthread_create_worker_on_cpu(cpu, flags, namefmt, cpu); + struct kthread_worker *worker; + + worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); + if (!IS_ERR(worker)) + kthread_bind(worker->task, cpu); + + return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4a4c498210588b..d4b8e87a473b80 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4906,7 +4906,7 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (rnp->exp_kworker) return; - kworker = kthread_create_worker(0, name, rnp_index); + kworker = kthread_run_worker(0, name, rnp_index); if (IS_ERR_OR_NULL(kworker)) { pr_err("Failed to create par gp kworker on %d/%d\n", rnp->grplo, rnp->grphi); @@ -4933,7 +4933,7 @@ static void __init rcu_start_exp_gp_kworker(void) const char *name = "rcu_exp_gp_kthread_worker"; struct sched_param param = { .sched_priority = kthread_prio }; - rcu_exp_gp_kworker = kthread_create_worker(0, name); + rcu_exp_gp_kworker = kthread_run_worker(0, name); if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) { pr_err("Failed to create %s!\n", name); rcu_exp_gp_kworker = NULL; diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7fff1d0454770f..ab8962d2e9d315 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5352,7 +5352,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) { struct kthread_worker *helper; - helper = kthread_create_worker(0, name); + helper = kthread_run_worker(0, name); if (helper) sched_set_fifo(helper->task); return helper; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8b07576814a58d..fe01c1f8095cd9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -7828,7 +7828,7 @@ static void __init wq_cpu_intensive_thresh_init(void) unsigned long thresh; unsigned long bogo; - pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release"); BUG_ON(IS_ERR(pwq_release_worker)); /* if the user set it to a specific value, keep it */ diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 281bbac5539d10..c33d4bf179297c 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -66,7 +66,7 @@ static int ksz_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { ret = PTR_ERR(xmit_worker); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 8e8b1bef6af69d..6ce0bc16679207 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -110,7 +110,7 @@ static int ocelot_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - priv->xmit_worker = kthread_create_worker(0, "felix_xmit"); + priv->xmit_worker = kthread_run_worker(0, "felix_xmit"); if (IS_ERR(priv->xmit_worker)) { err = PTR_ERR(priv->xmit_worker); kfree(priv); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 3e902af7eea645..02adec6938110c 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -707,7 +707,7 @@ static int sja1105_connect(struct dsa_switch *ds) spin_lock_init(&priv->meta_lock); - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { err = PTR_ERR(xmit_worker); From fc7d29fc87711c4e5a850e7498045991688e9785 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Sep 2024 00:49:08 +0200 Subject: [PATCH 18/18] rcu: Use kthread preferred affinity for RCU exp kworkers Now that kthreads have an infrastructure to handle preferred affinity against CPU hotplug and housekeeping cpumask, convert RCU exp workers to use it instead of handling all the constraints by itself. Acked-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker --- kernel/rcu/tree.c | 105 +++++++++------------------------------------- 1 file changed, 19 insertions(+), 86 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d4b8e87a473b80..c160e05dfb7c73 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4894,6 +4894,22 @@ rcu_boot_init_percpu_data(int cpu) rcu_boot_init_nocb_percpu_data(rdp); } +static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) +{ + cpumask_var_t affinity; + int cpu; + + if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) + return; + + for_each_leaf_node_possible_cpu(rnp, cpu) + cpumask_set_cpu(cpu, affinity); + + kthread_affine_preferred(t, affinity); + + free_cpumask_var(affinity); +} + struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) @@ -4906,7 +4922,7 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (rnp->exp_kworker) return; - kworker = kthread_run_worker(0, name, rnp_index); + kworker = kthread_create_worker(0, name, rnp_index); if (IS_ERR_OR_NULL(kworker)) { pr_err("Failed to create par gp kworker on %d/%d\n", rnp->grplo, rnp->grphi); @@ -4916,16 +4932,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD)) sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m); -} -static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) -{ - struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker); - - if (!kworker) - return NULL; - - return kworker->task; + rcu_thread_affine_rnp(kworker->task, rnp); + wake_up_process(kworker->task); } static void __init rcu_start_exp_gp_kworker(void) @@ -5010,79 +5019,6 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } -static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp) -{ - cpumask_var_t affinity; - int cpu; - - if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) - return; - - for_each_leaf_node_possible_cpu(rnp, cpu) - cpumask_set_cpu(cpu, affinity); - - kthread_affine_preferred(t, affinity); - - free_cpumask_var(affinity); -} - -/* - * Update kthreads affinity during CPU-hotplug changes. - * - * Set the per-rcu_node kthread's affinity to cover all CPUs that are - * served by the rcu_node in question. The CPU hotplug lock is still - * held, so the value of rnp->qsmaskinit will be stable. - * - * We don't include outgoingcpu in the affinity set, use -1 if there is - * no outgoing CPU. If there are no CPUs left in the affinity set, - * this function allows the kthread to execute on any CPU. - * - * Any future concurrent calls are serialized via ->kthread_mutex. - */ -static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) -{ - cpumask_var_t cm; - unsigned long mask; - struct rcu_data *rdp; - struct rcu_node *rnp; - struct task_struct *task_exp; - - rdp = per_cpu_ptr(&rcu_data, cpu); - rnp = rdp->mynode; - - task_exp = rcu_exp_par_gp_task(rnp); - - /* - * If CPU is the boot one, this task is created later from early - * initcall since kthreadd must be created first. - */ - if (!task_exp) - return; - - if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) - return; - - mutex_lock(&rnp->kthread_mutex); - mask = rcu_rnp_online_cpus(rnp); - for_each_leaf_node_possible_cpu(rnp, cpu) - if ((mask & leaf_node_cpu_bit(rnp, cpu)) && - cpu != outgoingcpu) - cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (cpumask_empty(cm)) { - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); - if (outgoingcpu >= 0) - cpumask_clear_cpu(outgoingcpu, cm); - } - - if (task_exp) - set_cpus_allowed_ptr(task_exp, cm); - - mutex_unlock(&rnp->kthread_mutex); - - free_cpumask_var(cm); -} - /* * Has the specified (known valid) CPU ever been fully online? */ @@ -5111,7 +5047,6 @@ int rcutree_online_cpu(unsigned int cpu) if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) return 0; /* Too early in boot for scheduler work. */ sync_sched_exp_online_cleanup(cpu); - rcutree_affinity_setting(cpu, -1); // Stop-machine done, so allow nohz_full to disable tick. tick_dep_clear(TICK_DEP_BIT_RCU); @@ -5328,8 +5263,6 @@ int rcutree_offline_cpu(unsigned int cpu) rnp->ffmask &= ~rdp->grpmask; raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - rcutree_affinity_setting(cpu, cpu); - // nohz_full CPUs need the tick for stop-machine to work quickly tick_dep_set(TICK_DEP_BIT_RCU); return 0;