From 9890117114f183765113106ffb428568a462daef Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 06:44:47 +0300 Subject: [PATCH 01/11] Make child runs first in fork by default. It can be switched back to parent run first via sysctl `sysctl kernel.sched_child_runs_first=0` --- kernel/sched/bs.c | 6 +++++- kernel/sched/bs.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 38d385e4521d8..778a538b3c6e1 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -1003,9 +1003,13 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (curr) + if (curr) { update_curr(cfs_rq); + if (sysctl_sched_child_runs_first) + resched_curr(rq); + } + rq_unlock(rq, &rf); } diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 7e63970246b4c..28f2f45070104 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -3,7 +3,7 @@ * After fork, child runs first. If set to 0 (default) then * parent will (try to) run first. */ -unsigned int sysctl_sched_child_runs_first __read_mostly; +unsigned int sysctl_sched_child_runs_first __read_mostly = 1; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; From 4f2d5fc603d4e0e5e350ff059e105c1efd533c51 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 07:50:17 +0300 Subject: [PATCH 02/11] revert adding hz values to Kconfig.hz since now the Hz work is in a separate patch (high-hz.patch). --- kernel/Kconfig.hz | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index b2ef596d63501..b4a1995149d05 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -47,27 +47,12 @@ choice on desktops with great smoothness without increasing CPU power consumption and sacrificing the battery life on laptops. - config HZ_833 - bool "833 HZ" - help - 833 Hz is the TT alternative to 1000 Hz. Choose 833 Hz - if you want a balance between latency and performance. - config HZ_1000 bool "1000 HZ" help 1000 Hz is the preferred choice for desktop systems and other systems requiring fast interactive responses to events. - config HZ_1666 - bool "1666 HZ" - help - 1666 Hz is for very high latency bound systems. Choose 1666 Hz - if you don't care about overall throughput or performance, but - you care more about latency (some realtime applications) require - low latency. The response and interactive processes with 1666 Hz - feel much snappier. - endchoice config HZ @@ -76,9 +61,7 @@ config HZ default 250 if HZ_250 default 300 if HZ_300 default 500 if HZ_500 - default 833 if HZ_833 default 1000 if HZ_1000 - default 1666 if HZ_1666 config SCHED_HRTICK def_bool HIGH_RES_TIMERS From bdf7933b51c4e2d744f1e40e5b11d7912b992482 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 20:27:41 +0300 Subject: [PATCH 03/11] Added tasks load accountings and statistics. 
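The bulk of this lands in the new kernel/sched/tt_stats.h, including the
util_est machinery whose EWMA comment fixes the sample weight at w = 1/4
(i.e. UTIL_EST_WEIGHT_SHIFT == 2). A reduced userspace sketch of that update
with a worked number, in case the shift form is not obvious (illustrative
only; ewma_update() is not a kernel function):

    #include <stdio.h>

    #define UTIL_EST_WEIGHT_SHIFT 2  /* w = 1/4, per the tt_stats.h comment */

    static unsigned long ewma_update(unsigned long ewma, unsigned long enqueued)
    {
            long last_ewma_diff = (long)enqueued - (long)ewma;

            ewma <<= UTIL_EST_WEIGHT_SHIFT;  /* 4 * ewma(t-1)          */
            ewma += last_ewma_diff;          /* + (sample - ewma(t-1)) */
            ewma >>= UTIL_EST_WEIGHT_SHIFT;  /* back down to ewma(t)   */
            return ewma;
    }

    int main(void)
    {
            /* ewma 400, new enqueued sample 480: (1600 + 80) / 4 = 420 */
            printf("%lu\n", ewma_update(400, 480));
            return 0;
    }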
--- init/Kconfig | 11 +- kernel/sched/bs.c | 53 +++ kernel/sched/bs.h | 112 ++++- kernel/sched/bs_nohz.h | 63 ++- kernel/sched/tt_stats.h | 913 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1135 insertions(+), 17 deletions(-) create mode 100644 kernel/sched/tt_stats.h diff --git a/init/Kconfig b/init/Kconfig index 5a0f21056d848..a9b7e2d7fb1c8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -109,6 +109,14 @@ config TT_SCHED bool "TT Scheduler" default y +config TT_ACCOUNTING_STATS + bool "TT include all accounting and statistics" + depends on TT_SCHED + default y + help + This will include all CFS tasks' load accounting and statistics. + If you are using 'performance' governor and do not depend/care + about tasks statistics, then choose N. Otherwise say Y. menu "General setup" @@ -821,8 +829,7 @@ menu "Scheduler features" config UCLAMP_TASK bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL - depends on !TT_SCHED + depends on CPU_FREQ_GOV_SCHEDUTIL && TT_ACCOUNTING_STATS help This feature enables the scheduler to track the clamped utilization of each CPU based on RUNNABLE tasks scheduled on that CPU. diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 778a538b3c6e1..76d9ca2ad289a 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -7,6 +7,7 @@ #include "sched.h" #include "pelt.h" #include "fair_numa.h" +#include "tt_stats.h" #include "bs.h" unsigned int __read_mostly tt_max_lifetime = 22000; // in ms @@ -288,7 +289,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ + update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); account_entity_enqueue(cfs_rq, se); + check_schedstat_required(); + update_stats_enqueue(cfs_rq, se, flags); if (!curr) __enqueue_entity(cfs_rq, se); @@ -313,6 +325,17 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ + update_load_avg(cfs_rq, se, UPDATE_TG); + update_stats_dequeue(cfs_rq, se, flags); + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); @@ -326,6 +349,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); + + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. + */ + util_est_enqueue(&rq->cfs, p); + + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. 
+ */ + if (p->in_iowait) + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); if (!se->on_rq) { enqueue_entity(cfs_rq, se, flags); @@ -334,6 +374,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } add_nr_running(rq, 1); + + if (!task_new) + update_overutilized_status(rq); } static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) @@ -341,6 +384,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); int idle_h_nr_running = task_has_idle_policy(p); + int task_sleep = flags & DEQUEUE_SLEEP; + + util_est_dequeue(&rq->cfs, p); dequeue_entity(cfs_rq, se, flags); @@ -348,6 +394,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->idle_h_nr_running -= idle_h_nr_running; sub_nr_running(rq, 1); + util_est_update(&rq->cfs, p, task_sleep); } static void yield_task_fair(struct rq *rq) @@ -849,6 +896,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); raw_spin_unlock(&this_rq->__lock); + update_blocked_averages(this_cpu); + for_each_online_cpu(cpu) { /* * Stop searching for tasks to pull if there are @@ -1052,6 +1101,10 @@ DEFINE_SCHED_CLASS(fair) = { .get_rr_interval = get_rr_interval_fair, .update_curr = update_curr_fair, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; __init void init_sched_fair_class(void) diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 28f2f45070104..80f813826406d 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -19,13 +19,38 @@ int __weak arch_asym_cpu_priority(int cpu) } /* Give new sched_entity start runnable values to heavy its load in infant time */ -void init_entity_runnable_average(struct sched_entity *se) {} -void post_init_entity_util_avg(struct task_struct *p) {} void update_max_interval(void) {} static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { +#ifdef CONFIG_TT_ACCOUNTING_STATS + if (p->on_rq == TASK_ON_RQ_MIGRATING) { + /* + * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' + * rq->lock and can modify state directly. + */ + lockdep_assert_rq_held(task_rq(p)); + detach_entity_cfs_rq(&p->se); + + } else { + /* + * We are supposed to update the task to "current" time, then + * its up to date and ready to go to new CPU/cfs_rq. But we + * have difficulty in getting what current time is, so simply + * throw away the out-of-date time. This will result in the + * wakee task is less decayed, but giving the wakee more load + * sounds not bad. 
+ */ + remove_entity_load_avg(&p->se); + } + + /* We have migrated, no longer consider this task hot */ + p->se.exec_start = 0; +#endif + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; + update_scan_period(p, new_cpu); } @@ -51,7 +76,67 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #endif } -void reweight_task(struct task_struct *p, int prio) {} +#ifdef CONFIG_TT_ACCOUNTING_STATS +static void update_curr(struct cfs_rq *cfs_rq); + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + update_load_sub(&cfs_rq->load, se->load.weight); + } + dequeue_load_avg(cfs_rq, se); + + update_load_set(&se->load, weight); + +#ifdef CONFIG_SMP + do { + u32 divider = get_pelt_divider(&se->avg); + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + } while (0); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) + update_load_add(&cfs_rq->load, se->load.weight); + +} +#endif + +void reweight_task(struct task_struct *p, int prio) +{ +#ifdef CONFIG_TT_ACCOUNTING_STATS + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + reweight_entity(cfs_rq, se, weight); + load->inv_weight = sched_prio_to_wmult[prio]; +#endif +} static inline struct sched_entity *se_of(struct tt_node *ttn) { @@ -109,12 +194,10 @@ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SMP - if (entity_is_task(se)) { - struct rq *rq = rq_of(cfs_rq); + struct rq *rq = rq_of(cfs_rq); - account_numa_enqueue(rq, task_of(se)); - list_add(&se->group_node, &rq->cfs_tasks); - } + account_numa_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -123,10 +206,8 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SMP - if (entity_is_task(se)) { - account_numa_dequeue(rq_of(cfs_rq), task_of(se)); - list_del_init(&se->group_node); - } + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); #endif cfs_rq->nr_running--; } @@ -153,10 +234,15 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) {} +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} static void switched_to_fair(struct rq *rq, struct task_struct *p) { + attach_task_cfs_rq(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so diff --git a/kernel/sched/bs_nohz.h b/kernel/sched/bs_nohz.h index c1fdf26bbe61a..3802951591f36 100644 --- a/kernel/sched/bs_nohz.h +++ b/kernel/sched/bs_nohz.h @@ -11,8 +11,6 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ - - #ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { @@ -61,9 +59,66 @@ static inline void update_blocked_load_tick(struct 
rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +#ifdef CONFIG_TT_ACCOUNTING_STATS +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + unsigned long thermal_pressure; + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; +} + +static bool __update_blocked_fair(struct rq *rq, bool *done) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + bool decayed; + + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; + + return decayed; +} + +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + rq_lock_irqsave(rq, &rf); + update_blocked_load_tick(rq); + update_rq_clock(rq); + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} +#else +static void update_blocked_averages(int cpu) {} +#endif #ifdef CONFIG_NO_HZ_COMMON /* @@ -388,6 +443,8 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; + update_blocked_averages(cpu); + return rq->has_blocked_load; } @@ -629,4 +686,6 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ if (nohz_idle_balance(this_rq, idle)) return; + + update_blocked_averages(this_rq->cpu); } diff --git a/kernel/sched/tt_stats.h b/kernel/sched/tt_stats.h new file mode 100644 index 0000000000000..0f8f7e61da66a --- /dev/null +++ b/kernel/sched/tt_stats.h @@ -0,0 +1,913 @@ +#ifdef CONFIG_TT_ACCOUNTING_STATS +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. 
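+ *
+ * Example: with *_ptr == 120 and _val == 150, "res = var - val" wraps
+ * around above var, so res is clamped to 0 before being published via
+ * WRITE_ONCE(); lockless readers never observe the wrapped-around value.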
+ */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 wait_start, prev_wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = rq_clock(rq_of(cfs_rq)); + prev_wait_start = schedstat_val(se->statistics.wait_start); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; + + __schedstat_set(se->statistics.wait_start, wait_start); +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta; + + if (!schedstat_enabled()) + return; + + /* + * When the sched_schedstat changes from 0 to 1, some sched se + * maybe already in the runqueue, the se->statistics.wait_start + * will be 0.So it will let the delta wrong. We need to avoid this + * scenario. + */ + if (unlikely(!schedstat_val(se->statistics.wait_start))) + return; + + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + __schedstat_set(se->statistics.wait_start, delta); + return; + } + trace_sched_stat_wait(p, delta); + } + + __schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + __schedstat_inc(se->statistics.wait_count); + __schedstat_add(se->statistics.wait_sum, delta); + __schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *tsk = NULL; + u64 sleep_start, block_start; + + if (!schedstat_enabled()) + return; + + sleep_start = schedstat_val(se->statistics.sleep_start); + block_start = schedstat_val(se->statistics.block_start); + + if (entity_is_task(se)) + tsk = task_of(se); + + if (sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) + __schedstat_set(se->statistics.sleep_max, delta); + + __schedstat_set(se->statistics.sleep_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.block_max))) + __schedstat_set(se->statistics.block_max, delta); + + __schedstat_set(se->statistics.block_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + if (tsk->in_iowait) { + __schedstat_add(se->statistics.iowait_sum, delta); + __schedstat_inc(se->statistics.iowait_count); + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + 
account_scheduler_latency(tsk, delta >> 10, 0); + } + } +} + +/* + * Task is being enqueued - update stats: + */ +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + if (!schedstat_enabled()) + return; + + /* + * Are we enqueueing a waiting task? (for current tasks + * a dequeue/enqueue event is a NOP) + */ + if (se != cfs_rq->curr) + update_stats_wait_start(cfs_rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(cfs_rq, se); +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + + if (!schedstat_enabled()) + return; + + /* + * Mark the end of the wait period if dequeueing a + * waiting task: + */ + if (se != cfs_rq->curr) + update_stats_wait_end(cfs_rq, se); + + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + unsigned int state; + + /* XXX racy against TTWU */ + state = READ_ONCE(tsk->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(se->statistics.sleep_start, + rq_clock(rq_of(cfs_rq))); + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(se->statistics.block_start, + rq_clock(rq_of(cfs_rq))); + } +} +#else +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +#endif /* CONFIG_TT_ACCOUNTING_STATS */ + +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) +{ + struct rq *rq = rq_of(cfs_rq); + + if (&rq->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq, flags); + } +} + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +/* + * The margin used when comparing utilization with CPU capacity. 
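+ * With the 1280/1024 ratio below, a utilization "fits" a capacity only
+ * while it stays under ~80% of it; e.g. for max == 1024 that means
+ * cap <= 819.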
+ * + * (default: ~20%) + */ +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + +static unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u32 divider = get_pelt_divider(&se->avg); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; +} +#else +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} + +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_pelt() + * @cfs_rq: cfs_rq to update + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. + */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +{ + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; + struct sched_avg *sa = &cfs_rq->avg; + int decayed = 0; + + if (cfs_rq->removed.nr) { + unsigned long r; + u32 divider = get_pelt_divider(&cfs_rq->avg); + + raw_spin_lock(&cfs_rq->removed.lock); + swap(cfs_rq->removed.util_avg, removed_util); + swap(cfs_rq->removed.load_avg, removed_load); + swap(cfs_rq->removed.runnable_avg, removed_runnable); + cfs_rq->removed.nr = 0; + raw_spin_unlock(&cfs_rq->removed.lock); + + r = removed_load; + sub_positive(&sa->load_avg, r); + sa->load_sum = sa->load_avg * divider; + + r = removed_util; + sub_positive(&sa->util_avg, r); + sa->util_sum = sa->util_avg * divider; + + r = removed_runnable; + sub_positive(&sa->runnable_avg, r); + sa->runnable_sum = sa->runnable_avg * divider; + + /* + * removed_runnable is the unweighted version of removed_load so we + * can use it to estimate removed_load_sum. + */ + add_tg_cfs_propagate(cfs_rq, + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); + + decayed = 1; + } + + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); + +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif + + return decayed; +} + +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. 
+ */ + u32 divider = get_pelt_divider(&cfs_rq->avg); + + /* + * When we attach the @se to the @cfs_rq, we must align the decay + * window because without that, really weird and wonderful things can + * happen. + * + * XXX illustrate + */ + se->avg.last_update_time = cfs_rq->avg.last_update_time; + se->avg.period_contrib = cfs_rq->avg.period_contrib; + + /* + * Hell(o) Nasty stuff.. we need to recompute _sum based on the new + * period_contrib. This isn't strictly correct, but since we're + * entirely outside of the PELT hierarchy, nobody cares if we truncate + * _sum a little. + */ + se->avg.util_sum = se->avg.util_avg * divider; + + se->avg.runnable_sum = se->avg.runnable_avg * divider; + + se->avg.load_sum = divider; + if (se_weight(se)) { + se->avg.load_sum = + div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); + } + + enqueue_load_avg(cfs_rq, se); + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; + + add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); + + cfs_rq_util_change(cfs_rq, 0); + + trace_pelt_cfs_tp(cfs_rq); +} + +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = get_pelt_divider(&cfs_rq->avg); + + dequeue_load_avg(cfs_rq, se); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + + add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); + + cfs_rq_util_change(cfs_rq, 0); + + trace_pelt_cfs_tp(cfs_rq); +} + +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 +#define DO_ATTACH 0x4 + +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + u64 now = cfs_rq_clock_pelt(cfs_rq); + int decayed; + + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cfs_rq, se); + + decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed |= propagate_entity_load_avg(se); + + if (!se->avg.last_update_time && (flags & DO_ATTACH)) { + + /* + * DO_ATTACH means we're here from enqueue_entity(). + * !last_update_time means we've passed through + * migrate_task_rq_fair() indicating we migrated. + * + * IOW we're enqueueing a task on a new CPU. 
+ */ + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); + + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq); + } +} + +#ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + u64 last_update_time_copy; + u64 last_update_time; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); + + return last_update_time; +} +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} +#endif + +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +static void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg_blocked_se(last_update_time, se); +} + +/* + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). + */ +static void remove_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + unsigned long flags; + + /* + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + */ + + sync_entity_load_avg(se); + + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); + ++cfs_rq->removed.nr; + cfs_rq->removed.util_avg += se->avg.util_avg; + cfs_rq->removed.load_avg += se->avg.load_avg; + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); +} + +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.runnable_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; +} + +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg); +} + +static inline unsigned long _task_util_est(struct task_struct *p) +{ + struct util_est ue = READ_ONCE(p->se.avg.util_est); + + return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ + return max(task_util(p), _task_util_est(p)); +} + +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued += _task_util_est(p); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + 
trace_sched_util_est_cfs_tp(cfs_rq); +} + +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + +/* + * Check if a (signed) value is within a specified (unsigned) margin, + * based on the observation that: + * + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) + * + * NOTE: this only works when value + margin < INT_MAX. + */ +static inline bool within_margin(int value, int margin) +{ + return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); +} + +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) +{ + long last_ewma_diff, last_enqueued_diff; + struct util_est ue; + + if (!sched_feat(UTIL_EST)) + return; + + /* + * Skip update of task's estimated utilization when the task has not + * yet completed an activation, e.g. being migrated. + */ + if (!task_sleep) + return; + + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + ue = p->se.avg.util_est; + if (ue.enqueued & UTIL_AVG_UNCHANGED) + return; + + last_enqueued_diff = ue.enqueued; + + /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + ue.enqueued = task_util(p); + if (sched_feat(UTIL_EST_FASTUP)) { + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; + } + } + + /* + * Skip update of task's estimated utilization when its members are + * already ~1% close to its last activation value. + */ + last_ewma_diff = ue.enqueued - ue.ewma; + last_enqueued_diff -= ue.enqueued; + if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { + if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) + goto done; + + return; + } + + /* + * To avoid overestimation of actual task utilization, skip updates if + * we cannot grant there is idle time in this CPU. + */ + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) + return; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. This is done by storing the current PELT value + * as ue.enqueued and by using this value to update the Exponential + * Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( last_ewma_diff ) + ewma(t-1) + * = w * (last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; + ue.ewma += last_ewma_diff; + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: + ue.enqueued |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); +} + +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return fits_capacity(uclamp_task_util(p), capacity); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.avg.load_avg; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p || p->nr_cpus_allowed == 1) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. 
+ */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); +} + +#else /* CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS */ + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + return true; +} + +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 +#define DO_ATTACH 0x0 + +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) +{ + cfs_rq_util_change(cfs_rq, 0); +} + +static inline void remove_entity_load_avg(struct sched_entity *se) {} + +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} + +static inline void +util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + +#endif /* CONFIG_SMP */ + +static inline void check_schedstat_required(void) +{ +#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_TT_ACCOUNTING_STATS) + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enable or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +static inline unsigned long cpu_util(int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned int util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static inline bool cpu_overutilized(int cpu) +{ + return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); +} + +static inline void update_overutilized_status(struct rq *rq) +{ + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { + WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); + } +} + +#else +static inline void update_overutilized_status(struct rq *rq) { } +#endif + +#ifdef CONFIG_TT_ACCOUNTING_STATS +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Catch up with the cfs_rq and remove our load when we leave */ + update_load_avg(cfs_rq, se, 0); + detach_entity_load_avg(cfs_rq, se); +} +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) +{ + struct sched_avg *sa = &se->avg; + + memset(sa, 0, sizeof(*sa)); + + /* + * Tasks are initialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are initialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. 
+ */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); + + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ +} + +static void attach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Synchronize entity with its cfs_rq */ + update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + attach_entity_cfs_rq(se); +} + +void post_init_entity_util_avg(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); + long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; + + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); + return; + } + + attach_entity_cfs_rq(se); +} +#else /* !CONFIG_SMP */ +static void detach_task_cfs_rq(struct task_struct *p) {} +static void attach_task_cfs_rq(struct task_struct *p) {} +void init_entity_runnable_average(struct sched_entity *se) {} +void post_init_entity_util_avg(struct task_struct *p) {} +#endif + From 84b763697edebae91b2235e52841ed4dc02d0cd3 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 21:54:14 +0300 Subject: [PATCH 04/11] fix when enabling CONFIG_NUMA_BALANCING --- kernel/sched/bs.c | 2 +- kernel/sched/fair_numa.h | 34 ------------------- kernel/sched/tt_stats.h | 70 +++++++++++++++++++++------------------- 3 files changed, 37 insertions(+), 69 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 76d9ca2ad289a..2813bf71cb246 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -6,8 +6,8 @@ */ #include "sched.h" #include "pelt.h" -#include "fair_numa.h" #include "tt_stats.h" +#include "fair_numa.h" #include "bs.h" unsigned int __read_mostly tt_max_lifetime = 22000; // in ms diff --git a/kernel/sched/fair_numa.h b/kernel/sched/fair_numa.h index fd49208da7c69..a0860a2832862 100644 --- a/kernel/sched/fair_numa.h +++ b/kernel/sched/fair_numa.h @@ -486,40 +486,16 @@ struct task_numa_env { int best_cpu; }; -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.load_avg; -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); } -static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.runnable_avg; -} - static unsigned long cpu_runnable(struct rq *rq) { return cfs_rq_runnable_avg(&rq->cfs); } -static inline unsigned long cpu_util(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - - return 
min_t(unsigned long, util, capacity_orig_of(cpu)); -} - /* * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. * This is an approximation as the number of running tasks may not be @@ -590,11 +566,6 @@ static inline int numa_idle_core(int idle_core, int cpu) } #endif -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - /* * Gather all necessary information to make NUMA balancing placement * decisions that are compatible with standard load balancer. This @@ -716,11 +687,6 @@ static bool load_too_imbalanced(long src_load, long dst_load, return (imb > old_imb); } -static unsigned long task_h_load(struct task_struct *p) -{ - return p->se.avg.load_avg; -} - /* * Maximum NUMA importance can be 1998 (2*999); * SMALLIMP @ 30 would be close to 1998/64. diff --git a/kernel/sched/tt_stats.h b/kernel/sched/tt_stats.h index 0f8f7e61da66a..2e897866c7f8b 100644 --- a/kernel/sched/tt_stats.h +++ b/kernel/sched/tt_stats.h @@ -221,6 +221,42 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } +#if defined(CONFIG_NUMA_BALANCING) || (defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS)) +static unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; +} + +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.runnable_avg; +} + +static inline unsigned long cpu_util(int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned int util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.avg.load_avg; +} +#endif + #if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) /* * The margin used when comparing utilization with CPU capacity. 
@@ -229,11 +265,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) */ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -517,16 +548,6 @@ static void remove_entity_load_avg(struct sched_entity *se) raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } -static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.runnable_avg; -} - -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.load_avg; -} - static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -696,11 +717,6 @@ static inline int task_fits_capacity(struct task_struct *p, long capacity) return fits_capacity(uclamp_task_util(p), capacity); } -static unsigned long task_h_load(struct task_struct *p) -{ - return p->se.avg.load_avg; -} - static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { if (!static_branch_unlikely(&sched_asym_cpucapacity)) @@ -780,20 +796,6 @@ static inline void check_schedstat_required(void) } #if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) -static inline unsigned long cpu_util(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - static inline bool cpu_overutilized(int cpu) { return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); From 0e451ef78e155fa87273357a725ace2841d47213 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sun, 14 Nov 2021 01:06:33 +0300 Subject: [PATCH 05/11] Add missing load update in couple functions: trigger_loadbalancer and task_dead. --- kernel/sched/bs.c | 1 + kernel/sched/bs.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 2813bf71cb246..f61c883b51734 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -1012,6 +1012,7 @@ void trigger_load_balance(struct rq *this_rq) if (unlikely(on_null_domain(this_rq) || !cpu_active(cpu_of(this_rq)))) return; + update_blocked_averages(this_rq->cpu); nohz_balancer_kick(this_rq); } diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 80f813826406d..d413aad1d0a37 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -58,12 +58,16 @@ static void rq_online_fair(struct rq *rq) {} static void rq_offline_fair(struct rq *rq) {} static void task_dead_fair(struct task_struct *p) { +#ifdef CONFIG_TT_ACCOUNTING_STATS + remove_entity_load_avg(&p->se); +#else struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); unsigned long flags; raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); ++cfs_rq->removed.nr; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); +#endif } #endif /** CONFIG_SMP */ From 0bf382ee5cfe826f3974e587f0b450cb84808af0 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sun, 14 Nov 2021 02:16:42 +0300 Subject: [PATCH 06/11] Forgot to update load in the most important functions :D update_curr, set_next_task, put_prev_task, and others. 
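The update_curr() part of this boils down to the usual "charge the window
since se->exec_start, then restart the window" bookkeeping. A reduced
userspace sketch (illustrative only; the struct and names below are
stand-ins, not kernel code), with exec_max playing the role of the
schedstat this patch starts maintaining:

    #include <stdint.h>
    #include <stdio.h>

    struct entity_sketch {
            uint64_t exec_start;        /* timestamp of the last accounting pass */
            uint64_t sum_exec_runtime;  /* total CPU time charged so far         */
            uint64_t exec_max;          /* schedstat: largest single delta seen  */
    };

    static void update_curr_sketch(struct entity_sketch *se, uint64_t now)
    {
            uint64_t delta_exec = now - se->exec_start;

            if ((int64_t)delta_exec <= 0)
                    return;

            se->exec_start = now;                 /* restart the window     */
            se->sum_exec_runtime += delta_exec;   /* lifetime runtime       */
            if (delta_exec > se->exec_max)        /* cf. schedstat exec_max */
                    se->exec_max = delta_exec;
    }

    int main(void)
    {
            struct entity_sketch se = { .exec_start = 1000 };

            update_curr_sketch(&se, 4000);   /* charges 3000 */
            update_curr_sketch(&se, 4500);   /* charges  500 */
            printf("%llu %llu\n",
                   (unsigned long long)se.sum_exec_runtime,
                   (unsigned long long)se.exec_max);   /* prints "3500 3000" */
            return 0;
    }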
--- kernel/sched/bs.c | 59 +++++++++++++++++++++++++++++++++++++---- kernel/sched/tt_stats.h | 4 +++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index f61c883b51734..0b289f655e65e 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -192,6 +192,9 @@ static void update_curr(struct cfs_rq *cfs_rq) struct tt_node *ttn = &curr->tt_node; u64 now = sched_clock(); u64 delta_exec; +#ifdef CONFIG_TT_ACCOUNTING_STATS + struct task_struct *curtask = task_of(curr); +#endif if (unlikely(!curr)) return; @@ -201,13 +204,26 @@ static void update_curr(struct cfs_rq *cfs_rq) return; curr->exec_start = now; + +#ifdef CONFIG_TT_ACCOUNTING_STATS + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); +#endif curr->sum_exec_runtime += delta_exec; +#ifdef CONFIG_TT_ACCOUNTING_STATS + schedstat_add(cfs_rq->exec_clock, delta_exec); +#endif ttn->curr_burst += delta_exec; ttn->vruntime += convert_to_vruntime(delta_exec, curr); detect_type(ttn, now, 0); - normalize_lifetime(now, &curr->tt_node); + +#ifdef CONFIG_TT_ACCOUNTING_STATS + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cgroup_account_cputime(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); +#endif } static void update_curr_fair(struct rq *rq) @@ -434,11 +450,33 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (se->on_rq) + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); + } se->exec_start = sched_clock(); cfs_rq->curr = se; + +#ifdef CONFIG_TT_ACCOUNTING_STATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { + schedstat_set(se->statistics.slice_max, + max((u64)schedstat_val(se->statistics.slice_max), + se->sum_exec_runtime - se->prev_sum_exec_runtime)); + } +#endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -498,6 +536,8 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif + update_misfit_status(p, rq); + return p; idle: @@ -561,11 +601,12 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: */ - if (prev->on_rq) + if (prev->on_rq) { update_curr(cfs_rq); - - if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); __enqueue_entity(cfs_rq, prev); + update_load_avg(cfs_rq, prev, 0); + } cfs_rq->curr = NULL; } @@ -607,6 +648,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { update_curr(cfs_rq); + /* + * Ensure that runnable average is periodically updated. 
+ */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); } @@ -1028,6 +1074,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); + update_overutilized_status(task_rq(curr)); } static void task_fork_fair(struct task_struct *p) diff --git a/kernel/sched/tt_stats.h b/kernel/sched/tt_stats.h index 2e897866c7f8b..7aa1e8936be4f 100644 --- a/kernel/sched/tt_stats.h +++ b/kernel/sched/tt_stats.h @@ -193,6 +193,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } #else static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} From da59d0d57aaf62c4ae6193d679de5336f8df3547 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sun, 14 Nov 2021 03:52:19 +0300 Subject: [PATCH 07/11] update_blocked_averages in trigger_loadbalancer every 19ms instead of every tick. --- kernel/sched/bs.c | 11 ++++++++++- kernel/sched/bs_nohz.h | 2 -- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 0b289f655e65e..758cd410d6e6d 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -1058,7 +1058,16 @@ void trigger_load_balance(struct rq *this_rq) if (unlikely(on_null_domain(this_rq) || !cpu_active(cpu_of(this_rq)))) return; - update_blocked_averages(this_rq->cpu); +#ifdef CONFIG_TT_ACCOUNTING_STATS + if (time_after_eq(jiffies, this_rq->next_balance)) { + /* scale ms to jiffies */ + unsigned long interval = msecs_to_jiffies(19); + + this_rq->next_balance = jiffies + interval; + update_blocked_averages(this_rq->cpu); + } +#endif + nohz_balancer_kick(this_rq); } diff --git a/kernel/sched/bs_nohz.h b/kernel/sched/bs_nohz.h index 3802951591f36..89466954743c3 100644 --- a/kernel/sched/bs_nohz.h +++ b/kernel/sched/bs_nohz.h @@ -686,6 +686,4 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ if (nohz_idle_balance(this_rq, idle)) return; - - update_blocked_averages(this_rq->cpu); } From 11c50e29b9d19a22c5296697bcc1531a47327c4a Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Tue, 16 Nov 2021 22:05:05 +0300 Subject: [PATCH 08/11] Initial work to https://github.com/hamadmarri/TT-CPU-Scheduler/issues/6 `YIELD_MARK` only applies if `cfs_rq->h_nr_running > 1` `YIELD_UNMARK` on migrate_task --- kernel/sched/bs.c | 8 +++----- kernel/sched/bs.h | 7 ++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 758cd410d6e6d..bbeb0c8443251 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -21,9 +21,6 @@ unsigned int __read_mostly tt_max_lifetime = 22000; // in ms #define RACE_TIME 40000000 #define FACTOR (RACE_TIME / HZ_PERIOD) -#define YIELD_MARK(ttn) ((ttn)->vruntime |= 0x8000000000000000ULL) -#define YIELD_UNMARK(ttn) ((ttn)->vruntime &= 0x7FFFFFFFFFFFFFFFULL) - #define IS_REALTIME(ttn) ((ttn)->task_type == TT_REALTIME) #define IS_INTERACTIVE(ttn) ((ttn)->task_type == TT_INTERACTIVE) #define IS_NO_TYPE(ttn) ((ttn)->task_type == TT_NO_TYPE) @@ -418,14 +415,15 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr 
= rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - YIELD_MARK(&curr->se.tt_node); - /* * Are we the only task in the tree? */ if (unlikely(rq->nr_running == 1)) return; + if (cfs_rq->h_nr_running > 1) + YIELD_MARK(&curr->se.tt_node); + if (curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index d413aad1d0a37..721eb690abeb2 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -1,3 +1,5 @@ +#define YIELD_MARK(ttn) ((ttn)->vruntime |= 0x8000000000000000ULL) +#define YIELD_UNMARK(ttn) ((ttn)->vruntime &= 0x7FFFFFFFFFFFFFFFULL) /* * After fork, child runs first. If set to 0 (default) then @@ -44,13 +46,12 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) */ remove_entity_load_avg(&p->se); } - - /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; #endif /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; + YIELD_UNMARK(&p->se.tt_node); + update_scan_period(p, new_cpu); } From 9dd12e1bae868db24a55f6cb2c16f5569be38678 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Tue, 16 Nov 2021 23:47:38 +0300 Subject: [PATCH 09/11] Add sysctl to soften RT task priority kernel.sched_tt_rt_prio default value = -20 --- include/linux/sched/sysctl.h | 1 + kernel/sched/bs.c | 3 ++- kernel/sysctl.c | 13 +++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index f8f93ff881322..bddf5f9175840 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -30,6 +30,7 @@ extern unsigned int sysctl_sched_child_runs_first; #ifdef CONFIG_TT_SCHED extern unsigned int tt_max_lifetime; +extern int tt_rt_prio; #endif enum sched_tunable_scaling { diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index bbeb0c8443251..df645d722e766 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -11,6 +11,7 @@ #include "bs.h" unsigned int __read_mostly tt_max_lifetime = 22000; // in ms +int __read_mostly tt_rt_prio = -20; #define INTERACTIVE_HRRN 2U #define RT_WAIT_DELTA 800000U @@ -169,7 +170,7 @@ static u64 convert_to_vruntime(u64 delta, struct sched_entity *se) { struct task_struct *p = task_of(se); s64 prio_diff; - int prio = IS_REALTIME(&se->tt_node) ? -20 : PRIO_TO_NICE(p->prio); + int prio = IS_REALTIME(&se->tt_node) ? tt_rt_prio : PRIO_TO_NICE(p->prio); if (prio == 0) return delta; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1aa76d2dcf755..c0ae8c2ee59e5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -113,6 +113,10 @@ static int sixty = 60; #endif +#ifdef CONFIG_TT_SCHED +static int neg_twenty = -20; +static int thirty_nine = 39; +#endif static int __maybe_unused neg_one = -1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; @@ -1808,6 +1812,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_tt_rt_prio", + .data = &tt_rt_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_twenty, + .extra2 = &thirty_nine, + }, #endif #ifdef CONFIG_SCHEDSTATS { From db588007d2bdffe81c11f7d78b2f475ae00152f6 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Wed, 17 Nov 2021 01:18:54 +0300 Subject: [PATCH 10/11] Ported CFS `is cache hot` code during `can_migrate_task` to avoid migrate cache hot tasks. This also includes numa locality checks. 
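A reduced userspace sketch of the resulting decision order (illustrative
only; the struct, helpers and the numa_verdict parameter below are
simplifications, not kernel code): per-CPU kthreads, affinity misses and
currently-running tasks are rejected first, the NUMA locality verdict is
consulted next, and only when locality is neutral does the recency test
against sysctl_sched_migration_cost (500000 ns by default) decide:

    #include <stdbool.h>
    #include <stdint.h>

    struct task_sketch {
            bool     per_cpu_kthread;   /* kthread_is_per_cpu()            */
            bool     allowed_on_dst;    /* cpumask_test_cpu(dst, cpus_ptr) */
            bool     running;           /* task_running(src_rq, p)         */
            uint64_t exec_start_ns;     /* p->se.exec_start                */
    };

    static int64_t migration_cost_ns = 500000;  /* sysctl_sched_migration_cost */

    /* 1 = likely still cache hot on the source CPU, 0 = cold enough to move */
    static int task_hot_sketch(const struct task_sketch *p, uint64_t now_ns,
                               bool dst_shares_cache)
    {
            if (dst_shares_cache)              /* SMT siblings share cache */
                    return 0;
            if (migration_cost_ns == -1)       /* "always hot" tuning      */
                    return 1;
            if (migration_cost_ns == 0)        /* "never hot" tuning       */
                    return 0;
            return (int64_t)(now_ns - p->exec_start_ns) < migration_cost_ns;
    }

    /* numa_verdict: 1 = degrades locality, 0 = improves it, -1 = neutral */
    static int can_migrate_sketch(const struct task_sketch *p, uint64_t now_ns,
                                  bool dst_shares_cache, int numa_verdict)
    {
            if (p->per_cpu_kthread || !p->allowed_on_dst || p->running)
                    return 0;
            if (numa_verdict == -1)
                    numa_verdict = task_hot_sketch(p, now_ns, dst_shares_cache);
            return numa_verdict <= 0;
    }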
--- kernel/sched/bs.c | 105 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/bs.h | 3 ++ 2 files changed, 104 insertions(+), 4 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index df645d722e766..be1a55581f498 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -839,17 +839,114 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) return new_cpu; } +/* + * Is this task likely cache-hot: + */ +static int task_hot(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + s64 delta; + + lockdep_assert_rq_held(src_rq); + + if (p->sched_class != &fair_sched_class) + return 0; + + if (unlikely(task_has_idle_policy(p))) + return 0; + + /* SMT siblings share cache */ + if (cpus_share_cache(cpu_of(dst_rq), cpu_of(src_rq))) + return 0; + + if (sysctl_sched_migration_cost == -1) + return 1; + + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = sched_clock() - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + +#ifdef CONFIG_NUMA_BALANCING +/* + * Returns 1, if task migration degrades locality + * Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. + */ static int -can_migrate_task(struct task_struct *p, int dst_cpu, struct rq *src_rq) +migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) { - if (task_running(src_rq, p)) + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + + if (!static_branch_likely(&sched_numa_balancing)) + return -1; + + src_nid = cpu_to_node(cpu_of(src_rq)); + dst_nid = cpu_to_node(cpus_of(dst_cpu)); + + if (src_nid == dst_nid) + return -1; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return -1; + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) return 0; + /* Leaving a core idle is often worse than degrading locality. */ + if (dst_rq->idle_balance) + return -1; + + dist = node_distance(src_nid, dst_nid); + if (numa_group) { + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); + } else { + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); + } + + return dst_weight < src_weight; +} + +#else +static inline int migrate_degrades_locality(struct task_struct *p, + struct rq *dst_rq, struct rq *src_rq) +{ + return -1; +} +#endif + +static int +can_migrate_task(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + int tsk_cache_hot; + /* Disregard pcpu kthreads; they are where they need to be. 
*/ if (kthread_is_per_cpu(p)) return 0; - if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_running(src_rq, p)) + return 0; + + tsk_cache_hot = migrate_degrades_locality(p, dst_rq, src_rq); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, dst_rq, src_rq); + + if (tsk_cache_hot > 0) return 0; return 1; @@ -891,7 +988,7 @@ static int move_task(struct rq *dist_rq, struct rq *src_rq, while (ttn) { p = task_of(se_of(ttn)); - if (can_migrate_task(p, cpu_of(dist_rq), src_rq)) { + if (can_migrate_task(p, dist_rq, src_rq)) { pull_from(dist_rq, src_rq, src_rf, p); return 1; } diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 721eb690abeb2..b3d99cf135769 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -50,6 +50,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; + /* We have migrated, no longer consider this task hot */ + p->se.exec_start = 0; + YIELD_UNMARK(&p->se.tt_node); update_scan_period(p, new_cpu); From edabdb8ee7703ec4f3a33dfcd6ad0149eaf4d538 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Wed, 17 Nov 2021 01:32:04 +0300 Subject: [PATCH 11/11] fixed typos --- kernel/sched/bs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index be1a55581f498..793bb1c338d1d 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -886,7 +886,7 @@ migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *s return -1; src_nid = cpu_to_node(cpu_of(src_rq)); - dst_nid = cpu_to_node(cpus_of(dst_cpu)); + dst_nid = cpu_to_node(cpu_of(dst_rq)); if (src_nid == dst_nid) return -1;
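
For completeness, the knobs this series touches can be set at runtime, for
example (the -10 is only an illustrative value; sched_tt_rt_prio accepts
-20..39 and defaults to -20):

    sysctl kernel.sched_child_runs_first=0   # patch 01: parent (tries to) run first again
    sysctl kernel.sched_tt_rt_prio=-10       # patch 09: soften the TT realtime priority boost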