diff --git a/include/linux/sched.h b/include/linux/sched.h
index 994d8da69140..c5ce888597d5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -768,6 +768,7 @@ struct uclamp_se {
 	unsigned int bucket_id		: bits_per(UCLAMP_BUCKETS);
 	unsigned int active		: 1;
 	unsigned int user_defined	: 1;
+	unsigned int ignore_uclamp_max	: 1;
 };
 #endif /* CONFIG_UCLAMP_TASK */
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index a7a1e63e4a84..a02f76707243 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -125,6 +125,7 @@ extern int sysctl_sched_rt_runtime;
 #ifdef CONFIG_UCLAMP_TASK
 extern unsigned int sysctl_sched_uclamp_util_min;
 extern unsigned int sysctl_sched_uclamp_util_max;
+extern unsigned int sysctl_sched_uclamp_max_filter_divider;
 #endif
 
 #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b416e096dbe0..a82a81039e76 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -814,6 +814,21 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
 /* Max allowed maximum utilization */
 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
 
+/*
+ * Ignore uclamp_max for tasks if
+ *
+ *	runtime < sched_slice() / divider
+ *
+ * ==>
+ *
+ *	runtime * divider < sched_slice()
+ *
+ * where
+ *
+ *	divider = 1 << sysctl_sched_uclamp_max_filter_divider
+ */
+unsigned int sysctl_sched_uclamp_max_filter_divider = 2;
+
 /* All clamps are required to be less or equal than these values */
 static struct uclamp_se uclamp_default[UCLAMP_CNT];
 
@@ -984,7 +999,7 @@ unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
  * This "local max aggregation" allows to track the exact "requested" value
  * for each bucket when all its RUNNABLE tasks require the same clamp.
  */
-static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
 				    enum uclamp_id clamp_id)
 {
 	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
@@ -1022,7 +1037,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
  * always valid. If it's detected they are not, as defensive programming,
  * enforce the expected state and warn.
  */
-static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
 				    enum uclamp_id clamp_id)
 {
 	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
@@ -1333,6 +1348,8 @@ static void uclamp_fork(struct task_struct *p)
 	for_each_clamp_id(clamp_id)
 		p->uclamp[clamp_id].active = false;
 
+	p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 0;
+
 	if (likely(!p->sched_reset_on_fork))
 		return;
 
@@ -1372,6 +1389,8 @@ static void __init init_uclamp(void)
 			      uclamp_none(clamp_id), false);
 	}
 
+	init_task.uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 0;
+
 	/* System defaults allow max clamp values for both indexes */
 	uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
 	for_each_clamp_id(clamp_id) {
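A minimal userspace sketch of the runtime-vs-slice condition the new sysctl controls, useful for eyeballing the numbers; it mirrors only that part of the filter (not the uclamp_min/util checks), and the slice and runtime values as well as the helper names are illustrative assumptions, not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for sysctl_sched_uclamp_max_filter_divider. */
static unsigned int filter_divider = 2;

/*
 * Mirror of the runtime part of the patch's condition: ignore uclamp_max
 * when runtime * (1 << divider) < slice, i.e. the task ran for less than
 * slice / 2^divider during its previous activation.
 */
static bool would_ignore_uclamp_max(unsigned long long runtime_ns,
				    unsigned long long slice_ns)
{
	if (!runtime_ns)
		return false;
	return (runtime_ns << filter_divider) < slice_ns;
}

int main(void)
{
	/* Assumed values: a 4ms slice and a 500us previous runtime. */
	unsigned long long slice_ns = 4000000ULL;
	unsigned long long runtime_ns = 500000ULL;

	/* With divider = 2 the threshold is slice / 4 = 1ms, so this prints 1. */
	printf("%d\n", would_ignore_uclamp_max(runtime_ns, slice_ns));
	return 0;
}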
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b02a857d0b7..f6203dc1a14b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3742,11 +3742,120 @@ static inline unsigned long uclamp_task_util(struct task_struct *p)
 		     uclamp_eff_value(p, UCLAMP_MIN),
 		     uclamp_eff_value(p, UCLAMP_MAX));
 }
+
+/*
+ * Check if we can ignore the uclamp_max requirement of a task. The goal is to
+ * prevent small transient tasks that share the rq with other capped tasks
+ * from lifting the capping easily/unnecessarily, and hence increasing power
+ * consumption.
+ *
+ * Returns true if the task can finish its work within sched_slice() / divider,
+ * where divider = 1 << sysctl_sched_uclamp_max_filter_divider.
+ *
+ * We look at the immediate history of how long the task ran previously.
+ * Converting task util_avg into runtime, or sched_slice() into capacity, is
+ * not trivial and is an expensive operation. In practice this simple approach
+ * proved effective at addressing the common sources of noise. If a task
+ * suddenly becomes a busy task, we should detect that and lift the capping at
+ * tick, see task_tick_uclamp().
+ */
+static inline bool uclamp_can_ignore_uclamp_max(struct rq *rq,
+						struct task_struct *p)
+{
+	unsigned long uclamp_min, uclamp_max, util;
+	unsigned long runtime, slice;
+	struct sched_entity *se;
+	struct cfs_rq *cfs_rq;
+
+	if (!uclamp_is_used())
+		return false;
+
+	/*
+	 * If the task is boosted, we generally assume it is important and
+	 * ignoring its uclamp_max to retain the rq at a low performance level
+	 * is unlikely to be the desired behavior.
+	 */
+	uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+	if (uclamp_min)
+		return false;
+
+	/*
+	 * If util has crossed the uclamp_max threshold, then we have to ensure
+	 * it is always enforced.
+	 */
+	util = task_util_est(p);
+	uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+	if (util >= uclamp_max)
+		return false;
+
+	/*
+	 * Based on the previous runtime, we check whether the allowed
+	 * sched_slice() of the task is large enough for this task to run
+	 * without preemption:
+	 *
+	 *	runtime < sched_slice() / divider
+	 *
+	 * ==>
+	 *
+	 *	runtime * divider < sched_slice()
+	 *
+	 * where
+	 *
+	 *	divider = 1 << sysctl_sched_uclamp_max_filter_divider
+	 *
+	 * There are 2 caveats:
+	 *
+	 * 1. When a task migrates on a big.LITTLE system, the runtime will
+	 *    not be representative (it is not capacity invariant). But this
+	 *    would be a one-off error.
+	 *
+	 * 2. The runtime is not frequency invariant either. If
+	 *    divider >= fmax/fmin we should be okay in general, because that
+	 *    is the worst case scenario of how much the runtime will be
+	 *    stretched due to the task being capped to the minimum frequency
+	 *    while the rq should run at max. The rule here is that the task
+	 *    should finish its work within its sched_slice(). Without this
+	 *    runtime scaling there is a small opportunity for the task to
+	 *    ping-pong between the capped and uncapped state.
+	 */
+	se = &p->se;
+
+	runtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+	if (!runtime)
+		return false;
+
+	cfs_rq = cfs_rq_of(se);
+	slice = sched_slice(cfs_rq, se);
+	runtime <<= sysctl_sched_uclamp_max_filter_divider;
+
+	if (runtime >= slice)
+		return false;
+
+	return true;
+}
+
+static inline void uclamp_set_ignore_uclamp_max(struct task_struct *p)
+{
+	p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 1;
+}
+
+static inline void uclamp_reset_ignore_uclamp_max(struct task_struct *p)
+{
+	p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 0;
+}
 #else
 static inline unsigned long uclamp_task_util(struct task_struct *p)
 {
 	return task_util_est(p);
 }
+static inline bool uclamp_can_ignore_uclamp_max(struct rq *rq,
+						struct task_struct *p)
+{
+	return false;
+}
+static inline void uclamp_set_ignore_uclamp_max(struct task_struct *p) {}
+static inline void uclamp_reset_ignore_uclamp_max(struct task_struct *p) {}
 #endif
 
 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
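To make caveat 2 in the comment above concrete, a small standalone calculation of the worst-case runtime stretch; the CPU frequencies used here are illustrative assumptions.

#include <stdio.h>

/*
 * The measured runtime is not frequency invariant: a task capped to fmin
 * can appear up to fmax/fmin times "longer" than it would running at fmax.
 * If 1 << divider covers that ratio, the filter decision stays stable
 * across frequencies, which is what avoids the ping-pong mentioned above.
 */
int main(void)
{
	unsigned long fmax_khz = 2000000;	/* assumed 2.0 GHz */
	unsigned long fmin_khz =  500000;	/* assumed 500 MHz */
	unsigned int divider = 2;		/* default sysctl: 1 << 2 = 4 */

	unsigned long worst_stretch = fmax_khz / fmin_khz;	/* 4x */

	printf("worst-case runtime stretch: %lux\n", worst_stretch);
	printf("filter headroom: %ux\n", 1u << divider);
	/* Headroom >= stretch here, so a filtered task tends to stay filtered. */
	return 0;
}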
@@ -5495,6 +5604,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	int idle_h_nr_running = task_has_idle_policy(p);
 	int task_new = !(flags & ENQUEUE_WAKEUP);
 
+	if (uclamp_can_ignore_uclamp_max(rq, p)) {
+		uclamp_set_ignore_uclamp_max(p);
+		uclamp_rq_dec_id(rq, p, UCLAMP_MAX);
+	}
+
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
 	 * the cfs_rq utilization to select a frequency.
@@ -5511,6 +5625,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (p->in_iowait)
 		cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
 
+	if (uclamp_is_ignore_uclamp_max(p))
+		uclamp_reset_ignore_uclamp_max(p);
+
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
@@ -7294,6 +7411,12 @@ int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
 			 * overutilized. Take uclamp into account to see how
 			 * much capacity we can get out of the CPU; this is
 			 * aligned with schedutil_cpu_util().
+			 *
+			 * When the task is enqueued, the uclamp_max of the
+			 * task could be ignored, but it's hard for us to know
+			 * this now since we can only know the sched_slice()
+			 * after the task has been enqueued. So we do the
+			 * energy calculation based on the worst case scenario.
 			 */
 			util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
 			if (!fits_capacity(util, cpu_cap))
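The enqueue hunks above and the tick path below both act on the pair (is_ignored, can_ignore); here is a compact standalone restatement of those transitions, purely illustrative and not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/*
 * Restatement of the ignore/enforce transitions: the task's UCLAMP_MAX
 * contribution to the rq aggregation is dropped while the flag is set and
 * restored once the task no longer qualifies for filtering.
 */
enum uclamp_max_action {
	UCLAMP_MAX_KEEP,	/* nothing to do */
	UCLAMP_MAX_DROP,	/* set flag, remove task's max from rq aggregation */
	UCLAMP_MAX_RESTORE,	/* clear flag, add task's max back to rq aggregation */
};

static enum uclamp_max_action next_action(bool is_ignored, bool can_ignore)
{
	if (is_ignored && !can_ignore)
		return UCLAMP_MAX_RESTORE;
	if (!is_ignored && can_ignore)
		return UCLAMP_MAX_DROP;
	return UCLAMP_MAX_KEEP;
}

int main(void)
{
	/* Print the action for each of the four possible combinations. */
	for (int ignored = 0; ignored <= 1; ignored++)
		for (int can = 0; can <= 1; can++)
			printf("is_ignored=%d can_ignore=%d -> %d\n",
			       ignored, can, next_action(ignored, can));
	return 0;
}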
@@ -11498,6 +11621,33 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 static inline void nohz_newidle_balance(struct rq *this_rq) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
+#ifdef CONFIG_UCLAMP_TASK
+static inline void task_tick_uclamp(struct rq *rq, struct task_struct *curr)
+{
+	bool can_ignore = uclamp_can_ignore_uclamp_max(rq, curr);
+	bool is_ignored = uclamp_is_ignore_uclamp_max(curr);
+
+	/*
+	 * The condition might have changed since we enqueued the task.
+	 *
+	 * If uclamp_max was ignored, we might need to reverse that decision.
+	 *
+	 * Or we might not have ignored it (because uclamp_min != 0, for
+	 * example) but the condition has changed now, so re-evaluate and
+	 * ignore it if necessary.
+	 */
+	if (is_ignored && !can_ignore) {
+		uclamp_reset_ignore_uclamp_max(curr);
+		uclamp_rq_inc_id(rq, curr, UCLAMP_MAX);
+	} else if (!is_ignored && can_ignore) {
+		uclamp_set_ignore_uclamp_max(curr);
+		uclamp_rq_dec_id(rq, curr, UCLAMP_MAX);
+	}
+}
+#else
+static inline void task_tick_uclamp(struct rq *rq, struct task_struct *curr) {}
+#endif
+
 #ifdef CONFIG_SCHED_WALT
 static bool silver_has_big_tasks(void)
 {
@@ -11763,6 +11913,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 #endif
 
 	update_overutilized_status(task_rq(curr));
+	task_tick_uclamp(rq, curr);
 }
 
 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c452ff640a60..14abe4eadfe5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2734,6 +2734,14 @@ static inline bool uclamp_is_used(void)
 {
 	return static_branch_likely(&sched_uclamp_used);
 }
+static inline bool uclamp_is_ignore_uclamp_max(struct task_struct *p)
+{
+	return p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max;
+}
+inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+			     enum uclamp_id clamp_id);
+inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+			     enum uclamp_id clamp_id);
 #else /* CONFIG_UCLAMP_TASK */
 static inline unsigned long uclamp_rq_util_with(struct rq *rq,
 						unsigned long util,
@@ -2750,6 +2758,14 @@ static inline bool uclamp_is_used(void)
 {
 	return false;
 }
+static inline bool uclamp_is_ignore_uclamp_max(struct task_struct *p)
+{
+	return false;
+}
+static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
+				    enum uclamp_id clamp_id) {}
+static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
+				    enum uclamp_id clamp_id) {}
 #endif /* CONFIG_UCLAMP_TASK */
 
 #ifdef CONFIG_UCLAMP_TASK_GROUP
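For completeness, a minimal userspace mirror of the modified struct uclamp_se, showing that the new flag is a single extra bit that is cleared for a new task, as uclamp_fork() and init_uclamp() do above; the field widths are illustrative assumptions and the struct name is hypothetical.

#include <stdio.h>

/*
 * Userspace mirror of the modified struct uclamp_se. The widths of value
 * and bucket_id are illustrative (SCHED_CAPACITY_SCALE = 1024, a handful
 * of buckets); the point is only that ignore_uclamp_max is one more bit
 * alongside the existing bitfields.
 */
struct uclamp_se_demo {
	unsigned int value		: 11;
	unsigned int bucket_id		: 3;
	unsigned int active		: 1;
	unsigned int user_defined	: 1;
	unsigned int ignore_uclamp_max	: 1;
};

int main(void)
{
	struct uclamp_se_demo se = { .value = 1024, .ignore_uclamp_max = 1 };

	/* A newly forked task starts with the flag cleared, as in the patch. */
	se.ignore_uclamp_max = 0;

	printf("sizeof(struct uclamp_se_demo) = %zu\n", sizeof(se));
	printf("ignore_uclamp_max = %u\n", se.ignore_uclamp_max);
	return 0;
}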