BACKPORT: sched/uclamp: Filter out uclamp_max for small tasks
DISCLAIMER:
=====================================================================

This patch is intended to go upstream after collecting feedback from
Android community that it resolves the issues reported by various
partners. It is not meant to be merged into android-mainline.

=====================================================================

The effectiveness of uclamp_max can easily be undermined by small
transient tasks that wake up frequently, do a small amount of work, and
then go back to sleep.

If a busy task is capped by uclamp_max to run at a lower frequency,
then due to the max-aggregation rule any task that wakes up on the same
cpu with a higher uclamp_max will raise the rq->uclamp_max value above
that of the capped task. Given that all tasks have uclamp_max = 1024 by
default, this is the likely case.
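
For example, assuming the capped task has uclamp_max = 512 (the value is
only illustrative) and the waking task keeps the default
uclamp_max = 1024, max-aggregation gives

	rq->uclamp[UCLAMP_MAX] = max(512, 1024) = 1024

for as long as the default task is runnable, which effectively lifts the
cap.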

Note that since the capped task is likely to be a busy and throttled
one, its util, and hence the rq util, will be very high; as soon as the
capping is lifted, the requested frequency will be very high as well.

To make uclamp_max more resilient against these transient tasks that
don't really need to run at a higher frequency, implement a simple
filter mechanism that ignores uclamp_max for such tasks.

The algorithm looks at the runtime of the task and compares it to
sched_slice(). By default, any task whose runtime is less than 1/4th of
sched_slice() is considered a small transient task whose uclamp_max
requirement can be ignored:

	runtime < sched_slice() / divider

The divider can be tweaked via the
/proc/sys/kernel/sched_uclamp_max_filter_divider sysctl, which accepts
values 0-4:

	divider = 1 << sysctl_sched_uclamp_max_filter_divider
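
As a rough sketch of the check (the function name below is hypothetical;
the real implementation is uclamp_can_ignore_uclamp_max() in
kernel/sched/fair.c), the condition reduces to a shift and a compare:

	/* runtime and slice in ns; shift is the sysctl value (0-4) */
	static bool runtime_within_filter(u64 runtime, u64 slice,
					  unsigned int shift)
	{
		/* runtime < slice / (1 << shift)  <=>  runtime << shift < slice */
		return (runtime << shift) < slice;
	}

With the default value of 2 the divider is 4, so a task whose previous
runtime was under a quarter of its sched_slice() gets its uclamp_max
ignored.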

We add a new task_tick_uclamp() function to verify this condition
periodically and ensure the conditions checked at wakeup still hold, in
case the transient task suddenly becomes a busy one.

For EAS, we can't use sched_slice() to figure out whether uclamp_max
will be ignored because the task is not enqueued yet. So we leave that
path as-is and base the placement on the worst-case scenario.

Signed-off-by: Qais Yousef <[email protected]>
Change-Id: Ie3afa93a7d70dab5b7c22e820cc078ffd0e891ef
[yaro: ported to msm-5.4 and removed sysctl parts for now]
Signed-off-by: Yaroslav Furman <[email protected]>
Qais Yousef authored and Official-Ayrton990 committed Jun 6, 2022
1 parent 7a49bfe commit d07fca6
Showing 5 changed files with 188 additions and 2 deletions.
1 change: 1 addition & 0 deletions include/linux/sched.h
@@ -768,6 +768,7 @@ struct uclamp_se {
unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
unsigned int active : 1;
unsigned int user_defined : 1;
unsigned int ignore_uclamp_max : 1;
};
#endif /* CONFIG_UCLAMP_TASK */

1 change: 1 addition & 0 deletions include/linux/sched/sysctl.h
@@ -125,6 +125,7 @@ extern int sysctl_sched_rt_runtime;
#ifdef CONFIG_UCLAMP_TASK
extern unsigned int sysctl_sched_uclamp_util_min;
extern unsigned int sysctl_sched_uclamp_util_max;
extern unsigned int sysctl_sched_uclamp_max_filter_divider;
#endif

#ifdef CONFIG_CFS_BANDWIDTH
23 changes: 21 additions & 2 deletions kernel/sched/core.c
@@ -814,6 +814,21 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
/* Max allowed maximum utilization */
unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
* Ignore uclamp_max for tasks if
*
* runtime < sched_slice() / divider
*
* ==>
*
* runtime * divider < sched_slice()
*
* where
*
* divider = 1 << sysctl_sched_uclamp_max_filter_divider
*/
unsigned int sysctl_sched_uclamp_max_filter_divider = 2;

/* All clamps are required to be less or equal than these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

@@ -984,7 +999,7 @@ unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
* This "local max aggregation" allows to track the exact "requested" value
* for each bucket when all its RUNNABLE tasks require the same clamp.
*/
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
@@ -1022,7 +1037,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
* always valid. If it's detected they are not, as defensive programming,
* enforce the expected state and warn.
*/
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id)
{
struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
@@ -1333,6 +1348,8 @@ static void uclamp_fork(struct task_struct *p)
for_each_clamp_id(clamp_id)
p->uclamp[clamp_id].active = false;

p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 0;

if (likely(!p->sched_reset_on_fork))
return;

@@ -1372,6 +1389,8 @@ static void __init init_uclamp(void)
uclamp_none(clamp_id), false);
}

init_task.uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 0;

/* System defaults allow max clamp values for both indexes */
uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
for_each_clamp_id(clamp_id) {
151 changes: 151 additions & 0 deletions kernel/sched/fair.c
@@ -3742,11 +3742,120 @@ static inline unsigned long uclamp_task_util(struct task_struct *p)
uclamp_eff_value(p, UCLAMP_MIN),
uclamp_eff_value(p, UCLAMP_MAX));
}

/*
* Check if we can ignore the uclamp_max requirement of a task. The goal is to
* prevent small transient tasks that share the rq with capped tasks from
* lifting the capping easily/unnecessarily, and hence increasing power
* consumption.
*
* Returns true if a task can finish its work within sched_slice() / divider,
* where divider = 1 << sysctl_sched_uclamp_max_filter_divider.
*
* We look at the immediate history of how long the task ran previously.
* Converting task util_avg into runtime or sched_slice() into capacity is not
* trivial and is an expensive operation. In practice this simple approach has
* proved effective at addressing the common source of noise. If a task
* suddenly becomes a busy one, we should detect that and lift the capping at
* tick, see task_tick_uclamp().
*/
static inline bool uclamp_can_ignore_uclamp_max(struct rq *rq,
struct task_struct *p)
{
unsigned long uclamp_min, uclamp_max, util;
unsigned long runtime, slice;
struct sched_entity *se;
struct cfs_rq *cfs_rq;

if (!uclamp_is_used())
return false;

/*
* If the task is boosted, we generally assume it is important, and
* ignoring its uclamp_max to keep the rq at a low performance level is
* unlikely to be the desired behavior.
*/
uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
if (uclamp_min)
return false;

/*
* If util has crossed uclamp_max threshold, then we have to ensure
* this is always enforced.
*/
util = task_util_est(p);
uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
if (util >= uclamp_max)
return false;

/*
* Based on the previous runtime, we check whether the allowed sched_slice()
* of the task is large enough for it to run without preemption:
*
* runtime < sched_slice() / divider
*
* ==>
*
* runtime * divider < sched_slice()
*
* where
*
* divider = 1 << sysctl_sched_uclamp_max_filter_divider
*
* There are 2 caveats:
*
* 1. When a task migrates on a big.LITTLE system, the runtime will not
* be representative (it is not capacity invariant). But this would
* be a one-off error.
*
* 2. runtime is not frequency invariant either. If the
* divider >= fmax/fmin we should be okay in general because that's
* the worst case scenario of how much the runtime will be stretched
* due to it being capped at the minimum frequency while the rq should
* run at max. The rule here is that the task should finish its work
* within its sched_slice(). Without this runtime scaling there's a
* small window for the task to ping-pong between the capped and
* uncapped states.
*
*/
se = &p->se;

runtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
if (!runtime)
return false;

cfs_rq = cfs_rq_of(se);
slice = sched_slice(cfs_rq, se);
runtime <<= sysctl_sched_uclamp_max_filter_divider;

if (runtime >= slice)
return false;

return true;
}

static inline void uclamp_set_ignore_uclamp_max(struct task_struct *p)
{
p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 1;
}

static inline void uclamp_reset_ignore_uclamp_max(struct task_struct *p)
{
p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max = 0;
}
#else
static inline unsigned long uclamp_task_util(struct task_struct *p)
{
return task_util_est(p);
}
static inline bool uclamp_can_ignore_uclamp_max(struct rq *rq,
struct task_struct *p)
{
return false;
}
static inline void uclamp_set_ignore_uclamp_max(struct task_struct *p) {}
static inline void uclamp_reset_ignore_uclamp_max(struct task_struct *p) {}
#endif

static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
@@ -5495,6 +5604,11 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int idle_h_nr_running = task_has_idle_policy(p);
int task_new = !(flags & ENQUEUE_WAKEUP);

if (uclamp_can_ignore_uclamp_max(rq, p)) {
uclamp_set_ignore_uclamp_max(p);
uclamp_rq_dec_id(rq, p, UCLAMP_MAX);
}

/*
* The code below (indirectly) updates schedutil which looks at
* the cfs_rq utilization to select a frequency.
@@ -5511,6 +5625,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

if (uclamp_is_ignore_uclamp_max(p))
uclamp_reset_ignore_uclamp_max(p);

for_each_sched_entity(se) {
if (se->on_rq)
break;
@@ -7294,6 +7411,12 @@ int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
* overutilized. Take uclamp into account to see how
* much capacity we can get out of the CPU; this is
* aligned with schedutil_cpu_util().
*
* When the task is enqueued, the uclamp_max of the
* task could be ignored, but it's hard for us to know
* this now since we can only know the sched_slice()
* after the task was enqueued. So we do the energy
* calculation based on worst case scenario.
*/
util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
if (!fits_capacity(util, cpu_cap))
@@ -11498,6 +11621,33 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle
static inline void nohz_newidle_balance(struct rq *this_rq) { }
#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_UCLAMP_TASK
static inline void task_tick_uclamp(struct rq *rq, struct task_struct *curr)
{
bool can_ignore = uclamp_can_ignore_uclamp_max(rq, curr);
bool is_ignored = uclamp_is_ignore_uclamp_max(curr);

/*
* Condition might have changed since we enqueued the task.
*
* If uclamp_max was ignored, we might need to reverse that decision.
*
* Or, we might not have ignored it (because uclamp_min != 0 for example)
* but the condition has changed now, so re-evaluate and ignore it if
* necessary.
*/
if (is_ignored && !can_ignore) {
uclamp_reset_ignore_uclamp_max(curr);
uclamp_rq_inc_id(rq, curr, UCLAMP_MAX);
} else if (!is_ignored && can_ignore) {
uclamp_set_ignore_uclamp_max(curr);
uclamp_rq_dec_id(rq, curr, UCLAMP_MAX);
}
}
#else
static inline void task_tick_uclamp(struct rq *rq, struct task_struct *curr) {}
#endif

#ifdef CONFIG_SCHED_WALT
static bool silver_has_big_tasks(void)
{
@@ -11763,6 +11913,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
#endif

update_overutilized_status(task_rq(curr));
task_tick_uclamp(rq, curr);
}

/*
14 changes: 14 additions & 0 deletions kernel/sched/sched.h
@@ -2734,6 +2734,14 @@ static inline bool uclamp_is_used(void)
{
return static_branch_likely(&sched_uclamp_used);
}
static inline bool uclamp_is_ignore_uclamp_max(struct task_struct *p)
{
return p->uclamp_req[UCLAMP_MAX].ignore_uclamp_max;
}
inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id);
inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
enum uclamp_id clamp_id);
#else /* CONFIG_UCLAMP_TASK */
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
@@ -2750,6 +2758,12 @@ static inline bool uclamp_is_used(void)
{
return false;
}
static inline bool uclamp_is_ignore_uclamp_max(struct task_struct *p)
{
return false;
}
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id) {}
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id) {}
#endif /* CONFIG_UCLAMP_TASK */

#ifdef CONFIG_UCLAMP_TASK_GROUP
