From 9890117114f183765113106ffb428568a462daef Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 06:44:47 +0300 Subject: [PATCH 01/11] Make child runs first in fork by default. It can be switched back to parent run first via sysctl `sysctl kernel.sched_child_runs_first=0` --- kernel/sched/bs.c | 6 +++++- kernel/sched/bs.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 38d385e4521d8..778a538b3c6e1 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -1003,9 +1003,13 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (curr) + if (curr) { update_curr(cfs_rq); + if (sysctl_sched_child_runs_first) + resched_curr(rq); + } + rq_unlock(rq, &rf); } diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 7e63970246b4c..28f2f45070104 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -3,7 +3,7 @@ * After fork, child runs first. If set to 0 (default) then * parent will (try to) run first. */ -unsigned int sysctl_sched_child_runs_first __read_mostly; +unsigned int sysctl_sched_child_runs_first __read_mostly = 1; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; From 4f2d5fc603d4e0e5e350ff059e105c1efd533c51 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 07:50:17 +0300 Subject: [PATCH 02/11] revert adding hz values to Kconfig.hz since now the Hz work is in a separate patch (high-hz.patch). --- kernel/Kconfig.hz | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index b2ef596d63501..b4a1995149d05 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -47,27 +47,12 @@ choice on desktops with great smoothness without increasing CPU power consumption and sacrificing the battery life on laptops. - config HZ_833 - bool "833 HZ" - help - 833 Hz is the TT alternative to 1000 Hz. Choose 833 Hz - if you want a balance between latency and performance. - config HZ_1000 bool "1000 HZ" help 1000 Hz is the preferred choice for desktop systems and other systems requiring fast interactive responses to events. - config HZ_1666 - bool "1666 HZ" - help - 1666 Hz is for very high latency bound systems. Choose 1666 Hz - if you don't care about overall throughput or performance, but - you care more about latency (some realtime applications) require - low latency. The response and interactive processes with 1666 Hz - feel much snappier. - endchoice config HZ @@ -76,9 +61,7 @@ config HZ default 250 if HZ_250 default 300 if HZ_300 default 500 if HZ_500 - default 833 if HZ_833 default 1000 if HZ_1000 - default 1666 if HZ_1666 config SCHED_HRTICK def_bool HIGH_RES_TIMERS From bdf7933b51c4e2d744f1e40e5b11d7912b992482 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 20:27:41 +0300 Subject: [PATCH 03/11] Added tasks load accountings and statistics. 
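The bulk of this lands in the new kernel/sched/tt_stats.h, including the
util_est machinery whose EWMA comment fixes the sample weight at w = 1/4
(i.e. UTIL_EST_WEIGHT_SHIFT == 2). A reduced userspace sketch of that update
with a worked number, in case the shift form is not obvious (illustrative
only; ewma_update() is not a kernel function):

    #include <stdio.h>

    #define UTIL_EST_WEIGHT_SHIFT 2  /* w = 1/4, per the tt_stats.h comment */

    static unsigned long ewma_update(unsigned long ewma, unsigned long enqueued)
    {
            long last_ewma_diff = (long)enqueued - (long)ewma;

            ewma <<= UTIL_EST_WEIGHT_SHIFT;  /* 4 * ewma(t-1)          */
            ewma += last_ewma_diff;          /* + (sample - ewma(t-1)) */
            ewma >>= UTIL_EST_WEIGHT_SHIFT;  /* back down to ewma(t)   */
            return ewma;
    }

    int main(void)
    {
            /* ewma 400, new enqueued sample 480: (1600 + 80) / 4 = 420 */
            printf("%lu\n", ewma_update(400, 480));
            return 0;
    }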
--- init/Kconfig | 11 +- kernel/sched/bs.c | 53 +++ kernel/sched/bs.h | 112 ++++- kernel/sched/bs_nohz.h | 63 ++- kernel/sched/tt_stats.h | 913 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 1135 insertions(+), 17 deletions(-) create mode 100644 kernel/sched/tt_stats.h diff --git a/init/Kconfig b/init/Kconfig index 5a0f21056d848..a9b7e2d7fb1c8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -109,6 +109,14 @@ config TT_SCHED bool "TT Scheduler" default y +config TT_ACCOUNTING_STATS + bool "TT include all accounting and statistics" + depends on TT_SCHED + default y + help + This will include all CFS tasks' load accounting and statistics. + If you are using 'performance' governor and do not depend/care + about tasks statistics, then choose N. Otherwise say Y. menu "General setup" @@ -821,8 +829,7 @@ menu "Scheduler features" config UCLAMP_TASK bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL - depends on !TT_SCHED + depends on CPU_FREQ_GOV_SCHEDUTIL && TT_ACCOUNTING_STATS help This feature enables the scheduler to track the clamped utilization of each CPU based on RUNNABLE tasks scheduled on that CPU. diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 778a538b3c6e1..76d9ca2ad289a 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -7,6 +7,7 @@ #include "sched.h" #include "pelt.h" #include "fair_numa.h" +#include "tt_stats.h" #include "bs.h" unsigned int __read_mostly tt_max_lifetime = 22000; // in ms @@ -288,7 +289,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ + update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); account_entity_enqueue(cfs_rq, se); + check_schedstat_required(); + update_stats_enqueue(cfs_rq, se, flags); if (!curr) __enqueue_entity(cfs_rq, se); @@ -313,6 +325,17 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Subtract its load from the cfs_rq->runnable_avg. + * - Subtract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ + update_load_avg(cfs_rq, se, UPDATE_TG); + update_stats_dequeue(cfs_rq, se, flags); + if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); @@ -326,6 +349,23 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); + + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. + */ + util_est_enqueue(&rq->cfs, p); + + /* + * If in_iowait is set, the code below may not trigger any cpufreq + * utilization updates, so do it here explicitly with the IOWAIT flag + * passed. 
+ */ + if (p->in_iowait) + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); if (!se->on_rq) { enqueue_entity(cfs_rq, se, flags); @@ -334,6 +374,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) } add_nr_running(rq, 1); + + if (!task_new) + update_overutilized_status(rq); } static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) @@ -341,6 +384,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); int idle_h_nr_running = task_has_idle_policy(p); + int task_sleep = flags & DEQUEUE_SLEEP; + + util_est_dequeue(&rq->cfs, p); dequeue_entity(cfs_rq, se, flags); @@ -348,6 +394,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->idle_h_nr_running -= idle_h_nr_running; sub_nr_running(rq, 1); + util_est_update(&rq->cfs, p, task_sleep); } static void yield_task_fair(struct rq *rq) @@ -849,6 +896,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); raw_spin_unlock(&this_rq->__lock); + update_blocked_averages(this_cpu); + for_each_online_cpu(cpu) { /* * Stop searching for tasks to pull if there are @@ -1052,6 +1101,10 @@ DEFINE_SCHED_CLASS(fair) = { .get_rr_interval = get_rr_interval_fair, .update_curr = update_curr_fair, + +#ifdef CONFIG_UCLAMP_TASK + .uclamp_enabled = 1, +#endif }; __init void init_sched_fair_class(void) diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 28f2f45070104..80f813826406d 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -19,13 +19,38 @@ int __weak arch_asym_cpu_priority(int cpu) } /* Give new sched_entity start runnable values to heavy its load in infant time */ -void init_entity_runnable_average(struct sched_entity *se) {} -void post_init_entity_util_avg(struct task_struct *p) {} void update_max_interval(void) {} static int newidle_balance(struct rq *this_rq, struct rq_flags *rf); static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { +#ifdef CONFIG_TT_ACCOUNTING_STATS + if (p->on_rq == TASK_ON_RQ_MIGRATING) { + /* + * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' + * rq->lock and can modify state directly. + */ + lockdep_assert_rq_held(task_rq(p)); + detach_entity_cfs_rq(&p->se); + + } else { + /* + * We are supposed to update the task to "current" time, then + * its up to date and ready to go to new CPU/cfs_rq. But we + * have difficulty in getting what current time is, so simply + * throw away the out-of-date time. This will result in the + * wakee task is less decayed, but giving the wakee more load + * sounds not bad. 
+ */ + remove_entity_load_avg(&p->se); + } + + /* We have migrated, no longer consider this task hot */ + p->se.exec_start = 0; +#endif + /* Tell new CPU we are migrated */ + p->se.avg.last_update_time = 0; + update_scan_period(p, new_cpu); } @@ -51,7 +76,67 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #endif } -void reweight_task(struct task_struct *p, int prio) {} +#ifdef CONFIG_TT_ACCOUNTING_STATS +static void update_curr(struct cfs_rq *cfs_rq); + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + update_load_sub(&cfs_rq->load, se->load.weight); + } + dequeue_load_avg(cfs_rq, se); + + update_load_set(&se->load, weight); + +#ifdef CONFIG_SMP + do { + u32 divider = get_pelt_divider(&se->avg); + + se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); + } while (0); +#endif + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) + update_load_add(&cfs_rq->load, se->load.weight); + +} +#endif + +void reweight_task(struct task_struct *p, int prio) +{ +#ifdef CONFIG_TT_ACCOUNTING_STATS + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + unsigned long weight = scale_load(sched_prio_to_weight[prio]); + + reweight_entity(cfs_rq, se, weight); + load->inv_weight = sched_prio_to_wmult[prio]; +#endif +} static inline struct sched_entity *se_of(struct tt_node *ttn) { @@ -109,12 +194,10 @@ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SMP - if (entity_is_task(se)) { - struct rq *rq = rq_of(cfs_rq); + struct rq *rq = rq_of(cfs_rq); - account_numa_enqueue(rq, task_of(se)); - list_add(&se->group_node, &rq->cfs_tasks); - } + account_numa_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -123,10 +206,8 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_SMP - if (entity_is_task(se)) { - account_numa_dequeue(rq_of(cfs_rq), task_of(se)); - list_del_init(&se->group_node); - } + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); #endif cfs_rq->nr_running--; } @@ -153,10 +234,15 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } -static void switched_from_fair(struct rq *rq, struct task_struct *p) {} +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + detach_task_cfs_rq(p); +} static void switched_to_fair(struct rq *rq, struct task_struct *p) { + attach_task_cfs_rq(p); + if (task_on_rq_queued(p)) { /* * We were most likely switched from sched_rt, so diff --git a/kernel/sched/bs_nohz.h b/kernel/sched/bs_nohz.h index c1fdf26bbe61a..3802951591f36 100644 --- a/kernel/sched/bs_nohz.h +++ b/kernel/sched/bs_nohz.h @@ -11,8 +11,6 @@ static struct { #endif /* CONFIG_NO_HZ_COMMON */ - - #ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { @@ -61,9 +59,66 @@ static inline void update_blocked_load_tick(struct 
rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +#ifdef CONFIG_TT_ACCOUNTING_STATS +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + unsigned long thermal_pressure; + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq)); + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; +} + +static bool __update_blocked_fair(struct rq *rq, bool *done) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + bool decayed; + + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; + + return decayed; +} + +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + rq_lock_irqsave(rq, &rf); + update_blocked_load_tick(rq); + update_rq_clock(rq); + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} +#else +static void update_blocked_averages(int cpu) {} +#endif #ifdef CONFIG_NO_HZ_COMMON /* @@ -388,6 +443,8 @@ static bool update_nohz_stats(struct rq *rq) if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick))) return true; + update_blocked_averages(cpu); + return rq->has_blocked_load; } @@ -629,4 +686,6 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ if (nohz_idle_balance(this_rq, idle)) return; + + update_blocked_averages(this_rq->cpu); } diff --git a/kernel/sched/tt_stats.h b/kernel/sched/tt_stats.h new file mode 100644 index 0000000000000..0f8f7e61da66a --- /dev/null +++ b/kernel/sched/tt_stats.h @@ -0,0 +1,913 @@ +#ifdef CONFIG_TT_ACCOUNTING_STATS +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. 
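+ *
+ * Example: with *_ptr == 120 and _val == 150, "res = var - val" wraps
+ * around above var, so res is clamped to 0 before being published via
+ * WRITE_ONCE(); lockless readers never observe the wrapped-around value.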
+ */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 wait_start, prev_wait_start; + + if (!schedstat_enabled()) + return; + + wait_start = rq_clock(rq_of(cfs_rq)); + prev_wait_start = schedstat_val(se->statistics.wait_start); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > prev_wait_start)) + wait_start -= prev_wait_start; + + __schedstat_set(se->statistics.wait_start, wait_start); +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta; + + if (!schedstat_enabled()) + return; + + /* + * When the sched_schedstat changes from 0 to 1, some sched se + * maybe already in the runqueue, the se->statistics.wait_start + * will be 0.So it will let the delta wrong. We need to avoid this + * scenario. + */ + if (unlikely(!schedstat_val(se->statistics.wait_start))) + return; + + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + __schedstat_set(se->statistics.wait_start, delta); + return; + } + trace_sched_stat_wait(p, delta); + } + + __schedstat_set(se->statistics.wait_max, + max(schedstat_val(se->statistics.wait_max), delta)); + __schedstat_inc(se->statistics.wait_count); + __schedstat_add(se->statistics.wait_sum, delta); + __schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *tsk = NULL; + u64 sleep_start, block_start; + + if (!schedstat_enabled()) + return; + + sleep_start = schedstat_val(se->statistics.sleep_start); + block_start = schedstat_val(se->statistics.block_start); + + if (entity_is_task(se)) + tsk = task_of(se); + + if (sleep_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) + __schedstat_set(se->statistics.sleep_max, delta); + + __schedstat_set(se->statistics.sleep_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (block_start) { + u64 delta = rq_clock(rq_of(cfs_rq)) - block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > schedstat_val(se->statistics.block_max))) + __schedstat_set(se->statistics.block_max, delta); + + __schedstat_set(se->statistics.block_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); + + if (tsk) { + if (tsk->in_iowait) { + __schedstat_add(se->statistics.iowait_sum, delta); + __schedstat_inc(se->statistics.iowait_count); + trace_sched_stat_iowait(tsk, delta); + } + + trace_sched_stat_blocked(tsk, delta); + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + 
account_scheduler_latency(tsk, delta >> 10, 0); + } + } +} + +/* + * Task is being enqueued - update stats: + */ +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + if (!schedstat_enabled()) + return; + + /* + * Are we enqueueing a waiting task? (for current tasks + * a dequeue/enqueue event is a NOP) + */ + if (se != cfs_rq->curr) + update_stats_wait_start(cfs_rq, se); + + if (flags & ENQUEUE_WAKEUP) + update_stats_enqueue_sleeper(cfs_rq, se); +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + + if (!schedstat_enabled()) + return; + + /* + * Mark the end of the wait period if dequeueing a + * waiting task: + */ + if (se != cfs_rq->curr) + update_stats_wait_end(cfs_rq, se); + + if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + unsigned int state; + + /* XXX racy against TTWU */ + state = READ_ONCE(tsk->__state); + if (state & TASK_INTERRUPTIBLE) + __schedstat_set(se->statistics.sleep_start, + rq_clock(rq_of(cfs_rq))); + if (state & TASK_UNINTERRUPTIBLE) + __schedstat_set(se->statistics.block_start, + rq_clock(rq_of(cfs_rq))); + } +} +#else +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} +#endif /* CONFIG_TT_ACCOUNTING_STATS */ + +static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) +{ + struct rq *rq = rq_of(cfs_rq); + + if (&rq->cfs == cfs_rq) { + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq, flags); + } +} + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +/* + * The margin used when comparing utilization with CPU capacity. 
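+ * With the 1280/1024 ratio below, a utilization "fits" a capacity only
+ * while it stays under ~80% of it; e.g. for max == 1024 that means
+ * cap <= 819.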
+ * + * (default: ~20%) + */ +#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) + +static unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u32 divider = get_pelt_divider(&se->avg); + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; +} +#else +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} + +static inline int propagate_entity_load_avg(struct sched_entity *se) +{ + return 0; +} + +static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} + +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_pelt() + * @cfs_rq: cfs_rq to update + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed load. + * + * Since both these conditions indicate a changed cfs_rq->avg.load we should + * call update_tg_load_avg() when this function returns true. + */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) +{ + unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; + struct sched_avg *sa = &cfs_rq->avg; + int decayed = 0; + + if (cfs_rq->removed.nr) { + unsigned long r; + u32 divider = get_pelt_divider(&cfs_rq->avg); + + raw_spin_lock(&cfs_rq->removed.lock); + swap(cfs_rq->removed.util_avg, removed_util); + swap(cfs_rq->removed.load_avg, removed_load); + swap(cfs_rq->removed.runnable_avg, removed_runnable); + cfs_rq->removed.nr = 0; + raw_spin_unlock(&cfs_rq->removed.lock); + + r = removed_load; + sub_positive(&sa->load_avg, r); + sa->load_sum = sa->load_avg * divider; + + r = removed_util; + sub_positive(&sa->util_avg, r); + sa->util_sum = sa->util_avg * divider; + + r = removed_runnable; + sub_positive(&sa->runnable_avg, r); + sa->runnable_sum = sa->runnable_avg * divider; + + /* + * removed_runnable is the unweighted version of removed_load so we + * can use it to estimate removed_load_sum. + */ + add_tg_cfs_propagate(cfs_rq, + -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT); + + decayed = 1; + } + + decayed |= __update_load_avg_cfs_rq(now, cfs_rq); + +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->load_last_update_time_copy = sa->last_update_time; +#endif + + return decayed; +} + +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. 
+ */ + u32 divider = get_pelt_divider(&cfs_rq->avg); + + /* + * When we attach the @se to the @cfs_rq, we must align the decay + * window because without that, really weird and wonderful things can + * happen. + * + * XXX illustrate + */ + se->avg.last_update_time = cfs_rq->avg.last_update_time; + se->avg.period_contrib = cfs_rq->avg.period_contrib; + + /* + * Hell(o) Nasty stuff.. we need to recompute _sum based on the new + * period_contrib. This isn't strictly correct, but since we're + * entirely outside of the PELT hierarchy, nobody cares if we truncate + * _sum a little. + */ + se->avg.util_sum = se->avg.util_avg * divider; + + se->avg.runnable_sum = se->avg.runnable_avg * divider; + + se->avg.load_sum = divider; + if (se_weight(se)) { + se->avg.load_sum = + div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se)); + } + + enqueue_load_avg(cfs_rq, se); + cfs_rq->avg.util_avg += se->avg.util_avg; + cfs_rq->avg.util_sum += se->avg.util_sum; + cfs_rq->avg.runnable_avg += se->avg.runnable_avg; + cfs_rq->avg.runnable_sum += se->avg.runnable_sum; + + add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); + + cfs_rq_util_change(cfs_rq, 0); + + trace_pelt_cfs_tp(cfs_rq); +} + +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ +static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = get_pelt_divider(&cfs_rq->avg); + + dequeue_load_avg(cfs_rq, se); + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; + sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; + + add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); + + cfs_rq_util_change(cfs_rq, 0); + + trace_pelt_cfs_tp(cfs_rq); +} + +/* + * Optional action to be done while updating the load average + */ +#define UPDATE_TG 0x1 +#define SKIP_AGE_LOAD 0x2 +#define DO_ATTACH 0x4 + +/* Update task and its cfs_rq load average */ +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + u64 now = cfs_rq_clock_pelt(cfs_rq); + int decayed; + + /* + * Track task load average for carrying it to new CPU after migrated, and + * track group sched_entity load average for task_h_load calc in migration + */ + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cfs_rq, se); + + decayed = update_cfs_rq_load_avg(now, cfs_rq); + decayed |= propagate_entity_load_avg(se); + + if (!se->avg.last_update_time && (flags & DO_ATTACH)) { + + /* + * DO_ATTACH means we're here from enqueue_entity(). + * !last_update_time means we've passed through + * migrate_task_rq_fair() indicating we migrated. + * + * IOW we're enqueueing a task on a new CPU. 
+ */ + attach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); + + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq); + } +} + +#ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + u64 last_update_time_copy; + u64 last_update_time; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); + + return last_update_time; +} +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} +#endif + +/* + * Synchronize entity load avg of dequeued entity without locking + * the previous rq. + */ +static void sync_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + __update_load_avg_blocked_se(last_update_time, se); +} + +/* + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). + */ +static void remove_entity_load_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + unsigned long flags; + + /* + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + */ + + sync_entity_load_avg(se); + + raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); + ++cfs_rq->removed.nr; + cfs_rq->removed.util_avg += se->avg.util_avg; + cfs_rq->removed.load_avg += se->avg.load_avg; + cfs_rq->removed.runnable_avg += se->avg.runnable_avg; + raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); +} + +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.runnable_avg; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; +} + +static inline unsigned long task_util(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_avg); +} + +static inline unsigned long _task_util_est(struct task_struct *p) +{ + struct util_est ue = READ_ONCE(p->se.avg.util_est); + + return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ + return max(task_util(p), _task_util_est(p)); +} + +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + +static inline void util_est_enqueue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued += _task_util_est(p); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + 
trace_sched_util_est_cfs_tp(cfs_rq); +} + +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) + +/* + * Check if a (signed) value is within a specified (unsigned) margin, + * based on the observation that: + * + * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) + * + * NOTE: this only works when value + margin < INT_MAX. + */ +static inline bool within_margin(int value, int margin) +{ + return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); +} + +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) +{ + long last_ewma_diff, last_enqueued_diff; + struct util_est ue; + + if (!sched_feat(UTIL_EST)) + return; + + /* + * Skip update of task's estimated utilization when the task has not + * yet completed an activation, e.g. being migrated. + */ + if (!task_sleep) + return; + + /* + * If the PELT values haven't changed since enqueue time, + * skip the util_est update. + */ + ue = p->se.avg.util_est; + if (ue.enqueued & UTIL_AVG_UNCHANGED) + return; + + last_enqueued_diff = ue.enqueued; + + /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + ue.enqueued = task_util(p); + if (sched_feat(UTIL_EST_FASTUP)) { + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; + } + } + + /* + * Skip update of task's estimated utilization when its members are + * already ~1% close to its last activation value. + */ + last_ewma_diff = ue.enqueued - ue.ewma; + last_enqueued_diff -= ue.enqueued; + if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { + if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) + goto done; + + return; + } + + /* + * To avoid overestimation of actual task utilization, skip updates if + * we cannot grant there is idle time in this CPU. + */ + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) + return; + + /* + * Update Task's estimated utilization + * + * When *p completes an activation we can consolidate another sample + * of the task size. This is done by storing the current PELT value + * as ue.enqueued and by using this value to update the Exponential + * Weighted Moving Average (EWMA): + * + * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) + * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) + * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) + * = w * ( last_ewma_diff ) + ewma(t-1) + * = w * (last_ewma_diff + ewma(t-1) / w) + * + * Where 'w' is the weight of new samples, which is configured to be + * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) + */ + ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; + ue.ewma += last_ewma_diff; + ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: + ue.enqueued |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(p->se.avg.util_est, ue); + + trace_sched_util_est_se_tp(&p->se); +} + +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return fits_capacity(uclamp_task_util(p), capacity); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.avg.load_avg; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p || p->nr_cpus_allowed == 1) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. 
+ */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); +} + +#else /* CONFIG_SMP && CONFIG_TT_ACCOUNTING_STATS */ + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + return true; +} + +#define UPDATE_TG 0x0 +#define SKIP_AGE_LOAD 0x0 +#define DO_ATTACH 0x0 + +static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) +{ + cfs_rq_util_change(cfs_rq, 0); +} + +static inline void remove_entity_load_avg(struct sched_entity *se) {} + +static inline void +attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} + +static inline void +util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} + +#endif /* CONFIG_SMP */ + +static inline void check_schedstat_required(void) +{ +#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_TT_ACCOUNTING_STATS) + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enable or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +static inline unsigned long cpu_util(int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned int util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static inline bool cpu_overutilized(int cpu) +{ + return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); +} + +static inline void update_overutilized_status(struct rq *rq) +{ + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { + WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); + } +} + +#else +static inline void update_overutilized_status(struct rq *rq) { } +#endif + +#ifdef CONFIG_TT_ACCOUNTING_STATS +static void detach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Catch up with the cfs_rq and remove our load when we leave */ + update_load_avg(cfs_rq, se, 0); + detach_entity_load_avg(cfs_rq, se); +} +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) +/* Give new sched_entity start runnable values to heavy its load in infant time */ +void init_entity_runnable_average(struct sched_entity *se) +{ + struct sched_avg *sa = &se->avg; + + memset(sa, 0, sizeof(*sa)); + + /* + * Tasks are initialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are initialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. 
+ */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); + + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ +} + +static void attach_entity_cfs_rq(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* Synchronize entity with its cfs_rq */ + update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); +} + +static void detach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + detach_entity_cfs_rq(se); +} + +static void attach_task_cfs_rq(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + attach_entity_cfs_rq(se); +} + +void post_init_entity_util_avg(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct sched_avg *sa = &se->avg; + long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); + long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; + + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq); + return; + } + + attach_entity_cfs_rq(se); +} +#else /* !CONFIG_SMP */ +static void detach_task_cfs_rq(struct task_struct *p) {} +static void attach_task_cfs_rq(struct task_struct *p) {} +void init_entity_runnable_average(struct sched_entity *se) {} +void post_init_entity_util_avg(struct task_struct *p) {} +#endif + From 84b763697edebae91b2235e52841ed4dc02d0cd3 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sat, 13 Nov 2021 21:54:14 +0300 Subject: [PATCH 04/11] fix when enabling CONFIG_NUMA_BALANCING --- kernel/sched/bs.c | 2 +- kernel/sched/fair_numa.h | 34 ------------------- kernel/sched/tt_stats.h | 70 +++++++++++++++++++++------------------- 3 files changed, 37 insertions(+), 69 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 76d9ca2ad289a..2813bf71cb246 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -6,8 +6,8 @@ */ #include "sched.h" #include "pelt.h" -#include "fair_numa.h" #include "tt_stats.h" +#include "fair_numa.h" #include "bs.h" unsigned int __read_mostly tt_max_lifetime = 22000; // in ms diff --git a/kernel/sched/fair_numa.h b/kernel/sched/fair_numa.h index fd49208da7c69..a0860a2832862 100644 --- a/kernel/sched/fair_numa.h +++ b/kernel/sched/fair_numa.h @@ -486,40 +486,16 @@ struct task_numa_env { int best_cpu; }; -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.load_avg; -} - static unsigned long cpu_load(struct rq *rq) { return cfs_rq_load_avg(&rq->cfs); } -static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.runnable_avg; -} - static unsigned long cpu_runnable(struct rq *rq) { return cfs_rq_runnable_avg(&rq->cfs); } -static inline unsigned long cpu_util(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - - return 
min_t(unsigned long, util, capacity_orig_of(cpu)); -} - /* * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. * This is an approximation as the number of running tasks may not be @@ -590,11 +566,6 @@ static inline int numa_idle_core(int idle_core, int cpu) } #endif -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - /* * Gather all necessary information to make NUMA balancing placement * decisions that are compatible with standard load balancer. This @@ -716,11 +687,6 @@ static bool load_too_imbalanced(long src_load, long dst_load, return (imb > old_imb); } -static unsigned long task_h_load(struct task_struct *p) -{ - return p->se.avg.load_avg; -} - /* * Maximum NUMA importance can be 1998 (2*999); * SMALLIMP @ 30 would be close to 1998/64. diff --git a/kernel/sched/tt_stats.h b/kernel/sched/tt_stats.h index 0f8f7e61da66a..2e897866c7f8b 100644 --- a/kernel/sched/tt_stats.h +++ b/kernel/sched/tt_stats.h @@ -221,6 +221,42 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } +#if defined(CONFIG_NUMA_BALANCING) || (defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS)) +static unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.load_avg; +} + +static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.runnable_avg; +} + +static inline unsigned long cpu_util(int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned int util; + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.avg.load_avg; +} +#endif + #if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) /* * The margin used when comparing utilization with CPU capacity. 
@@ -229,11 +265,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) */ #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024) -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -517,16 +548,6 @@ static void remove_entity_load_avg(struct sched_entity *se) raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); } -static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.runnable_avg; -} - -static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) -{ - return cfs_rq->avg.load_avg; -} - static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -696,11 +717,6 @@ static inline int task_fits_capacity(struct task_struct *p, long capacity) return fits_capacity(uclamp_task_util(p), capacity); } -static unsigned long task_h_load(struct task_struct *p) -{ - return p->se.avg.load_avg; -} - static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { if (!static_branch_unlikely(&sched_asym_cpucapacity)) @@ -780,20 +796,6 @@ static inline void check_schedstat_required(void) } #if defined(CONFIG_SMP) && defined(CONFIG_TT_ACCOUNTING_STATS) -static inline unsigned long cpu_util(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - static inline bool cpu_overutilized(int cpu) { return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); From 0e451ef78e155fa87273357a725ace2841d47213 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sun, 14 Nov 2021 01:06:33 +0300 Subject: [PATCH 05/11] Add missing load update in couple functions: trigger_loadbalancer and task_dead. --- kernel/sched/bs.c | 1 + kernel/sched/bs.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 2813bf71cb246..f61c883b51734 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -1012,6 +1012,7 @@ void trigger_load_balance(struct rq *this_rq) if (unlikely(on_null_domain(this_rq) || !cpu_active(cpu_of(this_rq)))) return; + update_blocked_averages(this_rq->cpu); nohz_balancer_kick(this_rq); } diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 80f813826406d..d413aad1d0a37 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -58,12 +58,16 @@ static void rq_online_fair(struct rq *rq) {} static void rq_offline_fair(struct rq *rq) {} static void task_dead_fair(struct task_struct *p) { +#ifdef CONFIG_TT_ACCOUNTING_STATS + remove_entity_load_avg(&p->se); +#else struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); unsigned long flags; raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags); ++cfs_rq->removed.nr; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); +#endif } #endif /** CONFIG_SMP */ From 0bf382ee5cfe826f3974e587f0b450cb84808af0 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sun, 14 Nov 2021 02:16:42 +0300 Subject: [PATCH 06/11] Forgot to update load in the most important functions :D update_curr, set_next_task, put_prev_task, and others. 
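The update_curr() part of this boils down to the usual "charge the window
since se->exec_start, then restart the window" bookkeeping. A reduced
userspace sketch (illustrative only; the struct and names below are
stand-ins, not kernel code), with exec_max playing the role of the
schedstat this patch starts maintaining:

    #include <stdint.h>
    #include <stdio.h>

    struct entity_sketch {
            uint64_t exec_start;        /* timestamp of the last accounting pass */
            uint64_t sum_exec_runtime;  /* total CPU time charged so far         */
            uint64_t exec_max;          /* schedstat: largest single delta seen  */
    };

    static void update_curr_sketch(struct entity_sketch *se, uint64_t now)
    {
            uint64_t delta_exec = now - se->exec_start;

            if ((int64_t)delta_exec <= 0)
                    return;

            se->exec_start = now;                 /* restart the window     */
            se->sum_exec_runtime += delta_exec;   /* lifetime runtime       */
            if (delta_exec > se->exec_max)        /* cf. schedstat exec_max */
                    se->exec_max = delta_exec;
    }

    int main(void)
    {
            struct entity_sketch se = { .exec_start = 1000 };

            update_curr_sketch(&se, 4000);   /* charges 3000 */
            update_curr_sketch(&se, 4500);   /* charges  500 */
            printf("%llu %llu\n",
                   (unsigned long long)se.sum_exec_runtime,
                   (unsigned long long)se.exec_max);   /* prints "3500 3000" */
            return 0;
    }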
--- kernel/sched/bs.c | 59 +++++++++++++++++++++++++++++++++++++---- kernel/sched/tt_stats.h | 4 +++ 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index f61c883b51734..0b289f655e65e 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -192,6 +192,9 @@ static void update_curr(struct cfs_rq *cfs_rq) struct tt_node *ttn = &curr->tt_node; u64 now = sched_clock(); u64 delta_exec; +#ifdef CONFIG_TT_ACCOUNTING_STATS + struct task_struct *curtask = task_of(curr); +#endif if (unlikely(!curr)) return; @@ -201,13 +204,26 @@ static void update_curr(struct cfs_rq *cfs_rq) return; curr->exec_start = now; + +#ifdef CONFIG_TT_ACCOUNTING_STATS + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); +#endif curr->sum_exec_runtime += delta_exec; +#ifdef CONFIG_TT_ACCOUNTING_STATS + schedstat_add(cfs_rq->exec_clock, delta_exec); +#endif ttn->curr_burst += delta_exec; ttn->vruntime += convert_to_vruntime(delta_exec, curr); detect_type(ttn, now, 0); - normalize_lifetime(now, &curr->tt_node); + +#ifdef CONFIG_TT_ACCOUNTING_STATS + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cgroup_account_cputime(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); +#endif } static void update_curr_fair(struct rq *rq) @@ -434,11 +450,33 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (se->on_rq) + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); + } se->exec_start = sched_clock(); cfs_rq->curr = se; + +#ifdef CONFIG_TT_ACCOUNTING_STATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (schedstat_enabled() && + rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { + schedstat_set(se->statistics.slice_max, + max((u64)schedstat_val(se->statistics.slice_max), + se->sum_exec_runtime - se->prev_sum_exec_runtime)); + } +#endif se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -498,6 +536,8 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif + update_misfit_status(p, rq); + return p; idle: @@ -561,11 +601,12 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: */ - if (prev->on_rq) + if (prev->on_rq) { update_curr(cfs_rq); - - if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); __enqueue_entity(cfs_rq, prev); + update_load_avg(cfs_rq, prev, 0); + } cfs_rq->curr = NULL; } @@ -607,6 +648,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { update_curr(cfs_rq); + /* + * Ensure that runnable average is periodically updated. 
+ */ + update_load_avg(cfs_rq, curr, UPDATE_TG); + if (cfs_rq->nr_running > 1) check_preempt_tick(cfs_rq, curr); } @@ -1028,6 +1074,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); + update_overutilized_status(task_rq(curr)); } static void task_fork_fair(struct task_struct *p) diff --git a/kernel/sched/tt_stats.h b/kernel/sched/tt_stats.h index 2e897866c7f8b..7aa1e8936be4f 100644 --- a/kernel/sched/tt_stats.h +++ b/kernel/sched/tt_stats.h @@ -193,6 +193,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } #else static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) {} +static inline void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {} From da59d0d57aaf62c4ae6193d679de5336f8df3547 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Sun, 14 Nov 2021 03:52:19 +0300 Subject: [PATCH 07/11] update_blocked_averages in trigger_loadbalancer every 19ms instead of every tick. --- kernel/sched/bs.c | 11 ++++++++++- kernel/sched/bs_nohz.h | 2 -- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 0b289f655e65e..758cd410d6e6d 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -1058,7 +1058,16 @@ void trigger_load_balance(struct rq *this_rq) if (unlikely(on_null_domain(this_rq) || !cpu_active(cpu_of(this_rq)))) return; - update_blocked_averages(this_rq->cpu); +#ifdef CONFIG_TT_ACCOUNTING_STATS + if (time_after_eq(jiffies, this_rq->next_balance)) { + /* scale ms to jiffies */ + unsigned long interval = msecs_to_jiffies(19); + + this_rq->next_balance = jiffies + interval; + update_blocked_averages(this_rq->cpu); + } +#endif + nohz_balancer_kick(this_rq); } diff --git a/kernel/sched/bs_nohz.h b/kernel/sched/bs_nohz.h index 3802951591f36..89466954743c3 100644 --- a/kernel/sched/bs_nohz.h +++ b/kernel/sched/bs_nohz.h @@ -686,6 +686,4 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ if (nohz_idle_balance(this_rq, idle)) return; - - update_blocked_averages(this_rq->cpu); } From 11c50e29b9d19a22c5296697bcc1531a47327c4a Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Tue, 16 Nov 2021 22:05:05 +0300 Subject: [PATCH 08/11] Initial work to https://github.com/hamadmarri/TT-CPU-Scheduler/issues/6 `YIELD_MARK` only applies if `cfs_rq->h_nr_running > 1` `YIELD_UNMARK` on migrate_task --- kernel/sched/bs.c | 8 +++----- kernel/sched/bs.h | 7 ++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index 758cd410d6e6d..bbeb0c8443251 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -21,9 +21,6 @@ unsigned int __read_mostly tt_max_lifetime = 22000; // in ms #define RACE_TIME 40000000 #define FACTOR (RACE_TIME / HZ_PERIOD) -#define YIELD_MARK(ttn) ((ttn)->vruntime |= 0x8000000000000000ULL) -#define YIELD_UNMARK(ttn) ((ttn)->vruntime &= 0x7FFFFFFFFFFFFFFFULL) - #define IS_REALTIME(ttn) ((ttn)->task_type == TT_REALTIME) #define IS_INTERACTIVE(ttn) ((ttn)->task_type == TT_INTERACTIVE) #define IS_NO_TYPE(ttn) ((ttn)->task_type == TT_NO_TYPE) @@ -418,14 +415,15 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr 
= rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - YIELD_MARK(&curr->se.tt_node); - /* * Are we the only task in the tree? */ if (unlikely(rq->nr_running == 1)) return; + if (cfs_rq->h_nr_running > 1) + YIELD_MARK(&curr->se.tt_node); + if (curr->policy != SCHED_BATCH) { update_rq_clock(rq); /* diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index d413aad1d0a37..721eb690abeb2 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -1,3 +1,5 @@ +#define YIELD_MARK(ttn) ((ttn)->vruntime |= 0x8000000000000000ULL) +#define YIELD_UNMARK(ttn) ((ttn)->vruntime &= 0x7FFFFFFFFFFFFFFFULL) /* * After fork, child runs first. If set to 0 (default) then @@ -44,13 +46,12 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) */ remove_entity_load_avg(&p->se); } - - /* We have migrated, no longer consider this task hot */ - p->se.exec_start = 0; #endif /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; + YIELD_UNMARK(&p->se.tt_node); + update_scan_period(p, new_cpu); } From 9dd12e1bae868db24a55f6cb2c16f5569be38678 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Tue, 16 Nov 2021 23:47:38 +0300 Subject: [PATCH 09/11] Add sysctl to soften RT task priority kernel.sched_tt_rt_prio default value = -20 --- include/linux/sched/sysctl.h | 1 + kernel/sched/bs.c | 3 ++- kernel/sysctl.c | 13 +++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index f8f93ff881322..bddf5f9175840 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -30,6 +30,7 @@ extern unsigned int sysctl_sched_child_runs_first; #ifdef CONFIG_TT_SCHED extern unsigned int tt_max_lifetime; +extern int tt_rt_prio; #endif enum sched_tunable_scaling { diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index bbeb0c8443251..df645d722e766 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -11,6 +11,7 @@ #include "bs.h" unsigned int __read_mostly tt_max_lifetime = 22000; // in ms +int __read_mostly tt_rt_prio = -20; #define INTERACTIVE_HRRN 2U #define RT_WAIT_DELTA 800000U @@ -169,7 +170,7 @@ static u64 convert_to_vruntime(u64 delta, struct sched_entity *se) { struct task_struct *p = task_of(se); s64 prio_diff; - int prio = IS_REALTIME(&se->tt_node) ? -20 : PRIO_TO_NICE(p->prio); + int prio = IS_REALTIME(&se->tt_node) ? tt_rt_prio : PRIO_TO_NICE(p->prio); if (prio == 0) return delta; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1aa76d2dcf755..c0ae8c2ee59e5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -113,6 +113,10 @@ static int sixty = 60; #endif +#ifdef CONFIG_TT_SCHED +static int neg_twenty = -20; +static int thirty_nine = 39; +#endif static int __maybe_unused neg_one = -1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; @@ -1808,6 +1812,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "sched_tt_rt_prio", + .data = &tt_rt_prio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_twenty, + .extra2 = &thirty_nine, + }, #endif #ifdef CONFIG_SCHEDSTATS { From db588007d2bdffe81c11f7d78b2f475ae00152f6 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Wed, 17 Nov 2021 01:18:54 +0300 Subject: [PATCH 10/11] Ported CFS `is cache hot` code during `can_migrate_task` to avoid migrate cache hot tasks. This also includes numa locality checks. 
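A reduced userspace sketch of the resulting decision order (illustrative
only; the struct, helpers and the numa_verdict parameter below are
simplifications, not kernel code): per-CPU kthreads, affinity misses and
currently-running tasks are rejected first, the NUMA locality verdict is
consulted next, and only when locality is neutral does the recency test
against sysctl_sched_migration_cost (500000 ns by default) decide:

    #include <stdbool.h>
    #include <stdint.h>

    struct task_sketch {
            bool     per_cpu_kthread;   /* kthread_is_per_cpu()            */
            bool     allowed_on_dst;    /* cpumask_test_cpu(dst, cpus_ptr) */
            bool     running;           /* task_running(src_rq, p)         */
            uint64_t exec_start_ns;     /* p->se.exec_start                */
    };

    static int64_t migration_cost_ns = 500000;  /* sysctl_sched_migration_cost */

    /* 1 = likely still cache hot on the source CPU, 0 = cold enough to move */
    static int task_hot_sketch(const struct task_sketch *p, uint64_t now_ns,
                               bool dst_shares_cache)
    {
            if (dst_shares_cache)              /* SMT siblings share cache */
                    return 0;
            if (migration_cost_ns == -1)       /* "always hot" tuning      */
                    return 1;
            if (migration_cost_ns == 0)        /* "never hot" tuning       */
                    return 0;
            return (int64_t)(now_ns - p->exec_start_ns) < migration_cost_ns;
    }

    /* numa_verdict: 1 = degrades locality, 0 = improves it, -1 = neutral */
    static int can_migrate_sketch(const struct task_sketch *p, uint64_t now_ns,
                                  bool dst_shares_cache, int numa_verdict)
    {
            if (p->per_cpu_kthread || !p->allowed_on_dst || p->running)
                    return 0;
            if (numa_verdict == -1)
                    numa_verdict = task_hot_sketch(p, now_ns, dst_shares_cache);
            return numa_verdict <= 0;
    }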
--- kernel/sched/bs.c | 105 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/bs.h | 3 ++ 2 files changed, 104 insertions(+), 4 deletions(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index df645d722e766..be1a55581f498 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -839,17 +839,114 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) return new_cpu; } +/* + * Is this task likely cache-hot: + */ +static int task_hot(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + s64 delta; + + lockdep_assert_rq_held(src_rq); + + if (p->sched_class != &fair_sched_class) + return 0; + + if (unlikely(task_has_idle_policy(p))) + return 0; + + /* SMT siblings share cache */ + if (cpus_share_cache(cpu_of(dst_rq), cpu_of(src_rq))) + return 0; + + if (sysctl_sched_migration_cost == -1) + return 1; + + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = sched_clock() - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + +#ifdef CONFIG_NUMA_BALANCING +/* + * Returns 1, if task migration degrades locality + * Returns 0, if task migration improves locality i.e migration preferred. + * Returns -1, if task migration is not affected by locality. + */ static int -can_migrate_task(struct task_struct *p, int dst_cpu, struct rq *src_rq) +migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) { - if (task_running(src_rq, p)) + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + + if (!static_branch_likely(&sched_numa_balancing)) + return -1; + + src_nid = cpu_to_node(cpu_of(src_rq)); + dst_nid = cpu_to_node(cpus_of(dst_cpu)); + + if (src_nid == dst_nid) + return -1; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { + if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return -1; + } + + /* Encourage migration to the preferred node. */ + if (dst_nid == p->numa_preferred_nid) return 0; + /* Leaving a core idle is often worse than degrading locality. */ + if (dst_rq->idle_balance) + return -1; + + dist = node_distance(src_nid, dst_nid); + if (numa_group) { + src_weight = group_weight(p, src_nid, dist); + dst_weight = group_weight(p, dst_nid, dist); + } else { + src_weight = task_weight(p, src_nid, dist); + dst_weight = task_weight(p, dst_nid, dist); + } + + return dst_weight < src_weight; +} + +#else +static inline int migrate_degrades_locality(struct task_struct *p, + struct rq *dst_rq, struct rq *src_rq) +{ + return -1; +} +#endif + +static int +can_migrate_task(struct task_struct *p, struct rq *dst_rq, struct rq *src_rq) +{ + int tsk_cache_hot; + /* Disregard pcpu kthreads; they are where they need to be. 
*/ if (kthread_is_per_cpu(p)) return 0; - if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) + if (!cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) + return 0; + + if (task_running(src_rq, p)) + return 0; + + tsk_cache_hot = migrate_degrades_locality(p, dst_rq, src_rq); + if (tsk_cache_hot == -1) + tsk_cache_hot = task_hot(p, dst_rq, src_rq); + + if (tsk_cache_hot > 0) return 0; return 1; @@ -891,7 +988,7 @@ static int move_task(struct rq *dist_rq, struct rq *src_rq, while (ttn) { p = task_of(se_of(ttn)); - if (can_migrate_task(p, cpu_of(dist_rq), src_rq)) { + if (can_migrate_task(p, dist_rq, src_rq)) { pull_from(dist_rq, src_rq, src_rf, p); return 1; } diff --git a/kernel/sched/bs.h b/kernel/sched/bs.h index 721eb690abeb2..b3d99cf135769 100644 --- a/kernel/sched/bs.h +++ b/kernel/sched/bs.h @@ -50,6 +50,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; + /* We have migrated, no longer consider this task hot */ + p->se.exec_start = 0; + YIELD_UNMARK(&p->se.tt_node); update_scan_period(p, new_cpu); From edabdb8ee7703ec4f3a33dfcd6ad0149eaf4d538 Mon Sep 17 00:00:00 2001 From: Hamad Al Marri Date: Wed, 17 Nov 2021 01:32:04 +0300 Subject: [PATCH 11/11] fixed typos --- kernel/sched/bs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/bs.c b/kernel/sched/bs.c index be1a55581f498..793bb1c338d1d 100644 --- a/kernel/sched/bs.c +++ b/kernel/sched/bs.c @@ -886,7 +886,7 @@ migrate_degrades_locality(struct task_struct *p, struct rq *dst_rq, struct rq *s return -1; src_nid = cpu_to_node(cpu_of(src_rq)); - dst_nid = cpu_to_node(cpus_of(dst_cpu)); + dst_nid = cpu_to_node(cpu_of(dst_rq)); if (src_nid == dst_nid) return -1;
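
For completeness, the knobs this series touches can be set at runtime, for
example (the -10 is only an illustrative value; sched_tt_rt_prio accepts
-20..39 and defaults to -20):

    sysctl kernel.sched_child_runs_first=0   # patch 01: parent (tries to) run first again
    sysctl kernel.sched_tt_rt_prio=-10       # patch 09: soften the TT realtime priority boost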