ANDROID: sched: Introducing PELT multiplier

The new sysctl sched_pelt_multiplier allows a user to set a clock
multiplier of x2 or x4 (x1 being the default). This clock multiplier
artificially speeds up PELT ramp-up/down, similarly to a shorter half-life.
Indeed, if we write PELT as a first-order filter:

  y(t) = G * (1 - exp(-t/tau))

then we can see that multiplying the time by a constant X is the same as
dividing the time constant tau by X:

  y(t) = G * (1 - exp(-(t*X)/tau))
  y(t) = G * (1 - exp(-t/(tau/X)))

Tau being half-life/ln(2), multiplying the PELT time is the same as
dividing the half-life:

  - x1: 32ms half-life
  - x2: 16ms half-life
  - x4: 8ms  half-life
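
As a quick numerical sanity check (an editorial sketch, not part of the
patch; the decay() helper below is a simple exponential stand-in for the
PELT signal, not kernel code), feeding a clock that runs X times faster
into the 32ms filter gives exactly the same result as keeping the clock
and using a 32/X ms half-life:

#include <math.h>
#include <stdio.h>

/* Remaining fraction of a unit signal after t_ms milliseconds of decay. */
static double decay(double t_ms, double halflife_ms)
{
        return exp(-t_ms * log(2.0) / halflife_ms);
}

int main(void)
{
        double t = 100.0;   /* wall-clock time in ms */
        int x = 2;          /* PELT multiplier */

        /* clock sped up by x, default 32ms half-life */
        double sped_up_clock = decay(t * x, 32.0);
        /* normal clock, half-life divided by x */
        double shorter_halflife = decay(t, 32.0 / x);

        printf("%f %f\n", sped_up_clock, shorter_halflife);  /* identical */
        return 0;
}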

Internally, a new clock is created: clock_task_mult (a per-CPU counter in
this implementation, read through rq_clock_task_mult()). It sits in the
clock hierarchy between rq->clock_task and rq->clock_pelt.
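
For orientation, the resulting clock chain can be sketched as follows (an
editorial reading of the diff below, not wording from the patch):

  rq->clock                 raw clock, irq/steal time included
    -> rq->clock_task       irq and steal time removed
      -> clock_task_mult    clock_task advanced by delta << sched_pelt_lshift
        -> rq->clock_pelt   scaled for CPU frequency/capacity invariance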

Bug: 177593580
Bug: 237219700
Change-Id: I67e6ca7994bebea22bf75732ee11d2b10e0d6b7e
Suggested-by: Morten Rasmussen <morten.rasmussen@arm.com>
Signed-off-by: Vincent Donnefort <vincent.donnefort@arm.com>
Signed-off-by: JianMin Liu <jian-min.liu@mediatek.com>
JianMin Liu 2022-06-29 21:13:56 +08:00, committed by Todd Kjos
parent b2e5773ea4
commit 4442801a43
6 changed files with 80 additions and 5 deletions

include/linux/sched/sysctl.h

@@ -92,6 +92,13 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
 int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos);

+#ifdef CONFIG_SMP
+extern unsigned int sysctl_sched_pelt_multiplier;
+int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer,
+		size_t *lenp, loff_t *ppos);
+#endif
+
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 extern unsigned int sysctl_sched_energy_aware;
 int sched_energy_aware_handler(struct ctl_table *table, int write,

kernel/sched/fair.c

@@ -4788,7 +4788,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
 	cfs_rq->throttle_count--;
 	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
+		cfs_rq->throttled_clock_task_time += rq_clock_task_mult(rq) -
 					     cfs_rq->throttled_clock_task;

 		/* Add cfs_rq with already running entity in the list */
@@ -4806,7 +4806,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
 	/* group is entering throttled state, stop time */
 	if (!cfs_rq->throttle_count) {
-		cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		cfs_rq->throttled_clock_task = rq_clock_task_mult(rq);
 		list_del_leaf_cfs_rq(cfs_rq);
 	}
 	cfs_rq->throttle_count++;
@@ -5224,7 +5224,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];

 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_task = rq_clock_task_mult(cpu_rq(cpu));
 }

 /* conditionally throttle active cfs_rq's from put_prev_entity() */

kernel/sched/pelt.c

@@ -531,3 +531,45 @@ int update_irq_load_avg(struct rq *rq, u64 running)
 	return ret;
 }
 #endif
+
+DEFINE_PER_CPU(u64, clock_task_mult);
+
+unsigned int sysctl_sched_pelt_multiplier = 1;
+__read_mostly unsigned int sched_pelt_lshift;
+
+int sched_pelt_multiplier(struct ctl_table *table, int write, void *buffer,
+			  size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(mutex);
+	unsigned int old;
+	int ret;
+
+	mutex_lock(&mutex);
+	old = sysctl_sched_pelt_multiplier;
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto undo;
+	if (!write)
+		goto done;
+
+	switch (sysctl_sched_pelt_multiplier) {
+	case 1:
+		fallthrough;
+	case 2:
+		fallthrough;
+	case 4:
+		WRITE_ONCE(sched_pelt_lshift,
+			   sysctl_sched_pelt_multiplier >> 1);
+		goto done;
+	default:
+		ret = -EINVAL;
+	}
+
+undo:
+	sysctl_sched_pelt_multiplier = old;
+done:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
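
The handler accepts only 1, 2 or 4 and stores the multiplier as a shift
count; the small illustrative program below (mine, not part of the patch)
shows how the >> 1 maps those values onto the left shift later applied in
update_rq_clock_pelt(). Note the mapping is only correct because those
three values are the only ones the handler lets through.

#include <stdio.h>

int main(void)
{
        unsigned int mult[] = { 1, 2, 4 };   /* only values the handler accepts */

        for (int i = 0; i < 3; i++) {
                unsigned int lshift = mult[i] >> 1;   /* 1 -> 0, 2 -> 1, 4 -> 2 */
                printf("x%u multiplier -> delta <<= %u (delta * %u)\n",
                       mult[i], lshift, 1u << lshift);
        }
        return 0;
}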

kernel/sched/pelt.h

@@ -61,6 +61,8 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	WRITE_ONCE(avg->util_est.enqueued, enqueued);
 }

+extern unsigned int sched_pelt_lshift;
+
 /*
  * The clock_pelt scales the time to reflect the effective amount of
  * computation done during the running delta time but then sync back to
@@ -75,9 +77,13 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
  */
 static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
 {
+	delta <<= READ_ONCE(sched_pelt_lshift);
+
+	per_cpu(clock_task_mult, rq->cpu) += delta;
+
 	if (unlikely(is_idle_task(rq->curr))) {
 		/* The rq is idle, we can sync to clock_task */
-		rq->clock_pelt = rq_clock_task(rq);
+		rq->clock_pelt = rq_clock_task_mult(rq);
 		return;
 	}
@@ -129,7 +135,8 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
 	 * rq's clock_task.
 	 */
 	if (util_sum >= divider)
-		rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
+		rq->lost_idle_time += rq_clock_task_mult(rq) -
+			rq->clock_pelt;
 }

 static inline u64 rq_clock_pelt(struct rq *rq)

kernel/sched/sched.h

@@ -1193,6 +1193,16 @@ static inline u64 rq_clock_task(struct rq *rq)
 	return rq->clock_task;
 }

+DECLARE_PER_CPU(u64, clock_task_mult);
+
+static inline u64 rq_clock_task_mult(struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+	assert_clock_updated(rq);
+
+	return per_cpu(clock_task_mult, rq->cpu);
+}
+
 /**
  * By default the decay is the default pelt decay period.
  * The decay shift can change the decay period in

kernel/sysctl.c

@@ -1829,6 +1829,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rr_handler,
 	},
+#ifdef CONFIG_SMP
+	{
+		.procname	= "sched_pelt_multiplier",
+		.data		= &sysctl_sched_pelt_multiplier,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_pelt_multiplier,
+	},
+#endif
 #ifdef CONFIG_UCLAMP_TASK
 	{
 		.procname	= "sched_util_clamp_min",
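
Since the entry is added to kern_table, the knob should appear as
/proc/sys/kernel/sched_pelt_multiplier (path inferred from the table
above, not stated in the patch). A minimal userspace sketch switching to
the 16ms half-life:

#include <stdio.h>

int main(void)
{
        /* Path assumed from the kern_table entry; valid values are 1, 2 or 4. */
        FILE *f = fopen("/proc/sys/kernel/sched_pelt_multiplier", "w");

        if (!f) {
                perror("sched_pelt_multiplier");
                return 1;
        }
        fprintf(f, "2\n");   /* x2 => 16ms half-life */
        fclose(f);
        return 0;
}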