sched/walt: Introduce WALT as a module

WALT (Window Assisted Load Tracking) improves scheduler decisions from
a power/performance and thermal perspective. Bring it in as a module.

Change-Id: Ibeb6c0480796e8d8fcd81e1bdda7a117ae02c980
Signed-off-by: Shaleen Agrawal <shalagra@codeaurora.org>
Author: Shaleen Agrawal <shalagra@codeaurora.org>
Date: 2021-01-11 18:24:52 -08:00
Commit: d3b261dbd2 (parent d06697e749)
30 changed files with 12309 additions and 75 deletions


@@ -30,3 +30,5 @@ source "lib/Kconfig"
source "lib/Kconfig.debug"
source "Documentation/Kconfig"
source "kernel/sched/walt/Kconfig"


@@ -230,3 +230,4 @@ CONFIG_EDAC_QCOM=m
CONFIG_EDAC_QCOM_LLCC_PANIC_ON_UE=y
# CONFIG_EDAC_QCOM_LLCC_PANIC_ON_CE is not set
CONFIG_MSM_BOOT_STATS=m
CONFIG_ARM_QCOM_CPUFREQ_HW=m


@@ -3,9 +3,6 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM preemptirq
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH trace/events
#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PREEMPTIRQ_H


@@ -1,56 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2020, The Linux Foundation. All rights reserved.
*/
#if !defined(_TRACE_RESTRICTED_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RESTRICTED_PREEMPTIRQ_H
#ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS
#undef TRACE_SYSTEM
#define TRACE_SYSTEM restricted_preemptirq
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH trace/hooks
#include <linux/tracepoint.h>
#include <trace/hooks/vendor_hooks.h>
#ifdef CONFIG_TRACE_IRQFLAGS
DECLARE_RESTRICTED_HOOK(restricted_irq_disable,
TP_PROTO(unsigned long ip, unsigned long parent_ip),
TP_ARGS(ip, parent_ip), 1);
DECLARE_RESTRICTED_HOOK(restricted_irq_enable,
TP_PROTO(unsigned long ip, unsigned long parent_ip),
TP_ARGS(ip, parent_ip), 1);
#else
#define trace_restricted_irq_enable(ip, parent_ip)
#define trace_restricted_irq_disable(ip, parent_ip)
#endif /* CONFIG_TRACE_IRQFLAGS */
#ifdef CONFIG_TRACE_PREEMPT_TOGGLE
DECLARE_RESTRICTED_HOOK(restricted_preempt_disable,
TP_PROTO(unsigned long ip, unsigned long parent_ip),
TP_ARGS(ip, parent_ip), 1);
DECLARE_RESTRICTED_HOOK(restricted_preempt_enable,
TP_PROTO(unsigned long ip, unsigned long parent_ip),
TP_ARGS(ip, parent_ip), 1);
#else
#define trace_restricted_preempt_enable(ip, parent_ip)
#define trace_restricted_preempt_disable(ip, parent_ip)
#endif /* CONFIG_TRACE_PREEMPT_TOGGLE */
#include <trace/define_trace.h>
#else /* ! CONFIG_PREEMPTIRQ_TRACEPOINTS */
#define trace_restricted_irq_enable(...)
#define trace_restricted_irq_disable(...)
#define trace_restricted_preempt_enable(...)
#define trace_restricted_preempt_disable(...)
#endif /* ! CONFIG_PREEMPTIRQ_TRACEPOINTS */
#endif /* TRACE_RESTRICTED_PREEMPTIRQ_H || TRACE_HEADER_MULTI_READ */


@@ -26,7 +26,6 @@ obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle.o fair.o rt.o deadline.o
obj-y += wait.o wait_bit.o swait.o completion.o
obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
@@ -37,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
obj-$(CONFIG_PSI) += psi.o
obj-$(CONFIG_SCHED_WALT) += walt/


@@ -4333,10 +4333,6 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (panic_on_warn)
panic("scheduling while atomic\n");
#if defined(CONFIG_PANIC_ON_SCHED_BUG) && defined(CONFIG_SCHED_WALT)
BUG();
#endif
trace_android_rvh_schedule_bug(NULL);
dump_stack();
@@ -7199,9 +7195,6 @@ void __init sched_init_smp(void)
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
#ifdef CONFIG_SCHED_WALT
cpumask_copy(&current->wts.cpus_requested, cpu_possible_mask);
#endif
sched_init_granularity();
@@ -7490,9 +7483,6 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
pr_err("Preemption disabled at:");
print_ip_sym(KERN_ERR, preempt_disable_ip);
}
#ifdef CONFIG_PANIC_ON_SCHED_BUG
BUG();
#endif
trace_android_rvh_schedule_bug(NULL);
@@ -8648,7 +8638,7 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_uclamp_ls_read_u64,
.write_u64 = cpu_uclamp_ls_write_u64,
},
-#endif /* CONFIG_UCLAMP_TASK_GROUP */
+#endif
{ } /* terminate */
};


@@ -83,6 +83,7 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};
EXPORT_SYMBOL_GPL(sched_feat_keys);
#undef SCHED_FEAT
static void sched_feat_disable(int i)

kernel/sched/walt/Kconfig (new file)

@@ -0,0 +1,31 @@
# SPDX-License-Identifier: GPL-2.0-only
#
# QTI WALT based scheduler
#
menu "QTI WALT based scheduler features"
config SCHED_WALT
tristate "Support window based load tracking"
depends on SMP
help
This feature will allow the scheduler to maintain a tunable
window-based set of metrics for tasks and runqueues. These metrics
can be used to guide task placement as well as task frequency
requirements for cpufreq governors.
config SCHED_WALT_DEBUG
tristate "WALT debug module"
select TRACE_PREEMPT_TOGGLE
select TRACE_IRQFLAGS
help
This module provides the means of debugging long preempt and
irq disable sections, which helps in identifying scheduling
latencies. The module relies on the preemptirq trace hooks and
prints the stacktrace to ftrace upon long preempt and irq
disable events. Sysctl knobs are available for the user to
configure the thresholds.
This module can also be used to crash the system to catch issues
in scenarios like RT throttling and sleeping while in atomic
context, etc.
endmenu


@@ -0,0 +1,10 @@
# SPDX-License-Identifier: GPL-2.0-only
KCOV_INSTRUMENT := n
KCSAN_SANITIZE := n
obj-$(CONFIG_SCHED_WALT) += sched-walt.o
sched-walt-$(CONFIG_SCHED_WALT) := walt.o boost.o sched_avg.o qc_vas.o core_ctl.o trace.o input-boost.o sysctl.o cpufreq_walt.o fixup.o walt_lb.o walt_rt.o walt_cfs.o
obj-$(CONFIG_SCHED_WALT_DEBUG) += sched-walt-debug.o
sched-walt-debug-$(CONFIG_SCHED_WALT_DEBUG) := walt_debug.o preemptirq_long.o

kernel/sched/walt/boost.c (new file)

@@ -0,0 +1,301 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2012-2021, The Linux Foundation. All rights reserved.
*/
#include <linux/of.h>
#include <linux/sched/core_ctl.h>
#include "walt.h"
#include "trace.h"
/*
* Scheduler boost is a mechanism to temporarily place tasks on CPUs
* with higher capacity than those where a task would have normally
* ended up with their load characteristics. Any entity enabling
* boost is responsible for disabling it as well.
*/
static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE;
static DEFINE_MUTEX(boost_mutex);
struct task_group *task_group_topapp;
struct task_group *task_group_foreground;
void walt_init_tg(struct task_group *tg)
{
struct walt_task_group *wtg;
wtg = (struct walt_task_group *) tg->android_vendor_data1;
wtg->colocate = false;
wtg->sched_boost_enable[NO_BOOST] = false;
wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true;
wtg->sched_boost_enable[CONSERVATIVE_BOOST] = false;
wtg->sched_boost_enable[RESTRAINED_BOOST] = false;
}
void walt_init_topapp_tg(struct task_group *tg)
{
struct walt_task_group *wtg;
wtg = (struct walt_task_group *) tg->android_vendor_data1;
wtg->colocate = true;
wtg->sched_boost_enable[NO_BOOST] = false;
wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true;
wtg->sched_boost_enable[CONSERVATIVE_BOOST] = true;
wtg->sched_boost_enable[RESTRAINED_BOOST] = false;
}
void walt_init_foreground_tg(struct task_group *tg)
{
struct walt_task_group *wtg;
wtg = (struct walt_task_group *) tg->android_vendor_data1;
wtg->colocate = false;
wtg->sched_boost_enable[NO_BOOST] = false;
wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true;
wtg->sched_boost_enable[CONSERVATIVE_BOOST] = true;
wtg->sched_boost_enable[RESTRAINED_BOOST] = false;
}
/*
* Scheduler boost type and boost policy might at first seem unrelated,
* however, there exists a connection between them that will allow us
* to use them interchangeably during placement decisions. We'll explain
* the connection here in one possible way so that the implications are
* clear when looking at placement policies.
*
* When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED
* When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can
* neither be none nor RESTRAINED.
*/
static void set_boost_policy(int type)
{
if (type == NO_BOOST || type == RESTRAINED_BOOST) {
boost_policy = SCHED_BOOST_NONE;
return;
}
if (boost_policy_dt) {
boost_policy = boost_policy_dt;
return;
}
if (hmp_capable()) {
boost_policy = SCHED_BOOST_ON_BIG;
return;
}
boost_policy = SCHED_BOOST_ON_ALL;
}
static bool verify_boost_params(int type)
{
return type >= RESTRAINED_BOOST_DISABLE && type <= RESTRAINED_BOOST;
}
static void sched_no_boost_nop(void)
{
}
static void sched_full_throttle_boost_enter(void)
{
core_ctl_set_boost(true);
walt_enable_frequency_aggregation(true);
}
static void sched_full_throttle_boost_exit(void)
{
core_ctl_set_boost(false);
walt_enable_frequency_aggregation(false);
}
static void sched_conservative_boost_enter(void)
{
}
static void sched_conservative_boost_exit(void)
{
}
static void sched_restrained_boost_enter(void)
{
walt_enable_frequency_aggregation(true);
}
static void sched_restrained_boost_exit(void)
{
walt_enable_frequency_aggregation(false);
}
struct sched_boost_data {
int refcount;
void (*enter)(void);
void (*exit)(void);
};
static struct sched_boost_data sched_boosts[] = {
[NO_BOOST] = {
.refcount = 0,
.enter = sched_no_boost_nop,
.exit = sched_no_boost_nop,
},
[FULL_THROTTLE_BOOST] = {
.refcount = 0,
.enter = sched_full_throttle_boost_enter,
.exit = sched_full_throttle_boost_exit,
},
[CONSERVATIVE_BOOST] = {
.refcount = 0,
.enter = sched_conservative_boost_enter,
.exit = sched_conservative_boost_exit,
},
[RESTRAINED_BOOST] = {
.refcount = 0,
.enter = sched_restrained_boost_enter,
.exit = sched_restrained_boost_exit,
},
};
#define SCHED_BOOST_START FULL_THROTTLE_BOOST
#define SCHED_BOOST_END (RESTRAINED_BOOST + 1)
static int sched_effective_boost(void)
{
int i;
/*
* The boosts are sorted in descending order by
* priority.
*/
for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) {
if (sched_boosts[i].refcount >= 1)
return i;
}
return NO_BOOST;
}
static void sched_boost_disable(int type)
{
struct sched_boost_data *sb = &sched_boosts[type];
int next_boost;
if (sb->refcount <= 0)
return;
sb->refcount--;
if (sb->refcount)
return;
/*
* This boost's refcount becomes zero, so it must
* be disabled. Disable it first and then apply
* the next boost.
*/
sb->exit();
next_boost = sched_effective_boost();
sched_boosts[next_boost].enter();
}
static void sched_boost_enable(int type)
{
struct sched_boost_data *sb = &sched_boosts[type];
int next_boost, prev_boost = sched_boost_type;
sb->refcount++;
if (sb->refcount != 1)
return;
/*
* This boost enable request did not come before.
* Take this new request and find the next boost
* by aggregating all the enabled boosts. If there
* is a change, disable the previous boost and enable
* the next boost.
*/
next_boost = sched_effective_boost();
if (next_boost == prev_boost)
return;
sched_boosts[prev_boost].exit();
sched_boosts[next_boost].enter();
}
static void sched_boost_disable_all(void)
{
int i;
for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) {
if (sched_boosts[i].refcount > 0) {
sched_boosts[i].exit();
sched_boosts[i].refcount = 0;
}
}
}
static void _sched_set_boost(int type)
{
if (type == 0)
sched_boost_disable_all();
else if (type > 0)
sched_boost_enable(type);
else
sched_boost_disable(-type);
/*
* sysctl_sched_boost holds the boost request from
* user space which could be different from the
* effectively enabled boost. Update the effective
* boost here.
*/
sched_boost_type = sched_effective_boost();
sysctl_sched_boost = sched_boost_type;
set_boost_policy(sysctl_sched_boost);
trace_sched_set_boost(sysctl_sched_boost);
}
int sched_set_boost(int type)
{
int ret = 0;
mutex_lock(&boost_mutex);
if (verify_boost_params(type))
_sched_set_boost(type);
else
ret = -EINVAL;
mutex_unlock(&boost_mutex);
return ret;
}
int sched_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int *data = (unsigned int *)table->data;
mutex_lock(&boost_mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
goto done;
if (verify_boost_params(*data))
_sched_set_boost(*data);
else
ret = -EINVAL;
done:
mutex_unlock(&boost_mutex);
return ret;
}
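
For reference, a minimal sketch of how a client of this refcounted interface is expected to pair requests. The caller below is hypothetical; only sched_set_boost() and the boost type values come from this patch, and the negative-argument form follows _sched_set_boost() above.

/* Hypothetical client: every enable must be paired with a disable. */
static int example_boost_for_critical_work(void)
{
	int ret;

	/* Take a FULL_THROTTLE reference; enter() runs on the 0 -> 1 transition. */
	ret = sched_set_boost(FULL_THROTTLE_BOOST);
	if (ret)
		return ret;

	/* ... latency-critical work runs while the boost is held ... */

	/* A negative type drops the reference; exit() runs when it reaches 0. */
	return sched_set_boost(-FULL_THROTTLE_BOOST);
}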

kernel/sched/walt/core_ctl.c (new file)

File diff suppressed because it is too large.


@@ -0,0 +1,886 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* This is based on schedutil governor but modified to work with
* WALT.
*
* Copyright (C) 2016, Intel Corporation
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kthread.h>
#include <trace/events/power.h>
#include "walt.h"
#include "trace.h"
struct waltgov_tunables {
struct gov_attr_set attr_set;
unsigned int up_rate_limit_us;
unsigned int down_rate_limit_us;
unsigned int hispeed_load;
unsigned int hispeed_freq;
unsigned int rtg_boost_freq;
bool pl;
};
struct waltgov_policy {
struct cpufreq_policy *policy;
u64 last_ws;
u64 curr_cycles;
u64 last_cyc_update_time;
unsigned long avg_cap;
struct waltgov_tunables *tunables;
struct list_head tunables_hook;
unsigned long hispeed_util;
unsigned long rtg_boost_util;
unsigned long max;
raw_spinlock_t update_lock;
u64 last_freq_update_time;
s64 min_rate_limit_ns;
s64 up_rate_delay_ns;
s64 down_rate_delay_ns;
unsigned int next_freq;
unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used: */
struct irq_work irq_work;
struct kthread_work work;
struct mutex work_lock;
struct kthread_worker worker;
struct task_struct *thread;
bool limits_changed;
bool need_freq_update;
};
struct waltgov_cpu {
struct waltgov_callback cb;
struct waltgov_policy *wg_policy;
unsigned int cpu;
struct walt_cpu_load walt_load;
unsigned long util;
unsigned long max;
unsigned int flags;
};
DEFINE_PER_CPU(struct waltgov_callback *, waltgov_cb_data);
static DEFINE_PER_CPU(struct waltgov_cpu, waltgov_cpu);
static DEFINE_PER_CPU(struct waltgov_tunables *, cached_tunables);
/************************ Governor internals ***********************/
static bool waltgov_should_update_freq(struct waltgov_policy *wg_policy, u64 time)
{
s64 delta_ns;
if (unlikely(wg_policy->limits_changed)) {
wg_policy->limits_changed = false;
wg_policy->need_freq_update = true;
return true;
}
/*
* No need to recalculate next freq for min_rate_limit_us
* at least. However we might still decide to further rate
* limit once frequency change direction is decided, according
* to the separate rate limits.
*/
delta_ns = time - wg_policy->last_freq_update_time;
return delta_ns >= wg_policy->min_rate_limit_ns;
}
static bool waltgov_up_down_rate_limit(struct waltgov_policy *wg_policy, u64 time,
unsigned int next_freq)
{
s64 delta_ns;
delta_ns = time - wg_policy->last_freq_update_time;
if (next_freq > wg_policy->next_freq &&
delta_ns < wg_policy->up_rate_delay_ns)
return true;
if (next_freq < wg_policy->next_freq &&
delta_ns < wg_policy->down_rate_delay_ns)
return true;
return false;
}
static bool waltgov_update_next_freq(struct waltgov_policy *wg_policy, u64 time,
unsigned int next_freq)
{
if (wg_policy->next_freq == next_freq)
return false;
if (waltgov_up_down_rate_limit(wg_policy, time, next_freq))
return false;
wg_policy->next_freq = next_freq;
wg_policy->last_freq_update_time = time;
return true;
}
static unsigned long freq_to_util(struct waltgov_policy *wg_policy,
unsigned int freq)
{
return mult_frac(wg_policy->max, freq,
wg_policy->policy->cpuinfo.max_freq);
}
#define KHZ 1000
static void waltgov_track_cycles(struct waltgov_policy *wg_policy,
unsigned int prev_freq,
u64 upto)
{
u64 delta_ns, cycles;
u64 next_ws = wg_policy->last_ws + sched_ravg_window;
upto = min(upto, next_ws);
/* Track cycles in current window */
delta_ns = upto - wg_policy->last_cyc_update_time;
delta_ns *= prev_freq;
do_div(delta_ns, (NSEC_PER_SEC / KHZ));
cycles = delta_ns;
wg_policy->curr_cycles += cycles;
wg_policy->last_cyc_update_time = upto;
}
static void waltgov_calc_avg_cap(struct waltgov_policy *wg_policy, u64 curr_ws,
unsigned int prev_freq)
{
u64 last_ws = wg_policy->last_ws;
unsigned int avg_freq;
BUG_ON(curr_ws < last_ws);
if (curr_ws <= last_ws)
return;
/* If we skipped some windows */
if (curr_ws > (last_ws + sched_ravg_window)) {
avg_freq = prev_freq;
/* Reset tracking history */
wg_policy->last_cyc_update_time = curr_ws;
} else {
waltgov_track_cycles(wg_policy, prev_freq, curr_ws);
avg_freq = wg_policy->curr_cycles;
avg_freq /= sched_ravg_window / (NSEC_PER_SEC / KHZ);
}
wg_policy->avg_cap = freq_to_util(wg_policy, avg_freq);
wg_policy->curr_cycles = 0;
wg_policy->last_ws = curr_ws;
}
static void waltgov_fast_switch(struct waltgov_policy *wg_policy, u64 time,
unsigned int next_freq)
{
struct cpufreq_policy *policy = wg_policy->policy;
unsigned int cpu;
if (!waltgov_update_next_freq(wg_policy, time, next_freq))
return;
waltgov_track_cycles(wg_policy, wg_policy->policy->cur, time);
next_freq = cpufreq_driver_fast_switch(policy, next_freq);
if (!next_freq)
return;
policy->cur = next_freq;
if (trace_cpu_frequency_enabled()) {
for_each_cpu(cpu, policy->cpus)
trace_cpu_frequency(next_freq, cpu);
}
}
static void waltgov_deferred_update(struct waltgov_policy *wg_policy, u64 time,
unsigned int next_freq)
{
if (!waltgov_update_next_freq(wg_policy, time, next_freq))
return;
walt_irq_work_queue(&wg_policy->irq_work);
}
#define TARGET_LOAD 80
static unsigned int get_next_freq(struct waltgov_policy *wg_policy,
unsigned long util, unsigned long max)
{
struct cpufreq_policy *policy = wg_policy->policy;
/*
* TODO:
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
*/
unsigned int freq = policy->cpuinfo.max_freq;
freq = map_util_freq(util, freq, max);
trace_waltgov_next_freq(policy->cpu, util, max, freq);
if (freq == wg_policy->cached_raw_freq && !wg_policy->need_freq_update)
return wg_policy->next_freq;
wg_policy->need_freq_update = false;
wg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
static unsigned long waltgov_get_util(struct waltgov_cpu *wg_cpu)
{
struct rq *rq = cpu_rq(wg_cpu->cpu);
unsigned long max = arch_scale_cpu_capacity(wg_cpu->cpu);
unsigned long util;
wg_cpu->max = max;
util = cpu_util_freq_walt(wg_cpu->cpu, &wg_cpu->walt_load);
return uclamp_rq_util_with(rq, util, NULL);
}
#define NL_RATIO 75
#define DEFAULT_HISPEED_LOAD 90
#define DEFAULT_CPU0_RTG_BOOST_FREQ 1000000
#define DEFAULT_CPU4_RTG_BOOST_FREQ 0
#define DEFAULT_CPU7_RTG_BOOST_FREQ 0
static void waltgov_walt_adjust(struct waltgov_cpu *wg_cpu, unsigned long *util,
unsigned long *max)
{
struct waltgov_policy *wg_policy = wg_cpu->wg_policy;
bool is_migration = wg_cpu->flags & WALT_CPUFREQ_IC_MIGRATION;
bool is_rtg_boost = wg_cpu->walt_load.rtgb_active;
unsigned long nl = wg_cpu->walt_load.nl;
unsigned long cpu_util = wg_cpu->util;
bool is_hiload;
unsigned long pl = wg_cpu->walt_load.pl;
if (is_rtg_boost)
*util = max(*util, wg_policy->rtg_boost_util);
is_hiload = (cpu_util >= mult_frac(wg_policy->avg_cap,
wg_policy->tunables->hispeed_load,
100));
if (is_hiload && !is_migration)
*util = max(*util, wg_policy->hispeed_util);
if (is_hiload && nl >= mult_frac(cpu_util, NL_RATIO, 100))
*util = *max;
if (wg_policy->tunables->pl) {
if (sysctl_sched_conservative_pl)
pl = mult_frac(pl, TARGET_LOAD, 100);
*util = max(*util, pl);
}
}
static inline unsigned long target_util(struct waltgov_policy *wg_policy,
unsigned int freq)
{
unsigned long util;
util = freq_to_util(wg_policy, freq);
util = mult_frac(util, TARGET_LOAD, 100);
return util;
}
static unsigned int waltgov_next_freq_shared(struct waltgov_cpu *wg_cpu, u64 time)
{
struct waltgov_policy *wg_policy = wg_cpu->wg_policy;
struct cpufreq_policy *policy = wg_policy->policy;
unsigned long util = 0, max = 1;
unsigned int j;
for_each_cpu(j, policy->cpus) {
struct waltgov_cpu *j_wg_cpu = &per_cpu(waltgov_cpu, j);
unsigned long j_util, j_max;
/*
* If the util value for all CPUs in a policy is 0, just using >
* will result in a max value of 1. WALT stats can later update
* the aggregated util value, causing get_next_freq() to compute
* freq = max_freq * 1.25 * (util / max) for nonzero util,
* leading to spurious jumps to fmax.
*/
j_util = j_wg_cpu->util;
j_max = j_wg_cpu->max;
if (j_util * max >= j_max * util) {
util = j_util;
max = j_max;
}
waltgov_walt_adjust(j_wg_cpu, &util, &max);
}
return get_next_freq(wg_policy, util, max);
}
static void waltgov_update_freq(struct waltgov_callback *cb, u64 time,
unsigned int flags)
{
struct waltgov_cpu *wg_cpu = container_of(cb, struct waltgov_cpu, cb);
struct waltgov_policy *wg_policy = wg_cpu->wg_policy;
unsigned long hs_util, boost_util;
unsigned int next_f;
if (!wg_policy->tunables->pl && flags & WALT_CPUFREQ_PL)
return;
wg_cpu->util = waltgov_get_util(wg_cpu);
wg_cpu->flags = flags;
raw_spin_lock(&wg_policy->update_lock);
if (wg_policy->max != wg_cpu->max) {
wg_policy->max = wg_cpu->max;
hs_util = target_util(wg_policy,
wg_policy->tunables->hispeed_freq);
wg_policy->hispeed_util = hs_util;
boost_util = target_util(wg_policy,
wg_policy->tunables->rtg_boost_freq);
wg_policy->rtg_boost_util = boost_util;
}
waltgov_calc_avg_cap(wg_policy, wg_cpu->walt_load.ws,
wg_policy->policy->cur);
trace_waltgov_util_update(wg_cpu->cpu, wg_cpu->util, wg_policy->avg_cap,
wg_cpu->max, wg_cpu->walt_load.nl,
wg_cpu->walt_load.pl,
wg_cpu->walt_load.rtgb_active, flags);
if (waltgov_should_update_freq(wg_policy, time) &&
!(flags & WALT_CPUFREQ_CONTINUE)) {
next_f = waltgov_next_freq_shared(wg_cpu, time);
if (wg_policy->policy->fast_switch_enabled)
waltgov_fast_switch(wg_policy, time, next_f);
else
waltgov_deferred_update(wg_policy, time, next_f);
}
raw_spin_unlock(&wg_policy->update_lock);
}
static void waltgov_work(struct kthread_work *work)
{
struct waltgov_policy *wg_policy = container_of(work, struct waltgov_policy, work);
unsigned int freq;
unsigned long flags;
raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
freq = wg_policy->next_freq;
waltgov_track_cycles(wg_policy, wg_policy->policy->cur,
ktime_get_ns());
raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
mutex_lock(&wg_policy->work_lock);
__cpufreq_driver_target(wg_policy->policy, freq, CPUFREQ_RELATION_L);
mutex_unlock(&wg_policy->work_lock);
}
static void waltgov_irq_work(struct irq_work *irq_work)
{
struct waltgov_policy *wg_policy;
wg_policy = container_of(irq_work, struct waltgov_policy, irq_work);
kthread_queue_work(&wg_policy->worker, &wg_policy->work);
}
/************************** sysfs interface ************************/
static inline struct waltgov_tunables *to_waltgov_tunables(struct gov_attr_set *attr_set)
{
return container_of(attr_set, struct waltgov_tunables, attr_set);
}
static DEFINE_MUTEX(min_rate_lock);
static void update_min_rate_limit_ns(struct waltgov_policy *wg_policy)
{
mutex_lock(&min_rate_lock);
wg_policy->min_rate_limit_ns = min(wg_policy->up_rate_delay_ns,
wg_policy->down_rate_delay_ns);
mutex_unlock(&min_rate_lock);
}
static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->up_rate_limit_us);
}
static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->down_rate_limit_us);
}
static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
struct waltgov_policy *wg_policy;
unsigned int rate_limit_us;
if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;
tunables->up_rate_limit_us = rate_limit_us;
list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
wg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
update_min_rate_limit_ns(wg_policy);
}
return count;
}
static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
struct waltgov_policy *wg_policy;
unsigned int rate_limit_us;
if (kstrtouint(buf, 10, &rate_limit_us))
return -EINVAL;
tunables->down_rate_limit_us = rate_limit_us;
list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
wg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
update_min_rate_limit_ns(wg_policy);
}
return count;
}
static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
static ssize_t hispeed_load_show(struct gov_attr_set *attr_set, char *buf)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->hispeed_load);
}
static ssize_t hispeed_load_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
if (kstrtouint(buf, 10, &tunables->hispeed_load))
return -EINVAL;
tunables->hispeed_load = min(100U, tunables->hispeed_load);
return count;
}
static ssize_t hispeed_freq_show(struct gov_attr_set *attr_set, char *buf)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->hispeed_freq);
}
static ssize_t hispeed_freq_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
unsigned int val;
struct waltgov_policy *wg_policy;
unsigned long hs_util;
unsigned long flags;
if (kstrtouint(buf, 10, &val))
return -EINVAL;
tunables->hispeed_freq = val;
list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
hs_util = target_util(wg_policy,
wg_policy->tunables->hispeed_freq);
wg_policy->hispeed_util = hs_util;
raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
}
return count;
}
static ssize_t rtg_boost_freq_show(struct gov_attr_set *attr_set, char *buf)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->rtg_boost_freq);
}
static ssize_t rtg_boost_freq_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
unsigned int val;
struct waltgov_policy *wg_policy;
unsigned long boost_util;
unsigned long flags;
if (kstrtouint(buf, 10, &val))
return -EINVAL;
tunables->rtg_boost_freq = val;
list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) {
raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
boost_util = target_util(wg_policy,
wg_policy->tunables->rtg_boost_freq);
wg_policy->rtg_boost_util = boost_util;
raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
}
return count;
}
static ssize_t pl_show(struct gov_attr_set *attr_set, char *buf)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->pl);
}
static ssize_t pl_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
{
struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set);
if (kstrtobool(buf, &tunables->pl))
return -EINVAL;
return count;
}
static struct governor_attr hispeed_load = __ATTR_RW(hispeed_load);
static struct governor_attr hispeed_freq = __ATTR_RW(hispeed_freq);
static struct governor_attr rtg_boost_freq = __ATTR_RW(rtg_boost_freq);
static struct governor_attr pl = __ATTR_RW(pl);
static struct attribute *waltgov_attributes[] = {
&up_rate_limit_us.attr,
&down_rate_limit_us.attr,
&hispeed_load.attr,
&hispeed_freq.attr,
&rtg_boost_freq.attr,
&pl.attr,
NULL
};
static struct kobj_type waltgov_tunables_ktype = {
.default_attrs = waltgov_attributes,
.sysfs_ops = &governor_sysfs_ops,
};
/********************** cpufreq governor interface *********************/
static struct cpufreq_governor walt_gov;
static struct waltgov_policy *waltgov_policy_alloc(struct cpufreq_policy *policy)
{
struct waltgov_policy *wg_policy;
wg_policy = kzalloc(sizeof(*wg_policy), GFP_KERNEL);
if (!wg_policy)
return NULL;
wg_policy->policy = policy;
raw_spin_lock_init(&wg_policy->update_lock);
return wg_policy;
}
static void waltgov_policy_free(struct waltgov_policy *wg_policy)
{
kfree(wg_policy);
}
static int waltgov_kthread_create(struct waltgov_policy *wg_policy)
{
struct task_struct *thread;
struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
struct cpufreq_policy *policy = wg_policy->policy;
int ret;
/* kthread only required for slow path */
if (policy->fast_switch_enabled)
return 0;
kthread_init_work(&wg_policy->work, waltgov_work);
kthread_init_worker(&wg_policy->worker);
thread = kthread_create(kthread_worker_fn, &wg_policy->worker,
"waltgov:%d",
cpumask_first(policy->related_cpus));
if (IS_ERR(thread)) {
pr_err("failed to create waltgov thread: %ld\n", PTR_ERR(thread));
return PTR_ERR(thread);
}
ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
if (ret) {
kthread_stop(thread);
pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
return ret;
}
wg_policy->thread = thread;
kthread_bind_mask(thread, policy->related_cpus);
init_irq_work(&wg_policy->irq_work, waltgov_irq_work);
mutex_init(&wg_policy->work_lock);
wake_up_process(thread);
return 0;
}
static void waltgov_kthread_stop(struct waltgov_policy *wg_policy)
{
/* kthread only required for slow path */
if (wg_policy->policy->fast_switch_enabled)
return;
kthread_flush_worker(&wg_policy->worker);
kthread_stop(wg_policy->thread);
mutex_destroy(&wg_policy->work_lock);
}
static void waltgov_tunables_save(struct cpufreq_policy *policy,
struct waltgov_tunables *tunables)
{
int cpu;
struct waltgov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
if (!cached) {
cached = kzalloc(sizeof(*tunables), GFP_KERNEL);
if (!cached)
return;
for_each_cpu(cpu, policy->related_cpus)
per_cpu(cached_tunables, cpu) = cached;
}
cached->pl = tunables->pl;
cached->hispeed_load = tunables->hispeed_load;
cached->rtg_boost_freq = tunables->rtg_boost_freq;
cached->hispeed_freq = tunables->hispeed_freq;
cached->up_rate_limit_us = tunables->up_rate_limit_us;
cached->down_rate_limit_us = tunables->down_rate_limit_us;
}
static void waltgov_tunables_restore(struct cpufreq_policy *policy)
{
struct waltgov_policy *wg_policy = policy->governor_data;
struct waltgov_tunables *tunables = wg_policy->tunables;
struct waltgov_tunables *cached = per_cpu(cached_tunables, policy->cpu);
if (!cached)
return;
tunables->pl = cached->pl;
tunables->hispeed_load = cached->hispeed_load;
tunables->rtg_boost_freq = cached->rtg_boost_freq;
tunables->hispeed_freq = cached->hispeed_freq;
tunables->up_rate_limit_us = cached->up_rate_limit_us;
tunables->down_rate_limit_us = cached->down_rate_limit_us;
}
static int waltgov_init(struct cpufreq_policy *policy)
{
struct waltgov_policy *wg_policy;
struct waltgov_tunables *tunables;
int ret = 0;
/* State should be equivalent to EXIT */
if (policy->governor_data)
return -EBUSY;
cpufreq_enable_fast_switch(policy);
if (policy->fast_switch_possible && !policy->fast_switch_enabled)
BUG_ON(1);
wg_policy = waltgov_policy_alloc(policy);
if (!wg_policy) {
ret = -ENOMEM;
goto disable_fast_switch;
}
ret = waltgov_kthread_create(wg_policy);
if (ret)
goto free_wg_policy;
tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
if (!tunables) {
ret = -ENOMEM;
goto stop_kthread;
}
gov_attr_set_init(&tunables->attr_set, &wg_policy->tunables_hook);
tunables->hispeed_load = DEFAULT_HISPEED_LOAD;
switch (policy->cpu) {
default:
case 0:
tunables->rtg_boost_freq = DEFAULT_CPU0_RTG_BOOST_FREQ;
break;
case 4:
tunables->rtg_boost_freq = DEFAULT_CPU4_RTG_BOOST_FREQ;
break;
case 7:
tunables->rtg_boost_freq = DEFAULT_CPU7_RTG_BOOST_FREQ;
break;
}
policy->governor_data = wg_policy;
wg_policy->tunables = tunables;
waltgov_tunables_restore(policy);
ret = kobject_init_and_add(&tunables->attr_set.kobj, &waltgov_tunables_ktype,
get_governor_parent_kobj(policy), "%s",
walt_gov.name);
if (ret)
goto fail;
return 0;
fail:
kobject_put(&tunables->attr_set.kobj);
policy->governor_data = NULL;
kfree(tunables);
stop_kthread:
waltgov_kthread_stop(wg_policy);
free_wg_policy:
waltgov_policy_free(wg_policy);
disable_fast_switch:
cpufreq_disable_fast_switch(policy);
pr_err("initialization failed (error %d)\n", ret);
return ret;
}
static void waltgov_exit(struct cpufreq_policy *policy)
{
struct waltgov_policy *wg_policy = policy->governor_data;
struct waltgov_tunables *tunables = wg_policy->tunables;
unsigned int count;
count = gov_attr_set_put(&tunables->attr_set, &wg_policy->tunables_hook);
policy->governor_data = NULL;
if (!count) {
waltgov_tunables_save(policy, tunables);
kfree(tunables);
}
waltgov_kthread_stop(wg_policy);
waltgov_policy_free(wg_policy);
cpufreq_disable_fast_switch(policy);
}
static int waltgov_start(struct cpufreq_policy *policy)
{
struct waltgov_policy *wg_policy = policy->governor_data;
unsigned int cpu;
wg_policy->up_rate_delay_ns =
wg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
wg_policy->down_rate_delay_ns =
wg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
update_min_rate_limit_ns(wg_policy);
wg_policy->last_freq_update_time = 0;
wg_policy->next_freq = 0;
wg_policy->limits_changed = false;
wg_policy->need_freq_update = false;
wg_policy->cached_raw_freq = 0;
for_each_cpu(cpu, policy->cpus) {
struct waltgov_cpu *wg_cpu = &per_cpu(waltgov_cpu, cpu);
memset(wg_cpu, 0, sizeof(*wg_cpu));
wg_cpu->cpu = cpu;
wg_cpu->wg_policy = wg_policy;
}
for_each_cpu(cpu, policy->cpus) {
struct waltgov_cpu *wg_cpu = &per_cpu(waltgov_cpu, cpu);
waltgov_add_callback(cpu, &wg_cpu->cb, waltgov_update_freq);
}
return 0;
}
static void waltgov_stop(struct cpufreq_policy *policy)
{
struct waltgov_policy *wg_policy = policy->governor_data;
unsigned int cpu;
for_each_cpu(cpu, policy->cpus)
waltgov_remove_callback(cpu);
synchronize_rcu();
if (!policy->fast_switch_enabled) {
irq_work_sync(&wg_policy->irq_work);
kthread_cancel_work_sync(&wg_policy->work);
}
}
static void waltgov_limits(struct cpufreq_policy *policy)
{
struct waltgov_policy *wg_policy = policy->governor_data;
unsigned long flags, now;
unsigned int freq;
if (!policy->fast_switch_enabled) {
mutex_lock(&wg_policy->work_lock);
raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
waltgov_track_cycles(wg_policy, wg_policy->policy->cur,
ktime_get_ns());
raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
cpufreq_policy_apply_limits(policy);
mutex_unlock(&wg_policy->work_lock);
} else {
raw_spin_lock_irqsave(&wg_policy->update_lock, flags);
freq = policy->cur;
now = ktime_get_ns();
/*
* cpufreq_driver_resolve_freq() has a clamp, so we do not need
* to do any sort of additional validation here.
*/
freq = cpufreq_driver_resolve_freq(policy, freq);
wg_policy->cached_raw_freq = freq;
waltgov_fast_switch(wg_policy, now, freq);
raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags);
}
wg_policy->limits_changed = true;
}
static struct cpufreq_governor walt_gov = {
.name = "walt",
.init = waltgov_init,
.exit = waltgov_exit,
.start = waltgov_start,
.stop = waltgov_stop,
.limits = waltgov_limits,
.owner = THIS_MODULE,
};
int waltgov_register(void)
{
return cpufreq_register_governor(&walt_gov);
}
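
A rough illustration of the util-to-frequency mapping that get_next_freq() relies on. The 25% headroom matches the mainline map_util_freq() helper; the function below is a standalone sketch, not code from this patch, and the numbers in the comment are only an example.

/*
 * Sketch of the mapping applied before cpufreq_driver_resolve_freq()
 * snaps the result to a supported OPP:
 *
 *   freq = 1.25 * max_freq * util / max_cap
 *
 * e.g. max_freq = 2000000 kHz, util = 614, max_cap = 1024
 *      -> (2000000 + 500000) * 614 / 1024 ~= 1499023 kHz
 */
static unsigned int example_map_util_freq(unsigned long util,
					  unsigned int max_freq,
					  unsigned long max_cap)
{
	return (max_freq + (max_freq >> 2)) * util / max_cap;
}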

kernel/sched/walt/fixup.c (new file)

@@ -0,0 +1,91 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016-2021, The Linux Foundation. All rights reserved.
*/
#include <trace/hooks/cpufreq.h>
#include "walt.h"
unsigned int cpuinfo_max_freq_cached;
char sched_lib_name[LIB_PATH_LENGTH];
unsigned int sched_lib_mask_force;
bool is_sched_lib_based_app(pid_t pid)
{
const char *name = NULL;
char *libname, *lib_list;
struct vm_area_struct *vma;
char path_buf[LIB_PATH_LENGTH];
char *tmp_lib_name;
bool found = false;
struct task_struct *p;
struct mm_struct *mm;
if (strnlen(sched_lib_name, LIB_PATH_LENGTH) == 0)
return false;
tmp_lib_name = kmalloc(LIB_PATH_LENGTH, GFP_KERNEL);
if (!tmp_lib_name)
return false;
rcu_read_lock();
p = pid ? get_pid_task(find_vpid(pid), PIDTYPE_PID) : current;
if (!p) {
rcu_read_unlock();
kfree(tmp_lib_name);
return false;
}
/* Prevent p going away */
get_task_struct(p);
rcu_read_unlock();
mm = get_task_mm(p);
if (!mm)
goto put_task_struct;
down_read(&mm->mmap_lock);
for (vma = mm->mmap; vma ; vma = vma->vm_next) {
if (vma->vm_file && vma->vm_flags & VM_EXEC) {
name = d_path(&vma->vm_file->f_path,
path_buf, LIB_PATH_LENGTH);
if (IS_ERR(name))
goto release_sem;
strlcpy(tmp_lib_name, sched_lib_name, LIB_PATH_LENGTH);
lib_list = tmp_lib_name;
while ((libname = strsep(&lib_list, ","))) {
libname = skip_spaces(libname);
if (strnstr(name, libname,
strnlen(name, LIB_PATH_LENGTH))) {
found = true;
goto release_sem;
}
}
}
}
release_sem:
up_read(&mm->mmap_lock);
mmput(mm);
put_task_struct:
put_task_struct(p);
kfree(tmp_lib_name);
return found;
}
void android_vh_show_max_freq(void *unused, struct cpufreq_policy *policy,
unsigned int *max_freq)
{
if (!cpuinfo_max_freq_cached)
return;
if (!(BIT(policy->cpu) & sched_lib_mask_force))
return;
if (is_sched_lib_based_app(current->pid))
*max_freq = cpuinfo_max_freq_cached << 1;
}
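
android_vh_show_max_freq() is written as a vendor-hook handler, so it is presumably attached during module init elsewhere in this series (not visible in this hunk). A sketch of what that registration would look like, assuming the register_trace_android_vh_show_max_freq() helper generated for the hook declared in trace/hooks/cpufreq.h; the init function name is made up.

/* Hypothetical init-time registration for the handler above. */
static int example_fixup_init(void)
{
	/* The second argument is opaque priv data passed back to the handler. */
	return register_trace_android_vh_show_max_freq(android_vh_show_max_freq,
						       NULL);
}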


@@ -0,0 +1,300 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013-2015,2017,2019-2021, The Linux Foundation. All rights reserved.
*/
#define pr_fmt(fmt) "input-boost: " fmt
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/input.h>
#include <linux/time.h>
#include <linux/sysfs.h>
#include <linux/pm_qos.h>
#include "walt.h"
#define input_boost_attr_rw(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, show_##_name, store_##_name)
#define show_one(file_name) \
static ssize_t show_##file_name \
(struct kobject *kobj, struct kobj_attribute *attr, char *buf) \
{ \
return scnprintf(buf, PAGE_SIZE, "%u\n", file_name); \
}
#define store_one(file_name) \
static ssize_t store_##file_name \
(struct kobject *kobj, struct kobj_attribute *attr, \
const char *buf, size_t count) \
{ \
\
sscanf(buf, "%u", &file_name); \
return count; \
}
struct cpu_sync {
int cpu;
unsigned int input_boost_min;
unsigned int input_boost_freq;
};
static DEFINE_PER_CPU(struct cpu_sync, sync_info);
static struct workqueue_struct *input_boost_wq;
static struct work_struct input_boost_work;
static bool sched_boost_active;
static struct delayed_work input_boost_rem;
static u64 last_input_time;
#define MIN_INPUT_INTERVAL (150 * USEC_PER_MSEC)
static DEFINE_PER_CPU(struct freq_qos_request, qos_req);
static void boost_adjust_notify(struct cpufreq_policy *policy)
{
unsigned int cpu = policy->cpu;
struct cpu_sync *s = &per_cpu(sync_info, cpu);
unsigned int ib_min = s->input_boost_min;
struct freq_qos_request *req = &per_cpu(qos_req, cpu);
int ret;
pr_debug("CPU%u policy min before boost: %u kHz\n",
cpu, policy->min);
pr_debug("CPU%u boost min: %u kHz\n", cpu, ib_min);
ret = freq_qos_update_request(req, ib_min);
if (ret < 0)
pr_err("Failed to update freq constraint in boost_adjust: %d\n",
ib_min);
pr_debug("CPU%u policy min after boost: %u kHz\n", cpu, policy->min);
}
static void update_policy_online(void)
{
unsigned int i;
struct cpufreq_policy *policy;
struct cpumask online_cpus;
/* Re-evaluate policy to trigger adjust notifier for online CPUs */
get_online_cpus();
online_cpus = *cpu_online_mask;
for_each_cpu(i, &online_cpus) {
policy = cpufreq_cpu_get(i);
if (!policy) {
pr_err("%s: cpufreq policy not found for cpu%d\n",
__func__, i);
return;
}
cpumask_andnot(&online_cpus, &online_cpus,
policy->related_cpus);
boost_adjust_notify(policy);
}
put_online_cpus();
}
static void do_input_boost_rem(struct work_struct *work)
{
unsigned int i, ret;
struct cpu_sync *i_sync_info;
/* Reset the input_boost_min for all CPUs in the system */
pr_debug("Resetting input boost min for all CPUs\n");
for_each_possible_cpu(i) {
i_sync_info = &per_cpu(sync_info, i);
i_sync_info->input_boost_min = 0;
}
/* Update policies for all online CPUs */
update_policy_online();
if (sched_boost_active) {
ret = sched_set_boost(0);
if (ret)
pr_err("input-boost: sched boost disable failed\n");
sched_boost_active = false;
}
}
static void do_input_boost(struct work_struct *work)
{
unsigned int i, ret;
struct cpu_sync *i_sync_info;
cancel_delayed_work_sync(&input_boost_rem);
if (sched_boost_active) {
sched_set_boost(0);
sched_boost_active = false;
}
/* Set the input_boost_min for all CPUs in the system */
pr_debug("Setting input boost min for all CPUs\n");
for (i = 0; i < 8; i++) {
i_sync_info = &per_cpu(sync_info, i);
i_sync_info->input_boost_min = sysctl_input_boost_freq[i];
}
/* Update policies for all online CPUs */
update_policy_online();
/* Enable scheduler boost to migrate tasks to big cluster */
if (sysctl_sched_boost_on_input > 0) {
ret = sched_set_boost(sysctl_sched_boost_on_input);
if (ret)
pr_err("input-boost: sched boost enable failed\n");
else
sched_boost_active = true;
}
queue_delayed_work(input_boost_wq, &input_boost_rem,
msecs_to_jiffies(sysctl_input_boost_ms));
}
static void inputboost_input_event(struct input_handle *handle,
unsigned int type, unsigned int code, int value)
{
u64 now;
int cpu;
int enabled = 0;
for_each_possible_cpu(cpu) {
if (sysctl_input_boost_freq[cpu] > 0) {
enabled = 1;
break;
}
}
if (!enabled)
return;
now = ktime_to_us(ktime_get());
if (now - last_input_time < MIN_INPUT_INTERVAL)
return;
if (work_pending(&input_boost_work))
return;
queue_work(input_boost_wq, &input_boost_work);
last_input_time = ktime_to_us(ktime_get());
}
static int inputboost_input_connect(struct input_handler *handler,
struct input_dev *dev, const struct input_device_id *id)
{
struct input_handle *handle;
int error;
handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL);
if (!handle)
return -ENOMEM;
handle->dev = dev;
handle->handler = handler;
handle->name = "cpufreq";
error = input_register_handle(handle);
if (error)
goto err2;
error = input_open_device(handle);
if (error)
goto err1;
return 0;
err1:
input_unregister_handle(handle);
err2:
kfree(handle);
return error;
}
static void inputboost_input_disconnect(struct input_handle *handle)
{
input_close_device(handle);
input_unregister_handle(handle);
kfree(handle);
}
static const struct input_device_id inputboost_ids[] = {
/* multi-touch touchscreen */
{
.flags = INPUT_DEVICE_ID_MATCH_EVBIT |
INPUT_DEVICE_ID_MATCH_ABSBIT,
.evbit = { BIT_MASK(EV_ABS) },
.absbit = { [BIT_WORD(ABS_MT_POSITION_X)] =
BIT_MASK(ABS_MT_POSITION_X) |
BIT_MASK(ABS_MT_POSITION_Y)
},
},
/* touchpad */
{
.flags = INPUT_DEVICE_ID_MATCH_KEYBIT |
INPUT_DEVICE_ID_MATCH_ABSBIT,
.keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) },
.absbit = { [BIT_WORD(ABS_X)] =
BIT_MASK(ABS_X) | BIT_MASK(ABS_Y)
},
},
/* Keypad */
{
.flags = INPUT_DEVICE_ID_MATCH_EVBIT,
.evbit = { BIT_MASK(EV_KEY) },
},
{ },
};
static struct input_handler inputboost_input_handler = {
.event = inputboost_input_event,
.connect = inputboost_input_connect,
.disconnect = inputboost_input_disconnect,
.name = "input-boost",
.id_table = inputboost_ids,
};
struct kobject *input_boost_kobj;
int input_boost_init(void)
{
int cpu, ret;
struct cpu_sync *s;
struct cpufreq_policy *policy;
struct freq_qos_request *req;
input_boost_wq = alloc_workqueue("inputboost_wq", WQ_HIGHPRI, 0);
if (!input_boost_wq)
return -EFAULT;
INIT_WORK(&input_boost_work, do_input_boost);
INIT_DELAYED_WORK(&input_boost_rem, do_input_boost_rem);
for_each_possible_cpu(cpu) {
s = &per_cpu(sync_info, cpu);
s->cpu = cpu;
req = &per_cpu(qos_req, cpu);
policy = cpufreq_cpu_get(cpu);
if (!policy) {
pr_err("%s: cpufreq policy not found for cpu%d\n",
__func__, cpu);
return -ESRCH;
}
ret = freq_qos_add_request(&policy->constraints, req,
FREQ_QOS_MIN, policy->min);
if (ret < 0) {
pr_err("%s: Failed to add freq constraint (%d)\n",
__func__, ret);
return ret;
}
}
ret = input_register_handler(&inputboost_input_handler);
return 0;
}


@@ -0,0 +1,177 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2020-2021 The Linux Foundation. All rights reserved.
*/
#include <linux/ftrace.h>
#include <linux/sched.h>
#include <linux/sysctl.h>
#include <linux/printk.h>
#include <linux/sched/clock.h>
#include <trace/hooks/preemptirq.h>
#define CREATE_TRACE_POINTS
#include "preemptirq_long.h"
#define IRQSOFF_SENTINEL 0x0fffDEAD
static unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000;
static unsigned int sysctl_irqsoff_tracing_threshold_ns = 5000000;
static unsigned int sysctl_irqsoff_dmesg_output_enabled;
static unsigned int sysctl_irqsoff_crash_sentinel_value;
static unsigned int sysctl_irqsoff_crash_threshold_ns = 10000000;
static unsigned int half_million = 500000;
static unsigned int one_hundred_million = 100000000;
static unsigned int one_million = 1000000;
static DEFINE_PER_CPU(u64, irq_disabled_ts);
/*
* preemption disable tracking require additional context
* to rule out false positives. see the comment in
* test_preempt_disable_long() for more details.
*/
struct preempt_store {
u64 ts;
int pid;
unsigned long ncsw;
};
static DEFINE_PER_CPU(struct preempt_store, the_ps);
static void note_irq_disable(void *u1, unsigned long u2, unsigned long u3)
{
if (is_idle_task(current))
return;
/*
* We just have to note down the time stamp here. We
* use stacktrace trigger feature to print the stacktrace.
*/
this_cpu_write(irq_disabled_ts, sched_clock());
}
static void test_irq_disable_long(void *u1, unsigned long u2, unsigned long u3)
{
u64 ts = this_cpu_read(irq_disabled_ts);
if (!ts)
return;
this_cpu_write(irq_disabled_ts, 0);
ts = sched_clock() - ts;
if (ts > sysctl_irqsoff_tracing_threshold_ns) {
trace_irq_disable_long(ts);
if (sysctl_irqsoff_dmesg_output_enabled == IRQSOFF_SENTINEL)
printk_deferred("D=%llu C:(%ps<-%ps<-%ps<-%ps)\n",
ts, (void *)CALLER_ADDR2,
(void *)CALLER_ADDR3,
(void *)CALLER_ADDR4,
(void *)CALLER_ADDR5);
}
if (sysctl_irqsoff_crash_sentinel_value == IRQSOFF_SENTINEL &&
ts > sysctl_irqsoff_crash_threshold_ns) {
printk_deferred("delta=%llu(ns) > crash_threshold=%u(ns) Task=%s\n",
ts, sysctl_irqsoff_crash_threshold_ns,
current->comm);
BUG_ON(1);
}
}
static void note_preempt_disable(void *u1, unsigned long u2, unsigned long u3)
{
struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
ps->ts = sched_clock();
ps->pid = current->pid;
ps->ncsw = current->nvcsw + current->nivcsw;
}
static void test_preempt_disable_long(void *u1, unsigned long u2,
unsigned long u3)
{
struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
u64 delta = 0;
if (!ps->ts)
return;
/*
* schedule() calls __schedule() with preemption disabled.
* if we had entered idle and exiting idle now, we think
* preemption is disabled the whole time. Detect this by
* checking if the preemption is disabled across the same
* task. There is a possiblity that the same task is scheduled
* after idle. To rule out this possibility, compare the
* context switch count also.
*/
if (ps->pid == current->pid && (ps->ncsw == current->nvcsw +
current->nivcsw))
delta = sched_clock() - ps->ts;
ps->ts = 0;
if (delta > sysctl_preemptoff_tracing_threshold_ns)
trace_preempt_disable_long(delta);
}
static struct ctl_table preemptirq_long_table[] = {
{
.procname = "preemptoff_tracing_threshold_ns",
.data = &sysctl_preemptoff_tracing_threshold_ns,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "irqsoff_tracing_threshold_ns",
.data = &sysctl_irqsoff_tracing_threshold_ns,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
.extra1 = &half_million,
.extra2 = &one_hundred_million,
},
{
.procname = "irqsoff_dmesg_output_enabled",
.data = &sysctl_irqsoff_dmesg_output_enabled,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "irqsoff_crash_sentinel_value",
.data = &sysctl_irqsoff_crash_sentinel_value,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "irqsoff_crash_threshold_ns",
.data = &sysctl_irqsoff_crash_threshold_ns,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
.extra1 = &one_million,
.extra2 = &one_hundred_million,
},
{ }
};
int preemptirq_long_init(void)
{
if (!register_sysctl("preemptirq", preemptirq_long_table)) {
pr_err("Fail to register sysctl table\n");
return -EPERM;
}
register_trace_android_rvh_irqs_disable(note_irq_disable, NULL);
register_trace_android_rvh_irqs_enable(test_irq_disable_long, NULL);
register_trace_android_rvh_preempt_disable(note_preempt_disable, NULL);
register_trace_android_rvh_preempt_enable(test_preempt_disable_long,
NULL);
return 0;
}


@@ -1,13 +1,13 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
-* Copyright (c) 2020 The Linux Foundation. All rights reserved.
+* Copyright (c) 2021 The Linux Foundation. All rights reserved.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM preemptirq_long
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH trace/events
+#define TRACE_INCLUDE_PATH .
#if !defined(_TRACE_PREEMPTIRQ_LONG_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PREEMPTIRQ_LONG_H


@@ -0,0 +1,52 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2019-2021, The Linux Foundation. All rights reserved.
*/
#include <linux/irq.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include "walt.h"
#ifdef CONFIG_HOTPLUG_CPU
cpumask_t pending_active_mask = CPU_MASK_NONE;
int sched_pause_count(const cpumask_t *mask, bool include_offline)
{
cpumask_t count_mask = CPU_MASK_NONE;
cpumask_t pause_mask = CPU_MASK_NONE;
if (cpumask_any(&pending_active_mask) >= nr_cpu_ids) {
/* initialize pending_active_state */
cpumask_copy(&pending_active_mask, cpu_active_mask);
}
if (include_offline) {
/* get all offline or paused cpus */
cpumask_complement(&pause_mask, &pending_active_mask);
cpumask_complement(&count_mask, cpu_online_mask);
cpumask_or(&count_mask, &count_mask, &pause_mask);
/* get all offline or paused cpus in this cluster */
cpumask_and(&count_mask, &count_mask, mask);
} else {
cpumask_andnot(&count_mask, mask, &pending_active_mask);
}
return cpumask_weight(&count_mask);
}
void sched_pause_pending(int cpu)
{
cpumask_clear_cpu(cpu, &pending_active_mask);
}
void sched_unpause_pending(int cpu)
{
cpumask_set_cpu(cpu, &pending_active_mask);
}
#endif /* CONFIG_HOTPLUG_CPU */


@@ -0,0 +1,250 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2012, 2015-2021, The Linux Foundation. All rights reserved.
*/
/*
* Scheduler hook for average runqueue determination
*/
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include "walt.h"
#include "trace.h"
static DEFINE_PER_CPU(u64, nr_prod_sum);
static DEFINE_PER_CPU(u64, last_time);
static DEFINE_PER_CPU(u64, nr_big_prod_sum);
static DEFINE_PER_CPU(u64, nr);
static DEFINE_PER_CPU(u64, nr_max);
static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock);
static s64 last_get_time;
static DEFINE_PER_CPU(atomic64_t, busy_hyst_end_time) = ATOMIC64_INIT(0);
static DEFINE_PER_CPU(u64, hyst_time);
static DEFINE_PER_CPU(u64, coloc_hyst_busy);
static DEFINE_PER_CPU(u64, coloc_hyst_time);
#define NR_THRESHOLD_PCT 15
#define MAX_RTGB_TIME (sysctl_sched_coloc_busy_hyst_max_ms * NSEC_PER_MSEC)
/**
* sched_get_nr_running_avg
* @return: Average nr_running, iowait and nr_big_tasks value since last poll.
* Returns the avg * 100 to return up to two decimal points
* of accuracy.
*
* Obtains the average nr_running value since the last poll.
* This function may not be called concurrently with itself
*/
void sched_get_nr_running_avg(struct sched_avg_stats *stats)
{
int cpu;
u64 curr_time = sched_clock();
u64 period = curr_time - last_get_time;
u64 tmp_nr, tmp_misfit;
bool any_hyst_time = false;
if (!period)
return;
/* read and reset nr_running counts */
for_each_possible_cpu(cpu) {
unsigned long flags;
u64 diff;
spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
curr_time = sched_clock();
diff = curr_time - per_cpu(last_time, cpu);
BUG_ON((s64)diff < 0);
tmp_nr = per_cpu(nr_prod_sum, cpu);
tmp_nr += per_cpu(nr, cpu) * diff;
tmp_nr = div64_u64((tmp_nr * 100), period);
tmp_misfit = per_cpu(nr_big_prod_sum, cpu);
tmp_misfit += walt_big_tasks(cpu) * diff;
tmp_misfit = div64_u64((tmp_misfit * 100), period);
/*
* NR_THRESHOLD_PCT is to make sure that the task ran
* at least 85% in the last window to compensate any
* over estimating being done.
*/
stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT),
100);
stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit +
NR_THRESHOLD_PCT), 100);
stats[cpu].nr_max = per_cpu(nr_max, cpu);
stats[cpu].nr_scaled = tmp_nr;
trace_sched_get_nr_running_avg(cpu, stats[cpu].nr,
stats[cpu].nr_misfit, stats[cpu].nr_max,
stats[cpu].nr_scaled);
per_cpu(last_time, cpu) = curr_time;
per_cpu(nr_prod_sum, cpu) = 0;
per_cpu(nr_big_prod_sum, cpu) = 0;
per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
}
for_each_possible_cpu(cpu) {
if (per_cpu(coloc_hyst_time, cpu)) {
any_hyst_time = true;
break;
}
}
if (any_hyst_time && get_rtgb_active_time() >= MAX_RTGB_TIME)
sched_update_hyst_times();
last_get_time = curr_time;
}
EXPORT_SYMBOL(sched_get_nr_running_avg);
void sched_update_hyst_times(void)
{
bool rtgb_active;
int cpu;
unsigned long cpu_cap, coloc_busy_pct;
rtgb_active = is_rtgb_active() && (sched_boost() != CONSERVATIVE_BOOST)
&& (get_rtgb_active_time() < MAX_RTGB_TIME);
for_each_possible_cpu(cpu) {
cpu_cap = arch_scale_cpu_capacity(cpu);
coloc_busy_pct = sysctl_sched_coloc_busy_hyst_cpu_busy_pct[cpu];
per_cpu(hyst_time, cpu) = (BIT(cpu)
& sysctl_sched_busy_hyst_enable_cpus) ?
sysctl_sched_busy_hyst : 0;
per_cpu(coloc_hyst_time, cpu) = ((BIT(cpu)
& sysctl_sched_coloc_busy_hyst_enable_cpus)
&& rtgb_active) ?
sysctl_sched_coloc_busy_hyst_cpu[cpu] : 0;
per_cpu(coloc_hyst_busy, cpu) = mult_frac(cpu_cap,
coloc_busy_pct, 100);
}
}
#define BUSY_NR_RUN 3
#define BUSY_LOAD_FACTOR 10
static inline void update_busy_hyst_end_time(int cpu, bool dequeue,
unsigned long prev_nr_run, u64 curr_time)
{
bool nr_run_trigger = false;
bool load_trigger = false, coloc_load_trigger = false;
u64 agg_hyst_time;
if (!per_cpu(hyst_time, cpu) && !per_cpu(coloc_hyst_time, cpu))
return;
if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN)
nr_run_trigger = true;
if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) >
capacity_orig_of(cpu))
load_trigger = true;
if (dequeue && cpu_util(cpu) > per_cpu(coloc_hyst_busy, cpu))
coloc_load_trigger = true;
agg_hyst_time = max((nr_run_trigger || load_trigger) ?
per_cpu(hyst_time, cpu) : 0,
(nr_run_trigger || coloc_load_trigger) ?
per_cpu(coloc_hyst_time, cpu) : 0);
if (agg_hyst_time)
atomic64_set(&per_cpu(busy_hyst_end_time, cpu),
curr_time + agg_hyst_time);
}
int sched_busy_hyst_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
if (table->maxlen > (sizeof(unsigned int) * num_possible_cpus()))
table->maxlen = sizeof(unsigned int) * num_possible_cpus();
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
sched_update_hyst_times();
return ret;
}
/**
* sched_update_nr_prod
* @cpu: The CPU id for which the nr_running statistics are updated.
* @enq: true if an enqueue, false if a dequeue is happening on this CPU.
* @return: N/A
*
* Update the running average with the latest nr_running value for the CPU.
*/
void sched_update_nr_prod(int cpu, bool enq)
{
u64 diff;
u64 curr_time;
unsigned long flags, nr_running;
spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags);
nr_running = per_cpu(nr, cpu);
curr_time = sched_clock();
diff = curr_time - per_cpu(last_time, cpu);
BUG_ON((s64)diff < 0);
per_cpu(last_time, cpu) = curr_time;
per_cpu(nr, cpu) = cpu_rq(cpu)->nr_running;
if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu))
per_cpu(nr_max, cpu) = per_cpu(nr, cpu);
update_busy_hyst_end_time(cpu, !enq, nr_running, curr_time);
per_cpu(nr_prod_sum, cpu) += nr_running * diff;
per_cpu(nr_big_prod_sum, cpu) += walt_big_tasks(cpu) * diff;
spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags);
}
/*
* Returns the CPU utilization % in the last window.
*/
unsigned int sched_get_cpu_util(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 util;
unsigned long capacity, flags;
unsigned int busy;
struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
raw_spin_lock_irqsave(&rq->lock, flags);
capacity = capacity_orig_of(cpu);
util = wrq->prev_runnable_sum + wrq->grp_time.prev_runnable_sum;
util = div64_u64(util, sched_ravg_window >> SCHED_CAPACITY_SHIFT);
raw_spin_unlock_irqrestore(&rq->lock, flags);
util = (util >= capacity) ? capacity : util;
busy = div64_ul((util * 100), capacity);
return busy;
}
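sched_get_cpu_util() normalizes the window's runnable time into SCHED_CAPACITY units and then expresses it as a percentage of the CPU's original capacity. The sketch below reproduces that arithmetic in user space; the 20 ms window and 1024 capacity are assumed example values.

/* Illustrative arithmetic behind sched_get_cpu_util(). */
#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT 10

int main(void)
{
	uint64_t sched_ravg_window = 20000000;	/* assumed 20 ms window, in ns */
	uint64_t prev_runnable_sum = 10000000;	/* CPU was runnable for 10 ms of it */
	unsigned long capacity = 1024;		/* assumed capacity_orig_of(cpu) */

	uint64_t util = prev_runnable_sum /
			(sched_ravg_window >> SCHED_CAPACITY_SHIFT);
	if (util > capacity)
		util = capacity;
	printf("util = %llu, busy = %llu%%\n",
	       (unsigned long long)util,
	       (unsigned long long)(util * 100 / capacity));	/* 512, 50% */
	return 0;
}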
u64 sched_lpm_disallowed_time(int cpu)
{
u64 now = sched_clock();
u64 bias_end_time = atomic64_read(&per_cpu(busy_hyst_end_time, cpu));
if (now < bias_end_time)
return bias_end_time - now;
return 0;
}
EXPORT_SYMBOL(sched_lpm_disallowed_time);

900
kernel/sched/walt/sysctl.c Normal file
View File

@ -0,0 +1,900 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
*/
#include "walt.h"
static int neg_three = -3;
static int three = 3;
static int two_hundred_fifty_five = 255;
static unsigned int ns_per_sec = NSEC_PER_SEC;
static unsigned int one_hundred_thousand = 100000;
static unsigned int two_hundred_million = 200000000;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
static int one_hundred = 100;
static int one_thousand = 1000;
/*
* CFS task prio range is [100 ... 139]
* 120 is the default prio.
* RTG boost range is [100 ... 119] because giving
* boost for [120 .. 139] does not make sense.
* 99 means disabled and it is the default value.
*/
static unsigned int min_cfs_boost_prio = 99;
static unsigned int max_cfs_boost_prio = 119;
unsigned int sysctl_sched_capacity_margin_up_pct[MAX_MARGIN_LEVELS];
unsigned int sysctl_sched_capacity_margin_dn_pct[MAX_MARGIN_LEVELS];
unsigned int sysctl_sched_busy_hyst_enable_cpus;
unsigned int sysctl_sched_busy_hyst;
unsigned int sysctl_sched_coloc_busy_hyst_enable_cpus;
unsigned int sysctl_sched_coloc_busy_hyst_cpu[WALT_NR_CPUS];
unsigned int sysctl_sched_coloc_busy_hyst_max_ms;
unsigned int sysctl_sched_coloc_busy_hyst_cpu_busy_pct[WALT_NR_CPUS];
unsigned int sysctl_sched_boost;
unsigned int sysctl_sched_wake_up_idle[2];
unsigned int sysctl_input_boost_ms;
unsigned int sysctl_input_boost_freq[8];
unsigned int sysctl_sched_boost_on_input;
unsigned int sysctl_sched_init_stage;
unsigned int sysctl_sched_load_boost[WALT_NR_CPUS];
/* sysctl nodes accessed by other files */
unsigned int __read_mostly sysctl_sched_coloc_downmigrate_ns;
unsigned int __read_mostly sysctl_sched_group_downmigrate_pct;
unsigned int __read_mostly sysctl_sched_group_upmigrate_pct;
unsigned int __read_mostly sysctl_sched_window_stats_policy;
unsigned int sysctl_sched_ravg_window_nr_ticks;
unsigned int sysctl_sched_dynamic_ravg_window_enable;
unsigned int sysctl_sched_walt_rotate_big_tasks;
unsigned int sysctl_sched_task_unfilter_period;
unsigned int __read_mostly sysctl_sched_asym_cap_sibling_freq_match_pct;
unsigned int sysctl_walt_low_latency_task_threshold; /* disabled by default */
unsigned int sysctl_task_read_pid;
unsigned int sysctl_sched_conservative_pl;
unsigned int sysctl_sched_min_task_util_for_boost = 51;
unsigned int sysctl_sched_min_task_util_for_colocation = 35;
unsigned int sysctl_sched_many_wakeup_threshold = WALT_MANY_WAKEUP_DEFAULT;
const int sched_user_hint_max = 1000;
static void init_tg_pointers(void)
{
struct cgroup_subsys_state *css = &root_task_group.css;
struct cgroup_subsys_state *top_css = css;
/* ptrs are already initialized */
if (task_group_topapp)
return;
css_for_each_child(css, top_css) {
if (!strcmp(css->cgroup->kn->name, "top-app")) {
task_group_topapp = css_tg(css);
walt_init_topapp_tg(task_group_topapp);
} else if (!strcmp(css->cgroup->kn->name, "foreground")) {
task_group_foreground = css_tg(css);
walt_init_foreground_tg(task_group_foreground);
} else {
walt_init_tg(css_tg(css));
}
}
}
static int walt_init_stage_handler(struct ctl_table *table,
int write, void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
int old_value = sysctl_sched_init_stage;
mutex_lock(&mutex);
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write)
goto unlock;
if (sysctl_sched_init_stage == 1 &&
old_value != sysctl_sched_init_stage) {
init_tg_pointers();
}
unlock:
mutex_unlock(&mutex);
return ret;
}
static int walt_proc_group_thresholds_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
static DEFINE_MUTEX(mutex);
struct rq *rq = cpu_rq(cpumask_first(cpu_possible_mask));
unsigned long flags;
if (unlikely(num_sched_clusters <= 0))
return -EPERM;
mutex_lock(&mutex);
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write) {
mutex_unlock(&mutex);
return ret;
}
/*
* The load scale factor update happens with all
* rqs locked, so acquiring one CPU's rq lock and
* updating the thresholds is sufficient for
* an atomic update.
*/
raw_spin_lock_irqsave(&rq->lock, flags);
walt_update_group_thresholds();
raw_spin_unlock_irqrestore(&rq->lock, flags);
mutex_unlock(&mutex);
return ret;
}
static int walt_proc_user_hint_handler(struct ctl_table *table,
int write, void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret;
unsigned int old_value;
static DEFINE_MUTEX(mutex);
mutex_lock(&mutex);
sched_user_hint_reset_time = jiffies + HZ;
old_value = sysctl_sched_user_hint;
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write || (old_value == sysctl_sched_user_hint))
goto unlock;
walt_irq_work_queue(&walt_migration_irq_work);
unlock:
mutex_unlock(&mutex);
return ret;
}
static int sched_ravg_window_handler(struct ctl_table *table,
int write, void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = -EPERM;
static DEFINE_MUTEX(mutex);
int val = sysctl_sched_ravg_window_nr_ticks;
struct ctl_table tmp = {
.data = &val,
.maxlen = sizeof(val),
.mode = table->mode,
};
mutex_lock(&mutex);
if (write && (HZ != 250 || !sysctl_sched_dynamic_ravg_window_enable))
goto unlock;
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (ret || !write || (val == sysctl_sched_ravg_window_nr_ticks))
goto unlock;
if (val != 2 && val != 3 && val != 4 && val != 5 && val != 8) {
ret = -EINVAL;
goto unlock;
}
sysctl_sched_ravg_window_nr_ticks = val;
sched_window_nr_ticks_change();
unlock:
mutex_unlock(&mutex);
return ret;
}
enum {
TASK_BEGIN = 0,
WAKE_UP_IDLE,
INIT_TASK_LOAD,
GROUP_ID,
PER_TASK_BOOST,
PER_TASK_BOOST_PERIOD_MS,
LOW_LATENCY,
};
static int sched_task_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret, param;
struct task_struct *task;
int pid_and_val[2] = {-1, -1};
int val;
struct walt_task_struct *wts;
struct ctl_table tmp = {
.data = &pid_and_val,
.maxlen = sizeof(pid_and_val),
.mode = table->mode,
};
static DEFINE_MUTEX(mutex);
mutex_lock(&mutex);
if (!write) {
if (sysctl_task_read_pid <= 0) {
ret = -ENOENT;
goto unlock_mutex;
}
task = get_pid_task(find_vpid(sysctl_task_read_pid),
PIDTYPE_PID);
if (!task) {
ret = -ENOENT;
goto unlock_mutex;
}
wts = (struct walt_task_struct *) task->android_vendor_data1;
pid_and_val[0] = sysctl_task_read_pid;
param = (unsigned long)table->data;
switch (param) {
case WAKE_UP_IDLE:
pid_and_val[1] = wts->wake_up_idle;
break;
case INIT_TASK_LOAD:
pid_and_val[1] = wts->init_load_pct;
break;
case GROUP_ID:
pid_and_val[1] = sched_get_group_id(task);
break;
case PER_TASK_BOOST:
pid_and_val[1] = wts->boost;
break;
case PER_TASK_BOOST_PERIOD_MS:
pid_and_val[1] =
div64_ul(wts->boost_period,
1000000UL);
break;
case LOW_LATENCY:
pid_and_val[1] = wts->low_latency;
break;
default:
ret = -EINVAL;
goto put_task;
}
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
goto put_task;
}
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (ret)
goto unlock_mutex;
if (pid_and_val[0] <= 0 || pid_and_val[1] < 0) {
ret = -ENOENT;
goto unlock_mutex;
}
/* parsed the values successfully in pid_and_val[] array */
task = get_pid_task(find_vpid(pid_and_val[0]), PIDTYPE_PID);
if (!task) {
ret = -ENOENT;
goto unlock_mutex;
}
wts = (struct walt_task_struct *) task->android_vendor_data1;
param = (unsigned long)table->data;
val = pid_and_val[1];
switch (param) {
case WAKE_UP_IDLE:
wts->wake_up_idle = val;
break;
case INIT_TASK_LOAD:
if (pid_and_val[1] < 0 || pid_and_val[1] > 100) {
ret = -EINVAL;
goto put_task;
}
wts->init_load_pct = val;
break;
case GROUP_ID:
ret = sched_set_group_id(task, val);
break;
case PER_TASK_BOOST:
if (val < TASK_BOOST_NONE || val >= TASK_BOOST_END) {
ret = -EINVAL;
goto put_task;
}
wts->boost = val;
if (val == 0)
wts->boost_period = 0;
break;
case PER_TASK_BOOST_PERIOD_MS:
if (wts->boost == 0 && val) {
/* setting boost period w/o boost is invalid */
ret = -EINVAL;
goto put_task;
}
wts->boost_period = (u64)val * 1000 * 1000;
wts->boost_expires = sched_clock() + wts->boost_period;
break;
case LOW_LATENCY:
wts->low_latency = val;
break;
default:
ret = -EINVAL;
}
put_task:
put_task_struct(task);
unlock_mutex:
mutex_unlock(&mutex);
return ret;
}
static int sched_load_boost_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret, i;
unsigned int *data = (unsigned int *)table->data;
int val[WALT_NR_CPUS];
struct ctl_table tmp = {
.data = &val,
.maxlen = sizeof(val),
.mode = table->mode,
};
static DEFINE_MUTEX(mutex);
mutex_lock(&mutex);
if (!write) {
ret = proc_dointvec(table, write, buffer, lenp, ppos);
goto unlock_mutex;
}
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (ret)
goto unlock_mutex;
for (i = 0; i < WALT_NR_CPUS; i++) {
if (val[i] < -100 || val[i] > 1000) {
ret = -EINVAL;
goto unlock_mutex;
}
}
/* all values check out; update the data */
for (i = 0; i < WALT_NR_CPUS; i++)
data[i] = val[i];
unlock_mutex:
mutex_unlock(&mutex);
return ret;
}
#ifdef CONFIG_PROC_SYSCTL
static void sched_update_updown_migrate_values(bool up)
{
int i = 0, cpu;
struct walt_sched_cluster *cluster;
int cap_margin_levels = num_sched_clusters - 1;
if (cap_margin_levels > 1) {
/*
* No need to worry about CPUs in the last cluster
* when there are more than 2 clusters in the system
*/
for_each_sched_cluster(cluster) {
for_each_cpu(cpu, &cluster->cpus) {
if (up)
sched_capacity_margin_up[cpu] =
SCHED_FIXEDPOINT_SCALE * 100 /
sysctl_sched_capacity_margin_up_pct[i];
else
sched_capacity_margin_down[cpu] =
SCHED_FIXEDPOINT_SCALE * 100 /
sysctl_sched_capacity_margin_dn_pct[i];
}
if (++i >= cap_margin_levels)
break;
}
} else {
for_each_possible_cpu(cpu) {
if (up)
sched_capacity_margin_up[cpu] =
SCHED_FIXEDPOINT_SCALE * 100 /
sysctl_sched_capacity_margin_up_pct[0];
else
sched_capacity_margin_down[cpu] =
SCHED_FIXEDPOINT_SCALE * 100 /
sysctl_sched_capacity_margin_dn_pct[0];
}
}
}
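The conversion above is a simple fixed-point reciprocal: a threshold of pct percent becomes SCHED_FIXEDPOINT_SCALE * 100 / pct. A user-space sketch with the 95%/85% defaults that walt_tunables() sets; SCHED_FIXEDPOINT_SCALE is 1024 in the kernel.

/* Sketch of the percent -> fixed-point margin conversion used above. */
#include <stdio.h>

#define SCHED_FIXEDPOINT_SCALE 1024

int main(void)
{
	unsigned int up_pct = 95, dn_pct = 85;	/* defaults from walt_tunables() */

	printf("upmigrate margin   = %u\n", SCHED_FIXEDPOINT_SCALE * 100 / up_pct);
	printf("downmigrate margin = %u\n", SCHED_FIXEDPOINT_SCALE * 100 / dn_pct);
	/*
	 * Prints 1077 and 1204, i.e. roughly 5% and 15% of headroom;
	 * compare the 1078/1205 static defaults in walt_cfs.c below.
	 */
	return 0;
}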
int sched_updown_migrate_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
int ret, i;
unsigned int *data = (unsigned int *)table->data;
static DEFINE_MUTEX(mutex);
int cap_margin_levels = num_sched_clusters ? num_sched_clusters - 1 : 0;
int val[MAX_MARGIN_LEVELS];
struct ctl_table tmp = {
.data = &val,
.maxlen = sizeof(int) * cap_margin_levels,
.mode = table->mode,
};
if (cap_margin_levels <= 0)
return -EINVAL;
mutex_lock(&mutex);
if (!write) {
ret = proc_dointvec(table, write, buffer, lenp, ppos);
goto unlock_mutex;
}
ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
if (ret)
goto unlock_mutex;
/* check if valid pct values are passed in */
for (i = 0; i < cap_margin_levels; i++) {
if (val[i] <= 0 || val[i] > 100) {
ret = -EINVAL;
goto unlock_mutex;
}
}
/* ensure up pct is not less than dn pct */
if (data == &sysctl_sched_capacity_margin_up_pct[0]) {
for (i = 0; i < cap_margin_levels; i++) {
if (val[i] < sysctl_sched_capacity_margin_dn_pct[i]) {
ret = -EINVAL;
goto unlock_mutex;
}
}
} else {
for (i = 0; i < cap_margin_levels; i++) {
if (sysctl_sched_capacity_margin_up_pct[i] < val[i]) {
ret = -EINVAL;
goto unlock_mutex;
}
}
}
/* all values check out; update the data */
for (i = 0; i < cap_margin_levels; i++)
data[i] = val[i];
/* update individual cpu thresholds */
sched_update_updown_migrate_values(data == &sysctl_sched_capacity_margin_up_pct[0]);
unlock_mutex:
mutex_unlock(&mutex);
return ret;
}
#endif /* CONFIG_PROC_SYSCTL */
struct ctl_table input_boost_sysctls[] = {
{
.procname = "input_boost_ms",
.data = &sysctl_input_boost_ms,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred_thousand,
},
{
.procname = "input_boost_freq",
.data = &sysctl_input_boost_freq,
.maxlen = sizeof(unsigned int) * 8,
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
{
.procname = "sched_boost_on_input",
.data = &sysctl_sched_boost_on_input,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
{ }
};
struct ctl_table walt_table[] = {
{
.procname = "sched_init_stage",
.data = &sysctl_sched_init_stage,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = walt_init_stage_handler,
},
{
.procname = "sched_user_hint",
.data = &sysctl_sched_user_hint,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = walt_proc_user_hint_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = (void *)&sched_user_hint_max,
},
{
.procname = "sched_window_stats_policy",
.data = &sysctl_sched_window_stats_policy,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &four,
},
{
.procname = "sched_group_upmigrate",
.data = &sysctl_sched_group_upmigrate_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = walt_proc_group_thresholds_handler,
.extra1 = &sysctl_sched_group_downmigrate_pct,
},
{
.procname = "sched_group_downmigrate",
.data = &sysctl_sched_group_downmigrate_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = walt_proc_group_thresholds_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &sysctl_sched_group_upmigrate_pct,
},
{
.procname = "sched_boost",
.data = &sysctl_sched_boost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_boost_handler,
.extra1 = &neg_three,
.extra2 = &three,
},
{
.procname = "sched_conservative_pl",
.data = &sysctl_sched_conservative_pl,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "sched_many_wakeup_threshold",
.data = &sysctl_sched_many_wakeup_threshold,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &two,
.extra2 = &one_thousand,
},
{
.procname = "sched_walt_rotate_big_tasks",
.data = &sysctl_sched_walt_rotate_big_tasks,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "sched_min_task_util_for_boost",
.data = &sysctl_sched_min_task_util_for_boost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_thousand,
},
{
.procname = "sched_min_task_util_for_colocation",
.data = &sysctl_sched_min_task_util_for_colocation,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_thousand,
},
{
.procname = "sched_asym_cap_sibling_freq_match_pct",
.data = &sysctl_sched_asym_cap_sibling_freq_match_pct,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
.extra2 = &one_hundred,
},
{
.procname = "sched_coloc_downmigrate_ns",
.data = &sysctl_sched_coloc_downmigrate_ns,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
},
{
.procname = "sched_task_unfilter_period",
.data = &sysctl_sched_task_unfilter_period,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
.extra2 = &two_hundred_million,
},
{
.procname = "sched_busy_hysteresis_enable_cpus",
.data = &sysctl_sched_busy_hyst_enable_cpus,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_busy_hyst_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &two_hundred_fifty_five,
},
{
.procname = "sched_busy_hyst_ns",
.data = &sysctl_sched_busy_hyst,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_busy_hyst_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &ns_per_sec,
},
{
.procname = "sched_coloc_busy_hysteresis_enable_cpus",
.data = &sysctl_sched_coloc_busy_hyst_enable_cpus,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_busy_hyst_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &two_hundred_fifty_five,
},
{
.procname = "sched_coloc_busy_hyst_cpu_ns",
.data = &sysctl_sched_coloc_busy_hyst_cpu,
.maxlen = sizeof(unsigned int) * WALT_NR_CPUS,
.mode = 0644,
.proc_handler = sched_busy_hyst_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &ns_per_sec,
},
{
.procname = "sched_coloc_busy_hyst_max_ms",
.data = &sysctl_sched_coloc_busy_hyst_max_ms,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_busy_hyst_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred_thousand,
},
{
.procname = "sched_coloc_busy_hyst_cpu_busy_pct",
.data = &sysctl_sched_coloc_busy_hyst_cpu_busy_pct,
.maxlen = sizeof(unsigned int) * WALT_NR_CPUS,
.mode = 0644,
.proc_handler = sched_busy_hyst_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_hundred,
},
{
.procname = "sched_ravg_window_nr_ticks",
.data = &sysctl_sched_ravg_window_nr_ticks,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sched_ravg_window_handler,
},
{
.procname = "sched_dynamic_ravg_window_enable",
.data = &sysctl_sched_dynamic_ravg_window_enable,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "sched_upmigrate",
.data = &sysctl_sched_capacity_margin_up_pct,
.maxlen = sizeof(unsigned int) * MAX_MARGIN_LEVELS,
.mode = 0644,
.proc_handler = sched_updown_migrate_handler,
},
{
.procname = "sched_downmigrate",
.data = &sysctl_sched_capacity_margin_dn_pct,
.maxlen = sizeof(unsigned int) * MAX_MARGIN_LEVELS,
.mode = 0644,
.proc_handler = sched_updown_migrate_handler,
},
{
.procname = "sched_prefer_spread",
.data = &sysctl_sched_prefer_spread,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &four,
},
{
.procname = "walt_rtg_cfs_boost_prio",
.data = &sysctl_walt_rtg_cfs_boost_prio,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_cfs_boost_prio,
.extra2 = &max_cfs_boost_prio,
},
{
.procname = "walt_low_latency_task_threshold",
.data = &sysctl_walt_low_latency_task_threshold,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &one_thousand,
},
{
.procname = "sched_force_lb_enable",
.data = &sysctl_sched_force_lb_enable,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "sched_lib_name",
.data = sched_lib_name,
.maxlen = LIB_PATH_LENGTH,
.mode = 0644,
.proc_handler = proc_dostring,
},
{
.procname = "sched_lib_mask_force",
.data = &sched_lib_mask_force,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_douintvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &two_hundred_fifty_five,
},
{
.procname = "input_boost",
.mode = 0555,
.child = input_boost_sysctls,
},
{
.procname = "sched_wake_up_idle",
.data = (int *) WAKE_UP_IDLE,
.maxlen = sizeof(unsigned int) * 2,
.mode = 0644,
.proc_handler = sched_task_handler,
},
{
.procname = "sched_init_task_load",
.data = (int *) INIT_TASK_LOAD,
.maxlen = sizeof(unsigned int) * 2,
.mode = 0644,
.proc_handler = sched_task_handler,
},
{
.procname = "sched_group_id",
.data = (int *) GROUP_ID,
.maxlen = sizeof(unsigned int) * 2,
.mode = 0644,
.proc_handler = sched_task_handler,
},
{
.procname = "sched_per_task_boost",
.data = (int *) PER_TASK_BOOST,
.maxlen = sizeof(unsigned int) * 2,
.mode = 0644,
.proc_handler = sched_task_handler,
},
{
.procname = "sched_per_task_boost_period_ms",
.data = (int *) PER_TASK_BOOST_PERIOD_MS,
.maxlen = sizeof(unsigned int) * 2,
.mode = 0644,
.proc_handler = sched_task_handler,
},
{
.procname = "sched_low_latency",
.data = (int *) LOW_LATENCY,
.maxlen = sizeof(unsigned int) * 2,
.mode = 0644,
.proc_handler = sched_task_handler,
},
{
.procname = "sched_task_read_pid",
.data = &sysctl_task_read_pid,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "sched_load_boost",
.data = &sysctl_sched_load_boost,
.maxlen = sizeof(unsigned int) * 8,
.mode = 0644,
.proc_handler = sched_load_boost_handler,
},
{ }
};
struct ctl_table walt_base_table[] = {
{
.procname = "walt",
.mode = 0555,
.child = walt_table,
},
{ },
};
void walt_tunables(void)
{
int i;
for (i = 0; i < MAX_MARGIN_LEVELS; i++) {
sysctl_sched_capacity_margin_up_pct[i] = 95; /* ~5% margin */
sysctl_sched_capacity_margin_dn_pct[i] = 85; /* ~15% margin */
}
sysctl_sched_group_upmigrate_pct = 100;
sysctl_sched_group_downmigrate_pct = 95;
sysctl_sched_asym_cap_sibling_freq_match_pct = 100;
sysctl_sched_task_unfilter_period = 100000000;
sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG;
sysctl_sched_ravg_window_nr_ticks = (HZ / NR_WINDOWS_PER_SEC);
sysctl_sched_dynamic_ravg_window_enable = (HZ == 250);
sched_load_granule = DEFAULT_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES;
sysctl_sched_min_task_util_for_boost = 51;
sysctl_sched_min_task_util_for_colocation = 35;
for (i = 0; i < WALT_NR_CPUS; i++) {
sysctl_sched_coloc_busy_hyst_cpu[i] = 39000000;
sysctl_sched_coloc_busy_hyst_cpu_busy_pct[i] = 10;
}
sysctl_sched_coloc_busy_hyst_enable_cpus = 112;
sysctl_sched_coloc_busy_hyst_max_ms = 5000;
sysctl_walt_rtg_cfs_boost_prio = 99; /* disabled by default */
sched_ravg_window = DEFAULT_SCHED_RAVG_WINDOW;
sysctl_input_boost_ms = 40;
for (i = 0; i < 8; i++)
sysctl_input_boost_freq[i] = 0;
}
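For reference, the sched_ravg_window_nr_ticks values accepted by sched_ravg_window_handler() above (2, 3, 4, 5 or 8, and only when HZ == 250, i.e. a 4 ms tick) translate into WALT window lengths of a few milliseconds. A throwaway user-space sketch of that arithmetic:

/* Illustrative mapping from accepted tick counts to window lengths. */
#include <stdio.h>

int main(void)
{
	const unsigned int hz = 250;			/* enforced by the handler */
	const unsigned int valid_ticks[] = { 2, 3, 4, 5, 8 };

	for (unsigned int i = 0; i < sizeof(valid_ticks) / sizeof(valid_ticks[0]); i++)
		printf("%u ticks -> %u ms WALT window\n",
		       valid_ticks[i], valid_ticks[i] * (1000 / hz));
	return 0;
}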

84
kernel/sched/walt/trace.c Normal file
View File

@ -0,0 +1,84 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2019-2021, The Linux Foundation. All rights reserved.
*/
#include "walt.h"
static inline void __window_data(u32 *dst, u32 *src)
{
if (src)
memcpy(dst, src, nr_cpu_ids * sizeof(u32));
else
memset(dst, 0, nr_cpu_ids * sizeof(u32));
}
struct trace_seq;
const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len)
{
int i;
const char *ret = p->buffer + seq_buf_used(&p->seq);
for (i = 0; i < buf_len; i++)
trace_seq_printf(p, "%u ", buf[i]);
trace_seq_putc(p, 0);
return ret;
}
static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new)
{
struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
if (curr)
if (new)
return wrq->nt_curr_runnable_sum;
else
return wrq->curr_runnable_sum;
else
if (new)
return wrq->nt_prev_runnable_sum;
else
return wrq->prev_runnable_sum;
}
static inline s64 __grp_update_sum(struct rq *rq, bool curr, bool new)
{
struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
if (curr)
if (new)
return wrq->grp_time.nt_curr_runnable_sum;
else
return wrq->grp_time.curr_runnable_sum;
else
if (new)
return wrq->grp_time.nt_prev_runnable_sum;
else
return wrq->grp_time.prev_runnable_sum;
}
static inline s64
__get_update_sum(struct rq *rq, enum migrate_types migrate_type,
bool src, bool new, bool curr)
{
switch (migrate_type) {
case RQ_TO_GROUP:
if (src)
return __rq_update_sum(rq, curr, new);
else
return __grp_update_sum(rq, curr, new);
case GROUP_TO_RQ:
if (src)
return __grp_update_sum(rq, curr, new);
else
return __rq_update_sum(rq, curr, new);
default:
WARN_ON_ONCE(1);
return -EINVAL;
}
}
#define CREATE_TRACE_POINTS
#include "trace.h"

1097
kernel/sched/walt/trace.h Normal file

File diff suppressed because it is too large

4136
kernel/sched/walt/walt.c Normal file

File diff suppressed because it is too large

1006
kernel/sched/walt/walt.h Normal file

File diff suppressed because it is too large

View File

@ -0,0 +1,785 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
*/
#include <trace/hooks/sched.h>
#include <trace/hooks/binder.h>
#include "walt.h"
#include "trace.h"
#include "../../../drivers/android/binder_trace.h"
/* Migration margins */
unsigned int sched_capacity_margin_up[WALT_NR_CPUS] = {
[0 ... WALT_NR_CPUS-1] = 1078 /* ~5% margin */
};
unsigned int sched_capacity_margin_down[WALT_NR_CPUS] = {
[0 ... WALT_NR_CPUS-1] = 1205 /* ~15% margin */
};
__read_mostly unsigned int sysctl_sched_prefer_spread;
unsigned int sysctl_walt_rtg_cfs_boost_prio = 99; /* disabled by default */
unsigned int sched_small_task_threshold = 102;
__read_mostly unsigned int sysctl_sched_force_lb_enable = 1;
unsigned int capacity_margin_freq = 1280; /* ~20% margin */
static inline bool prefer_spread_on_idle(int cpu, bool new_ilb)
{
switch (sysctl_sched_prefer_spread) {
case 1:
return is_min_capacity_cpu(cpu);
case 2:
return true;
case 3:
return (new_ilb && is_min_capacity_cpu(cpu));
case 4:
return new_ilb;
default:
return false;
}
}
static inline bool
bias_to_this_cpu(struct task_struct *p, int cpu, int start_cpu)
{
bool base_test = cpumask_test_cpu(cpu, &p->cpus_mask) &&
cpu_active(cpu);
bool start_cap_test = (capacity_orig_of(cpu) >=
capacity_orig_of(start_cpu));
return base_test && start_cap_test;
}
static inline bool task_demand_fits(struct task_struct *p, int cpu)
{
unsigned long capacity = capacity_orig_of(cpu);
unsigned long max_capacity = max_possible_capacity;
if (capacity == max_capacity)
return true;
return task_fits_capacity(p, capacity, cpu);
}
struct find_best_target_env {
bool is_rtg;
int need_idle;
bool boosted;
int fastpath;
int start_cpu;
int order_index;
int end_index;
bool strict_max;
int skip_cpu;
};
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
* @p: the task which utilization should be discounted
*
* The utilization of a CPU is defined by the utilization of tasks currently
* enqueued on that CPU as well as tasks which are currently sleeping after an
* execution on that CPU.
*
* This method returns the utilization of the specified CPU by discounting the
* utilization of the specified task, whenever the task is currently
* contributing to the CPU utilization.
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
unsigned int util;
/*
* WALT does not decay idle tasks in the same manner
* as PELT, so it makes little sense to subtract task
* utilization from cpu utilization. Instead just use
* cpu_util for this case.
*/
if (likely(p->state == TASK_WAKING))
return cpu_util(cpu);
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util(cpu);
util = max_t(long, cpu_util(cpu) - task_util(p), 0);
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
* the cpu_util call.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
static inline bool walt_get_rtg_status(struct task_struct *p)
{
struct walt_related_thread_group *grp;
bool ret = false;
rcu_read_lock();
grp = task_related_thread_group(p);
if (grp)
ret = grp->skip_min;
rcu_read_unlock();
return ret;
}
static inline bool walt_task_skip_min_cpu(struct task_struct *p)
{
struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
return sched_boost() != CONSERVATIVE_BOOST &&
walt_get_rtg_status(p) && wts->unfilter;
}
static inline bool walt_is_many_wakeup(int sibling_count_hint)
{
return sibling_count_hint >= sysctl_sched_many_wakeup_threshold;
}
static inline bool walt_target_ok(int target_cpu, int order_index)
{
return !((order_index != num_sched_clusters - 1) &&
(cpumask_weight(&cpu_array[order_index][0]) == 1) &&
(target_cpu == cpumask_first(&cpu_array[order_index][0])));
}
static void walt_get_indicies(struct task_struct *p, int *order_index,
int *end_index, int task_boost, bool boosted)
{
int i = 0;
*order_index = 0;
*end_index = 0;
if (num_sched_clusters <= 1)
return;
if (task_boost > TASK_BOOST_ON_MID) {
*order_index = num_sched_clusters - 1;
return;
}
if (is_full_throttle_boost()) {
*order_index = num_sched_clusters - 1;
if ((*order_index > 1) && task_demand_fits(p,
cpumask_first(&cpu_array[*order_index][1])))
*end_index = 1;
return;
}
if (boosted || task_boost_policy(p) == SCHED_BOOST_ON_BIG ||
walt_task_skip_min_cpu(p))
*order_index = 1;
for (i = *order_index ; i < num_sched_clusters - 1; i++) {
if (task_demand_fits(p, cpumask_first(&cpu_array[i][0])))
break;
}
*order_index = i;
}
enum fastpaths {
NONE = 0,
SYNC_WAKEUP,
PREV_CPU_FASTPATH,
};
static void walt_find_best_target(struct sched_domain *sd,
cpumask_t *candidates,
struct task_struct *p,
struct find_best_target_env *fbt_env)
{
unsigned long min_util = uclamp_task_util(p);
long target_max_spare_cap = 0;
unsigned long best_idle_cuml_util = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
int best_idle_cpu = -1;
int target_cpu = -1;
int i, start_cpu;
long spare_wake_cap, most_spare_wake_cap = 0;
int most_spare_cap_cpu = -1;
int prev_cpu = task_cpu(p);
int active_candidate = -1;
int order_index = fbt_env->order_index, end_index = fbt_env->end_index;
int cluster;
unsigned int target_nr_rtg_high_prio = UINT_MAX;
bool rtg_high_prio_task = task_rtg_high_prio(p);
cpumask_t visit_cpus;
bool io_task_pack = (order_index > 0 && p->in_iowait);
struct cfs_rq *cfs_rq;
/* Find start CPU based on boost value */
start_cpu = fbt_env->start_cpu;
if (fbt_env->strict_max || io_task_pack)
target_max_spare_cap = LONG_MIN;
if (p->state == TASK_RUNNING)
most_spare_wake_cap = ULONG_MAX;
/* fast path for prev_cpu */
if (((capacity_orig_of(prev_cpu) == capacity_orig_of(start_cpu)) ||
asym_cap_siblings(prev_cpu, start_cpu)) &&
cpu_active(prev_cpu) && cpu_online(prev_cpu) &&
available_idle_cpu(prev_cpu)) {
target_cpu = prev_cpu;
fbt_env->fastpath = PREV_CPU_FASTPATH;
cpumask_set_cpu(target_cpu, candidates);
goto out;
}
for (cluster = 0; cluster < num_sched_clusters; cluster++) {
cpumask_and(&visit_cpus, &p->cpus_mask,
&cpu_array[order_index][cluster]);
for_each_cpu(i, &visit_cpus) {
unsigned long capacity_orig = capacity_orig_of(i);
unsigned long wake_util, new_util, new_util_cuml;
long spare_cap;
unsigned int idle_exit_latency = UINT_MAX;
trace_sched_cpu_util(i);
if (!cpu_active(i))
continue;
if (active_candidate == -1)
active_candidate = i;
/*
* This CPU is the target of an active migration that's
* yet to complete. Avoid placing another task on it.
*/
if (is_reserved(i))
continue;
if (sched_cpu_high_irqload(i))
continue;
if (fbt_env->skip_cpu == i)
continue;
/*
* p's blocked utilization is still accounted for on prev_cpu
* so prev_cpu will receive a negative bias due to the double
* accounting. However, the blocked utilization may be zero.
*/
wake_util = cpu_util_without(i, p);
new_util = wake_util + uclamp_task_util(p);
spare_wake_cap = capacity_orig - wake_util;
if (spare_wake_cap > most_spare_wake_cap) {
most_spare_wake_cap = spare_wake_cap;
most_spare_cap_cpu = i;
}
if ((per_task_boost(cpu_rq(i)->curr) ==
TASK_BOOST_STRICT_MAX) &&
!fbt_env->strict_max)
continue;
/* get rq's utilization with this task included */
cfs_rq = &cpu_rq(i)->cfs;
new_util_cuml = READ_ONCE(cfs_rq->avg.util_avg) + min_util;
/*
* Ensure minimum capacity to grant the required boost.
* The target CPU can be already at a capacity level higher
* than the one required to boost the task.
*/
new_util = max(min_util, new_util);
if (!(fbt_env->strict_max || io_task_pack) &&
new_util > capacity_orig)
continue;
/*
* Pre-compute the maximum possible capacity we expect
* to have available on this CPU once the task is
* enqueued here.
*/
spare_cap = capacity_orig - new_util;
/*
* Find an optimal backup IDLE CPU for non latency
* sensitive tasks.
*
* Looking for:
* - favoring shallowest idle states
* i.e. avoid to wakeup deep-idle CPUs
*
* The following code path is used by non latency
* sensitive tasks if IDLE CPUs are available. If at
* least one of such CPUs are available it sets the
* best_idle_cpu to the most suitable idle CPU to be
* selected.
*
* If idle CPUs are available, favour these CPUs to
* improve performances by spreading tasks.
* Indeed, the energy_diff() computed by the caller
* will take care to ensure the minimization of energy
* consumptions without affecting performance.
*/
if (available_idle_cpu(i)) {
idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(i));
/*
* Prefer shallowest over deeper idle state cpu,
* of same capacity cpus.
*/
if (idle_exit_latency > min_exit_latency)
continue;
if (min_exit_latency == idle_exit_latency &&
(best_idle_cpu == prev_cpu ||
(i != prev_cpu &&
new_util_cuml > best_idle_cuml_util)))
continue;
min_exit_latency = idle_exit_latency;
best_idle_cuml_util = new_util_cuml;
best_idle_cpu = i;
continue;
}
/*
* Consider only idle CPUs for active migration.
*/
if (p->state == TASK_RUNNING)
continue;
/*
* Try to spread the rtg high prio tasks so that they
* don't preempt each other. This is an optimistic
* check assuming rtg high prio can actually preempt
* the current running task with the given vruntime
* boost.
*/
if (rtg_high_prio_task) {
if (walt_nr_rtg_high_prio(i) > target_nr_rtg_high_prio)
continue;
/* Favor CPUs with maximum spare capacity */
if (walt_nr_rtg_high_prio(i) == target_nr_rtg_high_prio &&
spare_cap < target_max_spare_cap)
continue;
} else {
/* Favor CPUs with maximum spare capacity */
if (spare_cap < target_max_spare_cap)
continue;
}
target_max_spare_cap = spare_cap;
target_nr_rtg_high_prio = walt_nr_rtg_high_prio(i);
target_cpu = i;
}
if (best_idle_cpu != -1)
break;
if ((cluster >= end_index) && (target_cpu != -1) &&
walt_target_ok(target_cpu, order_index))
break;
}
if (best_idle_cpu != -1)
target_cpu = -1;
/*
* We set both idle and target as long as they are valid CPUs.
* If we don't find either, then we fall back to most_spare_cap.
* If we don't find most spare cap, we fall back to prev_cpu,
* provided that the prev_cpu is active.
* If the prev_cpu is not active, we fall back to active_candidate.
*/
if (unlikely(target_cpu == -1)) {
if (best_idle_cpu != -1)
target_cpu = best_idle_cpu;
else if (most_spare_cap_cpu != -1)
target_cpu = most_spare_cap_cpu;
else if (!cpu_active(prev_cpu))
target_cpu = active_candidate;
}
if (target_cpu != -1)
cpumask_set_cpu(target_cpu, candidates);
if (best_idle_cpu != -1 && target_cpu != best_idle_cpu)
cpumask_set_cpu(best_idle_cpu, candidates);
out:
trace_sched_find_best_target(p, min_util, start_cpu,
best_idle_cpu, most_spare_cap_cpu,
target_cpu, order_index, end_index,
fbt_env->skip_cpu, p->state == TASK_RUNNING);
}
static inline unsigned long
cpu_util_next_walt(int cpu, struct task_struct *p, int dst_cpu)
{
struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1;
unsigned long util = wrq->walt_stats.cumulative_runnable_avg_scaled;
bool queued = task_on_rq_queued(p);
/*
* When task is queued,
* (a) The evaluating CPU (cpu) is task's current CPU. If the
* task is migrating, discount the task contribution from the
* evaluation cpu.
* (b) The evaluating CPU (cpu) is task's current CPU. If the
* task is NOT migrating, nothing to do. The contribution is
* already present on the evaluation CPU.
* (c) The evaluating CPU (cpu) is not task's current CPU. But
* the task is migrating to the evaluating CPU. So add the
* task contribution to it.
* (d) The evaluating CPU (cpu) is neither the current CPU nor
* the destination CPU. don't care.
*
* When task is NOT queued i.e waking. Task contribution is not
* present on any CPU.
*
* (a) If the evaluating CPU is the destination CPU, add the task
* contribution.
* (b) The evaluation CPU is not the destination CPU, don't care.
*/
if (unlikely(queued)) {
if (task_cpu(p) == cpu) {
if (dst_cpu != cpu)
util = max_t(long, util - task_util(p), 0);
} else if (dst_cpu == cpu) {
util += task_util(p);
}
} else if (dst_cpu == cpu) {
util += task_util(p);
}
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of @pd's CPUs after the task migration, and uses the Energy Model
* to compute what would be the energy if we decided to actually migrate that
* task.
*/
static long
compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
struct cpumask *pd_mask = perf_domain_span(pd);
unsigned long max_util = 0, sum_util = 0;
int cpu;
unsigned long cpu_util;
/*
* The capacity state of CPUs of the current rd can be driven by CPUs
* of another rd if they belong to the same pd. So, account for the
* utilization of these CPUs too by masking pd with cpu_online_mask
* instead of the rd span.
*
* If an entire pd is outside of the current rd, it will not appear in
* its pd list and will not be accounted by compute_energy().
*/
for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
cpu_util = cpu_util_next_walt(cpu, p, dst_cpu);
sum_util += cpu_util;
max_util = max(max_util, cpu_util);
}
return em_cpu_energy(pd->em_pd, max_util, sum_util);
}
static inline long
walt_compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
{
long energy = 0;
for (; pd; pd = pd->next)
energy += compute_energy(p, dst_cpu, pd);
return energy;
}
static inline int wake_to_idle(struct task_struct *p)
{
struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
struct walt_task_struct *cur_wts =
(struct walt_task_struct *) current->android_vendor_data1;
return (cur_wts->wake_up_idle || wts->wake_up_idle);
}
/* return true if cpu should be chosen over best_energy_cpu */
static inline bool select_cpu_same_energy(int cpu, int best_cpu, int prev_cpu)
{
if (capacity_orig_of(cpu) < capacity_orig_of(best_cpu))
return true;
if (best_cpu == prev_cpu)
return false;
if (available_idle_cpu(best_cpu) && walt_get_idle_exit_latency(cpu_rq(best_cpu)) <= 1)
return false; /* best_cpu is idle wfi or shallower */
if (available_idle_cpu(cpu) && walt_get_idle_exit_latency(cpu_rq(cpu)) <= 1)
return true; /* new cpu is idle wfi or shallower */
/*
* If we are this far this must be a tie between a busy and deep idle,
* pick the busy.
*/
return available_idle_cpu(best_cpu);
}
static DEFINE_PER_CPU(cpumask_t, energy_cpus);
int walt_find_energy_efficient_cpu(struct task_struct *p, int prev_cpu,
int sync, int sibling_count_hint)
{
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
int weight, cpu = smp_processor_id(), best_energy_cpu = prev_cpu;
struct perf_domain *pd;
unsigned long cur_energy;
cpumask_t *candidates;
bool is_rtg, curr_is_rtg;
struct find_best_target_env fbt_env;
bool need_idle = wake_to_idle(p) || uclamp_latency_sensitive(p);
u64 start_t = 0;
int delta = 0;
int task_boost = per_task_boost(p);
bool is_uclamp_boosted = uclamp_boosted(p);
bool boosted = is_uclamp_boosted || (task_boost > 0);
int start_cpu, order_index, end_index;
if (walt_is_many_wakeup(sibling_count_hint) && prev_cpu != cpu &&
cpumask_test_cpu(prev_cpu, &p->cpus_mask))
return prev_cpu;
if (unlikely(!cpu_array))
return -EPERM;
walt_get_indicies(p, &order_index, &end_index, task_boost, boosted);
start_cpu = cpumask_first(&cpu_array[order_index][0]);
is_rtg = task_in_related_thread_group(p);
curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr);
fbt_env.fastpath = 0;
fbt_env.need_idle = need_idle;
if (trace_sched_task_util_enabled())
start_t = sched_clock();
/* Pre-select a set of candidate CPUs. */
candidates = this_cpu_ptr(&energy_cpus);
cpumask_clear(candidates);
if (sync && (need_idle || (is_rtg && curr_is_rtg)))
sync = 0;
if (sync && bias_to_this_cpu(p, cpu, start_cpu)) {
best_energy_cpu = cpu;
fbt_env.fastpath = SYNC_WAKEUP;
goto done;
}
rcu_read_lock();
pd = rcu_dereference(rd->pd);
if (!pd)
goto fail;
fbt_env.is_rtg = is_rtg;
fbt_env.start_cpu = start_cpu;
fbt_env.order_index = order_index;
fbt_env.end_index = end_index;
fbt_env.boosted = boosted;
fbt_env.strict_max = is_rtg &&
(task_boost == TASK_BOOST_STRICT_MAX);
fbt_env.skip_cpu = walt_is_many_wakeup(sibling_count_hint) ?
cpu : -1;
walt_find_best_target(NULL, candidates, p, &fbt_env);
/* Bail out if no candidate was found. */
weight = cpumask_weight(candidates);
if (!weight)
goto unlock;
/* If there is only one sensible candidate, select it now. */
cpu = cpumask_first(candidates);
if (weight == 1 && (available_idle_cpu(cpu) || cpu == prev_cpu)) {
best_energy_cpu = cpu;
goto unlock;
}
if (p->state == TASK_WAKING)
delta = task_util(p);
if (task_placement_boost_enabled(p) || fbt_env.need_idle ||
boosted || is_rtg || __cpu_overutilized(prev_cpu, delta) ||
!task_fits_max(p, prev_cpu) || !cpu_active(prev_cpu)) {
best_energy_cpu = cpu;
goto unlock;
}
if (cpumask_test_cpu(prev_cpu, &p->cpus_mask))
prev_delta = best_delta =
walt_compute_energy(p, prev_cpu, pd);
else
prev_delta = best_delta = ULONG_MAX;
/* Select the best candidate energy-wise. */
for_each_cpu(cpu, candidates) {
if (cpu == prev_cpu)
continue;
cur_energy = walt_compute_energy(p, cpu, pd);
trace_sched_compute_energy(p, cpu, cur_energy,
prev_delta, best_delta, best_energy_cpu);
if (cur_energy < best_delta) {
best_delta = cur_energy;
best_energy_cpu = cpu;
} else if (cur_energy == best_delta) {
if (select_cpu_same_energy(cpu, best_energy_cpu,
prev_cpu)) {
best_delta = cur_energy;
best_energy_cpu = cpu;
}
}
}
unlock:
rcu_read_unlock();
/*
* Pick the prev CPU if the best energy CPU does not save at least
* ~6% of the energy estimated for prev_cpu.
*/
if (!(available_idle_cpu(best_energy_cpu) &&
walt_get_idle_exit_latency(cpu_rq(best_energy_cpu)) <= 1) &&
(prev_delta != ULONG_MAX) && (best_energy_cpu != prev_cpu) &&
((prev_delta - best_delta) <= prev_delta >> 4) &&
(capacity_orig_of(prev_cpu) <= capacity_orig_of(start_cpu)))
best_energy_cpu = prev_cpu;
done:
trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu,
sync, fbt_env.need_idle, fbt_env.fastpath,
task_boost_policy(p), start_t, boosted, is_rtg,
walt_get_rtg_status(p), start_cpu);
return best_energy_cpu;
fail:
rcu_read_unlock();
return -EPERM;
}
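One detail worth calling out from the function above: the final comparison (prev_delta - best_delta) <= prev_delta >> 4 keeps the previous CPU unless the candidate saves more than roughly 1/16 (~6.25%) of the energy estimated for prev_cpu, which is what the "~6%" comment refers to. A trivial user-space illustration with made-up energy numbers:

/* Sketch of the "stick with prev_cpu" energy check. */
#include <stdio.h>

int main(void)
{
	unsigned long prev_delta = 1000;	/* estimated energy on prev_cpu */
	unsigned long best_delta = 950;		/* best candidate: only 5% lower */

	if ((prev_delta - best_delta) <= (prev_delta >> 4))
		printf("savings %lu <= %lu: keep prev_cpu\n",
		       prev_delta - best_delta, prev_delta >> 4);
	else
		printf("candidate saves enough energy: migrate\n");
	return 0;
}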
static void
walt_select_task_rq_fair(void *unused, struct task_struct *p, int prev_cpu,
int sd_flag, int wake_flags, int *target_cpu)
{
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
int sibling_count_hint = p->wake_q_head ? p->wake_q_head->count : 1;
*target_cpu = walt_find_energy_efficient_cpu(p, prev_cpu, sync, sibling_count_hint);
if (unlikely(*target_cpu < 0))
*target_cpu = prev_cpu;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long task_h_load(struct task_struct *p)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
}
#else
static unsigned long task_h_load(struct task_struct *p)
{
return p->se.avg.load_avg;
}
#endif
static void walt_update_misfit_status(void *unused, struct task_struct *p,
struct rq *rq, bool *need_update)
{
*need_update = false;
if (!p) {
rq->misfit_task_load = 0;
return;
}
if (task_fits_max(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
/*
* Make sure that misfit_task_load will not be null even if
* task_h_load() returns 0.
*/
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
static inline struct task_struct *task_of(struct sched_entity *se)
{
return container_of(se, struct task_struct, se);
}
static void walt_place_entity(void *unused, struct sched_entity *se, u64 *vruntime)
{
if (entity_is_task(se)) {
unsigned long thresh = sysctl_sched_latency;
/*
* Halve their sleep time's effect, to allow
* for a gentler effect of sleepers:
*/
if (sched_feat(GENTLE_FAIR_SLEEPERS))
thresh >>= 1;
if ((per_task_boost(task_of(se)) == TASK_BOOST_STRICT_MAX) ||
walt_low_latency_task(task_of(se)) ||
task_rtg_high_prio(task_of(se))) {
*vruntime -= sysctl_sched_latency;
*vruntime -= thresh;
se->vruntime = *vruntime;
}
}
}
static void walt_binder_low_latency_set(void *unused, struct task_struct *task)
{
struct walt_task_struct *wts = (struct walt_task_struct *) task->android_vendor_data1;
if (task && current->signal &&
(current->signal->oom_score_adj == 0) &&
(current->prio < DEFAULT_PRIO))
wts->low_latency = true;
}
static void walt_binder_low_latency_clear(void *unused, struct binder_transaction *t)
{
struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1;
if (wts->low_latency)
wts->low_latency = false;
}
void walt_cfs_init(void)
{
register_trace_android_rvh_select_task_rq_fair(walt_select_task_rq_fair, NULL);
register_trace_android_rvh_update_misfit_status(walt_update_misfit_status, NULL);
register_trace_android_rvh_place_entity(walt_place_entity, NULL);
register_trace_android_vh_binder_wakeup_ilocked(walt_binder_low_latency_set, NULL);
register_trace_binder_transaction_received(walt_binder_low_latency_clear, NULL);
}

View File

@ -0,0 +1,50 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
*/
#include <linux/module.h>
#include <trace/hooks/sched.h>
#include "walt.h"
#include "walt_debug.h"
static void dump_throttled_rt_tasks(void *unused, int cpu, u64 clock,
ktime_t rt_period, u64 rt_runtime, s64 rt_period_timer_expires)
{
printk_deferred("sched: RT throttling activated for cpu %d\n", cpu);
printk_deferred("rt_period_timer: expires=%lld now=%llu runtime=%llu period=%llu\n",
rt_period_timer_expires, ktime_get_ns(), rt_runtime, rt_period);
printk_deferred("potential CPU hogs:\n");
#ifdef CONFIG_SCHED_INFO
if (sched_info_on())
printk_deferred("current %s (%d) is running for %llu nsec\n",
current->comm, current->pid,
clock - current->sched_info.last_arrival);
#endif
BUG();
}
static void android_rvh_schedule_bug(void *unused, void *unused2)
{
BUG();
}
static int __init walt_debug_init(void)
{
int ret;
ret = preemptirq_long_init();
if (ret)
return ret;
register_trace_android_vh_dump_throttled_rt_tasks(dump_throttled_rt_tasks, NULL);
register_trace_android_rvh_schedule_bug(android_rvh_schedule_bug, NULL);
return 0;
}
module_init(walt_debug_init);
MODULE_DESCRIPTION("QTI WALT Debug Module");
MODULE_LICENSE("GPL v2");

View File

@ -0,0 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2021, The Linux Foundation. All rights reserved.
*/
int preemptirq_long_init(void);

742
kernel/sched/walt/walt_lb.c Normal file
View File

@ -0,0 +1,742 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
*/
#include <trace/hooks/sched.h>
#include "walt.h"
#include "trace.h"
extern u64 sched_ktime_clock(void); // TODO
static void walt_detach_task(struct task_struct *p, struct rq *src_rq,
struct rq *dst_rq)
{
deactivate_task(src_rq, p, 0);
double_lock_balance(src_rq, dst_rq);
if (!(src_rq->clock_update_flags & RQCF_UPDATED))
update_rq_clock(src_rq);
set_task_cpu(p, dst_rq->cpu);
double_unlock_balance(src_rq, dst_rq);
}
static void walt_attach_task(struct task_struct *p, struct rq *rq)
{
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
}
static int walt_lb_active_migration(void *data)
{
struct rq *busiest_rq = data;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct walt_rq *wrq = (struct walt_rq *) busiest_rq->android_vendor_data1;
struct task_struct *push_task = wrq->push_task;
int push_task_detached = 0;
raw_spin_lock_irq(&busiest_rq->lock);
/* sanity checks before initiating the pull */
if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
goto out_unlock;
if (unlikely(busiest_cpu != raw_smp_processor_id() ||
!busiest_rq->active_balance))
goto out_unlock;
if (busiest_rq->nr_running <= 1)
goto out_unlock;
BUG_ON(busiest_rq == target_rq);
if (task_on_rq_queued(push_task) &&
push_task->state == TASK_RUNNING &&
task_cpu(push_task) == busiest_cpu &&
cpu_active(target_cpu)) {
walt_detach_task(push_task, busiest_rq, target_rq);
push_task_detached = 1;
}
out_unlock: /* called with busiest_rq lock */
busiest_rq->active_balance = 0;
target_cpu = busiest_rq->push_cpu;
clear_reserved(target_cpu);
wrq->push_task = NULL;
raw_spin_unlock(&busiest_rq->lock);
if (push_task_detached) {
raw_spin_lock(&target_rq->lock);
walt_attach_task(push_task, target_rq);
raw_spin_unlock(&target_rq->lock);
}
put_task_struct(push_task);
local_irq_enable();
return 0;
}
struct walt_lb_rotate_work {
struct work_struct w;
struct task_struct *src_task;
struct task_struct *dst_task;
int src_cpu;
int dst_cpu;
};
DEFINE_PER_CPU(struct walt_lb_rotate_work, walt_lb_rotate_works);
static void walt_lb_rotate_work_func(struct work_struct *work)
{
struct walt_lb_rotate_work *wr = container_of(work,
struct walt_lb_rotate_work, w);
migrate_swap(wr->src_task, wr->dst_task, wr->dst_cpu, wr->src_cpu);
put_task_struct(wr->src_task);
put_task_struct(wr->dst_task);
clear_reserved(wr->src_cpu);
clear_reserved(wr->dst_cpu);
}
static void walt_lb_rotate_work_init(void)
{
int i;
for_each_possible_cpu(i) {
struct walt_lb_rotate_work *wr = &per_cpu(walt_lb_rotate_works, i);
INIT_WORK(&wr->w, walt_lb_rotate_work_func);
}
}
#define WALT_ROTATION_THRESHOLD_NS 16000000
static void walt_lb_check_for_rotation(struct rq *src_rq)
{
u64 wc, wait, max_wait = 0, run, max_run = 0;
int deserved_cpu = nr_cpu_ids, dst_cpu = nr_cpu_ids;
int i, src_cpu = cpu_of(src_rq);
struct rq *dst_rq;
struct walt_lb_rotate_work *wr = NULL;
struct walt_task_struct *wts;
if (!is_min_capacity_cpu(src_cpu))
return;
wc = sched_ktime_clock();
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
if (!is_min_capacity_cpu(i))
break;
if (is_reserved(i))
continue;
if (!rq->misfit_task_load)
continue;
wts = (struct walt_task_struct *) rq->curr->android_vendor_data1;
wait = wc - wts->last_enqueued_ts;
if (wait > max_wait) {
max_wait = wait;
deserved_cpu = i;
}
}
if (deserved_cpu != src_cpu)
return;
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
if (is_min_capacity_cpu(i))
continue;
if (is_reserved(i))
continue;
if (rq->curr->prio < MAX_RT_PRIO)
continue;
if (rq->nr_running > 1)
continue;
wts = (struct walt_task_struct *) rq->curr->android_vendor_data1;
run = wc - wts->last_enqueued_ts;
if (run < WALT_ROTATION_THRESHOLD_NS)
continue;
if (run > max_run) {
max_run = run;
dst_cpu = i;
}
}
if (dst_cpu == nr_cpu_ids)
return;
dst_rq = cpu_rq(dst_cpu);
double_rq_lock(src_rq, dst_rq);
if (dst_rq->curr->prio >= MAX_RT_PRIO && dst_rq->curr != dst_rq->idle &&
src_rq->curr->prio >= MAX_RT_PRIO && src_rq->curr != src_rq->idle) {
get_task_struct(src_rq->curr);
get_task_struct(dst_rq->curr);
mark_reserved(src_cpu);
mark_reserved(dst_cpu);
wr = &per_cpu(walt_lb_rotate_works, src_cpu);
wr->src_task = src_rq->curr;
wr->dst_task = dst_rq->curr;
wr->src_cpu = src_cpu;
wr->dst_cpu = dst_cpu;
}
double_rq_unlock(src_rq, dst_rq);
if (wr)
queue_work_on(src_cpu, system_highpri_wq, &wr->w);
}
static inline bool _walt_can_migrate_task(struct task_struct *p, int dst_cpu,
bool to_lower)
{
struct walt_rq *wrq = (struct walt_rq *) task_rq(p)->android_vendor_data1;
if (to_lower) {
if (p->in_iowait)
return false;
if (per_task_boost(p) == TASK_BOOST_STRICT_MAX &&
task_in_related_thread_group(p))
return false;
}
/* Don't detach task if it is under active migration */
if (wrq->push_task == p)
return false;
return true;
}
static inline bool need_active_lb(struct task_struct *p, int dst_cpu,
int src_cpu)
{
struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
if (cpu_rq(src_cpu)->active_balance)
return false;
if (capacity_orig_of(dst_cpu) <= capacity_orig_of(src_cpu))
return false;
if (!wts->misfit)
return false;
return true;
}
static int walt_lb_pull_tasks(int dst_cpu, int src_cpu)
{
struct rq *dst_rq = cpu_rq(dst_cpu);
struct rq *src_rq = cpu_rq(src_cpu);
unsigned long flags;
struct task_struct *pulled_task = NULL, *p;
bool active_balance = false, to_lower;
struct walt_rq *wrq = (struct walt_rq *) src_rq->android_vendor_data1;
struct walt_task_struct *wts;
BUG_ON(src_cpu == dst_cpu);
to_lower = capacity_orig_of(dst_cpu) < capacity_orig_of(src_cpu);
raw_spin_lock_irqsave(&src_rq->lock, flags);
list_for_each_entry_reverse(p, &src_rq->cfs_tasks, se.group_node) {
if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr))
continue;
if (!_walt_can_migrate_task(p, dst_cpu, to_lower))
continue;
if (task_running(src_rq, p)) {
if (need_active_lb(p, dst_cpu, src_cpu)) {
active_balance = true;
break;
}
continue;
}
walt_detach_task(p, src_rq, dst_rq);
pulled_task = p;
break;
}
if (active_balance) {
src_rq->active_balance = 1;
src_rq->push_cpu = dst_cpu;
get_task_struct(p);
wrq->push_task = p;
mark_reserved(dst_cpu);
}
/* lock must be dropped before waking the stopper */
raw_spin_unlock_irqrestore(&src_rq->lock, flags);
/*
* Using our custom active load balance callback so that
* the push_task is really pulled onto this CPU.
*/
if (active_balance) {
wts = (struct walt_task_struct *) p->android_vendor_data1;
trace_walt_active_load_balance(p, src_cpu, dst_cpu, wts);
stop_one_cpu_nowait(src_cpu, walt_lb_active_migration,
src_rq, &src_rq->active_balance_work);
return 0; /* we did not pull any task here */
}
if (!pulled_task)
return 0;
raw_spin_lock_irqsave(&dst_rq->lock, flags);
walt_attach_task(p, dst_rq);
raw_spin_unlock_irqrestore(&dst_rq->lock, flags);
return 1; /* we pulled 1 task */
}
static int walt_lb_find_busiest_similar_cap_cpu(int dst_cpu, const cpumask_t *src_mask)
{
int i;
int busiest_cpu = -1;
int busiest_nr = 1; /* we need at least 2 */
unsigned long util, busiest_util = 0;
struct walt_rq *wrq;
for_each_cpu(i, src_mask) {
wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
trace_walt_lb_cpu_util(i, wrq);
if (cpu_rq(i)->cfs.h_nr_running < 2)
continue;
util = cpu_util(i);
if (util < busiest_util)
continue;
busiest_nr = cpu_rq(i)->cfs.h_nr_running;
busiest_util = util;
busiest_cpu = i;
}
return busiest_cpu;
}
#define SMALL_TASK_THRESHOLD 102
static int walt_lb_find_busiest_higher_cap_cpu(int dst_cpu, const cpumask_t *src_mask)
{
int i;
int busiest_cpu = -1;
int busiest_nr = 1; /* we need at least 2 */
unsigned long util, busiest_util = 0;
unsigned long total_capacity = 0, total_util = 0, total_nr = 0;
int total_cpus = 0;
struct walt_rq *wrq;
for_each_cpu(i, src_mask) {
if (!cpu_active(i))
continue;
wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
trace_walt_lb_cpu_util(i, wrq);
util = cpu_util(i);
total_cpus += 1;
total_util += util;
total_capacity += capacity_orig_of(i);
total_nr += cpu_rq(i)->cfs.h_nr_running;
if (cpu_rq(i)->cfs.h_nr_running < 2)
continue;
if (cpu_rq(i)->cfs.h_nr_running == 2 &&
task_util(cpu_rq(i)->curr) < SMALL_TASK_THRESHOLD)
continue;
/*
* During rotation, two silver fmax tasks get
* placed on gold/prime and the CPU may not be
* overutilized, but for rotation we have to
* spread out.
*/
if (!walt_rotation_enabled && !cpu_overutilized(i))
continue;
if (util < busiest_util)
continue;
busiest_nr = cpu_rq(i)->cfs.h_nr_running;
busiest_util = util;
busiest_cpu = i;
}
/*
* Don't allow migrating to lower cluster unless this high
* capacity cluster is sufficiently loaded.
*/
if (!walt_rotation_enabled) {
if (total_nr <= total_cpus || total_util * 1280 < total_capacity * 1024)
busiest_cpu = -1;
}
return busiest_cpu;
}
static int walt_lb_find_busiest_lower_cap_cpu(int dst_cpu, const cpumask_t *src_mask)
{
int i;
int busiest_cpu = -1;
int busiest_nr = 1; /* we need at least 2 */
unsigned long util, busiest_util = 0;
unsigned long total_capacity = 0, total_util = 0, total_nr = 0;
int total_cpus = 0;
int busy_nr_big_tasks = 0;
struct walt_rq *wrq;
/*
* A higher capacity CPU is looking at a lower capacity
* cluster. Active balance and big tasks are in play.
* Other than that, it is very much the same as above; we
* really don't need this as a separate block and will
* refactor it after final testing is done.
*/
for_each_cpu(i, src_mask) {
wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
if (!cpu_active(i))
continue;
trace_walt_lb_cpu_util(i, wrq);
util = cpu_util(i);
total_cpus += 1;
total_util += util;
total_capacity += capacity_orig_of(i);
total_nr += cpu_rq(i)->cfs.h_nr_running;
/*
 * No point in selecting this CPU as busy while active
 * balance is already in progress on it.
 */
if (cpu_rq(i)->active_balance)
continue;
if (cpu_rq(i)->cfs.h_nr_running < 2 && !wrq->walt_stats.nr_big_tasks)
continue;
if (!walt_rotation_enabled && !cpu_overutilized(i))
continue;
if (util < busiest_util)
continue;
busiest_nr = cpu_rq(i)->cfs.h_nr_running;
busiest_util = util;
busiest_cpu = i;
busy_nr_big_tasks = wrq->walt_stats.nr_big_tasks;
}
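/*
 * Same ~80% utilization cut-off as above, except that rotation or
 * big tasks on the selected busiest CPU let the pull proceed
 * regardless of overall load.
 */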
if (!walt_rotation_enabled && !busy_nr_big_tasks) {
if (total_nr <= total_cpus || total_util * 1280 < total_capacity * 1024)
busiest_cpu = -1;
}
return busiest_cpu;
}
static int walt_lb_find_busiest_cpu(int dst_cpu, const cpumask_t *src_mask)
{
int fsrc_cpu = cpumask_first(src_mask);
int busiest_cpu;
if (capacity_orig_of(dst_cpu) == capacity_orig_of(fsrc_cpu))
busiest_cpu = walt_lb_find_busiest_similar_cap_cpu(dst_cpu,
src_mask);
else if (capacity_orig_of(dst_cpu) < capacity_orig_of(fsrc_cpu))
busiest_cpu = walt_lb_find_busiest_lower_cap_cpu(dst_cpu,
src_mask);
else
busiest_cpu = walt_lb_find_busiest_higher_cap_cpu(dst_cpu,
src_mask);
return busiest_cpu;
}
static DEFINE_RAW_SPINLOCK(walt_lb_migration_lock);
static void walt_lb_tick(void *unused, struct rq *rq)
{
int prev_cpu = rq->cpu, new_cpu, ret;
struct task_struct *p = rq->curr;
unsigned long flags;
struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1;
struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1;
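/*
 * From the tick we only act on the currently running misfit task
 * (or trigger rotation); anything else is ignored here.
 */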
if (!rq->misfit_task_load)
return;
if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1)
return;
raw_spin_lock_irqsave(&walt_lb_migration_lock, flags);
if (walt_rotation_enabled) {
walt_lb_check_for_rotation(rq);
goto out_unlock;
}
rcu_read_lock();
new_cpu = walt_find_energy_efficient_cpu(p, prev_cpu, 0, 1);
rcu_read_unlock();
if (new_cpu < 0 || same_cluster(new_cpu, prev_cpu))
goto out_unlock;
raw_spin_lock(&rq->lock);
if (rq->active_balance) {
raw_spin_unlock(&rq->lock);
goto out_unlock;
}
rq->active_balance = 1;
rq->push_cpu = new_cpu;
get_task_struct(p);
wrq->push_task = p;
raw_spin_unlock(&rq->lock);
mark_reserved(new_cpu);
raw_spin_unlock_irqrestore(&walt_lb_migration_lock, flags);
trace_walt_active_load_balance(p, prev_cpu, new_cpu, wts);
ret = stop_one_cpu_nowait(prev_cpu,
walt_lb_active_migration, rq,
&rq->active_balance_work);
if (!ret)
clear_reserved(new_cpu);
else
wake_up_if_idle(new_cpu);
return;
out_unlock:
raw_spin_unlock_irqrestore(&walt_lb_migration_lock, flags);
}
static void walt_newidle_balance(void *unused, struct rq *this_rq,
struct rq_flags *rf, int *pulled_task,
int *done)
{
int this_cpu = this_rq->cpu;
struct walt_rq *wrq = (struct walt_rq *) this_rq->android_vendor_data1;
int order_index = wrq->cluster->id;
int cluster = 0;
int busy_cpu;
if (unlikely(!cpu_array))
return;
/*
 * Newly idle load balance is handled completely here, so
 * set done to make the caller skip its own load balance.
 */
*done = 1;
*pulled_task = 0;
/*
* This CPU is about to enter idle, so clear the
* misfit_task_load and mark the idle stamp.
*/
this_rq->misfit_task_load = 0;
this_rq->idle_stamp = rq_clock(this_rq);
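/*
 * Bail out early if this CPU is going inactive or if the root
 * domain is not marked overloaded; there is nothing to pull.
 */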
if (!cpu_active(this_cpu))
return;
if (!READ_ONCE(this_rq->rd->overload))
return;
rq_unpin_lock(this_rq, rf);
raw_spin_unlock(&this_rq->lock);
/*
 * Careful: we dropped the rq lock and it must be re-acquired
 * before returning. While the lock is dropped, tasks can be
 * queued remotely, so keep checking nr_running and bail out
 * if it becomes non-zero.
 */
do {
busy_cpu = walt_lb_find_busiest_cpu(this_cpu,
&cpu_array[order_index][cluster]);
/* busy_cpu is the busiest source CPU, or -1 if none was found. */
if (busy_cpu != -1 || this_rq->nr_running > 0)
break;
} while (++cluster < num_sched_clusters);
/* sanity checks before attempting the pull */
if (busy_cpu == -1 || this_rq->nr_running > 0 || (busy_cpu == this_cpu))
goto out;
*pulled_task = walt_lb_pull_tasks(this_cpu, busy_cpu);
out:
raw_spin_lock(&this_rq->lock);
if (this_rq->cfs.h_nr_running && !*pulled_task)
*pulled_task = 1;
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
*pulled_task = -1;
/* reset the idle time stamp if we pulled any task */
if (*pulled_task)
this_rq->idle_stamp = 0;
rq_repin_lock(this_rq, rf);
trace_walt_newidle_balance(this_cpu, busy_cpu, *pulled_task);
}
static void walt_find_busiest_queue(void *unused, int dst_cpu,
struct sched_group *group,
struct cpumask *env_cpus,
struct rq **busiest, int *done)
{
int fsrc_cpu = group_first_cpu(group);
int busiest_cpu = -1;
struct cpumask src_mask;
*done = 1;
*busiest = NULL;
/*
 * Same cluster means there will be only one CPU in the
 * busy group, so just select it.
 */
if (same_cluster(dst_cpu, fsrc_cpu)) {
busiest_cpu = fsrc_cpu;
goto done;
}
/*
 * We allow inter-cluster migrations only if the source
 * group is sufficiently loaded; the upstream load
 * balancer is a bit more generous.
 *
 * Re-use the same code as the newly idle load balance,
 * so the policies remain the same.
 */
cpumask_and(&src_mask, sched_group_span(group), env_cpus);
busiest_cpu = walt_lb_find_busiest_cpu(dst_cpu, &src_mask);
done:
if (busiest_cpu != -1)
*busiest = cpu_rq(busiest_cpu);
trace_walt_find_busiest_queue(dst_cpu, busiest_cpu, src_mask.bits[0]);
}
static void walt_migrate_queued_task(void *unused, struct rq *rq,
struct rq_flags *rf,
struct task_struct *p,
int new_cpu, int *detached)
{
/*
 * WALT expects both the source and destination rq locks to
 * be held when set_task_cpu() is called on a queued task,
 * hence this detach hook. Unpin the lock before detaching
 * and repin it afterwards to keep lockdep happy.
 */
BUG_ON(!rf);
rq_unpin_lock(rq, rf);
walt_detach_task(p, rq, cpu_rq(new_cpu));
rq_repin_lock(rq, rf);
*detached = 1;
}
/*
 * We only decide whether a nohz balance kick is needed. The
 * first CPU in nohz.idle will come out of idle and load
 * balance on behalf of every CPU. Adding another hook to pick
 * which CPU to kick is not useful: we only get to kick once,
 * so most of the time it is impossible to decide which CPU
 * should come out.
 */
static void walt_nohz_balancer_kick(void *unused, struct rq *rq,
unsigned int *flags, int *done)
{
*done = 1;
/*
 * The tick path migration takes care of misfit tasks, so
 * here we only have to check for nr_running >= 2.
 */
if (rq->nr_running >= 2 && cpu_overutilized(rq->cpu)) {
*flags = NOHZ_KICK_MASK;
trace_walt_nohz_balance_kick(rq);
}
}
static void walt_can_migrate_task(void *unused, struct task_struct *p,
int dst_cpu, int *can_migrate)
{
bool to_lower;
to_lower = capacity_orig_of(dst_cpu) < capacity_orig_of(task_cpu(p));
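/* to_lower: the destination CPU has less capacity than the task's current CPU. */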
if (_walt_can_migrate_task(p, dst_cpu, to_lower))
return;
*can_migrate = 0;
}
/*
 * When WALT becomes a module, this init is called from another
 * file, so we do not have to define module_init() here.
 */
void walt_lb_init(void)
{
/*
 * Any task movement outside task placement is considered load
 * balance, so the tick path and rotation code live here. We
 * also use our custom active load balance stopper function
 * instead of adding hooks to active_load_balance_cpu_stop()
 * in fair.c.
 */
walt_lb_rotate_work_init();
register_trace_android_rvh_migrate_queued_task(walt_migrate_queued_task, NULL);
register_trace_android_rvh_sched_nohz_balancer_kick(walt_nohz_balancer_kick, NULL);
register_trace_android_rvh_can_migrate_task(walt_can_migrate_task, NULL);
register_trace_android_rvh_find_busiest_queue(walt_find_busiest_queue, NULL);
register_trace_android_rvh_sched_newidle_balance(walt_newidle_balance, NULL);
/*
 * TODO:
 * The scheduler tick is not a restricted hook, so multiple
 * entities can register for it. WALT registers only one hook,
 * which will call our load balancer function later.
 */
register_trace_android_vh_scheduler_tick(walt_lb_tick, NULL);
}

View File

@ -0,0 +1,90 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
*/
#include <trace/hooks/sched.h>
#include "walt.h"
#include "trace.h"
static void rt_energy_aware_wake_cpu(void *unused, struct task_struct *task,
struct cpumask *lowest_mask, int ret, int *best_cpu)
{
int cpu;
unsigned long util, best_cpu_util = ULONG_MAX;
unsigned long best_cpu_util_cum = ULONG_MAX;
unsigned long util_cum;
unsigned long tutil = task_util(task);
unsigned int best_idle_exit_latency = UINT_MAX;
unsigned int cpu_idle_exit_latency = UINT_MAX;
bool boost_on_big = rt_boost_on_big();
int cluster;
int order_index = (boost_on_big && num_sched_clusters > 1) ? 1 : 0;
if (!ret)
return; /* No targets found */
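/*
 * Walk the clusters in the order given by order_index (biased
 * towards the big clusters when rt_boost_on_big() is set) and
 * stop at the first cluster that yields a suitable CPU.
 */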
rcu_read_lock();
for (cluster = 0; cluster < num_sched_clusters; cluster++) {
for_each_cpu_and(cpu, lowest_mask, &cpu_array[order_index][cluster]) {
trace_sched_cpu_util(cpu);
if (!cpu_active(cpu))
continue;
if (sched_cpu_high_irqload(cpu))
continue;
if (__cpu_overutilized(cpu, tutil))
continue;
util = cpu_util(cpu);
/* Find the least loaded CPU */
if (util > best_cpu_util)
continue;
/*
 * If the previous CPU has the same load, keep it as
 * best_cpu.
 */
if (best_cpu_util == util && *best_cpu == task_cpu(task))
continue;
/*
 * If the candidate CPU is the previous CPU, select it.
 * Otherwise, if its load is the same as best_cpu's and it
 * is in a shallower C-state, select it. If all of the above
 * are equal, select the CPU with the least cumulative
 * window demand.
 */
cpu_idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(cpu));
util_cum = cpu_util_cum(cpu, 0);
if (cpu != task_cpu(task) && best_cpu_util == util) {
if (best_idle_exit_latency < cpu_idle_exit_latency)
continue;
if (best_idle_exit_latency == cpu_idle_exit_latency &&
best_cpu_util_cum < util_cum)
continue;
}
best_idle_exit_latency = cpu_idle_exit_latency;
best_cpu_util_cum = util_cum;
best_cpu_util = util;
*best_cpu = cpu;
}
if (*best_cpu != -1)
break;
}
rcu_read_unlock();
}
void walt_rt_init(void)
{
register_trace_android_rvh_find_lowest_rq(rt_energy_aware_wake_cpu, NULL);
}

View File

@ -101,5 +101,3 @@ obj-$(CONFIG_IPC_LOGGING) += qcom_ipc_logging.o
qcom_ipc_logging-y := ipc_logging.o ipc_logging_debug.o
libftrace-y := ftrace.o
obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += preemptirq_long.o

View File

@ -81,3 +81,4 @@ memory_dump_v2.ko
llcc-qcom.ko
qcom_edac.ko
kryo_arm64_edac.ko
qcom-cpufreq-hw.ko