diff --git a/Kconfig b/Kconfig index 745bc773f567..0b1735fb2bd1 100644 --- a/Kconfig +++ b/Kconfig @@ -30,3 +30,5 @@ source "lib/Kconfig" source "lib/Kconfig.debug" source "Documentation/Kconfig" + +source "kernel/sched/walt/Kconfig" diff --git a/arch/arm64/configs/vendor/lahaina_GKI.config b/arch/arm64/configs/vendor/lahaina_GKI.config index 073378b4dfa7..dd14f850b869 100644 --- a/arch/arm64/configs/vendor/lahaina_GKI.config +++ b/arch/arm64/configs/vendor/lahaina_GKI.config @@ -230,3 +230,4 @@ CONFIG_EDAC_QCOM=m CONFIG_EDAC_QCOM_LLCC_PANIC_ON_UE=y # CONFIG_EDAC_QCOM_LLCC_PANIC_ON_CE is not set CONFIG_MSM_BOOT_STATS=m +CONFIG_ARM_QCOM_CPUFREQ_HW=m diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h index 9718136488e9..3f249e150c0c 100644 --- a/include/trace/events/preemptirq.h +++ b/include/trace/events/preemptirq.h @@ -3,9 +3,6 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM preemptirq -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH trace/events - #if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PREEMPTIRQ_H diff --git a/include/trace/hooks/restricted_preemptirq.h b/include/trace/hooks/restricted_preemptirq.h deleted file mode 100644 index 238c3e665136..000000000000 --- a/include/trace/hooks/restricted_preemptirq.h +++ /dev/null @@ -1,56 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2020, The Linux Foundation. All rights reserved. - */ -#if !defined(_TRACE_RESTRICTED_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_RESTRICTED_PREEMPTIRQ_H - -#ifdef CONFIG_PREEMPTIRQ_TRACEPOINTS - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM restricted_preemptirq - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH trace/hooks - -#include -#include - -#ifdef CONFIG_TRACE_IRQFLAGS -DECLARE_RESTRICTED_HOOK(restricted_irq_disable, - TP_PROTO(unsigned long ip, unsigned long parent_ip), - TP_ARGS(ip, parent_ip), 1); - -DECLARE_RESTRICTED_HOOK(restricted_irq_enable, - TP_PROTO(unsigned long ip, unsigned long parent_ip), - TP_ARGS(ip, parent_ip), 1); - -#else -#define trace_restricted_irq_enable(ip, parent_ip) -#define trace_restricted_irq_disable(ip, parent_ip) -#endif /* CONFIG_TRACE_IRQFLAGS */ - -#ifdef CONFIG_TRACE_PREEMPT_TOGGLE -DECLARE_RESTRICTED_HOOK(restricted_preempt_disable, - TP_PROTO(unsigned long ip, unsigned long parent_ip), - TP_ARGS(ip, parent_ip), 1); - -DECLARE_RESTRICTED_HOOK(restricted_preempt_enable, - TP_PROTO(unsigned long ip, unsigned long parent_ip), - TP_ARGS(ip, parent_ip), 1); - -#else -#define trace_restricted_preempt_enable(ip, parent_ip) -#define trace_restricted_preempt_disable(ip, parent_ip) -#endif /* CONFIG_TRACE_PREEMPT_TOGGLE */ - -#include - -#else /* ! CONFIG_PREEMPTIRQ_TRACEPOINTS */ -#define trace_restricted_irq_enable(...) -#define trace_restricted_irq_disable(...) -#define trace_restricted_preempt_enable(...) -#define trace_restricted_preempt_disable(...) -#endif /* ! 
CONFIG_PREEMPTIRQ_TRACEPOINTS */ - -#endif /* TRACE_RESTRICTED_PREEMPTIRQ_H || TRACE_HEADER_MULTI_READ */ diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index ae4474b880a9..9165ff834856 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -26,7 +26,6 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o -obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o @@ -37,3 +36,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_WALT) += walt/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f35fe449fd59..549ef790212a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4333,10 +4333,6 @@ static noinline void __schedule_bug(struct task_struct *prev) if (panic_on_warn) panic("scheduling while atomic\n"); -#if defined(CONFIG_PANIC_ON_SCHED_BUG) && defined(CONFIG_SCHED_WALT) - BUG(); -#endif - trace_android_rvh_schedule_bug(NULL); dump_stack(); @@ -7199,9 +7195,6 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); -#ifdef CONFIG_SCHED_WALT - cpumask_copy(¤t->wts.cpus_requested, cpu_possible_mask); -#endif sched_init_granularity(); @@ -7490,9 +7483,6 @@ void ___might_sleep(const char *file, int line, int preempt_offset) pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } -#ifdef CONFIG_PANIC_ON_SCHED_BUG - BUG(); -#endif trace_android_rvh_schedule_bug(NULL); @@ -8648,7 +8638,7 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_uclamp_ls_read_u64, .write_u64 = cpu_uclamp_ls_write_u64, }, -#endif /* CONFIG_UCLAMP_TASK_GROUP */ +#endif { } /* terminate */ }; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b06d66e57816..5d4fa723b0d8 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -83,6 +83,7 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; EXPORT_SYMBOL_GPL(sched_feat_keys); + #undef SCHED_FEAT static void sched_feat_disable(int i) diff --git a/kernel/sched/walt/Kconfig b/kernel/sched/walt/Kconfig new file mode 100644 index 000000000000..db60efdf4c8d --- /dev/null +++ b/kernel/sched/walt/Kconfig @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# QTI WALT based scheduler +# +menu "QTI WALT based scheduler features" + +config SCHED_WALT + tristate "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. + +config SCHED_WALT_DEBUG + tristate "WALT debug module" + select TRACE_PREEMPT_TOGGLE + select TRACE_IRQFLAGS + help + This module provides the means of debugging long preempt and + irq disable code. This helps in identifying the scheduling + latencies. The module rely on preemptirq trace hooks and + print the stacktrace to the ftrace upon long preempt and irq + events. Sysctl knobs are available for the user to configure + the thresholds. 
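As an illustration of the mechanism this help text describes (not part of the patch), here is a minimal sketch of a debug probe built on the preemptirq tracepoints declared in include/trace/events/preemptirq.h. The probe names and the fixed threshold are hypothetical; the real module would take its thresholds from the sysctl knobs mentioned above and do considerably more (stack traces, preempt-off tracking, optional panic).

#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched/clock.h>
#include <trace/events/preemptirq.h>

/* hypothetical fixed threshold; the real module exposes this via sysctl */
static u64 irq_disable_threshold_ns = 500000;
static DEFINE_PER_CPU(u64, irq_disabled_ts);

static void probe_irq_disable(void *unused, unsigned long ip,
			      unsigned long parent_ip)
{
	/* remember when interrupts were disabled on this CPU */
	this_cpu_write(irq_disabled_ts, sched_clock());
}

static void probe_irq_enable(void *unused, unsigned long ip,
			     unsigned long parent_ip)
{
	u64 delta = sched_clock() - this_cpu_read(irq_disabled_ts);

	/* report long irqs-off sections to the ftrace buffer */
	if (delta > irq_disable_threshold_ns)
		trace_printk("irqs off for %llu ns (%ps <- %ps)\n",
			     delta, (void *)ip, (void *)parent_ip);
}

static int __init preemptirq_dbg_init(void)
{
	register_trace_irq_disable(probe_irq_disable, NULL);
	register_trace_irq_enable(probe_irq_enable, NULL);
	return 0;
}
module_init(preemptirq_dbg_init);
MODULE_LICENSE("GPL");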
+ + This module also used to crash the system to catch issues + in scenarios like RT throttling and sleeping while in atomic + context etc. +endmenu diff --git a/kernel/sched/walt/Makefile b/kernel/sched/walt/Makefile new file mode 100644 index 000000000000..34987fd075d2 --- /dev/null +++ b/kernel/sched/walt/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only + +KCOV_INSTRUMENT := n +KCSAN_SANITIZE := n + +obj-$(CONFIG_SCHED_WALT) += sched-walt.o +sched-walt-$(CONFIG_SCHED_WALT) := walt.o boost.o sched_avg.o qc_vas.o core_ctl.o trace.o input-boost.o sysctl.o cpufreq_walt.o fixup.o walt_lb.o walt_rt.o walt_cfs.o + +obj-$(CONFIG_SCHED_WALT_DEBUG) += sched-walt-debug.o +sched-walt-debug-$(CONFIG_SCHED_WALT_DEBUG) := walt_debug.o preemptirq_long.o diff --git a/kernel/sched/walt/boost.c b/kernel/sched/walt/boost.c new file mode 100644 index 000000000000..17305aa10029 --- /dev/null +++ b/kernel/sched/walt/boost.c @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2012-2021, The Linux Foundation. All rights reserved. + */ + +#include +#include + +#include "walt.h" +#include "trace.h" + +/* + * Scheduler boost is a mechanism to temporarily place tasks on CPUs + * with higher capacity than those where a task would have normally + * ended up with their load characteristics. Any entity enabling + * boost is responsible for disabling it as well. + */ + +static enum sched_boost_policy boost_policy_dt = SCHED_BOOST_NONE; +static DEFINE_MUTEX(boost_mutex); + +struct task_group *task_group_topapp; +struct task_group *task_group_foreground; + +void walt_init_tg(struct task_group *tg) +{ + struct walt_task_group *wtg; + + wtg = (struct walt_task_group *) tg->android_vendor_data1; + + wtg->colocate = false; + wtg->sched_boost_enable[NO_BOOST] = false; + wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true; + wtg->sched_boost_enable[CONSERVATIVE_BOOST] = false; + wtg->sched_boost_enable[RESTRAINED_BOOST] = false; +} + +void walt_init_topapp_tg(struct task_group *tg) +{ + struct walt_task_group *wtg; + + wtg = (struct walt_task_group *) tg->android_vendor_data1; + + wtg->colocate = true; + wtg->sched_boost_enable[NO_BOOST] = false; + wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true; + wtg->sched_boost_enable[CONSERVATIVE_BOOST] = true; + wtg->sched_boost_enable[RESTRAINED_BOOST] = false; +} + +void walt_init_foreground_tg(struct task_group *tg) +{ + struct walt_task_group *wtg; + + wtg = (struct walt_task_group *) tg->android_vendor_data1; + + wtg->colocate = false; + wtg->sched_boost_enable[NO_BOOST] = false; + wtg->sched_boost_enable[FULL_THROTTLE_BOOST] = true; + wtg->sched_boost_enable[CONSERVATIVE_BOOST] = true; + wtg->sched_boost_enable[RESTRAINED_BOOST] = false; +} + +/* + * Scheduler boost type and boost policy might at first seem unrelated, + * however, there exists a connection between them that will allow us + * to use them interchangeably during placement decisions. We'll explain + * the connection here in one possible way so that the implications are + * clear when looking at placement policies. + * + * When policy = SCHED_BOOST_NONE, type is either none or RESTRAINED + * When policy = SCHED_BOOST_ON_ALL or SCHED_BOOST_ON_BIG, type can + * neither be none nor RESTRAINED. 
+ */ +static void set_boost_policy(int type) +{ + if (type == NO_BOOST || type == RESTRAINED_BOOST) { + boost_policy = SCHED_BOOST_NONE; + return; + } + + if (boost_policy_dt) { + boost_policy = boost_policy_dt; + return; + } + + if (hmp_capable()) { + boost_policy = SCHED_BOOST_ON_BIG; + return; + } + + boost_policy = SCHED_BOOST_ON_ALL; +} + +static bool verify_boost_params(int type) +{ + return type >= RESTRAINED_BOOST_DISABLE && type <= RESTRAINED_BOOST; +} + +static void sched_no_boost_nop(void) +{ +} + +static void sched_full_throttle_boost_enter(void) +{ + core_ctl_set_boost(true); + walt_enable_frequency_aggregation(true); +} + +static void sched_full_throttle_boost_exit(void) +{ + core_ctl_set_boost(false); + walt_enable_frequency_aggregation(false); +} + +static void sched_conservative_boost_enter(void) +{ +} + +static void sched_conservative_boost_exit(void) +{ +} + +static void sched_restrained_boost_enter(void) +{ + walt_enable_frequency_aggregation(true); +} + +static void sched_restrained_boost_exit(void) +{ + walt_enable_frequency_aggregation(false); +} + +struct sched_boost_data { + int refcount; + void (*enter)(void); + void (*exit)(void); +}; + +static struct sched_boost_data sched_boosts[] = { + [NO_BOOST] = { + .refcount = 0, + .enter = sched_no_boost_nop, + .exit = sched_no_boost_nop, + }, + [FULL_THROTTLE_BOOST] = { + .refcount = 0, + .enter = sched_full_throttle_boost_enter, + .exit = sched_full_throttle_boost_exit, + }, + [CONSERVATIVE_BOOST] = { + .refcount = 0, + .enter = sched_conservative_boost_enter, + .exit = sched_conservative_boost_exit, + }, + [RESTRAINED_BOOST] = { + .refcount = 0, + .enter = sched_restrained_boost_enter, + .exit = sched_restrained_boost_exit, + }, +}; + +#define SCHED_BOOST_START FULL_THROTTLE_BOOST +#define SCHED_BOOST_END (RESTRAINED_BOOST + 1) + +static int sched_effective_boost(void) +{ + int i; + + /* + * The boosts are sorted in descending order by + * priority. + */ + for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) { + if (sched_boosts[i].refcount >= 1) + return i; + } + + return NO_BOOST; +} + +static void sched_boost_disable(int type) +{ + struct sched_boost_data *sb = &sched_boosts[type]; + int next_boost; + + if (sb->refcount <= 0) + return; + + sb->refcount--; + + if (sb->refcount) + return; + + /* + * This boost's refcount becomes zero, so it must + * be disabled. Disable it first and then apply + * the next boost. + */ + sb->exit(); + + next_boost = sched_effective_boost(); + sched_boosts[next_boost].enter(); +} + +static void sched_boost_enable(int type) +{ + struct sched_boost_data *sb = &sched_boosts[type]; + int next_boost, prev_boost = sched_boost_type; + + sb->refcount++; + + if (sb->refcount != 1) + return; + + /* + * This boost enable request did not come before. + * Take this new request and find the next boost + * by aggregating all the enabled boosts. If there + * is a change, disable the previous boost and enable + * the next boost. 
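To make the refcounting described here concrete, below is a hypothetical in-kernel client (not from the patch) of the sched_set_boost() interface defined further down in this file. A positive type takes a reference on that boost, the negated type drops it (assuming the usual convention that RESTRAINED_BOOST_DISABLE == -RESTRAINED_BOOST), and 0 clears every boost; enter()/exit() handlers only run when the effective boost actually changes.

/* hypothetical client; the boost type constants come from walt.h */
static void thermal_mitigation_start(void)
{
	/* take a reference on RESTRAINED_BOOST */
	sched_set_boost(RESTRAINED_BOOST);
}

static void thermal_mitigation_stop(void)
{
	/* negative type (RESTRAINED_BOOST_DISABLE) drops the reference */
	sched_set_boost(-RESTRAINED_BOOST);
}

Userspace reaches the same path through the sched_boost sysctl, which sched_boost_handler() below forwards to _sched_set_boost() under the same mutex.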
+ */ + + next_boost = sched_effective_boost(); + if (next_boost == prev_boost) + return; + + sched_boosts[prev_boost].exit(); + sched_boosts[next_boost].enter(); +} + +static void sched_boost_disable_all(void) +{ + int i; + + for (i = SCHED_BOOST_START; i < SCHED_BOOST_END; i++) { + if (sched_boosts[i].refcount > 0) { + sched_boosts[i].exit(); + sched_boosts[i].refcount = 0; + } + } +} + +static void _sched_set_boost(int type) +{ + if (type == 0) + sched_boost_disable_all(); + else if (type > 0) + sched_boost_enable(type); + else + sched_boost_disable(-type); + + /* + * sysctl_sched_boost holds the boost request from + * user space which could be different from the + * effectively enabled boost. Update the effective + * boost here. + */ + + sched_boost_type = sched_effective_boost(); + sysctl_sched_boost = sched_boost_type; + set_boost_policy(sysctl_sched_boost); + trace_sched_set_boost(sysctl_sched_boost); +} + +int sched_set_boost(int type) +{ + int ret = 0; + + mutex_lock(&boost_mutex); + if (verify_boost_params(type)) + _sched_set_boost(type); + else + ret = -EINVAL; + mutex_unlock(&boost_mutex); + return ret; +} + +int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int *data = (unsigned int *)table->data; + + mutex_lock(&boost_mutex); + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto done; + + if (verify_boost_params(*data)) + _sched_set_boost(*data); + else + ret = -EINVAL; + +done: + mutex_unlock(&boost_mutex); + return ret; +} diff --git a/kernel/sched/walt/core_ctl.c b/kernel/sched/walt/core_ctl.c new file mode 100644 index 000000000000..1330e07b04f7 --- /dev/null +++ b/kernel/sched/walt/core_ctl.c @@ -0,0 +1,1307 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2014-2021, The Linux Foundation. All rights reserved. 
+ */ + +#define pr_fmt(fmt) "core_ctl: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "walt.h" +#include "trace.h" + +struct cluster_data { + bool inited; + unsigned int min_cpus; + unsigned int max_cpus; + unsigned int offline_delay_ms; + unsigned int busy_up_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int busy_down_thres[MAX_CPUS_PER_CLUSTER]; + unsigned int active_cpus; + unsigned int num_cpus; + unsigned int nr_paused_cpus; + unsigned int nr_not_preferred_cpus; + cpumask_t cpu_mask; + unsigned int need_cpus; + unsigned int task_thres; + unsigned int max_nr; + unsigned int nr_prev_assist; + unsigned int nr_prev_assist_thresh; + s64 need_ts; + struct list_head lru; + bool enable; + int nrrun; + unsigned int first_cpu; + unsigned int boost; + struct kobject kobj; + unsigned int strict_nrrun; +}; + +struct cpu_data { + bool is_busy; + unsigned int busy; + unsigned int cpu; + bool not_preferred; + struct cluster_data *cluster; + struct list_head sib; + bool paused_by_us; +}; + +static DEFINE_PER_CPU(struct cpu_data, cpu_state); +static struct cluster_data cluster_state[MAX_CLUSTERS]; +static unsigned int num_clusters; + +#define for_each_cluster(cluster, idx) \ + for (; (idx) < num_clusters && ((cluster) = &cluster_state[idx]);\ + idx++) + +/* single core_ctl thread for all pause/unpause core_ctl operations */ +struct task_struct *core_ctl_thread; + +/* single lock per single thread for core_ctl + * protects core_ctl_pending flag + */ +spinlock_t core_ctl_pending_lock; +bool core_ctl_pending; + +static DEFINE_SPINLOCK(state_lock); +static void apply_need(struct cluster_data *state); +static void wake_up_core_ctl_thread(void); +static bool initialized; + +ATOMIC_NOTIFIER_HEAD(core_ctl_notifier); +static unsigned int last_nr_big; + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster); + +/* ========================= sysfs interface =========================== */ + +static ssize_t store_min_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->min_cpus = min(val, state->num_cpus); + apply_need(state); + + return count; +} + +static ssize_t show_min_cpus(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->min_cpus); +} + +static ssize_t store_max_cpus(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->max_cpus = min(val, state->num_cpus); + apply_need(state); + + return count; +} + +static ssize_t show_max_cpus(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->max_cpus); +} + +static ssize_t store_offline_delay_ms(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->offline_delay_ms = val; + apply_need(state); + + return count; +} + +static ssize_t show_task_thres(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->task_thres); +} + +static ssize_t store_task_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + if (val < state->num_cpus) + return -EINVAL; + + state->task_thres = val; + apply_need(state); + + return count; +} + +static ssize_t 
show_nr_prev_assist_thresh(const struct cluster_data *state, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->nr_prev_assist_thresh); +} + +static ssize_t store_nr_prev_assist_thresh(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + state->nr_prev_assist_thresh = val; + apply_need(state); + + return count; +} + +static ssize_t show_offline_delay_ms(const struct cluster_data *state, + char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->offline_delay_ms); +} + +static ssize_t store_busy_up_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_up_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_up_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += scnprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_up_thres[i]); + + count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_busy_down_thres(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val[MAX_CPUS_PER_CLUSTER]; + int ret, i; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != 1 && ret != state->num_cpus) + return -EINVAL; + + if (ret == 1) { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[0]; + } else { + for (i = 0; i < state->num_cpus; i++) + state->busy_down_thres[i] = val[i]; + } + apply_need(state); + return count; +} + +static ssize_t show_busy_down_thres(const struct cluster_data *state, char *buf) +{ + int i, count = 0; + + for (i = 0; i < state->num_cpus; i++) + count += scnprintf(buf + count, PAGE_SIZE - count, "%u ", + state->busy_down_thres[i]); + + count += scnprintf(buf + count, PAGE_SIZE - count, "\n"); + return count; +} + +static ssize_t store_enable(struct cluster_data *state, + const char *buf, size_t count) +{ + unsigned int val; + bool bval; + + if (sscanf(buf, "%u\n", &val) != 1) + return -EINVAL; + + bval = !!val; + if (bval != state->enable) { + state->enable = bval; + apply_need(state); + } + + return count; +} + +static ssize_t show_enable(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->enable); +} + +static ssize_t show_need_cpus(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->need_cpus); +} + +static ssize_t show_active_cpus(const struct cluster_data *state, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%u\n", state->active_cpus); +} + +static ssize_t show_global_state(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + struct cluster_data *cluster; + ssize_t count = 0; + unsigned int cpu; + + spin_lock_irq(&state_lock); + for_each_possible_cpu(cpu) { + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + if (!cluster || !cluster->inited) + continue; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU%u\n", cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tCPU: 
%u\n", c->cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tOnline: %u\n", + cpu_online(c->cpu)); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tPaused: %u\n", + !cpu_active(c->cpu)); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tFirst CPU: %u\n", + cluster->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tBusy%%: %u\n", c->busy); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tIs busy: %u\n", c->is_busy); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tNot preferred: %u\n", + c->not_preferred); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tNr running: %u\n", cluster->nrrun); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tActive CPUs: %u\n", get_active_cpu_count(cluster)); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tNeed CPUs: %u\n", cluster->need_cpus); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tNr paused CPUs: %u\n", + cluster->nr_paused_cpus); + count += scnprintf(buf + count, PAGE_SIZE - count, + "\tBoost: %u\n", (unsigned int) cluster->boost); + } + spin_unlock_irq(&state_lock); + + return count; +} + +static ssize_t store_not_preferred(struct cluster_data *state, + const char *buf, size_t count) +{ + struct cpu_data *c; + unsigned int i; + unsigned int val[MAX_CPUS_PER_CLUSTER]; + unsigned long flags; + int ret; + int not_preferred_count = 0; + + ret = sscanf(buf, "%u %u %u %u %u %u\n", + &val[0], &val[1], &val[2], &val[3], + &val[4], &val[5]); + if (ret != state->num_cpus) + return -EINVAL; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + c->not_preferred = val[i]; + not_preferred_count += !!val[i]; + } + state->nr_not_preferred_cpus = not_preferred_count; + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +static ssize_t show_not_preferred(const struct cluster_data *state, char *buf) +{ + struct cpu_data *c; + ssize_t count = 0; + unsigned long flags; + int i; + + spin_lock_irqsave(&state_lock, flags); + for (i = 0; i < state->num_cpus; i++) { + c = &per_cpu(cpu_state, i + state->first_cpu); + count += scnprintf(buf + count, PAGE_SIZE - count, + "CPU#%d: %u\n", c->cpu, c->not_preferred); + } + spin_unlock_irqrestore(&state_lock, flags); + + return count; +} + +struct core_ctl_attr { + struct attribute attr; + ssize_t (*show)(const struct cluster_data *cd, char *c); + ssize_t (*store)(struct cluster_data *cd, const char *c, + size_t count); +}; + +#define core_ctl_attr_ro(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0444, show_##_name, NULL) + +#define core_ctl_attr_rw(_name) \ +static struct core_ctl_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +core_ctl_attr_rw(min_cpus); +core_ctl_attr_rw(max_cpus); +core_ctl_attr_rw(offline_delay_ms); +core_ctl_attr_rw(busy_up_thres); +core_ctl_attr_rw(busy_down_thres); +core_ctl_attr_rw(task_thres); +core_ctl_attr_rw(nr_prev_assist_thresh); +core_ctl_attr_ro(need_cpus); +core_ctl_attr_ro(active_cpus); +core_ctl_attr_ro(global_state); +core_ctl_attr_rw(not_preferred); +core_ctl_attr_rw(enable); + +static struct attribute *default_attrs[] = { + &min_cpus.attr, + &max_cpus.attr, + &offline_delay_ms.attr, + &busy_up_thres.attr, + &busy_down_thres.attr, + &task_thres.attr, + &nr_prev_assist_thresh.attr, + &enable.attr, + &need_cpus.attr, + &active_cpus.attr, + &global_state.attr, + ¬_preferred.attr, + NULL +}; + +#define to_cluster_data(k) container_of(k, struct cluster_data, 
kobj) +#define to_attr(a) container_of(a, struct core_ctl_attr, attr) +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->show) + ret = cattr->show(data, buf); + + return ret; +} + +static ssize_t store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct cluster_data *data = to_cluster_data(kobj); + struct core_ctl_attr *cattr = to_attr(attr); + ssize_t ret = -EIO; + + if (cattr->store) + ret = cattr->store(data, buf, count); + + return ret; +} + +static const struct sysfs_ops sysfs_ops = { + .show = show, + .store = store, +}; + +static struct kobj_type ktype_core_ctl = { + .sysfs_ops = &sysfs_ops, + .default_attrs = default_attrs, +}; + +/* ==================== runqueue based core count =================== */ + +static struct sched_avg_stats nr_stats[WALT_NR_CPUS]; + +/* + * nr_need: + * Number of tasks running on this cluster plus + * tasks running on higher capacity clusters. + * To find out CPUs needed from this cluster. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 4 small tasks running on min capacity CPUs + * and 2 big tasks running on 2 max capacity + * CPUs, nr_need has to be 6 for min capacity + * cluster and 2 for max capacity cluster. + * This is because, min capacity cluster has to + * account for tasks running on max capacity + * cluster, so that, the min capacity cluster + * can be ready to accommodate tasks running on max + * capacity CPUs if the demand of tasks goes down. + */ +static int compute_cluster_nr_need(int index) +{ + int cpu; + struct cluster_data *cluster; + int nr_need = 0; + + for_each_cluster(cluster, index) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_need += nr_stats[cpu].nr; + } + + return nr_need; +} + +/* + * prev_misfit_need: + * Tasks running on smaller capacity cluster which + * needs to be migrated to higher capacity cluster. + * To find out how many tasks need higher capacity CPUs. + * + * For example: + * On dual cluster system with 4 min capacity + * CPUs and 4 max capacity CPUs, if there are + * 2 small tasks and 2 big tasks running on + * min capacity CPUs and no tasks running on + * max cpacity, prev_misfit_need of min capacity + * cluster will be 0 and prev_misfit_need of + * max capacity cluster will be 2. + */ +static int compute_prev_cluster_misfit_need(int index) +{ + int cpu; + struct cluster_data *prev_cluster; + int prev_misfit_need = 0; + + /* + * Lowest capacity cluster does not have to + * accommodate any misfit tasks. 
+ */ + if (index == 0) + return 0; + + prev_cluster = &cluster_state[index - 1]; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + prev_misfit_need += nr_stats[cpu].nr_misfit; + + return prev_misfit_need; +} + +static int compute_cluster_max_nr(int index) +{ + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + int max_nr = 0; + + for_each_cpu(cpu, &cluster->cpu_mask) + max_nr = max(max_nr, nr_stats[cpu].nr_max); + + return max_nr; +} + +static int cluster_real_big_tasks(int index) +{ + int nr_big = 0; + int cpu; + struct cluster_data *cluster = &cluster_state[index]; + + if (index == 0) { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr_misfit; + } else { + for_each_cpu(cpu, &cluster->cpu_mask) + nr_big += nr_stats[cpu].nr; + } + + return nr_big; +} + +/* + * prev_nr_need_assist: + * Tasks that are eligible to run on the previous + * cluster but cannot run because of insufficient + * CPUs there. prev_nr_need_assist is indicative + * of number of CPUs in this cluster that should + * assist its previous cluster to makeup for + * insufficient CPUs there. + * + * For example: + * On tri-cluster system with 4 min capacity + * CPUs, 3 intermediate capacity CPUs and 1 + * max capacity CPU, if there are 4 small + * tasks running on min capacity CPUs, 4 big + * tasks running on intermediate capacity CPUs + * and no tasks running on max capacity CPU, + * prev_nr_need_assist for min & max capacity + * clusters will be 0, but, for intermediate + * capacity cluster prev_nr_need_assist will + * be 1 as it has 3 CPUs, but, there are 4 big + * tasks to be served. + */ +static int prev_cluster_nr_need_assist(int index) +{ + int need = 0; + int cpu; + struct cluster_data *prev_cluster; + + if (index == 0) + return 0; + + index--; + prev_cluster = &cluster_state[index]; + + /* + * Next cluster should not assist, while there are paused cpus + * in this cluster. + */ + if (prev_cluster->nr_paused_cpus) + return 0; + + for_each_cpu(cpu, &prev_cluster->cpu_mask) + need += nr_stats[cpu].nr; + + need += compute_prev_cluster_misfit_need(index); + + if (need > prev_cluster->active_cpus) + need = need - prev_cluster->active_cpus; + else + need = 0; + + return need; +} + +/* + * This is only implemented for min capacity cluster. + * + * Bringing a little CPU out of pause and using it + * more does not hurt power as much as bringing big CPUs. + * + * little cluster provides help needed for the other clusters. + * we take nr_scaled (which gives better resolution) and find + * the total nr in the system. Then take out the active higher + * capacity CPUs from the nr and consider the remaining nr as + * strict and consider that many little CPUs are needed. + */ +static int compute_cluster_nr_strict_need(int index) +{ + int cpu; + struct cluster_data *cluster; + int nr_strict_need = 0; + + if (index != 0) + return 0; + + for_each_cluster(cluster, index) { + int nr_scaled = 0; + int active_cpus = cluster->active_cpus; + + for_each_cpu(cpu, &cluster->cpu_mask) + nr_scaled += nr_stats[cpu].nr_scaled; + + nr_scaled /= 100; + + /* + * For little cluster, nr_scaled becomes the nr_strict, + * for other cluster, overflow is counted towards + * the little cluster need. 
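The strict-need rule just described can be illustrated with a standalone model (plain userspace C, not kernel code; the nr_scaled sums and active CPU counts are made up). The little cluster keeps its own scaled task count in full, while each bigger cluster only contributes the overflow beyond its active CPUs.

#include <stdio.h>

#define NR_CLUSTERS 2	/* 0 = little, 1 = big */

/* nr_scaled is in units of 1/100 of a task, as sched_get_nr_running_avg()
 * reports it; active[] is the number of unpaused CPUs per cluster.
 */
static int nr_scaled_sum[NR_CLUSTERS] = { 320, 510 };	/* 3.2 and 5.1 tasks */
static int active[NR_CLUSTERS]        = { 4, 3 };

static int little_strict_need(void)
{
	int i, need = 0;

	for (i = 0; i < NR_CLUSTERS; i++) {
		int nr = nr_scaled_sum[i] / 100;

		if (i == 0)
			need += nr;			/* little keeps all of it */
		else if (nr > active[i])
			need += nr - active[i];		/* big-cluster overflow */
	}
	return need;
}

int main(void)
{
	/* 3 little tasks + (5 big tasks - 3 active big CPUs) = 5 little CPUs */
	printf("strict_nrrun(little) = %d\n", little_strict_need());
	return 0;
}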
+ */ + if (index == 0) + nr_strict_need += nr_scaled; + else + nr_strict_need += max(0, nr_scaled - active_cpus); + } + + return nr_strict_need; +} +static void update_running_avg(void) +{ + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + int big_avg = 0; + + sched_get_nr_running_avg(nr_stats); + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + int nr_need, prev_misfit_need; + + if (!cluster->inited) + continue; + + nr_need = compute_cluster_nr_need(index); + prev_misfit_need = compute_prev_cluster_misfit_need(index); + + cluster->nrrun = nr_need + prev_misfit_need; + cluster->max_nr = compute_cluster_max_nr(index); + cluster->nr_prev_assist = prev_cluster_nr_need_assist(index); + + cluster->strict_nrrun = compute_cluster_nr_strict_need(index); + + trace_core_ctl_update_nr_need(cluster->first_cpu, nr_need, + prev_misfit_need, + cluster->nrrun, cluster->max_nr, + cluster->nr_prev_assist); + + big_avg += cluster_real_big_tasks(index); + } + spin_unlock_irqrestore(&state_lock, flags); + + last_nr_big = big_avg; + walt_rotation_checkpoint(big_avg); +} + +#define MAX_NR_THRESHOLD 4 +/* adjust needed CPUs based on current runqueue information */ +static unsigned int apply_task_need(const struct cluster_data *cluster, + unsigned int new_need) +{ + /* resume all cores if there are enough tasks */ + if (cluster->nrrun >= cluster->task_thres) + return cluster->num_cpus; + + /* + * resume as many cores as the previous cluster + * needs assistance with. + */ + if (cluster->nr_prev_assist >= cluster->nr_prev_assist_thresh) + new_need = new_need + cluster->nr_prev_assist; + + /* only resume more cores if there are tasks to run */ + if (cluster->nrrun > new_need) + new_need = new_need + 1; + + /* + * We don't want tasks to be overcrowded in a cluster. + * If any CPU has more than MAX_NR_THRESHOLD in the last + * window, bring another CPU to help out. + */ + if (cluster->max_nr > MAX_NR_THRESHOLD) + new_need = new_need + 1; + + /* + * For little cluster, we use a bit more relaxed approach + * and impose the strict nr condition. Because all tasks can + * spill onto little if big cluster is crowded. 
+ */ + if (new_need < cluster->strict_nrrun) + new_need = cluster->strict_nrrun; + + return new_need; +} + +/* ======================= load based core count ====================== */ + +static unsigned int apply_limits(const struct cluster_data *cluster, + unsigned int need_cpus) +{ + return min(max(cluster->min_cpus, need_cpus), cluster->max_cpus); +} + +static unsigned int get_active_cpu_count(const struct cluster_data *cluster) +{ + return cluster->num_cpus - + sched_pause_count(&cluster->cpu_mask, true); +} + +static bool is_active(const struct cpu_data *state) +{ + return cpu_online(state->cpu) && cpu_active(state->cpu); +} + +static bool adjustment_possible(const struct cluster_data *cluster, + unsigned int need) +{ + return (need < cluster->active_cpus || (need > cluster->active_cpus && + cluster->nr_paused_cpus)); +} + +static bool need_all_cpus(const struct cluster_data *cluster) +{ + return (is_min_capacity_cpu(cluster->first_cpu) && + sched_ravg_window < DEFAULT_SCHED_RAVG_WINDOW); +} + +static bool eval_need(struct cluster_data *cluster) +{ + unsigned long flags; + struct cpu_data *c; + unsigned int need_cpus = 0, last_need, thres_idx; + int ret = 0; + bool need_flag = false; + unsigned int new_need; + s64 now, elapsed; + + if (unlikely(!cluster->inited)) + return false; + + spin_lock_irqsave(&state_lock, flags); + + if (cluster->boost || !cluster->enable || need_all_cpus(cluster)) { + need_cpus = cluster->max_cpus; + } else { + cluster->active_cpus = get_active_cpu_count(cluster); + thres_idx = cluster->active_cpus ? cluster->active_cpus - 1 : 0; + list_for_each_entry(c, &cluster->lru, sib) { + bool old_is_busy = c->is_busy; + + if (c->busy >= cluster->busy_up_thres[thres_idx] || + sched_cpu_high_irqload(c->cpu)) + c->is_busy = true; + else if (c->busy < cluster->busy_down_thres[thres_idx]) + c->is_busy = false; + + trace_core_ctl_set_busy(c->cpu, c->busy, old_is_busy, + c->is_busy); + need_cpus += c->is_busy; + } + need_cpus = apply_task_need(cluster, need_cpus); + } + new_need = apply_limits(cluster, need_cpus); + need_flag = adjustment_possible(cluster, new_need); + + last_need = cluster->need_cpus; + now = ktime_to_ms(ktime_get()); + + if (new_need > cluster->active_cpus) { + ret = 1; + } else { + /* + * When there is no change in need and there are no more + * active CPUs than currently needed, just update the + * need time stamp and return. 
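The asymmetric hysteresis being applied here can be summarised with a standalone model (plain userspace C, not kernel code; the struct and its fields are invented stand-ins for the corresponding cluster_data fields): raising the CPU count is acted on immediately, lowering it only after the lower need has persisted for offline_delay_ms.

#include <stdbool.h>
#include <stdio.h>

struct need_state {
	unsigned int need_cpus;
	unsigned int active_cpus;
	long long need_ts_ms;		/* last time the need was (re)confirmed */
	unsigned int offline_delay_ms;
};

static bool update_need(struct need_state *s, unsigned int new_need,
			long long now_ms)
{
	bool act;

	if (new_need > s->active_cpus) {
		act = true;				/* scale up right away */
	} else if (new_need == s->need_cpus &&
		   new_need == s->active_cpus) {
		s->need_ts_ms = now_ms;			/* nothing to do, restart timer */
		return false;
	} else {
		/* scale down only once the delay has elapsed */
		act = (now_ms - s->need_ts_ms) >= s->offline_delay_ms;
	}

	if (act) {
		s->need_ts_ms = now_ms;
		s->need_cpus = new_need;
	}
	return act;
}

int main(void)
{
	struct need_state s = {
		.need_cpus = 4, .active_cpus = 4,
		.need_ts_ms = 0, .offline_delay_ms = 100,
	};

	/* dropping from 4 to 2 CPUs: rejected at 50 ms, accepted at 150 ms */
	printf("%d %d\n", update_need(&s, 2, 50), update_need(&s, 2, 150));
	return 0;
}

The real eval_need() additionally gates the result on adjustment_possible(), so a change is only reported when pausing or unpausing can actually satisfy it.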
+ */ + if (new_need == last_need && new_need == cluster->active_cpus) { + cluster->need_ts = now; + spin_unlock_irqrestore(&state_lock, flags); + return false; + } + + elapsed = now - cluster->need_ts; + ret = elapsed >= cluster->offline_delay_ms; + } + + if (ret) { + cluster->need_ts = now; + cluster->need_cpus = new_need; + } + trace_core_ctl_eval_need(cluster->first_cpu, last_need, new_need, + ret && need_flag); + spin_unlock_irqrestore(&state_lock, flags); + + return ret && need_flag; +} + +static void apply_need(struct cluster_data *cluster) +{ + if (eval_need(cluster)) + wake_up_core_ctl_thread(); +} + +/* ========================= core count enforcement ==================== */ + +static void wake_up_core_ctl_thread(void) +{ + unsigned long flags; + + spin_lock_irqsave(&core_ctl_pending_lock, flags); + core_ctl_pending = true; + spin_unlock_irqrestore(&core_ctl_pending_lock, flags); + + wake_up_process(core_ctl_thread); +} + +static u64 core_ctl_check_timestamp; + +int core_ctl_set_boost(bool boost) +{ + unsigned int index = 0; + struct cluster_data *cluster = NULL; + unsigned long flags; + int ret = 0; + bool boost_state_changed = false; + + if (unlikely(!initialized)) + return 0; + + spin_lock_irqsave(&state_lock, flags); + for_each_cluster(cluster, index) { + if (boost) { + boost_state_changed = !cluster->boost; + ++cluster->boost; + } else { + if (!cluster->boost) { + ret = -EINVAL; + break; + } + --cluster->boost; + boost_state_changed = !cluster->boost; + } + } + spin_unlock_irqrestore(&state_lock, flags); + + if (boost_state_changed) { + index = 0; + for_each_cluster(cluster, index) + apply_need(cluster); + } + + if (cluster) + trace_core_ctl_set_boost(cluster->boost, ret); + + return ret; +} +EXPORT_SYMBOL(core_ctl_set_boost); + +void core_ctl_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&core_ctl_notifier, n); +} + +void core_ctl_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&core_ctl_notifier, n); +} + +static void core_ctl_call_notifier(void) +{ + struct core_ctl_notif_data ndata = {0}; + struct notifier_block *nb; + + /* + * Don't bother querying the stats when the notifier + * chain is empty. 
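A hypothetical consumer of this notifier interface would look roughly as follows (not from the patch; struct core_ctl_notif_data and the register/unregister helpers are assumed to be declared in the WALT headers added elsewhere in this series).

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/printk.h>

static int my_core_ctl_cb(struct notifier_block *nb, unsigned long action,
			  void *data)
{
	struct core_ctl_notif_data *ndata = data;

	/* nr_big, coloc_load_pct, ta_util_pct and cur_cap_pct are filled
	 * in by core_ctl_call_notifier() before the chain is invoked
	 */
	pr_debug("core_ctl notif: nr_big=%u coloc_load=%u%%\n",
		 ndata->nr_big, ndata->coloc_load_pct);
	return NOTIFY_OK;
}

static struct notifier_block my_core_ctl_nb = {
	.notifier_call = my_core_ctl_cb,
};

static int __init my_client_init(void)
{
	core_ctl_notifier_register(&my_core_ctl_nb);
	return 0;
}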
+ */ + rcu_read_lock(); + nb = rcu_dereference_raw(core_ctl_notifier.head); + rcu_read_unlock(); + + if (!nb) + return; + + ndata.nr_big = last_nr_big; + walt_fill_ta_data(&ndata); + trace_core_ctl_notif_data(ndata.nr_big, ndata.coloc_load_pct, + ndata.ta_util_pct, ndata.cur_cap_pct); + + atomic_notifier_call_chain(&core_ctl_notifier, 0, &ndata); +} + +void core_ctl_check(u64 window_start) +{ + int cpu; + struct cpu_data *c; + struct cluster_data *cluster; + unsigned int index = 0; + unsigned long flags; + + if (unlikely(!initialized)) + return; + + if (window_start == core_ctl_check_timestamp) + return; + + core_ctl_check_timestamp = window_start; + + spin_lock_irqsave(&state_lock, flags); + for_each_possible_cpu(cpu) { + + c = &per_cpu(cpu_state, cpu); + cluster = c->cluster; + + if (!cluster || !cluster->inited) + continue; + + c->busy = sched_get_cpu_util(cpu); + } + spin_unlock_irqrestore(&state_lock, flags); + + update_running_avg(); + + for_each_cluster(cluster, index) { + if (eval_need(cluster)) + wake_up_core_ctl_thread(); + } + + core_ctl_call_notifier(); +} + +static void move_cpu_lru(struct cpu_data *cpu_data) +{ + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + list_del(&cpu_data->sib); + list_add_tail(&cpu_data->sib, &cpu_data->cluster->lru); + spin_unlock_irqrestore(&state_lock, flags); +} + +static bool should_we_pause(int cpu, struct cluster_data *cluster) +{ + return true; +} + +static void try_to_pause(struct cluster_data *cluster, unsigned int need, + struct cpumask *pause_cpus) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_paused = 0; + bool first_pass = cluster->nr_not_preferred_cpus; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus == need) + break; + /* Don't pause busy CPUs. */ + if (c->is_busy) + continue; + /* + * We pause only the not_preferred CPUs. If none + * of the CPUs are selected as not_preferred, then + * all CPUs are eligible for pausing. + */ + if (cluster->nr_not_preferred_cpus && !c->not_preferred) + continue; + + if (!should_we_pause(c->cpu, cluster)) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to pause CPU%u\n", c->cpu); + + cpumask_set_cpu(c->cpu, pause_cpus); + sched_pause_pending(c->cpu); + + c->paused_by_us = true; + move_cpu_lru(c); + nr_paused++; + + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_paused_cpus += nr_paused; + spin_unlock_irqrestore(&state_lock, flags); + +again: + /* + * If the number of active CPUs is within the limits, then + * don't force pause of any busy CPUs. 
+ */ + if (cluster->active_cpus <= cluster->max_cpus) + return; + + nr_paused = 0; + num_cpus = cluster->num_cpus; + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!is_active(c)) + continue; + if (cluster->active_cpus <= cluster->max_cpus) + break; + + if (first_pass && !c->not_preferred) + continue; + + spin_unlock_irqrestore(&state_lock, flags); + + cpumask_set_cpu(c->cpu, pause_cpus); + sched_pause_pending(c->cpu); + + c->paused_by_us = true; + move_cpu_lru(c); + nr_paused++; + + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + + cluster->nr_paused_cpus += nr_paused; + spin_unlock_irqrestore(&state_lock, flags); + + if (first_pass && cluster->active_cpus > cluster->max_cpus) { + first_pass = false; + goto again; + } +} + +static void __try_to_resume(struct cluster_data *cluster, + unsigned int need, bool force, struct cpumask *unpause_cpus) +{ + struct cpu_data *c, *tmp; + unsigned long flags; + unsigned int num_cpus = cluster->num_cpus; + unsigned int nr_unpaused = 0; + + /* + * Protect against entry being removed (and added at tail) by other + * thread (hotplug). + */ + spin_lock_irqsave(&state_lock, flags); + list_for_each_entry_safe(c, tmp, &cluster->lru, sib) { + if (!num_cpus--) + break; + + if (!c->paused_by_us) + continue; + if ((cpu_online(c->cpu) && cpu_active(c->cpu)) || + (!force && c->not_preferred)) + continue; + if (cluster->active_cpus == need) + break; + + spin_unlock_irqrestore(&state_lock, flags); + + pr_debug("Trying to resume CPU%u\n", c->cpu); + + cpumask_set_cpu(c->cpu, unpause_cpus); + sched_unpause_pending(c->cpu); + + c->paused_by_us = false; + move_cpu_lru(c); + nr_unpaused++; + + cluster->active_cpus = get_active_cpu_count(cluster); + spin_lock_irqsave(&state_lock, flags); + } + cluster->nr_paused_cpus -= nr_unpaused; + spin_unlock_irqrestore(&state_lock, flags); +} + +static void try_to_resume(struct cluster_data *cluster, unsigned int need, + struct cpumask *unpause_cpus) +{ + bool force_use_non_preferred = false; + + __try_to_resume(cluster, need, force_use_non_preferred, unpause_cpus); + + if (cluster->active_cpus == need) + return; + + force_use_non_preferred = true; + __try_to_resume(cluster, need, force_use_non_preferred, unpause_cpus); +} + +static void __ref do_core_ctl(void) +{ + struct cluster_data *cluster; + unsigned int index = 0; + unsigned int need; + cpumask_t cpus_to_pause = { CPU_BITS_NONE }; + cpumask_t cpus_to_unpause = { CPU_BITS_NONE }; + + for_each_cluster(cluster, index) { + + eval_need(cluster); + + need = apply_limits(cluster, cluster->need_cpus); + + if (adjustment_possible(cluster, need)) { + pr_debug("Trying to adjust group %u from %u to %u\n", + cluster->first_cpu, cluster->active_cpus, need); + + if (cluster->active_cpus > need) + try_to_pause(cluster, need, &cpus_to_pause); + + else if (cluster->active_cpus < need) + try_to_resume(cluster, need, &cpus_to_unpause); + } + } + + if (cpumask_any(&cpus_to_pause) < nr_cpu_ids) + pause_cpus(&cpus_to_pause); + + if (cpumask_any(&cpus_to_unpause) < nr_cpu_ids) + resume_cpus(&cpus_to_unpause); +} + +static int __ref try_core_ctl(void *data) +{ + unsigned long flags; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_lock_irqsave(&core_ctl_pending_lock, flags); + if (!core_ctl_pending) { + spin_unlock_irqrestore(&core_ctl_pending_lock, flags); + schedule(); + if (kthread_should_stop()) + break; + spin_lock_irqsave(&core_ctl_pending_lock, 
flags); + } + set_current_state(TASK_RUNNING); + core_ctl_pending = false; + spin_unlock_irqrestore(&core_ctl_pending_lock, flags); + + do_core_ctl(); + } + + return 0; +} + +/* ============================ init code ============================== */ + +static struct cluster_data *find_cluster_by_first_cpu(unsigned int first_cpu) +{ + unsigned int i; + + for (i = 0; i < num_clusters; ++i) { + if (cluster_state[i].first_cpu == first_cpu) + return &cluster_state[i]; + } + + return NULL; +} + +static int cluster_init(const struct cpumask *mask) +{ + struct device *dev; + unsigned int first_cpu = cpumask_first(mask); + struct cluster_data *cluster; + struct cpu_data *state; + unsigned int cpu; + + if (find_cluster_by_first_cpu(first_cpu)) + return 0; + + dev = get_cpu_device(first_cpu); + if (!dev) + return -ENODEV; + + pr_info("Creating CPU group %d\n", first_cpu); + + if (num_clusters == MAX_CLUSTERS) { + pr_err("Unsupported number of clusters. Only %u supported\n", + MAX_CLUSTERS); + return -EINVAL; + } + cluster = &cluster_state[num_clusters]; + ++num_clusters; + + cpumask_copy(&cluster->cpu_mask, mask); + cluster->num_cpus = cpumask_weight(mask); + if (cluster->num_cpus > MAX_CPUS_PER_CLUSTER) { + pr_err("HW configuration not supported\n"); + return -EINVAL; + } + cluster->first_cpu = first_cpu; + cluster->min_cpus = 1; + cluster->max_cpus = cluster->num_cpus; + cluster->need_cpus = cluster->num_cpus; + cluster->offline_delay_ms = 100; + cluster->task_thres = UINT_MAX; + cluster->nr_prev_assist_thresh = UINT_MAX; + cluster->nrrun = cluster->num_cpus; + cluster->enable = true; + cluster->nr_not_preferred_cpus = 0; + cluster->strict_nrrun = 0; + INIT_LIST_HEAD(&cluster->lru); + + for_each_cpu(cpu, mask) { + pr_info("Init CPU%u state\n", cpu); + + state = &per_cpu(cpu_state, cpu); + state->cluster = cluster; + state->cpu = cpu; + list_add_tail(&state->sib, &cluster->lru); + } + cluster->active_cpus = get_active_cpu_count(cluster); + + cluster->inited = true; + + kobject_init(&cluster->kobj, &ktype_core_ctl); + return kobject_add(&cluster->kobj, &dev->kobj, "core_ctl"); +} + +int core_ctl_init(void) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct walt_sched_cluster *cluster; + int ret; + + /* initialize our single kthread */ + core_ctl_thread = kthread_run(try_core_ctl, NULL, "core_ctl"); + + if (IS_ERR(core_ctl_thread)) + return PTR_ERR(core_ctl_thread); + + spin_lock_init(&core_ctl_pending_lock); + + sched_setscheduler_nocheck(core_ctl_thread, SCHED_FIFO, ¶m); + + for_each_sched_cluster(cluster) { + ret = cluster_init(&cluster->cpus); + if (ret) + pr_warn("unable to create core ctl group: %d\n", ret); + } + + initialized = true; + + return 0; +} diff --git a/kernel/sched/walt/cpufreq_walt.c b/kernel/sched/walt/cpufreq_walt.c new file mode 100644 index 000000000000..3d24cf3fd1e7 --- /dev/null +++ b/kernel/sched/walt/cpufreq_walt.c @@ -0,0 +1,886 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This is based on schedutil governor but modified to work with + * WALT. + * + * Copyright (C) 2016, Intel Corporation + * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include + +#include "walt.h" +#include "trace.h" + +struct waltgov_tunables { + struct gov_attr_set attr_set; + unsigned int up_rate_limit_us; + unsigned int down_rate_limit_us; + unsigned int hispeed_load; + unsigned int hispeed_freq; + unsigned int rtg_boost_freq; + bool pl; +}; + +struct waltgov_policy { + struct cpufreq_policy *policy; + u64 last_ws; + u64 curr_cycles; + u64 last_cyc_update_time; + unsigned long avg_cap; + struct waltgov_tunables *tunables; + struct list_head tunables_hook; + unsigned long hispeed_util; + unsigned long rtg_boost_util; + unsigned long max; + + raw_spinlock_t update_lock; + u64 last_freq_update_time; + s64 min_rate_limit_ns; + s64 up_rate_delay_ns; + s64 down_rate_delay_ns; + unsigned int next_freq; + unsigned int cached_raw_freq; + + /* The next fields are only needed if fast switch cannot be used: */ + struct irq_work irq_work; + struct kthread_work work; + struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; + + bool limits_changed; + bool need_freq_update; +}; + +struct waltgov_cpu { + struct waltgov_callback cb; + struct waltgov_policy *wg_policy; + unsigned int cpu; + struct walt_cpu_load walt_load; + unsigned long util; + unsigned long max; + unsigned int flags; +}; + +DEFINE_PER_CPU(struct waltgov_callback *, waltgov_cb_data); +static DEFINE_PER_CPU(struct waltgov_cpu, waltgov_cpu); +static DEFINE_PER_CPU(struct waltgov_tunables *, cached_tunables); + +/************************ Governor internals ***********************/ + +static bool waltgov_should_update_freq(struct waltgov_policy *wg_policy, u64 time) +{ + s64 delta_ns; + + if (unlikely(wg_policy->limits_changed)) { + wg_policy->limits_changed = false; + wg_policy->need_freq_update = true; + return true; + } + + /* + * No need to recalculate next freq for min_rate_limit_us + * at least. However we might still decide to further rate + * limit once frequency change direction is decided, according + * to the separate rate limits. 
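The two-stage rate limiting described here can be illustrated with a standalone model (plain userspace C, not kernel code; the 500 us up / 20 ms down limits are made-up values): a common min(up, down) gate is applied first, then a per-direction check once the proposed frequency is known.

#include <stdbool.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000LL

static long long up_rate_delay_ns   = 500   * NSEC_PER_USEC;
static long long down_rate_delay_ns = 20000 * NSEC_PER_USEC;

/* prev_khz is the previously requested frequency, next_khz the new one */
static bool freq_change_allowed(long long delta_ns, unsigned int prev_khz,
				unsigned int next_khz)
{
	long long min_rate_limit_ns =
		up_rate_delay_ns < down_rate_delay_ns ?
		up_rate_delay_ns : down_rate_delay_ns;

	if (delta_ns < min_rate_limit_ns)
		return false;			/* too soon for any change */
	if (next_khz > prev_khz && delta_ns < up_rate_delay_ns)
		return false;			/* ramp-up rate limited */
	if (next_khz < prev_khz && delta_ns < down_rate_delay_ns)
		return false;			/* ramp-down rate limited */
	return true;
}

int main(void)
{
	/* 10 ms after the last update: raising is allowed, lowering is not */
	printf("%d %d\n",
	       freq_change_allowed(10 * 1000 * 1000, 1000000, 1200000),
	       freq_change_allowed(10 * 1000 * 1000, 1000000, 800000));
	return 0;
}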
+ */ + + delta_ns = time - wg_policy->last_freq_update_time; + return delta_ns >= wg_policy->min_rate_limit_ns; +} + +static bool waltgov_up_down_rate_limit(struct waltgov_policy *wg_policy, u64 time, + unsigned int next_freq) +{ + s64 delta_ns; + + delta_ns = time - wg_policy->last_freq_update_time; + + if (next_freq > wg_policy->next_freq && + delta_ns < wg_policy->up_rate_delay_ns) + return true; + + if (next_freq < wg_policy->next_freq && + delta_ns < wg_policy->down_rate_delay_ns) + return true; + + return false; +} + +static bool waltgov_update_next_freq(struct waltgov_policy *wg_policy, u64 time, + unsigned int next_freq) +{ + if (wg_policy->next_freq == next_freq) + return false; + + if (waltgov_up_down_rate_limit(wg_policy, time, next_freq)) + return false; + + wg_policy->next_freq = next_freq; + wg_policy->last_freq_update_time = time; + + return true; +} + +static unsigned long freq_to_util(struct waltgov_policy *wg_policy, + unsigned int freq) +{ + return mult_frac(wg_policy->max, freq, + wg_policy->policy->cpuinfo.max_freq); +} + +#define KHZ 1000 +static void waltgov_track_cycles(struct waltgov_policy *wg_policy, + unsigned int prev_freq, + u64 upto) +{ + u64 delta_ns, cycles; + u64 next_ws = wg_policy->last_ws + sched_ravg_window; + + upto = min(upto, next_ws); + /* Track cycles in current window */ + delta_ns = upto - wg_policy->last_cyc_update_time; + delta_ns *= prev_freq; + do_div(delta_ns, (NSEC_PER_SEC / KHZ)); + cycles = delta_ns; + wg_policy->curr_cycles += cycles; + wg_policy->last_cyc_update_time = upto; +} + +static void waltgov_calc_avg_cap(struct waltgov_policy *wg_policy, u64 curr_ws, + unsigned int prev_freq) +{ + u64 last_ws = wg_policy->last_ws; + unsigned int avg_freq; + + BUG_ON(curr_ws < last_ws); + if (curr_ws <= last_ws) + return; + + /* If we skipped some windows */ + if (curr_ws > (last_ws + sched_ravg_window)) { + avg_freq = prev_freq; + /* Reset tracking history */ + wg_policy->last_cyc_update_time = curr_ws; + } else { + waltgov_track_cycles(wg_policy, prev_freq, curr_ws); + avg_freq = wg_policy->curr_cycles; + avg_freq /= sched_ravg_window / (NSEC_PER_SEC / KHZ); + } + wg_policy->avg_cap = freq_to_util(wg_policy, avg_freq); + wg_policy->curr_cycles = 0; + wg_policy->last_ws = curr_ws; +} + +static void waltgov_fast_switch(struct waltgov_policy *wg_policy, u64 time, + unsigned int next_freq) +{ + struct cpufreq_policy *policy = wg_policy->policy; + unsigned int cpu; + + if (!waltgov_update_next_freq(wg_policy, time, next_freq)) + return; + + waltgov_track_cycles(wg_policy, wg_policy->policy->cur, time); + next_freq = cpufreq_driver_fast_switch(policy, next_freq); + if (!next_freq) + return; + + policy->cur = next_freq; + + if (trace_cpu_frequency_enabled()) { + for_each_cpu(cpu, policy->cpus) + trace_cpu_frequency(next_freq, cpu); + } +} + +static void waltgov_deferred_update(struct waltgov_policy *wg_policy, u64 time, + unsigned int next_freq) +{ + if (!waltgov_update_next_freq(wg_policy, time, next_freq)) + return; + + walt_irq_work_queue(&wg_policy->irq_work); +} + +#define TARGET_LOAD 80 +static unsigned int get_next_freq(struct waltgov_policy *wg_policy, + unsigned long util, unsigned long max) +{ + struct cpufreq_policy *policy = wg_policy->policy; + /* + * TODO: + unsigned int freq = arch_scale_freq_invariant() ? 
+ policy->cpuinfo.max_freq : policy->cur; + */ + unsigned int freq = policy->cpuinfo.max_freq; + + freq = map_util_freq(util, freq, max); + trace_waltgov_next_freq(policy->cpu, util, max, freq); + + if (freq == wg_policy->cached_raw_freq && !wg_policy->need_freq_update) + return wg_policy->next_freq; + + wg_policy->need_freq_update = false; + wg_policy->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); +} + +static unsigned long waltgov_get_util(struct waltgov_cpu *wg_cpu) +{ + struct rq *rq = cpu_rq(wg_cpu->cpu); + unsigned long max = arch_scale_cpu_capacity(wg_cpu->cpu); + unsigned long util; + + wg_cpu->max = max; + util = cpu_util_freq_walt(wg_cpu->cpu, &wg_cpu->walt_load); + return uclamp_rq_util_with(rq, util, NULL); +} + +#define NL_RATIO 75 +#define DEFAULT_HISPEED_LOAD 90 +#define DEFAULT_CPU0_RTG_BOOST_FREQ 1000000 +#define DEFAULT_CPU4_RTG_BOOST_FREQ 0 +#define DEFAULT_CPU7_RTG_BOOST_FREQ 0 +static void waltgov_walt_adjust(struct waltgov_cpu *wg_cpu, unsigned long *util, + unsigned long *max) +{ + struct waltgov_policy *wg_policy = wg_cpu->wg_policy; + bool is_migration = wg_cpu->flags & WALT_CPUFREQ_IC_MIGRATION; + bool is_rtg_boost = wg_cpu->walt_load.rtgb_active; + unsigned long nl = wg_cpu->walt_load.nl; + unsigned long cpu_util = wg_cpu->util; + bool is_hiload; + unsigned long pl = wg_cpu->walt_load.pl; + + if (is_rtg_boost) + *util = max(*util, wg_policy->rtg_boost_util); + + is_hiload = (cpu_util >= mult_frac(wg_policy->avg_cap, + wg_policy->tunables->hispeed_load, + 100)); + + if (is_hiload && !is_migration) + *util = max(*util, wg_policy->hispeed_util); + + if (is_hiload && nl >= mult_frac(cpu_util, NL_RATIO, 100)) + *util = *max; + + if (wg_policy->tunables->pl) { + if (sysctl_sched_conservative_pl) + pl = mult_frac(pl, TARGET_LOAD, 100); + *util = max(*util, pl); + } +} + +static inline unsigned long target_util(struct waltgov_policy *wg_policy, + unsigned int freq) +{ + unsigned long util; + + util = freq_to_util(wg_policy, freq); + util = mult_frac(util, TARGET_LOAD, 100); + return util; +} + +static unsigned int waltgov_next_freq_shared(struct waltgov_cpu *wg_cpu, u64 time) +{ + struct waltgov_policy *wg_policy = wg_cpu->wg_policy; + struct cpufreq_policy *policy = wg_policy->policy; + unsigned long util = 0, max = 1; + unsigned int j; + + for_each_cpu(j, policy->cpus) { + struct waltgov_cpu *j_wg_cpu = &per_cpu(waltgov_cpu, j); + unsigned long j_util, j_max; + + /* + * If the util value for all CPUs in a policy is 0, just using > + * will result in a max value of 1. WALT stats can later update + * the aggregated util value, causing get_next_freq() to compute + * freq = max_freq * 1.25 * (util / max) for nonzero util, + * leading to spurious jumps to fmax. 
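The 1.25x factor mentioned in this comment comes from map_util_freq() in include/linux/sched/cpufreq.h, which applies a 25% headroom on top of the raw util/capacity ratio; it is the inverse of the 80% TARGET_LOAD used by target_util() above. A standalone check of the mapping (plain userspace C, not kernel code; the fmax and capacity values are made up):

#include <stdio.h>

static unsigned long map_util_freq(unsigned long util, unsigned long freq,
				   unsigned long max)
{
	/* same formula as the kernel helper: 1.25 * freq * util / max */
	return (freq + (freq >> 2)) * util / max;
}

int main(void)
{
	unsigned long max_freq = 1804800;	/* kHz, made-up fmax */
	unsigned long max_cap = 1024;

	/* util at ~80% of capacity maps back to roughly fmax */
	printf("%lu\n", map_util_freq(819, max_freq, max_cap));
	/* the degenerate util == max == 1 case from the comment above
	 * yields 1.25 * fmax, which the cpufreq core then clamps to fmax
	 */
	printf("%lu\n", map_util_freq(1, max_freq, 1));
	return 0;
}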
+ */ + j_util = j_wg_cpu->util; + j_max = j_wg_cpu->max; + + if (j_util * max >= j_max * util) { + util = j_util; + max = j_max; + } + + waltgov_walt_adjust(j_wg_cpu, &util, &max); + } + + return get_next_freq(wg_policy, util, max); +} + +static void waltgov_update_freq(struct waltgov_callback *cb, u64 time, + unsigned int flags) +{ + struct waltgov_cpu *wg_cpu = container_of(cb, struct waltgov_cpu, cb); + struct waltgov_policy *wg_policy = wg_cpu->wg_policy; + unsigned long hs_util, boost_util; + unsigned int next_f; + + if (!wg_policy->tunables->pl && flags & WALT_CPUFREQ_PL) + return; + + wg_cpu->util = waltgov_get_util(wg_cpu); + wg_cpu->flags = flags; + raw_spin_lock(&wg_policy->update_lock); + + if (wg_policy->max != wg_cpu->max) { + wg_policy->max = wg_cpu->max; + hs_util = target_util(wg_policy, + wg_policy->tunables->hispeed_freq); + wg_policy->hispeed_util = hs_util; + + boost_util = target_util(wg_policy, + wg_policy->tunables->rtg_boost_freq); + wg_policy->rtg_boost_util = boost_util; + } + + waltgov_calc_avg_cap(wg_policy, wg_cpu->walt_load.ws, + wg_policy->policy->cur); + + trace_waltgov_util_update(wg_cpu->cpu, wg_cpu->util, wg_policy->avg_cap, + wg_cpu->max, wg_cpu->walt_load.nl, + wg_cpu->walt_load.pl, + wg_cpu->walt_load.rtgb_active, flags); + + if (waltgov_should_update_freq(wg_policy, time) && + !(flags & WALT_CPUFREQ_CONTINUE)) { + next_f = waltgov_next_freq_shared(wg_cpu, time); + + if (wg_policy->policy->fast_switch_enabled) + waltgov_fast_switch(wg_policy, time, next_f); + else + waltgov_deferred_update(wg_policy, time, next_f); + } + + raw_spin_unlock(&wg_policy->update_lock); +} + +static void waltgov_work(struct kthread_work *work) +{ + struct waltgov_policy *wg_policy = container_of(work, struct waltgov_policy, work); + unsigned int freq; + unsigned long flags; + + raw_spin_lock_irqsave(&wg_policy->update_lock, flags); + freq = wg_policy->next_freq; + waltgov_track_cycles(wg_policy, wg_policy->policy->cur, + ktime_get_ns()); + raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags); + + mutex_lock(&wg_policy->work_lock); + __cpufreq_driver_target(wg_policy->policy, freq, CPUFREQ_RELATION_L); + mutex_unlock(&wg_policy->work_lock); +} + +static void waltgov_irq_work(struct irq_work *irq_work) +{ + struct waltgov_policy *wg_policy; + + wg_policy = container_of(irq_work, struct waltgov_policy, irq_work); + + kthread_queue_work(&wg_policy->worker, &wg_policy->work); +} + +/************************** sysfs interface ************************/ + +static inline struct waltgov_tunables *to_waltgov_tunables(struct gov_attr_set *attr_set) +{ + return container_of(attr_set, struct waltgov_tunables, attr_set); +} + +static DEFINE_MUTEX(min_rate_lock); + +static void update_min_rate_limit_ns(struct waltgov_policy *wg_policy) +{ + mutex_lock(&min_rate_lock); + wg_policy->min_rate_limit_ns = min(wg_policy->up_rate_delay_ns, + wg_policy->down_rate_delay_ns); + mutex_unlock(&min_rate_lock); +} + +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->up_rate_limit_us); +} + +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->down_rate_limit_us); +} + +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct 
waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + struct waltgov_policy *wg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->up_rate_limit_us = rate_limit_us; + + list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) { + wg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_ns(wg_policy); + } + + return count; +} + +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + struct waltgov_policy *wg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->down_rate_limit_us = rate_limit_us; + + list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) { + wg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_ns(wg_policy); + } + + return count; +} + +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); + +static ssize_t hispeed_load_show(struct gov_attr_set *attr_set, char *buf) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->hispeed_load); +} + +static ssize_t hispeed_load_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + if (kstrtouint(buf, 10, &tunables->hispeed_load)) + return -EINVAL; + + tunables->hispeed_load = min(100U, tunables->hispeed_load); + + return count; +} + +static ssize_t hispeed_freq_show(struct gov_attr_set *attr_set, char *buf) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->hispeed_freq); +} + +static ssize_t hispeed_freq_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + unsigned int val; + struct waltgov_policy *wg_policy; + unsigned long hs_util; + unsigned long flags; + + if (kstrtouint(buf, 10, &val)) + return -EINVAL; + + tunables->hispeed_freq = val; + list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) { + raw_spin_lock_irqsave(&wg_policy->update_lock, flags); + hs_util = target_util(wg_policy, + wg_policy->tunables->hispeed_freq); + wg_policy->hispeed_util = hs_util; + raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags); + } + + return count; +} + +static ssize_t rtg_boost_freq_show(struct gov_attr_set *attr_set, char *buf) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->rtg_boost_freq); +} + +static ssize_t rtg_boost_freq_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + unsigned int val; + struct waltgov_policy *wg_policy; + unsigned long boost_util; + unsigned long flags; + + if (kstrtouint(buf, 10, &val)) + return -EINVAL; + + tunables->rtg_boost_freq = val; + list_for_each_entry(wg_policy, &attr_set->policy_list, tunables_hook) { + raw_spin_lock_irqsave(&wg_policy->update_lock, flags); + boost_util = target_util(wg_policy, + wg_policy->tunables->rtg_boost_freq); + wg_policy->rtg_boost_util = boost_util; + raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags); + } + + 
return count; +} + +static ssize_t pl_show(struct gov_attr_set *attr_set, char *buf) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + return scnprintf(buf, PAGE_SIZE, "%u\n", tunables->pl); +} + +static ssize_t pl_store(struct gov_attr_set *attr_set, const char *buf, + size_t count) +{ + struct waltgov_tunables *tunables = to_waltgov_tunables(attr_set); + + if (kstrtobool(buf, &tunables->pl)) + return -EINVAL; + + return count; +} + +static struct governor_attr hispeed_load = __ATTR_RW(hispeed_load); +static struct governor_attr hispeed_freq = __ATTR_RW(hispeed_freq); +static struct governor_attr rtg_boost_freq = __ATTR_RW(rtg_boost_freq); +static struct governor_attr pl = __ATTR_RW(pl); + +static struct attribute *waltgov_attributes[] = { + &up_rate_limit_us.attr, + &down_rate_limit_us.attr, + &hispeed_load.attr, + &hispeed_freq.attr, + &rtg_boost_freq.attr, + &pl.attr, + NULL +}; + +static struct kobj_type waltgov_tunables_ktype = { + .default_attrs = waltgov_attributes, + .sysfs_ops = &governor_sysfs_ops, +}; + +/********************** cpufreq governor interface *********************/ + +static struct cpufreq_governor walt_gov; + +static struct waltgov_policy *waltgov_policy_alloc(struct cpufreq_policy *policy) +{ + struct waltgov_policy *wg_policy; + + wg_policy = kzalloc(sizeof(*wg_policy), GFP_KERNEL); + if (!wg_policy) + return NULL; + + wg_policy->policy = policy; + raw_spin_lock_init(&wg_policy->update_lock); + return wg_policy; +} + +static void waltgov_policy_free(struct waltgov_policy *wg_policy) +{ + kfree(wg_policy); +} + +static int waltgov_kthread_create(struct waltgov_policy *wg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = wg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + kthread_init_work(&wg_policy->work, waltgov_work); + kthread_init_worker(&wg_policy->worker); + thread = kthread_create(kthread_worker_fn, &wg_policy->worker, + "waltgov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create waltgov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + wg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&wg_policy->irq_work, waltgov_irq_work); + mutex_init(&wg_policy->work_lock); + + wake_up_process(thread); + + return 0; +} + +static void waltgov_kthread_stop(struct waltgov_policy *wg_policy) +{ + /* kthread only required for slow path */ + if (wg_policy->policy->fast_switch_enabled) + return; + + kthread_flush_worker(&wg_policy->worker); + kthread_stop(wg_policy->thread); + mutex_destroy(&wg_policy->work_lock); +} + +static void waltgov_tunables_save(struct cpufreq_policy *policy, + struct waltgov_tunables *tunables) +{ + int cpu; + struct waltgov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!cached) { + cached = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (!cached) + return; + + for_each_cpu(cpu, policy->related_cpus) + per_cpu(cached_tunables, cpu) = cached; + } + + cached->pl = tunables->pl; + cached->hispeed_load = tunables->hispeed_load; + cached->rtg_boost_freq = tunables->rtg_boost_freq; + cached->hispeed_freq = tunables->hispeed_freq; + cached->up_rate_limit_us = 
tunables->up_rate_limit_us; + cached->down_rate_limit_us = tunables->down_rate_limit_us; +} + +static void waltgov_tunables_restore(struct cpufreq_policy *policy) +{ + struct waltgov_policy *wg_policy = policy->governor_data; + struct waltgov_tunables *tunables = wg_policy->tunables; + struct waltgov_tunables *cached = per_cpu(cached_tunables, policy->cpu); + + if (!cached) + return; + + tunables->pl = cached->pl; + tunables->hispeed_load = cached->hispeed_load; + tunables->rtg_boost_freq = cached->rtg_boost_freq; + tunables->hispeed_freq = cached->hispeed_freq; + tunables->up_rate_limit_us = cached->up_rate_limit_us; + tunables->down_rate_limit_us = cached->down_rate_limit_us; +} + +static int waltgov_init(struct cpufreq_policy *policy) +{ + struct waltgov_policy *wg_policy; + struct waltgov_tunables *tunables; + int ret = 0; + + /* State should be equivalent to EXIT */ + if (policy->governor_data) + return -EBUSY; + + cpufreq_enable_fast_switch(policy); + + if (policy->fast_switch_possible && !policy->fast_switch_enabled) + BUG_ON(1); + + wg_policy = waltgov_policy_alloc(policy); + if (!wg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } + + ret = waltgov_kthread_create(wg_policy); + if (ret) + goto free_wg_policy; + + tunables = kzalloc(sizeof(*tunables), GFP_KERNEL); + if (!tunables) { + ret = -ENOMEM; + goto stop_kthread; + } + + gov_attr_set_init(&tunables->attr_set, &wg_policy->tunables_hook); + tunables->hispeed_load = DEFAULT_HISPEED_LOAD; + + switch (policy->cpu) { + default: + case 0: + tunables->rtg_boost_freq = DEFAULT_CPU0_RTG_BOOST_FREQ; + break; + case 4: + tunables->rtg_boost_freq = DEFAULT_CPU4_RTG_BOOST_FREQ; + break; + case 7: + tunables->rtg_boost_freq = DEFAULT_CPU7_RTG_BOOST_FREQ; + break; + } + + policy->governor_data = wg_policy; + wg_policy->tunables = tunables; + waltgov_tunables_restore(policy); + + ret = kobject_init_and_add(&tunables->attr_set.kobj, &waltgov_tunables_ktype, + get_governor_parent_kobj(policy), "%s", + walt_gov.name); + if (ret) + goto fail; + + return 0; + +fail: + kobject_put(&tunables->attr_set.kobj); + policy->governor_data = NULL; + kfree(tunables); +stop_kthread: + waltgov_kthread_stop(wg_policy); +free_wg_policy: + waltgov_policy_free(wg_policy); +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + + pr_err("initialization failed (error %d)\n", ret); + return ret; +} + +static void waltgov_exit(struct cpufreq_policy *policy) +{ + struct waltgov_policy *wg_policy = policy->governor_data; + struct waltgov_tunables *tunables = wg_policy->tunables; + unsigned int count; + + count = gov_attr_set_put(&tunables->attr_set, &wg_policy->tunables_hook); + policy->governor_data = NULL; + if (!count) { + waltgov_tunables_save(policy, tunables); + kfree(tunables); + } + + waltgov_kthread_stop(wg_policy); + waltgov_policy_free(wg_policy); + cpufreq_disable_fast_switch(policy); +} + +static int waltgov_start(struct cpufreq_policy *policy) +{ + struct waltgov_policy *wg_policy = policy->governor_data; + unsigned int cpu; + + wg_policy->up_rate_delay_ns = + wg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC; + wg_policy->down_rate_delay_ns = + wg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_ns(wg_policy); + wg_policy->last_freq_update_time = 0; + wg_policy->next_freq = 0; + wg_policy->limits_changed = false; + wg_policy->need_freq_update = false; + wg_policy->cached_raw_freq = 0; + + for_each_cpu(cpu, policy->cpus) { + struct waltgov_cpu *wg_cpu = &per_cpu(waltgov_cpu, cpu); + + memset(wg_cpu, 0, 
sizeof(*wg_cpu)); + wg_cpu->cpu = cpu; + wg_cpu->wg_policy = wg_policy; + } + + for_each_cpu(cpu, policy->cpus) { + struct waltgov_cpu *wg_cpu = &per_cpu(waltgov_cpu, cpu); + + waltgov_add_callback(cpu, &wg_cpu->cb, waltgov_update_freq); + } + + return 0; +} + +static void waltgov_stop(struct cpufreq_policy *policy) +{ + struct waltgov_policy *wg_policy = policy->governor_data; + unsigned int cpu; + + for_each_cpu(cpu, policy->cpus) + waltgov_remove_callback(cpu); + + synchronize_rcu(); + + if (!policy->fast_switch_enabled) { + irq_work_sync(&wg_policy->irq_work); + kthread_cancel_work_sync(&wg_policy->work); + } +} + +static void waltgov_limits(struct cpufreq_policy *policy) +{ + struct waltgov_policy *wg_policy = policy->governor_data; + unsigned long flags, now; + unsigned int freq; + + if (!policy->fast_switch_enabled) { + mutex_lock(&wg_policy->work_lock); + raw_spin_lock_irqsave(&wg_policy->update_lock, flags); + waltgov_track_cycles(wg_policy, wg_policy->policy->cur, + ktime_get_ns()); + raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags); + cpufreq_policy_apply_limits(policy); + mutex_unlock(&wg_policy->work_lock); + } else { + raw_spin_lock_irqsave(&wg_policy->update_lock, flags); + freq = policy->cur; + now = ktime_get_ns(); + + /* + * cpufreq_driver_resolve_freq() has a clamp, so we do not need + * to do any sort of additional validation here. + */ + freq = cpufreq_driver_resolve_freq(policy, freq); + wg_policy->cached_raw_freq = freq; + waltgov_fast_switch(wg_policy, now, freq); + raw_spin_unlock_irqrestore(&wg_policy->update_lock, flags); + } + + wg_policy->limits_changed = true; +} + +static struct cpufreq_governor walt_gov = { + .name = "walt", + .init = waltgov_init, + .exit = waltgov_exit, + .start = waltgov_start, + .stop = waltgov_stop, + .limits = waltgov_limits, + .owner = THIS_MODULE, +}; + +int waltgov_register(void) +{ + return cpufreq_register_governor(&walt_gov); +} diff --git a/kernel/sched/walt/fixup.c b/kernel/sched/walt/fixup.c new file mode 100644 index 000000000000..21c3abe003ec --- /dev/null +++ b/kernel/sched/walt/fixup.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2016-2021, The Linux Foundation. All rights reserved. + */ + +#include + +#include "walt.h" + +unsigned int cpuinfo_max_freq_cached; + +char sched_lib_name[LIB_PATH_LENGTH]; +unsigned int sched_lib_mask_force; + +bool is_sched_lib_based_app(pid_t pid) +{ + const char *name = NULL; + char *libname, *lib_list; + struct vm_area_struct *vma; + char path_buf[LIB_PATH_LENGTH]; + char *tmp_lib_name; + bool found = false; + struct task_struct *p; + struct mm_struct *mm; + + if (strnlen(sched_lib_name, LIB_PATH_LENGTH) == 0) + return false; + + tmp_lib_name = kmalloc(LIB_PATH_LENGTH, GFP_KERNEL); + if (!tmp_lib_name) + return false; + + rcu_read_lock(); + + p = pid ? 
get_pid_task(find_vpid(pid), PIDTYPE_PID) : current; + if (!p) { + rcu_read_unlock(); + kfree(tmp_lib_name); + return false; + } + + /* Prevent p going away */ + get_task_struct(p); + rcu_read_unlock(); + + mm = get_task_mm(p); + if (!mm) + goto put_task_struct; + + down_read(&mm->mmap_lock); + for (vma = mm->mmap; vma ; vma = vma->vm_next) { + if (vma->vm_file && vma->vm_flags & VM_EXEC) { + name = d_path(&vma->vm_file->f_path, + path_buf, LIB_PATH_LENGTH); + if (IS_ERR(name)) + goto release_sem; + + strlcpy(tmp_lib_name, sched_lib_name, LIB_PATH_LENGTH); + lib_list = tmp_lib_name; + while ((libname = strsep(&lib_list, ","))) { + libname = skip_spaces(libname); + if (strnstr(name, libname, + strnlen(name, LIB_PATH_LENGTH))) { + found = true; + goto release_sem; + } + } + } + } + +release_sem: + up_read(&mm->mmap_lock); + mmput(mm); +put_task_struct: + put_task_struct(p); + kfree(tmp_lib_name); + return found; +} + +void android_vh_show_max_freq(void *unused, struct cpufreq_policy *policy, + unsigned int *max_freq) +{ + if (!cpuinfo_max_freq_cached) + return; + + if (!(BIT(policy->cpu) & sched_lib_mask_force)) + return; + + if (is_sched_lib_based_app(current->pid)) + *max_freq = cpuinfo_max_freq_cached << 1; +} diff --git a/kernel/sched/walt/input-boost.c b/kernel/sched/walt/input-boost.c new file mode 100644 index 000000000000..c20454535dab --- /dev/null +++ b/kernel/sched/walt/input-boost.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2013-2015,2017,2019-2021, The Linux Foundation. All rights reserved. + */ + +#define pr_fmt(fmt) "input-boost: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "walt.h" + +#define input_boost_attr_rw(_name) \ +static struct kobj_attribute _name##_attr = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +#define show_one(file_name) \ +static ssize_t show_##file_name \ +(struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ +{ \ + return scnprintf(buf, PAGE_SIZE, "%u\n", file_name); \ +} + +#define store_one(file_name) \ +static ssize_t store_##file_name \ +(struct kobject *kobj, struct kobj_attribute *attr, \ +const char *buf, size_t count) \ +{ \ + \ + sscanf(buf, "%u", &file_name); \ + return count; \ +} + +struct cpu_sync { + int cpu; + unsigned int input_boost_min; + unsigned int input_boost_freq; +}; + +static DEFINE_PER_CPU(struct cpu_sync, sync_info); +static struct workqueue_struct *input_boost_wq; + +static struct work_struct input_boost_work; + +static bool sched_boost_active; + +static struct delayed_work input_boost_rem; +static u64 last_input_time; +#define MIN_INPUT_INTERVAL (150 * USEC_PER_MSEC) + +static DEFINE_PER_CPU(struct freq_qos_request, qos_req); + +static void boost_adjust_notify(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct cpu_sync *s = &per_cpu(sync_info, cpu); + unsigned int ib_min = s->input_boost_min; + struct freq_qos_request *req = &per_cpu(qos_req, cpu); + int ret; + + pr_debug("CPU%u policy min before boost: %u kHz\n", + cpu, policy->min); + pr_debug("CPU%u boost min: %u kHz\n", cpu, ib_min); + + ret = freq_qos_update_request(req, ib_min); + + if (ret < 0) + pr_err("Failed to update freq constraint in boost_adjust: %d\n", + ib_min); + + pr_debug("CPU%u policy min after boost: %u kHz\n", cpu, policy->min); +} + +static void update_policy_online(void) +{ + unsigned int i; + struct cpufreq_policy *policy; + struct cpumask online_cpus; + + /* Re-evaluate policy to trigger adjust 
notifier for online CPUs */
+	get_online_cpus();
+	online_cpus = *cpu_online_mask;
+	for_each_cpu(i, &online_cpus) {
+		policy = cpufreq_cpu_get(i);
+		if (!policy) {
+			pr_err("%s: cpufreq policy not found for cpu%d\n",
+				__func__, i);
+			return;
+		}
+
+		cpumask_andnot(&online_cpus, &online_cpus,
+			       policy->related_cpus);
+		boost_adjust_notify(policy);
+	}
+	put_online_cpus();
+}
+
+static void do_input_boost_rem(struct work_struct *work)
+{
+	unsigned int i, ret;
+	struct cpu_sync *i_sync_info;
+
+	/* Reset the input_boost_min for all CPUs in the system */
+	pr_debug("Resetting input boost min for all CPUs\n");
+	for_each_possible_cpu(i) {
+		i_sync_info = &per_cpu(sync_info, i);
+		i_sync_info->input_boost_min = 0;
+	}
+
+	/* Update policies for all online CPUs */
+	update_policy_online();
+
+	if (sched_boost_active) {
+		ret = sched_set_boost(0);
+		if (ret)
+			pr_err("input-boost: sched boost disable failed\n");
+		sched_boost_active = false;
+	}
+}
+
+static void do_input_boost(struct work_struct *work)
+{
+	unsigned int i, ret;
+	struct cpu_sync *i_sync_info;
+
+	cancel_delayed_work_sync(&input_boost_rem);
+	if (sched_boost_active) {
+		sched_set_boost(0);
+		sched_boost_active = false;
+	}
+
+	/* Set the input_boost_min for all CPUs in the system */
+	pr_debug("Setting input boost min for all CPUs\n");
+	for (i = 0; i < 8; i++) {
+		i_sync_info = &per_cpu(sync_info, i);
+		i_sync_info->input_boost_min = sysctl_input_boost_freq[i];
+	}
+
+	/* Update policies for all online CPUs */
+	update_policy_online();
+
+	/* Enable scheduler boost to migrate tasks to big cluster */
+	if (sysctl_sched_boost_on_input > 0) {
+		ret = sched_set_boost(sysctl_sched_boost_on_input);
+		if (ret)
+			pr_err("input-boost: sched boost enable failed\n");
+		else
+			sched_boost_active = true;
+	}
+
+	queue_delayed_work(input_boost_wq, &input_boost_rem,
+			   msecs_to_jiffies(sysctl_input_boost_ms));
+}
+
+static void inputboost_input_event(struct input_handle *handle,
+		unsigned int type, unsigned int code, int value)
+{
+	u64 now;
+	int cpu;
+	int enabled = 0;
+
+	for_each_possible_cpu(cpu) {
+		if (sysctl_input_boost_freq[cpu] > 0) {
+			enabled = 1;
+			break;
+		}
+	}
+	if (!enabled)
+		return;
+
+	now = ktime_to_us(ktime_get());
+	if (now - last_input_time < MIN_INPUT_INTERVAL)
+		return;
+
+	if (work_pending(&input_boost_work))
+		return;
+
+	queue_work(input_boost_wq, &input_boost_work);
+	last_input_time = ktime_to_us(ktime_get());
+}
+
+static int inputboost_input_connect(struct input_handler *handler,
+		struct input_dev *dev, const struct input_device_id *id)
+{
+	struct input_handle *handle;
+	int error;
+
+	handle = kzalloc(sizeof(struct input_handle), GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	handle->dev = dev;
+	handle->handler = handler;
+	handle->name = "cpufreq";
+
+	error = input_register_handle(handle);
+	if (error)
+		goto err2;
+
+	error = input_open_device(handle);
+	if (error)
+		goto err1;
+
+	return 0;
+err1:
+	input_unregister_handle(handle);
+err2:
+	kfree(handle);
+	return error;
+}
+
+static void inputboost_input_disconnect(struct input_handle *handle)
+{
+	input_close_device(handle);
+	input_unregister_handle(handle);
+	kfree(handle);
+}
+
+static const struct input_device_id inputboost_ids[] = {
+	/* multi-touch touchscreen */
+	{
+		.flags = INPUT_DEVICE_ID_MATCH_EVBIT |
+			INPUT_DEVICE_ID_MATCH_ABSBIT,
+		.evbit = { BIT_MASK(EV_ABS) },
+		.absbit = { [BIT_WORD(ABS_MT_POSITION_X)] =
+			BIT_MASK(ABS_MT_POSITION_X) |
+			BIT_MASK(ABS_MT_POSITION_Y)
+		},
+	},
+	/* touchpad */
+	{
+		.flags = 
INPUT_DEVICE_ID_MATCH_KEYBIT | + INPUT_DEVICE_ID_MATCH_ABSBIT, + .keybit = { [BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH) }, + .absbit = { [BIT_WORD(ABS_X)] = + BIT_MASK(ABS_X) | BIT_MASK(ABS_Y) + }, + }, + /* Keypad */ + { + .flags = INPUT_DEVICE_ID_MATCH_EVBIT, + .evbit = { BIT_MASK(EV_KEY) }, + }, + { }, +}; + +static struct input_handler inputboost_input_handler = { + .event = inputboost_input_event, + .connect = inputboost_input_connect, + .disconnect = inputboost_input_disconnect, + .name = "input-boost", + .id_table = inputboost_ids, +}; + +struct kobject *input_boost_kobj; +int input_boost_init(void) +{ + int cpu, ret; + struct cpu_sync *s; + struct cpufreq_policy *policy; + struct freq_qos_request *req; + + input_boost_wq = alloc_workqueue("inputboost_wq", WQ_HIGHPRI, 0); + if (!input_boost_wq) + return -EFAULT; + + INIT_WORK(&input_boost_work, do_input_boost); + INIT_DELAYED_WORK(&input_boost_rem, do_input_boost_rem); + + for_each_possible_cpu(cpu) { + s = &per_cpu(sync_info, cpu); + s->cpu = cpu; + req = &per_cpu(qos_req, cpu); + policy = cpufreq_cpu_get(cpu); + if (!policy) { + pr_err("%s: cpufreq policy not found for cpu%d\n", + __func__, cpu); + return -ESRCH; + } + + ret = freq_qos_add_request(&policy->constraints, req, + FREQ_QOS_MIN, policy->min); + if (ret < 0) { + pr_err("%s: Failed to add freq constraint (%d)\n", + __func__, ret); + return ret; + } + } + + ret = input_register_handler(&inputboost_input_handler); + return 0; +} diff --git a/kernel/sched/walt/preemptirq_long.c b/kernel/sched/walt/preemptirq_long.c new file mode 100644 index 000000000000..45b77150bf0a --- /dev/null +++ b/kernel/sched/walt/preemptirq_long.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020-2021 The Linux Foundation. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#define CREATE_TRACE_POINTS +#include "preemptirq_long.h" + +#define IRQSOFF_SENTINEL 0x0fffDEAD + +static unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000; +static unsigned int sysctl_irqsoff_tracing_threshold_ns = 5000000; +static unsigned int sysctl_irqsoff_dmesg_output_enabled; +static unsigned int sysctl_irqsoff_crash_sentinel_value; +static unsigned int sysctl_irqsoff_crash_threshold_ns = 10000000; + +static unsigned int half_million = 500000; +static unsigned int one_hundred_million = 100000000; +static unsigned int one_million = 1000000; + +static DEFINE_PER_CPU(u64, irq_disabled_ts); + +/* + * preemption disable tracking require additional context + * to rule out false positives. see the comment in + * test_preempt_disable_long() for more details. + */ +struct preempt_store { + u64 ts; + int pid; + unsigned long ncsw; +}; +static DEFINE_PER_CPU(struct preempt_store, the_ps); + +static void note_irq_disable(void *u1, unsigned long u2, unsigned long u3) +{ + if (is_idle_task(current)) + return; + + /* + * We just have to note down the time stamp here. We + * use stacktrace trigger feature to print the stacktrace. 
+ */ + this_cpu_write(irq_disabled_ts, sched_clock()); +} + +static void test_irq_disable_long(void *u1, unsigned long u2, unsigned long u3) +{ + u64 ts = this_cpu_read(irq_disabled_ts); + + if (!ts) + return; + + this_cpu_write(irq_disabled_ts, 0); + ts = sched_clock() - ts; + + if (ts > sysctl_irqsoff_tracing_threshold_ns) { + trace_irq_disable_long(ts); + + if (sysctl_irqsoff_dmesg_output_enabled == IRQSOFF_SENTINEL) + printk_deferred("D=%llu C:(%ps<-%ps<-%ps<-%ps)\n", + ts, (void *)CALLER_ADDR2, + (void *)CALLER_ADDR3, + (void *)CALLER_ADDR4, + (void *)CALLER_ADDR5); + } + + if (sysctl_irqsoff_crash_sentinel_value == IRQSOFF_SENTINEL && + ts > sysctl_irqsoff_crash_threshold_ns) { + printk_deferred("delta=%llu(ns) > crash_threshold=%u(ns) Task=%s\n", + ts, sysctl_irqsoff_crash_threshold_ns, + current->comm); + BUG_ON(1); + } +} + +static void note_preempt_disable(void *u1, unsigned long u2, unsigned long u3) +{ + struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id()); + + ps->ts = sched_clock(); + ps->pid = current->pid; + ps->ncsw = current->nvcsw + current->nivcsw; +} + +static void test_preempt_disable_long(void *u1, unsigned long u2, + unsigned long u3) +{ + struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id()); + u64 delta = 0; + + if (!ps->ts) + return; + + /* + * schedule() calls __schedule() with preemption disabled. + * if we had entered idle and exiting idle now, we think + * preemption is disabled the whole time. Detect this by + * checking if the preemption is disabled across the same + * task. There is a possiblity that the same task is scheduled + * after idle. To rule out this possibility, compare the + * context switch count also. + */ + if (ps->pid == current->pid && (ps->ncsw == current->nvcsw + + current->nivcsw)) + delta = sched_clock() - ps->ts; + + ps->ts = 0; + if (delta > sysctl_preemptoff_tracing_threshold_ns) + trace_preempt_disable_long(delta); +} + +static struct ctl_table preemptirq_long_table[] = { + { + .procname = "preemptoff_tracing_threshold_ns", + .data = &sysctl_preemptoff_tracing_threshold_ns, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "irqsoff_tracing_threshold_ns", + .data = &sysctl_irqsoff_tracing_threshold_ns, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = &half_million, + .extra2 = &one_hundred_million, + }, + { + .procname = "irqsoff_dmesg_output_enabled", + .data = &sysctl_irqsoff_dmesg_output_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "irqsoff_crash_sentinel_value", + .data = &sysctl_irqsoff_crash_sentinel_value, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "irqsoff_crash_threshold_ns", + .data = &sysctl_irqsoff_crash_threshold_ns, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = &one_million, + .extra2 = &one_hundred_million, + }, + { } +}; + +int preemptirq_long_init(void) +{ + if (!register_sysctl("preemptirq", preemptirq_long_table)) { + pr_err("Fail to register sysctl table\n"); + return -EPERM; + } + + register_trace_android_rvh_irqs_disable(note_irq_disable, NULL); + register_trace_android_rvh_irqs_enable(test_irq_disable_long, NULL); + register_trace_android_rvh_preempt_disable(note_preempt_disable, NULL); + register_trace_android_rvh_preempt_enable(test_preempt_disable_long, + NULL); + + return 0; +} diff 
--git a/include/trace/events/preemptirq_long.h b/kernel/sched/walt/preemptirq_long.h similarity index 89% rename from include/trace/events/preemptirq_long.h rename to kernel/sched/walt/preemptirq_long.h index a486134efdb9..48ede42a0655 100644 --- a/include/trace/events/preemptirq_long.h +++ b/kernel/sched/walt/preemptirq_long.h @@ -1,13 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2020 The Linux Foundation. All rights reserved. + * Copyright (c) 2021 The Linux Foundation. All rights reserved. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM preemptirq_long #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH trace/events +#define TRACE_INCLUDE_PATH . #if !defined(_TRACE_PREEMPTIRQ_LONG_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PREEMPTIRQ_LONG_H diff --git a/kernel/sched/walt/qc_vas.c b/kernel/sched/walt/qc_vas.c new file mode 100644 index 000000000000..9db134f8363f --- /dev/null +++ b/kernel/sched/walt/qc_vas.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved. + */ + +#include +#include +#include +#include + +#include "walt.h" + +#ifdef CONFIG_HOTPLUG_CPU + +cpumask_t pending_active_mask = CPU_MASK_NONE; +int sched_pause_count(const cpumask_t *mask, bool include_offline) +{ + cpumask_t count_mask = CPU_MASK_NONE; + cpumask_t pause_mask = CPU_MASK_NONE; + + if (cpumask_any(&pending_active_mask) >= nr_cpu_ids) { + /* initialize pending_active_state */ + cpumask_copy(&pending_active_mask, cpu_active_mask); + } + + if (include_offline) { + + /* get all offline or paused cpus */ + cpumask_complement(&pause_mask, &pending_active_mask); + cpumask_complement(&count_mask, cpu_online_mask); + cpumask_or(&count_mask, &count_mask, &pause_mask); + + /* get all offline or paused cpus in this cluster */ + cpumask_and(&count_mask, &count_mask, mask); + } else { + cpumask_andnot(&count_mask, mask, &pending_active_mask); + } + + return cpumask_weight(&count_mask); +} + +void sched_pause_pending(int cpu) +{ + cpumask_clear_cpu(cpu, &pending_active_mask); +} + +void sched_unpause_pending(int cpu) +{ + cpumask_set_cpu(cpu, &pending_active_mask); +} + +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/sched/walt/sched_avg.c b/kernel/sched/walt/sched_avg.c new file mode 100644 index 000000000000..cec81ea98fba --- /dev/null +++ b/kernel/sched/walt/sched_avg.c @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2012, 2015-2021, The Linux Foundation. All rights reserved. + */ + +/* + * Scheduler hook for average runqueue determination + */ +#include +#include +#include +#include +#include + +#include "walt.h" +#include "trace.h" + +static DEFINE_PER_CPU(u64, nr_prod_sum); +static DEFINE_PER_CPU(u64, last_time); +static DEFINE_PER_CPU(u64, nr_big_prod_sum); +static DEFINE_PER_CPU(u64, nr); +static DEFINE_PER_CPU(u64, nr_max); + +static DEFINE_PER_CPU(spinlock_t, nr_lock) = __SPIN_LOCK_UNLOCKED(nr_lock); +static s64 last_get_time; + +static DEFINE_PER_CPU(atomic64_t, busy_hyst_end_time) = ATOMIC64_INIT(0); + +static DEFINE_PER_CPU(u64, hyst_time); +static DEFINE_PER_CPU(u64, coloc_hyst_busy); +static DEFINE_PER_CPU(u64, coloc_hyst_time); + +#define NR_THRESHOLD_PCT 15 +#define MAX_RTGB_TIME (sysctl_sched_coloc_busy_hyst_max_ms * NSEC_PER_MSEC) + +/** + * sched_get_nr_running_avg + * @return: Average nr_running, iowait and nr_big_tasks value since last poll. + * Returns the avg * 100 to return up to two decimal points + * of accuracy. 
+ * + * Obtains the average nr_running value since the last poll. + * This function may not be called concurrently with itself + */ +void sched_get_nr_running_avg(struct sched_avg_stats *stats) +{ + int cpu; + u64 curr_time = sched_clock(); + u64 period = curr_time - last_get_time; + u64 tmp_nr, tmp_misfit; + bool any_hyst_time = false; + + if (!period) + return; + + /* read and reset nr_running counts */ + for_each_possible_cpu(cpu) { + unsigned long flags; + u64 diff; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + + tmp_nr = per_cpu(nr_prod_sum, cpu); + tmp_nr += per_cpu(nr, cpu) * diff; + tmp_nr = div64_u64((tmp_nr * 100), period); + + tmp_misfit = per_cpu(nr_big_prod_sum, cpu); + tmp_misfit += walt_big_tasks(cpu) * diff; + tmp_misfit = div64_u64((tmp_misfit * 100), period); + + /* + * NR_THRESHOLD_PCT is to make sure that the task ran + * at least 85% in the last window to compensate any + * over estimating being done. + */ + stats[cpu].nr = (int)div64_u64((tmp_nr + NR_THRESHOLD_PCT), + 100); + stats[cpu].nr_misfit = (int)div64_u64((tmp_misfit + + NR_THRESHOLD_PCT), 100); + stats[cpu].nr_max = per_cpu(nr_max, cpu); + stats[cpu].nr_scaled = tmp_nr; + + trace_sched_get_nr_running_avg(cpu, stats[cpu].nr, + stats[cpu].nr_misfit, stats[cpu].nr_max, + stats[cpu].nr_scaled); + + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr_prod_sum, cpu) = 0; + per_cpu(nr_big_prod_sum, cpu) = 0; + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); + } + + for_each_possible_cpu(cpu) { + if (per_cpu(coloc_hyst_time, cpu)) { + any_hyst_time = true; + break; + } + } + if (any_hyst_time && get_rtgb_active_time() >= MAX_RTGB_TIME) + sched_update_hyst_times(); + + last_get_time = curr_time; + +} +EXPORT_SYMBOL(sched_get_nr_running_avg); + +void sched_update_hyst_times(void) +{ + bool rtgb_active; + int cpu; + unsigned long cpu_cap, coloc_busy_pct; + + rtgb_active = is_rtgb_active() && (sched_boost() != CONSERVATIVE_BOOST) + && (get_rtgb_active_time() < MAX_RTGB_TIME); + + for_each_possible_cpu(cpu) { + cpu_cap = arch_scale_cpu_capacity(cpu); + coloc_busy_pct = sysctl_sched_coloc_busy_hyst_cpu_busy_pct[cpu]; + per_cpu(hyst_time, cpu) = (BIT(cpu) + & sysctl_sched_busy_hyst_enable_cpus) ? + sysctl_sched_busy_hyst : 0; + per_cpu(coloc_hyst_time, cpu) = ((BIT(cpu) + & sysctl_sched_coloc_busy_hyst_enable_cpus) + && rtgb_active) ? + sysctl_sched_coloc_busy_hyst_cpu[cpu] : 0; + per_cpu(coloc_hyst_busy, cpu) = mult_frac(cpu_cap, + coloc_busy_pct, 100); + } +} + +#define BUSY_NR_RUN 3 +#define BUSY_LOAD_FACTOR 10 +static inline void update_busy_hyst_end_time(int cpu, bool dequeue, + unsigned long prev_nr_run, u64 curr_time) +{ + bool nr_run_trigger = false; + bool load_trigger = false, coloc_load_trigger = false; + u64 agg_hyst_time; + + if (!per_cpu(hyst_time, cpu) && !per_cpu(coloc_hyst_time, cpu)) + return; + + if (prev_nr_run >= BUSY_NR_RUN && per_cpu(nr, cpu) < BUSY_NR_RUN) + nr_run_trigger = true; + + if (dequeue && (cpu_util(cpu) * BUSY_LOAD_FACTOR) > + capacity_orig_of(cpu)) + load_trigger = true; + + if (dequeue && cpu_util(cpu) > per_cpu(coloc_hyst_busy, cpu)) + coloc_load_trigger = true; + + agg_hyst_time = max((nr_run_trigger || load_trigger) ? + per_cpu(hyst_time, cpu) : 0, + (nr_run_trigger || coloc_load_trigger) ? 
+ per_cpu(coloc_hyst_time, cpu) : 0); + + if (agg_hyst_time) + atomic64_set(&per_cpu(busy_hyst_end_time, cpu), + curr_time + agg_hyst_time); +} + +int sched_busy_hyst_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + if (table->maxlen > (sizeof(unsigned int) * num_possible_cpus())) + table->maxlen = sizeof(unsigned int) * num_possible_cpus(); + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write) + sched_update_hyst_times(); + + return ret; +} + +/** + * sched_update_nr_prod + * @cpu: The core id of the nr running driver. + * @enq: enqueue/dequeue happening on this CPU. + * @return: N/A + * + * Update average with latest nr_running value for CPU + */ +void sched_update_nr_prod(int cpu, bool enq) +{ + u64 diff; + u64 curr_time; + unsigned long flags, nr_running; + + spin_lock_irqsave(&per_cpu(nr_lock, cpu), flags); + nr_running = per_cpu(nr, cpu); + curr_time = sched_clock(); + diff = curr_time - per_cpu(last_time, cpu); + BUG_ON((s64)diff < 0); + per_cpu(last_time, cpu) = curr_time; + per_cpu(nr, cpu) = cpu_rq(cpu)->nr_running; + + if (per_cpu(nr, cpu) > per_cpu(nr_max, cpu)) + per_cpu(nr_max, cpu) = per_cpu(nr, cpu); + + update_busy_hyst_end_time(cpu, !enq, nr_running, curr_time); + + per_cpu(nr_prod_sum, cpu) += nr_running * diff; + per_cpu(nr_big_prod_sum, cpu) += walt_big_tasks(cpu) * diff; + spin_unlock_irqrestore(&per_cpu(nr_lock, cpu), flags); +} + +/* + * Returns the CPU utilization % in the last window. + */ +unsigned int sched_get_cpu_util(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 util; + unsigned long capacity, flags; + unsigned int busy; + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + raw_spin_lock_irqsave(&rq->lock, flags); + + capacity = capacity_orig_of(cpu); + + util = wrq->prev_runnable_sum + wrq->grp_time.prev_runnable_sum; + util = div64_u64(util, sched_ravg_window >> SCHED_CAPACITY_SHIFT); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + util = (util >= capacity) ? capacity : util; + busy = div64_ul((util * 100), capacity); + return busy; +} + +u64 sched_lpm_disallowed_time(int cpu) +{ + u64 now = sched_clock(); + u64 bias_end_time = atomic64_read(&per_cpu(busy_hyst_end_time, cpu)); + + if (now < bias_end_time) + return bias_end_time - now; + + return 0; +} +EXPORT_SYMBOL(sched_lpm_disallowed_time); diff --git a/kernel/sched/walt/sysctl.c b/kernel/sched/walt/sysctl.c new file mode 100644 index 000000000000..6dd40e05cf13 --- /dev/null +++ b/kernel/sched/walt/sysctl.c @@ -0,0 +1,900 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. + */ + +#include "walt.h" + +static int neg_three = -3; +static int three = 3; +static int two_hundred_fifty_five = 255; +static unsigned int ns_per_sec = NSEC_PER_SEC; +static unsigned int one_hundred_thousand = 100000; +static unsigned int two_hundred_million = 200000000; +static int __maybe_unused two = 2; +static int __maybe_unused four = 4; +static int one_hundred = 100; +static int one_thousand = 1000; + +/* + * CFS task prio range is [100 ... 139] + * 120 is the default prio. + * RTG boost range is [100 ... 119] because giving + * boost for [120 .. 139] does not make sense. + * 99 means disabled and it is the default value. 
+ */ +static unsigned int min_cfs_boost_prio = 99; +static unsigned int max_cfs_boost_prio = 119; + +unsigned int sysctl_sched_capacity_margin_up_pct[MAX_MARGIN_LEVELS]; +unsigned int sysctl_sched_capacity_margin_dn_pct[MAX_MARGIN_LEVELS]; +unsigned int sysctl_sched_busy_hyst_enable_cpus; +unsigned int sysctl_sched_busy_hyst; +unsigned int sysctl_sched_coloc_busy_hyst_enable_cpus; +unsigned int sysctl_sched_coloc_busy_hyst_cpu[WALT_NR_CPUS]; +unsigned int sysctl_sched_coloc_busy_hyst_max_ms; +unsigned int sysctl_sched_coloc_busy_hyst_cpu_busy_pct[WALT_NR_CPUS]; +unsigned int sysctl_sched_boost; +unsigned int sysctl_sched_wake_up_idle[2]; +unsigned int sysctl_input_boost_ms; +unsigned int sysctl_input_boost_freq[8]; +unsigned int sysctl_sched_boost_on_input; +unsigned int sysctl_sched_init_stage; +unsigned int sysctl_sched_load_boost[WALT_NR_CPUS]; + +/* sysctl nodes accesed by other files */ +unsigned int __read_mostly sysctl_sched_coloc_downmigrate_ns; +unsigned int __read_mostly sysctl_sched_group_downmigrate_pct; +unsigned int __read_mostly sysctl_sched_group_upmigrate_pct; +unsigned int __read_mostly sysctl_sched_window_stats_policy; +unsigned int sysctl_sched_ravg_window_nr_ticks; +unsigned int sysctl_sched_dynamic_ravg_window_enable; +unsigned int sysctl_sched_walt_rotate_big_tasks; +unsigned int sysctl_sched_task_unfilter_period; +unsigned int __read_mostly sysctl_sched_asym_cap_sibling_freq_match_pct; +unsigned int sysctl_walt_low_latency_task_threshold; /* disabled by default */ +unsigned int sysctl_task_read_pid; +unsigned int sysctl_sched_conservative_pl; +unsigned int sysctl_sched_min_task_util_for_boost = 51; +unsigned int sysctl_sched_min_task_util_for_colocation = 35; +unsigned int sysctl_sched_many_wakeup_threshold = WALT_MANY_WAKEUP_DEFAULT; +const int sched_user_hint_max = 1000; + +static void init_tg_pointers(void) +{ + struct cgroup_subsys_state *css = &root_task_group.css; + struct cgroup_subsys_state *top_css = css; + + /* ptrs are already initialized */ + if (task_group_topapp) + return; + + css_for_each_child(css, top_css) { + if (!strcmp(css->cgroup->kn->name, "top-app")) { + task_group_topapp = css_tg(css); + walt_init_topapp_tg(task_group_topapp); + } else if (!strcmp(css->cgroup->kn->name, "foreground")) { + task_group_foreground = css_tg(css); + walt_init_foreground_tg(task_group_foreground); + } else { + walt_init_tg(css_tg(css)); + } + } +} + +static int walt_init_stage_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + int old_value = sysctl_sched_init_stage; + + mutex_lock(&mutex); + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto unlock; + + if (sysctl_sched_init_stage == 1 && + old_value != sysctl_sched_init_stage) { + init_tg_pointers(); + } + +unlock: + mutex_unlock(&mutex); + return ret; +} + +static int walt_proc_group_thresholds_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + static DEFINE_MUTEX(mutex); + struct rq *rq = cpu_rq(cpumask_first(cpu_possible_mask)); + unsigned long flags; + + if (unlikely(num_sched_clusters <= 0)) + return -EPERM; + + mutex_lock(&mutex); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) { + mutex_unlock(&mutex); + return ret; + } + + /* + * The load scale factor update happens with all + * rqs locked. 
so acquiring 1 CPU rq lock and + * updating the thresholds is sufficient for + * an atomic update. + */ + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_group_thresholds(); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + mutex_unlock(&mutex); + + return ret; +} + +static int walt_proc_user_hint_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + unsigned int old_value; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + + sched_user_hint_reset_time = jiffies + HZ; + old_value = sysctl_sched_user_hint; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write || (old_value == sysctl_sched_user_hint)) + goto unlock; + + walt_irq_work_queue(&walt_migration_irq_work); + +unlock: + mutex_unlock(&mutex); + return ret; +} + +static int sched_ravg_window_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = -EPERM; + static DEFINE_MUTEX(mutex); + int val = sysctl_sched_ravg_window_nr_ticks; + + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(val), + .mode = table->mode, + }; + + mutex_lock(&mutex); + + if (write && (HZ != 250 || !sysctl_sched_dynamic_ravg_window_enable)) + goto unlock; + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (ret || !write || (val == sysctl_sched_ravg_window_nr_ticks)) + goto unlock; + + if (val != 2 && val != 3 && val != 4 && val != 5 && val != 8) { + ret = -EINVAL; + goto unlock; + } + + sysctl_sched_ravg_window_nr_ticks = val; + sched_window_nr_ticks_change(); + +unlock: + mutex_unlock(&mutex); + return ret; +} + +enum { + TASK_BEGIN = 0, + WAKE_UP_IDLE, + INIT_TASK_LOAD, + GROUP_ID, + PER_TASK_BOOST, + PER_TASK_BOOST_PERIOD_MS, + LOW_LATENCY, +}; + +static int sched_task_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, param; + struct task_struct *task; + int pid_and_val[2] = {-1, -1}; + int val; + struct walt_task_struct *wts; + + struct ctl_table tmp = { + .data = &pid_and_val, + .maxlen = sizeof(pid_and_val), + .mode = table->mode, + }; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + + if (!write) { + if (sysctl_task_read_pid <= 0) { + ret = -ENOENT; + goto unlock_mutex; + } + task = get_pid_task(find_vpid(sysctl_task_read_pid), + PIDTYPE_PID); + if (!task) { + ret = -ENOENT; + goto put_task; + } + wts = (struct walt_task_struct *) task->android_vendor_data1; + pid_and_val[0] = sysctl_task_read_pid; + param = (unsigned long)table->data; + switch (param) { + case WAKE_UP_IDLE: + pid_and_val[1] = wts->wake_up_idle; + break; + case INIT_TASK_LOAD: + pid_and_val[1] = wts->init_load_pct; + break; + case GROUP_ID: + pid_and_val[1] = sched_get_group_id(task); + break; + case PER_TASK_BOOST: + pid_and_val[1] = wts->boost; + break; + case PER_TASK_BOOST_PERIOD_MS: + pid_and_val[1] = + div64_ul(wts->boost_period, + 1000000UL); + break; + case LOW_LATENCY: + pid_and_val[1] = wts->low_latency; + break; + default: + ret = -EINVAL; + goto put_task; + } + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + goto put_task; + } + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (ret) + goto unlock_mutex; + + if (pid_and_val[0] <= 0 || pid_and_val[1] < 0) { + ret = -ENOENT; + goto unlock_mutex; + } + + /* parsed the values successfully in pid_and_val[] array */ + task = get_pid_task(find_vpid(pid_and_val[0]), PIDTYPE_PID); + if (!task) { + ret = -ENOENT; + goto unlock_mutex; + } + wts = (struct walt_task_struct *) 
task->android_vendor_data1;
+	param = (unsigned long)table->data;
+	val = pid_and_val[1];
+	switch (param) {
+	case WAKE_UP_IDLE:
+		wts->wake_up_idle = val;
+		break;
+	case INIT_TASK_LOAD:
+		if (pid_and_val[1] < 0 || pid_and_val[1] > 100) {
+			ret = -EINVAL;
+			goto put_task;
+		}
+		wts->init_load_pct = val;
+		break;
+	case GROUP_ID:
+		ret = sched_set_group_id(task, val);
+		break;
+	case PER_TASK_BOOST:
+		if (val < TASK_BOOST_NONE || val >= TASK_BOOST_END) {
+			ret = -EINVAL;
+			goto put_task;
+		}
+		wts->boost = val;
+		if (val == 0)
+			wts->boost_period = 0;
+		break;
+	case PER_TASK_BOOST_PERIOD_MS:
+		if (wts->boost == 0 && val) {
+			/* setting boost period w/o boost is invalid */
+			ret = -EINVAL;
+			goto put_task;
+		}
+		wts->boost_period = (u64)val * 1000 * 1000;
+		wts->boost_expires = sched_clock() + wts->boost_period;
+		break;
+	case LOW_LATENCY:
+		wts->low_latency = val;
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+put_task:
+	put_task_struct(task);
+unlock_mutex:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
+static int sched_load_boost_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret, i;
+	unsigned int *data = (unsigned int *)table->data;
+	int val[WALT_NR_CPUS];
+
+	struct ctl_table tmp = {
+		.data = &val,
+		.maxlen = sizeof(val),
+		.mode = table->mode,
+	};
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+
+	if (!write) {
+		ret = proc_dointvec(table, write, buffer, lenp, ppos);
+		goto unlock_mutex;
+	}
+
+	ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (ret)
+		goto unlock_mutex;
+
+	for (i = 0; i < WALT_NR_CPUS; i++) {
+		if (val[i] < -100 || val[i] > 1000) {
+			ret = -EINVAL;
+			goto unlock_mutex;
+		}
+	}
+
+	/* all values check out; update the data */
+	for (i = 0; i < WALT_NR_CPUS; i++)
+		data[i] = val[i];
+
+unlock_mutex:
+	mutex_unlock(&mutex);
+
+	return ret;
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+static void sched_update_updown_migrate_values(bool up)
+{
+	int i = 0, cpu;
+	struct walt_sched_cluster *cluster;
+	int cap_margin_levels = num_sched_clusters - 1;
+
+	if (cap_margin_levels > 1) {
+		/*
+		 * No need to worry about CPUs in last cluster
+		 * if there are more than 2 clusters in the system
+		 */
+		for_each_sched_cluster(cluster) {
+			for_each_cpu(cpu, &cluster->cpus) {
+				if (up)
+					sched_capacity_margin_up[cpu] =
+					SCHED_FIXEDPOINT_SCALE * 100 /
+					sysctl_sched_capacity_margin_up_pct[i];
+				else
+					sched_capacity_margin_down[cpu] =
+					SCHED_FIXEDPOINT_SCALE * 100 /
+					sysctl_sched_capacity_margin_dn_pct[i];
+			}
+
+			if (++i >= cap_margin_levels)
+				break;
+		}
+	} else {
+		for_each_possible_cpu(cpu) {
+			if (up)
+				sched_capacity_margin_up[cpu] =
+				SCHED_FIXEDPOINT_SCALE * 100 /
+				sysctl_sched_capacity_margin_up_pct[0];
+			else
+				sched_capacity_margin_down[cpu] =
+				SCHED_FIXEDPOINT_SCALE * 100 /
+				sysctl_sched_capacity_margin_dn_pct[0];
+		}
+	}
+}
+
+int sched_updown_migrate_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret, i;
+	unsigned int *data = (unsigned int *)table->data;
+	static DEFINE_MUTEX(mutex);
+	int cap_margin_levels = num_sched_clusters ? 
num_sched_clusters - 1 : 0; + int val[MAX_MARGIN_LEVELS]; + struct ctl_table tmp = { + .data = &val, + .maxlen = sizeof(int) * cap_margin_levels, + .mode = table->mode, + }; + + if (cap_margin_levels <= 0) + return -EINVAL; + + mutex_lock(&mutex); + + if (!write) { + ret = proc_dointvec(table, write, buffer, lenp, ppos); + goto unlock_mutex; + } + + ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); + if (ret) + goto unlock_mutex; + + /* check if valid pct values are passed in */ + for (i = 0; i < cap_margin_levels; i++) { + if (val[i] <= 0 || val[i] > 100) { + ret = -EINVAL; + goto unlock_mutex; + } + } + + /* check up pct is greater than dn pct */ + if (data == &sysctl_sched_capacity_margin_up_pct[0]) { + for (i = 0; i < cap_margin_levels; i++) { + if (val[i] < sysctl_sched_capacity_margin_dn_pct[i]) { + ret = -EINVAL; + goto unlock_mutex; + } + } + } else { + for (i = 0; i < cap_margin_levels; i++) { + if (sysctl_sched_capacity_margin_up_pct[i] < val[i]) { + ret = -EINVAL; + goto unlock_mutex; + } + } + } + + /* all things checkout update the value */ + for (i = 0; i < cap_margin_levels; i++) + data[i] = val[i]; + + /* update individual cpu thresholds */ + sched_update_updown_migrate_values(data == &sysctl_sched_capacity_margin_up_pct[0]); + +unlock_mutex: + mutex_unlock(&mutex); + + return ret; +} +#endif /* CONFIG_PROC_SYSCTL */ + +struct ctl_table input_boost_sysctls[] = { + { + .procname = "input_boost_ms", + .data = &sysctl_input_boost_ms, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred_thousand, + }, + { + .procname = "input_boost_freq", + .data = &sysctl_input_boost_freq, + .maxlen = sizeof(unsigned int) * 8, + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { + .procname = "sched_boost_on_input", + .data = &sysctl_sched_boost_on_input, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { } +}; + +struct ctl_table walt_table[] = { + { + .procname = "sched_init_stage", + .data = &sysctl_sched_init_stage, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = walt_init_stage_handler, + }, + { + .procname = "sched_user_hint", + .data = &sysctl_sched_user_hint, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = walt_proc_user_hint_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&sched_user_hint_max, + }, + { + .procname = "sched_window_stats_policy", + .data = &sysctl_sched_window_stats_policy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &four, + }, + { + .procname = "sched_group_upmigrate", + .data = &sysctl_sched_group_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = walt_proc_group_thresholds_handler, + .extra1 = &sysctl_sched_group_downmigrate_pct, + }, + { + .procname = "sched_group_downmigrate", + .data = &sysctl_sched_group_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = walt_proc_group_thresholds_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_sched_group_upmigrate_pct, + }, + { + .procname = "sched_boost", + .data = &sysctl_sched_boost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_boost_handler, + .extra1 = &neg_three, + .extra2 = &three, + }, + { + .procname = "sched_conservative_pl", + .data = 
&sysctl_sched_conservative_pl, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_many_wakeup_threshold", + .data = &sysctl_sched_many_wakeup_threshold, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &two, + .extra2 = &one_thousand, + }, + { + .procname = "sched_walt_rotate_big_tasks", + .data = &sysctl_sched_walt_rotate_big_tasks, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_min_task_util_for_boost", + .data = &sysctl_sched_min_task_util_for_boost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "sched_min_task_util_for_colocation", + .data = &sysctl_sched_min_task_util_for_colocation, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "sched_asym_cap_sibling_freq_match_pct", + .data = &sysctl_sched_asym_cap_sibling_freq_match_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &one_hundred, + }, + { + .procname = "sched_coloc_downmigrate_ns", + .data = &sysctl_sched_coloc_downmigrate_ns, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, + { + .procname = "sched_task_unfilter_period", + .data = &sysctl_sched_task_unfilter_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &two_hundred_million, + }, + { + .procname = "sched_busy_hysteresis_enable_cpus", + .data = &sysctl_sched_busy_hyst_enable_cpus, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_busy_hyst_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &two_hundred_fifty_five, + }, + { + .procname = "sched_busy_hyst_ns", + .data = &sysctl_sched_busy_hyst, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_busy_hyst_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &ns_per_sec, + }, + { + .procname = "sched_coloc_busy_hysteresis_enable_cpus", + .data = &sysctl_sched_coloc_busy_hyst_enable_cpus, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_busy_hyst_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &two_hundred_fifty_five, + }, + { + .procname = "sched_coloc_busy_hyst_cpu_ns", + .data = &sysctl_sched_coloc_busy_hyst_cpu, + .maxlen = sizeof(unsigned int) * WALT_NR_CPUS, + .mode = 0644, + .proc_handler = sched_busy_hyst_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &ns_per_sec, + }, + { + .procname = "sched_coloc_busy_hyst_max_ms", + .data = &sysctl_sched_coloc_busy_hyst_max_ms, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_busy_hyst_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred_thousand, + }, + { + .procname = "sched_coloc_busy_hyst_cpu_busy_pct", + .data = &sysctl_sched_coloc_busy_hyst_cpu_busy_pct, + .maxlen = sizeof(unsigned int) * WALT_NR_CPUS, + .mode = 0644, + .proc_handler = sched_busy_hyst_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, + { + .procname = "sched_ravg_window_nr_ticks", + .data = &sysctl_sched_ravg_window_nr_ticks, + .maxlen = sizeof(unsigned int), + .mode = 0644, + 
.proc_handler = sched_ravg_window_handler, + }, + { + .procname = "sched_dynamic_ravg_window_enable", + .data = &sysctl_sched_dynamic_ravg_window_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_upmigrate", + .data = &sysctl_sched_capacity_margin_up_pct, + .maxlen = sizeof(unsigned int) * MAX_MARGIN_LEVELS, + .mode = 0644, + .proc_handler = sched_updown_migrate_handler, + }, + { + .procname = "sched_downmigrate", + .data = &sysctl_sched_capacity_margin_dn_pct, + .maxlen = sizeof(unsigned int) * MAX_MARGIN_LEVELS, + .mode = 0644, + .proc_handler = sched_updown_migrate_handler, + }, + { + .procname = "sched_prefer_spread", + .data = &sysctl_sched_prefer_spread, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &four, + }, + { + .procname = "walt_rtg_cfs_boost_prio", + .data = &sysctl_walt_rtg_cfs_boost_prio, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_cfs_boost_prio, + .extra2 = &max_cfs_boost_prio, + }, + { + .procname = "walt_low_latency_task_threshold", + .data = &sysctl_walt_low_latency_task_threshold, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "sched_force_lb_enable", + .data = &sysctl_sched_force_lb_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_lib_name", + .data = sched_lib_name, + .maxlen = LIB_PATH_LENGTH, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "sched_lib_mask_force", + .data = &sched_lib_mask_force, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &two_hundred_fifty_five, + }, + { + .procname = "input_boost", + .mode = 0555, + .child = input_boost_sysctls, + }, + { + .procname = "sched_wake_up_idle", + .data = (int *) WAKE_UP_IDLE, + .maxlen = sizeof(unsigned int) * 2, + .mode = 0644, + .proc_handler = sched_task_handler, + }, + { + .procname = "sched_init_task_load", + .data = (int *) INIT_TASK_LOAD, + .maxlen = sizeof(unsigned int) * 2, + .mode = 0644, + .proc_handler = sched_task_handler, + }, + { + .procname = "sched_group_id", + .data = (int *) GROUP_ID, + .maxlen = sizeof(unsigned int) * 2, + .mode = 0644, + .proc_handler = sched_task_handler, + }, + { + .procname = "sched_per_task_boost", + .data = (int *) PER_TASK_BOOST, + .maxlen = sizeof(unsigned int) * 2, + .mode = 0644, + .proc_handler = sched_task_handler, + }, + { + .procname = "sched_per_task_boost_period_ms", + .data = (int *) PER_TASK_BOOST_PERIOD_MS, + .maxlen = sizeof(unsigned int) * 2, + .mode = 0644, + .proc_handler = sched_task_handler, + }, + { + .procname = "sched_low_latency", + .data = (int *) LOW_LATENCY, + .maxlen = sizeof(unsigned int) * 2, + .mode = 0644, + .proc_handler = sched_task_handler, + }, + { + .procname = "sched_task_read_pid", + .data = &sysctl_task_read_pid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_load_boost", + .data = &sysctl_sched_load_boost, + .maxlen = sizeof(unsigned int) * 8, + .mode = 0644, + .proc_handler = sched_load_boost_handler, + }, + { } +}; + +struct ctl_table walt_base_table[] = { 
+ { + .procname = "walt", + .mode = 0555, + .child = walt_table, + }, + { }, +}; + +void walt_tunables(void) +{ + int i; + + for (i = 0; i < MAX_MARGIN_LEVELS; i++) { + sysctl_sched_capacity_margin_up_pct[i] = 95; /* ~5% margin */ + sysctl_sched_capacity_margin_dn_pct[i] = 85; /* ~15% margin */ + } + + sysctl_sched_group_upmigrate_pct = 100; + + sysctl_sched_group_downmigrate_pct = 95; + + sysctl_sched_asym_cap_sibling_freq_match_pct = 100; + + sysctl_sched_task_unfilter_period = 100000000; + + sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG; + + sysctl_sched_ravg_window_nr_ticks = (HZ / NR_WINDOWS_PER_SEC); + + sysctl_sched_dynamic_ravg_window_enable = (HZ == 250); + + sched_load_granule = DEFAULT_SCHED_RAVG_WINDOW / NUM_LOAD_INDICES; + + sysctl_sched_min_task_util_for_boost = 51; + + sysctl_sched_min_task_util_for_colocation = 35; + + for (i = 0; i < WALT_NR_CPUS; i++) { + sysctl_sched_coloc_busy_hyst_cpu[i] = 39000000; + sysctl_sched_coloc_busy_hyst_cpu_busy_pct[i] = 10; + } + + sysctl_sched_coloc_busy_hyst_enable_cpus = 112; + + sysctl_sched_coloc_busy_hyst_max_ms = 5000; + + sysctl_walt_rtg_cfs_boost_prio = 99; /* disabled by default */ + + sched_ravg_window = DEFAULT_SCHED_RAVG_WINDOW; + + sysctl_input_boost_ms = 40; + + for (i = 0; i < 8; i++) + sysctl_input_boost_freq[i] = 0; +} diff --git a/kernel/sched/walt/trace.c b/kernel/sched/walt/trace.c new file mode 100644 index 000000000000..cf9c34077f2c --- /dev/null +++ b/kernel/sched/walt/trace.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved. + */ + +#include "walt.h" + +static inline void __window_data(u32 *dst, u32 *src) +{ + if (src) + memcpy(dst, src, nr_cpu_ids * sizeof(u32)); + else + memset(dst, 0, nr_cpu_ids * sizeof(u32)); +} + +struct trace_seq; +const char *__window_print(struct trace_seq *p, const u32 *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + seq_buf_used(&p->seq); + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%u ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} + +static inline s64 __rq_update_sum(struct rq *rq, bool curr, bool new) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + if (curr) + if (new) + return wrq->nt_curr_runnable_sum; + else + return wrq->curr_runnable_sum; + else + if (new) + return wrq->nt_prev_runnable_sum; + else + return wrq->prev_runnable_sum; +} + +static inline s64 __grp_update_sum(struct rq *rq, bool curr, bool new) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + if (curr) + if (new) + return wrq->grp_time.nt_curr_runnable_sum; + else + return wrq->grp_time.curr_runnable_sum; + else + if (new) + return wrq->grp_time.nt_prev_runnable_sum; + else + return wrq->grp_time.prev_runnable_sum; +} + +static inline s64 +__get_update_sum(struct rq *rq, enum migrate_types migrate_type, + bool src, bool new, bool curr) +{ + switch (migrate_type) { + case RQ_TO_GROUP: + if (src) + return __rq_update_sum(rq, curr, new); + else + return __grp_update_sum(rq, curr, new); + case GROUP_TO_RQ: + if (src) + return __grp_update_sum(rq, curr, new); + else + return __rq_update_sum(rq, curr, new); + default: + WARN_ON_ONCE(1); + return -EINVAL; + } +} + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/kernel/sched/walt/trace.h b/kernel/sched/walt/trace.h new file mode 100644 index 000000000000..75be13d809bf --- /dev/null +++ b/kernel/sched/walt/trace.h @@ -0,0 +1,1097 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ 
+/* + * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM schedwalt + +#if !defined(_TRACE_WALT_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WALT_H + +#include + +#include "walt.h" + +struct rq; +struct group_cpu_time; +struct walt_task_struct; +struct walt_rq; +struct walt_related_thread_group; + +extern const char *task_event_names[]; + +TRACE_EVENT(sched_update_pred_demand, + + TP_PROTO(struct task_struct *p, u32 runtime, int pct, + unsigned int pred_demand, struct walt_task_struct *wts), + + TP_ARGS(p, runtime, pct, pred_demand, wts), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned int, runtime) + __field(int, pct) + __field(unsigned int, pred_demand) + __array(u8, bucket, NUM_BUSY_BUCKETS) + __field(int, cpu) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->pct = pct; + __entry->pred_demand = pred_demand; + memcpy(__entry->bucket, wts->busy_buckets, + NUM_BUSY_BUCKETS * sizeof(u8)); + __entry->cpu = task_cpu(p); + ), + + TP_printk("%d (%s): runtime %u pct %d cpu %d pred_demand %u (buckets: %u %u %u %u %u %u %u %u %u %u)", + __entry->pid, __entry->comm, + __entry->runtime, __entry->pct, __entry->cpu, + __entry->pred_demand, __entry->bucket[0], __entry->bucket[1], + __entry->bucket[2], __entry->bucket[3], __entry->bucket[4], + __entry->bucket[5], __entry->bucket[6], __entry->bucket[7], + __entry->bucket[8], __entry->bucket[9]) +); + +TRACE_EVENT(sched_update_history, + + TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples, + enum task_event evt, struct walt_rq *wrq, struct walt_task_struct *wts), + + TP_ARGS(rq, p, runtime, samples, evt, wrq, wts), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned int, runtime) + __field(int, samples) + __field(enum task_event, evt) + __field(unsigned int, demand) + __field(unsigned int, coloc_demand) + __field(unsigned int, pred_demand) + __array(u32, hist, RAVG_HIST_SIZE_MAX) + __field(unsigned int, nr_big_tasks) + __field(int, cpu) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->runtime = runtime; + __entry->samples = samples; + __entry->evt = evt; + __entry->demand = wts->demand; + __entry->coloc_demand = wts->coloc_demand; + __entry->pred_demand = wts->pred_demand; + memcpy(__entry->hist, wts->sum_history, + RAVG_HIST_SIZE_MAX * sizeof(u32)); + __entry->nr_big_tasks = wrq->walt_stats.nr_big_tasks; + __entry->cpu = rq->cpu; + ), + + TP_printk("%d (%s): runtime %u samples %d event %s demand %u coloc_demand %u pred_demand %u (hist: %u %u %u %u %u) cpu %d nr_big %u", + __entry->pid, __entry->comm, + __entry->runtime, __entry->samples, + task_event_names[__entry->evt], + __entry->demand, __entry->coloc_demand, __entry->pred_demand, + __entry->hist[0], __entry->hist[1], + __entry->hist[2], __entry->hist[3], + __entry->hist[4], __entry->cpu, __entry->nr_big_tasks) +); + +TRACE_EVENT(sched_get_task_cpu_cycles, + + TP_PROTO(int cpu, int event, u64 cycles, + u64 exec_time, struct task_struct *p), + + TP_ARGS(cpu, event, cycles, exec_time, p), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, event) + __field(u64, cycles) + __field(u64, exec_time) + __field(u32, freq) + __field(u32, legacy_freq) + __field(u32, max_freq) + __field(pid_t, pid) + __array(char, comm, TASK_COMM_LEN) + ), + + TP_fast_assign( + 
__entry->cpu = cpu; + __entry->event = event; + __entry->cycles = cycles; + __entry->exec_time = exec_time; + __entry->freq = cpu_cycles_to_freq(cycles, exec_time); + __entry->legacy_freq = sched_cpu_legacy_freq(cpu); + __entry->max_freq = cpu_max_freq(cpu); + __entry->pid = p->pid; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + ), + + TP_printk("cpu=%d event=%d cycles=%llu exec_time=%llu freq=%u legacy_freq=%u max_freq=%u task=%d (%s)", + __entry->cpu, __entry->event, __entry->cycles, + __entry->exec_time, __entry->freq, __entry->legacy_freq, + __entry->max_freq, __entry->pid, __entry->comm) +); + +TRACE_EVENT(sched_update_task_ravg, + + TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, + u64 wallclock, u64 irqtime, + struct group_cpu_time *cpu_time, struct walt_rq *wrq, + struct walt_task_struct *wts), + + TP_ARGS(p, rq, evt, wallclock, irqtime, cpu_time, wrq, wts), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(pid_t, cur_pid) + __field(unsigned int, cur_freq) + __field(u64, wallclock) + __field(u64, mark_start) + __field(u64, delta_m) + __field(u64, win_start) + __field(u64, delta) + __field(u64, irqtime) + __field(enum task_event, evt) + __field(unsigned int, demand) + __field(unsigned int, coloc_demand) + __field(unsigned int, sum) + __field(int, cpu) + __field(unsigned int, pred_demand) + __field(u64, rq_cs) + __field(u64, rq_ps) + __field(u64, grp_cs) + __field(u64, grp_ps) + __field(u64, grp_nt_cs) + __field(u64, grp_nt_ps) + __field(u32, curr_window) + __field(u32, prev_window) + __dynamic_array(u32, curr_sum, nr_cpu_ids) + __dynamic_array(u32, prev_sum, nr_cpu_ids) + __field(u64, nt_cs) + __field(u64, nt_ps) + __field(u64, active_time) + __field(u32, curr_top) + __field(u32, prev_top) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = wrq->window_start; + __entry->delta = (wallclock - wrq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + __entry->cur_pid = rq->curr->pid; + __entry->cur_freq = wrq->task_exec_scale; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = wts->mark_start; + __entry->delta_m = (wallclock - wts->mark_start); + __entry->demand = wts->demand; + __entry->coloc_demand = wts->coloc_demand; + __entry->sum = wts->sum; + __entry->irqtime = irqtime; + __entry->pred_demand = wts->pred_demand; + __entry->rq_cs = wrq->curr_runnable_sum; + __entry->rq_ps = wrq->prev_runnable_sum; + __entry->grp_cs = cpu_time ? cpu_time->curr_runnable_sum : 0; + __entry->grp_ps = cpu_time ? cpu_time->prev_runnable_sum : 0; + __entry->grp_nt_cs = cpu_time ? + cpu_time->nt_curr_runnable_sum : 0; + __entry->grp_nt_ps = cpu_time ? 
+ cpu_time->nt_prev_runnable_sum : 0; + __entry->curr_window = wts->curr_window; + __entry->prev_window = wts->prev_window; + __window_data(__get_dynamic_array(curr_sum), + wts->curr_window_cpu); + __window_data(__get_dynamic_array(prev_sum), + wts->prev_window_cpu); + __entry->nt_cs = wrq->nt_curr_runnable_sum; + __entry->nt_ps = wrq->nt_prev_runnable_sum; + __entry->active_time = wts->active_time; + __entry->curr_top = wrq->curr_top; + __entry->prev_top = wrq->prev_top; + ), + + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d cur_freq %u cur_pid %d task %d (%s) ms %llu delta %llu demand %u coloc_demand: %u sum %u irqtime %llu pred_demand %u rq_cs %llu rq_ps %llu cur_window %u (%s) prev_window %u (%s) nt_cs %llu nt_ps %llu active_time %u grp_cs %lld grp_ps %lld, grp_nt_cs %llu, grp_nt_ps: %llu curr_top %u prev_top %u", + __entry->wallclock, __entry->win_start, __entry->delta, + task_event_names[__entry->evt], __entry->cpu, + __entry->cur_freq, __entry->cur_pid, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, __entry->coloc_demand, + __entry->sum, __entry->irqtime, __entry->pred_demand, + __entry->rq_cs, __entry->rq_ps, __entry->curr_window, + __window_print(p, __get_dynamic_array(curr_sum), nr_cpu_ids), + __entry->prev_window, + __window_print(p, __get_dynamic_array(prev_sum), nr_cpu_ids), + __entry->nt_cs, __entry->nt_ps, + __entry->active_time, __entry->grp_cs, + __entry->grp_ps, __entry->grp_nt_cs, __entry->grp_nt_ps, + __entry->curr_top, __entry->prev_top) +); + +TRACE_EVENT(sched_update_task_ravg_mini, + + TP_PROTO(struct task_struct *p, struct rq *rq, enum task_event evt, + u64 wallclock, u64 irqtime, + struct group_cpu_time *cpu_time, struct walt_rq *wrq, + struct walt_task_struct *wts), + + TP_ARGS(p, rq, evt, wallclock, irqtime, cpu_time, wrq, wts), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(u64, wallclock) + __field(u64, mark_start) + __field(u64, delta_m) + __field(u64, win_start) + __field(u64, delta) + __field(enum task_event, evt) + __field(unsigned int, demand) + __field(int, cpu) + __field(u64, rq_cs) + __field(u64, rq_ps) + __field(u64, grp_cs) + __field(u64, grp_ps) + __field(u32, curr_window) + __field(u32, prev_window) + ), + + TP_fast_assign( + __entry->wallclock = wallclock; + __entry->win_start = wrq->window_start; + __entry->delta = (wallclock - wrq->window_start); + __entry->evt = evt; + __entry->cpu = rq->cpu; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->mark_start = wts->mark_start; + __entry->delta_m = (wallclock - wts->mark_start); + __entry->demand = wts->demand; + __entry->rq_cs = wrq->curr_runnable_sum; + __entry->rq_ps = wrq->prev_runnable_sum; + __entry->grp_cs = cpu_time ? cpu_time->curr_runnable_sum : 0; + __entry->grp_ps = cpu_time ? 
cpu_time->prev_runnable_sum : 0; + __entry->curr_window = wts->curr_window; + __entry->prev_window = wts->prev_window; + ), + + TP_printk("wc %llu ws %llu delta %llu event %s cpu %d task %d (%s) ms %llu delta %llu demand %u rq_cs %llu rq_ps %llu cur_window %u prev_window %u grp_cs %lld grp_ps %lld", + __entry->wallclock, __entry->win_start, __entry->delta, + task_event_names[__entry->evt], __entry->cpu, + __entry->pid, __entry->comm, __entry->mark_start, + __entry->delta_m, __entry->demand, + __entry->rq_cs, __entry->rq_ps, __entry->curr_window, + __entry->prev_window, __entry->grp_cs, __entry->grp_ps) +); + +struct migration_sum_data; +extern const char *migrate_type_names[]; + +TRACE_EVENT(sched_set_preferred_cluster, + + TP_PROTO(struct walt_related_thread_group *grp, u64 total_demand), + + TP_ARGS(grp, total_demand), + + TP_STRUCT__entry( + __field(int, id) + __field(u64, total_demand) + __field(bool, skip_min) + ), + + TP_fast_assign( + __entry->id = grp->id; + __entry->total_demand = total_demand; + __entry->skip_min = grp->skip_min; + ), + + TP_printk("group_id %d total_demand %llu skip_min %d", + __entry->id, __entry->total_demand, + __entry->skip_min) +); + +TRACE_EVENT(sched_migration_update_sum, + + TP_PROTO(struct task_struct *p, enum migrate_types migrate_type, + struct rq *rq), + + TP_ARGS(p, migrate_type, rq), + + TP_STRUCT__entry( + __field(int, tcpu) + __field(int, pid) + __field(enum migrate_types, migrate_type) + __field(s64, src_cs) + __field(s64, src_ps) + __field(s64, dst_cs) + __field(s64, dst_ps) + __field(s64, src_nt_cs) + __field(s64, src_nt_ps) + __field(s64, dst_nt_cs) + __field(s64, dst_nt_ps) + ), + + TP_fast_assign( + __entry->tcpu = task_cpu(p); + __entry->pid = p->pid; + __entry->migrate_type = migrate_type; + __entry->src_cs = __get_update_sum(rq, migrate_type, + true, false, true); + __entry->src_ps = __get_update_sum(rq, migrate_type, + true, false, false); + __entry->dst_cs = __get_update_sum(rq, migrate_type, + false, false, true); + __entry->dst_ps = __get_update_sum(rq, migrate_type, + false, false, false); + __entry->src_nt_cs = __get_update_sum(rq, migrate_type, + true, true, true); + __entry->src_nt_ps = __get_update_sum(rq, migrate_type, + true, true, false); + __entry->dst_nt_cs = __get_update_sum(rq, migrate_type, + false, true, true); + __entry->dst_nt_ps = __get_update_sum(rq, migrate_type, + false, true, false); + ), + + TP_printk("pid %d task_cpu %d migrate_type %s src_cs %llu src_ps %llu dst_cs %lld dst_ps %lld src_nt_cs %llu src_nt_ps %llu dst_nt_cs %lld dst_nt_ps %lld", + __entry->pid, __entry->tcpu, + migrate_type_names[__entry->migrate_type], + __entry->src_cs, __entry->src_ps, __entry->dst_cs, + __entry->dst_ps, __entry->src_nt_cs, __entry->src_nt_ps, + __entry->dst_nt_cs, __entry->dst_nt_ps) +); + +TRACE_EVENT(sched_set_boost, + + TP_PROTO(int type), + + TP_ARGS(type), + + TP_STRUCT__entry( + __field(int, type) + ), + + TP_fast_assign( + __entry->type = type; + ), + + TP_printk("type %d", __entry->type) +); + +TRACE_EVENT(sched_load_to_gov, + + TP_PROTO(struct rq *rq, u64 aggr_grp_load, u32 tt_load, + int freq_aggr, u64 load, int policy, + int big_task_rotation, + unsigned int user_hint, + struct walt_rq *wrq), + TP_ARGS(rq, aggr_grp_load, tt_load, freq_aggr, load, policy, + big_task_rotation, user_hint, wrq), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, policy) + __field(int, ed_task_pid) + __field(u64, aggr_grp_load) + __field(int, freq_aggr) + __field(u64, tt_load) + __field(u64, rq_ps) + __field(u64, grp_rq_ps) + 
__field(u64, nt_ps) + __field(u64, grp_nt_ps) + __field(u64, pl) + __field(u64, load) + __field(int, big_task_rotation) + __field(unsigned int, user_hint) + ), + + TP_fast_assign( + __entry->cpu = cpu_of(rq); + __entry->policy = policy; + __entry->ed_task_pid = + wrq->ed_task ? wrq->ed_task->pid : -1; + __entry->aggr_grp_load = aggr_grp_load; + __entry->freq_aggr = freq_aggr; + __entry->tt_load = tt_load; + __entry->rq_ps = wrq->prev_runnable_sum; + __entry->grp_rq_ps = wrq->grp_time.prev_runnable_sum; + __entry->nt_ps = wrq->nt_prev_runnable_sum; + __entry->grp_nt_ps = wrq->grp_time.nt_prev_runnable_sum; + __entry->pl = wrq->walt_stats.pred_demands_sum_scaled; + __entry->load = load; + __entry->big_task_rotation = big_task_rotation; + __entry->user_hint = user_hint; + ), + + TP_printk("cpu=%d policy=%d ed_task_pid=%d aggr_grp_load=%llu freq_aggr=%d tt_load=%llu rq_ps=%llu grp_rq_ps=%llu nt_ps=%llu grp_nt_ps=%llu pl=%llu load=%llu big_task_rotation=%d user_hint=%u", + __entry->cpu, __entry->policy, __entry->ed_task_pid, + __entry->aggr_grp_load, __entry->freq_aggr, + __entry->tt_load, __entry->rq_ps, __entry->grp_rq_ps, + __entry->nt_ps, __entry->grp_nt_ps, __entry->pl, __entry->load, + __entry->big_task_rotation, __entry->user_hint) +); + +TRACE_EVENT(core_ctl_eval_need, + + TP_PROTO(unsigned int cpu, unsigned int old_need, + unsigned int new_need, unsigned int updated), + TP_ARGS(cpu, old_need, new_need, updated), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, old_need) + __field(u32, new_need) + __field(u32, updated) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->old_need = old_need; + __entry->new_need = new_need; + __entry->updated = updated; + ), + TP_printk("cpu=%u, old_need=%u, new_need=%u, updated=%u", __entry->cpu, + __entry->old_need, __entry->new_need, __entry->updated) +); + +TRACE_EVENT(core_ctl_set_busy, + + TP_PROTO(unsigned int cpu, unsigned int busy, + unsigned int old_is_busy, unsigned int is_busy), + TP_ARGS(cpu, busy, old_is_busy, is_busy), + TP_STRUCT__entry( + __field(u32, cpu) + __field(u32, busy) + __field(u32, old_is_busy) + __field(u32, is_busy) + __field(bool, high_irqload) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->busy = busy; + __entry->old_is_busy = old_is_busy; + __entry->is_busy = is_busy; + __entry->high_irqload = sched_cpu_high_irqload(cpu); + ), + TP_printk("cpu=%u, busy=%u, old_is_busy=%u, new_is_busy=%u high_irqload=%d", + __entry->cpu, __entry->busy, __entry->old_is_busy, + __entry->is_busy, __entry->high_irqload) +); + +TRACE_EVENT(core_ctl_set_boost, + + TP_PROTO(u32 refcount, s32 ret), + TP_ARGS(refcount, ret), + TP_STRUCT__entry( + __field(u32, refcount) + __field(s32, ret) + ), + TP_fast_assign( + __entry->refcount = refcount; + __entry->ret = ret; + ), + TP_printk("refcount=%u, ret=%d", __entry->refcount, __entry->ret) +); + +TRACE_EVENT(core_ctl_update_nr_need, + + TP_PROTO(int cpu, int nr_need, int prev_misfit_need, + int nrrun, int max_nr, int nr_prev_assist), + + TP_ARGS(cpu, nr_need, prev_misfit_need, nrrun, max_nr, nr_prev_assist), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, nr_need) + __field(int, prev_misfit_need) + __field(int, nrrun) + __field(int, max_nr) + __field(int, nr_prev_assist) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr_need = nr_need; + __entry->prev_misfit_need = prev_misfit_need; + __entry->nrrun = nrrun; + __entry->max_nr = max_nr; + __entry->nr_prev_assist = nr_prev_assist; + ), + + TP_printk("cpu=%d nr_need=%d prev_misfit_need=%d nrrun=%d max_nr=%d 
nr_prev_assist=%d", + __entry->cpu, __entry->nr_need, __entry->prev_misfit_need, + __entry->nrrun, __entry->max_nr, __entry->nr_prev_assist) +); + +TRACE_EVENT(core_ctl_notif_data, + + TP_PROTO(u32 nr_big, u32 ta_load, u32 *ta_util, u32 *cur_cap), + + TP_ARGS(nr_big, ta_load, ta_util, cur_cap), + + TP_STRUCT__entry( + __field(u32, nr_big) + __field(u32, ta_load) + __array(u32, ta_util, MAX_CLUSTERS) + __array(u32, cur_cap, MAX_CLUSTERS) + ), + + TP_fast_assign( + __entry->nr_big = nr_big; + __entry->ta_load = ta_load; + memcpy(__entry->ta_util, ta_util, MAX_CLUSTERS * sizeof(u32)); + memcpy(__entry->cur_cap, cur_cap, MAX_CLUSTERS * sizeof(u32)); + ), + + TP_printk("nr_big=%u ta_load=%u ta_util=(%u %u %u) cur_cap=(%u %u %u)", + __entry->nr_big, __entry->ta_load, + __entry->ta_util[0], __entry->ta_util[1], + __entry->ta_util[2], __entry->cur_cap[0], + __entry->cur_cap[1], __entry->cur_cap[2]) +); + +/* + * Tracepoint for sched_get_nr_running_avg + */ +TRACE_EVENT(sched_get_nr_running_avg, + + TP_PROTO(int cpu, int nr, int nr_misfit, int nr_max, int nr_scaled), + + TP_ARGS(cpu, nr, nr_misfit, nr_max, nr_scaled), + + TP_STRUCT__entry( + __field(int, cpu) + __field(int, nr) + __field(int, nr_misfit) + __field(int, nr_max) + __field(int, nr_scaled) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr = nr; + __entry->nr_misfit = nr_misfit; + __entry->nr_max = nr_max; + __entry->nr_scaled = nr_scaled; + ), + + TP_printk("cpu=%d nr=%d nr_misfit=%d nr_max=%d nr_scaled=%d", + __entry->cpu, __entry->nr, __entry->nr_misfit, __entry->nr_max, + __entry->nr_scaled) +); + +/* + * sched_pause - called when cores are paused/unpaused + * + * @start: 1 if start of pause/resume op, 0 otherwise + * @requested_cpus: mask of cpus requested in this op + * @active_cpus: mask of currently active cpus + * @start_time: time of the start of the operation + * @pause: 1 if pausing, 0 if resuming + */ +TRACE_EVENT(sched_pause, + + TP_PROTO(unsigned int start, unsigned int requested_cpus, unsigned int active_cpus, + u64 start_time, unsigned char pause), + + TP_ARGS(start, requested_cpus, active_cpus, start_time, pause), + + TP_STRUCT__entry( + __field(u32, start) + __field(u32, requested_cpus) + __field(u32, active_cpus) + __field(u32, time) + __field(unsigned char, pause) + ), + + TP_fast_assign( + __entry->start = start; + __entry->requested_cpus = requested_cpus; + __entry->active_cpus = active_cpus; + __entry->time = div64_u64(sched_clock() - start_time, 1000); + __entry->pause = pause; + ), + + TP_printk("start=%d req cpus=0x%x act cpus=0x%x time=%u us paused=%d", + __entry->start, __entry->requested_cpus, __entry->active_cpus, + __entry->time, __entry->pause) +); + +TRACE_EVENT(sched_ravg_window_change, + + TP_PROTO(unsigned int sched_ravg_window, unsigned int new_sched_ravg_window + , u64 change_time), + + TP_ARGS(sched_ravg_window, new_sched_ravg_window, change_time), + + TP_STRUCT__entry( + __field(unsigned int, sched_ravg_window) + __field(unsigned int, new_sched_ravg_window) + __field(u64, change_time) + ), + + TP_fast_assign( + __entry->sched_ravg_window = sched_ravg_window; + __entry->new_sched_ravg_window = new_sched_ravg_window; + __entry->change_time = change_time; + ), + + TP_printk("from=%u to=%u at=%lu", + __entry->sched_ravg_window, __entry->new_sched_ravg_window, + __entry->change_time) +); + +TRACE_EVENT(waltgov_util_update, + TP_PROTO(int cpu, + unsigned long util, unsigned long avg_cap, + unsigned long max_cap, unsigned long nl, unsigned long pl, + unsigned int rtgb, unsigned int flags), + 
TP_ARGS(cpu, util, avg_cap, max_cap, nl, pl, rtgb, flags), + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, util) + __field(unsigned long, avg_cap) + __field(unsigned long, max_cap) + __field(unsigned long, nl) + __field(unsigned long, pl) + __field(unsigned int, rtgb) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->avg_cap = avg_cap; + __entry->max_cap = max_cap; + __entry->nl = nl; + __entry->pl = pl; + __entry->rtgb = rtgb; + __entry->flags = flags; + ), + TP_printk("cpu=%d util=%lu avg_cap=%lu max_cap=%lu nl=%lu pl=%lu rtgb=%u flags=0x%x", + __entry->cpu, __entry->util, __entry->avg_cap, + __entry->max_cap, __entry->nl, + __entry->pl, __entry->rtgb, __entry->flags) +); + +TRACE_EVENT(waltgov_next_freq, + TP_PROTO(unsigned int cpu, unsigned long util, unsigned long max, + unsigned int freq), + TP_ARGS(cpu, util, max, freq), + TP_STRUCT__entry( + __field(unsigned int, cpu) + __field(unsigned long, util) + __field(unsigned long, max) + __field(unsigned int, freq) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->max = max; + __entry->freq = freq; + ), + TP_printk("cpu=%u util=%lu max=%lu freq=%u", + __entry->cpu, + __entry->util, + __entry->max, + __entry->freq) +); + +TRACE_EVENT(walt_active_load_balance, + + TP_PROTO(struct task_struct *p, int prev_cpu, int new_cpu, struct walt_task_struct *wts), + + TP_ARGS(p, prev_cpu, new_cpu, wts), + + TP_STRUCT__entry( + __field(pid_t, pid) + __field(bool, misfit) + __field(int, prev_cpu) + __field(int, new_cpu) + ), + + TP_fast_assign( + __entry->pid = p->pid; + __entry->misfit = wts->misfit; + __entry->prev_cpu = prev_cpu; + __entry->new_cpu = new_cpu; + ), + + TP_printk("pid=%d misfit=%d prev_cpu=%d new_cpu=%d\n", + __entry->pid, __entry->misfit, __entry->prev_cpu, + __entry->new_cpu) +); + +TRACE_EVENT(walt_find_busiest_queue, + + TP_PROTO(int dst_cpu, int busiest_cpu, unsigned long src_mask), + + TP_ARGS(dst_cpu, busiest_cpu, src_mask), + + TP_STRUCT__entry( + __field(int, dst_cpu) + __field(int, busiest_cpu) + __field(unsigned long, src_mask) + ), + + TP_fast_assign( + __entry->dst_cpu = dst_cpu; + __entry->busiest_cpu = busiest_cpu; + __entry->src_mask = src_mask; + ), + + TP_printk("dst_cpu=%d busiest_cpu=%d src_mask=%lx\n", + __entry->dst_cpu, __entry->busiest_cpu, + __entry->src_mask) +); + +TRACE_EVENT(walt_nohz_balance_kick, + + TP_PROTO(struct rq *rq), + + TP_ARGS(rq), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned int, nr_running) + __field(unsigned int, nr_cfs_running) + ), + + TP_fast_assign( + __entry->cpu = rq->cpu; + __entry->nr_running = rq->nr_running; + __entry->nr_cfs_running = rq->cfs.h_nr_running; + ), + + TP_printk("cpu=%d nr_running=%u nr_cfs_running=%u\n", + __entry->cpu, __entry->nr_running, + __entry->nr_cfs_running) +); + +TRACE_EVENT(walt_newidle_balance, + + TP_PROTO(int this_cpu, int busy_cpu, int pulled), + + TP_ARGS(this_cpu, busy_cpu, pulled), + + TP_STRUCT__entry( + __field(int, this_cpu) + __field(int, busy_cpu) + __field(int, pulled) + __field(unsigned int, this_nr_running) + ), + + TP_fast_assign( + __entry->this_cpu = this_cpu; + __entry->busy_cpu = busy_cpu; + __entry->pulled = pulled; + __entry->this_nr_running = cpu_rq(this_cpu)->nr_running; + ), + + TP_printk("this_cpu=%d busy_cpu=%d pulled=%d this_nr_running=%u\n", + __entry->this_cpu, __entry->busy_cpu, __entry->pulled, + __entry->this_nr_running) +); + +TRACE_EVENT(walt_lb_cpu_util, + + TP_PROTO(int cpu, struct walt_rq *wrq), + + 
TP_ARGS(cpu, wrq), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned int, nr_running) + __field(unsigned int, cfs_nr_running) + __field(unsigned int, nr_big) + __field(unsigned int, nr_rtg_high_prio_tasks) + __field(unsigned int, cpu_util) + __field(unsigned int, capacity_orig) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr_running = cpu_rq(cpu)->nr_running; + __entry->cfs_nr_running = cpu_rq(cpu)->cfs.h_nr_running; + __entry->nr_big = wrq->walt_stats.nr_big_tasks; + __entry->nr_rtg_high_prio_tasks = walt_nr_rtg_high_prio(cpu); + __entry->cpu_util = cpu_util(cpu); + __entry->capacity_orig = capacity_orig_of(cpu); + ), + + TP_printk("cpu=%d nr_running=%u cfs_nr_running=%u nr_big=%u nr_rtg_hp=%u cpu_util=%u capacity_orig=%u", + __entry->cpu, __entry->nr_running, __entry->cfs_nr_running, + __entry->nr_big, __entry->nr_rtg_high_prio_tasks, + __entry->cpu_util, __entry->capacity_orig) +); + +TRACE_EVENT(sched_cpu_util, + + TP_PROTO(int cpu), + + TP_ARGS(cpu), + + TP_STRUCT__entry( + __field(unsigned int, cpu) + __field(unsigned int, nr_running) + __field(long, cpu_util) + __field(long, cpu_util_cum) + __field(unsigned int, capacity_curr) + __field(unsigned int, capacity) + __field(unsigned int, capacity_orig) + __field(unsigned int, idle_exit_latency) + __field(u64, irqload) + __field(int, online) + __field(int, inactive) + __field(int, reserved) + __field(int, high_irq_load) + __field(unsigned int, nr_rtg_high_prio_tasks) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->nr_running = cpu_rq(cpu)->nr_running; + __entry->cpu_util = cpu_util(cpu); + __entry->cpu_util_cum = cpu_util_cum(cpu, 0); + __entry->capacity_curr = capacity_curr_of(cpu); + __entry->capacity = capacity_of(cpu); + __entry->capacity_orig = capacity_orig_of(cpu); + __entry->idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(cpu)); + __entry->irqload = sched_irqload(cpu); + __entry->online = cpu_online(cpu); + __entry->inactive = !cpu_active(cpu); + __entry->reserved = is_reserved(cpu); + __entry->high_irq_load = sched_cpu_high_irqload(cpu); + __entry->nr_rtg_high_prio_tasks = walt_nr_rtg_high_prio(cpu); + ), + + TP_printk("cpu=%d nr_running=%d cpu_util=%ld cpu_util_cum=%ld capacity_curr=%u capacity=%u capacity_orig=%u idle_exit_latency=%u irqload=%llu online=%u, inactive=%u, reserved=%u, high_irq_load=%u nr_rtg_hp=%u", + __entry->cpu, __entry->nr_running, __entry->cpu_util, + __entry->cpu_util_cum, __entry->capacity_curr, + __entry->capacity, __entry->capacity_orig, + __entry->idle_exit_latency, __entry->irqload, __entry->online, + __entry->inactive, __entry->reserved, __entry->high_irq_load, + __entry->nr_rtg_high_prio_tasks) +); + +TRACE_EVENT(sched_compute_energy, + + TP_PROTO(struct task_struct *p, int eval_cpu, + unsigned long eval_energy, + unsigned long prev_energy, + unsigned long best_energy, + unsigned long best_energy_cpu), + + TP_ARGS(p, eval_cpu, eval_energy, prev_energy, best_energy, + best_energy_cpu), + + TP_STRUCT__entry( + __field(int, pid) + __array(char, comm, TASK_COMM_LEN) + __field(unsigned long, util) + __field(int, prev_cpu) + __field(unsigned long, prev_energy) + __field(int, eval_cpu) + __field(unsigned long, eval_energy) + __field(int, best_energy_cpu) + __field(unsigned long, best_energy) + ), + + TP_fast_assign( + __entry->pid = p->pid; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->util = task_util(p); + __entry->prev_cpu = task_cpu(p); + __entry->prev_energy = prev_energy; + __entry->eval_cpu = eval_cpu; + __entry->eval_energy = eval_energy; + 
__entry->best_energy_cpu = best_energy_cpu; + __entry->best_energy = best_energy; + ), + + TP_printk("pid=%d comm=%s util=%lu prev_cpu=%d prev_energy=%lu eval_cpu=%d eval_energy=%lu best_energy_cpu=%d best_energy=%lu", + __entry->pid, __entry->comm, __entry->util, __entry->prev_cpu, + __entry->prev_energy, __entry->eval_cpu, __entry->eval_energy, + __entry->best_energy_cpu, __entry->best_energy) +) + +TRACE_EVENT(sched_task_util, + + TP_PROTO(struct task_struct *p, unsigned long candidates, + int best_energy_cpu, bool sync, int need_idle, int fastpath, + bool placement_boost, u64 start_t, + bool uclamp_boosted, bool is_rtg, bool rtg_skip_min, + int start_cpu), + + TP_ARGS(p, candidates, best_energy_cpu, sync, need_idle, fastpath, + placement_boost, start_t, uclamp_boosted, is_rtg, rtg_skip_min, + start_cpu), + + TP_STRUCT__entry( + __field(int, pid) + __array(char, comm, TASK_COMM_LEN) + __field(unsigned long, util) + __field(unsigned long, candidates) + __field(int, prev_cpu) + __field(int, best_energy_cpu) + __field(bool, sync) + __field(int, need_idle) + __field(int, fastpath) + __field(int, placement_boost) + __field(int, rtg_cpu) + __field(u64, latency) + __field(bool, uclamp_boosted) + __field(bool, is_rtg) + __field(bool, rtg_skip_min) + __field(int, start_cpu) + __field(u32, unfilter) + __field(unsigned long, cpus_allowed) + __field(int, task_boost) + __field(bool, low_latency) + ), + + TP_fast_assign( + __entry->pid = p->pid; + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->util = task_util(p); + __entry->prev_cpu = task_cpu(p); + __entry->candidates = candidates; + __entry->best_energy_cpu = best_energy_cpu; + __entry->sync = sync; + __entry->need_idle = need_idle; + __entry->fastpath = fastpath; + __entry->placement_boost = placement_boost; + __entry->latency = (sched_clock() - start_t); + __entry->uclamp_boosted = uclamp_boosted; + __entry->is_rtg = is_rtg; + __entry->rtg_skip_min = rtg_skip_min; + __entry->start_cpu = start_cpu; + __entry->unfilter = + ((struct walt_task_struct *) p->android_vendor_data1)->unfilter; + __entry->cpus_allowed = cpumask_bits(&p->cpus_mask)[0]; + __entry->task_boost = per_task_boost(p); + __entry->low_latency = walt_low_latency_task(p); + ), + + TP_printk("pid=%d comm=%s util=%lu prev_cpu=%d candidates=%#lx best_energy_cpu=%d sync=%d need_idle=%d fastpath=%d placement_boost=%d latency=%llu stune_boosted=%d is_rtg=%d rtg_skip_min=%d start_cpu=%d unfilter=%u affinity=%lx task_boost=%d low_latency=%d", + __entry->pid, __entry->comm, __entry->util, __entry->prev_cpu, + __entry->candidates, __entry->best_energy_cpu, __entry->sync, + __entry->need_idle, __entry->fastpath, __entry->placement_boost, + __entry->latency, __entry->uclamp_boosted, + __entry->is_rtg, __entry->rtg_skip_min, __entry->start_cpu, + __entry->unfilter, __entry->cpus_allowed, __entry->task_boost, + __entry->low_latency) +); + +/* + * Tracepoint for find_best_target + */ +TRACE_EVENT(sched_find_best_target, + + TP_PROTO(struct task_struct *tsk, + unsigned long min_util, int start_cpu, + int best_idle, int most_spare_cap, int target, + int order_index, int end_index, + int skip, bool running), + + TP_ARGS(tsk, min_util, start_cpu, + best_idle, most_spare_cap, target, + order_index, end_index, skip, running), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(unsigned long, min_util) + __field(int, start_cpu) + __field(int, best_idle) + __field(int, most_spare_cap) + __field(int, target) + __field(int, order_index) + __field(int, 
end_index) + __field(int, skip) + __field(bool, running) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->min_util = min_util; + __entry->start_cpu = start_cpu; + __entry->best_idle = best_idle; + __entry->most_spare_cap = most_spare_cap; + __entry->target = target; + __entry->order_index = order_index; + __entry->end_index = end_index; + __entry->skip = skip; + __entry->running = running; + ), + + TP_printk("pid=%d comm=%s start_cpu=%d best_idle=%d most_spare_cap=%d target=%d order_index=%d end_index=%d skip=%d running=%d", + __entry->pid, __entry->comm, + __entry->start_cpu, + __entry->best_idle, + __entry->most_spare_cap, + __entry->target, + __entry->order_index, + __entry->end_index, + __entry->skip, + __entry->running) +); + +TRACE_EVENT(sched_enq_deq_task, + + TP_PROTO(struct task_struct *p, bool enqueue, + unsigned int cpus_allowed), + + TP_ARGS(p, enqueue, cpus_allowed), + + TP_STRUCT__entry( + __array(char, comm, TASK_COMM_LEN) + __field(pid_t, pid) + __field(int, prio) + __field(int, cpu) + __field(bool, enqueue) + __field(unsigned int, nr_running) + __field(unsigned int, rt_nr_running) + __field(unsigned int, cpus_allowed) + __field(unsigned int, demand) + __field(unsigned int, pred_demand) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->cpu = task_cpu(p); + __entry->enqueue = enqueue; + __entry->nr_running = task_rq(p)->nr_running; + __entry->rt_nr_running = task_rq(p)->rt.rt_nr_running; + __entry->cpus_allowed = cpus_allowed; + __entry->demand = task_load(p); + __entry->pred_demand = task_pl(p); + ), + + TP_printk("cpu=%d %s comm=%s pid=%d prio=%d nr_running=%u rt_nr_running=%u affine=%x demand=%u pred_demand=%u", + __entry->cpu, + __entry->enqueue ? "enqueue" : "dequeue", + __entry->comm, __entry->pid, + __entry->prio, __entry->nr_running, + __entry->rt_nr_running, + __entry->cpus_allowed, __entry->demand, + __entry->pred_demand) +); +#endif /* _TRACE_WALT_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../kernel/sched/walt +#define TRACE_INCLUDE_FILE trace + +#include diff --git a/kernel/sched/walt/walt.c b/kernel/sched/walt/walt.c new file mode 100644 index 000000000000..33822e50ec52 --- /dev/null +++ b/kernel/sched/walt/walt.c @@ -0,0 +1,4136 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2016-2021, The Linux Foundation. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "walt.h" +#include "trace.h" + +const char *task_event_names[] = { + "PUT_PREV_TASK", + "PICK_NEXT_TASK", + "TASK_WAKE", + "TASK_MIGRATE", + "TASK_UPDATE", + "IRQ_UPDATE" +}; + +const char *migrate_type_names[] = { + "GROUP_TO_RQ", + "RQ_TO_GROUP", + "RQ_TO_RQ", + "GROUP_TO_GROUP" +}; + +#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0 +#define SCHED_ACCOUNT_WAIT_TIME 1 + +#define EARLY_DETECTION_DURATION 9500000 +#define MAX_NUM_CGROUP_COLOC_ID 20 + +#define MAX_NR_CLUSTERS 3 + +#define FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK 0 +#define FREQ_REPORT_CPU_LOAD 1 +#define FREQ_REPORT_TOP_TASK 2 + +#define NEW_TASK_ACTIVE_TIME 100000000 + +unsigned int sysctl_sched_user_hint; + +static ktime_t ktime_last; +static bool sched_ktime_suspended; + +static bool use_cycle_counter; +static DEFINE_MUTEX(cluster_lock); +static u64 walt_load_reported_window; + +static struct irq_work walt_cpufreq_irq_work; +struct irq_work walt_migration_irq_work; +unsigned int walt_rotation_enabled; +cpumask_t asym_cap_sibling_cpus = CPU_MASK_NONE; +unsigned int sched_boost_type; +enum sched_boost_policy boost_policy; + +unsigned int __read_mostly sched_ravg_window = 20000000; +unsigned int min_max_possible_capacity = 1024; +unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */ +/* Initial task load. Newly created tasks are assigned this load. */ +unsigned int __read_mostly sched_init_task_load_windows; +/* + * Task load is categorized into buckets for the purpose of top task tracking. + * The entire range of load from 0 to sched_ravg_window needs to be covered + * in NUM_LOAD_INDICES number of buckets. Therefore the size of each bucket + * is given by sched_ravg_window / NUM_LOAD_INDICES. Since the default value + * of sched_ravg_window is DEFAULT_SCHED_RAVG_WINDOW, use that to compute + * sched_load_granule. + */ +unsigned int __read_mostly sched_load_granule; +__read_mostly bool sched_predl = true; + +/* + *@boost:should be 0,1,2. + *@period:boost time based on ms units. 
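+ *
+ * For example, set_task_boost(1, 100) boosts the calling task for 100ms
+ * (the period is converted to nanoseconds below), while set_task_boost(0, 0)
+ * clears any existing boost. A boost value outside the range checked below
+ * is rejected with -EINVAL.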
+ */ +int set_task_boost(int boost, u64 period) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1; + + if (boost < TASK_BOOST_NONE || boost >= TASK_BOOST_END) + return -EINVAL; + if (boost) { + wts->boost = boost; + wts->boost_period = (u64)period * 1000 * 1000; + wts->boost_expires = sched_clock() + wts->boost_period; + } else { + wts->boost = 0; + wts->boost_expires = 0; + wts->boost_period = 0; + } + return 0; +} + +u64 sched_ktime_clock(void) +{ + if (unlikely(sched_ktime_suspended)) + return ktime_to_ns(ktime_last); + return ktime_get_ns(); +} + +static void sched_resume(void) +{ + sched_ktime_suspended = false; +} + +static int sched_suspend(void) +{ + ktime_last = ktime_get(); + sched_ktime_suspended = true; + return 0; +} + +static struct syscore_ops sched_syscore_ops = { + .resume = sched_resume, + .suspend = sched_suspend +}; + +int sched_init_ops(void) +{ + register_syscore_ops(&sched_syscore_ops); + return 0; +} + +void acquire_rq_locks_irqsave(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + int level = 0; + + local_irq_save(*flags); + + for_each_cpu(cpu, cpus) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->lock, level); + level++; + } +} + +void release_rq_locks_irqrestore(const cpumask_t *cpus, + unsigned long *flags) +{ + int cpu; + + for_each_cpu(cpu, cpus) + raw_spin_unlock(&cpu_rq(cpu)->lock); + local_irq_restore(*flags); +} + +static unsigned int walt_cpu_high_irqload; + +__read_mostly unsigned int sched_ravg_hist_size = 5; + +static __read_mostly unsigned int sched_io_is_busy = 1; + +/* Window size (in ns) */ +__read_mostly unsigned int new_sched_ravg_window = DEFAULT_SCHED_RAVG_WINDOW; + +static DEFINE_SPINLOCK(sched_ravg_window_lock); +u64 sched_ravg_window_change_time; + +/* + * A after-boot constant divisor for cpu_util_freq_walt() to apply the load + * boost. + */ +static __read_mostly unsigned int walt_cpu_util_freq_divisor; + +unsigned int __read_mostly sched_init_task_load_windows_scaled; +unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15; + +/* Size of bitmaps maintained to track top tasks */ +static const unsigned int top_tasks_bitmap_size = + BITS_TO_LONGS(NUM_LOAD_INDICES + 1) * sizeof(unsigned long); + +/* + * This governs what load needs to be used when reporting CPU busy time + * to the cpufreq governor. 
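+ *
+ * 0 (FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK): max of CPU load and top-task load.
+ * 1 (FREQ_REPORT_CPU_LOAD): CPU load alone.
+ * 2 (FREQ_REPORT_TOP_TASK): top-task load alone.
+ * See the switch statement in freq_policy_load().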
+ */ +__read_mostly unsigned int sysctl_sched_freq_reporting_policy; + +__read_mostly unsigned int walt_scale_demand_divisor; +#define scale_demand(d) ((d)/walt_scale_demand_divisor) + +#define SCHED_PRINT(arg) pr_emerg("%s=%llu", #arg, arg) +#define STRG(arg) #arg + +static inline void walt_task_dump(struct task_struct *p) +{ + char buff[WALT_NR_CPUS * 16]; + int i, j = 0; + int buffsz = WALT_NR_CPUS * 16; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + SCHED_PRINT(p->pid); + SCHED_PRINT(wts->mark_start); + SCHED_PRINT(wts->demand); + SCHED_PRINT(wts->coloc_demand); + SCHED_PRINT(sched_ravg_window); + SCHED_PRINT(new_sched_ravg_window); + + for (i = 0 ; i < nr_cpu_ids; i++) + j += scnprintf(buff + j, buffsz - j, "%u ", + wts->curr_window_cpu[i]); + printk_deferred("%s=%d (%s)\n", STRG(wts->curr_window), + wts->curr_window, buff); + + for (i = 0, j = 0 ; i < nr_cpu_ids; i++) + j += scnprintf(buff + j, buffsz - j, "%u ", + wts->prev_window_cpu[i]); + printk_deferred("%s=%d (%s)\n", STRG(wts->prev_window), + wts->prev_window, buff); + + SCHED_PRINT(wts->last_wake_ts); + SCHED_PRINT(wts->last_enqueued_ts); + SCHED_PRINT(wts->misfit); + SCHED_PRINT(wts->unfilter); +} + +static inline void walt_rq_dump(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *tsk = cpu_curr(cpu); + int i; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + /* + * Increment the task reference so that it can't be + * freed on a remote CPU. Since we are going to + * enter panic, there is no need to decrement the + * task reference. Decrementing the task reference + * can't be done in atomic context, especially with + * rq locks held. + */ + get_task_struct(tsk); + pr_emerg("CPU:%d nr_running:%u current: %d (%s)\n", + cpu, rq->nr_running, tsk->pid, tsk->comm); + + printk_deferred("=========================================="); + SCHED_PRINT(wrq->window_start); + SCHED_PRINT(wrq->prev_window_size); + SCHED_PRINT(wrq->curr_runnable_sum); + SCHED_PRINT(wrq->prev_runnable_sum); + SCHED_PRINT(wrq->nt_curr_runnable_sum); + SCHED_PRINT(wrq->nt_prev_runnable_sum); + SCHED_PRINT(wrq->cum_window_demand_scaled); + SCHED_PRINT(wrq->task_exec_scale); + SCHED_PRINT(wrq->grp_time.curr_runnable_sum); + SCHED_PRINT(wrq->grp_time.prev_runnable_sum); + SCHED_PRINT(wrq->grp_time.nt_curr_runnable_sum); + SCHED_PRINT(wrq->grp_time.nt_prev_runnable_sum); + for (i = 0 ; i < NUM_TRACKED_WINDOWS; i++) { + printk_deferred("wrq->load_subs[%d].window_start=%llu)\n", i, + wrq->load_subs[i].window_start); + printk_deferred("wrq->load_subs[%d].subs=%llu)\n", i, + wrq->load_subs[i].subs); + printk_deferred("wrq->load_subs[%d].new_subs=%llu)\n", i, + wrq->load_subs[i].new_subs); + } + walt_task_dump(tsk); + SCHED_PRINT(sched_capacity_margin_up[cpu]); + SCHED_PRINT(sched_capacity_margin_down[cpu]); +} + +static inline void walt_dump(void) +{ + int cpu; + + pr_emerg("============ WALT RQ DUMP START ==============\n"); + pr_emerg("Sched ktime_get: %llu\n", sched_ktime_clock()); + pr_emerg("Time last window changed=%lu\n", + sched_ravg_window_change_time); + for_each_online_cpu(cpu) + walt_rq_dump(cpu); + SCHED_PRINT(max_possible_capacity); + SCHED_PRINT(min_max_possible_capacity); + pr_emerg("============ WALT RQ DUMP END ==============\n"); +} + +static int in_sched_bug; +#define SCHED_BUG_ON(condition) \ +({ \ + if (unlikely(!!(condition)) && !in_sched_bug) { \ + in_sched_bug = 1; \ + walt_dump(); \ + BUG_ON(condition); \ + } \ +}) + +static inline void 
+fixup_cumulative_runnable_avg(struct walt_sched_stats *stats, + s64 demand_scaled_delta, + s64 pred_demand_scaled_delta) +{ + stats->cumulative_runnable_avg_scaled += demand_scaled_delta; + BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0); + + stats->pred_demands_sum_scaled += pred_demand_scaled_delta; + BUG_ON((s64)stats->pred_demands_sum_scaled < 0); +} + +static void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled, + u16 updated_pred_demand_scaled) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + s64 task_load_delta = (s64)updated_demand_scaled - + wts->demand_scaled; + s64 pred_demand_delta = (s64)updated_pred_demand_scaled - + wts->pred_demand_scaled; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + fixup_cumulative_runnable_avg(&wrq->walt_stats, task_load_delta, + pred_demand_delta); +} + +/* + * Demand aggregation for frequency purpose: + * + * CPU demand of tasks from various related groups is aggregated per-cluster and + * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined + * by just wrq->prev_runnable_sum. + * + * Some examples follow, which assume: + * Cluster0 = CPU0-3, Cluster1 = CPU4-7 + * One related thread group A that has tasks A0, A1, A2 + * + * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of + * tasks belonging to group A are accumulated when they run on cpu X. + * + * CX->curr/prev_sum = counters in which cpu execution stats of all tasks + * not belonging to group A are accumulated when they run on cpu X + * + * Lets say the stats for window M was as below: + * + * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms + * Task A0 ran 5ms on CPU0 + * Task B0 ran 1ms on CPU0 + * + * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms + * Task A1 ran 4ms on CPU1 + * Task A2 ran 2ms on CPU1 + * Task B1 ran 5ms on CPU1 + * + * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0 + * CPU2 idle + * + * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0 + * CPU3 idle + * + * In this case, CPU1 was most busy going by just its prev_sum counter. Demand + * from all group A tasks are added to CPU1. IOW, at end of window M, cpu busy + * time reported to governor will be: + * + * + * C0 busy time = 1ms + * C1 busy time = 5 + 5 + 6 = 16ms + * + */ +__read_mostly bool sched_freq_aggr_en; + +static u64 +update_window_start(struct rq *rq, u64 wallclock, int event) +{ + s64 delta; + int nr_windows; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u64 old_window_start = wrq->window_start; + + delta = wallclock - wrq->window_start; + if (delta < 0) { + pr_emerg("WALT-BUG CPU%d; wallclock=%llu is lesser than window_start=%llu", + rq->cpu, wallclock, wrq->window_start); + SCHED_BUG_ON(1); + } + if (delta < sched_ravg_window) + return old_window_start; + + nr_windows = div64_u64(delta, sched_ravg_window); + wrq->window_start += (u64)nr_windows * (u64)sched_ravg_window; + + wrq->cum_window_demand_scaled = + wrq->walt_stats.cumulative_runnable_avg_scaled; + wrq->prev_window_size = sched_ravg_window; + + return old_window_start; +} + +/* + * Assumes rq_lock is held and wallclock was recorded in the same critical + * section as this function's invocation. 
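+ * The hardware counter is queried at most once per wallclock value; repeated
+ * calls with the same timestamp return the cached wrq->cycles.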
+ */ +static inline u64 read_cycle_counter(int cpu, u64 wallclock) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + if (wrq->last_cc_update != wallclock) { + wrq->cycles = qcom_cpufreq_get_cpu_cycle_counter(cpu); + wrq->last_cc_update = wallclock; + } + + return wrq->cycles; +} + +static void update_task_cpu_cycles(struct task_struct *p, int cpu, + u64 wallclock) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (use_cycle_counter) + wts->cpu_cycles = read_cycle_counter(cpu, wallclock); +} + +static inline bool is_ed_enabled(void) +{ + return (walt_rotation_enabled || (sched_boost_policy() != + SCHED_BOOST_NONE)); +} + +static inline bool is_ed_task(struct task_struct *p, u64 wallclock) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return (wallclock - wts->last_wake_ts >= EARLY_DETECTION_DURATION); +} + +static bool is_ed_task_present(struct rq *rq, u64 wallclock) +{ + struct task_struct *p; + int loop_max = 10; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + wrq->ed_task = NULL; + + if (!is_ed_enabled() || !rq->cfs.h_nr_running) + return false; + + list_for_each_entry(p, &rq->cfs_tasks, se.group_node) { + if (!loop_max) + break; + + if (is_ed_task(p, wallclock)) { + wrq->ed_task = p; + return true; + } + + loop_max--; + } + + return false; +} + +static void walt_sched_account_irqstart(int cpu, struct task_struct *curr) +{ + struct rq *rq = cpu_rq(cpu); + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + if (!wrq->window_start) + return; + + /* We're here without rq->lock held, IRQ disabled */ + raw_spin_lock(&rq->lock); + update_task_cpu_cycles(curr, cpu, sched_ktime_clock()); + raw_spin_unlock(&rq->lock); +} + +static void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); +static void walt_sched_account_irqend(int cpu, struct task_struct *curr, u64 delta) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), delta); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +/* + * Return total number of tasks "eligible" to run on higher capacity cpus + */ +unsigned int walt_big_tasks(int cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + return wrq->walt_stats.nr_big_tasks; +} + +void clear_walt_request(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + clear_reserved(cpu); + if (wrq->push_task) { + struct task_struct *push_task = NULL; + + raw_spin_lock_irqsave(&rq->lock, flags); + if (wrq->push_task) { + clear_reserved(rq->push_cpu); + push_task = wrq->push_task; + wrq->push_task = NULL; + } + rq->active_balance = 0; + raw_spin_unlock_irqrestore(&rq->lock, flags); + if (push_task) + put_task_struct(push_task); + } +} + +/* + * Special case the last index and provide a fast path for index = 0. + * Note that sched_load_granule can change underneath us if we are not + * holding any runqueue locks while calling the two functions below. 
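+ *
+ * Index-to-load mapping: index 0 reports either 0 or one sched_load_granule
+ * depending on the MSB of the previous window's top-tasks bitmap, the last
+ * index reports a full sched_ravg_window, and any other index reports
+ * (index + 1) * sched_load_granule.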
+ */ +static u32 top_task_load(struct rq *rq) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + int index = wrq->prev_top; + u8 prev = 1 - wrq->curr_table; + + if (!index) { + int msb = NUM_LOAD_INDICES - 1; + + if (!test_bit(msb, wrq->top_tasks_bitmap[prev])) + return 0; + else + return sched_load_granule; + } else if (index == NUM_LOAD_INDICES - 1) { + return sched_ravg_window; + } else { + return (index + 1) * sched_load_granule; + } +} + +unsigned long sched_user_hint_reset_time; +static bool is_cluster_hosting_top_app(struct walt_sched_cluster *cluster); + +static inline bool +should_apply_suh_freq_boost(struct walt_sched_cluster *cluster) +{ + if (sched_freq_aggr_en || !sysctl_sched_user_hint || + !cluster->aggr_grp_load) + return false; + + return is_cluster_hosting_top_app(cluster); +} + +static inline u64 freq_policy_load(struct rq *rq) +{ + unsigned int reporting_policy = sysctl_sched_freq_reporting_policy; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_sched_cluster *cluster = wrq->cluster; + u64 aggr_grp_load = cluster->aggr_grp_load; + u64 load, tt_load = 0; + struct task_struct *cpu_ksoftirqd = per_cpu(ksoftirqd, cpu_of(rq)); + + if (wrq->ed_task != NULL) { + load = sched_ravg_window; + goto done; + } + + if (sched_freq_aggr_en) + load = wrq->prev_runnable_sum + aggr_grp_load; + else + load = wrq->prev_runnable_sum + + wrq->grp_time.prev_runnable_sum; + + if (cpu_ksoftirqd && cpu_ksoftirqd->state == TASK_RUNNING) + load = max_t(u64, load, task_load(cpu_ksoftirqd)); + + tt_load = top_task_load(rq); + switch (reporting_policy) { + case FREQ_REPORT_MAX_CPU_LOAD_TOP_TASK: + load = max_t(u64, load, tt_load); + break; + case FREQ_REPORT_TOP_TASK: + load = tt_load; + break; + case FREQ_REPORT_CPU_LOAD: + break; + default: + break; + } + + if (should_apply_suh_freq_boost(cluster)) { + if (is_suh_max()) + load = sched_ravg_window; + else + load = div64_u64(load * sysctl_sched_user_hint, + (u64)100); + } + +done: + trace_sched_load_to_gov(rq, aggr_grp_load, tt_load, sched_freq_aggr_en, + load, reporting_policy, walt_rotation_enabled, + sysctl_sched_user_hint, wrq); + return load; +} + +static bool rtgb_active; + +static inline unsigned long +__cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load) +{ + u64 util, util_unboosted; + struct rq *rq = cpu_rq(cpu); + unsigned long capacity = capacity_orig_of(cpu); + int boost; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + boost = sysctl_sched_load_boost[cpu]; + util_unboosted = util = freq_policy_load(rq); + util = div64_u64(util * (100 + boost), + walt_cpu_util_freq_divisor); + + if (walt_load) { + u64 nl = wrq->nt_prev_runnable_sum + + wrq->grp_time.nt_prev_runnable_sum; + u64 pl = wrq->walt_stats.pred_demands_sum_scaled; + + /* do_pl_notif() needs unboosted signals */ + wrq->old_busy_time = div64_u64(util_unboosted, + sched_ravg_window >> + SCHED_CAPACITY_SHIFT); + wrq->old_estimated_time = pl; + + nl = div64_u64(nl * (100 + boost), walt_cpu_util_freq_divisor); + + walt_load->nl = nl; + walt_load->pl = pl; + walt_load->ws = walt_load_reported_window; + walt_load->rtgb_active = rtgb_active; + } + + return (util >= capacity) ? 
capacity : util; +} + +#define ADJUSTED_ASYM_CAP_CPU_UTIL(orig, other, x) \ + (max(orig, mult_frac(other, x, 100))) + +unsigned long +cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load) +{ + struct walt_cpu_load wl_other = {0}; + unsigned long util = 0, util_other = 0; + unsigned long capacity = capacity_orig_of(cpu); + int i, mpct = sysctl_sched_asym_cap_sibling_freq_match_pct; + + if (!cpumask_test_cpu(cpu, &asym_cap_sibling_cpus)) + return __cpu_util_freq_walt(cpu, walt_load); + + for_each_cpu(i, &asym_cap_sibling_cpus) { + if (i == cpu) + util = __cpu_util_freq_walt(cpu, walt_load); + else + util_other = __cpu_util_freq_walt(i, &wl_other); + } + + if (cpu == cpumask_last(&asym_cap_sibling_cpus)) + mpct = 100; + + util = ADJUSTED_ASYM_CAP_CPU_UTIL(util, util_other, mpct); + + walt_load->nl = ADJUSTED_ASYM_CAP_CPU_UTIL(walt_load->nl, wl_other.nl, + mpct); + walt_load->pl = ADJUSTED_ASYM_CAP_CPU_UTIL(walt_load->pl, wl_other.pl, + mpct); + + return (util >= capacity) ? capacity : util; +} + +/* + * In this function we match the accumulated subtractions with the current + * and previous windows we are operating with. Ignore any entries where + * the window start in the load_subtraction struct does not match either + * the curent or the previous window. This could happen whenever CPUs + * become idle or busy with interrupts disabled for an extended period. + */ +static inline void account_load_subtractions(struct rq *rq) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u64 ws = wrq->window_start; + u64 prev_ws = ws - wrq->prev_window_size; + struct load_subtractions *ls = wrq->load_subs; + int i; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + if (ls[i].window_start == ws) { + wrq->curr_runnable_sum -= ls[i].subs; + wrq->nt_curr_runnable_sum -= ls[i].new_subs; + } else if (ls[i].window_start == prev_ws) { + wrq->prev_runnable_sum -= ls[i].subs; + wrq->nt_prev_runnable_sum -= ls[i].new_subs; + } + + ls[i].subs = 0; + ls[i].new_subs = 0; + } + + SCHED_BUG_ON((s64)wrq->prev_runnable_sum < 0); + SCHED_BUG_ON((s64)wrq->curr_runnable_sum < 0); + SCHED_BUG_ON((s64)wrq->nt_prev_runnable_sum < 0); + SCHED_BUG_ON((s64)wrq->nt_curr_runnable_sum < 0); +} + +static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + wrq->load_subs[index].window_start = ws; + wrq->load_subs[index].subs = 0; + wrq->load_subs[index].new_subs = 0; +} + +static int get_top_index(unsigned long *bitmap, unsigned long old_top) +{ + int index = find_next_bit(bitmap, NUM_LOAD_INDICES, old_top); + + if (index == NUM_LOAD_INDICES) + return 0; + + return NUM_LOAD_INDICES - 1 - index; +} + +static bool get_subtraction_index(struct rq *rq, u64 ws) +{ + int i; + u64 oldest = ULLONG_MAX; + int oldest_index = 0; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + for (i = 0; i < NUM_TRACKED_WINDOWS; i++) { + u64 entry_ws = wrq->load_subs[i].window_start; + + if (ws == entry_ws) + return i; + + if (entry_ws < oldest) { + oldest = entry_ws; + oldest_index = i; + } + } + + create_subtraction_entry(rq, ws, oldest_index); + return oldest_index; +} + +static void update_rq_load_subtractions(int index, struct rq *rq, + u32 sub_load, bool new_task) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + wrq->load_subs[index].subs += sub_load; + if (new_task) + wrq->load_subs[index].new_subs += sub_load; +} + +static inline struct walt_sched_cluster *cpu_cluster(int cpu) +{ + 
struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + return wrq->cluster; +} + +static void update_cluster_load_subtractions(struct task_struct *p, + int cpu, u64 ws, bool new_task) +{ + struct walt_sched_cluster *cluster = cpu_cluster(cpu); + struct cpumask cluster_cpus = cluster->cpus; + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + u64 prev_ws = ws - wrq->prev_window_size; + int i; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + cpumask_clear_cpu(cpu, &cluster_cpus); + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(i, &cluster_cpus) { + struct rq *rq = cpu_rq(i); + int index; + + if (wts->curr_window_cpu[i]) { + index = get_subtraction_index(rq, ws); + update_rq_load_subtractions(index, rq, + wts->curr_window_cpu[i], new_task); + wts->curr_window_cpu[i] = 0; + } + + if (wts->prev_window_cpu[i]) { + index = get_subtraction_index(rq, prev_ws); + update_rq_load_subtractions(index, rq, + wts->prev_window_cpu[i], new_task); + wts->prev_window_cpu[i] = 0; + } + } + + raw_spin_unlock(&cluster->load_lock); +} + +static inline void inter_cluster_migration_fixup + (struct task_struct *p, int new_cpu, int task_cpu, bool new_task) +{ + struct rq *dest_rq = cpu_rq(new_cpu); + struct rq *src_rq = cpu_rq(task_cpu); + struct walt_rq *dest_wrq = (struct walt_rq *) dest_rq->android_vendor_data1; + struct walt_rq *src_wrq = (struct walt_rq *) src_rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (same_freq_domain(new_cpu, task_cpu)) + return; + + wts->curr_window_cpu[new_cpu] = wts->curr_window; + wts->prev_window_cpu[new_cpu] = wts->prev_window; + + dest_wrq->curr_runnable_sum += wts->curr_window; + dest_wrq->prev_runnable_sum += wts->prev_window; + + if (src_wrq->curr_runnable_sum < wts->curr_window_cpu[task_cpu]) { + printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_crs=%llu is lesser than task_contrib=%llu", + p->pid, src_rq->cpu, dest_rq->cpu, + src_wrq->curr_runnable_sum, + wts->curr_window_cpu[task_cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + src_wrq->curr_runnable_sum -= wts->curr_window_cpu[task_cpu]; + + if (src_wrq->prev_runnable_sum < wts->prev_window_cpu[task_cpu]) { + printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_prs=%llu is lesser than task_contrib=%llu", + p->pid, src_rq->cpu, dest_rq->cpu, + src_wrq->prev_runnable_sum, + wts->prev_window_cpu[task_cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + src_wrq->prev_runnable_sum -= wts->prev_window_cpu[task_cpu]; + + if (new_task) { + dest_wrq->nt_curr_runnable_sum += wts->curr_window; + dest_wrq->nt_prev_runnable_sum += wts->prev_window; + + if (src_wrq->nt_curr_runnable_sum < + wts->curr_window_cpu[task_cpu]) { + printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_nt_crs=%llu is lesser than task_contrib=%llu", + p->pid, src_rq->cpu, dest_rq->cpu, + src_wrq->nt_curr_runnable_sum, + wts->curr_window_cpu[task_cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + src_wrq->nt_curr_runnable_sum -= + wts->curr_window_cpu[task_cpu]; + + if (src_wrq->nt_prev_runnable_sum < + wts->prev_window_cpu[task_cpu]) { + printk_deferred("WALT-BUG pid=%u CPU%d -> CPU%d src_nt_prs=%llu is lesser than task_contrib=%llu", + p->pid, src_rq->cpu, dest_rq->cpu, + src_wrq->nt_prev_runnable_sum, + wts->prev_window_cpu[task_cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + src_wrq->nt_prev_runnable_sum -= + wts->prev_window_cpu[task_cpu]; + } + + wts->curr_window_cpu[task_cpu] = 
0; + wts->prev_window_cpu[task_cpu] = 0; + + update_cluster_load_subtractions(p, task_cpu, + src_wrq->window_start, new_task); +} + +static u32 load_to_index(u32 load) +{ + u32 index = load / sched_load_granule; + + return min(index, (u32)(NUM_LOAD_INDICES - 1)); +} + +static void +migrate_top_tasks(struct task_struct *p, struct rq *src_rq, struct rq *dst_rq) +{ + int index; + int top_index; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + u32 curr_window = wts->curr_window; + u32 prev_window = wts->prev_window; + struct walt_rq *dst_wrq = (struct walt_rq *) dst_rq->android_vendor_data1; + struct walt_rq *src_wrq = (struct walt_rq *) src_rq->android_vendor_data1; + u8 src = src_wrq->curr_table; + u8 dst = dst_wrq->curr_table; + u8 *src_table; + u8 *dst_table; + + if (curr_window) { + src_table = src_wrq->top_tasks[src]; + dst_table = dst_wrq->top_tasks[dst]; + index = load_to_index(curr_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_wrq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_wrq->top_tasks_bitmap[dst]); + + if (index > dst_wrq->curr_top) + dst_wrq->curr_top = index; + + top_index = src_wrq->curr_top; + if (index == top_index && !src_table[index]) + src_wrq->curr_top = get_top_index( + src_wrq->top_tasks_bitmap[src], top_index); + } + + if (prev_window) { + src = 1 - src; + dst = 1 - dst; + src_table = src_wrq->top_tasks[src]; + dst_table = dst_wrq->top_tasks[dst]; + index = load_to_index(prev_window); + src_table[index] -= 1; + dst_table[index] += 1; + + if (!src_table[index]) + __clear_bit(NUM_LOAD_INDICES - index - 1, + src_wrq->top_tasks_bitmap[src]); + + if (dst_table[index] == 1) + __set_bit(NUM_LOAD_INDICES - index - 1, + dst_wrq->top_tasks_bitmap[dst]); + + if (index > dst_wrq->prev_top) + dst_wrq->prev_top = index; + + top_index = src_wrq->prev_top; + if (index == top_index && !src_table[index]) + src_wrq->prev_top = get_top_index( + src_wrq->top_tasks_bitmap[src], top_index); + } +} + +static inline bool is_new_task(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->active_time < NEW_TASK_ACTIVE_TIME; +} + +static void fixup_busy_time(struct task_struct *p, int new_cpu) +{ + struct rq *src_rq = task_rq(p); + struct rq *dest_rq = cpu_rq(new_cpu); + u64 wallclock; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + bool new_task; + struct walt_related_thread_group *grp; + long pstate; + struct walt_rq *dest_wrq = (struct walt_rq *) dest_rq->android_vendor_data1; + struct walt_rq *src_wrq = (struct walt_rq *) src_rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (!p->on_rq && p->state != TASK_WAKING) + return; + + pstate = p->state; + + if (pstate == TASK_WAKING) + double_rq_lock(src_rq, dest_rq); + + wallclock = sched_ktime_clock(); + + walt_update_task_ravg(task_rq(p)->curr, task_rq(p), + TASK_UPDATE, + wallclock, 0); + walt_update_task_ravg(dest_rq->curr, dest_rq, + TASK_UPDATE, wallclock, 0); + + walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, + wallclock, 0); + + update_task_cpu_cycles(p, new_cpu, wallclock); + + new_task = is_new_task(p); + /* Protected by rq_lock */ 
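+	/*
+	 * wts->grp is also read under RCU in hot paths; see
+	 * add_task_to_group() and sched_get_group_id().
+	 */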
+ grp = wts->grp; + + /* + * For frequency aggregation, we continue to do migration fixups + * even for intra cluster migrations. This is because, the aggregated + * load has to reported on a single CPU regardless. + */ + if (grp) { + struct group_cpu_time *cpu_time; + + cpu_time = &src_wrq->grp_time; + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + cpu_time = &dest_wrq->grp_time; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (wts->curr_window) { + *src_curr_runnable_sum -= wts->curr_window; + *dst_curr_runnable_sum += wts->curr_window; + if (new_task) { + *src_nt_curr_runnable_sum -= wts->curr_window; + *dst_nt_curr_runnable_sum += wts->curr_window; + } + } + + if (wts->prev_window) { + *src_prev_runnable_sum -= wts->prev_window; + *dst_prev_runnable_sum += wts->prev_window; + if (new_task) { + *src_nt_prev_runnable_sum -= wts->prev_window; + *dst_nt_prev_runnable_sum += wts->prev_window; + } + } + } else { + inter_cluster_migration_fixup(p, new_cpu, + task_cpu(p), new_task); + } + + migrate_top_tasks(p, src_rq, dest_rq); + + if (!same_freq_domain(new_cpu, task_cpu(p))) { + src_wrq->notif_pending = true; + dest_wrq->notif_pending = true; + walt_irq_work_queue(&walt_migration_irq_work); + } + + if (is_ed_enabled()) { + if (p == src_wrq->ed_task) { + src_wrq->ed_task = NULL; + dest_wrq->ed_task = p; + } else if (is_ed_task(p, wallclock)) { + dest_wrq->ed_task = p; + } + } + + if (pstate == TASK_WAKING) + double_rq_unlock(src_rq, dest_rq); +} + +static void set_window_start(struct rq *rq) +{ + static int sync_cpu_available; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_rq *sync_wrq; + struct walt_task_struct *wts = (struct walt_task_struct *) rq->curr->android_vendor_data1; + + if (likely(wrq->window_start)) + return; + + if (!sync_cpu_available) { + wrq->window_start = 1; + sync_cpu_available = 1; + atomic64_set(&walt_irq_work_lastq_ws, wrq->window_start); + walt_load_reported_window = + atomic64_read(&walt_irq_work_lastq_ws); + + } else { + struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask)); + + sync_wrq = (struct walt_rq *) sync_rq->android_vendor_data1; + raw_spin_unlock(&rq->lock); + double_rq_lock(rq, sync_rq); + wrq->window_start = sync_wrq->window_start; + wrq->curr_runnable_sum = wrq->prev_runnable_sum = 0; + wrq->nt_curr_runnable_sum = wrq->nt_prev_runnable_sum = 0; + raw_spin_unlock(&sync_rq->lock); + } + + wts->mark_start = wrq->window_start; +} + +#define INC_STEP 8 +#define DEC_STEP 2 +#define CONSISTENT_THRES 16 +#define INC_STEP_BIG 16 +/* + * bucket_increase - update the count of all buckets + * + * @buckets: array of buckets tracking busy time of a task + * @idx: the index of bucket to be incremented + * + * Each time a complete window finishes, count of bucket that runtime + * falls in (@idx) is incremented. Counts of all other buckets are + * decayed. The rate of increase and decay could be different based + * on current count in the bucket. 
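+ *
+ * For example, with the values defined above (INC_STEP 8, INC_STEP_BIG 16,
+ * DEC_STEP 2, CONSISTENT_THRES 16): a bucket that is hit every window grows
+ * 0 -> 8 -> 16 and then by 16 per window, saturating at U8_MAX, while every
+ * other bucket decays by 2 per window towards zero.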
+ */ +static inline void bucket_increase(u8 *buckets, int idx) +{ + int i, step; + + for (i = 0; i < NUM_BUSY_BUCKETS; i++) { + if (idx != i) { + if (buckets[i] > DEC_STEP) + buckets[i] -= DEC_STEP; + else + buckets[i] = 0; + } else { + step = buckets[i] >= CONSISTENT_THRES ? + INC_STEP_BIG : INC_STEP; + if (buckets[i] > U8_MAX - step) + buckets[i] = U8_MAX; + else + buckets[i] += step; + } + } +} + +static inline int busy_to_bucket(u32 normalized_rt) +{ + int bidx; + + bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load()); + bidx = min(bidx, NUM_BUSY_BUCKETS - 1); + + /* + * Combine lowest two buckets. The lowest frequency falls into + * 2nd bucket and thus keep predicting lowest bucket is not + * useful. + */ + if (!bidx) + bidx++; + + return bidx; +} + +/* + * get_pred_busy - calculate predicted demand for a task on runqueue + * + * @p: task whose prediction is being updated + * @start: starting bucket. returned prediction should not be lower than + * this bucket. + * @runtime: runtime of the task. returned prediction should not be lower + * than this runtime. + * Note: @start can be derived from @runtime. It's passed in only to + * avoid duplicated calculation in some cases. + * + * A new predicted busy time is returned for task @p based on @runtime + * passed in. The function searches through buckets that represent busy + * time equal to or bigger than @runtime and attempts to find the bucket + * to use for prediction. Once found, it searches through historical busy + * time and returns the latest that falls into the bucket. If no such busy + * time exists, it returns the medium of that bucket. + */ +static u32 get_pred_busy(struct task_struct *p, + int start, u32 runtime) +{ + int i; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + u8 *buckets = wts->busy_buckets; + u32 *hist = wts->sum_history; + u32 dmin, dmax; + u64 cur_freq_runtime = 0; + int first = NUM_BUSY_BUCKETS, final; + u32 ret = runtime; + + /* skip prediction for new tasks due to lack of history */ + if (unlikely(is_new_task(p))) + goto out; + + /* find minimal bucket index to pick */ + for (i = start; i < NUM_BUSY_BUCKETS; i++) { + if (buckets[i]) { + first = i; + break; + } + } + /* if no higher buckets are filled, predict runtime */ + if (first >= NUM_BUSY_BUCKETS) + goto out; + + /* compute the bucket for prediction */ + final = first; + + /* determine demand range for the predicted bucket */ + if (final < 2) { + /* lowest two buckets are combined */ + dmin = 0; + final = 1; + } else { + dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS); + } + dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS); + + /* + * search through runtime history and return first runtime that falls + * into the range of predicted bucket. + */ + for (i = 0; i < sched_ravg_hist_size; i++) { + if (hist[i] >= dmin && hist[i] < dmax) { + ret = hist[i]; + break; + } + } + /* no historical runtime within bucket found, use average of the bin */ + if (ret < dmin) + ret = (dmin + dmax) / 2; + /* + * when updating in middle of a window, runtime could be higher + * than all recorded history. Always predict at least runtime. 
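+	 * (For instance, if the history samples in the predicted bucket are
+	 * all smaller than @runtime, the search above would return a stale,
+	 * lower value; the max() below keeps the prediction from dropping
+	 * under what has already run in this window.)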
+ */ + ret = max(runtime, ret); +out: + trace_sched_update_pred_demand(p, runtime, + mult_frac((unsigned int)cur_freq_runtime, 100, + sched_ravg_window), ret, wts); + return ret; +} + +static inline u32 calc_pred_demand(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (wts->pred_demand >= wts->curr_window) + return wts->pred_demand; + + return get_pred_busy(p, busy_to_bucket(wts->curr_window), + wts->curr_window); +} + +/* + * predictive demand of a task is calculated at the window roll-over. + * if the task current window busy time exceeds the predicted + * demand, update it here to reflect the task needs. + */ +static void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event) +{ + u32 new, old; + u16 new_scaled; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (!sched_predl) + return; + + if (is_idle_task(p)) + return; + + if (event != PUT_PREV_TASK && event != TASK_UPDATE && + (!SCHED_FREQ_ACCOUNT_WAIT_TIME || + (event != TASK_MIGRATE && + event != PICK_NEXT_TASK))) + return; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME) + return; + } + + new = calc_pred_demand(p); + old = wts->pred_demand; + + if (old >= new) + return; + + new_scaled = scale_demand(new); + if (task_on_rq_queued(p) && (!task_has_dl_policy(p) || + !p->dl.dl_throttled)) + fixup_walt_sched_stats_common(rq, p, + wts->demand_scaled, + new_scaled); + + wts->pred_demand = new; + wts->pred_demand_scaled = new_scaled; +} + +static void clear_top_tasks_bitmap(unsigned long *bitmap) +{ + memset(bitmap, 0, top_tasks_bitmap_size); + __set_bit(NUM_LOAD_INDICES, bitmap); +} + +static inline void clear_top_tasks_table(u8 *table) +{ + memset(table, 0, NUM_LOAD_INDICES * sizeof(u8)); +} + +static void update_top_tasks(struct task_struct *p, struct rq *rq, + u32 old_curr_window, int new_window, bool full_window) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + u8 curr = wrq->curr_table; + u8 prev = 1 - curr; + u8 *curr_table = wrq->top_tasks[curr]; + u8 *prev_table = wrq->top_tasks[prev]; + int old_index, new_index, update_index; + u32 curr_window = wts->curr_window; + u32 prev_window = wts->prev_window; + bool zero_index_update; + + if (old_curr_window == curr_window && !new_window) + return; + + old_index = load_to_index(old_curr_window); + new_index = load_to_index(curr_window); + + if (!new_window) { + zero_index_update = !old_curr_window && curr_window; + if (old_index != new_index || zero_index_update) { + if (old_curr_window) + curr_table[old_index] -= 1; + if (curr_window) + curr_table[new_index] += 1; + if (new_index > wrq->curr_top) + wrq->curr_top = new_index; + } + + if (!curr_table[old_index]) + __clear_bit(NUM_LOAD_INDICES - old_index - 1, + wrq->top_tasks_bitmap[curr]); + + if (curr_table[new_index] == 1) + __set_bit(NUM_LOAD_INDICES - new_index - 1, + wrq->top_tasks_bitmap[curr]); + + return; + } + + /* + * The window has rolled over for this task. By the time we get + * here, curr/prev swaps would has already occurred. So we need + * to use prev_window for the new index. + */ + update_index = load_to_index(prev_window); + + if (full_window) { + /* + * Two cases here. Either 'p' ran for the entire window or + * it didn't run at all. 
In either case there is no entry + * in the prev table. If 'p' ran the entire window, we just + * need to create a new entry in the prev table. In this case + * update_index will be correspond to sched_ravg_window + * so we can unconditionally update the top index. + */ + if (prev_window) { + prev_table[update_index] += 1; + wrq->prev_top = update_index; + } + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + wrq->top_tasks_bitmap[prev]); + } else { + zero_index_update = !old_curr_window && prev_window; + if (old_index != update_index || zero_index_update) { + if (old_curr_window) + prev_table[old_index] -= 1; + + prev_table[update_index] += 1; + + if (update_index > wrq->prev_top) + wrq->prev_top = update_index; + + if (!prev_table[old_index]) + __clear_bit(NUM_LOAD_INDICES - old_index - 1, + wrq->top_tasks_bitmap[prev]); + + if (prev_table[update_index] == 1) + __set_bit(NUM_LOAD_INDICES - update_index - 1, + wrq->top_tasks_bitmap[prev]); + } + } + + if (curr_window) { + curr_table[new_index] += 1; + + if (new_index > wrq->curr_top) + wrq->curr_top = new_index; + + if (curr_table[new_index] == 1) + __set_bit(NUM_LOAD_INDICES - new_index - 1, + wrq->top_tasks_bitmap[curr]); + } +} + +static void rollover_top_tasks(struct rq *rq, bool full_window) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u8 curr_table = wrq->curr_table; + u8 prev_table = 1 - curr_table; + int curr_top = wrq->curr_top; + + clear_top_tasks_table(wrq->top_tasks[prev_table]); + clear_top_tasks_bitmap(wrq->top_tasks_bitmap[prev_table]); + + if (full_window) { + curr_top = 0; + clear_top_tasks_table(wrq->top_tasks[curr_table]); + clear_top_tasks_bitmap(wrq->top_tasks_bitmap[curr_table]); + } + + wrq->curr_table = prev_table; + wrq->prev_top = curr_top; + wrq->curr_top = 0; +} + +static u32 empty_windows[WALT_NR_CPUS]; + +static void rollover_task_window(struct task_struct *p, bool full_window) +{ + u32 *curr_cpu_windows = empty_windows; + u32 curr_window; + int i; + struct walt_rq *wrq = (struct walt_rq *) task_rq(p)->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + /* Rollover the sum */ + curr_window = 0; + + if (!full_window) { + curr_window = wts->curr_window; + curr_cpu_windows = wts->curr_window_cpu; + } + + wts->prev_window = curr_window; + wts->curr_window = 0; + + /* Roll over individual CPU contributions */ + for (i = 0; i < nr_cpu_ids; i++) { + wts->prev_window_cpu[i] = curr_cpu_windows[i]; + wts->curr_window_cpu[i] = 0; + } + + if (is_new_task(p)) + wts->active_time += wrq->prev_window_size; +} + +static inline int cpu_is_waiting_on_io(struct rq *rq) +{ + if (!sched_io_is_busy) + return 0; + + return atomic_read(&rq->nr_iowait); +} + +static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p, + u64 irqtime, int event) +{ + if (is_idle_task(p)) { + /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */ + if (event == PICK_NEXT_TASK) + return 0; + + /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */ + return irqtime || cpu_is_waiting_on_io(rq); + } + + if (event == TASK_WAKE) + return 0; + + if (event == PUT_PREV_TASK || event == IRQ_UPDATE) + return 1; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? 
SCHED_FREQ_ACCOUNT_WAIT_TIME : 0; + } + + /* TASK_MIGRATE, PICK_NEXT_TASK left */ + return SCHED_FREQ_ACCOUNT_WAIT_TIME; +} + +#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y) + +static inline u64 scale_exec_time(u64 delta, struct rq *rq) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + return (delta * wrq->task_exec_scale) >> 10; +} + +/* Convert busy time to frequency equivalent + * Assumes load is scaled to 1024 + */ +static inline unsigned int load_to_freq(struct rq *rq, unsigned int load) +{ + return mult_frac(cpu_max_possible_freq(cpu_of(rq)), load, + (unsigned int)arch_scale_cpu_capacity(cpu_of(rq))); +} + +static bool do_pl_notif(struct rq *rq) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u64 prev = wrq->old_busy_time; + u64 pl = wrq->walt_stats.pred_demands_sum_scaled; + int cpu = cpu_of(rq); + + /* If already at max freq, bail out */ + if (capacity_orig_of(cpu) == capacity_curr_of(cpu)) + return false; + + prev = max(prev, wrq->old_estimated_time); + + /* 400 MHz filter. */ + return (pl > prev) && (load_to_freq(rq, pl - prev) > 400000); +} + +static void rollover_cpu_window(struct rq *rq, bool full_window) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u64 curr_sum = wrq->curr_runnable_sum; + u64 nt_curr_sum = wrq->nt_curr_runnable_sum; + u64 grp_curr_sum = wrq->grp_time.curr_runnable_sum; + u64 grp_nt_curr_sum = wrq->grp_time.nt_curr_runnable_sum; + + if (unlikely(full_window)) { + curr_sum = 0; + nt_curr_sum = 0; + grp_curr_sum = 0; + grp_nt_curr_sum = 0; + } + + wrq->prev_runnable_sum = curr_sum; + wrq->nt_prev_runnable_sum = nt_curr_sum; + wrq->grp_time.prev_runnable_sum = grp_curr_sum; + wrq->grp_time.nt_prev_runnable_sum = grp_nt_curr_sum; + + wrq->curr_runnable_sum = 0; + wrq->nt_curr_runnable_sum = 0; + wrq->grp_time.curr_runnable_sum = 0; + wrq->grp_time.nt_curr_runnable_sum = 0; +} + +/* + * Account cpu activity in its + * busy time counters(wrq->curr/prev_runnable_sum) + */ +static void update_cpu_busy_time(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) +{ + int new_window, full_window = 0; + int p_is_curr_task = (p == rq->curr); + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + u64 mark_start = wts->mark_start; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u64 window_start = wrq->window_start; + u32 window_size = wrq->prev_window_size; + u64 delta; + u64 *curr_runnable_sum = &wrq->curr_runnable_sum; + u64 *prev_runnable_sum = &wrq->prev_runnable_sum; + u64 *nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum; + u64 *nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum; + bool new_task; + struct walt_related_thread_group *grp; + int cpu = rq->cpu; + u32 old_curr_window = wts->curr_window; + + new_window = mark_start < window_start; + if (new_window) + full_window = (window_start - mark_start) >= window_size; + + /* + * Handle per-task window rollover. We don't care about the + * idle task. 
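+	 * (rollover_task_window() rolls curr_window, including its per-cpu
+	 * split, over into prev_window, or drops it when at least one full
+	 * window has elapsed; the idle task is skipped because its runtime
+	 * is never tracked as task demand.)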
+ */ + if (!is_idle_task(p)) { + if (new_window) + rollover_task_window(p, full_window); + } + + new_task = is_new_task(p); + + if (p_is_curr_task && new_window) { + rollover_cpu_window(rq, full_window); + rollover_top_tasks(rq, full_window); + } + + if (!account_busy_for_cpu_time(rq, p, irqtime, event)) + goto done; + + grp = wts->grp; + if (grp) { + struct group_cpu_time *cpu_time = &wrq->grp_time; + + curr_runnable_sum = &cpu_time->curr_runnable_sum; + prev_runnable_sum = &cpu_time->prev_runnable_sum; + + nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + } + + if (!new_window) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. No rollover + * since we didn't start a new window. An example of this is + * when a task starts execution and then sleeps within the + * same window. + */ + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) + delta = wallclock - mark_start; + else + delta = irqtime; + delta = scale_exec_time(delta, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p)) { + wts->curr_window += delta; + wts->curr_window_cpu[cpu] += delta; + } + + goto done; + } + + if (!p_is_curr_task) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has also started, but p is not the current task, so the + * window is not rolled over - just split up and account + * as necessary into curr and prev. The window is only + * rolled over when a new window is processed for the current + * task. + * + * Irqtime can't be accounted by a task that isn't the + * currently running task. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. + */ + delta = scale_exec_time(window_start - mark_start, rq); + wts->prev_window += delta; + wts->prev_window_cpu[cpu] += delta; + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + wts->prev_window = delta; + wts->prev_window_cpu[cpu] = delta; + } + + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + wts->curr_window = delta; + wts->curr_window_cpu[cpu] = delta; + + goto done; + } + + if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. If any of these three above conditions are true + * then this busy time can't be accounted as irqtime. + * + * Busy time for the idle task need not be accounted. + * + * An example of this would be a task that starts execution + * and then sleeps once a new window has begun. + */ + + if (!full_window) { + /* + * A full window hasn't elapsed, account partial + * contribution to previous completed window. 
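+			 * For example (illustrative 20ms window): if
+			 * mark_start was 5ms before window_start, only that
+			 * 5ms, frequency-scaled by scale_exec_time(), goes to
+			 * the previous window; the part after window_start is
+			 * added to the current window further below.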
+ */ + delta = scale_exec_time(window_start - mark_start, rq); + if (!is_idle_task(p)) { + wts->prev_window += delta; + wts->prev_window_cpu[cpu] += delta; + } + } else { + /* + * Since at least one full window has elapsed, + * the contribution to the previous window is the + * full window (window_size). + */ + delta = scale_exec_time(window_size, rq); + if (!is_idle_task(p)) { + wts->prev_window = delta; + wts->prev_window_cpu[cpu] = delta; + } + } + + /* + * Rollover is done here by overwriting the values in + * prev_runnable_sum and curr_runnable_sum. + */ + *prev_runnable_sum += delta; + if (new_task) + *nt_prev_runnable_sum += delta; + + /* Account piece of busy time in the current window. */ + delta = scale_exec_time(wallclock - window_start, rq); + *curr_runnable_sum += delta; + if (new_task) + *nt_curr_runnable_sum += delta; + + if (!is_idle_task(p)) { + wts->curr_window = delta; + wts->curr_window_cpu[cpu] = delta; + } + + goto done; + } + + if (irqtime) { + /* + * account_busy_for_cpu_time() = 1 so busy time needs + * to be accounted to the current window. A new window + * has started and p is the current task so rollover is + * needed. The current task must be the idle task because + * irqtime is not accounted for any other task. + * + * Irqtime will be accounted each time we process IRQ activity + * after a period of idleness, so we know the IRQ busy time + * started at wallclock - irqtime. + */ + + SCHED_BUG_ON(!is_idle_task(p)); + mark_start = wallclock - irqtime; + + /* + * Roll window over. If IRQ busy time was just in the current + * window then that is all that need be accounted. + */ + if (mark_start > window_start) { + *curr_runnable_sum = scale_exec_time(irqtime, rq); + return; + } + + /* + * The IRQ busy time spanned multiple windows. Process the + * busy time preceding the current window start first. + */ + delta = window_start - mark_start; + if (delta > window_size) + delta = window_size; + delta = scale_exec_time(delta, rq); + *prev_runnable_sum += delta; + + /* Process the remaining IRQ busy time in the current window. */ + delta = wallclock - window_start; + wrq->curr_runnable_sum = scale_exec_time(delta, rq); + + return; + } + +done: + if (!is_idle_task(p)) + update_top_tasks(p, rq, old_curr_window, + new_window, full_window); +} + +static inline u32 predict_and_update_buckets( + struct task_struct *p, u32 runtime) { + int bidx; + u32 pred_demand; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (!sched_predl) + return 0; + + bidx = busy_to_bucket(runtime); + pred_demand = get_pred_busy(p, bidx, runtime); + bucket_increase(wts->busy_buckets, bidx); + + return pred_demand; +} + +static int +account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event) +{ + /* + * No need to bother updating task demand for the idle task. + */ + if (is_idle_task(p)) + return 0; + + /* + * When a task is waking up it is completing a segment of non-busy + * time. Likewise, if wait time is not treated as busy time, then + * when a task begins to run or is migrated, it is not running and + * is completing a segment of non-busy time. + */ + if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME && + (event == PICK_NEXT_TASK || event == TASK_MIGRATE))) + return 0; + + /* + * The idle exit time is not accounted for the first task _picked_ up to + * run on the idle CPU. 
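+	 * (i.e. the interval during which this CPU sat idle before the pick
+	 * is not charged to the incoming task's demand.)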
+ */ + if (event == PICK_NEXT_TASK && rq->curr == rq->idle) + return 0; + + /* + * TASK_UPDATE can be called on sleeping task, when its moved between + * related groups + */ + if (event == TASK_UPDATE) { + if (rq->curr == p) + return 1; + + return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0; + } + + return 1; +} + +/* + * Called when new window is starting for a task, to record cpu usage over + * recently concluded window(s). Normally 'samples' should be 1. It can be > 1 + * when, say, a real-time task runs without preemption for several windows at a + * stretch. + */ +static void update_history(struct rq *rq, struct task_struct *p, + u32 runtime, int samples, int event) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + u32 *hist = &wts->sum_history[0]; + int ridx, widx; + u32 max = 0, avg, demand, pred_demand; + u64 sum = 0; + u16 demand_scaled, pred_demand_scaled; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + /* Ignore windows where task had no activity */ + if (!runtime || is_idle_task(p) || !samples) + goto done; + + /* Push new 'runtime' value onto stack */ + widx = sched_ravg_hist_size - 1; + ridx = widx - samples; + for (; ridx >= 0; --widx, --ridx) { + hist[widx] = hist[ridx]; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) { + hist[widx] = runtime; + sum += hist[widx]; + if (hist[widx] > max) + max = hist[widx]; + } + + wts->sum = 0; + + if (sysctl_sched_window_stats_policy == WINDOW_STATS_RECENT) { + demand = runtime; + } else if (sysctl_sched_window_stats_policy == WINDOW_STATS_MAX) { + demand = max; + } else { + avg = div64_u64(sum, sched_ravg_hist_size); + if (sysctl_sched_window_stats_policy == WINDOW_STATS_AVG) + demand = avg; + else + demand = max(avg, runtime); + } + pred_demand = predict_and_update_buckets(p, runtime); + demand_scaled = scale_demand(demand); + pred_demand_scaled = scale_demand(pred_demand); + + /* + * A throttled deadline sched class task gets dequeued without + * changing p->on_rq. Since the dequeue decrements walt stats + * avoid decrementing it here again. + * + * When window is rolled over, the cumulative window demand + * is reset to the cumulative runnable average (contribution from + * the tasks on the runqueue). If the current task is dequeued + * already, it's demand is not included in the cumulative runnable + * average. So add the task demand separately to cumulative window + * demand. 
+ */ + if (!task_has_dl_policy(p) || !p->dl.dl_throttled) { + if (task_on_rq_queued(p)) + fixup_walt_sched_stats_common(rq, p, + demand_scaled, pred_demand_scaled); + } + + wts->demand = demand; + wts->demand_scaled = demand_scaled; + wts->coloc_demand = div64_u64(sum, sched_ravg_hist_size); + wts->pred_demand = pred_demand; + wts->pred_demand_scaled = pred_demand_scaled; + + if (demand_scaled > sysctl_sched_min_task_util_for_colocation) + wts->unfilter = sysctl_sched_task_unfilter_period; + else + if (wts->unfilter) + wts->unfilter = max_t(int, 0, + wts->unfilter - wrq->prev_window_size); + +done: + trace_sched_update_history(rq, p, runtime, samples, event, wrq, wts); +} + +static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + delta = scale_exec_time(delta, rq); + wts->sum += delta; + if (unlikely(wts->sum > sched_ravg_window)) + wts->sum = sched_ravg_window; + + return delta; +} + +/* + * Account cpu demand of task and/or update task's cpu demand history + * + * ms = wts->mark_start; + * wc = wallclock + * ws = wrq->window_start + * + * Three possibilities: + * + * a) Task event is contained within one window. + * window_start < mark_start < wallclock + * + * ws ms wc + * | | | + * V V V + * |---------------| + * + * In this case, wts->sum is updated *iff* event is appropriate + * (ex: event == PUT_PREV_TASK) + * + * b) Task event spans two windows. + * mark_start < window_start < wallclock + * + * ms ws wc + * | | | + * V V V + * -----|------------------- + * + * In this case, wts->sum is updated with (ws - ms) *iff* event + * is appropriate, then a new window sample is recorded followed + * by wts->sum being set to (wc - ws) *iff* event is appropriate. + * + * c) Task event spans more than two windows. + * + * ms ws_tmp ws wc + * | | | | + * V V V V + * ---|-------|-------|-------|-------|------ + * | | + * |<------ nr_full_windows ------>| + * + * In this case, wts->sum is updated with (ws_tmp - ms) first *iff* + * event is appropriate, window sample of wts->sum is recorded, + * 'nr_full_window' samples of window_size is also recorded *iff* + * event is appropriate and finally wts->sum is set to (wc - ws) + * *iff* event is appropriate. + * + * IMPORTANT : Leave wts->mark_start unchanged, as update_cpu_busy_time() + * depends on it! + */ +static u64 update_task_demand(struct task_struct *p, struct rq *rq, + int event, u64 wallclock) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + u64 mark_start = wts->mark_start; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u64 delta, window_start = wrq->window_start; + int new_window, nr_full_windows; + u32 window_size = sched_ravg_window; + u64 runtime; + + new_window = mark_start < window_start; + if (!account_busy_for_task_demand(rq, p, event)) { + if (new_window) + /* + * If the time accounted isn't being accounted as + * busy time, and a new window started, only the + * previous window need be closed out with the + * pre-existing demand. Multiple windows may have + * elapsed, but since empty windows are dropped, + * it is not necessary to account those. + */ + update_history(rq, p, wts->sum, 1, event); + return 0; + } + + if (!new_window) { + /* + * The simple case - busy time contained within the existing + * window. + */ + return add_to_task_demand(rq, p, wallclock - mark_start); + } + + /* + * Busy time spans at least two windows. 
Temporarily rewind + * window_start to first window boundary after mark_start. + */ + delta = window_start - mark_start; + nr_full_windows = div64_u64(delta, window_size); + window_start -= (u64)nr_full_windows * (u64)window_size; + + /* Process (window_start - mark_start) first */ + runtime = add_to_task_demand(rq, p, window_start - mark_start); + + /* Push new sample(s) into task's demand history */ + update_history(rq, p, wts->sum, 1, event); + if (nr_full_windows) { + u64 scaled_window = scale_exec_time(window_size, rq); + + update_history(rq, p, scaled_window, nr_full_windows, event); + runtime += nr_full_windows * scaled_window; + } + + /* + * Roll window_start back to current to process any remainder + * in current window. + */ + window_start += (u64)nr_full_windows * (u64)window_size; + + /* Process (wallclock - window_start) next */ + mark_start = window_start; + runtime += add_to_task_demand(rq, p, wallclock - mark_start); + + return runtime; +} + +static inline unsigned int cpu_cur_freq(int cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + return wrq->cluster->cur_freq; +} + +static void +update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 cur_cycles; + u64 cycles_delta; + u64 time_delta; + int cpu = cpu_of(rq); + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + lockdep_assert_held(&rq->lock); + + if (!use_cycle_counter) { + wrq->task_exec_scale = DIV64_U64_ROUNDUP(cpu_cur_freq(cpu) * + arch_scale_cpu_capacity(cpu), + wrq->cluster->max_possible_freq); + return; + } + + cur_cycles = read_cycle_counter(cpu, wallclock); + + /* + * If current task is idle task and irqtime == 0 CPU was + * indeed idle and probably its cycle counter was not + * increasing. We still need estimatied CPU frequency + * for IO wait time accounting. Use the previously + * calculated frequency in such a case. + */ + if (!is_idle_task(rq->curr) || irqtime) { + if (unlikely(cur_cycles < wts->cpu_cycles)) + cycles_delta = cur_cycles + (U64_MAX - + wts->cpu_cycles); + else + cycles_delta = cur_cycles - wts->cpu_cycles; + cycles_delta = cycles_delta * NSEC_PER_MSEC; + + if (event == IRQ_UPDATE && is_idle_task(p)) + /* + * Time between mark_start of idle task and IRQ handler + * entry time is CPU cycle counter stall period. + * Upon IRQ handler entry walt_sched_account_irqstart() + * replenishes idle task's cpu cycle counter so + * cycles_delta now represents increased cycles during + * IRQ handler rather than time between idle entry and + * IRQ exit. Thus use irqtime as time delta. 
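+			 * Either way cycles_delta / time_delta approximates
+			 * the current frequency, so task_exec_scale below is
+			 * roughly arch_scale_cpu_capacity() scaled by
+			 * cur_freq / max_possible_freq, mirroring the
+			 * !use_cycle_counter path above.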
+ */ + time_delta = irqtime; + else + time_delta = wallclock - wts->mark_start; + SCHED_BUG_ON((s64)time_delta < 0); + + wrq->task_exec_scale = DIV64_U64_ROUNDUP(cycles_delta * + arch_scale_cpu_capacity(cpu), + time_delta * + wrq->cluster->max_possible_freq); + + trace_sched_get_task_cpu_cycles(cpu, event, + cycles_delta, time_delta, p); + } + + wts->cpu_cycles = cur_cycles; +} + +static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq) +{ + u64 result; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + if (old_window_start == wrq->window_start) + return; + + result = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start, + wrq->window_start); + if (result == old_window_start) + walt_irq_work_queue(&walt_cpufreq_irq_work); +} + +/* Reflect task activity on its demand and cpu's busy time statistics */ +static void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime) +{ + u64 old_window_start; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (!wrq->window_start || wts->mark_start == wallclock) + return; + + lockdep_assert_held(&rq->lock); + + old_window_start = update_window_start(rq, wallclock, event); + + if (!wts->mark_start) { + update_task_cpu_cycles(p, cpu_of(rq), wallclock); + goto done; + } + + update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime); + update_task_demand(p, rq, event, wallclock); + update_cpu_busy_time(p, rq, event, wallclock, irqtime); + update_task_pred_demand(rq, p, event); + + trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime, + &wrq->grp_time, wrq, wts); + trace_sched_update_task_ravg_mini(p, rq, event, wallclock, irqtime, + &wrq->grp_time, wrq, wts); + +done: + wts->mark_start = wallclock; + + run_walt_irq_work(old_window_start, rq); +} + +u32 sched_get_init_task_load(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->init_load_pct; +} + +int sched_set_init_task_load(struct task_struct *p, int init_load_pct) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (init_load_pct < 0 || init_load_pct > 100) + return -EINVAL; + + wts->init_load_pct = init_load_pct; + + return 0; +} + +static void init_new_task_load(struct task_struct *p) +{ + int i; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + struct walt_task_struct *cur_wts = + (struct walt_task_struct *) current->android_vendor_data1; + u32 init_load_windows = sched_init_task_load_windows; + u32 init_load_windows_scaled = sched_init_task_load_windows_scaled; + u32 init_load_pct = cur_wts->init_load_pct; + + wts->init_load_pct = 0; + rcu_assign_pointer(wts->grp, NULL); + INIT_LIST_HEAD(&wts->grp_list); + + wts->mark_start = 0; + wts->sum = 0; + wts->curr_window = 0; + wts->prev_window = 0; + wts->active_time = 0; + for (i = 0; i < NUM_BUSY_BUCKETS; ++i) + wts->busy_buckets[i] = 0; + + wts->cpu_cycles = 0; + + memset(wts->curr_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS); + memset(wts->prev_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS); + + if (init_load_pct) { + init_load_windows = div64_u64((u64)init_load_pct * + (u64)sched_ravg_window, 100); + init_load_windows_scaled = scale_demand(init_load_windows); + } + + wts->demand = init_load_windows; + wts->demand_scaled = init_load_windows_scaled; + wts->coloc_demand = init_load_windows; + 
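+	/* no prediction history yet; busy_buckets were cleared above */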
wts->pred_demand = 0; + wts->pred_demand_scaled = 0; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + wts->sum_history[i] = init_load_windows; + wts->misfit = false; + wts->rtg_high_prio = false; + wts->unfilter = sysctl_sched_task_unfilter_period; +} + +static void init_existing_task_load(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + init_new_task_load(p); + cpumask_copy(&wts->cpus_requested, &p->cpus_mask); +} + +static void walt_task_dead(struct task_struct *p) +{ + sched_set_group_id(p, 0); +} + +static void reset_task_stats(struct task_struct *p) +{ + int i = 0; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + memset(wts->curr_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS); + memset(wts->prev_window_cpu, 0, sizeof(u32) * WALT_NR_CPUS); + + wts->mark_start = 0; + wts->sum = 0; + wts->demand = 0; + wts->coloc_demand = 0; + for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i) + wts->sum_history[i] = 0; + wts->curr_window = 0; + wts->prev_window = 0; + wts->pred_demand = 0; + for (i = 0; i < NUM_BUSY_BUCKETS; ++i) + wts->busy_buckets[i] = 0; + wts->demand_scaled = 0; + wts->pred_demand_scaled = 0; + wts->active_time = 0; +} + +static void mark_task_starting(struct task_struct *p) +{ + u64 wallclock; + struct rq *rq = task_rq(p); + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (!wrq->window_start) { + reset_task_stats(p); + return; + } + + wallclock = sched_ktime_clock(); + wts->mark_start = wts->last_wake_ts = wallclock; + wts->last_enqueued_ts = wallclock; + update_task_cpu_cycles(p, cpu_of(rq), wallclock); +} + +/* + * Task groups whose aggregate demand on a cpu is more than + * sched_group_upmigrate need to be up-migrated if possible. + */ +unsigned int __read_mostly sched_group_upmigrate = 20000000; + +/* + * Task groups, once up-migrated, will need to drop their aggregate + * demand to less than sched_group_downmigrate before they are "down" + * migrated. 
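+ *
+ * The gap between the two thresholds provides hysteresis, so a group whose
+ * demand hovers around a single cutoff does not ping-pong between clusters.
+ * Both values are recomputed from the sysctl percentages by
+ * walt_update_group_thresholds() below.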
+ */ +unsigned int __read_mostly sched_group_downmigrate = 19000000; + +void walt_update_group_thresholds(void) +{ + unsigned int min_scale = arch_scale_cpu_capacity( + cluster_first_cpu(sched_cluster[0])); + u64 min_ms = min_scale * (sched_ravg_window >> SCHED_CAPACITY_SHIFT); + + sched_group_upmigrate = div64_ul(min_ms * + sysctl_sched_group_upmigrate_pct, 100); + sched_group_downmigrate = div64_ul(min_ms * + sysctl_sched_group_downmigrate_pct, 100); +} + +struct walt_sched_cluster *sched_cluster[WALT_NR_CPUS]; +__read_mostly int num_sched_clusters; + +struct list_head cluster_head; + +static struct walt_sched_cluster init_cluster = { + .list = LIST_HEAD_INIT(init_cluster.list), + .id = 0, + .cur_freq = 1, + .max_possible_freq = 1, + .aggr_grp_load = 0, +}; + +static void init_clusters(void) +{ + init_cluster.cpus = *cpu_possible_mask; + raw_spin_lock_init(&init_cluster.load_lock); + INIT_LIST_HEAD(&cluster_head); + list_add(&init_cluster.list, &cluster_head); +} + +static void +insert_cluster(struct walt_sched_cluster *cluster, struct list_head *head) +{ + struct walt_sched_cluster *tmp; + struct list_head *iter = head; + + list_for_each_entry(tmp, head, list) { + if (arch_scale_cpu_capacity(cluster_first_cpu(cluster)) + < arch_scale_cpu_capacity(cluster_first_cpu(tmp))) + break; + iter = &tmp->list; + } + + list_add(&cluster->list, iter); +} + +static struct walt_sched_cluster *alloc_new_cluster(const struct cpumask *cpus) +{ + struct walt_sched_cluster *cluster = NULL; + + cluster = kzalloc(sizeof(struct walt_sched_cluster), GFP_ATOMIC); + BUG_ON(!cluster); + + INIT_LIST_HEAD(&cluster->list); + cluster->cur_freq = 1; + cluster->max_possible_freq = 1; + + raw_spin_lock_init(&cluster->load_lock); + cluster->cpus = *cpus; + + return cluster; +} + +static void add_cluster(const struct cpumask *cpus, struct list_head *head) +{ + struct walt_sched_cluster *cluster = alloc_new_cluster(cpus); + int i; + struct walt_rq *wrq; + + for_each_cpu(i, cpus) { + wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1; + wrq->cluster = cluster; + } + + insert_cluster(cluster, head); + num_sched_clusters++; +} + +static void cleanup_clusters(struct list_head *head) +{ + struct walt_sched_cluster *cluster, *tmp; + int i; + struct walt_rq *wrq; + + list_for_each_entry_safe(cluster, tmp, head, list) { + for_each_cpu(i, &cluster->cpus) { + wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1; + wrq->cluster = &init_cluster; + } + list_del(&cluster->list); + num_sched_clusters--; + kfree(cluster); + } +} + +static inline void assign_cluster_ids(struct list_head *head) +{ + struct walt_sched_cluster *cluster; + int pos = 0; + + list_for_each_entry(cluster, head, list) { + cluster->id = pos; + sched_cluster[pos++] = cluster; + } + + WARN_ON(pos > MAX_NR_CLUSTERS); +} + +static inline void +move_list(struct list_head *dst, struct list_head *src, bool sync_rcu) +{ + struct list_head *first, *last; + + first = src->next; + last = src->prev; + + if (sync_rcu) { + INIT_LIST_HEAD_RCU(src); + synchronize_rcu(); + } + + first->prev = dst; + dst->prev = last; + last->next = dst; + + /* Ensure list sanity before making the head visible to all CPUs. 
*/
+	smp_mb();
+	dst->next = first;
+}
+
+static void update_all_clusters_stats(void)
+{
+	struct walt_sched_cluster *cluster;
+	u64 highest_mpc = 0, lowest_mpc = U64_MAX;
+
+	for_each_sched_cluster(cluster) {
+		u64 mpc = arch_scale_cpu_capacity(
+				cluster_first_cpu(cluster));
+
+		if (mpc > highest_mpc)
+			highest_mpc = mpc;
+
+		if (mpc < lowest_mpc)
+			lowest_mpc = mpc;
+	}
+
+	max_possible_capacity = highest_mpc;
+	min_max_possible_capacity = lowest_mpc;
+	walt_update_group_thresholds();
+}
+
+static bool walt_clusters_parsed;
+cpumask_t __read_mostly **cpu_array;
+
+static void init_cpu_array(void)
+{
+	int i;
+
+	cpu_array = kcalloc(num_sched_clusters, sizeof(cpumask_t *),
+			GFP_ATOMIC | __GFP_NOFAIL);
+	if (!cpu_array)
+		SCHED_BUG_ON(1);
+
+	for (i = 0; i < num_sched_clusters; i++) {
+		cpu_array[i] = kcalloc(num_sched_clusters, sizeof(cpumask_t),
+				GFP_ATOMIC | __GFP_NOFAIL);
+		if (!cpu_array[i])
+			SCHED_BUG_ON(1);
+	}
+}
+
+static void build_cpu_array(void)
+{
+	int i;
+
+	if (!cpu_array)
+		SCHED_BUG_ON(1);
+	/* Construct cpu_array row by row */
+	for (i = 0; i < num_sched_clusters; i++) {
+		int j, k = 1;
+
+		/* Fill out first column with appropriate cpu arrays */
+		cpumask_copy(&cpu_array[i][0], &sched_cluster[i]->cpus);
+		/*
+		 * k starts from column 1 because 0 is filled
+		 * Fill clusters for the rest of the row,
+		 * above i in ascending order
+		 */
+		for (j = i + 1; j < num_sched_clusters; j++) {
+			cpumask_copy(&cpu_array[i][k],
+					&sched_cluster[j]->cpus);
+			k++;
+		}
+
+		/*
+		 * k starts from where we left off above.
+		 * Fill clusters below i in descending order.
+		 */
+		for (j = i - 1; j >= 0; j--) {
+			cpumask_copy(&cpu_array[i][k],
+					&sched_cluster[j]->cpus);
+			k++;
+		}
+	}
+}
+
+static void walt_get_possible_siblings(int cpuid, struct cpumask *cluster_cpus)
+{
+	int cpu;
+	struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
+
+	if (cpuid_topo->package_id == -1)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		cpu_topo = &cpu_topology[cpu];
+
+		if (cpuid_topo->package_id != cpu_topo->package_id)
+			continue;
+		cpumask_set_cpu(cpu, cluster_cpus);
+	}
+}
+
+static void walt_update_cluster_topology(void)
+{
+	struct cpumask cpus = *cpu_possible_mask;
+	struct cpumask cluster_cpus;
+	struct walt_sched_cluster *cluster;
+	struct list_head new_head;
+	int i;
+	struct walt_rq *wrq;
+
+	INIT_LIST_HEAD(&new_head);
+
+	for_each_cpu(i, &cpus) {
+		cpumask_clear(&cluster_cpus);
+		walt_get_possible_siblings(i, &cluster_cpus);
+		if (cpumask_empty(&cluster_cpus)) {
+			WARN(1, "WALT: Invalid cpu topology!!");
+			cleanup_clusters(&new_head);
+			return;
+		}
+		cpumask_andnot(&cpus, &cpus, &cluster_cpus);
+		add_cluster(&cluster_cpus, &new_head);
+	}
+
+	assign_cluster_ids(&new_head);
+
+	list_for_each_entry(cluster, &new_head, list) {
+		struct cpufreq_policy *policy;
+
+		policy = cpufreq_cpu_get_raw(cluster_first_cpu(cluster));
+		/*
+		 * walt_update_cluster_topology() must be called AFTER policies
+		 * for all cpus are initialized. If not, simply BUG().
+		 */
+		SCHED_BUG_ON(!policy);
+
+		if (policy) {
+			cluster->max_possible_freq = policy->cpuinfo.max_freq;
+			for_each_cpu(i, &cluster->cpus) {
+				wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1;
+				cpumask_copy(&wrq->freq_domain_cpumask,
+						policy->related_cpus);
+			}
+			if (policy->cpuinfo.max_freq > cpuinfo_max_freq_cached)
+				cpuinfo_max_freq_cached =
+					policy->cpuinfo.max_freq;
+		}
+	}
+
+	/*
+	 * Ensure cluster ids are visible to all CPUs before making
+	 * cluster_head visible.
+ */ + move_list(&cluster_head, &new_head, false); + update_all_clusters_stats(); + cluster = NULL; + + for_each_sched_cluster(cluster) { + if (cpumask_weight(&cluster->cpus) == 1) + cpumask_or(&asym_cap_sibling_cpus, + &asym_cap_sibling_cpus, &cluster->cpus); + } + + if (cpumask_weight(&asym_cap_sibling_cpus) == 1) + cpumask_clear(&asym_cap_sibling_cpus); + + init_cpu_array(); + build_cpu_array(); + + walt_clusters_parsed = true; +} + +static int cpufreq_notifier_trans(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data; + unsigned int cpu = freq->policy->cpu, new_freq = freq->new; + unsigned long flags; + struct walt_sched_cluster *cluster; + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + struct cpumask policy_cpus = wrq->freq_domain_cpumask; + int i, j; + + if (use_cycle_counter) + return NOTIFY_DONE; + wrq = (struct walt_rq *) cpu_rq(cpumask_first(&policy_cpus))->android_vendor_data1; + if (wrq->cluster == &init_cluster) + return NOTIFY_DONE; + + if (val != CPUFREQ_POSTCHANGE) + return NOTIFY_DONE; + + if (cpu_cur_freq(cpu) == new_freq) + return NOTIFY_OK; + + for_each_cpu(i, &policy_cpus) { + wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1; + cluster = wrq->cluster; + + for_each_cpu(j, &cluster->cpus) { + struct rq *rq = cpu_rq(j); + + raw_spin_lock_irqsave(&rq->lock, flags); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, + sched_ktime_clock(), 0); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + cluster->cur_freq = new_freq; + cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus); + } + + return NOTIFY_OK; +} + +static struct notifier_block notifier_trans_block = { + .notifier_call = cpufreq_notifier_trans +}; + +static void walt_init_cycle_counter(void) +{ + if (qcom_cpufreq_get_cpu_cycle_counter(smp_processor_id()) != U64_MAX) { + use_cycle_counter = true; + return; + } + + cpufreq_register_notifier(¬ifier_trans_block, + CPUFREQ_TRANSITION_NOTIFIER); +} + +static void transfer_busy_time(struct rq *rq, + struct walt_related_thread_group *grp, + struct task_struct *p, int event); + +/* + * Enable colocation and frequency aggregation for all threads in a process. + * The children inherits the group id from the parent. 
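+ *
+ * Group id 0 is used to remove a task from its group (see
+ * __sched_set_group_id()), and DEFAULT_CGROUP_COLOC_ID is reserved for the
+ * default colocation group driven by the cgroup colocate flag; it cannot be
+ * requested through sched_set_group_id().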
+ */ + +struct walt_related_thread_group + *related_thread_groups[MAX_NUM_CGROUP_COLOC_ID]; +static LIST_HEAD(active_related_thread_groups); +static DEFINE_RWLOCK(related_thread_group_lock); + +static inline +void update_best_cluster(struct walt_related_thread_group *grp, + u64 demand, bool boost) +{ + if (boost) { + /* + * since we are in boost, we can keep grp on min, the boosts + * will ensure tasks get to bigs + */ + grp->skip_min = false; + return; + } + + if (is_suh_max()) + demand = sched_group_upmigrate; + + if (!grp->skip_min) { + if (demand >= sched_group_upmigrate) + grp->skip_min = true; + return; + } + if (demand < sched_group_downmigrate) { + if (!sysctl_sched_coloc_downmigrate_ns) { + grp->skip_min = false; + return; + } + if (!grp->downmigrate_ts) { + grp->downmigrate_ts = grp->last_update; + return; + } + if (grp->last_update - grp->downmigrate_ts > + sysctl_sched_coloc_downmigrate_ns) { + grp->downmigrate_ts = 0; + grp->skip_min = false; + } + } else if (grp->downmigrate_ts) + grp->downmigrate_ts = 0; +} + +static void _set_preferred_cluster(struct walt_related_thread_group *grp) +{ + struct task_struct *p; + u64 combined_demand = 0; + bool group_boost = false; + u64 wallclock; + bool prev_skip_min = grp->skip_min; + struct walt_task_struct *wts; + struct list_head *task_list; + + if (list_empty(&grp->tasks)) { + grp->skip_min = false; + goto out; + } + + if (!hmp_capable()) { + grp->skip_min = false; + goto out; + } + + wallclock = sched_ktime_clock(); + + /* + * wakeup of two or more related tasks could race with each other and + * could result in multiple calls to _set_preferred_cluster being issued + * at same time. Avoid overhead in such cases of rechecking preferred + * cluster + */ + if (wallclock - grp->last_update < sched_ravg_window / 10) + return; + + list_for_each(task_list, &grp->tasks) { + p = (struct task_struct *) task_list; + wts = (struct walt_task_struct *) p->android_vendor_data1; + if (task_boost_policy(p) == SCHED_BOOST_ON_BIG) { + group_boost = true; + break; + } + + if (wts->mark_start < wallclock - + (sched_ravg_window * sched_ravg_hist_size)) + continue; + + combined_demand += wts->coloc_demand; + if (!trace_sched_set_preferred_cluster_enabled()) { + if (combined_demand > sched_group_upmigrate) + break; + } + } + + grp->last_update = wallclock; + update_best_cluster(grp, combined_demand, group_boost); + trace_sched_set_preferred_cluster(grp, combined_demand); + +out: + if (grp->id == DEFAULT_CGROUP_COLOC_ID + && grp->skip_min != prev_skip_min) { + if (grp->skip_min) + grp->start_ts = sched_clock(); + sched_update_hyst_times(); + } +} + +static void set_preferred_cluster(struct walt_related_thread_group *grp) +{ + raw_spin_lock(&grp->lock); + _set_preferred_cluster(grp); + raw_spin_unlock(&grp->lock); +} + +static int update_preferred_cluster(struct walt_related_thread_group *grp, + struct task_struct *p, u32 old_load, bool from_tick) +{ + u32 new_load = task_load(p); + + if (!grp) + return 0; + + if (unlikely(from_tick && is_suh_max())) + return 1; + + /* + * Update if task's load has changed significantly or a complete window + * has passed since we last updated preference + */ + if (abs(new_load - old_load) > sched_ravg_window / 4 || + sched_ktime_clock() - grp->last_update > sched_ravg_window) + return 1; + + return 0; +} + +#define ADD_TASK 0 +#define REM_TASK 1 + +static inline struct walt_related_thread_group* +lookup_related_thread_group(unsigned int group_id) +{ + return related_thread_groups[group_id]; +} + +static int 
alloc_related_thread_groups(void) +{ + int i; + struct walt_related_thread_group *grp; + + /* groupd_id = 0 is invalid as it's special id to remove group. */ + for (i = 1; i < MAX_NUM_CGROUP_COLOC_ID; i++) { + grp = kzalloc(sizeof(*grp), GFP_ATOMIC | GFP_NOWAIT); + BUG_ON(!grp); + + grp->id = i; + INIT_LIST_HEAD(&grp->tasks); + INIT_LIST_HEAD(&grp->list); + raw_spin_lock_init(&grp->lock); + + related_thread_groups[i] = grp; + } + + return 0; +} + +static void remove_task_from_group(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + struct walt_related_thread_group *grp = wts->grp; + struct rq *rq; + int empty_group = 1; + struct rq_flags rf; + + raw_spin_lock(&grp->lock); + + rq = __task_rq_lock(p, &rf); + transfer_busy_time(rq, wts->grp, p, REM_TASK); + list_del_init(&wts->grp_list); + rcu_assign_pointer(wts->grp, NULL); + __task_rq_unlock(rq, &rf); + + if (!list_empty(&grp->tasks)) { + empty_group = 0; + _set_preferred_cluster(grp); + } + + raw_spin_unlock(&grp->lock); + + /* Reserved groups cannot be destroyed */ + if (empty_group && grp->id != DEFAULT_CGROUP_COLOC_ID) + /* + * We test whether grp->list is attached with list_empty() + * hence re-init the list after deletion. + */ + list_del_init(&grp->list); +} + +static int +add_task_to_group(struct task_struct *p, struct walt_related_thread_group *grp) +{ + struct rq *rq; + struct rq_flags rf; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + raw_spin_lock(&grp->lock); + + /* + * Change wts->grp under rq->lock. Will prevent races with read-side + * reference of wts->grp in various hot-paths + */ + rq = __task_rq_lock(p, &rf); + transfer_busy_time(rq, grp, p, ADD_TASK); + list_add(&wts->grp_list, &grp->tasks); + rcu_assign_pointer(wts->grp, grp); + __task_rq_unlock(rq, &rf); + + _set_preferred_cluster(grp); + + raw_spin_unlock(&grp->lock); + + return 0; +} + +#ifdef CONFIG_UCLAMP_TASK_GROUP +static inline bool uclamp_task_colocated(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + struct task_group *tg; + bool colocate; + struct walt_task_group *wtg; + + rcu_read_lock(); + css = task_css(p, cpu_cgrp_id); + if (!css) { + rcu_read_unlock(); + return false; + } + tg = container_of(css, struct task_group, css); + wtg = (struct walt_task_group *) tg->android_vendor_data1; + colocate = wtg->colocate; + rcu_read_unlock(); + + return colocate; +} +#else +static inline bool uclamp_task_colocated(struct task_struct *p) +{ + return false; +} +#endif /* CONFIG_UCLAMP_TASK_GROUP */ + +static void add_new_task_to_grp(struct task_struct *new) +{ + unsigned long flags; + struct walt_related_thread_group *grp; + struct walt_task_struct *wts = (struct walt_task_struct *) new->android_vendor_data1; + + /* + * If the task does not belong to colocated schedtune + * cgroup, nothing to do. We are checking this without + * lock. Even if there is a race, it will be added + * to the co-located cgroup via cgroup attach. + */ + if (!uclamp_task_colocated(new)) + return; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + write_lock_irqsave(&related_thread_group_lock, flags); + + /* + * It's possible that someone already added the new task to the + * group. or it might have taken out from the colocated schedtune + * cgroup. check these conditions under lock. 
+ */ + if (!uclamp_task_colocated(new) || wts->grp) { + write_unlock_irqrestore(&related_thread_group_lock, flags); + return; + } + + raw_spin_lock(&grp->lock); + + rcu_assign_pointer(wts->grp, grp); + list_add(&wts->grp_list, &grp->tasks); + + raw_spin_unlock(&grp->lock); + write_unlock_irqrestore(&related_thread_group_lock, flags); +} + +static int __sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + int rc = 0; + unsigned long flags; + struct walt_related_thread_group *grp = NULL; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (group_id >= MAX_NUM_CGROUP_COLOC_ID) + return -EINVAL; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + write_lock(&related_thread_group_lock); + + /* Switching from one group to another directly is not permitted */ + if ((!wts->grp && !group_id) || (wts->grp && group_id)) + goto done; + + if (!group_id) { + remove_task_from_group(p); + goto done; + } + + grp = lookup_related_thread_group(group_id); + if (list_empty(&grp->list)) + list_add(&grp->list, &active_related_thread_groups); + + rc = add_task_to_group(p, grp); +done: + write_unlock(&related_thread_group_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + + return rc; +} + +int sched_set_group_id(struct task_struct *p, unsigned int group_id) +{ + /* DEFAULT_CGROUP_COLOC_ID is a reserved id */ + if (group_id == DEFAULT_CGROUP_COLOC_ID) + return -EINVAL; + + return __sched_set_group_id(p, group_id); +} + +unsigned int sched_get_group_id(struct task_struct *p) +{ + unsigned int group_id; + struct walt_related_thread_group *grp; + + rcu_read_lock(); + grp = task_related_thread_group(p); + group_id = grp ? grp->id : 0; + rcu_read_unlock(); + + return group_id; +} + +/* + * We create a default colocation group at boot. There is no need to + * synchronize tasks between cgroups at creation time because the + * correct cgroup hierarchy is not available at boot. Therefore cgroup + * colocation is turned off by default even though the colocation group + * itself has been allocated. Furthermore this colocation group cannot + * be destroyted once it has been created. All of this has been as part + * of runtime optimizations. + * + * The job of synchronizing tasks to the colocation group is done when + * the colocation flag in the cgroup is turned on. + */ +static int create_default_coloc_group(void) +{ + struct walt_related_thread_group *grp = NULL; + unsigned long flags; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + write_lock_irqsave(&related_thread_group_lock, flags); + list_add(&grp->list, &active_related_thread_groups); + write_unlock_irqrestore(&related_thread_group_lock, flags); + return 0; +} + +static int sync_cgroup_colocation(struct task_struct *p, bool insert) +{ + unsigned int grp_id = insert ? 
DEFAULT_CGROUP_COLOC_ID : 0; + + return __sched_set_group_id(p, grp_id); +} + +static void android_rvh_cpu_cgroup_attach(void *unused, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + bool colocate; + struct task_group *tg; + struct walt_task_group *wtg; + + cgroup_taskset_first(tset, &css); + if (!css) + return; + + tg = container_of(css, struct task_group, css); + wtg = (struct walt_task_group *) tg->android_vendor_data1; + colocate = wtg->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} + +static bool is_cluster_hosting_top_app(struct walt_sched_cluster *cluster) +{ + struct walt_related_thread_group *grp; + bool grp_on_min; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + + if (!grp) + return false; + + grp_on_min = !grp->skip_min && + (sched_boost_policy() != SCHED_BOOST_ON_BIG); + + return (is_min_capacity_cluster(cluster) == grp_on_min); +} + +static void note_task_waking(struct task_struct *p, u64 wallclock) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + wts->last_wake_ts = wallclock; +} + +/* + * Task's cpu usage is accounted in: + * wrq->curr/prev_runnable_sum, when its ->grp is NULL + * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL + * + * Transfer task's cpu usage between those counters when transitioning between + * groups + */ +static void transfer_busy_time(struct rq *rq, + struct walt_related_thread_group *grp, + struct task_struct *p, int event) +{ + u64 wallclock; + struct group_cpu_time *cpu_time; + u64 *src_curr_runnable_sum, *dst_curr_runnable_sum; + u64 *src_prev_runnable_sum, *dst_prev_runnable_sum; + u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum; + u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum; + int migrate_type; + int cpu = cpu_of(rq); + bool new_task; + int i; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + wallclock = sched_ktime_clock(); + + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0); + new_task = is_new_task(p); + + cpu_time = &wrq->grp_time; + if (event == ADD_TASK) { + migrate_type = RQ_TO_GROUP; + + src_curr_runnable_sum = &wrq->curr_runnable_sum; + dst_curr_runnable_sum = &cpu_time->curr_runnable_sum; + src_prev_runnable_sum = &wrq->prev_runnable_sum; + dst_prev_runnable_sum = &cpu_time->prev_runnable_sum; + + src_nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + + if (*src_curr_runnable_sum < wts->curr_window_cpu[cpu]) { + printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_crs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, *src_curr_runnable_sum, + wts->curr_window_cpu[cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_curr_runnable_sum -= wts->curr_window_cpu[cpu]; + + if (*src_prev_runnable_sum < wts->prev_window_cpu[cpu]) { + printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_prs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, *src_prev_runnable_sum, + wts->prev_window_cpu[cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_prev_runnable_sum -= wts->prev_window_cpu[cpu]; + + if (new_task) { + if (*src_nt_curr_runnable_sum < + 
wts->curr_window_cpu[cpu]) { + printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_crs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, + *src_nt_curr_runnable_sum, + wts->curr_window_cpu[cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_nt_curr_runnable_sum -= + wts->curr_window_cpu[cpu]; + + if (*src_nt_prev_runnable_sum < + wts->prev_window_cpu[cpu]) { + printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_prs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, + *src_nt_prev_runnable_sum, + wts->prev_window_cpu[cpu]); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_nt_prev_runnable_sum -= + wts->prev_window_cpu[cpu]; + } + + update_cluster_load_subtractions(p, cpu, + wrq->window_start, new_task); + + } else { + migrate_type = GROUP_TO_RQ; + + src_curr_runnable_sum = &cpu_time->curr_runnable_sum; + dst_curr_runnable_sum = &wrq->curr_runnable_sum; + src_prev_runnable_sum = &cpu_time->prev_runnable_sum; + dst_prev_runnable_sum = &wrq->prev_runnable_sum; + + src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum; + dst_nt_curr_runnable_sum = &wrq->nt_curr_runnable_sum; + src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum; + dst_nt_prev_runnable_sum = &wrq->nt_prev_runnable_sum; + + if (*src_curr_runnable_sum < wts->curr_window) { + printk_deferred("WALT-UG pid=%u CPU=%d event=%d src_crs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, *src_curr_runnable_sum, + wts->curr_window); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_curr_runnable_sum -= wts->curr_window; + + if (*src_prev_runnable_sum < wts->prev_window) { + printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_prs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, *src_prev_runnable_sum, + wts->prev_window); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_prev_runnable_sum -= wts->prev_window; + + if (new_task) { + if (*src_nt_curr_runnable_sum < wts->curr_window) { + printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_crs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, + *src_nt_curr_runnable_sum, + wts->curr_window); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_nt_curr_runnable_sum -= wts->curr_window; + + if (*src_nt_prev_runnable_sum < wts->prev_window) { + printk_deferred("WALT-BUG pid=%u CPU=%d event=%d src_nt_prs=%llu is lesser than task_contrib=%llu", + p->pid, cpu, event, + *src_nt_prev_runnable_sum, + wts->prev_window); + walt_task_dump(p); + SCHED_BUG_ON(1); + } + *src_nt_prev_runnable_sum -= wts->prev_window; + } + + /* + * Need to reset curr/prev windows for all CPUs, not just the + * ones in the same cluster. Since inter cluster migrations + * did not result in the appropriate book keeping, the values + * per CPU would be inaccurate. + */ + for_each_possible_cpu(i) { + wts->curr_window_cpu[i] = 0; + wts->prev_window_cpu[i] = 0; + } + } + + *dst_curr_runnable_sum += wts->curr_window; + *dst_prev_runnable_sum += wts->prev_window; + if (new_task) { + *dst_nt_curr_runnable_sum += wts->curr_window; + *dst_nt_prev_runnable_sum += wts->prev_window; + } + + /* + * When a task enter or exits a group, it's curr and prev windows are + * moved to a single CPU. This behavior might be sub-optimal in the + * exit case, however, it saves us the overhead of handling inter + * cluster migration fixups while the task is part of a related group. 
+ */ + wts->curr_window_cpu[cpu] = wts->curr_window; + wts->prev_window_cpu[cpu] = wts->prev_window; + + trace_sched_migration_update_sum(p, migrate_type, rq); +} + +bool is_rtgb_active(void) +{ + struct walt_related_thread_group *grp; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + return grp && grp->skip_min; +} + +u64 get_rtgb_active_time(void) +{ + struct walt_related_thread_group *grp; + u64 now = sched_clock(); + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + + if (grp && grp->skip_min && grp->start_ts) + return now - grp->start_ts; + + return 0; +} + +static void walt_init_window_dep(void); +static void walt_tunables_fixup(void) +{ + if (likely(num_sched_clusters > 0)) + walt_update_group_thresholds(); + walt_init_window_dep(); +} + +static void walt_update_irqload(struct rq *rq) +{ + u64 irq_delta = 0; + unsigned int nr_windows = 0; + u64 cur_irq_time; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + u64 last_irq_window = READ_ONCE(wrq->last_irq_window); + + if (wrq->window_start > last_irq_window) + nr_windows = div64_u64(wrq->window_start - last_irq_window, + sched_ravg_window); + + /* Decay CPU's irqload by 3/4 for each window. */ + if (nr_windows < 10) + wrq->avg_irqload = mult_frac(wrq->avg_irqload, 3, 4); + else + wrq->avg_irqload = 0; + + cur_irq_time = irq_time_read(cpu_of(rq)); + if (cur_irq_time > wrq->prev_irq_time) + irq_delta = cur_irq_time - wrq->prev_irq_time; + + wrq->avg_irqload += irq_delta; + wrq->prev_irq_time = cur_irq_time; + + if (nr_windows < SCHED_HIGH_IRQ_TIMEOUT) + wrq->high_irqload = (wrq->avg_irqload >= + walt_cpu_high_irqload); + else + wrq->high_irqload = false; +} + +/* + * Runs in hard-irq context. This should ideally run just after the latest + * window roll-over. + */ +static void walt_irq_work(struct irq_work *irq_work) +{ + struct walt_sched_cluster *cluster; + struct rq *rq; + int cpu; + u64 wc; + bool is_migration = false, is_asym_migration = false; + u64 total_grp_load = 0, min_cluster_grp_load = 0; + int level = 0; + unsigned long flags; + struct walt_rq *wrq; + + /* Am I the window rollover work or the migration work? 
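+ * walt_irq_work() is registered for both walt_cpufreq_irq_work and
+ * walt_migration_irq_work, so the two are told apart by the irq_work
+ * pointer.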
*/ + if (irq_work == &walt_migration_irq_work) + is_migration = true; + + for_each_cpu(cpu, cpu_possible_mask) { + if (level == 0) + raw_spin_lock(&cpu_rq(cpu)->lock); + else + raw_spin_lock_nested(&cpu_rq(cpu)->lock, level); + level++; + } + + wc = sched_ktime_clock(); + walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws); + for_each_sched_cluster(cluster) { + u64 aggr_grp_load = 0; + + raw_spin_lock(&cluster->load_lock); + + for_each_cpu(cpu, &cluster->cpus) { + rq = cpu_rq(cpu); + wrq = (struct walt_rq *) rq->android_vendor_data1; + if (rq->curr) { + walt_update_task_ravg(rq->curr, rq, + TASK_UPDATE, wc, 0); + account_load_subtractions(rq); + aggr_grp_load += + wrq->grp_time.prev_runnable_sum; + } + if (is_migration && wrq->notif_pending && + cpumask_test_cpu(cpu, &asym_cap_sibling_cpus)) { + is_asym_migration = true; + wrq->notif_pending = false; + } + } + + cluster->aggr_grp_load = aggr_grp_load; + total_grp_load += aggr_grp_load; + + if (is_min_capacity_cluster(cluster)) + min_cluster_grp_load = aggr_grp_load; + raw_spin_unlock(&cluster->load_lock); + } + + if (total_grp_load) { + if (cpumask_weight(&asym_cap_sibling_cpus)) { + u64 big_grp_load = + total_grp_load - min_cluster_grp_load; + + for_each_cpu(cpu, &asym_cap_sibling_cpus) + cpu_cluster(cpu)->aggr_grp_load = big_grp_load; + } + rtgb_active = is_rtgb_active(); + } else { + rtgb_active = false; + } + + if (!is_migration && sysctl_sched_user_hint && time_after(jiffies, + sched_user_hint_reset_time)) + sysctl_sched_user_hint = 0; + + for_each_sched_cluster(cluster) { + cpumask_t cluster_online_cpus; + unsigned int num_cpus, i = 1; + + cpumask_and(&cluster_online_cpus, &cluster->cpus, + cpu_online_mask); + num_cpus = cpumask_weight(&cluster_online_cpus); + for_each_cpu(cpu, &cluster_online_cpus) { + int wflag = 0; + + /* + * FIXME: + * + * For now supporting both schedutil and waltgov. + * This is not by design but for convenience. + */ + rq = cpu_rq(cpu); + wrq = (struct walt_rq *) rq->android_vendor_data1; + + if (is_migration) { + if (wrq->notif_pending) { + wrq->notif_pending = false; + + wflag |= WALT_CPUFREQ_IC_MIGRATION; + } + } else { + wflag |= WALT_CPUFREQ_ROLLOVER; + } + + if (is_asym_migration && cpumask_test_cpu(cpu, + &asym_cap_sibling_cpus)) { + wflag |= WALT_CPUFREQ_IC_MIGRATION; + } + + if (i == num_cpus) + waltgov_run_callback(cpu_rq(cpu), wflag); + else + waltgov_run_callback(cpu_rq(cpu), wflag | + WALT_CPUFREQ_CONTINUE); + i++; + + if (!is_migration) + walt_update_irqload(rq); + } + } + + /* + * If the window change request is in pending, good place to + * change sched_ravg_window since all rq locks are acquired. + * + * If the current window roll over is delayed such that the + * mark_start (current wallclock with which roll over is done) + * of the current task went past the window start with the + * updated new window size, delay the update to the next + * window roll over. Otherwise the CPU counters (prs and crs) are + * not rolled over properly as mark_start > window_start. 
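+ * (prs and crs above refer to the per-CPU prev_runnable_sum and
+ * curr_runnable_sum counters.)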
+ */ + if (!is_migration) { + spin_lock_irqsave(&sched_ravg_window_lock, flags); + wrq = (struct walt_rq *) this_rq()->android_vendor_data1; + if ((sched_ravg_window != new_sched_ravg_window) && + (wc < wrq->window_start + new_sched_ravg_window)) { + sched_ravg_window_change_time = sched_ktime_clock(); + trace_sched_ravg_window_change(sched_ravg_window, + new_sched_ravg_window, + sched_ravg_window_change_time); + sched_ravg_window = new_sched_ravg_window; + walt_tunables_fixup(); + } + spin_unlock_irqrestore(&sched_ravg_window_lock, flags); + } + + for_each_cpu(cpu, cpu_possible_mask) + raw_spin_unlock(&cpu_rq(cpu)->lock); + + if (!is_migration) { + wrq = (struct walt_rq *) this_rq()->android_vendor_data1; + core_ctl_check(wrq->window_start); + } +} + +void walt_rotation_checkpoint(int nr_big) +{ + if (!hmp_capable()) + return; + + if (!sysctl_sched_walt_rotate_big_tasks || sched_boost() != NO_BOOST) { + walt_rotation_enabled = 0; + return; + } + + walt_rotation_enabled = nr_big >= num_possible_cpus(); +} + +void walt_fill_ta_data(struct core_ctl_notif_data *data) +{ + struct walt_related_thread_group *grp; + unsigned long flags; + u64 total_demand = 0, wallclock; + struct task_struct *p; + int min_cap_cpu, scale = 1024; + struct walt_sched_cluster *cluster; + int i = 0; + struct walt_task_struct *wts; + struct list_head *task_list; + + grp = lookup_related_thread_group(DEFAULT_CGROUP_COLOC_ID); + + raw_spin_lock_irqsave(&grp->lock, flags); + if (list_empty(&grp->tasks)) { + raw_spin_unlock_irqrestore(&grp->lock, flags); + goto fill_util; + } + + wallclock = sched_ktime_clock(); + + list_for_each(task_list, &grp->tasks) { + p = (struct task_struct *) task_list; + wts = (struct walt_task_struct *) p->android_vendor_data1; + if (wts->mark_start < wallclock - + (sched_ravg_window * sched_ravg_hist_size)) + continue; + + total_demand += wts->coloc_demand; + } + + raw_spin_unlock_irqrestore(&grp->lock, flags); + + /* + * Scale the total demand to the lowest capacity CPU and + * convert into percentage. 
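+ * Both total_demand and sched_ravg_window are in ns, and scale is the
+ * capacity of the lowest capacity CPU out of SCHED_CAPACITY_SCALE (1024).
+ * For example, 5ms of group demand in a 20ms window on a 1024-capacity
+ * CPU gives coloc_load_pct = 25.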
+ * + * P = total_demand/sched_ravg_window * 1024/scale * 100 + */ + + min_cap_cpu = cpumask_first(&cpu_array[0][0]); + if (min_cap_cpu != -1) + scale = arch_scale_cpu_capacity(min_cap_cpu); + + data->coloc_load_pct = div64_u64(total_demand * 1024 * 100, + (u64)sched_ravg_window * scale); + +fill_util: + for_each_sched_cluster(cluster) { + int fcpu = cluster_first_cpu(cluster); + + if (i == MAX_CLUSTERS) + break; + + scale = arch_scale_cpu_capacity(fcpu); + data->ta_util_pct[i] = div64_u64(cluster->aggr_grp_load * 1024 * + 100, (u64)sched_ravg_window * scale); + + scale = arch_scale_freq_capacity(fcpu); + data->cur_cap_pct[i] = (scale * 100)/1024; + i++; + } +} + +static void walt_init_window_dep(void) +{ + walt_cpu_util_freq_divisor = + (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100; + walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT; + + sched_init_task_load_windows = + div64_u64((u64)sysctl_sched_init_task_load_pct * + (u64)sched_ravg_window, 100); + sched_init_task_load_windows_scaled = + scale_demand(sched_init_task_load_windows); + + walt_cpu_high_irqload = div64_u64((u64)sched_ravg_window * 95, (u64) 100); +} + +static void walt_init_once(void) +{ + init_irq_work(&walt_migration_irq_work, walt_irq_work); + init_irq_work(&walt_cpufreq_irq_work, walt_irq_work); + walt_init_window_dep(); +} + +static void walt_sched_init_rq(struct rq *rq) +{ + int j; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + if (cpu_of(rq) == 0) + walt_init_once(); + + cpumask_set_cpu(cpu_of(rq), &wrq->freq_domain_cpumask); + + wrq->walt_stats.cumulative_runnable_avg_scaled = 0; + wrq->prev_window_size = sched_ravg_window; + wrq->window_start = 0; + wrq->walt_stats.nr_big_tasks = 0; + wrq->walt_flags = 0; + wrq->avg_irqload = 0; + wrq->prev_irq_time = 0; + wrq->last_irq_window = 0; + wrq->high_irqload = false; + wrq->task_exec_scale = 1024; + wrq->push_task = NULL; + + /* + * All cpus part of same cluster by default. 
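+ * (they all point at init_cluster until the real cluster list is built;
+ * see walt_update_cluster_topology(), called from walt_init_stop_handler()).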
This avoids the + * need to check for wrq->cluster being non-NULL in hot-paths + * like select_best_cpu() + */ + wrq->cluster = &init_cluster; + wrq->curr_runnable_sum = wrq->prev_runnable_sum = 0; + wrq->nt_curr_runnable_sum = wrq->nt_prev_runnable_sum = 0; + memset(&wrq->grp_time, 0, sizeof(struct group_cpu_time)); + wrq->old_busy_time = 0; + wrq->old_estimated_time = 0; + wrq->walt_stats.pred_demands_sum_scaled = 0; + wrq->walt_stats.nr_rtg_high_prio_tasks = 0; + wrq->ed_task = NULL; + wrq->curr_table = 0; + wrq->prev_top = 0; + wrq->curr_top = 0; + wrq->last_cc_update = 0; + wrq->cycles = 0; + for (j = 0; j < NUM_TRACKED_WINDOWS; j++) { + memset(&wrq->load_subs[j], 0, + sizeof(struct load_subtractions)); + wrq->top_tasks[j] = kcalloc(NUM_LOAD_INDICES, + sizeof(u8), GFP_ATOMIC | GFP_NOWAIT); + /* No other choice */ + BUG_ON(!wrq->top_tasks[j]); + clear_top_tasks_bitmap(wrq->top_tasks_bitmap[j]); + } + wrq->cum_window_demand_scaled = 0; + wrq->notif_pending = false; +} + +void sched_window_nr_ticks_change(void) +{ + unsigned long flags; + + spin_lock_irqsave(&sched_ravg_window_lock, flags); + new_sched_ravg_window = mult_frac(sysctl_sched_ravg_window_nr_ticks, + NSEC_PER_SEC, HZ); + spin_unlock_irqrestore(&sched_ravg_window_lock, flags); +} + +static void +walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + fixup_cumulative_runnable_avg(&wrq->walt_stats, wts->demand_scaled, + wts->pred_demand_scaled); +} + +static void +walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + fixup_cumulative_runnable_avg(&wrq->walt_stats, + -(s64)wts->demand_scaled, + -(s64)wts->pred_demand_scaled); +} + +static void inc_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (wts->misfit) + wrq->walt_stats.nr_big_tasks++; + + wts->rtg_high_prio = task_rtg_high_prio(p); + if (wts->rtg_high_prio) + wrq->walt_stats.nr_rtg_high_prio_tasks++; +} + +static void dec_rq_walt_stats(struct rq *rq, struct task_struct *p) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (wts->misfit) + wrq->walt_stats.nr_big_tasks--; + + if (wts->rtg_high_prio) + wrq->walt_stats.nr_rtg_high_prio_tasks--; + + BUG_ON(wrq->walt_stats.nr_big_tasks < 0); +} + +static void android_rvh_wake_up_new_task(void *unused, struct task_struct *new) +{ + add_new_task_to_grp(new); +} + +/* + * The intention of this hook is to update cpu_capacity_orig as well as + * (*capacity), otherwise we will end up capacity_of() > capacity_orig_of(). + */ +static void android_rvh_update_cpu_capacity(void *unused, int cpu, unsigned long *capacity) +{ + unsigned long max_capacity = arch_scale_cpu_capacity(cpu); + unsigned long thermal_pressure = arch_scale_thermal_pressure(cpu); + unsigned long thermal_cap; + + /* + * thermal_pressure = max_capacity - curr_cap_as_per_thermal. + * so, + * curr_cap_as_per_thermal = max_capacity - thermal_pressure. 
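+ *
+ * For example, with max_capacity = 1024 and thermal capping the CPU at
+ * 75% of its maximum frequency, thermal_pressure is roughly 256 and
+ * thermal_cap comes out to roughly 768.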
+ */ + + thermal_cap = max_capacity - thermal_pressure; + + /* + * TODO: + * Thermal is taken care now. but what about limits via + * cpufreq max. we don't have arch_scale_max_freq_capacity() + * in 5.10 now. + * + * Two options: + * #1 either port that max_frq_cap patch to AOSP + * #2 register for cpufreq policy updates.. + */ + cpu_rq(cpu)->cpu_capacity_orig = min(cpu_rq(cpu)->cpu_capacity_orig, + thermal_cap); + *capacity = cpu_rq(cpu)->cpu_capacity_orig; +} + +static void android_rvh_sched_cpu_starting(void *unused, int cpu) +{ + unsigned long flags; + struct rq *rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); + + clear_walt_request(cpu); +} + +static void android_rvh_sched_cpu_dying(void *unused, int cpu) +{ + clear_walt_request(cpu); +} + +static void android_rvh_set_task_cpu(void *unused, struct task_struct *p, unsigned int new_cpu) +{ + if (new_cpu < 0) + return; + fixup_busy_time(p, (int) new_cpu); +} + +static void android_rvh_sched_fork(void *unused, struct task_struct *p) +{ + init_new_task_load(p); +} + +static void android_rvh_new_task_stats(void *unused, struct task_struct *p) +{ + mark_task_starting(p); +} + +static void android_rvh_account_irq(void *unused, struct task_struct *curr, int cpu, s64 delta) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + if (!!(curr->flags & PF_IDLE)) { + if (hardirq_count() || in_serving_softirq()) + walt_sched_account_irqend(cpu, curr, delta); + else + walt_sched_account_irqstart(cpu, curr); + } + wrq->last_irq_window = wrq->window_start; +} + +static void android_rvh_flush_task(void *unused, struct task_struct *p) +{ + walt_task_dead(p); +} + +static void android_rvh_enqueue_task(void *unused, struct rq *rq, struct task_struct *p, int flags) +{ + u64 wallclock = sched_ktime_clock(); + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + wts->last_enqueued_ts = wallclock; + sched_update_nr_prod(rq->cpu, true); + + if (fair_policy(p->policy)) { + wts->misfit = !task_fits_max(p, rq->cpu); + inc_rq_walt_stats(rq, p); + } + + walt_inc_cumulative_runnable_avg(rq, p); + trace_sched_enq_deq_task(p, 1, cpumask_bits(&p->cpus_mask)[0]); +} + +static void android_rvh_dequeue_task(void *unused, struct rq *rq, struct task_struct *p, int flags) +{ + /* + * TODO: remove later. + * We don't have to check if p is ed task and clear it. the below + * code calls is_ed_task_present() which clears the rq's ed_task + * unconditionally. 
+ */ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + if (p == wrq->ed_task) + is_ed_task_present(rq, sched_ktime_clock()); + + sched_update_nr_prod(rq->cpu, false); + + if (fair_policy(p->policy)) + dec_rq_walt_stats(rq, p); + + walt_dec_cumulative_runnable_avg(rq, p); + trace_sched_enq_deq_task(p, 0, cpumask_bits(&p->cpus_mask)[0]); +} + +static void android_rvh_update_misfit_status(void *unused, struct task_struct *p, + struct rq *rq, bool *need_update) +{ + struct walt_task_struct *wts; + struct walt_rq *wrq; + bool old_misfit, misfit; + int change; + + *need_update = false; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + wrq = (struct walt_rq *) rq->android_vendor_data1; + wts = (struct walt_task_struct *) p->android_vendor_data1; + old_misfit = wts->misfit; + + if (task_fits_capacity(p, capacity_orig_of(cpu_of(rq)), rq->cpu)) + rq->misfit_task_load = 0; + else + rq->misfit_task_load = task_load(p); + + misfit = rq->misfit_task_load; + + change = misfit - old_misfit; + if (change) { + sched_update_nr_prod(rq->cpu, true); + wts->misfit = misfit; + wrq->walt_stats.nr_big_tasks += change; + BUG_ON(wrq->walt_stats.nr_big_tasks < 0); + } +} + +/* utility function to update walt signals at wakeup */ +static void android_rvh_try_to_wake_up(void *unused, struct task_struct *p) +{ + struct rq *rq = cpu_rq(task_cpu(p)); + struct rq_flags rf; + u64 wallclock; + unsigned int old_load; + struct walt_related_thread_group *grp = NULL; + + rq_lock_irqsave(rq, &rf); + old_load = task_load(p); + wallclock = sched_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + note_task_waking(p, wallclock); + rq_unlock_irqrestore(rq, &rf); + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (update_preferred_cluster(grp, p, old_load, false)) + set_preferred_cluster(grp); + rcu_read_unlock(); +} + +static void android_rvh_try_to_wake_up_success(void *unused, struct task_struct *p) +{ + unsigned long flags; + int cpu = p->cpu; + + if (!sched_predl) + return; + + raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags); + if (do_pl_notif(cpu_rq(cpu))) + waltgov_run_callback(cpu_rq(cpu), WALT_CPUFREQ_PL); + raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags); +} + +static void android_rvh_tick_entry(void *unused, struct rq *rq) +{ + u64 wallclock; + u32 old_load; + struct walt_related_thread_group *grp; + + set_window_start(rq); + wallclock = sched_ktime_clock(); + + old_load = task_load(rq->curr); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + + rcu_read_lock(); + grp = task_related_thread_group(rq->curr); + if (update_preferred_cluster(grp, rq->curr, old_load, true)) + set_preferred_cluster(grp); + rcu_read_unlock(); + + if (is_ed_task_present(rq, wallclock)) + waltgov_run_callback(rq, WALT_CPUFREQ_EARLY_DET); + + /* TODO + * currently load balancer registered for a post-hook which + * takes care of rotation and migration for misfit tasks. + * + * See if that can also be done here. 
+ */ +} + +static void android_rvh_schedule(void *unused, struct task_struct *prev, + struct task_struct *next, struct rq *rq) +{ + u64 wallclock = sched_ktime_clock(); + struct walt_task_struct *wts = (struct walt_task_struct *) prev->android_vendor_data1; + + if (likely(prev != next)) { + if (!prev->on_rq) + wts->last_sleep_ts = wallclock; + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); + } else { + walt_update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); + } +} + +static void android_rvh_resume_cpus(void *unused, struct cpumask *resuming_cpus, int *err) +{ + int i; + struct rq *rq; + unsigned long flags; + + /* + * send a reschedule event on all resumed CPUs + * which trigger newly idle load balance. + */ + for_each_cpu(i, resuming_cpus) { + rq = cpu_rq(i); + raw_spin_lock_irqsave(&rq->lock, flags); + resched_curr(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + + *err = 0; +} + +static void android_rvh_update_cpus_allowed(void *unused, struct task_struct *p, + cpumask_var_t cpus_requested, + const struct cpumask *new_mask, int *ret) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (cpumask_subset(&wts->cpus_requested, cpus_requested)) + *ret = set_cpus_allowed_ptr(p, &wts->cpus_requested); +} + +static void android_rvh_sched_fork_init(void *unused, struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + wts->last_sleep_ts = 0; + wts->wake_up_idle = false; + wts->boost = 0; + wts->boost_expires = 0; + wts->boost_period = false; + wts->low_latency = false; +} + +static void android_rvh_ttwu_cond(void *unused, bool *cond) +{ + *cond = sysctl_sched_many_wakeup_threshold < WALT_MANY_WAKEUP_DEFAULT; +} + +static void android_rvh_sched_exec(void *unused, bool *cond) +{ + *cond = true; +} + +static void android_rvh_build_perf_domains(void *unused, bool *eas_check) +{ + *eas_check = true; +} + +static void register_walt_hooks(void) +{ + register_trace_android_rvh_wake_up_new_task(android_rvh_wake_up_new_task, NULL); + register_trace_android_rvh_update_cpu_capacity(android_rvh_update_cpu_capacity, NULL); + register_trace_android_rvh_sched_cpu_starting(android_rvh_sched_cpu_starting, NULL); + register_trace_android_rvh_sched_cpu_dying(android_rvh_sched_cpu_dying, NULL); + register_trace_android_rvh_set_task_cpu(android_rvh_set_task_cpu, NULL); + register_trace_android_rvh_new_task_stats(android_rvh_new_task_stats, NULL); + register_trace_android_rvh_sched_fork(android_rvh_sched_fork, NULL); + register_trace_android_rvh_account_irq(android_rvh_account_irq, NULL); + register_trace_android_rvh_flush_task(android_rvh_flush_task, NULL); + register_trace_android_rvh_update_misfit_status(android_rvh_update_misfit_status, NULL); + register_trace_android_rvh_enqueue_task(android_rvh_enqueue_task, NULL); + register_trace_android_rvh_dequeue_task(android_rvh_dequeue_task, NULL); + register_trace_android_rvh_try_to_wake_up(android_rvh_try_to_wake_up, NULL); + register_trace_android_rvh_try_to_wake_up_success(android_rvh_try_to_wake_up_success, NULL); + register_trace_android_rvh_tick_entry(android_rvh_tick_entry, NULL); + register_trace_android_rvh_schedule(android_rvh_schedule, NULL); + register_trace_android_rvh_resume_cpus(android_rvh_resume_cpus, NULL); + register_trace_android_vh_show_max_freq(android_vh_show_max_freq, NULL); + register_trace_android_rvh_cpu_cgroup_attach(android_rvh_cpu_cgroup_attach, 
NULL); + register_trace_android_rvh_update_cpus_allowed(android_rvh_update_cpus_allowed, NULL); + register_trace_android_rvh_sched_fork_init(android_rvh_sched_fork_init, NULL); + register_trace_android_rvh_ttwu_cond(android_rvh_ttwu_cond, NULL); + register_trace_android_rvh_sched_exec(android_rvh_sched_exec, NULL); + register_trace_android_rvh_build_perf_domains(android_rvh_build_perf_domains, NULL); +} + +atomic64_t walt_irq_work_lastq_ws; + +static int walt_init_stop_handler(void *data) +{ + int cpu; + struct task_struct *g, *p; + u64 window_start_ns, nr_windows; + struct walt_rq *wrq; + + read_lock(&tasklist_lock); + for_each_possible_cpu(cpu) { + raw_spin_lock(&cpu_rq(cpu)->lock); + } + + do_each_thread(g, p) { + init_existing_task_load(p); + } while_each_thread(g, p); + + window_start_ns = ktime_get_ns(); + nr_windows = div64_u64(window_start_ns, sched_ravg_window); + window_start_ns = (u64)nr_windows * (u64)sched_ravg_window; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + /* Create task members for idle thread */ + init_new_task_load(rq->idle); + + walt_sched_init_rq(rq); + + wrq = (struct walt_rq *) rq->android_vendor_data1; + wrq->window_start = window_start_ns; + } + + atomic64_set(&walt_irq_work_lastq_ws, window_start_ns); + + register_walt_hooks(); + walt_lb_init(); + walt_rt_init(); + walt_cfs_init(); + create_default_coloc_group(); + + walt_update_cluster_topology(); + + for_each_possible_cpu(cpu) { + raw_spin_unlock(&cpu_rq(cpu)->lock); + } + read_unlock(&tasklist_lock); + + return 0; +} + +static int walt_module_init(void) +{ + struct ctl_table_header *hdr; + int i; + + walt_tunables(); + + sched_init_ops(); + BUG_ON(alloc_related_thread_groups()); + walt_init_cycle_counter(); + init_clusters(); + stop_machine(walt_init_stop_handler, NULL, NULL); + + hdr = register_sysctl_table(walt_base_table); + kmemleak_not_leak(hdr); + + input_boost_init(); + core_ctl_init(); + waltgov_register(); + + i = match_string(sched_feat_names, __SCHED_FEAT_NR, "TTWU_QUEUE"); + static_key_disable_cpuslocked(&sched_feat_keys[i]); + sysctl_sched_features &= ~(1UL << i); + + return 0; +} + +module_init(walt_module_init); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/sched/walt/walt.h b/kernel/sched/walt/walt.h new file mode 100644 index 000000000000..0315be82852a --- /dev/null +++ b/kernel/sched/walt/walt.h @@ -0,0 +1,1006 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved. + */ + +#ifndef _WALT_H +#define _WALT_H + +#include "../../../kernel/sched/sched.h" +#include "../../../fs/proc/internal.h" +#include +#include + +#ifdef CONFIG_HZ_300 +/* + * Tick interval becomes to 3333333 due to + * rounding error when HZ=300. 
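+ * One tick is 1e9 / 300 = 3333333.33... ns, so the 5-tick window below
+ * works out to roughly 16.67ms rather than the even 16ms default used
+ * for other HZ values.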
+ */ +#define DEFAULT_SCHED_RAVG_WINDOW (3333333 * 5) +#else +/* Min window size (in ns) = 16ms */ +#define DEFAULT_SCHED_RAVG_WINDOW 16000000 +#endif + +/* Max window size (in ns) = 1s */ +#define MAX_SCHED_RAVG_WINDOW 1000000000 + +#define NR_WINDOWS_PER_SEC (NSEC_PER_SEC / DEFAULT_SCHED_RAVG_WINDOW) + +#define SCHED_CPUFREQ_MIGRATION (1U << 1) +#define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3) +#define SCHED_CPUFREQ_WALT (1U << 4) +#define SCHED_CPUFREQ_PL (1U << 5) +#define SCHED_CPUFREQ_EARLY_DET (1U << 6) +#define SCHED_CPUFREQ_CONTINUE (1U << 8) + +#define MAX_CLUSTERS 3 +/* MAX_MARGIN_LEVELS should be one less than MAX_CLUSTERS */ +#define MAX_MARGIN_LEVELS (MAX_CLUSTERS - 1) + +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + +/* Note: this need to be in sync with migrate_type_names array */ +enum migrate_types { + GROUP_TO_RQ, + RQ_TO_GROUP, +}; + +enum task_boost_type { + TASK_BOOST_NONE = 0, + TASK_BOOST_ON_MID, + TASK_BOOST_ON_MAX, + TASK_BOOST_STRICT_MAX, + TASK_BOOST_END, +}; + +#define WALT_NR_CPUS 8 +#define RAVG_HIST_SIZE_MAX 5 +#define NUM_BUSY_BUCKETS 10 + +struct walt_task_struct { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
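+ *
+ * 'coloc_demand' is the demand variant that is summed up when a related
+ * thread group's combined load is computed in _set_preferred_cluster()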
+ * + * 'curr_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the current window + * + * 'prev_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the previous window + * + * 'curr_window' represents the sum of all entries in curr_window_cpu + * + * 'prev_window' represents the sum of all entries in prev_window_cpu + * + * 'pred_demand' represents task's current predicted cpu busy time + * + * 'busy_buckets' groups historical busy time into different buckets + * used for prediction + * + * 'demand_scaled' represents task's demand scaled to 1024 + */ + u64 mark_start; + u32 sum, demand; + u32 coloc_demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 curr_window_cpu[WALT_NR_CPUS]; + u32 prev_window_cpu[WALT_NR_CPUS]; + u32 curr_window, prev_window; + u32 pred_demand; + u8 busy_buckets[NUM_BUSY_BUCKETS]; + u16 demand_scaled; + u16 pred_demand_scaled; + u64 active_time; + u64 last_win_size; + int boost; + bool wake_up_idle; + bool misfit; + bool rtg_high_prio; + bool low_latency; + u64 boost_period; + u64 boost_expires; + u64 last_sleep_ts; + u32 init_load_pct; + u32 unfilter; + u64 last_wake_ts; + u64 last_enqueued_ts; + struct walt_related_thread_group __rcu *grp; + struct list_head grp_list; + u64 cpu_cycles; + cpumask_t cpus_requested; +}; + +/*End linux/sched.h port */ +/*SCHED.H PORT*/ +extern __read_mostly bool sched_predl; + +struct walt_cpu_load { + unsigned long nl; + unsigned long pl; + bool rtgb_active; + u64 ws; +}; + +#define DECLARE_BITMAP_ARRAY(name, nr, bits) \ + unsigned long name[nr][BITS_TO_LONGS(bits)] + +struct walt_sched_stats { + int nr_big_tasks; + u64 cumulative_runnable_avg_scaled; + u64 pred_demands_sum_scaled; + unsigned int nr_rtg_high_prio_tasks; +}; + +#define NUM_TRACKED_WINDOWS 2 +#define NUM_LOAD_INDICES 1000 + +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +struct walt_rq { + struct task_struct *push_task; + struct walt_sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct walt_sched_stats walt_stats; + + u64 window_start; + u32 prev_window_size; + unsigned long walt_flags; + + u64 avg_irqload; + u64 last_irq_window; + u64 prev_irq_time; + struct task_struct *ed_task; + u64 task_exec_scale; + u64 old_busy_time; + u64 old_estimated_time; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cum_window_demand_scaled; + struct group_cpu_time grp_time; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; + DECLARE_BITMAP_ARRAY(top_tasks_bitmap, + NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES); + u8 *top_tasks[NUM_TRACKED_WINDOWS]; + u8 curr_table; + int prev_top; + int curr_top; + bool notif_pending; + bool high_irqload; + u64 last_cc_update; + u64 cycles; +}; + +struct walt_sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + /* + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq; + unsigned int max_possible_freq; + u64 aggr_grp_load; +}; + +struct walt_related_thread_group { + int id; + raw_spinlock_t lock; + struct list_head tasks; + struct list_head list; + bool skip_min; + struct rcu_head rcu; + u64 last_update; + u64 downmigrate_ts; + u64 start_ts; +}; + +extern struct walt_sched_cluster *sched_cluster[WALT_NR_CPUS]; + +extern struct walt_sched_cluster 
*rq_cluster(struct rq *rq); + +/*END SCHED.H PORT*/ + +extern int num_sched_clusters; +extern unsigned int sched_capacity_margin_up[WALT_NR_CPUS]; +extern unsigned int sched_capacity_margin_down[WALT_NR_CPUS]; +extern cpumask_t asym_cap_sibling_cpus; +extern cpumask_t __read_mostly **cpu_array; + +extern void sched_update_nr_prod(int cpu, bool enq); +extern unsigned int walt_big_tasks(int cpu); +extern void walt_rotate_work_init(void); +extern void walt_rotation_checkpoint(int nr_big); +extern void walt_fill_ta_data(struct core_ctl_notif_data *data); +extern int sched_set_group_id(struct task_struct *p, unsigned int group_id); +extern unsigned int sched_get_group_id(struct task_struct *p); +extern int sched_set_init_task_load(struct task_struct *p, int init_load_pct); +extern u32 sched_get_init_task_load(struct task_struct *p); +extern void core_ctl_check(u64 wallclock); +extern int sched_set_boost(int enable); +extern int sched_pause_count(const cpumask_t *mask, bool include_offline); +extern void sched_pause_pending(int cpu); +extern void sched_unpause_pending(int cpu); +extern int sched_wake_up_idle_show(struct seq_file *m, void *v); +extern ssize_t sched_wake_up_idle_write(struct file *file, + const char __user *buf, size_t count, loff_t *offset); +extern int sched_wake_up_idle_open(struct inode *inode, struct file *filp); +extern int sched_init_task_load_show(struct seq_file *m, void *v); +extern ssize_t sched_init_task_load_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset); +extern int sched_init_task_load_open(struct inode *inode, struct file *filp); +extern int sched_group_id_show(struct seq_file *m, void *v); +extern ssize_t sched_group_id_write(struct file *file, const char __user *buf, + size_t count, loff_t *offset); +extern int sched_group_id_open(struct inode *inode, struct file *filp); +extern int sched_pause_cpus(struct cpumask *pause_cpus); +extern int sched_unpause_cpus(struct cpumask *unpause_cpus); + +extern int core_ctl_set_boost(bool boost); +extern void core_ctl_notifier_register(struct notifier_block *n); +extern void core_ctl_notifier_unregister(struct notifier_block *n); +extern unsigned int sched_get_cpu_util(int cpu); +extern void sched_update_hyst_times(void); +extern u64 sched_lpm_disallowed_time(int cpu); +extern int +sched_updown_migrate_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern int sched_busy_hyst_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +extern u64 sched_ktime_clock(void); +extern void clear_walt_request(int cpu); +extern void walt_init_tg(struct task_group *tg); +extern void walt_init_topapp_tg(struct task_group *tg); +extern void walt_init_foreground_tg(struct task_group *tg); +extern int register_walt_callback(void); +extern void set_cpu_array(void); +extern int sched_init_ops(void); +extern int core_ctl_init(void); +extern void acquire_rq_locks_irqsave(const cpumask_t *cpus, + unsigned long *flags); +extern void release_rq_locks_irqrestore(const cpumask_t *cpus, + unsigned long *flags); +extern struct list_head cluster_head; +extern int set_sched_ravg_window(char *str); +extern int set_sched_predl(char *str); +extern int input_boost_init(void); +extern int core_ctl_init(void); + +extern atomic64_t walt_irq_work_lastq_ws; +extern unsigned int __read_mostly sched_ravg_window; +extern 
unsigned int min_max_possible_capacity; +extern unsigned int max_possible_capacity; +extern unsigned int __read_mostly sched_init_task_load_windows; +extern unsigned int __read_mostly sched_load_granule; + +/* 1ms default for 20ms window size scaled to 1024 */ +extern unsigned int sysctl_sched_min_task_util_for_boost; +/* 0.68ms default for 20ms window size scaled to 1024 */ +extern unsigned int sysctl_sched_min_task_util_for_colocation; +extern unsigned int sysctl_sched_busy_hyst_enable_cpus; +extern unsigned int sysctl_sched_busy_hyst; +extern unsigned int sysctl_sched_coloc_busy_hyst_enable_cpus; +extern unsigned int sysctl_sched_coloc_busy_hyst_cpu[WALT_NR_CPUS]; +extern unsigned int sysctl_sched_coloc_busy_hyst_max_ms; +extern unsigned int sysctl_sched_coloc_busy_hyst_cpu_busy_pct[WALT_NR_CPUS]; +extern unsigned int sysctl_sched_boost; /* To/from userspace */ +extern unsigned int sysctl_sched_capacity_margin_up[MAX_MARGIN_LEVELS]; +extern unsigned int sysctl_sched_capacity_margin_down[MAX_MARGIN_LEVELS]; +extern unsigned int sched_boost_type; /* currently activated sched boost */ +extern enum sched_boost_policy boost_policy; +extern unsigned int sysctl_input_boost_ms; +extern unsigned int sysctl_input_boost_freq[8]; +extern unsigned int sysctl_sched_boost_on_input; +extern unsigned int sysctl_sched_load_boost[WALT_NR_CPUS]; +extern unsigned int sysctl_sched_user_hint; +extern unsigned int sysctl_sched_conservative_pl; +#define WALT_MANY_WAKEUP_DEFAULT 1000 +extern unsigned int sysctl_sched_many_wakeup_threshold; +extern unsigned int sysctl_walt_rtg_cfs_boost_prio; +extern __read_mostly unsigned int sysctl_sched_force_lb_enable; +extern const int sched_user_hint_max; +extern unsigned int sysctl_sched_prefer_spread; + +#define for_each_sched_cluster(cluster) \ + list_for_each_entry_rcu(cluster, &cluster_head, list) + +static inline u32 cpu_cycles_to_freq(u64 cycles, u64 period) +{ + return div64_u64(cycles, period); +} + +static inline unsigned int sched_cpu_legacy_freq(int cpu) +{ + unsigned long curr_cap = arch_scale_freq_capacity(cpu); + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + return (curr_cap * (u64) wrq->cluster->max_possible_freq) >> + SCHED_CAPACITY_SHIFT; +} + +extern __read_mostly bool sched_freq_aggr_en; +static inline void walt_enable_frequency_aggregation(bool enable) +{ + sched_freq_aggr_en = enable; +} + +#ifndef CONFIG_IRQ_TIME_ACCOUNTING +static inline u64 irq_time_read(int cpu) { return 0; } +#endif + +/*Sysctl related interface*/ +#define WINDOW_STATS_RECENT 0 +#define WINDOW_STATS_MAX 1 +#define WINDOW_STATS_MAX_RECENT_AVG 2 +#define WINDOW_STATS_AVG 3 +#define WINDOW_STATS_INVALID_POLICY 4 + +extern unsigned int __read_mostly sysctl_sched_coloc_downmigrate_ns; +extern unsigned int __read_mostly sysctl_sched_group_downmigrate_pct; +extern unsigned int __read_mostly sysctl_sched_group_upmigrate_pct; +extern unsigned int __read_mostly sysctl_sched_window_stats_policy; +extern unsigned int sysctl_sched_ravg_window_nr_ticks; +extern unsigned int sysctl_sched_dynamic_ravg_window_enable; +extern unsigned int sysctl_sched_walt_rotate_big_tasks; +extern unsigned int sysctl_sched_task_unfilter_period; +extern unsigned int __read_mostly sysctl_sched_asym_cap_sibling_freq_match_pct; +extern unsigned int sysctl_walt_low_latency_task_threshold; /* disabled by default */ +extern unsigned int sysctl_task_read_pid; +extern struct ctl_table walt_table[]; +extern struct ctl_table walt_base_table[]; +extern void walt_tunables(void); +extern void 
walt_update_group_thresholds(void); +extern void sched_window_nr_ticks_change(void); +extern unsigned long sched_user_hint_reset_time; +extern struct irq_work walt_migration_irq_work; +extern __read_mostly unsigned int new_sched_ravg_window; +extern struct task_group *task_group_topapp; +extern struct task_group *task_group_foreground; + +#define LIB_PATH_LENGTH 512 +extern unsigned int cpuinfo_max_freq_cached; +extern char sched_lib_name[LIB_PATH_LENGTH]; +extern unsigned int sched_lib_mask_force; +extern bool is_sched_lib_based_app(pid_t pid); +void android_vh_show_max_freq(void *unused, struct cpufreq_policy *policy, + unsigned int *max_freq); + +/* WALT cpufreq interface */ +#define WALT_CPUFREQ_ROLLOVER (1U << 0) +#define WALT_CPUFREQ_CONTINUE (1U << 1) +#define WALT_CPUFREQ_IC_MIGRATION (1U << 2) +#define WALT_CPUFREQ_PL (1U << 3) +#define WALT_CPUFREQ_EARLY_DET (1U << 4) + +#define NO_BOOST 0 +#define FULL_THROTTLE_BOOST 1 +#define CONSERVATIVE_BOOST 2 +#define RESTRAINED_BOOST 3 +#define FULL_THROTTLE_BOOST_DISABLE -1 +#define CONSERVATIVE_BOOST_DISABLE -2 +#define RESTRAINED_BOOST_DISABLE -3 +#define MAX_NUM_BOOST_TYPE (RESTRAINED_BOOST+1) + +enum sched_boost_policy { + SCHED_BOOST_NONE, + SCHED_BOOST_ON_BIG, + SCHED_BOOST_ON_ALL, +}; + +struct walt_task_group { + /* + * Controls whether tasks of this cgroup should be colocated with each + * other and tasks of other cgroups that have the same flag turned on. + */ + bool colocate; + /* + * array indicating whether this task group participates in the + * particular boost type + */ + bool sched_boost_enable[MAX_NUM_BOOST_TYPE]; +}; + +struct sched_avg_stats { + int nr; + int nr_misfit; + int nr_max; + int nr_scaled; +}; + +struct waltgov_callback { + void (*func)(struct waltgov_callback *cb, u64 time, unsigned int flags); +}; + +DECLARE_PER_CPU(struct waltgov_callback *, waltgov_cb_data); + +static inline void waltgov_add_callback(int cpu, struct waltgov_callback *cb, + void (*func)(struct waltgov_callback *cb, u64 time, + unsigned int flags)) +{ + if (WARN_ON(!cb || !func)) + return; + + if (WARN_ON(per_cpu(waltgov_cb_data, cpu))) + return; + + cb->func = func; + rcu_assign_pointer(per_cpu(waltgov_cb_data, cpu), cb); +} + +static inline void waltgov_remove_callback(int cpu) +{ + rcu_assign_pointer(per_cpu(waltgov_cb_data, cpu), NULL); +} + +static inline void waltgov_run_callback(struct rq *rq, unsigned int flags) +{ + struct waltgov_callback *cb; + + cb = rcu_dereference_sched(*per_cpu_ptr(&waltgov_cb_data, cpu_of(rq))); + if (cb) + cb->func(cb, sched_ktime_clock(), flags); +} + +extern unsigned long cpu_util_freq_walt(int cpu, struct walt_cpu_load *walt_load); +int waltgov_register(void); + +extern void walt_lb_init(void); +extern unsigned int walt_rotation_enabled; + +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. 
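+ * For example, a CPU whose cpu_capacity_orig is 1024 but which currently
+ * runs at half of its maximum frequency reports a capacity of about 512.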
+ */ +static inline unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + + return cap_scale(max_cap, scale_freq); +} + +static inline unsigned long task_util(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->demand_scaled; +} + +static inline unsigned long cpu_util(int cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + u64 walt_cpu_util = wrq->walt_stats.cumulative_runnable_avg_scaled; + + return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu)); +} + +static inline unsigned long cpu_util_cum(int cpu, int delta) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + u64 util = wrq->cum_window_demand_scaled; + unsigned long capacity = capacity_orig_of(cpu); + + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +extern unsigned int capacity_margin_freq; + +static inline unsigned long +add_capacity_margin(unsigned long cpu_capacity, int cpu) +{ + cpu_capacity = cpu_capacity * capacity_margin_freq * + (100 + sysctl_sched_load_boost[cpu]); + cpu_capacity /= 100; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; +} + +static inline enum sched_boost_policy sched_boost_policy(void) +{ + return boost_policy; +} + +static inline int sched_boost(void) +{ + return sched_boost_type; +} + +static inline bool rt_boost_on_big(void) +{ + return sched_boost() == FULL_THROTTLE_BOOST ? + (sched_boost_policy() == SCHED_BOOST_ON_BIG) : false; +} + +static inline bool is_full_throttle_boost(void) +{ + return sched_boost() == FULL_THROTTLE_BOOST; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + struct task_group *tg; + bool sched_boost_enabled; + struct walt_task_group *wtg; + + /* optimization for FT boost, skip looking at tg */ + if (sched_boost() == FULL_THROTTLE_BOOST) + return true; + + rcu_read_lock(); + css = task_css(p, cpu_cgrp_id); + if (!css) { + rcu_read_unlock(); + return false; + } + tg = container_of(css, struct task_group, css); + wtg = (struct walt_task_group *) tg->android_vendor_data1; + sched_boost_enabled = wtg->sched_boost_enable[sched_boost()]; + rcu_read_unlock(); + + return sched_boost_enabled; +} + +static inline bool task_placement_boost_enabled(struct task_struct *p) +{ + if (likely(sched_boost_policy() == SCHED_BOOST_NONE)) + return false; + + return task_sched_boost(p); +} + +static inline enum sched_boost_policy task_boost_policy(struct task_struct *p) +{ + enum sched_boost_policy policy; + + if (likely(sched_boost_policy() == SCHED_BOOST_NONE)) + return SCHED_BOOST_NONE; + + policy = task_sched_boost(p) ? sched_boost_policy() : SCHED_BOOST_NONE; + if (policy == SCHED_BOOST_ON_BIG) { + /* + * Filter out tasks less than min task util threshold + * under conservative boost. 
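+ * (sysctl_sched_min_task_util_for_boost defaults to roughly 1ms of a
+ * 20ms window scaled to 1024, per its declaration above.)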
+ */ + if (sched_boost() == CONSERVATIVE_BOOST && + task_util(p) <= sysctl_sched_min_task_util_for_boost) + policy = SCHED_BOOST_NONE; + } + + return policy; +} + +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline bool __cpu_overutilized(int cpu, int delta) +{ + return (capacity_orig_of(cpu) * 1024) < + ((cpu_util(cpu) + delta) * sched_capacity_margin_up[cpu]); +} + +static inline bool cpu_overutilized(int cpu) +{ + return __cpu_overutilized(cpu, 0); +} + +static inline int asym_cap_siblings(int cpu1, int cpu2) +{ + return (cpumask_test_cpu(cpu1, &asym_cap_sibling_cpus) && + cpumask_test_cpu(cpu2, &asym_cap_sibling_cpus)); +} + +static inline bool asym_cap_sibling_group_has_capacity(int dst_cpu, int margin) +{ + int sib1, sib2; + int nr_running; + unsigned long total_util, total_capacity; + + if (cpumask_empty(&asym_cap_sibling_cpus) || + cpumask_test_cpu(dst_cpu, &asym_cap_sibling_cpus)) + return false; + + sib1 = cpumask_first(&asym_cap_sibling_cpus); + sib2 = cpumask_last(&asym_cap_sibling_cpus); + + if (!cpu_active(sib1) || !cpu_active(sib2)) + return false; + + nr_running = cpu_rq(sib1)->cfs.h_nr_running + + cpu_rq(sib2)->cfs.h_nr_running; + + if (nr_running <= 2) + return true; + + total_capacity = capacity_of(sib1) + capacity_of(sib2); + total_util = cpu_util(sib1) + cpu_util(sib2); + + return ((total_capacity * 100) > (total_util * margin)); +} + +/* Is frequency of two cpus synchronized with each other? */ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(src_cpu)->android_vendor_data1; + + if (src_cpu == dst_cpu) + return 1; + + if (asym_cap_siblings(src_cpu, dst_cpu)) + return 1; + + return cpumask_test_cpu(dst_cpu, &wrq->freq_domain_cpumask); +} + +static inline unsigned long task_util_est(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->demand_scaled; +} + +#ifdef CONFIG_UCLAMP_TASK +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return clamp(task_util_est(p), + uclamp_eff_value(p, UCLAMP_MIN), + uclamp_eff_value(p, UCLAMP_MAX)); +} +#else +static inline unsigned long uclamp_task_util(struct task_struct *p) +{ + return task_util_est(p); +} +#endif + +static inline int per_task_boost(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (wts->boost_period) { + if (sched_clock() > wts->boost_expires) { + wts->boost_period = 0; + wts->boost_expires = 0; + wts->boost = 0; + } + } + return wts->boost; +} + +static inline int cluster_first_cpu(struct walt_sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return arch_scale_cpu_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return arch_scale_cpu_capacity(cpu) == min_max_possible_capacity; +} + +static inline bool is_min_capacity_cluster(struct walt_sched_cluster *cluster) +{ + return is_min_capacity_cpu(cluster_first_cpu(cluster)); +} + +/* + * This is only for tracepoints to print the avg irq load. For + * task placment considerations, use sched_cpu_high_irqload(). 
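+ * high_irqload itself is only set while fewer than SCHED_HIGH_IRQ_TIMEOUT
+ * windows have elapsed since the CPU's last irq window; see
+ * walt_update_irqload().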
+ */ +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + s64 delta; + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + delta = wrq->window_start - wrq->last_irq_window; + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return wrq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + return wrq->high_irqload; +} + +static inline u64 +scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) +{ + return div64_u64(load * (u64)src_freq, (u64)dst_freq); +} + +static inline unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +static inline int same_cluster(int src_cpu, int dst_cpu) +{ + struct walt_rq *src_wrq = (struct walt_rq *) cpu_rq(src_cpu)->android_vendor_data1; + struct walt_rq *dest_wrq = (struct walt_rq *) cpu_rq(dst_cpu)->android_vendor_data1; + + return src_wrq->cluster == dest_wrq->cluster; +} + +static inline bool is_suh_max(void) +{ + return sysctl_sched_user_hint == sched_user_hint_max; +} + +#define DEFAULT_CGROUP_COLOC_ID 1 +static inline bool walt_should_kick_upmigrate(struct task_struct *p, int cpu) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + struct walt_related_thread_group *rtg = wts->grp; + + if (is_suh_max() && rtg && rtg->id == DEFAULT_CGROUP_COLOC_ID && + rtg->skip_min && wts->unfilter) + return is_min_capacity_cpu(cpu); + + return false; +} + +extern bool is_rtgb_active(void); +extern u64 get_rtgb_active_time(void); + +static inline unsigned int walt_nr_rtg_high_prio(int cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + return wrq->walt_stats.nr_rtg_high_prio_tasks; +} + +static inline bool task_fits_capacity(struct task_struct *p, + long capacity, + int cpu) +{ + unsigned int margin; + + /* + * Derive upmigration/downmigrate margin wrt the src/dest CPU. + */ + if (capacity_orig_of(task_cpu(p)) > capacity_orig_of(cpu)) + margin = sched_capacity_margin_down[cpu]; + else + margin = sched_capacity_margin_up[task_cpu(p)]; + + return capacity * 1024 > uclamp_task_util(p) * margin; +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_orig_of(cpu); + unsigned long max_capacity = max_possible_capacity; + unsigned long task_boost = per_task_boost(p); + + if (capacity == max_capacity) + return true; + + if (is_min_capacity_cpu(cpu)) { + if (task_boost_policy(p) == SCHED_BOOST_ON_BIG || + task_boost > 0 || + uclamp_boosted(p) || + walt_should_kick_upmigrate(p, cpu)) + return false; + } else { /* mid cap cpu */ + if (task_boost > TASK_BOOST_ON_MID) + return false; + } + + return task_fits_capacity(p, capacity, cpu); +} + +/* applying the task threshold for all types of low latency tasks. 
*/ +static inline bool walt_low_latency_task(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->low_latency && + (task_util(p) < sysctl_walt_low_latency_task_threshold); +} + +static inline unsigned int walt_get_idle_exit_latency(struct rq *rq) +{ + struct cpuidle_state *idle = idle_get_state(rq); + + if (idle) + return idle->exit_latency; + + return UINT_MAX; +} + +extern void sched_get_nr_running_avg(struct sched_avg_stats *stats); +extern void sched_update_hyst_times(void); + +extern enum sched_boost_policy sched_boost_policy(void); +extern void walt_rt_init(void); +extern void walt_cfs_init(void); +extern int walt_find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, + int sync, int sibling_count_hint); + +static inline unsigned int cpu_max_possible_freq(int cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + + return wrq->cluster->max_possible_freq; +} + +static inline unsigned int cpu_max_freq(int cpu) +{ + return mult_frac(cpu_max_possible_freq(cpu), capacity_orig_of(cpu), + arch_scale_cpu_capacity(cpu)); +} + +static inline unsigned int task_load(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->demand; +} + +static inline unsigned int task_pl(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->pred_demand; +} + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return (rcu_access_pointer(wts->grp) != NULL); +} + +static inline bool task_rtg_high_prio(struct task_struct *p) +{ + return task_in_related_thread_group(p) && + (p->prio <= sysctl_walt_rtg_cfs_boost_prio); +} + +static inline struct walt_related_thread_group +*task_related_thread_group(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return rcu_dereference(wts->grp); +} + +#define CPU_RESERVED 1 +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + return test_bit(CPU_RESERVED, &wrq->walt_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + return test_and_set_bit(CPU_RESERVED, &wrq->walt_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + clear_bit(CPU_RESERVED, &wrq->walt_flags); +} + +static inline bool +task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return cpu_of(rq) == task_cpu(p) && (p->on_rq || + wts->last_sleep_ts >= wrq->window_start); +} + +static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) +{ + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + + wrq->cum_window_demand_scaled += scaled_delta; + if (unlikely((s64)wrq->cum_window_demand_scaled < 0)) + wrq->cum_window_demand_scaled = 0; +} + +static inline void walt_irq_work_queue(struct irq_work *work) +{ + if (likely(cpu_online(raw_smp_processor_id()))) + irq_work_queue(work); + else 
+ irq_work_queue_on(work, cpumask_any(cpu_online_mask)); +} + +#define PF_WAKE_UP_IDLE 1 +static inline u32 sched_get_wake_up_idle(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return wts->wake_up_idle; +} + +static inline int sched_set_wake_up_idle(struct task_struct *p, + int wake_up_idle) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + wts->wake_up_idle = !!wake_up_idle; + return 0; +} + +static inline void set_wake_up_idle(bool enabled) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1; + + wts->wake_up_idle = enabled; +} + +extern int set_task_boost(int boost, u64 period); + +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + +#endif /* _WALT_H */ diff --git a/kernel/sched/walt/walt_cfs.c b/kernel/sched/walt/walt_cfs.c new file mode 100644 index 000000000000..c99337e54ea2 --- /dev/null +++ b/kernel/sched/walt/walt_cfs.c @@ -0,0 +1,785 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. + */ + +#include +#include + +#include "walt.h" +#include "trace.h" +#include "../../../drivers/android/binder_trace.h" + +/* Migration margins */ +unsigned int sched_capacity_margin_up[WALT_NR_CPUS] = { + [0 ... WALT_NR_CPUS-1] = 1078 /* ~5% margin */ +}; +unsigned int sched_capacity_margin_down[WALT_NR_CPUS] = { + [0 ... WALT_NR_CPUS-1] = 1205 /* ~15% margin */ +}; + +__read_mostly unsigned int sysctl_sched_prefer_spread; +unsigned int sysctl_walt_rtg_cfs_boost_prio = 99; /* disabled by default */ +unsigned int sched_small_task_threshold = 102; +__read_mostly unsigned int sysctl_sched_force_lb_enable = 1; +unsigned int capacity_margin_freq = 1280; /* ~20% margin */ + +static inline bool prefer_spread_on_idle(int cpu, bool new_ilb) +{ + switch (sysctl_sched_prefer_spread) { + case 1: + return is_min_capacity_cpu(cpu); + case 2: + return true; + case 3: + return (new_ilb && is_min_capacity_cpu(cpu)); + case 4: + return new_ilb; + default: + return false; + } +} + +static inline bool +bias_to_this_cpu(struct task_struct *p, int cpu, int start_cpu) +{ + bool base_test = cpumask_test_cpu(cpu, &p->cpus_mask) && + cpu_active(cpu); + bool start_cap_test = (capacity_orig_of(cpu) >= + capacity_orig_of(start_cpu)); + + return base_test && start_cap_test; +} + +static inline bool task_demand_fits(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_orig_of(cpu); + unsigned long max_capacity = max_possible_capacity; + + if (capacity == max_capacity) + return true; + + return task_fits_capacity(p, capacity, cpu); +} + +struct find_best_target_env { + bool is_rtg; + int need_idle; + bool boosted; + int fastpath; + int start_cpu; + int order_index; + int end_index; + bool strict_max; + int skip_cpu; +}; + +/* + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. 
+ */ +static unsigned long cpu_util_without(int cpu, struct task_struct *p) +{ + unsigned int util; + + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (likely(p->state == TASK_WAKING)) + return cpu_util(cpu); + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_util(cpu); + + util = max_t(long, cpu_util(cpu) - task_util(p), 0); + + /* + * Utilization (estimated) can exceed the CPU capacity, thus let's + * clamp to the maximum CPU capacity to ensure consistency with + * the cpu_util call. + */ + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static inline bool walt_get_rtg_status(struct task_struct *p) +{ + struct walt_related_thread_group *grp; + bool ret = false; + + rcu_read_lock(); + + grp = task_related_thread_group(p); + if (grp) + ret = grp->skip_min; + + rcu_read_unlock(); + + return ret; +} + +static inline bool walt_task_skip_min_cpu(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + return sched_boost() != CONSERVATIVE_BOOST && + walt_get_rtg_status(p) && wts->unfilter; +} + +static inline bool walt_is_many_wakeup(int sibling_count_hint) +{ + return sibling_count_hint >= sysctl_sched_many_wakeup_threshold; +} + +static inline bool walt_target_ok(int target_cpu, int order_index) +{ + return !((order_index != num_sched_clusters - 1) && + (cpumask_weight(&cpu_array[order_index][0]) == 1) && + (target_cpu == cpumask_first(&cpu_array[order_index][0]))); +} + +static void walt_get_indicies(struct task_struct *p, int *order_index, + int *end_index, int task_boost, bool boosted) +{ + int i = 0; + + *order_index = 0; + *end_index = 0; + + if (num_sched_clusters <= 1) + return; + + if (task_boost > TASK_BOOST_ON_MID) { + *order_index = num_sched_clusters - 1; + return; + } + + if (is_full_throttle_boost()) { + *order_index = num_sched_clusters - 1; + if ((*order_index > 1) && task_demand_fits(p, + cpumask_first(&cpu_array[*order_index][1]))) + *end_index = 1; + return; + } + + if (boosted || task_boost_policy(p) == SCHED_BOOST_ON_BIG || + walt_task_skip_min_cpu(p)) + *order_index = 1; + + for (i = *order_index ; i < num_sched_clusters - 1; i++) { + if (task_demand_fits(p, cpumask_first(&cpu_array[i][0]))) + break; + } + + *order_index = i; +} + +enum fastpaths { + NONE = 0, + SYNC_WAKEUP, + PREV_CPU_FASTPATH, +}; + +static void walt_find_best_target(struct sched_domain *sd, + cpumask_t *candidates, + struct task_struct *p, + struct find_best_target_env *fbt_env) +{ + unsigned long min_util = uclamp_task_util(p); + long target_max_spare_cap = 0; + unsigned long best_idle_cuml_util = ULONG_MAX; + unsigned int min_exit_latency = UINT_MAX; + int best_idle_cpu = -1; + int target_cpu = -1; + int i, start_cpu; + long spare_wake_cap, most_spare_wake_cap = 0; + int most_spare_cap_cpu = -1; + int prev_cpu = task_cpu(p); + int active_candidate = -1; + int order_index = fbt_env->order_index, end_index = fbt_env->end_index; + int cluster; + unsigned int target_nr_rtg_high_prio = UINT_MAX; + bool rtg_high_prio_task = task_rtg_high_prio(p); + cpumask_t visit_cpus; + bool io_task_pack = (order_index > 0 && p->in_iowait); + struct cfs_rq *cfs_rq; + + /* Find start CPU based on boost value */ + start_cpu = fbt_env->start_cpu; + + if (fbt_env->strict_max || io_task_pack) + target_max_spare_cap = 
LONG_MIN; + + if (p->state == TASK_RUNNING) + most_spare_wake_cap = ULONG_MAX; + + /* fast path for prev_cpu */ + if (((capacity_orig_of(prev_cpu) == capacity_orig_of(start_cpu)) || + asym_cap_siblings(prev_cpu, start_cpu)) && + cpu_active(prev_cpu) && cpu_online(prev_cpu) && + available_idle_cpu(prev_cpu)) { + target_cpu = prev_cpu; + fbt_env->fastpath = PREV_CPU_FASTPATH; + cpumask_set_cpu(target_cpu, candidates); + goto out; + } + + for (cluster = 0; cluster < num_sched_clusters; cluster++) { + cpumask_and(&visit_cpus, &p->cpus_mask, + &cpu_array[order_index][cluster]); + for_each_cpu(i, &visit_cpus) { + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util, new_util_cuml; + long spare_cap; + unsigned int idle_exit_latency = UINT_MAX; + + trace_sched_cpu_util(i); + + if (!cpu_active(i)) + continue; + + if (active_candidate == -1) + active_candidate = i; + + /* + * This CPU is the target of an active migration that's + * yet to complete. Avoid placing another task on it. + */ + if (is_reserved(i)) + continue; + + if (sched_cpu_high_irqload(i)) + continue; + + if (fbt_env->skip_cpu == i) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + wake_util = cpu_util_without(i, p); + new_util = wake_util + uclamp_task_util(p); + spare_wake_cap = capacity_orig - wake_util; + + if (spare_wake_cap > most_spare_wake_cap) { + most_spare_wake_cap = spare_wake_cap; + most_spare_cap_cpu = i; + } + + if ((per_task_boost(cpu_rq(i)->curr) == + TASK_BOOST_STRICT_MAX) && + !fbt_env->strict_max) + continue; + + /* get rq's utilization with this task included */ + cfs_rq = &cpu_rq(i)->cfs; + new_util_cuml = READ_ONCE(cfs_rq->avg.util_avg) + min_util; + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + if (!(fbt_env->strict_max || io_task_pack) && + new_util > capacity_orig) + continue; + + /* + * Pre-compute the maximum possible capacity we expect + * to have available on this CPU once the task is + * enqueued here. + */ + spare_cap = capacity_orig - new_util; + + /* + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - favoring shallowest idle states + * i.e. avoid to wakeup deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one of such CPUs are available it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performances by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumptions without affecting performance. + */ + if (available_idle_cpu(i)) { + idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(i)); + + /* + * Prefer shallowest over deeper idle state cpu, + * of same capacity cpus. 
+ */ + if (idle_exit_latency > min_exit_latency) + continue; + if (min_exit_latency == idle_exit_latency && + (best_idle_cpu == prev_cpu || + (i != prev_cpu && + new_util_cuml > best_idle_cuml_util))) + continue; + + min_exit_latency = idle_exit_latency; + best_idle_cuml_util = new_util_cuml; + best_idle_cpu = i; + continue; + } + + /* + * Consider only idle CPUs for active migration. + */ + if (p->state == TASK_RUNNING) + continue; + + /* + * Try to spread the rtg high prio tasks so that they + * don't preempt each other. This is a optimisitc + * check assuming rtg high prio can actually preempt + * the current running task with the given vruntime + * boost. + */ + if (rtg_high_prio_task) { + if (walt_nr_rtg_high_prio(i) > target_nr_rtg_high_prio) + continue; + + /* Favor CPUs with maximum spare capacity */ + if (walt_nr_rtg_high_prio(i) == target_nr_rtg_high_prio && + spare_cap < target_max_spare_cap) + continue; + } else { + /* Favor CPUs with maximum spare capacity */ + if (spare_cap < target_max_spare_cap) + continue; + } + + target_max_spare_cap = spare_cap; + target_nr_rtg_high_prio = walt_nr_rtg_high_prio(i); + target_cpu = i; + } + + if (best_idle_cpu != -1) + break; + + if ((cluster >= end_index) && (target_cpu != -1) && + walt_target_ok(target_cpu, order_index)) + break; + } + + if (best_idle_cpu != -1) + target_cpu = -1; + /* + * We set both idle and target as long as they are valid CPUs. + * If we don't find either, then we fallback to most_spare_cap, + * If we don't find most spare cap, we fallback to prev_cpu, + * provided that the prev_cpu is active. + * If the prev_cpu is not active, we fallback to active_candidate. + */ + + if (unlikely(target_cpu == -1)) { + if (best_idle_cpu != -1) + target_cpu = best_idle_cpu; + else if (most_spare_cap_cpu != -1) + target_cpu = most_spare_cap_cpu; + else if (!cpu_active(prev_cpu)) + target_cpu = active_candidate; + } + + if (target_cpu != -1) + cpumask_set_cpu(target_cpu, candidates); + if (best_idle_cpu != -1 && target_cpu != best_idle_cpu) + cpumask_set_cpu(best_idle_cpu, candidates); +out: + trace_sched_find_best_target(p, min_util, start_cpu, + best_idle_cpu, most_spare_cap_cpu, + target_cpu, order_index, end_index, + fbt_env->skip_cpu, p->state == TASK_RUNNING); +} + +static inline unsigned long +cpu_util_next_walt(int cpu, struct task_struct *p, int dst_cpu) +{ + struct walt_rq *wrq = (struct walt_rq *) cpu_rq(cpu)->android_vendor_data1; + unsigned long util = wrq->walt_stats.cumulative_runnable_avg_scaled; + bool queued = task_on_rq_queued(p); + + /* + * When task is queued, + * (a) The evaluating CPU (cpu) is task's current CPU. If the + * task is migrating, discount the task contribution from the + * evaluation cpu. + * (b) The evaluating CPU (cpu) is task's current CPU. If the + * task is NOT migrating, nothing to do. The contribution is + * already present on the evaluation CPU. + * (c) The evaluating CPU (cpu) is not task's current CPU. But + * the task is migrating to the evaluating CPU. So add the + * task contribution to it. + * (d) The evaluating CPU (cpu) is neither the current CPU nor + * the destination CPU. don't care. + * + * When task is NOT queued i.e waking. Task contribution is not + * present on any CPU. + * + * (a) If the evaluating CPU is the destination CPU, add the task + * contribution. + * (b) The evaluation CPU is not the destination CPU, don't care. 
+ */ + if (unlikely(queued)) { + if (task_cpu(p) == cpu) { + if (dst_cpu != cpu) + util = max_t(long, util - task_util(p), 0); + } else if (dst_cpu == cpu) { + util += task_util(p); + } + } else if (dst_cpu == cpu) { + util += task_util(p); + } + + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +/* + * compute_energy(): Estimates the energy that @pd would consume if @p was + * migrated to @dst_cpu. compute_energy() predicts what will be the utilization + * landscape of @pd's CPUs after the task migration, and uses the Energy Model + * to compute what would be the energy if we decided to actually migrate that + * task. + */ +static long +compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +{ + struct cpumask *pd_mask = perf_domain_span(pd); + unsigned long max_util = 0, sum_util = 0; + int cpu; + unsigned long cpu_util; + + /* + * The capacity state of CPUs of the current rd can be driven by CPUs + * of another rd if they belong to the same pd. So, account for the + * utilization of these CPUs too by masking pd with cpu_online_mask + * instead of the rd span. + * + * If an entire pd is outside of the current rd, it will not appear in + * its pd list and will not be accounted by compute_energy(). + */ + for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { + cpu_util = cpu_util_next_walt(cpu, p, dst_cpu); + sum_util += cpu_util; + max_util = max(max_util, cpu_util); + } + + return em_cpu_energy(pd->em_pd, max_util, sum_util); +} + +static inline long +walt_compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +{ + long energy = 0; + + for (; pd; pd = pd->next) + energy += compute_energy(p, dst_cpu, pd); + + return energy; +} + +static inline int wake_to_idle(struct task_struct *p) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + struct walt_task_struct *cur_wts = + (struct walt_task_struct *) current->android_vendor_data1; + + return (cur_wts->wake_up_idle || wts->wake_up_idle); +} + +/* return true if cpu should be chosen over best_energy_cpu */ +static inline bool select_cpu_same_energy(int cpu, int best_cpu, int prev_cpu) +{ + if (capacity_orig_of(cpu) < capacity_orig_of(best_cpu)) + return true; + + if (best_cpu == prev_cpu) + return false; + + if (available_idle_cpu(best_cpu) && walt_get_idle_exit_latency(cpu_rq(best_cpu)) <= 1) + return false; /* best_cpu is idle wfi or shallower */ + + if (available_idle_cpu(cpu) && walt_get_idle_exit_latency(cpu_rq(cpu)) <= 1) + return true; /* new cpu is idle wfi or shallower */ + + /* + * If we are this far this must be a tie between a busy and deep idle, + * pick the busy. 
+ */ + return available_idle_cpu(best_cpu); +} + +static DEFINE_PER_CPU(cpumask_t, energy_cpus); +int walt_find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, + int sync, int sibling_count_hint) +{ + unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int weight, cpu = smp_processor_id(), best_energy_cpu = prev_cpu; + struct perf_domain *pd; + unsigned long cur_energy; + cpumask_t *candidates; + bool is_rtg, curr_is_rtg; + struct find_best_target_env fbt_env; + bool need_idle = wake_to_idle(p) || uclamp_latency_sensitive(p); + u64 start_t = 0; + int delta = 0; + int task_boost = per_task_boost(p); + bool is_uclamp_boosted = uclamp_boosted(p); + bool boosted = is_uclamp_boosted || (task_boost > 0); + int start_cpu, order_index, end_index; + + if (walt_is_many_wakeup(sibling_count_hint) && prev_cpu != cpu && + cpumask_test_cpu(prev_cpu, &p->cpus_mask)) + return prev_cpu; + + if (unlikely(!cpu_array)) + return -EPERM; + + walt_get_indicies(p, &order_index, &end_index, task_boost, boosted); + start_cpu = cpumask_first(&cpu_array[order_index][0]); + + is_rtg = task_in_related_thread_group(p); + curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr); + + fbt_env.fastpath = 0; + fbt_env.need_idle = need_idle; + + if (trace_sched_task_util_enabled()) + start_t = sched_clock(); + + /* Pre-select a set of candidate CPUs. */ + candidates = this_cpu_ptr(&energy_cpus); + cpumask_clear(candidates); + + if (sync && (need_idle || (is_rtg && curr_is_rtg))) + sync = 0; + + if (sync && bias_to_this_cpu(p, cpu, start_cpu)) { + best_energy_cpu = cpu; + fbt_env.fastpath = SYNC_WAKEUP; + goto done; + } + + rcu_read_lock(); + pd = rcu_dereference(rd->pd); + if (!pd) + goto fail; + + fbt_env.is_rtg = is_rtg; + fbt_env.start_cpu = start_cpu; + fbt_env.order_index = order_index; + fbt_env.end_index = end_index; + fbt_env.boosted = boosted; + fbt_env.strict_max = is_rtg && + (task_boost == TASK_BOOST_STRICT_MAX); + fbt_env.skip_cpu = walt_is_many_wakeup(sibling_count_hint) ? + cpu : -1; + + walt_find_best_target(NULL, candidates, p, &fbt_env); + + /* Bail out if no candidate was found. */ + weight = cpumask_weight(candidates); + if (!weight) + goto unlock; + + /* If there is only one sensible candidate, select it now. */ + cpu = cpumask_first(candidates); + if (weight == 1 && (available_idle_cpu(cpu) || cpu == prev_cpu)) { + best_energy_cpu = cpu; + goto unlock; + } + + if (p->state == TASK_WAKING) + delta = task_util(p); + + if (task_placement_boost_enabled(p) || fbt_env.need_idle || + boosted || is_rtg || __cpu_overutilized(prev_cpu, delta) || + !task_fits_max(p, prev_cpu) || !cpu_active(prev_cpu)) { + best_energy_cpu = cpu; + goto unlock; + } + + if (cpumask_test_cpu(prev_cpu, &p->cpus_mask)) + prev_delta = best_delta = + walt_compute_energy(p, prev_cpu, pd); + else + prev_delta = best_delta = ULONG_MAX; + + /* Select the best candidate energy-wise. 
*/ + for_each_cpu(cpu, candidates) { + if (cpu == prev_cpu) + continue; + + cur_energy = walt_compute_energy(p, cpu, pd); + trace_sched_compute_energy(p, cpu, cur_energy, + prev_delta, best_delta, best_energy_cpu); + + if (cur_energy < best_delta) { + best_delta = cur_energy; + best_energy_cpu = cpu; + } else if (cur_energy == best_delta) { + if (select_cpu_same_energy(cpu, best_energy_cpu, + prev_cpu)) { + best_delta = cur_energy; + best_energy_cpu = cpu; + } + } + } + +unlock: + rcu_read_unlock(); + + /* + * Pick the prev CPU, if best energy CPU can't saves at least 6% of + * the energy used by prev_cpu. + */ + if (!(available_idle_cpu(best_energy_cpu) && + walt_get_idle_exit_latency(cpu_rq(best_energy_cpu)) <= 1) && + (prev_delta != ULONG_MAX) && (best_energy_cpu != prev_cpu) && + ((prev_delta - best_delta) <= prev_delta >> 4) && + (capacity_orig_of(prev_cpu) <= capacity_orig_of(start_cpu))) + best_energy_cpu = prev_cpu; + +done: + trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu, + sync, fbt_env.need_idle, fbt_env.fastpath, + task_boost_policy(p), start_t, boosted, is_rtg, + walt_get_rtg_status(p), start_cpu); + + return best_energy_cpu; + +fail: + rcu_read_unlock(); + return -EPERM; +} + +static void +walt_select_task_rq_fair(void *unused, struct task_struct *p, int prev_cpu, + int sd_flag, int wake_flags, int *target_cpu) +{ + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + int sibling_count_hint = p->wake_q_head ? p->wake_q_head->count : 1; + + *target_cpu = walt_find_energy_efficient_cpu(p, prev_cpu, sync, sibling_count_hint); + if (unlikely(*target_cpu < 0)) + *target_cpu = prev_cpu; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static unsigned long task_h_load(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + update_cfs_rq_h_load(cfs_rq); + return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, + cfs_rq_load_avg(cfs_rq) + 1); +} +#else +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.avg.load_avg; +} +#endif + +static void walt_update_misfit_status(void *unused, struct task_struct *p, + struct rq *rq, bool *need_update) +{ + *need_update = false; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_max(p, cpu_of(rq))) { + rq->misfit_task_load = 0; + return; + } + + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. 
+ */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); +} + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static void walt_place_entity(void *unused, struct sched_entity *se, u64 *vruntime) +{ + if (entity_is_task(se)) { + unsigned long thresh = sysctl_sched_latency; + + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + if ((per_task_boost(task_of(se)) == TASK_BOOST_STRICT_MAX) || + walt_low_latency_task(task_of(se)) || + task_rtg_high_prio(task_of(se))) { + *vruntime -= sysctl_sched_latency; + *vruntime -= thresh; + se->vruntime = *vruntime; + } + } +} + +static void walt_binder_low_latency_set(void *unused, struct task_struct *task) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) task->android_vendor_data1; + + if (task && current->signal && + (current->signal->oom_score_adj == 0) && + (current->prio < DEFAULT_PRIO)) + wts->low_latency = true; +} + +static void walt_binder_low_latency_clear(void *unused, struct binder_transaction *t) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) current->android_vendor_data1; + + if (wts->low_latency) + wts->low_latency = false; +} + +void walt_cfs_init(void) +{ + register_trace_android_rvh_select_task_rq_fair(walt_select_task_rq_fair, NULL); + register_trace_android_rvh_update_misfit_status(walt_update_misfit_status, NULL); + register_trace_android_rvh_place_entity(walt_place_entity, NULL); + + register_trace_android_vh_binder_wakeup_ilocked(walt_binder_low_latency_set, NULL); + register_trace_binder_transaction_received(walt_binder_low_latency_clear, NULL); +} diff --git a/kernel/sched/walt/walt_debug.c b/kernel/sched/walt/walt_debug.c new file mode 100644 index 000000000000..f8679c032f39 --- /dev/null +++ b/kernel/sched/walt/walt_debug.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. + */ + +#include + +#include + +#include "walt.h" +#include "walt_debug.h" + +static void dump_throttled_rt_tasks(void *unused, int cpu, u64 clock, + ktime_t rt_period, u64 rt_runtime, s64 rt_period_timer_expires) +{ + printk_deferred("sched: RT throttling activated for cpu %d\n", cpu); + printk_deferred("rt_period_timer: expires=%lld now=%llu runtime=%llu period=%llu\n", + rt_period_timer_expires, ktime_get_ns(), rt_runtime, rt_period); + printk_deferred("potential CPU hogs:\n"); +#ifdef CONFIG_SCHED_INFO + if (sched_info_on()) + printk_deferred("current %s (%d) is running for %llu nsec\n", + current->comm, current->pid, + clock - current->sched_info.last_arrival); +#endif + BUG(); +} + +static void android_rvh_schedule_bug(void *unused, void *unused2) +{ + BUG(); +} + +static int __init walt_debug_init(void) +{ + int ret; + + ret = preemptirq_long_init(); + if (!ret) + return ret; + + register_trace_android_vh_dump_throttled_rt_tasks(dump_throttled_rt_tasks, NULL); + register_trace_android_rvh_schedule_bug(android_rvh_schedule_bug, NULL); + + return 0; +} +module_init(walt_debug_init); + +MODULE_DESCRIPTION("QTI WALT Debug Module"); +MODULE_LICENSE("GPL v2"); diff --git a/kernel/sched/walt/walt_debug.h b/kernel/sched/walt/walt_debug.h new file mode 100644 index 000000000000..282836c14743 --- /dev/null +++ b/kernel/sched/walt/walt_debug.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2021, The Linux Foundation. 
All rights reserved. + */ +int preemptirq_long_init(void); diff --git a/kernel/sched/walt/walt_lb.c b/kernel/sched/walt/walt_lb.c new file mode 100644 index 000000000000..45c68da4f2c2 --- /dev/null +++ b/kernel/sched/walt/walt_lb.c @@ -0,0 +1,742 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. + */ + +#include + +#include "walt.h" +#include "trace.h" + +extern u64 sched_ktime_clock(void); // TODO +static void walt_detach_task(struct task_struct *p, struct rq *src_rq, + struct rq *dst_rq) +{ + deactivate_task(src_rq, p, 0); + double_lock_balance(src_rq, dst_rq); + if (!(src_rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(src_rq); + set_task_cpu(p, dst_rq->cpu); + double_unlock_balance(src_rq, dst_rq); +} + +static void walt_attach_task(struct task_struct *p, struct rq *rq) +{ + activate_task(rq, p, 0); + check_preempt_curr(rq, p, 0); +} + +static int walt_lb_active_migration(void *data) +{ + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct walt_rq *wrq = (struct walt_rq *) busiest_rq->android_vendor_data1; + struct task_struct *push_task = wrq->push_task; + int push_task_detached = 0; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* sanity checks before initiating the pull */ + if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) + goto out_unlock; + + if (unlikely(busiest_cpu != raw_smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; + + if (busiest_rq->nr_running <= 1) + goto out_unlock; + + BUG_ON(busiest_rq == target_rq); + + if (task_on_rq_queued(push_task) && + push_task->state == TASK_RUNNING && + task_cpu(push_task) == busiest_cpu && + cpu_active(target_cpu)) { + walt_detach_task(push_task, busiest_rq, target_rq); + push_task_detached = 1; + } + +out_unlock: /* called with busiest_rq lock */ + busiest_rq->active_balance = 0; + target_cpu = busiest_rq->push_cpu; + clear_reserved(target_cpu); + wrq->push_task = NULL; + raw_spin_unlock(&busiest_rq->lock); + + if (push_task_detached) { + if (push_task_detached) { + raw_spin_lock(&target_rq->lock); + walt_attach_task(push_task, target_rq); + raw_spin_unlock(&target_rq->lock); + } + } + put_task_struct(push_task); + + local_irq_enable(); + return 0; +} + +struct walt_lb_rotate_work { + struct work_struct w; + struct task_struct *src_task; + struct task_struct *dst_task; + int src_cpu; + int dst_cpu; +}; + +DEFINE_PER_CPU(struct walt_lb_rotate_work, walt_lb_rotate_works); + +static void walt_lb_rotate_work_func(struct work_struct *work) +{ + struct walt_lb_rotate_work *wr = container_of(work, + struct walt_lb_rotate_work, w); + + migrate_swap(wr->src_task, wr->dst_task, wr->dst_cpu, wr->src_cpu); + + put_task_struct(wr->src_task); + put_task_struct(wr->dst_task); + + clear_reserved(wr->src_cpu); + clear_reserved(wr->dst_cpu); +} + +static void walt_lb_rotate_work_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct walt_lb_rotate_work *wr = &per_cpu(walt_lb_rotate_works, i); + + INIT_WORK(&wr->w, walt_lb_rotate_work_func); + } +} + +#define WALT_ROTATION_THRESHOLD_NS 16000000 +static void walt_lb_check_for_rotation(struct rq *src_rq) +{ + u64 wc, wait, max_wait = 0, run, max_run = 0; + int deserved_cpu = nr_cpu_ids, dst_cpu = nr_cpu_ids; + int i, src_cpu = cpu_of(src_rq); + struct rq *dst_rq; + struct walt_lb_rotate_work *wr = NULL; + struct walt_task_struct *wts; + + if (!is_min_capacity_cpu(src_cpu)) + 
return; + + wc = sched_ktime_clock(); + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + if (!is_min_capacity_cpu(i)) + break; + + if (is_reserved(i)) + continue; + + if (!rq->misfit_task_load) + continue; + + wts = (struct walt_task_struct *) rq->curr->android_vendor_data1; + wait = wc - wts->last_enqueued_ts; + if (wait > max_wait) { + max_wait = wait; + deserved_cpu = i; + } + } + + if (deserved_cpu != src_cpu) + return; + + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + + if (is_min_capacity_cpu(i)) + continue; + + if (is_reserved(i)) + continue; + + if (rq->curr->prio < MAX_RT_PRIO) + continue; + + if (rq->nr_running > 1) + continue; + + wts = (struct walt_task_struct *) rq->curr->android_vendor_data1; + run = wc - wts->last_enqueued_ts; + + if (run < WALT_ROTATION_THRESHOLD_NS) + continue; + + if (run > max_run) { + max_run = run; + dst_cpu = i; + } + } + + if (dst_cpu == nr_cpu_ids) + return; + + dst_rq = cpu_rq(dst_cpu); + + double_rq_lock(src_rq, dst_rq); + if (dst_rq->curr->prio >= MAX_RT_PRIO && dst_rq->curr != dst_rq->idle && + src_rq->curr->prio >= MAX_RT_PRIO && src_rq->curr != src_rq->idle) { + get_task_struct(src_rq->curr); + get_task_struct(dst_rq->curr); + + mark_reserved(src_cpu); + mark_reserved(dst_cpu); + wr = &per_cpu(walt_lb_rotate_works, src_cpu); + + wr->src_task = src_rq->curr; + wr->dst_task = dst_rq->curr; + + wr->src_cpu = src_cpu; + wr->dst_cpu = dst_cpu; + } + double_rq_unlock(src_rq, dst_rq); + + if (wr) + queue_work_on(src_cpu, system_highpri_wq, &wr->w); +} + +static inline bool _walt_can_migrate_task(struct task_struct *p, int dst_cpu, + bool to_lower) +{ + struct walt_rq *wrq = (struct walt_rq *) task_rq(p)->android_vendor_data1; + + if (to_lower) { + if (p->in_iowait) + return false; + if (per_task_boost(p) == TASK_BOOST_STRICT_MAX && + task_in_related_thread_group(p)) + return false; + } + + /* Don't detach task if it is under active migration */ + if (wrq->push_task == p) + return false; + + return true; +} + +static inline bool need_active_lb(struct task_struct *p, int dst_cpu, + int src_cpu) +{ + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (cpu_rq(src_cpu)->active_balance) + return false; + + if (capacity_orig_of(dst_cpu) <= capacity_orig_of(src_cpu)) + return false; + + if (!wts->misfit) + return false; + + return true; +} + +static int walt_lb_pull_tasks(int dst_cpu, int src_cpu) +{ + struct rq *dst_rq = cpu_rq(dst_cpu); + struct rq *src_rq = cpu_rq(src_cpu); + unsigned long flags; + struct task_struct *pulled_task = NULL, *p; + bool active_balance = false, to_lower; + struct walt_rq *wrq = (struct walt_rq *) src_rq->android_vendor_data1; + struct walt_task_struct *wts; + + BUG_ON(src_cpu == dst_cpu); + + to_lower = capacity_orig_of(dst_cpu) < capacity_orig_of(src_cpu); + + raw_spin_lock_irqsave(&src_rq->lock, flags); + list_for_each_entry_reverse(p, &src_rq->cfs_tasks, se.group_node) { + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) + continue; + + if (!_walt_can_migrate_task(p, dst_cpu, to_lower)) + continue; + + if (task_running(src_rq, p)) { + + if (need_active_lb(p, dst_cpu, src_cpu)) { + active_balance = true; + break; + } + continue; + } + + walt_detach_task(p, src_rq, dst_rq); + pulled_task = p; + break; + } + + if (active_balance) { + src_rq->active_balance = 1; + src_rq->push_cpu = dst_cpu; + get_task_struct(p); + wrq->push_task = p; + mark_reserved(dst_cpu); + } + /* lock must be dropped before waking the stopper */ + raw_spin_unlock_irqrestore(&src_rq->lock, 
flags); + + /* + * Using our custom active load balance callback so that + * the push_task is really pulled onto this CPU. + */ + if (active_balance) { + wts = (struct walt_task_struct *) p->android_vendor_data1; + trace_walt_active_load_balance(p, src_cpu, dst_cpu, wts); + stop_one_cpu_nowait(src_cpu, walt_lb_active_migration, + src_rq, &src_rq->active_balance_work); + return 0; /* we did not pull any task here */ + } + + if (!pulled_task) + return 0; + + raw_spin_lock_irqsave(&dst_rq->lock, flags); + walt_attach_task(p, dst_rq); + raw_spin_unlock_irqrestore(&dst_rq->lock, flags); + + return 1; /* we pulled 1 task */ +} + +static int walt_lb_find_busiest_similar_cap_cpu(int dst_cpu, const cpumask_t *src_mask) +{ + int i; + int busiest_cpu = -1; + int busiest_nr = 1; /* we need atleast 2 */ + unsigned long util, busiest_util = 0; + struct walt_rq *wrq; + + for_each_cpu(i, src_mask) { + wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1; + trace_walt_lb_cpu_util(i, wrq); + + if (cpu_rq(i)->cfs.h_nr_running < 2) + continue; + + util = cpu_util(i); + if (util < busiest_util) + continue; + + busiest_nr = cpu_rq(i)->cfs.h_nr_running; + busiest_util = util; + busiest_cpu = i; + } + + return busiest_cpu; +} + +#define SMALL_TASK_THRESHOLD 102 +static int walt_lb_find_busiest_higher_cap_cpu(int dst_cpu, const cpumask_t *src_mask) +{ + int i; + int busiest_cpu = -1; + int busiest_nr = 1; /* we need atleast 2 */ + unsigned long util, busiest_util = 0; + unsigned long total_capacity = 0, total_util = 0, total_nr = 0; + int total_cpus = 0; + struct walt_rq *wrq; + + for_each_cpu(i, src_mask) { + + if (!cpu_active(i)) + continue; + + wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1; + trace_walt_lb_cpu_util(i, wrq); + + util = cpu_util(i); + total_cpus += 1; + total_util += util; + total_capacity += capacity_orig_of(i); + total_nr += cpu_rq(i)->cfs.h_nr_running; + + if (cpu_rq(i)->cfs.h_nr_running < 2) + continue; + + if (cpu_rq(i)->cfs.h_nr_running == 2 && + task_util(cpu_rq(i)->curr) < SMALL_TASK_THRESHOLD) + continue; + + /* + * During rotation, two silver fmax tasks gets + * placed on gold/prime and the CPU may not be + * overutilized but for rotation, we have to + * spread out. + */ + if (!walt_rotation_enabled && !cpu_overutilized(i)) + continue; + + if (util < busiest_util) + continue; + + busiest_nr = cpu_rq(i)->cfs.h_nr_running; + busiest_util = util; + busiest_cpu = i; + } + + /* + * Don't allow migrating to lower cluster unless this high + * capacity cluster is sufficiently loaded. + */ + if (!walt_rotation_enabled) { + if (total_nr <= total_cpus || total_util * 1280 < total_capacity * 1024) + busiest_cpu = -1; + } + + return busiest_cpu; +} + +static int walt_lb_find_busiest_lower_cap_cpu(int dst_cpu, const cpumask_t *src_mask) +{ + int i; + int busiest_cpu = -1; + int busiest_nr = 1; /* we need atleast 2 */ + unsigned long util, busiest_util = 0; + unsigned long total_capacity = 0, total_util = 0, total_nr = 0; + int total_cpus = 0; + int busy_nr_big_tasks = 0; + struct walt_rq *wrq; + + /* + * A higher capacity CPU is looking at a lower capacity + * cluster. active balance and big tasks are in play. + * other than that, it is very much same as above. we + * really don't need this as a separate block. will + * refactor this after final testing is done. 
+ */ + for_each_cpu(i, src_mask) { + wrq = (struct walt_rq *) cpu_rq(i)->android_vendor_data1; + + if (!cpu_active(i)) + continue; + + trace_walt_lb_cpu_util(i, wrq); + + util = cpu_util(i); + total_cpus += 1; + total_util += util; + total_capacity += capacity_orig_of(i); + total_nr += cpu_rq(i)->cfs.h_nr_running; + + /* + * no point in selecting this CPU as busy, as + * active balance is in progress. + */ + if (cpu_rq(i)->active_balance) + continue; + + if (cpu_rq(i)->cfs.h_nr_running < 2 && !wrq->walt_stats.nr_big_tasks) + continue; + + if (!walt_rotation_enabled && !cpu_overutilized(i)) + continue; + + if (util < busiest_util) + continue; + + busiest_nr = cpu_rq(i)->cfs.h_nr_running; + busiest_util = util; + busiest_cpu = i; + busy_nr_big_tasks = wrq->walt_stats.nr_big_tasks; + } + + if (!walt_rotation_enabled && !busy_nr_big_tasks) { + if (total_nr <= total_cpus || total_util * 1280 < total_capacity * 1024) + busiest_cpu = -1; + } + + return busiest_cpu; +} + +static int walt_lb_find_busiest_cpu(int dst_cpu, const cpumask_t *src_mask) +{ + int fsrc_cpu = cpumask_first(src_mask); + int busiest_cpu; + + if (capacity_orig_of(dst_cpu) == capacity_orig_of(fsrc_cpu)) + busiest_cpu = walt_lb_find_busiest_similar_cap_cpu(dst_cpu, + src_mask); + else if (capacity_orig_of(dst_cpu) < capacity_orig_of(fsrc_cpu)) + busiest_cpu = walt_lb_find_busiest_lower_cap_cpu(dst_cpu, + src_mask); + else + busiest_cpu = walt_lb_find_busiest_higher_cap_cpu(dst_cpu, + src_mask); + + return busiest_cpu; +} + +static DEFINE_RAW_SPINLOCK(walt_lb_migration_lock); +static void walt_lb_tick(void *unused, struct rq *rq) +{ + int prev_cpu = rq->cpu, new_cpu, ret; + struct task_struct *p = rq->curr; + unsigned long flags; + struct walt_rq *wrq = (struct walt_rq *) rq->android_vendor_data1; + struct walt_task_struct *wts = (struct walt_task_struct *) p->android_vendor_data1; + + if (!rq->misfit_task_load) + return; + + if (p->state != TASK_RUNNING || p->nr_cpus_allowed == 1) + return; + + raw_spin_lock_irqsave(&walt_lb_migration_lock, flags); + + if (walt_rotation_enabled) { + walt_lb_check_for_rotation(rq); + goto out_unlock; + } + + rcu_read_lock(); + new_cpu = walt_find_energy_efficient_cpu(p, prev_cpu, 0, 1); + rcu_read_unlock(); + + if (new_cpu < 0 || same_cluster(new_cpu, prev_cpu)) + goto out_unlock; + + raw_spin_lock(&rq->lock); + if (rq->active_balance) { + raw_spin_unlock(&rq->lock); + goto out_unlock; + } + rq->active_balance = 1; + rq->push_cpu = new_cpu; + get_task_struct(p); + wrq->push_task = p; + raw_spin_unlock(&rq->lock); + + mark_reserved(new_cpu); + raw_spin_unlock_irqrestore(&walt_lb_migration_lock, flags); + + trace_walt_active_load_balance(p, prev_cpu, new_cpu, wts); + ret = stop_one_cpu_nowait(prev_cpu, + walt_lb_active_migration, rq, + &rq->active_balance_work); + if (!ret) + clear_reserved(new_cpu); + else + wake_up_if_idle(new_cpu); + + return; + +out_unlock: + raw_spin_unlock_irqrestore(&walt_lb_migration_lock, flags); +} + +static void walt_newidle_balance(void *unused, struct rq *this_rq, + struct rq_flags *rf, int *pulled_task, + int *done) +{ + int this_cpu = this_rq->cpu; + struct walt_rq *wrq = (struct walt_rq *) this_rq->android_vendor_data1; + int order_index = wrq->cluster->id; + int cluster = 0; + int busy_cpu; + + if (unlikely(!cpu_array)) + return; + + /* + * newly idle load balance is completely handled here, so + * set done to skip the load balance by the caller. 
+ */ + *done = 1; + *pulled_task = 0; + + /* + * This CPU is about to enter idle, so clear the + * misfit_task_load and mark the idle stamp. + */ + this_rq->misfit_task_load = 0; + this_rq->idle_stamp = rq_clock(this_rq); + + if (!cpu_active(this_cpu)) + return; + + if (!READ_ONCE(this_rq->rd->overload)) + return; + + rq_unpin_lock(this_rq, rf); + raw_spin_unlock(&this_rq->lock); + + /* + * careful, we dropped the lock, and has to be acquired + * before returning. Since rq lock is dropped, tasks + * can be queued remotely, so keep a check on nr_running + * and bail out. + */ + do { + busy_cpu = walt_lb_find_busiest_cpu(this_cpu, + &cpu_array[order_index][cluster]); + + /* we got the busy/src cpu here. */ + if (busy_cpu != -1 || this_rq->nr_running > 0) + break; + + } while (++cluster < num_sched_clusters); + + /* sanity checks before attempting the pull */ + if (busy_cpu == -1 || this_rq->nr_running > 0 || (busy_cpu == this_cpu)) + goto out; + + *pulled_task = walt_lb_pull_tasks(this_cpu, busy_cpu); + +out: + raw_spin_lock(&this_rq->lock); + if (this_rq->cfs.h_nr_running && !*pulled_task) + *pulled_task = 1; + + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + *pulled_task = -1; + + /* reset the idle time stamp if we pulled any task */ + if (*pulled_task) + this_rq->idle_stamp = 0; + + rq_repin_lock(this_rq, rf); + + trace_walt_newidle_balance(this_cpu, busy_cpu, *pulled_task); +} + +static void walt_find_busiest_queue(void *unused, int dst_cpu, + struct sched_group *group, + struct cpumask *env_cpus, + struct rq **busiest, int *done) +{ + int fsrc_cpu = group_first_cpu(group); + int busiest_cpu = -1; + struct cpumask src_mask; + + *done = 1; + *busiest = NULL; + + /* + * same cluster means, there will only be 1 + * CPU in the busy group, so just select it. + */ + if (same_cluster(dst_cpu, fsrc_cpu)) { + busiest_cpu = fsrc_cpu; + goto done; + } + + /* + * We will allow inter cluster migrations + * only if the source group is sufficiently + * loaded. The upstream load balancer is a + * bit more generous. + * + * re-using the same code that we use it + * for newly idle load balance. The policies + * remain same. + */ + cpumask_and(&src_mask, sched_group_span(group), env_cpus); + busiest_cpu = walt_lb_find_busiest_cpu(dst_cpu, &src_mask); +done: + if (busiest_cpu != -1) + *busiest = cpu_rq(busiest_cpu); + + trace_walt_find_busiest_queue(dst_cpu, busiest_cpu, src_mask.bits[0]); +} + +static void walt_migrate_queued_task(void *unused, struct rq *rq, + struct rq_flags *rf, + struct task_struct *p, + int new_cpu, int *detached) +{ + /* + * WALT expects both source and destination rqs to be + * held when set_task_cpu() is called on a queued task. + * so implementing this detach hook. unpin the lock + * before detaching and repin it later to make lockdep + * happy. + */ + BUG_ON(!rf); + + rq_unpin_lock(rq, rf); + walt_detach_task(p, rq, cpu_rq(new_cpu)); + rq_repin_lock(rq, rf); + + *detached = 1; +} + +/* + * we only decide if nohz balance kick is needed or not. the + * first CPU in the nohz.idle will come out of idle and do + * load balance on behalf of every CPU. adding another hook + * to decide which cpu to kick is useless. most of the time, + * it is impossible to decide which CPU has to come out because + * we get to kick only once. + */ +static void walt_nohz_balancer_kick(void *unused, struct rq *rq, + unsigned int *flags, int *done) +{ + *done = 1; + + /* + * tick path migration takes care of misfit task. 
+ * so we have to check for nr_running >= 2 here. + */ + if (rq->nr_running >= 2 && cpu_overutilized(rq->cpu)) { + *flags = NOHZ_KICK_MASK; + trace_walt_nohz_balance_kick(rq); + } +} + +static void walt_can_migrate_task(void *unused, struct task_struct *p, + int dst_cpu, int *can_migrate) +{ + bool to_lower; + + to_lower = capacity_orig_of(dst_cpu) < capacity_orig_of(task_cpu(p)); + + if (_walt_can_migrate_task(p, dst_cpu, to_lower)) + return; + + *can_migrate = 0; +} + +/* + * when WALT becomes module, this init will be called from + * another file and we don't have to define module_init(). + */ +void walt_lb_init(void) +{ + /* + * Any task movement outside task placement is called + * load balance, so moving the tick path and rotation + * code to here. we also use our custom active load balance + * stopper function instad of adding hooks to + * active_load_balance_cpu_stop() in fair.c + */ + walt_lb_rotate_work_init(); + + register_trace_android_rvh_migrate_queued_task(walt_migrate_queued_task, NULL); + register_trace_android_rvh_sched_nohz_balancer_kick(walt_nohz_balancer_kick, NULL); + register_trace_android_rvh_can_migrate_task(walt_can_migrate_task, NULL); + register_trace_android_rvh_find_busiest_queue(walt_find_busiest_queue, NULL); + register_trace_android_rvh_sched_newidle_balance(walt_newidle_balance, NULL); + + /* + * TODO: + * scheduler tick is not a restricted hook so multiple entities + * can register for it. but from WALT, we will have only 1 hook + * and it will call our load balancer function later. + */ + register_trace_android_vh_scheduler_tick(walt_lb_tick, NULL); +} diff --git a/kernel/sched/walt/walt_rt.c b/kernel/sched/walt/walt_rt.c new file mode 100644 index 000000000000..87a783c9b974 --- /dev/null +++ b/kernel/sched/walt/walt_rt.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020-2021, The Linux Foundation. All rights reserved. + */ + +#include + +#include "walt.h" +#include "trace.h" + +static void rt_energy_aware_wake_cpu(void *unused, struct task_struct *task, + struct cpumask *lowest_mask, int ret, int *best_cpu) +{ + int cpu; + unsigned long util, best_cpu_util = ULONG_MAX; + unsigned long best_cpu_util_cum = ULONG_MAX; + unsigned long util_cum; + unsigned long tutil = task_util(task); + unsigned int best_idle_exit_latency = UINT_MAX; + unsigned int cpu_idle_exit_latency = UINT_MAX; + bool boost_on_big = rt_boost_on_big(); + int cluster; + int order_index = (boost_on_big && num_sched_clusters > 1) ? 1 : 0; + + if (!ret) + return; /* No targets found */ + + rcu_read_lock(); + for (cluster = 0; cluster < num_sched_clusters; cluster++) { + for_each_cpu_and(cpu, lowest_mask, &cpu_array[order_index][cluster]) { + trace_sched_cpu_util(cpu); + + if (!cpu_active(cpu)) + continue; + + if (sched_cpu_high_irqload(cpu)) + continue; + + if (__cpu_overutilized(cpu, tutil)) + continue; + + util = cpu_util(cpu); + + /* Find the least loaded CPU */ + if (util > best_cpu_util) + continue; + + /* + * If the previous CPU has same load, keep it as + * best_cpu. + */ + if (best_cpu_util == util && *best_cpu == task_cpu(task)) + continue; + + /* + * If candidate CPU is the previous CPU, select it. + * Otherwise, if its load is same with best_cpu and in + * a shallower C-state, select it. If all above + * conditions are same, select the least cumulative + * window demand CPU. 
+ */ + cpu_idle_exit_latency = walt_get_idle_exit_latency(cpu_rq(cpu)); + + util_cum = cpu_util_cum(cpu, 0); + if (cpu != task_cpu(task) && best_cpu_util == util) { + if (best_idle_exit_latency < cpu_idle_exit_latency) + continue; + + if (best_idle_exit_latency == cpu_idle_exit_latency && + best_cpu_util_cum < util_cum) + continue; + } + + best_idle_exit_latency = cpu_idle_exit_latency; + best_cpu_util_cum = util_cum; + best_cpu_util = util; + *best_cpu = cpu; + } + + if (*best_cpu != -1) + break; + } + + rcu_read_unlock(); +} + +void walt_rt_init(void) +{ + register_trace_android_rvh_find_lowest_rq(rt_energy_aware_wake_cpu, NULL); +} diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2fd8fd19eff0..38b5602bfa63 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -101,5 +101,3 @@ obj-$(CONFIG_IPC_LOGGING) += qcom_ipc_logging.o qcom_ipc_logging-y := ipc_logging.o ipc_logging_debug.o libftrace-y := ftrace.o - -obj-$(CONFIG_PREEMPTIRQ_TRACEPOINTS) += preemptirq_long.o diff --git a/modules.list.msm.lahaina b/modules.list.msm.lahaina index dd702ece3c6c..5a067d49eb8b 100755 --- a/modules.list.msm.lahaina +++ b/modules.list.msm.lahaina @@ -81,3 +81,4 @@ memory_dump_v2.ko llcc-qcom.ko qcom_edac.ko kryo_arm64_edac.ko +qcom-cpufreq-hw.ko
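Note on the fixed-point margin arithmetic used by the patch above: task_fits_capacity() compares "capacity * 1024 > util * margin" and __cpu_overutilized() compares "capacity_orig * 1024 < (util + delta) * margin_up", with the per-CPU defaults sched_capacity_margin_up = 1078 (~5% headroom) and sched_capacity_margin_down = 1205 (~15% headroom) defined in walt_cfs.c. The standalone sketch below is not part of the patch; the helper names and the example utilization/capacity values are hypothetical and chosen only to show how the asymmetric up/down margins behave.

	/* Standalone sketch of the WALT margin checks (illustrative values only). */
	#include <stdbool.h>
	#include <stdio.h>

	#define SCHED_CAPACITY_SCALE 1024UL

	/* Mirrors task_fits_capacity(): capacity * 1024 > util * margin */
	static bool fits_capacity_margin(unsigned long util, unsigned long capacity,
					 unsigned long margin)
	{
		return capacity * SCHED_CAPACITY_SCALE > util * margin;
	}

	/* Mirrors __cpu_overutilized(): capacity_orig * 1024 < (util + delta) * margin_up */
	static bool cpu_overutilized_margin(unsigned long util, unsigned long delta,
					    unsigned long capacity_orig,
					    unsigned long margin_up)
	{
		return capacity_orig * SCHED_CAPACITY_SCALE < (util + delta) * margin_up;
	}

	int main(void)
	{
		unsigned long margin_up = 1078;   /* default sched_capacity_margin_up */
		unsigned long margin_down = 1205; /* default sched_capacity_margin_down */
		unsigned long little_cap = 512;   /* hypothetical little-CPU capacity */

		/* util 470 passes the upmigration margin (470*1078 < 512*1024)... */
		printf("util=470 fits with up margin:   %d\n",
		       fits_capacity_margin(470, little_cap, margin_up));
		/* ...but fails the stricter downmigration margin (470*1205 > 512*1024). */
		printf("util=470 fits with down margin: %d\n",
		       fits_capacity_margin(470, little_cap, margin_down));
		/* A little CPU already at util 490 counts as overutilized (490*1078 > 512*1024). */
		printf("little CPU overutilized at 490: %d\n",
		       cpu_overutilized_margin(490, 0, little_cap, margin_up));
		return 0;
	}

The asymmetry is deliberate in the patch: the larger down margin means a task must be clearly small before it is considered to fit a lower-capacity CPU, while only modest headroom is required before it is pushed up.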