ANDROID: sched: Per-Sched-domain over utilization

The current implementation of overutilization aborts energy aware
scheduling if any CPU in the system is over-utilized. This patch introduces
an overutilization flag per sched domain level instead of a single
system-wide flag. Load balancing is done at any sched domain in which
a CPU is over-utilized. If energy aware scheduling is enabled and no
CPU in a sched domain is over-utilized, load balancing is skipped for
that sched domain and energy aware scheduling continues at that level.

The implementation takes advantage of the shared sched_domain structure
that is common across all the sched domains at a level. The new flag is
placed in this structure so that all the sched domains at the same level
share it. When a CPU is over-utilized, the flag is set on its level-1
sched_domain. The flag at the parent sched_domain level is set in either
of the following two scenarios:
 1. There is a misfit task on one of the CPUs in this sched_domain.
 2. The total utilization of the domain is greater than the domain capacity.

The flag is cleared if no CPU in a sched domain is over-utilized.
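
For illustration only, here is a minimal userspace sketch of the flag
handling described above. The toy_sd structure, helper names and the
1280/1024 capacity margin are made up for this example; the real logic
lives in the kernel/sched/fair.c hunks below.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy stand-in for one sched domain level; the real flag sits in
 * sched_domain_shared so every domain at the same level sees it.
 */
struct toy_sd {
    struct toy_sd *parent;
    bool asym_cpucapacity;  /* models SD_ASYM_CPUCAPACITY */
    bool overutilized;      /* models sched_domain_shared::overutilized */
};

/*
 * Scenario 1: a misfit task only helps at a level where CPU capacities
 * differ, so walk up to the first asymmetric-capacity parent.
 */
static void propagate_misfit(struct toy_sd *sd)
{
    struct toy_sd *p;

    for (p = sd->parent; p; p = p->parent) {
        if (p->asym_cpucapacity) {
            p->overutilized = true;
            break;
        }
    }
}

/*
 * Scenario 2: total domain utilization exceeds domain capacity
 * (scaled by an assumed ~1.25x margin), so flag the parent as well.
 */
static void propagate_util(struct toy_sd *sd, unsigned long total_util,
                           unsigned long total_capacity)
{
    const unsigned long margin = 1280;  /* illustrative, not the real tunable */

    if (sd->parent && total_capacity * 1024 < total_util * margin)
        sd->parent->overutilized = true;
}

/* Load balancing runs only at levels whose flag is set; the rest is left to EAS. */
static bool should_load_balance(const struct toy_sd *sd)
{
    return sd->overutilized;
}

int main(void)
{
    struct toy_sd die = { .parent = NULL, .asym_cpucapacity = true };
    struct toy_sd mc  = { .parent = &die };

    propagate_misfit(&mc);           /* a big task stuck on a little CPU */
    propagate_util(&mc, 1800, 2048); /* domain util above domain capacity */

    printf("balance at MC:  %d\n", should_load_balance(&mc));  /* 0 */
    printf("balance at DIE: %d\n", should_load_balance(&die)); /* 1 */
    return 0;
}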

This implementation can still have corner scenarios with respect to
misfit tasks. For example, consider a sched group with n CPUs and
n+1 tasks that are each 70% utilized. Ideally this is a case for a load
balance to happen in a parent sched domain. But neither is the total
group utilization high enough for a load balance to be triggered in the
parent domain, nor is there a CPU with a single over-utilized task that
would trigger a load balance in a parent domain. Then again, this could
be a purely academic scenario, as during task wakeup these tasks will be
placed more appropriately.

Signed-off-by: Thara Gopinath <thara.gopinath@linaro.org>
Change-Id: I3f327cff4080096a3e58208dd72c9b7f7913cdb2
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
Git-commit: addef37808728c719d8c095a75bcf81befdacdaf
Git-repo: https://android.googlesource.com/kernel/common/
[clingutla@codeaurora.org: Resolved trivial merge conflicts]
Signed-off-by: Lingutla Chandrasekhar <clingutla@codeaurora.org>
[satyap@codeaurora.org: port to 5.x and resolve trivial merge conflicts]
Signed-off-by: Satya Durga Srinivasu Prabhala <satyap@codeaurora.org>

include/linux/sched/topology.h

@@ -66,6 +66,10 @@ struct sched_domain_shared {
atomic_t ref;
atomic_t nr_busy_cpus;
int has_idle_cores;
#ifdef CONFIG_SCHED_WALT
bool overutilized;
#endif
};
struct sched_domain {

kernel/sched/fair.c

@@ -5297,12 +5297,40 @@ bool cpu_overutilized(int cpu)
return __cpu_overutilized(cpu, 0);
}
#ifdef CONFIG_SCHED_WALT
static bool sd_overutilized(struct sched_domain *sd)
{
return sd->shared->overutilized;
}
static void set_sd_overutilized(struct sched_domain *sd)
{
sd->shared->overutilized = true;
}
static void clear_sd_overutilized(struct sched_domain *sd)
{
sd->shared->overutilized = false;
}
#endif
static inline void update_overutilized_status(struct rq *rq)
{
#ifdef CONFIG_SCHED_WALT
struct sched_domain *sd;
rcu_read_lock();
sd = rcu_dereference(rq->sd);
if (sd && !sd_overutilized(sd) &&
cpu_overutilized(rq->cpu))
set_sd_overutilized(sd);
rcu_read_unlock();
#else
if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
}
#endif
}
#else
static inline void update_overutilized_status(struct rq *rq) { }
@@ -8047,7 +8075,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (static_branch_unlikely(&sched_energy_present)) {
struct root_domain *rd = env->dst_rq->rd;
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized) &&
if ((rcu_dereference(rd->pd) && !sd_overutilized(env->sd)) &&
env->idle == CPU_NEWLY_IDLE &&
!task_in_related_thread_group(p)) {
long util_cum_dst, util_cum_src;
@@ -8549,6 +8577,7 @@ struct sd_lb_stats {
unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long total_util; /* Total util of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -8569,6 +8598,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.total_util = 0UL,
.busiest_stat = {
.avg_load = 0UL,
.sum_nr_running = 0,
@@ -8944,9 +8974,13 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
if (cpu_overutilized(i))
if (cpu_overutilized(i)) {
*sg_status |= SG_OVERUTILIZED;
if (rq->misfit_task_load)
*sg_status |= SG_HAS_MISFIT_TASK;
}
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -9191,6 +9225,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sds->total_util += sgs->group_util;
trace_sched_load_balance_sg_stats(sg->cpumask[0],
sgs->group_type, sgs->idle_cpus,
@@ -9223,6 +9258,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/* update overload indicator if we are at root domain */
WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
#ifndef CONFIG_SCHED_WALT
/* Update over-utilization (tipping point, U >= 0) indicator */
WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
@@ -9231,7 +9267,50 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
#endif
}
#ifdef CONFIG_SCHED_WALT
if (sg_status & SG_OVERUTILIZED)
set_sd_overutilized(env->sd);
else
clear_sd_overutilized(env->sd);
/*
* If there is a misfit task on one cpu in this sched_domain
* it is likely that the imbalance cannot be sorted out among
* the cpus in this sched_domain. In this case set the
* overutilized flag at the parent sched_domain.
*/
if (sg_status & SG_HAS_MISFIT_TASK) {
struct sched_domain *sd = env->sd->parent;
/*
* In case of a misfit task, load balance at the parent
* sched domain level will make sense only if the cpus
* have a different capacity. If cpus at a domain level have
* the same capacity, the misfit task cannot be well
* accommodated on any of the cpus and there is no point in
* trying a load balance at this level.
*/
while (sd) {
if (sd->flags & SD_ASYM_CPUCAPACITY) {
set_sd_overutilized(sd);
break;
}
sd = sd->parent;
}
}
/*
* If the domain util is greater than the domain capacity, load balancing
* needs to be done at the next sched domain level as well.
*/
if (env->sd->parent &&
sds->total_capacity * 1024 < sds->total_util *
sched_capacity_margin_up[group_first_cpu(sds->local)])
set_sd_overutilized(env->sd->parent);
#endif
}
/**
@@ -9523,7 +9602,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (sched_energy_enabled()) {
struct root_domain *rd = env->dst_rq->rd;
#ifdef CONFIG_SCHED_WALT
if (rcu_dereference(rd->pd) && !sd_overutilized(env->sd)) {
#else
if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) {
#endif
int cpu_local, cpu_busiest;
unsigned long capacity_local, capacity_busiest;
@@ -10389,6 +10472,11 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
}
max_cost += sd->max_newidle_lb_cost;
#ifdef CONFIG_SCHED_WALT
if (!sd_overutilized(sd))
continue;
#endif
if (!(sd->flags & SD_LOAD_BALANCE))
continue;

kernel/sched/sched.h

@@ -822,6 +822,7 @@ struct max_cpu_capacity {
/* Scheduling group status flags */
#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
#define SG_HAS_MISFIT_TASK 0x4 /* Group has misfit task. */
/*
* We add the notion of a root-domain which will be used to define per-domain

kernel/sched/topology.c

@@ -1441,15 +1441,11 @@ sd_init(struct sched_domain_topology_level *tl,
sd->cache_nice_tries = 1;
}
/*
* For all levels sharing cache; connect a sched_domain_shared
* instance.
*/
if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
atomic_inc(&sd->shared->ref);
if (sd->flags & SD_SHARE_PKG_RESOURCES)
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
}
sd->private = sdd;