From 4c2f02b7b604d652cfcf3a580e7330a5711b4540 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Thu, 17 Dec 2015 07:34:27 +0800 Subject: [PATCH 01/71] sched/fair: Fix new task's load avg removed from source CPU in wake_up_new_task() If a newly created task is selected to go to a different CPU in fork balance when it wakes up the first time, its load averages should not be removed from the source CPU since they are never added to it before. The same is also applicable to a never used group entity. Fix it in remove_entity_load_avg(): when entity's last_update_time is 0, simply return. This should precisely identify the case in question, because in other migrations, the last_update_time is set to 0 after remove_entity_load_avg(). Reported-by: Steve Muckle Signed-off-by: Yuyang Du [peterz: cfs_rq_last_update_time] Signed-off-by: Peter Zijlstra (Intel) Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Morten Rasmussen Cc: Patrick Bellasi Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Link: http://lkml.kernel.org/r/20151216233427.GJ28098@intel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cfdc0e61066c..6e7d36f2ed8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2809,6 +2809,27 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } +#ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + u64 last_update_time_copy; + u64 last_update_time; + + do { + last_update_time_copy = cfs_rq->load_last_update_time_copy; + smp_rmb(); + last_update_time = cfs_rq->avg.last_update_time; + } while (last_update_time != last_update_time_copy); + + return last_update_time; +} +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ + return cfs_rq->avg.last_update_time; +} +#endif + /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). @@ -2818,17 +2839,14 @@ void remove_entity_load_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; -#ifndef CONFIG_64BIT - u64 last_update_time_copy; + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; - do { - last_update_time_copy = cfs_rq->load_last_update_time_copy; - smp_rmb(); - last_update_time = cfs_rq->avg.last_update_time; - } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif + last_update_time = cfs_rq_last_update_time(cfs_rq); __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); From 6ccce9a636497376b02791a73fccfa12cc007480 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 17 Sep 2015 16:10:56 +0100 Subject: [PATCH 02/71] cpufreq: Frequency invariant scheduler load-tracking support Implements cpufreq_scale_freq_capacity() to provide the scheduler with a frequency scaling correction factor for more accurate load-tracking. The factor is: current_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 29 +++++++++++++++++++++++++++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 32 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8412ce5f93a7..46a7d62c8174 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -347,6 +347,31 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) #endif } +/********************************************************************* + * FREQUENCY INVARIANT CPU CAPACITY * + *********************************************************************/ + +static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; + +static void +scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) +{ + unsigned long cur = freqs ? freqs->new : policy->cur; + unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + int cpu; + + pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", + cpumask_pr_args(policy->cpus), cur, policy->max, scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(freq_scale, cpu) = scale; +} + +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { @@ -450,6 +475,8 @@ void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, spin_unlock(&policy->transition_lock); + scale_freq_capacity(policy, freqs); + cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); @@ -2126,6 +2153,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, blocking_notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, new_policy); + scale_freq_capacity(new_policy, NULL); + policy->min = new_policy->min; policy->max = new_policy->max; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 177c7680c1a8..11c65f536b6c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -616,4 +616,7 @@ unsigned int cpufreq_generic_get(unsigned int cpu); int cpufreq_generic_init(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table, unsigned int transition_latency); + +struct sched_domain; +unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif /* _LINUX_CPUFREQ_H */ From 491f9d1db1058183c36429f1debe410e068cc9f2 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 12:47:48 +0100 Subject: [PATCH 03/71] arm: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Signed-off-by: Dietmar Eggemann --- arch/arm/include/asm/topology.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 370f7a732900..a69917b7d2c9 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,11 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#include +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +#endif + #else static inline void init_cpu_topology(void) { } From a1f0c358b74b2a7665160c45e95faeee2fa97f2a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:15:11 +0100 Subject: [PATCH 04/71] arm64: Enable frequency invariant scheduler load-tracking support Defines arch_scale_freq_capacity() to use cpufreq implementation. Including in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_freq_capacity() has to be declared extern in topology.h. Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index a3e9d6fdbf21..72c47c3fa7b3 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,6 +22,12 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +#ifdef CONFIG_CPU_FREQ +#define arch_scale_freq_capacity cpufreq_scale_freq_capacity +struct sched_domain; +extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +#endif + #include #endif /* _ASM_ARM_TOPOLOGY_H */ From ab004f8fce2a93afb36be89c3c4e1fa3f5e2481d Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 14 Apr 2015 16:25:31 +0100 Subject: [PATCH 05/71] arm: Update arch_scale_cpu_capacity() to reflect change to define arch_scale_cpu_capacity() is no longer a weak function but a #define instead. Include the #define in topology.h. cc: Russell King Signed-off-by: Morten Rasmussen --- arch/arm/include/asm/topology.h | 2 ++ arch/arm/kernel/topology.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index a69917b7d2c9..e3e596cbb1a7 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -28,6 +28,8 @@ const struct cpumask *cpu_coregroup_mask(int cpu); #include #define arch_scale_freq_capacity cpufreq_scale_freq_capacity #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #else diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 08b7847bf912..614554765e44 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -42,7 +42,7 @@ */ static DEFINE_PER_CPU(unsigned long, cpu_scale); -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { return per_cpu(cpu_scale, cpu); } From 6d8dc122538b51610d2b9ed64b5a4885853a2adc Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 7 May 2015 18:46:15 +0100 Subject: [PATCH 06/71] sched: Store system-wide maximum cpu capacity in root domain To be able to compare the capacity of the target cpu with the highest cpu capacity of the system in the wakeup path, store the system-wide maximum cpu capacity in the root domain. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++++++++ kernel/sched/sched.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eb70592f03f6..879bdffc928b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6971,6 +6971,7 @@ static int build_sched_domains(const struct cpumask *cpu_map, enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; + struct rq *rq = NULL; int i, ret = -ENOMEM; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); @@ -7021,11 +7022,18 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); + + if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) + rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); + if (rq) + pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); + ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf670..2b7ffa5a20ad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -543,6 +543,9 @@ struct root_domain { */ cpumask_var_t rto_mask; struct cpupri cpupri; + + /* Maximum cpu capacity in the system. */ + unsigned long max_cpu_capacity; }; extern struct root_domain def_root_domain; From dccf3b267743493acdcb81d250537dce37028bd5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 19:53:49 +0100 Subject: [PATCH 07/71] sched: Add cpu capacity awareness to wakeup balancing Wakeup balancing is completely unaware of cpu capacity, cpu utilization and task utilization. The task is preferably placed on a cpu which is idle in the instant the wakeup happens. New tasks (SD_BALANCE_{FORK,EXEC} are placed on an idle cpu in the idlest group if such can be found, otherwise it goes on the least loaded one. Existing tasks (SD_BALANCE_WAKE) are placed on the previous cpu or an idle cpu sharing the same last level cache unless the wakee_flips heuristic in wake_wide() decides to fallback to considering cpus outside SD_LLC. Hence existing tasks are not guaranteed to get a chance to migrate to a different group at wakeup in case the current one has reduced cpu capacity (due RT/IRQ pressure or different uarch e.g. ARM big.LITTLE). They may eventually get pulled by other cpus doing periodic/idle/nohz_idle balance, but it may take quite a while before it happens. This patch adds capacity awareness to find_idlest_{group,queue} (used by SD_BALANCE_{FORK,EXEC} and SD_BALANCE_WAKE under certain circumstances) such that groups/cpus that can accommodate the waking task based on task utilization are preferred. In addition, wakeup of existing tasks (SD_BALANCE_WAKE) is sent through find_idlest_{group,queue} also if the task doesn't fit the capacity of the previous cpu to allow it to escape (override wake_affine) when necessary instead of relying on periodic/idle/nohz_idle balance to eventually sort it out. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6e7d36f2ed8f..5e112bba1b5a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4742,6 +4742,43 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) return 1; } +static inline unsigned long task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +static unsigned int capacity_margin = 1280; /* ~20% margin */ + +static inline bool __task_fits(struct task_struct *p, int cpu, int util) +{ + unsigned long capacity = capacity_of(cpu); + + util += task_util(p); + + return (capacity * 1024) > (util * capacity_margin); +} + +static inline bool task_fits_max(struct task_struct *p, int cpu) +{ + unsigned long capacity = capacity_of(cpu); + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + + if (capacity == max_capacity) + return true; + + if (capacity * capacity_margin > max_capacity * 1024) + return true; + + return __task_fits(p, cpu, 0); +} + +static int cpu_util(int cpu); + +static inline bool task_fits_spare(struct task_struct *p, int cpu) +{ + return __task_fits(p, cpu, cpu_util(cpu)); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -4751,7 +4788,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; + struct sched_group *fit_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long fit_capacity = ULONG_MAX; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4782,6 +4821,15 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load = target_load(i, load_idx); avg_load += load; + + /* + * Look for most energy-efficient group that can fit + * that can fit the task. + */ + if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) { + fit_capacity = capacity_of(i); + fit_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4795,6 +4843,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } } while (group = group->next, group != sd->groups); + if (fit_group) + return fit_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; @@ -4815,7 +4866,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { - if (idle_cpu(i)) { + if (task_fits_spare(p, i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -4827,7 +4878,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) min_exit_latency = idle->exit_latency; latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; - } else if ((!idle || idle->exit_latency == min_exit_latency) && + } else if (idle_cpu(i) && + (!idle || idle->exit_latency == min_exit_latency) && rq->idle_stamp > latest_idle_timestamp) { /* * If equal or no active idle state, then @@ -4836,6 +4888,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) */ latest_idle_timestamp = rq->idle_stamp; shallowest_idle_cpu = i; + } else if (shallowest_idle_cpu == -1) { + /* + * If we haven't found an idle CPU yet + * pick a non-idle one that can fit the task as + * fallback. + */ + shallowest_idle_cpu = i; } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(i); @@ -4950,7 +5009,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = !wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); rcu_read_lock(); for_each_domain(cpu, tmp) { From 8088e56399b9f71938924888bfca31d2b967fbff Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Mon, 6 Jul 2015 15:01:10 +0100 Subject: [PATCH 08/71] sched: Consider spare cpu capacity at task wake-up find_idlest_group() selects the wake-up target group purely based on group load which leads to suboptimal choices in low load scenarios. An idle group with reduced capacity (due to RT tasks or different cpu type) isn't necessarily a better target than a lightly loaded group with higher capacity. The patch adds spare capacity as an additional group selection parameter. The target group is now selected based on the following criteria: 1. Return the group with the cpu with most spare capacity and this capacity is significant if such group exists. Significant spare capacity is currently at least 20% to spare. 2. Return the group with the lowest load, unless it is the local group in which case NULL is returned and the search is continued at the next (lower) level. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5e112bba1b5a..bebc8367edee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4788,9 +4788,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int sd_flag) { struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *fit_group = NULL; + struct sched_group *fit_group = NULL, *spare_group = NULL; unsigned long min_load = ULONG_MAX, this_load = 0; unsigned long fit_capacity = ULONG_MAX; + unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE; int load_idx = sd->forkexec_idx; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -4798,7 +4799,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, load_idx = sd->wake_idx; do { - unsigned long load, avg_load; + unsigned long load, avg_load, spare_capacity; int local_group; int i; @@ -4830,6 +4831,16 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, fit_capacity = capacity_of(i); fit_group = group; } + + /* + * Look for group which has most spare capacity on a + * single cpu. + */ + spare_capacity = capacity_of(i) - cpu_util(i); + if (spare_capacity > max_spare_capacity) { + max_spare_capacity = spare_capacity; + spare_group = group; + } } /* Adjust by relative CPU capacity of the group */ @@ -4846,6 +4857,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (fit_group) return fit_group; + if (spare_group) + return spare_group; + if (!idlest || 100*this_load < imbalance*min_load) return NULL; return idlest; From 6e01e504ec553c47f12a0c9b87e98e9a14d2c200 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 26 Jan 2015 19:47:28 +0000 Subject: [PATCH 09/71] sched: Enable idle balance to pull single task towards cpu with higher capacity We do not want to miss out on the ability to pull a single remaining task from a potential source cpu towards an idle destination cpu. Add an extra criteria to need_active_balance() to kick off active load balance if the source cpu is over-utilized and has lower capacity than the destination cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bebc8367edee..fe40cdb8e640 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4779,6 +4779,11 @@ static inline bool task_fits_spare(struct task_struct *p, int cpu) return __task_fits(p, cpu, cpu_util(cpu)); } +static bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -6995,6 +7000,13 @@ static int need_active_balance(struct lb_env *env) return 1; } + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From 6d7558516fa7ba1db893fd909ab66e1d35021c09 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 2 Jul 2015 17:16:34 +0100 Subject: [PATCH 10/71] sched: Prevent unnecessary active balance of single task in sched group Scenarios with the busiest group having just one task and the local being idle on topologies with sched groups with different numbers of cpus manage to dodge all load-balance bailout conditions resulting the nr_balance_failed counter to be incremented. This eventually causes a pointless active migration of the task. This patch prevents this by not incrementing the counter when the busiest group only has one task. ASYM_PACKING migrations and migrations due to reduced capacity should still take place as these are explicitly captured by need_active_balance(). A better solution would be to not attempt the load-balance in the first place, but that requires significant changes to the order of bailout conditions and statistics gathering. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe40cdb8e640..632163ef7628 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5629,6 +5629,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -6591,6 +6592,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) @@ -7219,7 +7222,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { raw_spin_lock_irqsave(&busiest->lock, flags); From ee7ea859e3c8eb15faba30907c0a314d14d339f6 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:43:28 +0000 Subject: [PATCH 11/71] sched: Documentation for scheduler energy cost model This documentation patch provides an overview of the experimental scheduler energy costing model, associated data structures, and a reference recipe on how platforms can be characterized to derive energy models. Signed-off-by: Morten Rasmussen --- Documentation/scheduler/sched-energy.txt | 362 +++++++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 Documentation/scheduler/sched-energy.txt diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt new file mode 100644 index 000000000000..dab2f9088b33 --- /dev/null +++ b/Documentation/scheduler/sched-energy.txt @@ -0,0 +1,362 @@ +Energy cost model for energy-aware scheduling (EXPERIMENTAL) + +Introduction +============= + +The basic energy model uses platform energy data stored in sched_group_energy +data structures attached to the sched_groups in the sched_domain hierarchy. The +energy cost model offers two functions that can be used to guide scheduling +decisions: + +1. static unsigned int sched_group_energy(struct energy_env *eenv) +2. static int energy_diff(struct energy_env *eenv) + +sched_group_energy() estimates the energy consumed by all cpus in a specific +sched_group including any shared resources owned exclusively by this group of +cpus. Resources shared with other cpus are excluded (e.g. later level caches). + +energy_diff() estimates the total energy impact of a utilization change. That +is, adding, removing, or migrating utilization (tasks). + +Both functions use a struct energy_env to specify the scenario to be evaluated: + + struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; + }; + +sg_top: sched_group to be evaluated. Not used by energy_diff(). + +sg_cap: sched_group covering the cpus in the same frequency domain. Set by +sched_group_energy(). + +cap_idx: Capacity state to be used for energy calculations. Set by +find_new_capacity(). + +util_delta: Amount of utilization to be added, removed, or migrated. + +src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be +-1 if no source (e.g. task wake-up). + +dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1 +if utilization is removed (e.g. terminating tasks). + +energy: Result of sched_group_energy(). + +The metric used to represent utilization is the actual per-entity running time +averaged over time using a geometric series. Very similar to the existing +per-entity load-tracking, but _not_ scaled by task priority and capped by the +capacity of the cpu. The latter property does mean that utilization may +underestimate the compute requirements for task on fully/over utilized cpus. +The greatest potential for energy savings without affecting performance too much +is scenarios where the system isn't fully utilized. If the system is deemed +fully utilized load-balancing should be done with task load (includes task +priority) instead in the interest of fairness and performance. + + +Background and Terminology +=========================== + +To make it clear from the start: + +energy = [joule] (resource like a battery on powered devices) +power = energy/time = [joule/second] = [watt] + +The goal of energy-aware scheduling is to minimize energy, while still getting +the job done. That is, we want to maximize: + + performance [inst/s] + -------------------- + power [W] + +which is equivalent to minimizing: + + energy [J] + ----------- + instruction + +while still getting 'good' performance. It is essentially an alternative +optimization objective to the current performance-only objective for the +scheduler. This alternative considers two objectives: energy-efficiency and +performance. Hence, there needs to be a user controllable knob to switch the +objective. Since it is early days, this is currently a sched_feature +(ENERGY_AWARE). + +The idea behind introducing an energy cost model is to allow the scheduler to +evaluate the implications of its decisions rather than applying energy-saving +techniques blindly that may only have positive effects on some platforms. At +the same time, the energy cost model must be as simple as possible to minimize +the scheduler latency impact. + +Platform topology +------------------ + +The system topology (cpus, caches, and NUMA information, not peripherals) is +represented in the scheduler by the sched_domain hierarchy which has +sched_groups attached at each level that covers one or more cpus (see +sched-domains.txt for more details). To add energy awareness to the scheduler +we need to consider power and frequency domains. + +Power domain: + +A power domain is a part of the system that can be powered on/off +independently. Power domains are typically organized in a hierarchy where you +may be able to power down just a cpu or a group of cpus along with any +associated resources (e.g. shared caches). Powering up a cpu means that all +power domains it is a part of in the hierarchy must be powered up. Hence, it is +more expensive to power up the first cpu that belongs to a higher level power +domain than powering up additional cpus in the same high level domain. Two +level power domain hierarchy example: + + Power source + +-------------------------------+----... +per group PD G G + | +----------+ | + +--------+-------| Shared | (other groups) +per-cpu PD G G | resource | + | | +----------+ + +-------+ +-------+ + | CPU 0 | | CPU 1 | + +-------+ +-------+ + +Frequency domain: + +Frequency domains (P-states) typically cover the same group of cpus as one of +the power domain levels. That is, there might be several smaller power domains +sharing the same frequency (P-state) or there might be a power domain spanning +multiple frequency domains. + +From a scheduling point of view there is no need to know the actual frequencies +[Hz]. All the scheduler cares about is the compute capacity available at the +current state (P-state) the cpu is in and any other available states. For that +reason, and to also factor in any cpu micro-architecture differences, compute +capacity scaling states are called 'capacity states' in this document. For SMP +systems this is equivalent to P-states. For mixed micro-architecture systems +(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture +performance relative to the other cpus in the system. + +Energy modelling: +------------------ + +Due to the hierarchical nature of the power domains, the most obvious way to +model energy costs is therefore to associate power and energy costs with +domains (groups of cpus). Energy costs of shared resources are associated with +the group of cpus that share the resources, only the cost of powering the +cpu itself and any private resources (e.g. private L1 caches) is associated +with the per-cpu groups (lowest level). + +For example, for an SMP system with per-cpu power domains and a cluster level +(group of cpus) power domain we get the overall energy costs to be: + + energy = energy_cluster + n * energy_cpu + +where 'n' is the number of cpus powered up and energy_cluster is the cost paid +as soon as any cpu in the cluster is powered up. + +The power and frequency domains can naturally be mapped onto the existing +sched_domain hierarchy and sched_groups by adding the necessary data to the +existing data structures. + +The energy model considers energy consumption from two contributors (shown in +the illustration below): + +1. Busy energy: Energy consumed while a cpu and the higher level groups that it +belongs to are busy running tasks. Busy energy is associated with the state of +the cpu, not an event. The time the cpu spends in this state varies. Thus, the +most obvious platform parameter for this contribution is busy power +(energy/time). + +2. Idle energy: Energy consumed while a cpu and higher level groups that it +belongs to are idle (in a C-state). Like busy energy, idle energy is associated +with the state of the cpu. Thus, the platform parameter for this contribution +is idle power (energy/time). + +Energy consumed during transitions from an idle-state (C-state) to a busy state +(P-state) or going the other way is ignored by the model to simplify the energy +model calculations. + + + Power + ^ + | busy->idle idle->busy + | transition transition + | + | _ __ + | / \ / \__________________ + |______________/ \ / + | \ / + | Busy \ Idle / Busy + | low P-state \____________/ high P-state + | + +------------------------------------------------------------> time + +Busy |--------------| |-----------------| + +Wakeup |------| |------| + +Idle |------------| + + +The basic algorithm +==================== + +The basic idea is to determine the total energy impact when utilization is +added or removed by estimating the impact at each level in the sched_domain +hierarchy starting from the bottom (sched_group contains just a single cpu). +The energy cost comes from busy time (sched_group is awake because one or more +cpus are busy) and idle time (in an idle-state). Energy model numbers account +for energy costs associated with all cpus in the sched_group as a group. + + for_each_domain(cpu, sd) { + sg = sched_group_of(cpu) + energy_before = curr_util(sg) * busy_power(sg) + + (1-curr_util(sg)) * idle_power(sg) + energy_after = new_util(sg) * busy_power(sg) + + (1-new_util(sg)) * idle_power(sg) + energy_diff += energy_before - energy_after + + } + + return energy_diff + +{curr, new}_util: The cpu utilization at the lowest level and the overall +non-idle time for the entire group for higher levels. Utilization is in the +range 0.0 to 1.0 in the pseudo-code. + +busy_power: The power consumption of the sched_group. + +idle_power: The power consumption of the sched_group when idle. + +Note: It is a fundamental assumption that the utilization is (roughly) scale +invariant. Task utilization tracking factors in any frequency scaling and +performance scaling differences due to difference cpu microarchitectures such +that task utilization can be used across the entire system. + + +Platform energy data +===================== + +struct sched_group_energy can be attached to sched_groups in the sched_domain +hierarchy and has the following members: + +cap_states: + List of struct capacity_state representing the supported capacity states + (P-states). struct capacity_state has two members: cap and power, which + represents the compute capacity and the busy_power of the state. The + list must be ordered by capacity low->high. + +nr_cap_states: + Number of capacity states in cap_states list. + +idle_states: + List of struct idle_state containing idle_state power cost for each + idle-state supported by the system orderd by shallowest state first. + All states must be included at all level in the hierarchy, i.e. a + sched_group spanning just a single cpu must also include coupled + idle-states (cluster states). In addition to the cpuidle idle-states, + the list must also contain an entry for the idling using the arch + default idle (arch_idle_cpu()). Despite this state may not be a true + hardware idle-state it is considered the shallowest idle-state in the + energy model and must be the first entry. cpus may enter this state + (possibly 'active idling') if cpuidle decides not enter a cpuidle + idle-state. Default idle may not be used when cpuidle is enabled. + In this case, it should just be a copy of the first cpuidle idle-state. + +nr_idle_states: + Number of idle states in idle_states list. + +There are no unit requirements for the energy cost data. Data can be normalized +with any reference, however, the normalization must be consistent across all +energy cost data. That is, one bogo-joule/watt must be the same quantity for +data, but we don't care what it is. + +A recipe for platform characterization +======================================= + +Obtaining the actual model data for a particular platform requires some way of +measuring power/energy. There isn't a tool to help with this (yet). This +section provides a recipe for use as reference. It covers the steps used to +characterize the ARM TC2 development platform. This sort of measurements is +expected to be done anyway when tuning cpuidle and cpufreq for a given +platform. + +The energy model needs two types of data (struct sched_group_energy holds +these) for each sched_group where energy costs should be taken into account: + +1. Capacity state information + +A list containing the compute capacity and power consumption when fully +utilized attributed to the group as a whole for each available capacity state. +At the lowest level (group contains just a single cpu) this is the power of the +cpu alone without including power consumed by resources shared with other cpus. +It basically needs to fit the basic modelling approach described in "Background +and Terminology" section: + + energy_system = energy_shared + n * energy_cpu + +for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at +the lowest level. 'energy_shared' is included at the next level which +represents the group of cpus among which the resources are shared. + +This model is, of course, a simplification of reality. Thus, power/energy +attributions might not always exactly represent how the hardware is designed. +Also, busy power is likely to depend on the workload. It is therefore +recommended to use a representative mix of workloads when characterizing the +capacity states. + +If the group has no capacity scaling support, the list will contain a single +state where power is the busy power attributed to the group. The capacity +should be set to a default value (1024). + +When frequency domains include multiple power domains, the group representing +the frequency domain and all child groups share capacity states. This must be +indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at +all levels that share the capacity state must have the list of capacity states +with the power set to the contribution of the individual group. + +2. Idle power information + +Stored in the idle_states list. The power number is the group idle power +consumption in each idle state as well when the group is idle but has not +entered an idle-state ('active idle' as mentioned earlier). Due to the way the +energy model is defined, the idle power of the deepest group idle state can +alternatively be accounted for in the parent group busy power. In that case the +group idle state power values are offset such that the idle power of the +deepest state is zero. It is less intuitive, but it is easier to measure as +idle power consumed by the group and the busy/idle power of the parent group +cannot be distinguished without per group measurement points. + +Measuring capacity states and idle power: + +The capacity states' capacity and power can be estimated by running a benchmark +workload at each available capacity state. By restricting the benchmark to run +on subsets of cpus it is possible to extrapolate the power consumption of +shared resources. + +ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a +shared L2 cache. TC2 has on-chip energy counters per cluster. Running a +benchmark workload on just one cpu in a cluster means that power is consumed in +the cluster (higher level group) and a single cpu (lowest level group). Adding +another benchmark task to another cpu increases the power consumption by the +amount consumed by the additional cpu. Hence, it is possible to extrapolate the +cluster busy power. + +For platforms that don't have energy counters or equivalent instrumentation +built-in, it may be possible to use an external DAQ to acquire similar data. + +If the benchmark includes some performance score (for example sysbench cpu +benchmark), this can be used to record the compute capacity. + +Measuring idle power requires insight into the idle state implementation on the +particular platform. Specifically, if the platform has coupled idle-states (or +package states). To measure non-coupled per-cpu idle-states it is necessary to +keep one cpu busy to keep any shared resources alive to isolate the idle power +of the cpu from idle/busy power of the shared resources. The cpu can be tricked +into different per-cpu idle states by disabling the other states. Based on +various combinations of measurements with specific cpus busy and disabling +idle-states it is possible to extrapolate the idle-state power. From c27797804c7ef9f14072c3f7a898ed681036e2d7 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:45:51 +0000 Subject: [PATCH 12/71] sched: Make energy awareness a sched feature This patch introduces the ENERGY_AWARE sched feature, which is implemented using jump labels when SCHED_DEBUG is defined. It is statically set false when SCHED_DEBUG is not defined. Hence this doesn't allow energy awareness to be enabled without SCHED_DEBUG. This sched_feature knob will be replaced later with a more appropriate control knob when things have matured a bit. ENERGY_AWARE is based on per-entity load-tracking hence FAIR_GROUP_SCHED must be enable. This dependency isn't checked at compile time yet. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++++ kernel/sched/features.h | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 632163ef7628..efc65c3db6b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,11 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +static inline bool energy_aware(void) +{ + return sched_feat(ENERGY_AWARE); +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 69631fa46c2f..b634151ce286 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -69,3 +69,8 @@ SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) SCHED_FEAT(ATTACH_AGE_LOAD, true) +/* + * Energy aware scheduling. Use platform energy model to guide scheduling + * decisions optimizing for energy efficiency. + */ +SCHED_FEAT(ENERGY_AWARE, false) From 799fe13ed50f843ad190f2592989e9d0f21977ea Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:08:45 +0000 Subject: [PATCH 13/71] sched: Introduce energy data structures The struct sched_group_energy represents the per sched_group related data which is needed for energy aware scheduling. It contains: (1) number of elements of the idle state array (2) pointer to the idle state array which comprises 'power consumption' for each idle state (3) number of elements of the capacity state array (4) pointer to the capacity state array which comprises 'compute capacity and power consumption' tuples for each capacity state The struct sched_group obtains a pointer to a struct sched_group_energy. The function pointer sched_domain_energy_f is introduced into struct sched_domain_topology_level which will allow the arch to pass a particular struct sched_group_energy from the topology shim layer into the scheduler core. The function pointer sched_domain_energy_f has an 'int cpu' parameter since the folding of two adjacent sd levels via sd degenerate doesn't work for all sd levels. I.e. it is not possible for example to use this feature to provide per-cpu energy in sd level DIE on ARM's TC2 platform. It was discussed that the folding of sd levels approach is preferable over the cpu parameter approach, simply because the user (the arch specifying the sd topology table) can introduce less errors. But since it is not working, the 'int cpu' parameter is the only way out. It's possible to use the folding of sd levels approach for sched_domain_flags_f and the cpu parameter approach for the sched_domain_energy_f at the same time though. With the use of the 'int cpu' parameter, an extra check function has to be provided to make sure that all cpus spanned by a sched group are provisioned with the same energy data. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- include/linux/sched.h | 19 +++++++++++++++++++ kernel/sched/sched.h | 1 + 2 files changed, 20 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 21a6e9649012..2b809c9ffefe 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1022,6 +1022,22 @@ struct sched_domain_attr { extern int sched_domain_level_max; +struct capacity_state { + unsigned long cap; /* compute capacity */ + unsigned long power; /* power consumption at this compute capacity */ +}; + +struct idle_state { + unsigned long power; /* power consumption in this idle state */ +}; + +struct sched_group_energy { + unsigned int nr_idle_states; /* number of idle states */ + struct idle_state *idle_states; /* ptr to idle state array */ + unsigned int nr_cap_states; /* number of capacity states */ + struct capacity_state *cap_states; /* ptr to capacity state array */ +}; + struct sched_group; struct sched_domain { @@ -1120,6 +1136,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); +typedef +const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu); #define SDTL_OVERLAP 0x01 @@ -1132,6 +1150,7 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; + sched_domain_energy_f energy; int flags; int numa_level; struct sd_data data; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2b7ffa5a20ad..1813cba2995d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -863,6 +863,7 @@ struct sched_group { unsigned int group_weight; struct sched_group_capacity *sgc; + const struct sched_group_energy const *sge; /* * The CPUs this group covers. From f1d6884d8ae852924811e88c700a42b76f50cb20 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:20:20 +0000 Subject: [PATCH 14/71] sched: Initialize energy data structures The sched_group_energy (sge) pointer of the first sched_group (sg) in the sched_domain (sd) is initialized to point to the appropriate (in terms of sd level and cpu) sge data defined in the arch and so to the correct part of the Energy Model (EM). Energy-aware scheduling allows that a system has only EM data up to a certain sd level (so called highest energy aware balancing sd level). A check in init_sched_energy() enforces that all sd's below this sd level contain EM data. The 'int cpu' parameter of sched_domain_energy_f requires that check_sched_energy_data() makes sure that all cpus spanned by a sg are provisioned with the same EM data. This patch has also been tested with feature FORCE_SD_OVERLAP enabled. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 879bdffc928b..6cba963436ba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6304,6 +6304,66 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); } +/* + * Check that the per-cpu provided sd energy data is consistent for all cpus + * within the mask. + */ +static inline void check_sched_energy_data(int cpu, sched_domain_energy_f fn, + const struct cpumask *cpumask) +{ + const struct sched_group_energy * const sge = fn(cpu); + struct cpumask mask; + int i; + + if (cpumask_weight(cpumask) <= 1) + return; + + cpumask_xor(&mask, cpumask, get_cpu_mask(cpu)); + + for_each_cpu(i, &mask) { + const struct sched_group_energy * const e = fn(i); + int y; + + BUG_ON(e->nr_idle_states != sge->nr_idle_states); + + for (y = 0; y < (e->nr_idle_states); y++) { + BUG_ON(e->idle_states[y].power != + sge->idle_states[y].power); + } + + BUG_ON(e->nr_cap_states != sge->nr_cap_states); + + for (y = 0; y < (e->nr_cap_states); y++) { + BUG_ON(e->cap_states[y].cap != sge->cap_states[y].cap); + BUG_ON(e->cap_states[y].power != + sge->cap_states[y].power); + } + } +} + +static void init_sched_energy(int cpu, struct sched_domain *sd, + sched_domain_energy_f fn) +{ + if (!(fn && fn(cpu))) + return; + + if (cpu != group_balance_cpu(sd->groups)) + return; + + if (sd->child && !sd->child->groups->sge) { + pr_err("BUG: EAS setup broken for CPU%d\n", cpu); +#ifdef CONFIG_SCHED_DEBUG + pr_err(" energy data on %s but not on %s domain\n", + sd->name, sd->child->name); +#endif + return; + } + + check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups)); + + sd->groups->sge = fn(cpu); +} + /* * Initializers for schedule domains * Non-inlined to reduce accumulated stack pressure in build_sched_domains() @@ -7010,10 +7070,13 @@ static int build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU capacity for physical packages and nodes */ for (i = nr_cpumask_bits-1; i >= 0; i--) { + struct sched_domain_topology_level *tl = sched_domain_topology; + if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) { + init_sched_energy(i, sd, tl->energy); claim_allocations(i, sd); init_sched_groups_capacity(i, sd); } From 4a51b7abe3de01d2cf09d85d65dd3aa8f73b2861 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 13:50:46 +0000 Subject: [PATCH 15/71] sched: Introduce SD_SHARE_CAP_STATES sched_domain flag cpufreq is currently keeping it a secret which cpus are sharing clock source. The scheduler needs to know about clock domains as well to become more energy aware. The SD_SHARE_CAP_STATES domain flag indicates whether cpus belonging to the sched_domain share capacity states (P-states). There is no connection with cpufreq (yet). The flag must be set by the arch specific topology code. cc: Russell King cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- arch/arm/kernel/topology.c | 3 ++- include/linux/sched.h | 1 + kernel/sched/core.c | 10 +++++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 614554765e44..38e7be162b79 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -277,7 +277,8 @@ void store_cpu_topology(unsigned int cpuid) static inline int cpu_corepower_flags(void) { - return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN; + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; } static struct sched_domain_topology_level arm_topology[] = { diff --git a/include/linux/sched.h b/include/linux/sched.h index 2b809c9ffefe..e84fd00640fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -990,6 +990,7 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ #define SD_NUMA 0x4000 /* cross-node balancing */ +#define SD_SHARE_CAP_STATES 0x8000 /* Domain members share capacity state */ #ifdef CONFIG_SCHED_SMT static inline int cpu_smt_flags(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6cba963436ba..aecaaff17a45 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5775,7 +5775,8 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | - SD_SHARE_POWERDOMAIN)) { + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES)) { if (sd->groups != sd->groups->next) return 0; } @@ -5807,7 +5808,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | - SD_SHARE_POWERDOMAIN); + SD_SHARE_POWERDOMAIN | + SD_SHARE_CAP_STATES); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -6472,6 +6474,7 @@ static int sched_domains_curr_level; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain + * SD_SHARE_CAP_STATES - describes shared capacity states * * Odd one out: * SD_ASYM_PACKING - describes SMT quirks @@ -6481,7 +6484,8 @@ static int sched_domains_curr_level; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_SHARE_POWERDOMAIN) + SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, int cpu) From 2d7f52930c90956ab122b317d5a5b85b7a09013d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 10 Jul 2015 13:57:19 +0100 Subject: [PATCH 16/71] arm: Cpu invariant scheduler load-tracking and capacity support Provides the scheduler with a cpu scaling correction factor for more accurate load-tracking and cpu capacity handling. The Energy Model (EM) (in fact the capacity value of the last element of the capacity states vector of the core (MC) level sched_group_energy structure) is used instead of the arm arch specific cpu_efficiency and dtb property 'clock-frequency' values as the source for this cpu scaling factor. The cpu capacity value depends on the micro-architecture and the maximum frequency of the cpu. The maximum frequency part should not be confused with the frequency invariant scheduler load-tracking support which deals with frequency related scaling due to DFVS functionality. Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 38e7be162b79..da1c611a3b5e 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -153,6 +153,8 @@ static void __init parse_dt_topology(void) } +static const struct sched_group_energy * const cpu_core_energy(int cpu); + /* * Look for a customed capacity of a CPU in the cpu_capacity table during the * boot. The update of all CPUs is in O(n^2) for heteregeneous system but the @@ -160,10 +162,14 @@ static void __init parse_dt_topology(void) */ static void update_cpu_capacity(unsigned int cpu) { - if (!cpu_capacity(cpu)) - return; + unsigned long capacity = SCHED_CAPACITY_SCALE; - set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity); + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); pr_info("CPU%u: update cpu_capacity %lu\n", cpu, arch_scale_cpu_capacity(NULL, cpu)); From bdd6c49fb4ea1d9072abb886290f9187807cdf73 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 11:53:48 +0100 Subject: [PATCH 17/71] arm64: Cpu invariant scheduler load-tracking and capacity support Provides the scheduler with a cpu scaling correction factor for more accurate load-tracking and cpu capacity handling. The Energy Model (EM) (in fact the capacity value of the last element of the capacity states vector of the core (MC) level sched_group_energy structure) is used as the source for this cpu scaling factor. The cpu capacity value depends on the micro-architecture and the maximum frequency of the cpu. The maximum frequency part should not be confused with the frequency invariant scheduler load-tracking support which deals with frequency related scaling due to DFVS functionality. Signed-off-by: Juri Lelli Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 4 ++- arch/arm64/kernel/topology.c | 42 +++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 72c47c3fa7b3..9370a336c934 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -22,11 +22,13 @@ void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); +struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity -struct sched_domain; extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); #endif +#define arch_scale_cpu_capacity scale_cpu_capacity +extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); #include diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..fb99a6735fd4 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -23,6 +23,18 @@ #include #include +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; + +unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -211,6 +223,35 @@ const struct cpumask *cpu_coregroup_mask(int cpu) return &cpu_topology[cpu].core_sibling; } +static inline int cpu_corepower_flags(void) +{ + return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN | \ + SD_SHARE_CAP_STATES; +} + +static struct sched_domain_topology_level arm64_topology[] = { +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + +static void update_cpu_capacity(unsigned int cpu) +{ + unsigned long capacity = SCHED_CAPACITY_SCALE; + + if (cpu_core_energy(cpu)) { + int max_cap_idx = cpu_core_energy(cpu)->nr_cap_states - 1; + capacity = cpu_core_energy(cpu)->cap_states[max_cap_idx].cap; + } + + set_capacity_scale(cpu, capacity); + + pr_info("CPU%d: update cpu_capacity %lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); +} + static void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; @@ -272,6 +313,7 @@ void store_cpu_topology(unsigned int cpuid) topology_populated: update_siblings_masks(cpuid); + update_cpu_capacity(cpuid); } static void __init reset_cpu_topology(void) From 2596462e0ae2e4104e8b5cceb69faeaff7401600 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 13 Jan 2015 14:11:28 +0000 Subject: [PATCH 18/71] sched: Compute cpu capacity available at current frequency capacity_orig_of() returns the max available compute capacity of a cpu. For scale-invariant utilization tracking and energy-aware scheduling decisions it is useful to know the compute capacity available at the current OPP of a cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efc65c3db6b5..ea966d9193cd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4651,6 +4651,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); From e416e248104ffc575fcce258fee8246f1cd6dba1 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 11 Dec 2014 15:25:29 +0000 Subject: [PATCH 19/71] sched: Relocated cpu_util() and change return type Move cpu_util() to an earlier position in fair.c and change return type to unsigned long as negative usage doesn't make much sense. All other load and capacity related functions use unsigned long including the caller of cpu_util(). cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 70 ++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ea966d9193cd..318141042a3e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4662,6 +4662,40 @@ static unsigned long capacity_curr_of(int cpu) >> SCHED_CAPACITY_SHIFT; } +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static unsigned long cpu_util(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + return (util >= capacity) ? capacity : util; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); @@ -4788,8 +4822,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } -static int cpu_util(int cpu); - static inline bool task_fits_spare(struct task_struct *p, int cpu) { return __task_fits(p, cpu, cpu_util(cpu)); @@ -4988,40 +5020,6 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static int cpu_util(int cpu) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - return (util >= capacity) ? capacity : util; -} - /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, From 184b138bb77183ccd44e0bcb3716d92532eda2d3 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 17:08:52 +0000 Subject: [PATCH 20/71] sched: Highest energy aware balancing sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_ea, points to the highest level at which energy model is provided. At this level and all levels below all sched_groups have energy model data attached. Partial energy model information is possible but restricted to providing energy model data for lower level sched_domains (sd_ea and below) and leaving load-balancing on levels above to non-energy-aware load-balancing. For example, it is possible to apply energy-aware scheduling within each socket on a multi-socket system and let normal scheduling handle load-balancing between sockets. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 11 ++++++++++- kernel/sched/sched.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index aecaaff17a45..9d82f4c47e3b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5993,11 +5993,12 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_PER_CPU(struct sched_domain *, sd_ea); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; + struct sched_domain *busy_sd = NULL, *ea_sd = NULL; int id = cpu; int size = 1; @@ -6018,6 +6019,14 @@ static void update_top_cache_domain(int cpu) sd = highest_flag_domain(cpu, SD_ASYM_PACKING); rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); + + for_each_domain(cpu, sd) { + if (sd->groups->sge) + ea_sd = sd; + else + break; + } + rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1813cba2995d..a77516a15ba0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -839,6 +839,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +DECLARE_PER_CPU(struct sched_domain *, sd_ea); struct sched_group_capacity { atomic_t ref; From 7a31b5cce774cd17698b5062061c5a2522e1bdd7 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 18 Dec 2014 14:47:18 +0000 Subject: [PATCH 21/71] sched: Calculate energy consumption of sched_group For energy-aware load-balancing decisions it is necessary to know the energy consumption estimates of groups of cpus. This patch introduces a basic function, sched_group_energy(), which estimates the energy consumption of the cpus in the group and any resources shared by the members of the group. NOTE: The function has five levels of identation and breaks the 80 character limit. Refactoring is necessary. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 4 ++ kernel/sched/fair.c | 156 +++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 1 + 3 files changed, 161 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9d82f4c47e3b..a8f33eedbe8f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5994,6 +5994,7 @@ DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); DEFINE_PER_CPU(struct sched_domain *, sd_ea); +DEFINE_PER_CPU(struct sched_domain *, sd_scs); static void update_top_cache_domain(int cpu) { @@ -6027,6 +6028,9 @@ static void update_top_cache_domain(int cpu) break; } rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd); + + sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES); + rcu_assign_pointer(per_cpu(sd_scs, cpu), sd); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 318141042a3e..49f7567c6b07 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4701,6 +4701,162 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +/* + * cpu_norm_util() returns the cpu util relative to a specific capacity, + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for + * energy calculations. Using the scale-invariant util returned by + * cpu_util() and approximating scale-invariant util by: + * + * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time + * + * the normalized util can be found using the specific capacity. + * + * capacity = capacity_orig * curr_freq/max_freq + * + * norm_util = running_time/time ~ util/capacity + */ +static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +{ + int util = cpu_util(cpu); + + if (util >= capacity) + return SCHED_CAPACITY_SCALE; + + return (util << SCHED_CAPACITY_SHIFT)/capacity; +} + +static unsigned long group_max_util(struct sched_group *sg) +{ + int i; + unsigned long max_util = 0; + + for_each_cpu(i, sched_group_cpus(sg)) + max_util = max(max_util, cpu_util(i)); + + return max_util; +} + +/* + * group_norm_util() returns the approximated group util relative to it's + * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in + * energy calculations. Since task executions may or may not overlap in time in + * the group the true normalized util is between max(cpu_norm_util(i)) and + * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The + * latter is used as the estimate as it leads to a more pessimistic energy + * estimate (more busy). + */ +static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +{ + int i; + unsigned long util_sum = 0; + unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + + for_each_cpu(i, sched_group_cpus(sg)) + util_sum += cpu_norm_util(i, capacity); + + if (util_sum > SCHED_CAPACITY_SCALE) + return SCHED_CAPACITY_SCALE; + return util_sum; +} + +static int find_new_capacity(struct sched_group *sg, + const struct sched_group_energy const *sge) +{ + int idx; + unsigned long util = group_max_util(sg); + + for (idx = 0; idx < sge->nr_cap_states; idx++) { + if (sge->cap_states[idx].cap >= util) + return idx; + } + + return idx; +} + +/* + * sched_group_energy(): Computes the absolute energy consumption of cpus + * belonging to the sched_group including shared resources shared only by + * members of the group. Iterates over all cpus in the hierarchy below the + * sched_group starting from the bottom working it's way up before going to + * the next cpu until all cpus are covered at all levels. The current + * implementation is likely to gather the same util statistics multiple times. + * This can probably be done in a faster but more complex way. + * Note: sched_group_energy() may fail when racing with sched_domain updates. + */ +static int sched_group_energy(struct sched_group *sg_top) +{ + struct sched_domain *sd; + int cpu, total_energy = 0; + struct cpumask visit_cpus; + struct sched_group *sg; + + WARN_ON(!sg_top->sge); + + cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + + while (!cpumask_empty(&visit_cpus)) { + struct sched_group *sg_shared_cap = NULL; + + cpu = cpumask_first(&visit_cpus); + + /* + * Is the group utilization affected by cpus outside this + * sched_group? + */ + sd = rcu_dereference(per_cpu(sd_scs, cpu)); + + if (!sd) + /* + * We most probably raced with hotplug; returning a + * wrong energy estimation is better than entering an + * infinite loop. + */ + return -EINVAL; + + if (sd->parent) + sg_shared_cap = sd->parent->groups; + + for_each_domain(cpu, sd) { + sg = sd->groups; + + /* Has this sched_domain already been visited? */ + if (sd->child && group_first_cpu(sg) != cpu) + break; + + do { + struct sched_group *sg_cap_util; + unsigned long group_util; + int sg_busy_energy, sg_idle_energy, cap_idx; + + if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) + sg_cap_util = sg_shared_cap; + else + sg_cap_util = sg; + + cap_idx = find_new_capacity(sg_cap_util, sg->sge); + group_util = group_norm_util(sg, cap_idx); + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) + >> SCHED_CAPACITY_SHIFT; + + total_energy += sg_busy_energy + sg_idle_energy; + + if (!sd->child) + cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); + + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + goto next_cpu; + + } while (sg = sg->next, sg != sd->groups); + } +next_cpu: + continue; + } + + return total_energy; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a77516a15ba0..a618db2936d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -840,6 +840,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_busy); DECLARE_PER_CPU(struct sched_domain *, sd_asym); DECLARE_PER_CPU(struct sched_domain *, sd_ea); +DECLARE_PER_CPU(struct sched_domain *, sd_scs); struct sched_group_capacity { atomic_t ref; From 4a535730fb7c9afad6b0bbc96dc8dc3d1f6ae0ed Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 2 Jan 2015 14:21:56 +0000 Subject: [PATCH 22/71] sched: Extend sched_group_energy to test load-balancing decisions Extended sched_group_energy() to support energy prediction with usage (tasks) added/removed from a specific cpu or migrated between a pair of cpus. Useful for load-balancing decision making. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 49f7567c6b07..f3de37414a66 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4688,12 +4688,21 @@ static unsigned long capacity_curr_of(int cpu) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static unsigned long cpu_util(int cpu) +static unsigned long __cpu_util(int cpu, int delta) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); - return (util >= capacity) ? capacity : util; + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); } static inline bool energy_aware(void) @@ -4701,8 +4710,18 @@ static inline bool energy_aware(void) return sched_feat(ENERGY_AWARE); } +struct energy_env { + struct sched_group *sg_top; + struct sched_group *sg_cap; + int cap_idx; + int util_delta; + int src_cpu; + int dst_cpu; + int energy; +}; + /* - * cpu_norm_util() returns the cpu util relative to a specific capacity, + * __cpu_norm_util() returns the cpu util relative to a specific capacity, * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for * energy calculations. Using the scale-invariant util returned by * cpu_util() and approximating scale-invariant util by: @@ -4715,9 +4734,9 @@ static inline bool energy_aware(void) * * norm_util = running_time/time ~ util/capacity */ -static unsigned long cpu_norm_util(int cpu, unsigned long capacity) +static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) { - int util = cpu_util(cpu); + int util = __cpu_util(cpu, delta); if (util >= capacity) return SCHED_CAPACITY_SCALE; @@ -4725,13 +4744,25 @@ static unsigned long cpu_norm_util(int cpu, unsigned long capacity) return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static unsigned long group_max_util(struct sched_group *sg) +static int calc_util_delta(struct energy_env *eenv, int cpu) { - int i; + if (cpu == eenv->src_cpu) + return -eenv->util_delta; + if (cpu == eenv->dst_cpu) + return eenv->util_delta; + return 0; +} + +static +unsigned long group_max_util(struct energy_env *eenv) +{ + int i, delta; unsigned long max_util = 0; - for_each_cpu(i, sched_group_cpus(sg)) - max_util = max(max_util, cpu_util(i)); + for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { + delta = calc_util_delta(eenv, i); + max_util = max(max_util, __cpu_util(i, delta)); + } return max_util; } @@ -4745,31 +4776,36 @@ static unsigned long group_max_util(struct sched_group *sg) * latter is used as the estimate as it leads to a more pessimistic energy * estimate (more busy). */ -static unsigned long group_norm_util(struct sched_group *sg, int cap_idx) +static unsigned +long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i; + int i, delta; unsigned long util_sum = 0; - unsigned long capacity = sg->sge->cap_states[cap_idx].cap; + unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; - for_each_cpu(i, sched_group_cpus(sg)) - util_sum += cpu_norm_util(i, capacity); + for_each_cpu(i, sched_group_cpus(sg)) { + delta = calc_util_delta(eenv, i); + util_sum += __cpu_norm_util(i, capacity, delta); + } if (util_sum > SCHED_CAPACITY_SCALE) return SCHED_CAPACITY_SCALE; return util_sum; } -static int find_new_capacity(struct sched_group *sg, +static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy const *sge) { int idx; - unsigned long util = group_max_util(sg); + unsigned long util = group_max_util(eenv); for (idx = 0; idx < sge->nr_cap_states; idx++) { if (sge->cap_states[idx].cap >= util) - return idx; + break; } + eenv->cap_idx = idx; + return idx; } @@ -4783,16 +4819,16 @@ static int find_new_capacity(struct sched_group *sg, * This can probably be done in a faster but more complex way. * Note: sched_group_energy() may fail when racing with sched_domain updates. */ -static int sched_group_energy(struct sched_group *sg_top) +static int sched_group_energy(struct energy_env *eenv) { struct sched_domain *sd; int cpu, total_energy = 0; struct cpumask visit_cpus; struct sched_group *sg; - WARN_ON(!sg_top->sge); + WARN_ON(!eenv->sg_top->sge); - cpumask_copy(&visit_cpus, sched_group_cpus(sg_top)); + cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top)); while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; @@ -4824,17 +4860,16 @@ static int sched_group_energy(struct sched_group *sg_top) break; do { - struct sched_group *sg_cap_util; unsigned long group_util; int sg_busy_energy, sg_idle_energy, cap_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) - sg_cap_util = sg_shared_cap; + eenv->sg_cap = sg_shared_cap; else - sg_cap_util = sg; + eenv->sg_cap = sg; - cap_idx = find_new_capacity(sg_cap_util, sg->sge); - group_util = group_norm_util(sg, cap_idx); + cap_idx = find_new_capacity(eenv, sg->sge); + group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) >> SCHED_CAPACITY_SHIFT; sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) @@ -4845,7 +4880,7 @@ static int sched_group_energy(struct sched_group *sg_top) if (!sd->child) cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg)); - if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top))) + if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top))) goto next_cpu; } while (sg = sg->next, sg != sd->groups); @@ -4854,7 +4889,8 @@ static int sched_group_energy(struct sched_group *sg_top) continue; } - return total_energy; + eenv->energy = total_energy; + return 0; } /* From 9aaa5bace5bb94b0d0c0d0a0f6ee031e74b87245 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 6 Jan 2015 17:34:05 +0000 Subject: [PATCH 23/71] sched: Estimate energy impact of scheduling decisions Adds a generic energy-aware helper function, energy_diff(), that calculates energy impact of adding, removing, and migrating utilization in the system. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3de37414a66..043715612227 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4893,6 +4893,58 @@ static int sched_group_energy(struct energy_env *eenv) return 0; } +static inline bool cpu_in_sg(struct sched_group *sg, int cpu) +{ + return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); +} + +/* + * energy_diff(): Estimate the energy impact of changing the utilization + * distribution. eenv specifies the change: utilisation amount, source, and + * destination cpu. Source or destination cpu may be -1 in which case the + * utilization is removed from or added to the system (e.g. task wake-up). If + * both are specified, the utilization is migrated. + */ +static int energy_diff(struct energy_env *eenv) +{ + struct sched_domain *sd; + struct sched_group *sg; + int sd_cpu = -1, energy_before = 0, energy_after = 0; + + struct energy_env eenv_before = { + .util_delta = 0, + .src_cpu = eenv->src_cpu, + .dst_cpu = eenv->dst_cpu, + }; + + if (eenv->src_cpu == eenv->dst_cpu) + return 0; + + sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu; + sd = rcu_dereference(per_cpu(sd_ea, sd_cpu)); + + if (!sd) + return 0; /* Error */ + + sg = sd->groups; + + do { + if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) { + eenv_before.sg_top = eenv->sg_top = sg; + + if (sched_group_energy(&eenv_before)) + return 0; /* Invalid result abort */ + energy_before += eenv_before.energy; + + if (sched_group_energy(eenv)) + return 0; /* Invalid result abort */ + energy_after += eenv->energy; + } + } while (sg = sg->next, sg != sd->groups); + + return energy_after-energy_before; +} + /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * A waker of many should wake a different task than the one last awakened From a172e650694734ea2ae277bc414779e0b6fe1044 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 16:49:57 +0100 Subject: [PATCH 24/71] sched: Add over-utilization/tipping point indicator Energy-aware scheduling is only meant to be active while the system is _not_ over-utilized. That is, there are spare cycles available to shift tasks around based on their actual utilization to get a more energy-efficient task distribution without depriving any tasks. When above the tipping point task placement is done the traditional way based on load_avg, spreading the tasks across as many cpus as possible based on priority scaled load to preserve smp_nice. Below the tipping point we want to use util_avg instead. We need to define a criteria for when we make the switch. The util_avg for each cpu converges towards 100% (1024) regardless of how many task additional task we may put on it. If we define over-utilized as: sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity) some individual cpus may be over-utilized running multiple tasks even when the above condition is false. That should be okay as long as we try to spread the tasks out to avoid per-cpu over-utilization as much as possible and if all tasks have the _same_ priority. If the latter isn't true, we have to consider priority to preserve smp_nice. For example, we could have n_cpus nice=-10 util_avg=55% tasks and n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks getting their own as we 1.5*n_cpus tasks in total and 55%+55% is less over-utilized than 55%+60% for those cpus that have to be shared. The system utilization is only 85% of the system capacity, but we are breaking smp_nice. To be sure not to break smp_nice, we have defined over-utilization conservatively as when any cpu in the system is fully utilized at it's highest frequency instead: cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg to factor in priority to preserve smp_nice. With this definition, we can skip periodic load-balance as no cpu has an always-running task when the system is not over-utilized. All tasks will be periodic and we can balance them at wake-up. This conservative condition does however mean that some scenarios that could benefit from energy-aware decisions even if one cpu is fully utilized would not get those benefits. For system where some cpus might have reduced capacity on some cpus (RT-pressure and/or big.LITTLE), we want periodic load-balance checks as soon a just a single cpu is fully utilized as it might one of those with reduced capacity and in that case we want to migrate it. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 31 +++++++++++++++++++++++++------ kernel/sched/sched.h | 3 +++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 043715612227..3cfb4be3d88f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static bool cpu_overutilized(int cpu); + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -4154,6 +4156,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int task_new = !(flags & ENQUEUE_WAKEUP); for_each_sched_entity(se) { if (se->on_rq) @@ -4185,9 +4188,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { add_nr_running(rq, 1); - + if (!task_new && !rq->rd->overutilized && + cpu_overutilized(rq->cpu)) + rq->rd->overutilized = true; + } hrtick_update(rq); } @@ -6651,11 +6657,12 @@ group_type group_classify(struct sched_group *group, * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. + * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs, - bool *overload) + bool *overload, bool *overutilized) { unsigned long load; int i; @@ -6685,6 +6692,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_weighted_load += weighted_cpuload(i); if (idle_cpu(i)) sgs->idle_cpus++; + + if (cpu_overutilized(i)) + *overutilized = true; } /* Adjust by relative CPU capacity of the group */ @@ -6790,7 +6800,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; - bool overload = false; + bool overload = false, overutilized = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; @@ -6812,7 +6822,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd } update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + &overload, &overutilized); if (local_group) goto next_group; @@ -6856,8 +6866,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; - } + /* Update over-utilization (tipping point, U >= 0) indicator */ + if (env->dst_rq->rd->overutilized != overutilized) + env->dst_rq->rd->overutilized = overutilized; + } else { + if (!env->dst_rq->rd->overutilized && overutilized) + env->dst_rq->rd->overutilized = true; + } } /** @@ -8250,6 +8266,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) + rq->rd->overutilized = true; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a618db2936d4..5323d4fc1109 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -528,6 +528,9 @@ struct root_domain { /* Indicate more than one runnable task for any CPU */ bool overload; + /* Indicate one or more cpus over-utilized (tipping point) */ + bool overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). From 2706f607bfd0aac647ba45a97f3dcfff821cc04a Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 27 Jan 2015 13:48:07 +0000 Subject: [PATCH 25/71] sched, cpuidle: Track cpuidle state index in the scheduler The idle-state of each cpu is currently pointed to by rq->idle_state but there isn't any information in the struct cpuidle_state that can used to look up the idle-state energy model data stored in struct sched_group_energy. For this purpose is necessary to store the idle state index as well. Ideally, the idle-state data should be unified. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- drivers/cpuidle/cpuidle.c | 4 ++-- include/linux/cpuidle.h | 2 +- kernel/sched/idle.c | 3 ++- kernel/sched/sched.h | 21 +++++++++++++++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 17a6dc0e2111..b9a6cceb7bac 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -192,7 +192,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } /* Take note of the planned idle state. */ - sched_idle_set_state(target_state); + sched_idle_set_state(target_state, index); trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ktime_get(); @@ -205,7 +205,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ - sched_idle_set_state(NULL); + sched_idle_set_state(NULL, -1); if (broadcast) { if (WARN_ON_ONCE(!irqs_disabled())) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 786ad32631a6..6eae1576499e 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -204,7 +204,7 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, #endif /* kernel/sched/idle.c */ -extern void sched_idle_set_state(struct cpuidle_state *idle_state); +extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index); extern void default_idle_call(void); #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..cbc130efbc5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -19,9 +19,10 @@ * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. */ -void sched_idle_set_state(struct cpuidle_state *idle_state) +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) { idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); } static int __read_mostly cpu_idle_force_poll; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5323d4fc1109..2301b0476b90 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -693,6 +693,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -1285,6 +1286,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) WARN_ON(!rcu_read_lock_held()); return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1295,6 +1307,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void sysrq_sched_debug_show(void); From 424fc5dbdf5fd08b0c08483c0fa806da6ac40b11 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 27 Jan 2015 14:04:17 +0000 Subject: [PATCH 26/71] sched: Determine the current sched_group idle-state To estimate the energy consumption of a sched_group in sched_group_energy() it is necessary to know which idle-state the group is in when it is idle. For now, it is assumed that this is the current idle-state (though it might be wrong). Based on the individual cpu idle-states group_idle_state() finds the group idle-state. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3cfb4be3d88f..c11bc7392916 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4815,6 +4815,20 @@ static int find_new_capacity(struct energy_env *eenv, return idx; } +static int group_idle_state(struct sched_group *sg) +{ + int i, state = INT_MAX; + + /* Find the shallowest idle state in the sched group. */ + for_each_cpu(i, sched_group_cpus(sg)) + state = min(state, idle_get_state_idx(cpu_rq(i))); + + /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */ + state++; + + return state; +} + /* * sched_group_energy(): Computes the absolute energy consumption of cpus * belonging to the sched_group including shared resources shared only by @@ -4867,7 +4881,8 @@ static int sched_group_energy(struct energy_env *eenv) do { unsigned long group_util; - int sg_busy_energy, sg_idle_energy, cap_idx; + int sg_busy_energy, sg_idle_energy; + int cap_idx, idle_idx; if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight) eenv->sg_cap = sg_shared_cap; @@ -4875,11 +4890,13 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) - >> SCHED_CAPACITY_SHIFT; - sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power) - >> SCHED_CAPACITY_SHIFT; + >> SCHED_CAPACITY_SHIFT; + sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) + * sg->sge->idle_states[idle_idx].power) + >> SCHED_CAPACITY_SHIFT; total_energy += sg_busy_energy + sg_idle_energy; From ee1a0a273f0f234e734cfbceb4a72df472eb5dc5 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Sat, 9 May 2015 20:03:19 +0100 Subject: [PATCH 27/71] sched: Energy-aware wake-up task placement Let available compute capacity and estimated energy impact select wake-up target cpu when energy-aware scheduling is enabled and the system in not over-utilized (above the tipping point). energy_aware_wake_cpu() attempts to find group of cpus with sufficient compute capacity to accommodate the task and find a cpu with enough spare capacity to handle the task within that group. Preference is given to cpus with enough spare capacity at the current OPP. Finally, the energy impact of the new target and the previous task cpu is compared to select the wake-up target cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 89 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c11bc7392916..682b4ae9ebd7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5287,6 +5287,86 @@ static int select_idle_sibling(struct task_struct *p, int target) return target; } +static int energy_aware_wake_cpu(struct task_struct *p, int target) +{ + struct sched_domain *sd; + struct sched_group *sg, *sg_target; + int target_max_cap = INT_MAX; + int target_cpu = task_cpu(p); + int i; + + sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p))); + + if (!sd) + return target; + + sg = sd->groups; + sg_target = sg; + + /* + * Find group with sufficient capacity. We only get here if no cpu is + * overutilized. We may end up overutilizing a cpu by adding the task, + * but that should not be any worse than select_idle_sibling(). + * load_balance() should sort it out later as we get above the tipping + * point. + */ + do { + /* Assuming all cpus are the same in group */ + int max_cap_cpu = group_first_cpu(sg); + + /* + * Assume smaller max capacity means more energy-efficient. + * Ideally we should query the energy model for the right + * answer but it easily ends up in an exhaustive search. + */ + if (capacity_of(max_cap_cpu) < target_max_cap && + task_fits_max(p, max_cap_cpu)) { + sg_target = sg; + target_max_cap = capacity_of(max_cap_cpu); + } + } while (sg = sg->next, sg != sd->groups); + + /* Find cpu with sufficient capacity */ + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) { + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + int new_util = cpu_util(i) + task_util(p); + + if (new_util > capacity_orig_of(i)) + continue; + + if (new_util < capacity_curr_of(i)) { + target_cpu = i; + if (cpu_rq(i)->nr_running) + break; + } + + /* cpu has capacity at higher OPP, keep it as fallback */ + if (target_cpu == task_cpu(p)) + target_cpu = i; + } + + if (target_cpu != task_cpu(p)) { + struct energy_env eenv = { + .util_delta = task_util(p), + .src_cpu = task_cpu(p), + .dst_cpu = target_cpu, + }; + + /* Not enough spare capacity on previous cpu */ + if (cpu_overutilized(task_cpu(p))) + return target_cpu; + + if (energy_diff(&eenv) >= 0) + return task_cpu(p); + } + + return target_cpu; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -5309,8 +5389,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f int sync = wake_flags & WF_SYNC; if (sd_flag & SD_BALANCE_WAKE) - want_affine = !wake_wide(p) && task_fits_max(p, cpu) && - cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); + want_affine = (!wake_wide(p) && task_fits_max(p, cpu) && + cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) || + energy_aware(); rcu_read_lock(); for_each_domain(cpu, tmp) { @@ -5340,7 +5421,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f } if (!sd) { - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) + new_cpu = energy_aware_wake_cpu(p, prev_cpu); + else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, new_cpu); } else while (sd) { From 298ee11fae708706e9c3bc9e8ae53bd2f7e2a0c2 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 10 May 2015 15:17:32 +0100 Subject: [PATCH 28/71] sched: Consider a not over-utilized energy-aware system as balanced In case the system operates below the tipping point indicator, introduced in ("sched: Add over-utilization/tipping point indicator"), bail out in find_busiest_group after the dst and src group statistics have been checked. There is simply no need to move usage around because all involved cpus still have spare cycles available. For an energy-aware system below its tipping point, we rely on the task placement of the wakeup path. This works well for short running tasks. The existence of long running tasks on one of the involved cpus lets the system operate over its tipping point. To be able to move such a task (whose load can't be used to average the load among the cpus) from a src cpu with lower capacity than the dst_cpu, an additional rule has to be implemented in need_active_balance. Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 682b4ae9ebd7..ae4f1ee9cf0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7194,6 +7194,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + + if (energy_aware() && !env->dst_rq->rd->overutilized) + goto out_balanced; + local = &sds.local_stat; busiest = &sds.busiest_stat; From c6110ac05befce0b5e4bfb23b41b13140f4d6bc8 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 3 Feb 2015 13:54:11 +0000 Subject: [PATCH 29/71] sched: Disable energy-unfriendly nohz kicks With energy-aware scheduling enabled nohz_kick_needed() generates many nohz idle-balance kicks which lead to nothing when multiple tasks get packed on a single cpu to save energy. This causes unnecessary wake-ups and hence wastes energy. Make these conditions depend on !energy_aware() for now until the energy-aware nohz story gets sorted out. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ae4f1ee9cf0c..7327221c991e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8259,12 +8259,13 @@ static inline bool nohz_kick_needed(struct rq *rq) if (time_before(now, nohz.next_balance)) return false; - if (rq->nr_running >= 2) + if (rq->nr_running >= 2 && + (!energy_aware() || cpu_overutilized(cpu))) return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { + if (sd && !energy_aware()) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); From 6155da129de0f9b68371d834209932bccf1ae3fd Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 17:56:20 +0100 Subject: [PATCH 30/71] Documentation: DT bindings for energy model cost data required by EAS EAS (energy aware scheduling) provides the scheduler with an alternative objective - energy efficiency - as opposed to it's current performance oriented objectives. EAS relies on a simple platform energy cost model to guide scheduling decisions. The model only considers the CPU subsystem. This patch adds documentation describing DT bindings that should be used to supply the scheduler with an energy cost model. Signed-off-by: Robin Randhawa --- .../bindings/scheduler/sched-energy-costs.txt | 360 ++++++++++++++++++ 1 file changed, 360 insertions(+) create mode 100644 Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt new file mode 100644 index 000000000000..11216f09e596 --- /dev/null +++ b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt @@ -0,0 +1,360 @@ +=========================================================== +Energy cost bindings for Energy Aware Scheduling +=========================================================== + +=========================================================== +1 - Introduction +=========================================================== + +This note specifies bindings required for energy-aware scheduling +(EAS)[1]. Historically, the scheduler's primary objective has been +performance. EAS aims to provide an alternative objective - energy +efficiency. EAS relies on a simple platform energy cost model to +guide scheduling decisions. The model only considers the CPU +subsystem. + +This note is aligned with the definition of the layout of physical +CPUs in the system as described in the ARM topology binding +description [2]. The concept is applicable to any system so long as +the cost model data is provided for those processing elements in +that system's topology that EAS is required to service. + +Processing elements refer to hardware threads, CPUs and clusters of +related CPUs in increasing order of hierarchy. + +EAS requires two key cost metrics - busy costs and idle costs. Busy +costs comprise of a list of compute capacities for the processing +element in question and the corresponding power consumption at that +capacity. Idle costs comprise of a list of power consumption values +for each idle state [C-state] that the processing element supports. +For a detailed description of these metrics, their derivation and +their use see [3]. + +These cost metrics are required for processing elements in all +scheduling domain levels that EAS is required to service. + +=========================================================== +2 - energy-costs node +=========================================================== + +Energy costs for the processing elements in scheduling domains that +EAS is required to service are defined in the energy-costs node +which acts as a container for the actual per processing element cost +nodes. A single energy-costs node is required for a given system. + +- energy-costs node + + Usage: Required + + Description: The energy-costs node is a container node and + it's sub-nodes describe costs for each processing element at + all scheduling domain levels that EAS is required to + service. + + Node name must be "energy-costs". + + The energy-costs node's parent node must be the cpus node. + + The energy-costs node's child nodes can be: + + - one or more cost nodes. + + Any other configuration is considered invalid. + +The energy-costs node can only contain a single type of child node +whose bindings are described in paragraph 4. + +=========================================================== +3 - energy-costs node child nodes naming convention +=========================================================== + +energy-costs child nodes must follow a naming convention where the +node name must be "thread-costN", "core-costN", "cluster-costN" +depending on whether the costs in the node are for a thread, core or +cluster. N (where N = {0, 1, ...}) is the node number and has no +bearing to the OS' logical thread, core or cluster index. + +=========================================================== +4 - cost node bindings +=========================================================== + +Bindings for cost nodes are defined as follows: + +- cluster-cost node + + Description: must be declared within an energy-costs node. A + system can contain multiple clusters and each cluster + serviced by EAS must have a corresponding cluster-costs + node. + + The cluster-cost node name must be "cluster-costN" as + described in 3 above. + + A cluster-cost node must be a leaf node with no children. + + Properties for cluster-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +- core-cost node + + Description: must be declared within an energy-costs node. A + system can contain multiple cores and each core serviced by + EAS must have a corresponding core-cost node. + + The core-cost node name must be "core-costN" as described in + 3 above. + + A core-cost node must be a leaf node with no children. + + Properties for core-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +- thread-cost node + + Description: must be declared within an energy-costs node. A + system can contain cores with multiple hardware threads and + each thread serviced by EAS must have a corresponding + thread-cost node. + + The core-cost node name must be "core-costN" as described in + 3 above. + + A core-cost node must be a leaf node with no children. + + Properties for thread-cost nodes are described in paragraph + 5 below. + + Any other configuration is considered invalid. + +=========================================================== +5 - Cost node properties +========================================================== + +All cost node types must have only the following properties: + +- busy-cost-data + + Usage: required + Value type: An array of 2-item tuples. Each item is of type + u32. + Definition: The first item in the tuple is the capacity + value as described in [3]. The second item in the tuple is + the energy cost value as described in [3]. + +- idle-cost-data + + Usage: required + Value type: An array of 1-item tuples. The item is of type + u32. + Definition: The item in the tuple is the energy cost value + as described in [3]. + +=========================================================== +4 - Extensions to the cpu node +=========================================================== + +The cpu node is extended with a property that establishes the +connection between the processing element represented by the cpu +node and the cost-nodes associated with this processing element. + +The connection is expressed in line with the topological hierarchy +that this processing element belongs to starting with the level in +the hierarchy that this processing element itself belongs to through +to the highest level that EAS is required to service. The +connection cannot be sparse and must be contiguous from the +processing element's level through to the highest desired level. The +highest desired level must be the same for all processing elements. + +Example: Given that a cpu node may represent a thread that is a part +of a core, this property may contain multiple elements which +associate the thread with cost nodes describing the costs for the +thread itself, the core the thread belongs to, the cluster the core +belongs to and so on. The elements must be ordered from the lowest +level nodes to the highest desired level that EAS must service. The +highest desired level must be the same for all cpu nodes. The +elements must not be sparse: there must be elements for the current +thread, the next level of hierarchy (core) and so on without any +'holes'. + +Example: Given that a cpu node may represent a core that is a part +of a cluster of related cpus this property may contain multiple +elements which associate the core with cost nodes describing the +costs for the core itself, the cluster the core belongs to and so +on. The elements must be ordered from the lowest level nodes to the +highest desired level that EAS must service. The highest desired +level must be the same for all cpu nodes. The elements must not be +sparse: there must be elements for the current thread, the next +level of hierarchy (core) and so on without any 'holes'. + +If the system comprises of hierarchical clusters of clusters, this +property will contain multiple associations with the relevant number +of cluster elements in hierarchical order. + +Property added to the cpu node: + +- sched-energy-costs + + Usage: required + Value type: List of phandles + Definition: a list of phandles to specific cost nodes in the + energy-costs parent node that correspond to the processing + element represented by this cpu node in hierarchical order + of topology. + + The order of phandles in the list is significant. The first + phandle is to the current processing element's own cost + node. Subsequent phandles are to higher hierarchical level + cost nodes up until the maximum level that EAS is to + service. + + All cpu nodes must have the same highest level cost node. + + The phandle list must not be sparsely populated with handles + to non-contiguous hierarchical levels. See commentary above + for clarity. + + Any other configuration is invalid. + +=========================================================== +5 - Example dts +=========================================================== + +Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one +cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus): + +cpus { + #address-cells = <2>; + #size-cells = <0>; + . + . + . + A57_0: cpu@0 { + compatible = "arm,cortex-a57","arm,armv8"; + reg = <0x0 0x0>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A57_L2>; + clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; + }; + + A57_1: cpu@1 { + compatible = "arm,cortex-a57","arm,armv8"; + reg = <0x0 0x1>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A57_L2>; + clocks = <&scpi_dvfs 0>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>; + }; + + A53_0: cpu@100 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x100>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_1: cpu@101 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x101>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_2: cpu@102 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x102>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + A53_3: cpu@103 { + compatible = "arm,cortex-a53","arm,armv8"; + reg = <0x0 0x103>; + device_type = "cpu"; + enable-method = "psci"; + next-level-cache = <&A53_L2>; + clocks = <&scpi_dvfs 1>; + cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; + sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>; + }; + + energy-costs { + CPU_COST_0: core-cost0 { + busy-cost-data = < + 417 168 + 579 251 + 744 359 + 883 479 + 1024 616 + >; + idle-cost-data = < + 15 + 0 + >; + }; + CPU_COST_1: core-cost1 { + busy-cost-data = < + 235 33 + 302 46 + 368 61 + 406 76 + 447 93 + >; + idle-cost-data = < + 6 + 0 + >; + }; + CLUSTER_COST_0: cluster-cost0 { + busy-cost-data = < + 417 24 + 579 32 + 744 43 + 883 49 + 1024 64 + >; + idle-cost-data = < + 65 + 24 + >; + }; + CLUSTER_COST_1: cluster-cost1 { + busy-cost-data = < + 235 26 + 303 30 + 368 39 + 406 47 + 447 57 + >; + idle-cost-data = < + 56 + 17 + >; + }; + }; +}; + +=============================================================================== +[1] https://lkml.org/lkml/2015/5/12/728 +[2] Documentation/devicetree/bindings/topology.txt +[3] Documentation/scheduler/sched-energy.txt From f5fdff425e4e23a75b3412a0fddd29c71a269473 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Mon, 29 Jun 2015 18:01:58 +0100 Subject: [PATCH 31/71] sched: Support for extracting EAS energy costs from DT This patch implements support for extracting energy cost data from DT. The data should conform to the DT bindings for energy cost data needed by EAS (energy aware scheduling). Signed-off-by: Robin Randhawa --- include/linux/sched_energy.h | 36 ++++++++++ kernel/sched/Makefile | 2 +- kernel/sched/energy.c | 124 +++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 include/linux/sched_energy.h create mode 100644 kernel/sched/energy.c diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h new file mode 100644 index 000000000000..a3f1627ac609 --- /dev/null +++ b/include/linux/sched_energy.h @@ -0,0 +1,36 @@ +#ifndef _LINUX_SCHED_ENERGY_H +#define _LINUX_SCHED_ENERGY_H + +#include +#include + +/* + * There doesn't seem to be an NR_CPUS style max number of sched domain + * levels so here's an arbitrary constant one for the moment. + * + * The levels alluded to here correspond to entries in struct + * sched_domain_topology_level that are meant to be populated by arch + * specific code (topology.c). + */ +#define NR_SD_LEVELS 8 + +#define SD_LEVEL0 0 +#define SD_LEVEL1 1 +#define SD_LEVEL2 2 +#define SD_LEVEL3 3 +#define SD_LEVEL4 4 +#define SD_LEVEL5 5 +#define SD_LEVEL6 6 +#define SD_LEVEL7 7 + +/* + * Convenience macro for iterating through said sd levels. + */ +#define for_each_possible_sd_level(level) \ + for (level = 0; level < NR_SD_LEVELS; level++) + +extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; + +void init_sched_energy_costs(void); + +#endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..a541b5ce1dcc 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -12,7 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif obj-y += core.o loadavg.o clock.o cputime.o -obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o +obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o energy.o obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c new file mode 100644 index 000000000000..b0656b7a93e3 --- /dev/null +++ b/kernel/sched/energy.c @@ -0,0 +1,124 @@ +/* + * Obtain energy cost data from DT and populate relevant scheduler data + * structures. + * + * Copyright (C) 2015 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#define pr_fmt(fmt) "sched-energy: " fmt + +#define DEBUG + +#include +#include +#include +#include +#include +#include + +struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS]; + +static void free_resources(void) +{ + int cpu, sd_level; + struct sched_group_energy *sge; + + for_each_possible_cpu(cpu) { + for_each_possible_sd_level(sd_level) { + sge = sge_array[cpu][sd_level]; + if (sge) { + kfree(sge->cap_states); + kfree(sge->idle_states); + kfree(sge); + } + } + } +} + +void init_sched_energy_costs(void) +{ + struct device_node *cn, *cp; + struct capacity_state *cap_states; + struct idle_state *idle_states; + struct sched_group_energy *sge; + const struct property *prop; + int sd_level, i, nstates, cpu; + const __be32 *val; + + for_each_possible_cpu(cpu) { + cn = of_get_cpu_node(cpu, NULL); + if (!cn) { + pr_warn("CPU device node missing for CPU %d\n", cpu); + return; + } + + if (!of_find_property(cn, "sched-energy-costs", NULL)) { + pr_warn("CPU device node has no sched-energy-costs\n"); + return; + } + + for_each_possible_sd_level(sd_level) { + cp = of_parse_phandle(cn, "sched-energy-costs", sd_level); + if (!cp) + break; + + prop = of_find_property(cp, "busy-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No busy-cost data, skipping sched_energy init\n"); + goto out; + } + + sge = kcalloc(1, sizeof(struct sched_group_energy), + GFP_NOWAIT); + + nstates = (prop->length / sizeof(u32)) / 2; + cap_states = kcalloc(nstates, + sizeof(struct capacity_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) { + cap_states[i].cap = be32_to_cpup(val++); + cap_states[i].power = be32_to_cpup(val++); + } + + sge->nr_cap_states = nstates; + sge->cap_states = cap_states; + + prop = of_find_property(cp, "idle-cost-data", NULL); + if (!prop || !prop->value) { + pr_warn("No idle-cost data, skipping sched_energy init\n"); + goto out; + } + + nstates = (prop->length / sizeof(u32)); + idle_states = kcalloc(nstates, + sizeof(struct idle_state), + GFP_NOWAIT); + + for (i = 0, val = prop->value; i < nstates; i++) + idle_states[i].power = be32_to_cpup(val++); + + sge->nr_idle_states = nstates; + sge->idle_states = idle_states; + + sge_array[cpu][sd_level] = sge; + } + } + + pr_info("Sched-energy-costs installed from DT\n"); + return; + +out: + free_resources(); +} From f903bd14ff9d8d0ef499f3fff6cb93f72932e4d9 Mon Sep 17 00:00:00 2001 From: Robin Randhawa Date: Tue, 9 Jun 2015 15:10:00 +0100 Subject: [PATCH 32/71] arm64, topology: Updates to use DT bindings for EAS costing data With the bindings and the associated accessors to extract data from the bindings in place, remove the static hard-coded data from topology.c and use the accesors instead. Signed-off-by: Robin Randhawa --- arch/arm64/kernel/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index fb99a6735fd4..b5b43af6a7dc 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -218,6 +220,33 @@ static int __init parse_dt_topology(void) struct cpu_topology cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); +/* sd energy functions */ +static inline +const struct sched_group_energy * const cpu_cluster_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu); + return NULL; + } + + return sge; +} + +static inline +const struct sched_group_energy * const cpu_core_energy(int cpu) +{ + struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0]; + + if (!sge) { + pr_warn("Invalid sched_group_energy for CPU%d\n", cpu); + return NULL; + } + + return sge; +} + const struct cpumask *cpu_coregroup_mask(int cpu) { return &cpu_topology[cpu].core_sibling; @@ -344,4 +373,8 @@ void __init init_cpu_topology(void) */ if (of_have_populated_dt() && parse_dt_topology()) reset_cpu_topology(); + else + set_sched_topology(arm64_topology); + + init_sched_energy_costs(); } From 25fd93de74994e4bdcec2ec4ed0814207a8e0e4b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2015 16:47:48 +0100 Subject: [PATCH 33/71] cpufreq: Max freq invariant scheduler load-tracking and cpu capacity support Implements cpufreq_scale_max_freq_capacity() to provide the scheduler with a maximum frequency scaling correction factor for more accurate load-tracking and cpu capacity handling by being able to deal with frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency lower than the absolute possible maximum frequency on load tracking and cpu capacity. The factor is: current_max_freq(cpu) << SCHED_CAPACITY_SHIFT / max_freq(cpu) In fact, max_freq_scale should be a struct cpufreq_policy data member. But this would require that the scheduler hot path (__update_load_avg()) would have to grab the cpufreq lock. This can be avoided by using per-cpu data initialized to SCHED_CAPACITY_SCALE for max_freq_scale. Signed-off-by: Dietmar Eggemann --- drivers/cpufreq/cpufreq.c | 19 +++++++++++++++++++ include/linux/cpufreq.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 46a7d62c8174..96f1a8860819 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -352,12 +352,14 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) *********************************************************************/ static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; static void scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { unsigned long cur = freqs ? freqs->new : policy->cur; unsigned long scale = (cur << SCHED_CAPACITY_SHIFT) / policy->max; + struct cpufreq_cpuinfo *cpuinfo = &policy->cpuinfo; int cpu; pr_debug("cpus %*pbl cur/cur max freq %lu/%u kHz freq scale %lu\n", @@ -365,6 +367,18 @@ scale_freq_capacity(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) for_each_cpu(cpu, policy->cpus) per_cpu(freq_scale, cpu) = scale; + + if (freqs) + return; + + scale = (policy->max << SCHED_CAPACITY_SHIFT) / cpuinfo->max_freq; + + pr_debug("cpus %*pbl cur max/max freq %u/%u kHz max freq scale %lu\n", + cpumask_pr_args(policy->cpus), policy->max, cpuinfo->max_freq, + scale); + + for_each_cpu(cpu, policy->cpus) + per_cpu(max_freq_scale, cpu) = scale; } unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) @@ -372,6 +386,11 @@ unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu) return per_cpu(freq_scale, cpu); } +unsigned long cpufreq_scale_max_freq_capacity(int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + static void __cpufreq_notify_transition(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, unsigned int state) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 11c65f536b6c..9b1d61e1f1d7 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -619,4 +619,5 @@ int cpufreq_generic_init(struct cpufreq_policy *policy, struct sched_domain; unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif /* _LINUX_CPUFREQ_H */ From 500f94bc1cfeb6846314fa151c81d01607c259a7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 34/71] sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. There is also an outstanding question: Should the cpu capacity of a cpu going idle be set to a very small value? Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 8 ++------ kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8f33eedbe8f..20df4c102fba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5888,6 +5888,8 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; + + init_max_cpu_capacity(&rd->max_cpu_capacity); return 0; free_rto_mask: @@ -7105,15 +7107,9 @@ static int build_sched_domains(const struct cpumask *cpu_map, rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); cpu_attach_domain(sd, d.rd, i); - - if (rq->cpu_capacity_orig > rq->rd->max_cpu_capacity) - rq->rd->max_cpu_capacity = rq->cpu_capacity_orig; } rcu_read_unlock(); - if (rq) - pr_info("max cpu_capacity %lu\n", rq->rd->max_cpu_capacity); - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7327221c991e..91dfc8aa8aff 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5078,7 +5078,7 @@ static inline bool __task_fits(struct task_struct *p, int cpu, int util) static inline bool task_fits_max(struct task_struct *p, int cpu) { unsigned long capacity = capacity_of(cpu); - unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity; + unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val; if (capacity == max_capacity) return true; @@ -6564,13 +6564,43 @@ static unsigned long scale_rt_capacity(int cpu) return 1; } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) +{ + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2301b0476b90..aac581932eff 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -510,6 +510,12 @@ struct dl_rq { #ifdef CONFIG_SMP +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -548,7 +554,7 @@ struct root_domain { struct cpupri cpupri; /* Maximum cpu capacity in the system. */ - unsigned long max_cpu_capacity; + struct max_cpu_capacity max_cpu_capacity; }; extern struct root_domain def_root_domain; @@ -1340,6 +1346,8 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); + static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; From 5f21ffcba8f88b503b39040d7c3f45bdb3167ddc Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 23 Sep 2015 17:59:55 +0100 Subject: [PATCH 35/71] arm: Enable max freq invariant scheduler load-tracking and capacity support Maximum Frequency Invariance has to be part of Cpu Invariance because Frequency Invariance deals only with differences in load-tracking introduces by Dynamic Frequency Scaling and not with limiting the possible range of cpu frequency. By placing Maximum Frequency Invariance into Cpu Invariance, load-tracking is scaled via arch_scale_cpu_capacity() in __update_load_avg() and cpu capacity is scaled via arch_scale_cpu_capacity() in update_cpu_capacity(). To be able to save the extra multiplication in the scheduler hotpath (__update_load_avg()) we could: 1 Inform cpufreq about base cpu capacity at boot and let it handle scale_cpu_capacity() as well. 2 Use the cpufreq policy callback which would update a per-cpu current cpu_scale and this value would be return in scale_cpu_capacity(). 3 Use per-cpu current max_freq_scale and current cpu_scale with the current patch. Signed-off-by: Dietmar Eggemann --- arch/arm/kernel/topology.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index da1c611a3b5e..0308342def8c 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c @@ -44,7 +44,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale); unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { +#if CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else return per_cpu(cpu_scale, cpu); +#endif } static void set_capacity_scale(unsigned int cpu, unsigned long capacity) From 51f57a62888f44ef9694660edf408e05dcf7f31e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 25 Sep 2015 17:34:15 +0100 Subject: [PATCH 36/71] arm64: Enable max freq invariant scheduler load-tracking and capacity support Maximum Frequency Invariance has to be part of Cpu Invariance because Frequency Invariance deals only with differences in load-tracking introduces by Dynamic Frequency Scaling and not with limiting the possible range of cpu frequency. By placing Maximum Frequency Invariance into Cpu Invariance, load-tracking is scaled via arch_scale_cpu_capacity() in __update_load_avg() and cpu capacity is scaled via arch_scale_cpu_capacity() in update_cpu_capacity(). To be able to save the extra multiplication in the scheduler hotpath (__update_load_avg()) we could: 1 Inform cpufreq about base cpu capacity at boot and let it handle scale_cpu_capacity() as well. 2 Use the cpufreq policy callback which would update a per-cpu current cpu_scale and this value would be return in scale_cpu_capacity(). 3 Use per-cpu current max_freq_scale and current cpu_scale with the current patch. Including in topology.h like for the arm arch doesn't work because of CONFIG_COMPAT=y (Kernel support for 32-bit EL0). That's why cpufreq_scale_max_freq_capacity() has to be declared extern in topology.h. Signed-off-by: Dietmar Eggemann --- arch/arm64/include/asm/topology.h | 1 + arch/arm64/kernel/topology.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 9370a336c934..bbd362cd1ed1 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -26,6 +26,7 @@ struct sched_domain; #ifdef CONFIG_CPU_FREQ #define arch_scale_freq_capacity cpufreq_scale_freq_capacity extern unsigned long cpufreq_scale_freq_capacity(struct sched_domain *sd, int cpu); +extern unsigned long cpufreq_scale_max_freq_capacity(int cpu); #endif #define arch_scale_cpu_capacity scale_cpu_capacity extern unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu); diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index b5b43af6a7dc..5b2c67a510d8 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -29,7 +29,13 @@ static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; unsigned long scale_cpu_capacity(struct sched_domain *sd, int cpu) { +#ifdef CONFIG_CPU_FREQ + unsigned long max_freq_scale = cpufreq_scale_max_freq_capacity(cpu); + + return per_cpu(cpu_scale, cpu) * max_freq_scale >> SCHED_CAPACITY_SHIFT; +#else return per_cpu(cpu_scale, cpu); +#endif } static void set_capacity_scale(unsigned int cpu, unsigned long capacity) From 300acf8ed98bf224ad6f13c9851e6b221390427a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 13 Jan 2016 15:49:44 +0000 Subject: [PATCH 37/71] sched: Do eas idle balance regardless of the rq avg idle value EAS relies on idle balance to migrate a misfit task towards a cpu with higher capacity. When such a cpu becomes idle, idle balance should happen even if the rq avg idle is smaller than the sched migration cost (default 500us). The rq avg idle is updated during the wakeup of a task in case the rq has a non-null idle_stamp. This value stays unchanged and valid until the next task wakes up on this cpu after an idle period. So rq avg idle could be smaller than sched migration cost preventing the idle balance from happening. In this case we would be at the mercy of wakeup, periodic or nohz-idle load balancing to put another task on this cpu. To break this dependency towards rq avg idle make EAS idle balance independent from this rq avg idle has to be larger than sched migration cost. Signed-off-by: Dietmar Eggemann --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 91dfc8aa8aff..7d58b1fea71d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7772,8 +7772,9 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + if (!energy_aware() && + (this_rq->avg_idle < sysctl_sched_migration_cost || + !this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) From ba3afb70e359658384c4738c409c47c812564222 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:43:49 +0000 Subject: [PATCH 38/71] sched: Add per-cpu max capacity to sched_group_capacity struct sched_group_capacity currently represents the compute capacity sum of all cpus in the sched_group. Unless it is divided by the group_weight to get the average capacity per cpu it hides differences in cpu capacity for mixed capacity systems (e.g. high RT/IRQ utilization or ARM big.LITTLE). But even the average may not be sufficient if the group covers cpus of different capacities. Instead, by extending struct sched_group_capacity to indicate max per-cpu capacity in the group a suitable group for a given task utilization can easily be found such that cpus with reduced capacity can be avoided for tasks with high utilization (not implemented by this patch). Signed-off-by: Morten Rasmussen --- kernel/sched/core.c | 3 ++- kernel/sched/fair.c | 17 ++++++++++++----- kernel/sched/sched.h | 3 ++- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 20df4c102fba..c05e95175b0a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5714,7 +5714,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_CONT " %*pbl", cpumask_pr_args(sched_group_cpus(group))); if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { - printk(KERN_CONT " (cpu_capacity = %d)", + printk(KERN_CONT " (cpu_capacity = %lu)", group->sgc->capacity); } @@ -6193,6 +6193,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d58b1fea71d..9762288ee47b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6609,13 +6609,14 @@ skip_unlock: __attribute__ ((unused)); cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity; + unsigned long capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6628,6 +6629,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } capacity = 0; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6652,11 +6654,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) */ if (unlikely(!rq->sd)) { capacity += capacity_of(cpu); - continue; + } else { + sgc = rq->sd->groups->sgc; + capacity += sgc->capacity; } - sgc = rq->sd->groups->sgc; - capacity += sgc->capacity; + max_capacity = max(capacity, max_capacity); } } else { /* @@ -6666,12 +6669,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity += group->sgc->capacity; + struct sched_group_capacity *sgc = group->sgc; + + capacity += sgc->capacity; + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; + sdg->sgc->max_capacity = max_capacity; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aac581932eff..967d70d49af9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -858,7 +858,8 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity; + unsigned long capacity; + unsigned long max_capacity; /* Max per-cpu capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* From d9ae12756d61f7532241602161338220647e07df Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:47:54 +0000 Subject: [PATCH 39/71] sched: Add group_misfit_task load-balance type To maximize throughput in systems with reduced capacity cpus (e.g. high RT/IRQ load and/or ARM big.LITTLE) load-balancing has to consider task and cpu utilization as well as per-cpu compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks that are scheduled on a reduced capacity cpu need to be identified and migrated to a higher capacity cpu if possible. To implement this additional policy an additional group_type (load-balance scenario) is added: group_misfit_task. This represents scenarios where a sched_group has tasks that are not suitable for its per-cpu capacity. group_misfit_task is only considered if the system is not overloaded in any other way (group_imbalanced or group_overloaded). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 29 ++++++++++++++++++++++------- kernel/sched/sched.h | 1 + 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9762288ee47b..b16e1b7878f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5739,6 +5739,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; simple: cfs_rq = &rq->cfs; @@ -5760,9 +5762,12 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + rq->misfit_task = !task_fits_max(p, rq->cpu); + return p; idle: + rq->misfit_task = 0; /* * This is OK, because current is on_cpu, which avoids it being picked * for load-balance and preemption/IRQs are still disabled avoiding @@ -5975,6 +5980,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -6446,12 +6458,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -6467,6 +6473,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + int group_misfit_task; /* A cpu has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6783,6 +6790,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task) + return group_misfit_task; + return group_other; } @@ -6830,8 +6840,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (idle_cpu(i)) sgs->idle_cpus++; - if (cpu_overutilized(i)) + if (cpu_overutilized(i)) { *overutilized = true; + if (!sgs->group_misfit_task && rq->misfit_task) + sgs->group_misfit_task = capacity_of(i); + } } /* Adjust by relative CPU capacity of the group */ @@ -8412,6 +8425,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) rq->rd->overutilized = true; + + rq->misfit_task = !task_fits_max(curr, rq->cpu); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 967d70d49af9..fcbb3e07accc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -584,6 +584,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; + unsigned int misfit_task; #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; From a52dae6571b52b8daa54b6d9f7a3e12c54c0649d Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 25 Feb 2016 12:51:35 +0000 Subject: [PATCH 40/71] sched: Consider misfit tasks when load-balancing With the new group_misfit_task load-balancing scenario additional policy conditions are needed when load-balancing. Misfit task balancing only makes sense between source group with lower capacity than the target group. If capacities are the same, fallback to normal group_other balancing. The aim is to balance tasks such that no task has its throughput hindered by compute capacity if a cpu with more capacity is available. Load-balancing is generally based on average load in the sched_groups, but for misfitting tasks it is necessary to introduce exceptions to migrate tasks against usual metrics and optimize throughput. This patch ensures the following load-balance for mixed capacity systems (e.g. ARM big.LITTLE) for always-running tasks: 1. Place a task on each cpu starting in order from cpus with highest capacity to lowest until all cpus are in use (i.e. one task on each cpu). 2. Once all cpus are in use balance according to compute capacity such that load per capacity is approximately the same regardless of the compute capacity (i.e. big cpus get more tasks than little cpus). Necessary changes are introduced in find_busiest_group(), calculate_imbalance(), and find_busiest_queue(). This includes passing the group_type on to find_busiest_queue() through struct lb_env, which is currently only considers imbalance and not the imbalance situation (group_type). To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each cpu is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen --- kernel/sched/fair.c | 71 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b16e1b7878f6..5b8a9c68f3fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6016,6 +6016,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type busiest_group_type; struct list_head tasks; }; @@ -6780,6 +6781,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) return false; } + +/* + * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * per-cpu capacity than sched_group ref. + */ +static inline bool +group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE < + ref->sgc->max_capacity; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -6886,9 +6899,25 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; + /* + * Candidate sg doesn't face any serious load-balance problems + * so don't pick it if the local sg is already filled up. + */ + if (sgs->group_type == group_other && + !group_has_capacity(env, &sds->local_stat)) + return false; + if (sgs->avg_load <= busiest->avg_load) return false; + /* + * Candiate sg has no more than one task per cpu and has higher + * per-cpu capacity. No reason to pull tasks to less capable cpus. + */ + if (sgs->sum_nr_running <= sgs->group_weight && + group_smaller_cpu_capacity(sds->local, sg)) + return false; + /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; @@ -6994,6 +7023,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sgs->group_type = group_classify(sg, sgs); } + /* + * Ignore task groups with misfit tasks if local group has no + * capacity or if per-cpu capacity isn't higher. + */ + if (sgs->group_type == group_misfit_task && + (!group_has_capacity(env, &sds->local_stat) || + !group_smaller_cpu_capacity(sg, sds->local))) + sgs->group_type = group_other; + if (update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; @@ -7170,6 +7208,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { + /* Misfitting tasks should be migrated in any case */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = busiest->group_misfit_task; + return; + } + + /* + * Busiest group is overloaded, local is not, use the spare + * cycles to maximize throughput + */ + if (busiest->group_type == group_overloaded && + local->group_type <= group_misfit_task) { + env->imbalance = busiest->load_per_task; + return; + } + env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -7203,6 +7257,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task); + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -7276,6 +7335,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfitting tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) { + goto force_balance; + } + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -7299,7 +7363,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * might end up to just move the imbalance on another group */ if ((busiest->group_type != group_overloaded) && - (local->idle_cpus <= (busiest->idle_cpus + 1))) + (local->idle_cpus <= (busiest->idle_cpus + 1)) && + !group_smaller_cpu_capacity(sds.busiest, sds.local)) goto out_balanced; } else { /* @@ -7312,6 +7377,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) } force_balance: + env->busiest_group_type = busiest->group_type; /* Looks like there is an imbalance. Compute it */ calculate_imbalance(env, &sds); return sds.busiest; @@ -7370,7 +7436,8 @@ static struct rq *find_busiest_queue(struct lb_env *env, */ if (rq->nr_running == 1 && wl > env->imbalance && - !check_cpu_capacity(rq, env->sd)) + !check_cpu_capacity(rq, env->sd) && + env->busiest_group_type != group_misfit_task) continue; /* From 41cd340c358f44bf50471dcb8ab1d79a276d3da5 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:27 +0100 Subject: [PATCH 41/71] cpufreq: introduce cpufreq_driver_is_slow Some architectures and platforms perform CPU frequency transitions through a non-blocking method, while some might block or sleep. Even when frequency transitions do not block or sleep they may be very slow. This distinction is important when trying to change frequency from a non-interruptible context in a scheduler hot path. Describe this distinction with a cpufreq driver flag, CPUFREQ_DRIVER_FAST. The default is to not have this flag set, thus erring on the side of caution. cpufreq_driver_is_slow() is also introduced in this patch. Setting the above flag will allow this function to return false. [smuckle@linaro.org: change flag/API to include drivers that are too slow for scheduler hot paths, in addition to those that block/sleep] Cc: Rafael J. Wysocki Cc: Viresh Kumar Signed-off-by: Michael Turquette Signed-off-by: Steve Muckle --- drivers/cpufreq/cpufreq.c | 6 ++++++ include/linux/cpufreq.h | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 96f1a8860819..5ca8e9517aae 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -154,6 +154,12 @@ bool have_governor_per_policy(void) } EXPORT_SYMBOL_GPL(have_governor_per_policy); +bool cpufreq_driver_is_slow(void) +{ + return !(cpufreq_driver->flags & CPUFREQ_DRIVER_FAST); +} +EXPORT_SYMBOL_GPL(cpufreq_driver_is_slow); + struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) { if (have_governor_per_policy()) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9b1d61e1f1d7..b5ef79aae048 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -160,6 +160,7 @@ u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); +bool cpufreq_driver_is_slow(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); #else static inline unsigned int cpufreq_get(unsigned int cpu) @@ -317,6 +318,14 @@ struct cpufreq_driver { */ #define CPUFREQ_NEED_INITIAL_FREQ_CHECK (1 << 5) +/* + * Indicates that it is safe to call cpufreq_driver_target from + * non-interruptable context in scheduler hot paths. Drivers must + * opt-in to this flag, as the safe default is that they might sleep + * or be too slow for hot path use. + */ +#define CPUFREQ_DRIVER_FAST (1 << 6) + int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); From 4b01d9785bb0adb3ddea2624c8411cca0865b772 Mon Sep 17 00:00:00 2001 From: Michael Turquette Date: Tue, 30 Jun 2015 12:45:48 +0100 Subject: [PATCH 42/71] sched: scheduler-driven cpu frequency selection Scheduler-driven CPU frequency selection hopes to exploit both per-task and global information in the scheduler to improve frequency selection policy, achieving lower power consumption, improved responsiveness/performance, and less reliance on heuristics and tunables. For further discussion on the motivation of this integration see [0]. This patch implements a shim layer between the Linux scheduler and the cpufreq subsystem. The interface accepts capacity requests from the CFS, RT and deadline sched classes. The requests from each sched class are summed on each CPU with a margin applied to the CFS and RT capacity requests to provide some headroom. Deadline requests are expected to be precise enough given their nature to not require headroom. The maximum total capacity request for a CPU in a frequency domain drives the requested frequency for that domain. Policy is determined by both the sched classes and this shim layer. Note that this algorithm is event-driven. There is no polling loop to check cpu idle time nor any other method which is unsynchronized with the scheduler, aside from a throttling mechanism to ensure frequency changes are not attempted faster than the hardware can accommodate them. Thanks to Juri Lelli for contributing design ideas, code and test results, and to Ricky Liang for initialization and static key inc/dec fixes. [0] http://article.gmane.org/gmane.linux.kernel/1499836 [smuckle@linaro.org: various additions and fixes, revised commit text] CC: Ricky Liang Signed-off-by: Michael Turquette Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- drivers/cpufreq/Kconfig | 20 ++ include/linux/cpufreq.h | 3 + include/linux/sched.h | 8 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_sched.c | 358 +++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 51 +++++ 7 files changed, 442 insertions(+), 1 deletion(-) create mode 100644 kernel/sched/cpufreq_sched.c diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index 659879a56dba..6f2e96cc680e 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -102,6 +102,14 @@ config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE Be aware that not all cpufreq drivers support the conservative governor. If unsure have a look at the help section of the driver. Fallback governor will be the performance governor. + +config CPU_FREQ_DEFAULT_GOV_SCHED + bool "sched" + select CPU_FREQ_GOV_SCHED + help + Use the CPUfreq governor 'sched' as default. This scales + cpu frequency using CPU utilization estimates from the + scheduler. endchoice config CPU_FREQ_GOV_PERFORMANCE @@ -183,6 +191,18 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. +config CPU_FREQ_GOV_SCHED + bool "'sched' cpufreq governor" + depends on CPU_FREQ + select CPU_FREQ_GOV_COMMON + help + 'sched' - this governor scales cpu frequency from the + scheduler as a function of cpu capacity utilization. It does + not evaluate utilization on a periodic basis (as ondemand + does) but instead is event-driven by the scheduler. + + If in doubt, say N. + comment "CPU frequency scaling drivers" config CPUFREQ_DT diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b5ef79aae048..b5433349938a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -496,6 +496,9 @@ extern struct cpufreq_governor cpufreq_gov_ondemand; #elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE) extern struct cpufreq_governor cpufreq_gov_conservative; #define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_conservative) +#elif defined(CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED) +extern struct cpufreq_governor cpufreq_gov_sched; +#define CPUFREQ_DEFAULT_GOVERNOR (&cpufreq_gov_sched) #endif /********************************************************************* diff --git a/include/linux/sched.h b/include/linux/sched.h index e84fd00640fb..38f082659ba6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -928,6 +928,14 @@ enum cpu_idle_type { #define SCHED_CAPACITY_SHIFT 10 #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) +struct sched_capacity_reqs { + unsigned long cfs; + unsigned long rt; + unsigned long dl; + + unsigned long total; +}; + /* * Wake-queues are lists of tasks with a pending wakeup, whose * callers have already marked the task as woken internally, diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a541b5ce1dcc..0eabc9db4c3d 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -19,3 +19,4 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c new file mode 100644 index 000000000000..58bca8d2ca65 --- /dev/null +++ b/kernel/sched/cpufreq_sched.c @@ -0,0 +1,358 @@ +/* + * Copyright (C) 2015 Michael Turquette + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +#define THROTTLE_NSEC 50000000 /* 50ms default */ + +struct static_key __read_mostly __sched_freq = STATIC_KEY_INIT_FALSE; +static bool __read_mostly cpufreq_driver_slow; + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static struct cpufreq_governor cpufreq_gov_sched; +#endif + +static DEFINE_PER_CPU(unsigned long, enabled); +DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); + +/** + * gov_data - per-policy data internal to the governor + * @throttle: next throttling period expiry. Derived from throttle_nsec + * @throttle_nsec: throttle period length in nanoseconds + * @task: worker thread for dvfs transition that may block/sleep + * @irq_work: callback used to wake up worker thread + * @requested_freq: last frequency requested by the sched governor + * + * struct gov_data is the per-policy cpufreq_sched-specific data structure. A + * per-policy instance of it is created when the cpufreq_sched governor receives + * the CPUFREQ_GOV_START condition and a pointer to it exists in the gov_data + * member of struct cpufreq_policy. + * + * Readers of this data must call down_read(policy->rwsem). Writers must + * call down_write(policy->rwsem). + */ +struct gov_data { + ktime_t throttle; + unsigned int throttle_nsec; + struct task_struct *task; + struct irq_work irq_work; + unsigned int requested_freq; +}; + +static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy, + unsigned int freq) +{ + struct gov_data *gd = policy->governor_data; + + /* avoid race with cpufreq_sched_stop */ + if (!down_write_trylock(&policy->rwsem)) + return; + + __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L); + + gd->throttle = ktime_add_ns(ktime_get(), gd->throttle_nsec); + up_write(&policy->rwsem); +} + +static bool finish_last_request(struct gov_data *gd) +{ + ktime_t now = ktime_get(); + + if (ktime_after(now, gd->throttle)) + return false; + + while (1) { + int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); + + usec_left /= NSEC_PER_USEC; + usleep_range(usec_left, usec_left + 100); + now = ktime_get(); + if (ktime_after(now, gd->throttle)) + return true; + } +} + +/* + * we pass in struct cpufreq_policy. This is safe because changing out the + * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), + * which tears down all of the data structures and __cpufreq_governor(policy, + * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the + * new policy pointer + */ +static int cpufreq_sched_thread(void *data) +{ + struct sched_param param; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned int new_request = 0; + unsigned int last_request = 0; + int ret; + + policy = (struct cpufreq_policy *) data; + gd = policy->governor_data; + + param.sched_priority = 50; + ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, ¶m); + if (ret) { + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + do_exit(-EINVAL); + } else { + pr_debug("%s: kthread (%d) set to SCHED_FIFO\n", + __func__, gd->task->pid); + } + + do { + set_current_state(TASK_INTERRUPTIBLE); + new_request = gd->requested_freq; + if (new_request == last_request) { + schedule(); + } else { + /* + * if the frequency thread sleeps while waiting to be + * unthrottled, start over to check for a newer request + */ + if (finish_last_request(gd)) + continue; + last_request = new_request; + cpufreq_sched_try_driver_target(policy, new_request); + } + } while (!kthread_should_stop()); + + return 0; +} + +static void cpufreq_sched_irq_work(struct irq_work *irq_work) +{ + struct gov_data *gd; + + gd = container_of(irq_work, struct gov_data, irq_work); + if (!gd) + return; + + wake_up_process(gd->task); +} + +static void update_fdomain_capacity_request(int cpu) +{ + unsigned int freq_new, index_new, cpu_tmp; + struct cpufreq_policy *policy; + struct gov_data *gd; + unsigned long capacity = 0; + + /* + * Avoid grabbing the policy if possible. A test is still + * required after locking the CPU's policy to avoid racing + * with the governor changing. + */ + if (!per_cpu(enabled, cpu)) + return; + + policy = cpufreq_cpu_get(cpu); + if (IS_ERR_OR_NULL(policy)) + return; + + if (policy->governor != &cpufreq_gov_sched || + !policy->governor_data) + goto out; + + gd = policy->governor_data; + + /* find max capacity requested by cpus in this policy */ + for_each_cpu(cpu_tmp, policy->cpus) { + struct sched_capacity_reqs *scr; + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp); + capacity = max(capacity, scr->total); + } + + /* Convert the new maximum capacity request into a cpu frequency */ + freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + if (cpufreq_frequency_table_target(policy, policy->freq_table, + freq_new, CPUFREQ_RELATION_L, + &index_new)) + goto out; + freq_new = policy->freq_table[index_new].frequency; + + if (freq_new == gd->requested_freq) + goto out; + + gd->requested_freq = freq_new; + + /* + * Throttling is not yet supported on platforms with fast cpufreq + * drivers. + */ + if (cpufreq_driver_slow) + irq_work_queue_on(&gd->irq_work, cpu); + else + cpufreq_sched_try_driver_target(policy, freq_new); + +out: + cpufreq_cpu_put(policy); +} + +void update_cpu_capacity_request(int cpu, bool request) +{ + unsigned long new_capacity; + struct sched_capacity_reqs *scr; + + /* The rq lock serializes access to the CPU's sched_capacity_reqs. */ + lockdep_assert_held(&cpu_rq(cpu)->lock); + + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + + new_capacity = scr->cfs + scr->rt; + new_capacity = new_capacity * capacity_margin + / SCHED_CAPACITY_SCALE; + new_capacity += scr->dl; + + if (new_capacity == scr->total) + return; + + scr->total = new_capacity; + if (request) + update_fdomain_capacity_request(cpu); +} + +static inline void set_sched_freq(void) +{ + static_key_slow_inc(&__sched_freq); +} + +static inline void clear_sched_freq(void) +{ + static_key_slow_dec(&__sched_freq); +} + +static int cpufreq_sched_policy_init(struct cpufreq_policy *policy) +{ + struct gov_data *gd; + int cpu; + + for_each_cpu(cpu, policy->cpus) + memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0, + sizeof(struct sched_capacity_reqs)); + + gd = kzalloc(sizeof(*gd), GFP_KERNEL); + if (!gd) + return -ENOMEM; + + gd->throttle_nsec = policy->cpuinfo.transition_latency ? + policy->cpuinfo.transition_latency : + THROTTLE_NSEC; + pr_debug("%s: throttle threshold = %u [ns]\n", + __func__, gd->throttle_nsec); + + if (cpufreq_driver_is_slow()) { + cpufreq_driver_slow = true; + gd->task = kthread_create(cpufreq_sched_thread, policy, + "kschedfreq:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR_OR_NULL(gd->task)) { + pr_err("%s: failed to create kschedfreq thread\n", + __func__); + goto err; + } + get_task_struct(gd->task); + kthread_bind_mask(gd->task, policy->related_cpus); + wake_up_process(gd->task); + init_irq_work(&gd->irq_work, cpufreq_sched_irq_work); + } + + policy->governor_data = gd; + set_sched_freq(); + + return 0; + +err: + kfree(gd); + return -ENOMEM; +} + +static int cpufreq_sched_policy_exit(struct cpufreq_policy *policy) +{ + struct gov_data *gd = policy->governor_data; + + clear_sched_freq(); + if (cpufreq_driver_slow) { + kthread_stop(gd->task); + put_task_struct(gd->task); + } + + policy->governor_data = NULL; + + kfree(gd); + return 0; +} + +static int cpufreq_sched_start(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 1; + + return 0; +} + +static int cpufreq_sched_stop(struct cpufreq_policy *policy) +{ + int cpu; + + for_each_cpu(cpu, policy->cpus) + per_cpu(enabled, cpu) = 0; + + return 0; +} + +static int cpufreq_sched_setup(struct cpufreq_policy *policy, + unsigned int event) +{ + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + return cpufreq_sched_policy_init(policy); + case CPUFREQ_GOV_POLICY_EXIT: + return cpufreq_sched_policy_exit(policy); + case CPUFREQ_GOV_START: + return cpufreq_sched_start(policy); + case CPUFREQ_GOV_STOP: + return cpufreq_sched_stop(policy); + case CPUFREQ_GOV_LIMITS: + break; + } + return 0; +} + +#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED +static +#endif +struct cpufreq_governor cpufreq_gov_sched = { + .name = "sched", + .governor = cpufreq_sched_setup, + .owner = THIS_MODULE, +}; + +static int __init cpufreq_sched_init(void) +{ + int cpu; + + for_each_cpu(cpu, cpu_possible_mask) + per_cpu(enabled, cpu) = 0; + return cpufreq_register_governor(&cpufreq_gov_sched); +} + +/* Try to make this the default governor */ +fs_initcall(cpufreq_sched_init); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b8a9c68f3fb..3eb5beb590a8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5064,7 +5064,7 @@ static inline unsigned long task_util(struct task_struct *p) return p->se.avg.util_avg; } -static unsigned int capacity_margin = 1280; /* ~20% margin */ +unsigned int capacity_margin = 1280; /* ~20% margin */ static inline bool __task_fits(struct task_struct *p, int cpu, int util) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fcbb3e07accc..ac28024b2d7a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1455,6 +1455,57 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +extern unsigned int capacity_margin; +extern struct static_key __sched_freq; + +static inline bool sched_freq(void) +{ + return static_key_false(&__sched_freq); +} + +DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); +void update_cpu_capacity_request(int cpu, bool request); + +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).cfs != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).cfs = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).rt != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).rt = capacity; + update_cpu_capacity_request(cpu, request); + } +} + +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ + if (per_cpu(cpu_sched_capacity_reqs, cpu).dl != capacity) { + per_cpu(cpu_sched_capacity_reqs, cpu).dl = capacity; + update_cpu_capacity_request(cpu, request); + } +} +#else +static inline bool sched_freq(void) { return false; } +static inline void set_cfs_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_rt_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +static inline void set_dl_cpu_capacity(int cpu, bool request, + unsigned long capacity) +{ } +#endif + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); From 51eff27cb559af7f2b8fc4439893ad249576b5cd Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 19 Aug 2015 19:47:12 +0100 Subject: [PATCH 43/71] sched/fair: add triggers for OPP change requests Each time a task is {en,de}queued we might need to adapt the current frequency to the new usage. Add triggers on {en,de}queue_task_fair() for this purpose. Only trigger a freq request if we are effectively waking up or going to sleep. Filter out load balancing related calls to reduce the number of triggers. [smuckle@linaro.org: resolve merge conflicts, define task_new, use renamed static key sched_freq] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3eb5beb590a8..844788da00be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,21 @@ static inline void hrtick_update(struct rq *rq) } #endif +static unsigned long capacity_orig_of(int cpu); +static int cpu_util(int cpu); + +static void update_capacity_of(int cpu) +{ + unsigned long req_cap; + + if (!sched_freq()) + return; + + /* Convert scale-invariant capacity to cpu. */ + req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + set_cfs_cpu_capacity(cpu, true, req_cap); +} + static bool cpu_overutilized(int cpu); /* @@ -4193,6 +4208,20 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new && !rq->rd->overutilized && cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + + /* + * We want to potentially trigger a freq switch + * request only for tasks that are waking up; this is + * because we get here also during load balancing, but + * in these cases it seems wise to trigger as single + * request after load balancing is done. + * + * XXX: how about fork()? Do we need a special + * flag/something to tell if we are here after a + * fork() (wakeup_task_new)? + */ + if (!task_new) + update_capacity_of(cpu_of(rq)); } hrtick_update(rq); } @@ -4251,9 +4280,24 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_shares(cfs_rq); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + /* + * We want to potentially trigger a freq switch + * request only for tasks that are going to sleep; + * this is because we get here also during load + * balancing, but in these cases it seems wise to + * trigger as single request after load balancing is + * done. + */ + if (task_sleep) { + if (rq->cfs.nr_running) + update_capacity_of(cpu_of(rq)); + else if (sched_freq()) + set_cfs_cpu_capacity(cpu_of(rq), false, 0); + } + } hrtick_update(rq); } From e457ca2619941a621eabb01178ad3d69b990822d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 26 Jun 2015 12:14:23 +0100 Subject: [PATCH 44/71] sched/{core,fair}: trigger OPP change request on fork() Patch "sched/fair: add triggers for OPP change requests" introduced OPP change triggers for enqueue_task_fair(), but the trigger was operating only for wakeups. Fact is that it makes sense to consider wakeup_new also (i.e., fork()), as we don't know anything about a newly created task and thus we most certainly want to jump to max OPP to not harm performance too much. However, it is not currently possible (or at least it wasn't evident to me how to do so :/) to tell new wakeups from other (non wakeup) operations. This patch introduces an additional flag in sched.h that is only set at fork() time and it is then consumed in enqueue_task_fair() for our purpose. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 9 +++------ kernel/sched/sched.h | 1 + 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c05e95175b0a..23823e329c11 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2387,7 +2387,7 @@ void wake_up_new_task(struct task_struct *p) #endif rq = __task_rq_lock(p); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_WAKEUP_NEW); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 844788da00be..3937f79240df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4171,7 +4171,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; - int task_new = !(flags & ENQUEUE_WAKEUP); + int task_new = flags & ENQUEUE_WAKEUP_NEW; + int task_wakeup = flags & ENQUEUE_WAKEUP; for_each_sched_entity(se) { if (se->on_rq) @@ -4215,12 +4216,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * because we get here also during load balancing, but * in these cases it seems wise to trigger as single * request after load balancing is done. - * - * XXX: how about fork()? Do we need a special - * flag/something to tell if we are here after a - * fork() (wakeup_task_new)? */ - if (!task_new) + if (task_new || task_wakeup) update_capacity_of(cpu_of(rq)); } hrtick_update(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ac28024b2d7a..0052ada93ae8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1181,6 +1181,7 @@ static const u32 prio_to_wmult[40] = { #endif #define ENQUEUE_REPLENISH 0x08 #define ENQUEUE_RESTORE 0x10 +#define ENQUEUE_WAKEUP_NEW 0x20 #define DEQUEUE_SLEEP 0x01 #define DEQUEUE_SAVE 0x02 From 6700c5c1708013afe68aa81258bf07ae7b5d945a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 25 Jun 2015 14:37:27 +0100 Subject: [PATCH 45/71] sched/fair: cpufreq_sched triggers for load balancing As we don't trigger freq changes from {en,de}queue_task_fair() during load balancing, we need to do explicitly so on load balancing paths. [smuckle@linaro.org: move update_capacity_of calls so rq lock is held] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/fair.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3937f79240df..950046b1b8eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6384,6 +6384,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p) { raw_spin_lock(&rq->lock); attach_task(rq, p); + /* + * We want to potentially raise target_cpu's OPP. + */ + update_capacity_of(cpu_of(rq)); raw_spin_unlock(&rq->lock); } @@ -6405,6 +6409,11 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } + /* + * We want to potentially raise env.dst_cpu's OPP. + */ + update_capacity_of(env->dst_cpu); + raw_spin_unlock(&env->dst_rq->lock); } @@ -7667,6 +7676,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, * ld_moved - cumulative load moved across iterations */ cur_ld_moved = detach_tasks(&env); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + if (cur_ld_moved) + update_capacity_of(env.src_cpu); /* * We've detached some tasks from busiest_rq. Every @@ -8037,8 +8051,13 @@ static int active_load_balance_cpu_stop(void *data) schedstat_inc(sd, alb_count); p = detach_one_task(&env); - if (p) + if (p) { schedstat_inc(sd, alb_pushed); + /* + * We want to potentially lower env.src_cpu's OPP. + */ + update_capacity_of(env.src_cpu); + } else schedstat_inc(sd, alb_failed); } From 8bd4863f301d6eb99f7a20651738c16c5f4edbab Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 25 Jun 2015 14:12:33 +0100 Subject: [PATCH 46/71] sched/fair: jump to max OPP when crossing UP threshold Since the true utilization of a long running task is not detectable while it is running and might be bigger than the current cpu capacity, create the maximum cpu capacity head room by requesting the maximum cpu capacity once the cpu usage plus the capacity margin exceeds the current capacity. This is also done to try to harm the performance of a task the least. Original fair-class only version authored by Juri Lelli . cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Juri Lelli Signed-off-by: Steve Muckle --- kernel/sched/core.c | 41 ++++++++++++++++++++++++++ kernel/sched/fair.c | 66 ------------------------------------------ kernel/sched/sched.h | 68 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 66 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 23823e329c11..d60a5feb4f48 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2854,6 +2854,45 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CPU_FREQ_GOV_SCHED +static unsigned long sum_capacity_reqs(unsigned long cfs_cap, + struct sched_capacity_reqs *scr) +{ + unsigned long total = cfs_cap + scr->rt; + + total = total * capacity_margin; + total /= SCHED_CAPACITY_SCALE; + total += scr->dl; + return total; +} + +static void sched_freq_tick(int cpu) +{ + struct sched_capacity_reqs *scr; + unsigned long capacity_orig, capacity_curr; + + if (!sched_freq()) + return; + + capacity_orig = capacity_orig_of(cpu); + capacity_curr = capacity_curr_of(cpu); + if (capacity_curr == capacity_orig) + return; + + /* + * To make free room for a task that is building up its "real" + * utilization and to harm its performance the least, request + * a jump to max OPP as soon as the margin of free capacity is + * impacted (specified by capacity_margin). + */ + scr = &per_cpu(cpu_sched_capacity_reqs, cpu); + if (capacity_curr < sum_capacity_reqs(cpu_util(cpu), scr)) + set_cfs_cpu_capacity(cpu, true, capacity_max); +} +#else +static inline void sched_freq_tick(int cpu) { } +#endif + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -2880,6 +2919,8 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); + + sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 950046b1b8eb..bb1d0e3d7835 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,9 +4144,6 @@ static inline void hrtick_update(struct rq *rq) } #endif -static unsigned long capacity_orig_of(int cpu); -static int cpu_util(int cpu); - static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4521,15 +4518,6 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - -static unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} static unsigned long cpu_avg_load_per_task(int cpu) { @@ -4698,60 +4686,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - -/* - * cpu_util returns the amount of capacity of a CPU that is used by CFS - * tasks. The unit of the return value must be the one of capacity so we can - * compare the utilization with the capacity of the CPU that is available for - * CFS task (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). - */ -static unsigned long __cpu_util(int cpu, int delta) -{ - unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; - unsigned long capacity = capacity_orig_of(cpu); - - delta += util; - if (delta < 0) - return 0; - - return (delta >= capacity) ? capacity : delta; -} - -static unsigned long cpu_util(int cpu) -{ - return __cpu_util(cpu, 0); -} - static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0052ada93ae8..6aaff682adb8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1456,7 +1456,75 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) } #endif +#ifdef CONFIG_SMP +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +/* + * cpu_util returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the utilization with the capacity of the CPU that is available for + * CFS task (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). + */ +static inline unsigned long __cpu_util(int cpu, int delta) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long cpu_util(int cpu) +{ + return __cpu_util(cpu, 0); +} + +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +static inline unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + +#endif + #ifdef CONFIG_CPU_FREQ_GOV_SCHED +#define capacity_max SCHED_CAPACITY_SCALE extern unsigned int capacity_margin; extern struct static_key __sched_freq; From 61b0e4f12648a567ee648b74dfae2e1d3ccbc314 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 25 Nov 2015 15:59:25 -0800 Subject: [PATCH 47/71] sched/cpufreq_sched: add trace events Trace events will aid in debugging, profiling and tuning. Signed-off-by: Steve Muckle --- include/trace/events/cpufreq_sched.h | 87 ++++++++++++++++++++++++++++ kernel/sched/cpufreq_sched.c | 9 +++ 2 files changed, 96 insertions(+) create mode 100644 include/trace/events/cpufreq_sched.h diff --git a/include/trace/events/cpufreq_sched.h b/include/trace/events/cpufreq_sched.h new file mode 100644 index 000000000000..a46cd088e969 --- /dev/null +++ b/include/trace/events/cpufreq_sched.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2015 Steve Muckle + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM cpufreq_sched + +#if !defined(_TRACE_CPUFREQ_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CPUFREQ_SCHED_H + +#include +#include + +TRACE_EVENT(cpufreq_sched_throttled, + TP_PROTO(unsigned int rem), + TP_ARGS(rem), + TP_STRUCT__entry( + __field( unsigned int, rem) + ), + TP_fast_assign( + __entry->rem = rem; + ), + TP_printk("throttled - %d usec remaining", __entry->rem) +); + +TRACE_EVENT(cpufreq_sched_request_opp, + TP_PROTO(int cpu, + unsigned long capacity, + unsigned int freq_new, + unsigned int requested_freq), + TP_ARGS(cpu, capacity, freq_new, requested_freq), + TP_STRUCT__entry( + __field( int, cpu) + __field( unsigned long, capacity) + __field( unsigned int, freq_new) + __field( unsigned int, requested_freq) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->capacity = capacity; + __entry->freq_new = freq_new; + __entry->requested_freq = requested_freq; + ), + TP_printk("cpu %d cap change, cluster cap request %ld => OPP %d " + "(cur %d)", + __entry->cpu, __entry->capacity, __entry->freq_new, + __entry->requested_freq) +); + +TRACE_EVENT(cpufreq_sched_update_capacity, + TP_PROTO(int cpu, + bool request, + struct sched_capacity_reqs *scr, + unsigned long new_capacity), + TP_ARGS(cpu, request, scr, new_capacity), + TP_STRUCT__entry( + __field( int, cpu) + __field( bool, request) + __field( unsigned long, cfs) + __field( unsigned long, rt) + __field( unsigned long, dl) + __field( unsigned long, total) + __field( unsigned long, new_total) + ), + TP_fast_assign( + __entry->cpu = cpu; + __entry->request = request; + __entry->cfs = scr->cfs; + __entry->rt = scr->rt; + __entry->dl = scr->dl; + __entry->total = scr->total; + __entry->new_total = new_capacity; + ), + TP_printk("cpu=%d set_cap=%d cfs=%ld rt=%ld dl=%ld old_tot=%ld " + "new_tot=%ld", + __entry->cpu, __entry->request, __entry->cfs, __entry->rt, + __entry->dl, __entry->total, __entry->new_total) +); + +#endif /* _TRACE_CPUFREQ_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 58bca8d2ca65..5afe56a82491 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -14,6 +14,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "sched.h" #define THROTTLE_NSEC 50000000 /* 50ms default */ @@ -78,6 +81,7 @@ static bool finish_last_request(struct gov_data *gd) int usec_left = ktime_to_ns(ktime_sub(gd->throttle, now)); usec_left /= NSEC_PER_USEC; + trace_cpufreq_sched_throttled(usec_left); usleep_range(usec_left, usec_left + 100); now = ktime_get(); if (ktime_after(now, gd->throttle)) @@ -186,6 +190,9 @@ static void update_fdomain_capacity_request(int cpu) goto out; freq_new = policy->freq_table[index_new].frequency; + trace_cpufreq_sched_request_opp(cpu, capacity, freq_new, + gd->requested_freq); + if (freq_new == gd->requested_freq) goto out; @@ -222,6 +229,8 @@ void update_cpu_capacity_request(int cpu, bool request) if (new_capacity == scr->total) return; + trace_cpufreq_sched_update_capacity(cpu, request, scr, new_capacity); + scr->total = new_capacity; if (request) update_fdomain_capacity_request(cpu); From e724121586dfa7df4301dedaf7af5ce7facaf4a2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 20 Oct 2015 10:46:26 +0200 Subject: [PATCH 48/71] sched: remove call of sched_avg_update from sched_rt_avg_update rt_avg is only used to scale the available CPU's capacity for CFS tasks. As the update of this scaling is done during periodic load balance, we only have to ensure that sched_avg_update has been called before any periodic load balancing. This requirement is already fulfilled by __update_cpu_load so the call in sched_rt_avg_update, which is part of the hotpath, is useless. Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/sched.h | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6aaff682adb8..770e119ca692 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1578,7 +1578,6 @@ static inline void set_dl_cpu_capacity(int cpu, bool request, static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); - sched_avg_update(rq); } #else static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } From a4de8df5ba84d819832509c03a0f58f4fbcc76e4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Nov 2015 10:39:01 +0100 Subject: [PATCH 49/71] sched: deadline: use deadline bandwidth in scale_rt_capacity Instead of monitoring the exec time of deadline tasks to evaluate the CPU capacity consumed by deadline scheduler class, we can directly calculate it thanks to the sum of utilization of deadline tasks on the CPU. We can remove deadline tasks from rt_avg metric and directly use the average bandwidth of deadline scheduler in scale_rt_capacity. Based in part on a similar patch from Luca Abeni . Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/deadline.c | 33 +++++++++++++++++++++++++++++++-- kernel/sched/fair.c | 8 ++++++++ kernel/sched/sched.h | 2 ++ 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..9d9eb50d4059 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,24 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw += se_bw; +} + +static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + u64 se_bw = dl_se->dl_bw; + + dl_rq->avg_bw -= se_bw; + if (dl_rq->avg_bw < 0) { + WARN_ON(1); + dl_rq->avg_bw = 0; + } +} + static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq) { struct sched_dl_entity *dl_se = &p->dl; @@ -494,6 +512,9 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + if (dl_se->dl_new) + add_average_bw(dl_se, dl_rq); + /* * The arrival of a new instance needs special treatment, i.e., * the actual scheduling parameters have to be "renewed". @@ -741,8 +762,6 @@ static void update_curr_dl(struct rq *rq) curr->se.exec_start = rq_clock_task(rq); cpuacct_charge(curr, delta_exec); - sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; if (dl_runtime_exceeded(dl_se)) { dl_se->dl_throttled = 1; @@ -1241,6 +1260,8 @@ static void task_fork_dl(struct task_struct *p) static void task_dead_dl(struct task_struct *p) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); + struct dl_rq *dl_rq = dl_rq_of_se(&p->dl); + struct rq *rq = rq_of_dl_rq(dl_rq); /* * Since we are TASK_DEAD we won't slip out of the domain! @@ -1249,6 +1270,8 @@ static void task_dead_dl(struct task_struct *p) /* XXX we should retain the bw until 0-lag */ dl_b->total_bw -= p->dl.dl_bw; raw_spin_unlock_irq(&dl_b->lock); + + clear_average_bw(&p->dl, &rq->dl); } static void set_curr_task_dl(struct rq *rq) @@ -1556,7 +1579,9 @@ static int push_dl_task(struct rq *rq) } deactivate_task(rq, next_task, 0); + clear_average_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); + add_average_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -1644,7 +1669,9 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + clear_average_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); + add_average_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -1750,6 +1777,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) if (!start_dl_timer(p)) __dl_clear_params(p); + clear_average_bw(&p->dl, &rq->dl); + /* * Since this might be the only -deadline task on the rq, * this is the right place to try to pull some other one diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb1d0e3d7835..651e160cdd67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6550,6 +6550,14 @@ static unsigned long scale_rt_capacity(int cpu) used = div_u64(avg, total); + /* + * deadline bandwidth is defined at system level so we must + * weight this bandwidth with the max capacity of the system. + * As a reminder, avg_bw is 20bits width and + * scale_cpu_capacity is 10 bits width + */ + used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu)); + if (likely(used < SCHED_CAPACITY_SCALE)) return SCHED_CAPACITY_SCALE - used; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 770e119ca692..983c132af11b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -506,6 +506,8 @@ struct dl_rq { #else struct dl_bw dl_bw; #endif + /* This is the "average utilization" for this runqueue */ + s64 avg_bw; }; #ifdef CONFIG_SMP From 778431874e9815c04d82d30630c5c0400d4e53db Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 26 Oct 2015 18:14:50 +0100 Subject: [PATCH 50/71] sched: rt scheduler sets capacity requirement RT tasks don't provide any running constraints like deadline ones except their running priority. The only current usable input to estimate the capacity needed by RT tasks is the rt_avg metric. We use it to estimate the CPU capacity needed for the RT scheduler class. In order to monitor the evolution for RT task load, we must peridiocally check it during the tick. Then, we use the estimated capacity of the last activity to estimate the next one which can not be that accurate but is a good starting point without any impact on the wake up path of RT tasks. Signed-off-by: Vincent Guittot Signed-off-by: Steve Muckle --- kernel/sched/rt.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8ec86abe0ea1..9694204660b7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1426,6 +1426,41 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } +#ifdef CONFIG_SMP +static void sched_rt_update_capacity_req(struct rq *rq) +{ + u64 total, used, age_stamp, avg; + s64 delta; + + if (!sched_freq()) + return; + + sched_avg_update(rq); + /* + * Since we're reading these variables without serialization make sure + * we read them once before doing sanity checks on them. + */ + age_stamp = READ_ONCE(rq->age_stamp); + avg = READ_ONCE(rq->rt_avg); + delta = rq_clock(rq) - age_stamp; + + if (unlikely(delta < 0)) + delta = 0; + + total = sched_avg_period() + delta; + + used = div_u64(avg, total); + if (unlikely(used > SCHED_CAPACITY_SCALE)) + used = SCHED_CAPACITY_SCALE; + + set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used)); +} +#else +static inline void sched_rt_update_capacity_req(struct rq *rq) +{ } + +#endif + static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { @@ -1494,8 +1529,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (prev->sched_class == &rt_sched_class) update_curr_rt(rq); - if (!rt_rq->rt_queued) + if (!rt_rq->rt_queued) { + /* + * The next task to be picked on this rq will have a lower + * priority than rt tasks so we can spend some time to update + * the capacity used by rt tasks based on the last activity. + * This value will be the used as an estimation of the next + * activity. + */ + sched_rt_update_capacity_req(rq); return NULL; + } put_prev_task(rq, prev); @@ -2212,6 +2256,9 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) update_curr_rt(rq); + if (rq->rt.rt_nr_running) + sched_rt_update_capacity_req(rq); + watchdog(rq, p); /* From 7493d0ec81f075ac4de3bbd5c0317e8715b935c1 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:55:51 +0000 Subject: [PATCH 51/71] fixup! sched: scheduler-driven cpu frequency selection Signed-off-by: Juri Lelli --- kernel/sched/cpufreq_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 5afe56a82491..e1d208e101ed 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -119,9 +119,9 @@ static int cpufreq_sched_thread(void *data) } do { - set_current_state(TASK_INTERRUPTIBLE); new_request = gd->requested_freq; if (new_request == last_request) { + set_current_state(TASK_INTERRUPTIBLE); schedule(); } else { /* From b1d147f7a93900b6f93b4dc1c927b638fc0b40f1 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 11 Dec 2015 11:58:05 +0000 Subject: [PATCH 52/71] fixup! sched/fair: jump to max OPP when crossing UP threshold Signed-off-by: Juri Lelli --- kernel/sched/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d60a5feb4f48..b37b3b5ae63e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2910,6 +2910,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); update_cpu_load_active(rq); calc_global_load_tick(rq); + sched_freq_tick(cpu); raw_spin_unlock(&rq->lock); perf_event_task_tick(); @@ -2919,8 +2920,6 @@ void scheduler_tick(void) trigger_load_balance(rq); #endif rq_last_tick_reset(rq); - - sched_freq_tick(cpu); } #ifdef CONFIG_NO_HZ_FULL From 62d5c406ae5f97318744b9ba2fc1da13e8cb13ce Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 30 Jun 2015 12:03:26 +0100 Subject: [PATCH 53/71] sched/tune: add detailed documentation The topic of a single simple power-performance tunable, that is wholly scheduler centric, and has well defined and predictable properties has come up on several occasions in the past. With techniques such as a scheduler driven DVFS, we now have a good framework for implementing such a tunable. This patch provides a detailed description of the motivations and design decisions behind the implementation of the SchedTune. cc: Jonathan Corbet cc: linux-doc@vger.kernel.org Signed-off-by: Patrick Bellasi --- Documentation/scheduler/sched-tune.txt | 367 +++++++++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100644 Documentation/scheduler/sched-tune.txt diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt new file mode 100644 index 000000000000..cb795e643eba --- /dev/null +++ b/Documentation/scheduler/sched-tune.txt @@ -0,0 +1,367 @@ + Central, scheduler-driven, power-performance control + (EXPERIMENTAL) + +Abstract +======== + +The topic of a single simple power-performance tunable, that is wholly +scheduler centric, and has well defined and predictable properties has come up +on several occasions in the past [1,2]. With techniques such as a scheduler +driven DVFS [3], we now have a good framework for implementing such a tunable. +This document describes the overall ideas behind its design and implementation. + + +Table of Contents +================= + +1. Motivation +2. Introduction +3. Signal Boosting Strategy +4. OPP selection using boosted CPU utilization +5. Per task group boosting +6. Question and Answers + - What about "auto" mode? + - What about boosting on a congested system? + - How CPUs are boosted when we have tasks with multiple boost values? +7. References + + +1. Motivation +============= + +Sched-DVFS [3] is a new event-driven cpufreq governor which allows the +scheduler to select the optimal DVFS operating point (OPP) for running a task +allocated to a CPU. The introduction of sched-DVFS enables running workloads at +the most energy efficient OPPs. + +However, sometimes it may be desired to intentionally boost the performance of +a workload even if that could imply a reasonable increase in energy +consumption. For example, in order to reduce the response time of a task, we +may want to run the task at a higher OPP than the one that is actually required +by it's CPU bandwidth demand. + +This last requirement is especially important if we consider that one of the +main goals of the sched-DVFS component is to replace all currently available +CPUFreq policies. Since sched-DVFS is event based, as opposed to the sampling +driven governors we currently have, it is already more responsive at selecting +the optimal OPP to run tasks allocated to a CPU. However, just tracking the +actual task load demand may not be enough from a performance standpoint. For +example, it is not possible to get behaviors similar to those provided by the +"performance" and "interactive" CPUFreq governors. + +This document describes an implementation of a tunable, stacked on top of the +sched-DVFS which extends its functionality to support task performance +boosting. + +By "performance boosting" we mean the reduction of the time required to +complete a task activation, i.e. the time elapsed from a task wakeup to its +next deactivation (e.g. because it goes back to sleep or it terminates). For +example, if we consider a simple periodic task which executes the same workload +for 5[s] every 20[s] while running at a certain OPP, a boosted execution of +that task must complete each of its activations in less than 5[s]. + +A previous attempt [5] to introduce such a boosting feature has not been +successful mainly because of the complexity of the proposed solution. The +approach described in this document exposes a single simple interface to +user-space. This single tunable knob allows the tuning of system wide +scheduler behaviours ranging from energy efficiency at one end through to +incremental performance boosting at the other end. This first tunable affects +all tasks. However, a more advanced extension of the concept is also provided +which uses CGroups to boost the performance of only selected tasks while using +the energy efficient default for all others. + +The rest of this document introduces in more details the proposed solution +which has been named SchedTune. + + +2. Introduction +=============== + +SchedTune exposes a simple user-space interface with a single power-performance +tunable: + + /proc/sys/kernel/sched_cfs_boost + +This permits expressing a boost value as an integer in the range [0..100]. + +A value of 0 (default) configures the CFS scheduler for maximum energy +efficiency. This means that sched-DVFS runs the tasks at the minimum OPP +required to satisfy their workload demand. +A value of 100 configures scheduler for maximum performance, which translates +to the selection of the maximum OPP on that CPU. + +The range between 0 and 100 can be set to satisfy other scenarios suitably. For +example to satisfy interactive response or depending on other system events +(battery level etc). + +A CGroup based extension is also provided, which permits further user-space +defined task classification to tune the scheduler for different goals depending +on the specific nature of the task, e.g. background vs interactive vs +low-priority. + +The overall design of the SchedTune module is built on top of "Per-Entity Load +Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating +Performance Point (OPP) selection. +Each time a task is allocated on a CPU, sched-DVFS has the opportunity to tune +the operating frequency of that CPU to better match the workload demand. The +selection of the actual OPP being activated is influenced by the global boost +value, or the boost value for the task CGroup when in use. + +This simple biasing approach leverages existing frameworks, which means minimal +modifications to the scheduler, and yet it allows to achieve a range of +different behaviours all from a single simple tunable knob. +The only new concept introduced is that of signal boosting. + + +3. Signal Boosting Strategy +=========================== + +The whole PELT machinery works based on the value of a few load tracking signals +which basically track the CPU bandwidth requirements for tasks and the capacity +of CPUs. The basic idea behind the SchedTune knob is to artificially inflate +some of these load tracking signals to make a task or RQ appears more demanding +that it actually is. + +Which signals have to be inflated depends on the specific "consumer". However, +independently from the specific (signal, consumer) pair, it is important to +define a simple and possibly consistent strategy for the concept of boosting a +signal. + +A boosting strategy defines how the "abstract" user-space defined +sched_cfs_boost value is translated into an internal "margin" value to be added +to a signal to get its inflated value: + + margin := boosting_strategy(sched_cfs_boost, signal) + boosted_signal := signal + margin + +Different boosting strategies were identified and analyzed before selecting the +one found to be most effective. + +Signal Proportional Compensation (SPC) +-------------------------------------- + +In this boosting strategy the sched_cfs_boost value is used to compute a +margin which is proportional to the complement of the original signal. +When a signal has a maximum possible value, its complement is defined as +the delta from the actual value and its possible maximum. + +Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as +the maximum possible value, the margin becomes: + + margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal) + +Using this boosting strategy: +- a 100% sched_cfs_boost means that the signal is scaled to the maximum value +- each value in the range of sched_cfs_boost effectively inflates the signal in + question by a quantity which is proportional to the maximum value. + +For example, by applying the SPC boosting strategy to the selection of the OPP +to run a task it is possible to achieve these behaviors: + +- 0% boosting: run the task at the minimum OPP required by its workload +- 100% boosting: run the task at the maximum OPP available for the CPU +- 50% boosting: run at the half-way OPP between minimum and maximum + +Which means that, at 50% boosting, a task will be scheduled to run at half of +the maximum theoretically achievable performance on the specific target +platform. + +A graphical representation of an SPC boosted signal is represented in the +following figure where: + a) "-" represents the original signal + b) "b" represents a 50% boosted signal + c) "p" represents a 100% boosted signal + + + ^ + | SCHED_LOAD_SCALE + +-----------------------------------------------------------------+ + |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp + | + | boosted_signal + | bbbbbbbbbbbbbbbbbbbbbbbb + | + | original signal + | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+ + | | + |bbbbbbbbbbbbbbbbbb | + | | + | | + | | + | +-----------------------+ + | | + | | + | | + |------------------+ + | + | + +-----------------------------------------------------------------------> + +The plot above shows a ramped load signal (titled 'original_signal') and it's +boosted equivalent. For each step of the original signal the boosted signal +corresponding to a 50% boost is midway from the original signal and the upper +bound. Boosting by 100% generates a boosted signal which is always saturated to +the upper bound. + + +4. OPP selection using boosted CPU utilization +============================================== + +It is worth calling out that the implementation does not introduce any new load +signals. Instead, it provides an API to tune existing signals. This tuning is +done on demand and only in scheduler code paths where it is sensible to do so. +The new API calls are defined to return either the default signal or a boosted +one, depending on the value of sched_cfs_boost. This is a clean an non invasive +modification of the existing existing code paths. + +The signal representing a CPU's utilization is boosted according to the +previously described SPC boosting strategy. To sched-DVFS, this allows a CPU +(ie CFS run-queue) to appear more used then it actually is. + +Thus, with the sched_cfs_boost enabled we have the following main functions to +get the current utilization of a CPU: + + cpu_util() + boosted_cpu_util() + +The new boosted_cpu_util() is similar to the first but returns a boosted +utilization signal which is a function of the sched_cfs_boost value. + +This function is used in the CFS scheduler code paths where sched-DVFS needs to +decide the OPP to run a CPU at. +For example, this allows selecting the highest OPP for a CPU which has +the boost value set to 100%. + + +5. Per task group boosting +========================== + +The availability of a single knob which is used to boost all tasks in the +system is certainly a simple solution but it quite likely doesn't fit many +utilization scenarios, especially in the mobile device space. + +For example, on battery powered devices there usually are many background +services which are long running and need energy efficient scheduling. On the +other hand, some applications are more performance sensitive and require an +interactive response and/or maximum performance, regardless of the energy cost. +To better service such scenarios, the SchedTune implementation has an extension +that provides a more fine grained boosting interface. + +A new CGroup controller, namely "schedtune", could be enabled which allows to +defined and configure task groups with different boosting values. +Tasks that require special performance can be put into separate CGroups. +The value of the boost associated with the tasks in this group can be specified +using a single knob exposed by the CGroup controller: + + schedtune.boost + +This knob allows the definition of a boost value that is to be used for +SPC boosting of all tasks attached to this group. + +The current schedtune controller implementation is really simple and has these +main characteristics: + + 1) It is only possible to create 1 level depth hierarchies + + The root control groups define the system-wide boost value to be applied + by default to all tasks. Its direct subgroups are named "boost groups" and + they define the boost value for specific set of tasks. + Further nested subgroups are not allowed since they do not have a sensible + meaning from a user-space standpoint. + + 2) It is possible to define only a limited number of "boost groups" + + This number is defined at compile time and by default configured to 16. + This is a design decision motivated by two main reasons: + a) In a real system we do not expect utilization scenarios with more then few + boost groups. For example, a reasonable collection of groups could be + just "background", "interactive" and "performance". + b) It simplifies the implementation considerably, especially for the code + which has to compute the per CPU boosting once there are multiple + RUNNABLE tasks with different boost values. + +Such a simple design should allow servicing the main utilization scenarios identified +so far. It provides a simple interface which can be used to manage the +power-performance of all tasks or only selected tasks. +Moreover, this interface can be easily integrated by user-space run-times (e.g. +Android, ChromeOS) to implement a QoS solution for task boosting based on tasks +classification, which has been a long standing requirement. + +Setup and usage +--------------- + +0. Use a kernel with CGROUP_SCHEDTUNE support enabled + +1. Check that the "schedtune" CGroup controller is available: + + root@linaro-nano:~# cat /proc/cgroups + #subsys_name hierarchy num_cgroups enabled + cpuset 0 1 1 + cpu 0 1 1 + schedtune 0 1 1 + +2. Mount a tmpfs to create the CGroups mount point (Optional) + + root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup + +3. Mount the "schedtune" controller + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune + root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune + +4. Setup the system-wide boost value (Optional) + + If not configured the root control group has a 0% boost value, which + basically disables boosting for all tasks in the system thus running in + an energy-efficient mode. + + root@linaro-nano:~# echo $SYSBOOST > /sys/fs/cgroup/stune/schedtune.boost + +5. Create task groups and configure their specific boost value (Optional) + + For example here we create a "performance" boost group configure to boost + all its tasks to 100% + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance + root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost + +6. Move tasks into the boost group + + For example, the following moves the tasks with PID $TASKPID (and all its + threads) into the "performance" boost group. + + root@linaro-nano:~# echo "TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs + +This simple configuration allows only the threads of the $TASKPID task to run, +when needed, at the highest OPP in the most capable CPU of the system. + + +6. Question and Answers +======================= + +What about "auto" mode? +----------------------- + +The 'auto' mode as described in [5] can be implemented by interfacing SchedTune +with some suitable user-space element. This element could use the exposed +system-wide or cgroup based interface. + +How are multiple groups of tasks with different boost values managed? +--------------------------------------------------------------------- + +The current SchedTune implementation keeps track of the boosted RUNNABLE tasks +on a CPU. Once sched-DVFS selects the OPP to run a CPU at, the CPU utilization +is boosted with a value which is the maximum of the boost values of the +currently RUNNABLE tasks in its RQ. + +This allows sched-DVFS to boost a CPU only while there are boosted tasks ready +to run and switch back to the energy efficient mode as soon as the last boosted +task is dequeued. + + +7. References +============= +[1] http://lwn.net/Articles/552889 +[2] http://lkml.org/lkml/2012/5/18/91 +[3] http://lkml.org/lkml/2015/6/26/620 + From a6332cfa791c2b39c77ef778323963f513694602 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:11:44 +0100 Subject: [PATCH 54/71] sched/tune: add sysctl interface to define a boost value The current (CFS) scheduler implementation does not allow "to boost" tasks performance by running them at a higher OPP compared to the minimum required to meet their workload demands. To support tasks performance boosting the scheduler should provide a "knob" which allows to tune how much the system is going to be optimised for energy efficiency vs performance. This patch is the first of a series which provides a simple interface to define a tuning knob. One system-wide "boost" tunable is exposed via: /proc/sys/kernel/sched_cfs_boost which can be configured in the range [0..100], to define a percentage where: - 0% boost requires to operate in "standard" mode by scheduling tasks at the minimum capacities required by the workload demand - 100% boost requires to push at maximum the task performances, "regardless" of the incurred energy consumption A boost value in between these two boundaries is used to bias the power/performance trade-off, the higher the boost value the more the scheduler is biased toward performance boosting instead of energy efficiency. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/sched/sysctl.h | 16 ++++++++++++++++ init/Kconfig | 26 ++++++++++++++++++++++++++ kernel/sched/Makefile | 1 + kernel/sched/tune.c | 17 +++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 5 files changed, 71 insertions(+) create mode 100644 kernel/sched/tune.c diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index c9e4731cf10b..4479e48c7712 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -77,6 +77,22 @@ extern int sysctl_sched_rt_runtime; extern unsigned int sysctl_sched_cfs_bandwidth_slice; #endif +#ifdef CONFIG_SCHED_TUNE +extern unsigned int sysctl_sched_cfs_boost; +int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos); +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return sysctl_sched_cfs_boost; +} +#else +static inline unsigned int get_sysctl_sched_cfs_boost(void) +{ + return 0; +} +#endif + #ifdef CONFIG_SCHED_AUTOGROUP extern unsigned int sysctl_sched_autogroup_enabled; #endif diff --git a/init/Kconfig b/init/Kconfig index 235c7a2c0d20..83d8c02e7a57 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1237,6 +1237,32 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TUNE + bool "Boosting for CFS tasks (EXPERIMENTAL)" + help + This option enables the system-wide support for task boosting. + When this support is enabled a new sysctl interface is exposed to + userspace via: + /proc/sys/kernel/sched_cfs_boost + which allows to set a system-wide boost value in range [0..100]. + + The currently boosting strategy is implemented in such a way that: + - a 0% boost value requires to operate in "standard" mode by + scheduling all tasks at the minimum capacities required by their + workload demand + - a 100% boost value requires to push at maximum the task + performances, "regardless" of the incurred energy consumption + + A boost value in between these two boundaries is used to bias the + power/performance trade-off, the higher the boost value the more the + scheduler is biased toward performance boosting instead of energy + efficiency. + + Since this support exposes a single system-wide knob, the specified + boost value is applied to all (CFS) tasks in the system. + + If unsure, say N. + config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 0eabc9db4c3d..c6a85f813dfd 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c new file mode 100644 index 000000000000..4c44b1a4ad98 --- /dev/null +++ b/kernel/sched/tune.c @@ -0,0 +1,17 @@ +#include "sched.h" + +unsigned int sysctl_sched_cfs_boost __read_mostly; + +int +sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + return 0; +} + diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..827ba256fdad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -434,6 +434,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#ifdef CONFIG_SCHED_TUNE + { + .procname = "sched_cfs_boost", + .data = &sysctl_sched_cfs_boost, + .maxlen = sizeof(sysctl_sched_cfs_boost), + .mode = 0644, + .proc_handler = &sysctl_sched_cfs_boost_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", From 82a3b9a3c6b40c8048b73e04aab04603cecf8ca1 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 18:32:36 +0100 Subject: [PATCH 55/71] sched/fair: add function to convert boost value into "margin" The basic idea of the boost knob is to "artificially inflate" a signal to make a task or logical CPU appears more demanding than it actually is. Independently from the specific signal, a consistent and possibly simple semantic for the concept of "signal boosting" must define: 1. how we translate the boost percentage into a "margin" value to be added to the original signal to inflate 2. what is the meaning of a boost value from a user-space perspective This patch provides the implementation of a possible boost semantic, named "Signal Proportional Compensation" (SPC), where the boost percentage (BP) is used to compute a margin (M) which is proportional to the complement of the original signal (OS): M = BP * (SCHED_LOAD_SCALE - OS) The computed margin then added to the OS to obtain the Boosted Signal (BS) BS = OS + M The proposed boost semantic has these main features: - each signal gets a boost which is proportional to its delta with respect to the maximum available capacity in the system (i.e. SCHED_LOAD_SCALE) - a 100% boosting has a clear understanding from a user-space perspective, since it means simply to run (possibly) "all" tasks at the max OPP - each boosting value means to improve the task performance by a quantity which is proportional to the maximum achievable performance on that system Thus this semantics is somehow forcing a behaviour which is: 50% boosting means to run at half-way between the current and the maximum performance which a task could achieve on that system This patch provides the code to implement a fast integer division to convert a boost percentage (BP) value into a margin (M). NOTE: this code is suitable for all signals operating in range [0..SCHED_LOAD_SCALE] cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 651e160cdd67..1b6a57837433 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5074,6 +5074,44 @@ static bool cpu_overutilized(int cpu) return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); } +#ifdef CONFIG_SCHED_TUNE + +static unsigned long +schedtune_margin(unsigned long signal, unsigned long boost) +{ + unsigned long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_LOAD_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + margin = SCHED_LOAD_SCALE - signal; + margin *= boost; + + /* + * Fast integer division by constant: + * Constant : (C) = 100 + * Precision : 0.1% (P) = 0.1 + * Reference : C * 100 / P (R) = 100000 + * + * Thus: + * Shift bits : ceil(log(R,2)) (S) = 17 + * Mult const : round(2^S/C) (M) = 1311 + * + * + */ + margin *= 1311; + margin >>= 17; + + return margin; +} + +#endif /* CONFIG_SCHED_TUNE */ + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From d24dad20431ada01a0f60de20b9eabd6f2c441ac Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 26 Jun 2015 09:55:06 +0100 Subject: [PATCH 56/71] sched/fair: add boosted CPU usage The CPU usage signal is used by the scheduler as an estimation of the overall bandwidth currently allocated on a CPU. When SchedDVFS is in use, this signal affects the selection of the operating points (OPP) required to accommodate all the workload allocated in a CPU. A convenient way to boost the performance of tasks running on a CPU, which is also little intrusive, is to boost the CPU usage signal each time it is used to select an OPP. This patch introduces a new function: get_boosted_cpu_usage(cpu) to return a boosted value for the usage of a specified CPU. The margin added to the original usage is: 1. computed based on the "boosting strategy" in use 2. proportional to the system-wide boost value defined by provided user-space interface The boosted signal is used by SchedDVFS (transparently) each time it requires to get an estimation of the capacity required for a CPU. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1b6a57837433..eff548fae43e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4144,6 +4144,8 @@ static inline void hrtick_update(struct rq *rq) } #endif +static inline unsigned long boosted_cpu_util(int cpu); + static void update_capacity_of(int cpu) { unsigned long req_cap; @@ -4152,7 +4154,8 @@ static void update_capacity_of(int cpu) return; /* Convert scale-invariant capacity to cpu. */ - req_cap = cpu_util(cpu) * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); + req_cap = boosted_cpu_util(cpu); + req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); } @@ -5110,8 +5113,36 @@ schedtune_margin(unsigned long signal, unsigned long boost) return margin; } +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + unsigned int boost = get_sysctl_sched_cfs_boost(); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline unsigned int +schedtune_cpu_margin(unsigned long util) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ +static inline unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util(cpu); + unsigned long margin = schedtune_cpu_margin(util); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. From 10e34a4dd975b27ea2874e60e6a8ffd0448d30d5 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 23 Jun 2015 09:17:54 +0100 Subject: [PATCH 57/71] sched/tune: add initial support for CGroups based boosting To support task performance boosting, the usage of a single knob has the advantage to be a simple solution, both from the implementation and the usability standpoint. However, on a real system it can be difficult to identify a single value for the knob which fits the needs of multiple different tasks. For example, some kernel threads and/or user-space background services should be better managed the "standard" way while we still want to be able to boost the performance of specific workloads. In order to improve the flexibility of the task boosting mechanism this patch is the first of a small series which extends the previous implementation to introduce a "per task group" support. This first patch introduces just the basic CGroups support, a new "schedtune" CGroups controller is added which allows to configure different boost value for different groups of tasks. To keep the implementation simple but still effective for a boosting strategy, the new controller: 1. allows only a two layer hierarchy 2. supports only a limited number of boost groups A two layer hierarchy allows to place each task either: a) in the root control group thus being subject to a system-wide boosting value b) in a child of the root group thus being subject to the specific boost value defined by that "boost group" The limited number of "boost groups" supported is mainly motivated by the observation that in a real system it could be useful to have only few classes of tasks which deserve different treatment. For example, background vs foreground or interactive vs low-priority. As an additional benefit, a limited number of boost groups allows also to have a simpler implementation especially for the code required to compute the boost value for CPUs which have runnable tasks belonging to different boost groups. cc: Tejun Heo cc: Li Zefan cc: Johannes Weiner cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- include/linux/cgroup_subsys.h | 4 + init/Kconfig | 17 +++ kernel/sched/tune.c | 223 ++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 4 + 4 files changed, 248 insertions(+) diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 1a96fdaa33d5..e133705d794a 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -26,6 +26,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_CGROUP_SCHEDTUNE) +SUBSYS(schedtune) +#endif + #if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(io) #endif diff --git a/init/Kconfig b/init/Kconfig index 83d8c02e7a57..5d9097e2b805 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -999,6 +999,23 @@ config CGROUP_CPUACCT Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a cgroup. +config CGROUP_SCHEDTUNE + bool "CFS tasks boosting cgroup subsystem (EXPERIMENTAL)" + depends on SCHED_TUNE + help + This option provides the "schedtune" controller which improves the + flexibility of the task boosting mechanism by introducing the support + to define "per task" boost values. + + This new controller: + 1. allows only a two layers hierarchy, where the root defines the + system-wide boost value and its direct childrens define each one a + different "class of tasks" to be boosted with a different value + 2. supports up to 16 different task classes, each one which could be + configured with a different boost value + + Say N if unsure. + config PAGE_COUNTER bool diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 4c44b1a4ad98..c4b611424376 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,230 @@ +#include +#include +#include +#include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; +#ifdef CONFIG_CGROUP_SCHEDTUNE + +/* + * EAS scheduler tunables for task groups. + */ + +/* SchdTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to defined a system-wide boosting tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not into a child control group. + */ +static struct schedtune +root_schedtune = { + .boost = 0, +}; + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only few classes of workloads which + * make sense to boost with different values (e.g. background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 4 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact on CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. + */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + unsigned boost_max; + struct { + /* The boost for tasks on that boost group */ + unsigned boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static u64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 boost) +{ + struct schedtune *st = css_st(css); + + if (boost < 0 || boost > 100) + return -EINVAL; + + st->boost = boost; + if (css == &root_schedtune.css) + sysctl_sched_cfs_boost = boost; + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_u64 = boost_read, + .write_u64 = boost_write, + }, + { } /* terminate */ +}; + +static int +schedtune_boostgroup_init(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = st; + + return 0; +} + +static int +schedtune_init(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + } + + pr_info(" schedtune configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + return 0; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) { + schedtune_init(); + return &root_schedtune.css; + } + + /* Allow only single level hierachies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per CPUs boost group support */ + st->idx = idx; + if (schedtune_boostgroup_init(st)) + goto release; + + return &st->css; + +release: + kfree(st); +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys schedtune_cgrp_subsys = { + .css_alloc = schedtune_css_alloc, + .css_free = schedtune_css_free, + .legacy_cftypes = files, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + int sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 827ba256fdad..98c0f6ef01f1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -439,7 +439,11 @@ static struct ctl_table kern_table[] = { .procname = "sched_cfs_boost", .data = &sysctl_sched_cfs_boost, .maxlen = sizeof(sysctl_sched_cfs_boost), +#ifdef CONFIG_CGROUP_SCHEDTUNE + .mode = 0444, +#else .mode = 0644, +#endif .proc_handler = &sysctl_sched_cfs_boost_handler, .extra1 = &zero, .extra2 = &one_hundred, From 8f8434ae70d59f7bb846beacea0f05e1db5144ad Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 12:31:35 +0000 Subject: [PATCH 58/71] sched/tune: compute and keep track of per CPU boost value When per task boosting is enabled, we could have multiple RUNNABLE tasks which are concurrently scheduled on the same CPU but each one with a different boost value. For example, we could have a scenarios like this: Task SchedTune CGroup Boost Value T1 root 0 T2 low-priority 10 T3 interactive 90 In these conditions we expect a CPU to be configured according to a proper "aggregation" of the required boost values for all the tasks currently scheduled on this CPU. A suitable aggregation function is the one which tracks the MAX boost value for all the tasks RUNNABLE on a CPU. This approach allows to always satisfy the most boost demanding task while at the same time: a) boosting all the concurrently scheduled tasks thus reducing potential co-scheduling side-effects on demanding tasks b) reduce the number of frequency switch requested towards SchedDVFS, thus being more friendly to architectures with slow frequency switching times Every time a task enters/exits the RQ of a CPU the max boost value should be updated considering all the boost groups currently "affecting" that CPU, i.e. which have at least one RUNNABLE task currently allocated on that CPU. This patch introduces the required support to keep track of the boost groups currently affecting CPUs. Thanks to the limited number of boost groups, a small and memory efficient per-cpu array of boost groups values (cpu_boost_groups) is used which is updated for each CPU entry by schedtune_boostgroup_update() but only when a schedtune CGroup boost value is updated. However, this is expected to be a rare operation, perhaps done just one time at system boot time. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 77 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index c4b611424376..be60b8d97dc4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -97,6 +97,67 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg; + unsigned boost_max; + int idx; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + boost_max = max(boost_max, bg->group[idx].boost); + } + + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + continue; + } + + /* Check if this update has decreased current max */ + if (cur_boost_max == old_boost && old_boost > boost) + schedtune_cpu_update(cpu); + } + + return 0; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -118,6 +179,9 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, if (css == &root_schedtune.css) sysctl_sched_cfs_boost = boost; + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + return 0; } @@ -133,9 +197,19 @@ static struct cftype files[] = { static int schedtune_boostgroup_init(struct schedtune *st) { + struct boost_groups *bg; + int cpu; + /* Keep track of allocated boost groups */ allocated_group[st->idx] = st; + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].boost = 0; + bg->group[st->idx].tasks = 0; + } + return 0; } @@ -203,6 +277,9 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) static void schedtune_boostgroup_release(struct schedtune *st) { + /* Reset this boost group */ + schedtune_boostgroup_update(st->idx, 0); + /* Keep track of allocated boost groups */ allocated_group[st->idx] = NULL; } From fa75e9d655a9ac65390512d234e6885ca817b865 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 7 Jul 2015 15:33:20 +0100 Subject: [PATCH 59/71] sched/{fair,tune}: track RUNNABLE tasks impact on per CPU boost value When per-task boosting is enabled, every time a task enters/exits a CPU its boost value could impact the currently selected OPP for that CPU. Thus, the "aggregated" boost value for that CPU potentially needs to be updated to match the current maximum boost value among all the tasks currently RUNNABLE on that CPU. This patch introduces the required support to keep track of which boost groups are impacting a CPU. Each time a task is enqueued/dequeued to/from a CPU its boost group is used to increment a per-cpu counter of RUNNABLE tasks on that CPU. Only when the number of runnable tasks for a specific boost group becomes 1 or 0 the corresponding boost group changes its effects on that CPU, specifically: a) boost_group::tasks == 1: this boost group starts to impact the CPU b) boost_group::tasks == 0: this boost group stops to impact the CPU In each of these two conditions the aggregation function: sched_cpu_update(cpu) could be required to run in order to identify the new maximum boost value required for the CPU. The proposed patch minimizes the number of times the aggregation function is executed while still providing the required support to always boost a CPU to the maximum boost value required by all its currently RUNNABLE tasks. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 17 +++++++--- kernel/sched/tune.c | 82 +++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 23 +++++++++++++ 3 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 kernel/sched/tune.h diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eff548fae43e..2dbe1ff0a90b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -34,6 +34,7 @@ #include #include "sched.h" +#include "tune.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -4210,6 +4211,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cpu_overutilized(rq->cpu)) rq->rd->overutilized = true; + schedtune_enqueue_task(p, cpu_of(rq)); + /* * We want to potentially trigger a freq switch * request only for tasks that are waking up; this is @@ -4279,6 +4282,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { sub_nr_running(rq, 1); + schedtune_dequeue_task(p, cpu_of(rq)); /* * We want to potentially trigger a freq switch @@ -5114,10 +5118,15 @@ schedtune_margin(unsigned long signal, unsigned long boost) } static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { - unsigned int boost = get_sysctl_sched_cfs_boost(); + unsigned int boost; +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_cpu_boost(cpu); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif if (boost == 0) return 0; @@ -5127,7 +5136,7 @@ schedtune_cpu_margin(unsigned long util) #else /* CONFIG_SCHED_TUNE */ static inline unsigned int -schedtune_cpu_margin(unsigned long util) +schedtune_cpu_margin(unsigned long util, int cpu) { return 0; } @@ -5138,7 +5147,7 @@ static inline unsigned long boosted_cpu_util(int cpu) { unsigned long util = cpu_util(cpu); - unsigned long margin = schedtune_cpu_margin(util); + unsigned long margin = schedtune_cpu_margin(util, cpu); return util + margin; } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index be60b8d97dc4..ccc3540dcaf2 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "sched.h" @@ -158,6 +159,87 @@ schedtune_boostgroup_update(int idx, int boost) return 0; } +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg; + int tasks; + + bg = &per_cpu(cpu_boost_groups, cpu); + + /* Update boosted tasks count while avoiding to make it negative */ + if (task_count < 0 && bg->group[idx].tasks <= -task_count) + bg->group[idx].tasks = 0; + else + bg->group[idx].tasks += task_count; + + /* Boost group activation or deactivation on that RQ */ + tasks = bg->group[idx].tasks; + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, 1); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct schedtune *st; + int idx; + + /* + * When a task is marked PF_EXITING by do_exit() it's going to be + * dequeued and enqueued multiple times in the exit path. + * Thus we avoid any further update, since we do not want to change + * CPU boosting while the task is exiting. + * The last dequeue will be done by cgroup exit() callback. + */ + if (p->flags & PF_EXITING) + return; + + /* Get task boost group */ + rcu_read_lock(); + st = task_schedtune(p); + idx = st->idx; + rcu_read_unlock(); + + schedtune_tasks_update(p, cpu, idx, -1); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 000000000000..561b5171a19b --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,23 @@ + +#ifdef CONFIG_SCHED_TUNE + +#ifdef CONFIG_CGROUP_SCHEDTUNE + +int schedtune_cpu_boost(int cpu); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_CGROUP_SCHEDTUNE */ + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#endif /* CONFIG_SCHED_TUNE */ From aa0c021706f3049cca12f22bdaca4ce56157e056 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:31:53 +0000 Subject: [PATCH 60/71] sched/fair: add boosted task utilization The task utilization signal, which is derived from PELT signals and properly scaled to be architecture and frequency invariant, is used by EAS as an estimation of the task requirements in terms of CPU bandwidth. When the energy aware scheduler is in use, this signal affects the CPU selection. Thus, a convenient way to bias that decision, which is also little intrusive, is to boost the task utilization signal each time it is required to support them. This patch introduces the new function: boosted_task_util(task) which returns a boosted value for the utilization of the specified task. The margin added to the original utilization is: 1. computed based on the "boosting strategy" in use 2. proportional to boost value defined either by the sysctl interface, when global boosting is in use, or the "taskgroup" value, when per-task boosting is enabled. The boosted signal is used by EAS a. transparently, via its integration into the task_fits() function b. explicitly, in the energy-aware wakeup path Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- kernel/sched/tune.c | 14 ++++++++++++++ kernel/sched/tune.h | 1 + 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2dbe1ff0a90b..f582d58daea5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5048,11 +5048,13 @@ static inline unsigned long task_util(struct task_struct *p) unsigned int capacity_margin = 1280; /* ~20% margin */ +static inline unsigned long boosted_task_util(struct task_struct *task); + static inline bool __task_fits(struct task_struct *p, int cpu, int util) { unsigned long capacity = capacity_of(cpu); - util += task_util(p); + util += boosted_task_util(p); return (capacity * 1024) > (util * capacity_margin); } @@ -5133,6 +5135,27 @@ schedtune_cpu_margin(unsigned long util, int cpu) return schedtune_margin(util, boost); } +static inline unsigned long +schedtune_task_margin(struct task_struct *task) +{ + unsigned int boost; + unsigned long util; + unsigned long margin; + +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return 0; + + util = task_util(task); + margin = schedtune_margin(util, boost); + + return margin; +} + #else /* CONFIG_SCHED_TUNE */ static inline unsigned int @@ -5141,6 +5164,12 @@ schedtune_cpu_margin(unsigned long util, int cpu) return 0; } +static inline unsigned int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + #endif /* CONFIG_SCHED_TUNE */ static inline unsigned long @@ -5152,6 +5181,15 @@ boosted_cpu_util(int cpu) return util + margin; } +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util(task); + unsigned long margin = schedtune_task_margin(task); + + return util + margin; +} + /* * find_idlest_group finds and returns the least busy CPU group within the * domain. @@ -5386,7 +5424,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) * so prev_cpu will receive a negative bias due to the double * accounting. However, the blocked utilization may be zero. */ - int new_util = cpu_util(i) + task_util(p); + int new_util = cpu_util(i) + boosted_task_util(p); if (new_util > capacity_orig_of(i)) continue; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index ccc3540dcaf2..3253a8732ba5 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -240,6 +240,20 @@ int schedtune_cpu_boost(int cpu) return bg->boost_max; } +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + static u64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index 561b5171a19b..d756ce7b06e0 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -4,6 +4,7 @@ #ifdef CONFIG_CGROUP_SCHEDTUNE int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); From 81cc39cc812f53517f0bf4881795e2917aace6c5 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Thu, 14 Jan 2016 18:35:13 +0000 Subject: [PATCH 61/71] sched/fair: keep track of energy/capacity variations The current EAS implementation does not allow "to boost" tasks performances, for example by running them at an higher OPP (or a more capable CPU), even if that could require a "reasonable" increase in energy consumption. To defined how much reasonable is an energy increase with respect to a required boost value, it is required to define and compute a trade-off between the expected energy and performance variations. However, the current EAS implementation considers only energy variations while completely disregard the impact on performance for the selection of a certain schedule candidate. This patch extends the eenv energy environment to keep track of both energy and performance deltas which are implied by the activation of a schedule candidate. The performance variation is estimated considering the different capacities of the CPUs in which the task could be scheduled. The idea is that while running on a CPU with higher capacity (e.g. higher operating point) the task could (potentially) complete faster and thus get better performance. Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f582d58daea5..3b80ad01aa8b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,6 +4706,16 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + struct { + int before; + int after; + int diff; + } nrg; + struct { + int before; + int after; + int delta; + } cap; }; /* @@ -4872,6 +4882,22 @@ static int sched_group_energy(struct energy_env *eenv) eenv->sg_cap = sg; cap_idx = find_new_capacity(eenv, sg->sge); + + if (sg->group_weight == 1) { + /* Remove capacity of src CPU (before task move) */ + if (eenv->util_delta == 0 && + cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { + eenv->cap.before = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta -= eenv->cap.before; + } + /* Add capacity of dst CPU (after task move) */ + if (eenv->util_delta != 0 && + cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { + eenv->cap.after = sg->sge->cap_states[cap_idx].cap; + eenv->cap.delta += eenv->cap.after; + } + } + idle_idx = group_idle_state(sg); group_util = group_norm_util(eenv, sg); sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) @@ -4920,6 +4946,8 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .nrg = { 0, 0, 0 }, + .cap = { 0, 0, 0 }, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -4941,13 +4969,21 @@ static int energy_diff(struct energy_env *eenv) return 0; /* Invalid result abort */ energy_before += eenv_before.energy; + /* Keep track of SRC cpu (before) capacity */ + eenv->cap.before = eenv_before.cap.before; + eenv->cap.delta = eenv_before.cap.delta; + if (sched_group_energy(eenv)) return 0; /* Invalid result abort */ energy_after += eenv->energy; } } while (sg = sg->next, sg != sd->groups); - return energy_after-energy_before; + eenv->nrg.before = energy_before; + eenv->nrg.after = energy_after; + eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + + return eenv->nrg.diff; } /* From eac91ca0659786f2ceef9c4a5b63b9dc686dab2d Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Tue, 12 Jan 2016 18:12:13 +0000 Subject: [PATCH 62/71] sched/tune: add support to compute normalized energy The current EAS implementation considers only energy variations, while it disregards completely the impact on performance for the selection of a certain schedule candidate. Moreover, it also makes its decision based on the "absolute" value of expected energy variations. In order to properly define a trade-off strategy between increased energy consumption and performances benefits it is required to compare energy variations with performance variations. Thus, both performance and energy metrics must be expressed in comparable units. While the performance variations are expressed in terms of capacity deltas, which are defined in the range [0..SCHED_LOAD_SCALE], the same scale is not used for energy variations. This patch introduces the function: schedtune_normalize_energy(energy_diff) which returns a normalized value in the same range of capacity variations, i.e. [0..SCHED_LOAD_SCALE]. A proper set of energy normalization constants are required to provide a fast division by a constant during the normalziation of the energy_diff. The value of these constants depends on the specific energy model and topology of a target device. Thus, this patch provides also the required support for the computation at boot time of this set of variables. Signed-off-by: Patrick Bellasi --- kernel/sched/tune.c | 321 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/tune.h | 7 + 2 files changed, 328 insertions(+) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 3253a8732ba5..f4fbbcd28373 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -1,7 +1,9 @@ #include #include +#include #include #include +#include #include #include @@ -9,6 +11,84 @@ unsigned int sysctl_sched_cfs_boost __read_mostly; +/* + * System energy normalization constants + */ +static struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +} schedtune_target_nrg; + +/* Performance Boost region (B) threshold params */ +static int perf_boost_idx; + +/* Performance Constraint region (C) threshold params */ +static int perf_constrain_idx; + +/** + * Performance-Energy (P-E) Space thresholds constants + */ +struct threshold_params { + int nrg_gain; + int cap_gain; +}; + +/* + * System specific P-E space thresholds constants + */ +static struct threshold_params +threshold_gains[] = { + { 0, 4 }, /* >= 0% */ + { 0, 4 }, /* >= 10% */ + { 1, 4 }, /* >= 20% */ + { 2, 4 }, /* >= 30% */ + { 3, 4 }, /* >= 40% */ + { 4, 3 }, /* >= 50% */ + { 4, 2 }, /* >= 60% */ + { 4, 1 }, /* >= 70% */ + { 4, 0 }, /* >= 80% */ + { 4, 0 } /* >= 90% */ +}; + +static int +__schedtune_accept_deltas(int nrg_delta, int cap_delta, + int perf_boost_idx, int perf_constrain_idx) +{ + int payoff = -INT_MAX; + + /* Performance Boost (B) region */ + if (nrg_delta > 0 && cap_delta > 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta < cap_gain / nrg_gain + * which is: + * nrg_delta * cap_gain > cap_delta * nrg_gain + */ + payoff = nrg_delta * threshold_gains[perf_boost_idx].cap_gain; + payoff -= cap_delta * threshold_gains[perf_boost_idx].nrg_gain; + return payoff; + } + + /* Performance Constraint (C) region */ + if (nrg_delta < 0 && cap_delta < 0) { + /* + * Evaluate "Performance Boost" vs "Energy Increase" + * payoff criteria: + * cap_delta / nrg_delta > cap_gain / nrg_gain + * which is: + * cap_delta * nrg_gain > nrg_delta * cap_gain + */ + payoff = cap_delta * threshold_gains[perf_constrain_idx].nrg_gain; + payoff -= nrg_delta * threshold_gains[perf_constrain_idx].cap_gain; + return payoff; + } + + /* Default: reject schedule candidate */ + return payoff; +} + #ifdef CONFIG_CGROUP_SCHEDTUNE /* @@ -26,6 +106,11 @@ struct schedtune { /* Boost value for tasks on that SchedTune CGroup */ int boost; + /* Performance Boost (B) region threshold params */ + int perf_boost_idx; + + /* Performance Constraint (C) region threshold params */ + int perf_constrain_idx; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -55,8 +140,37 @@ static inline struct schedtune *parent_st(struct schedtune *st) static struct schedtune root_schedtune = { .boost = 0, + .perf_boost_idx = 0, + .perf_constrain_idx = 0, }; +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + struct schedtune *ct; + int perf_boost_idx; + int perf_constrain_idx; + + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + /* Get task specific perf Boost/Constraints indexes */ + rcu_read_lock(); + ct = task_schedtune(task); + perf_boost_idx = ct->perf_boost_idx; + perf_constrain_idx = ct->perf_constrain_idx; + rcu_read_unlock(); + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + /* * Maximum number of boost groups to support * When per-task boosting is used we still allow only limited number of @@ -396,6 +510,24 @@ struct cgroup_subsys schedtune_cgrp_subsys = { .early_init = 1, }; +#else /* CONFIG_CGROUP_SCHEDTUNE */ + +int +schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task) +{ + /* Optimal (O) region */ + if (nrg_delta < 0 && cap_delta > 0) + return INT_MAX; + + /* Suboptimal (S) region */ + if (nrg_delta > 0 && cap_delta < 0) + return -INT_MAX; + + return __schedtune_accept_deltas(nrg_delta, cap_delta, + perf_boost_idx, perf_constrain_idx); +} + #endif /* CONFIG_CGROUP_SCHEDTUNE */ int @@ -408,6 +540,195 @@ sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* Performance Boost (B) region threshold params */ + perf_boost_idx = sysctl_sched_cfs_boost; + perf_boost_idx /= 10; + + /* Performance Constraint (C) region threshold params */ + perf_constrain_idx = 100 - sysctl_sched_cfs_boost; + perf_constrain_idx /= 10; + return 0; } +/* + * System energy normalization + * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE], + * corresponding to the specified energy variation. + */ +int +schedtune_normalize_energy(int energy_diff) +{ + u32 normalized_nrg; + int max_delta; + +#ifdef CONFIG_SCHED_DEBUG + /* Check for boundaries */ + max_delta = schedtune_target_nrg.max_power; + max_delta -= schedtune_target_nrg.min_power; + WARN_ON(abs(energy_diff) >= max_delta); +#endif + + /* Do scaling using positive numbers to increase the range */ + normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff; + + /* Scale by energy magnitude */ + normalized_nrg <<= SCHED_LOAD_SHIFT; + + /* Normalize on max energy for target platform */ + normalized_nrg = reciprocal_divide( + normalized_nrg, schedtune_target_nrg.rdiv); + + return (energy_diff < 0) ? -normalized_nrg : normalized_nrg; +} + +#ifdef CONFIG_SCHED_DEBUG +static void +schedtune_test_nrg(unsigned long delta_pwr) +{ + unsigned long test_delta_pwr; + unsigned long test_norm_pwr; + int idx; + + /* + * Check normalization constants using some constant system + * energy values + */ + pr_info("schedtune: verify normalization constants...\n"); + for (idx = 0; idx < 6; ++idx) { + test_delta_pwr = delta_pwr >> idx; + + /* Normalize on max energy for target platform */ + test_norm_pwr = reciprocal_divide( + test_delta_pwr << SCHED_LOAD_SHIFT, + schedtune_target_nrg.rdiv); + + pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n", + idx, test_delta_pwr, test_norm_pwr); + } +} +#else +#define schedtune_test_nrg(delta_pwr) +#endif + +/* + * Compute the min/max power consumption of a cluster and all its CPUs + */ +static void +schedtune_add_cluster_nrg( + struct sched_domain *sd, + struct sched_group *sg, + struct target_nrg *ste) +{ + struct sched_domain *sd2; + struct sched_group *sg2; + + struct cpumask *cluster_cpus; + char str[32]; + + unsigned long min_pwr; + unsigned long max_pwr; + int cpu; + + /* Get Cluster energy using EM data for the first CPU */ + cluster_cpus = sched_group_cpus(sg); + snprintf(str, 32, "CLUSTER[%*pbl]", + cpumask_pr_args(cluster_cpus)); + + min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power; + max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power; + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Keep track of this cluster's energy in the computation of the + * overall system energy + */ + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + /* Get CPU energy using EM data for each CPU in the group */ + for_each_cpu(cpu, cluster_cpus) { + /* Get a SD view for the specific CPU */ + for_each_domain(cpu, sd2) { + /* Get the CPU group */ + sg2 = sd2->groups; + min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power; + max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power; + + ste->min_power += min_pwr; + ste->max_power += max_pwr; + + snprintf(str, 32, "CPU[%d]", cpu); + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + str, min_pwr, max_pwr); + + /* + * Assume we have EM data only at the CPU and + * the upper CLUSTER level + */ + BUG_ON(!cpumask_equal( + sched_group_cpus(sg), + sched_group_cpus(sd2->parent->groups) + )); + break; + } + } +} + +/* + * Initialize the constants required to compute normalized energy. + * The values of these constants depends on the EM data for the specific + * target system and topology. + * Thus, this function is expected to be called by the code + * that bind the EM to the topology information. + */ +static int +schedtune_init_late(void) +{ + struct target_nrg *ste = &schedtune_target_nrg; + unsigned long delta_pwr = 0; + struct sched_domain *sd; + struct sched_group *sg; + + pr_info("schedtune: init normalization constants...\n"); + ste->max_power = 0; + ste->min_power = 0; + + rcu_read_lock(); + + /* + * When EAS is in use, we always have a pointer to the highest SD + * which provides EM data. + */ + sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask))); + if (!sd) { + pr_info("schedtune: no energy model data\n"); + goto nodata; + } + + sg = sd->groups; + do { + schedtune_add_cluster_nrg(sd, sg, ste); + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + + pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n", + "SYSTEM", ste->min_power, ste->max_power); + + /* Compute normalization constants */ + delta_pwr = ste->max_power - ste->min_power; + ste->rdiv = reciprocal_value(delta_pwr); + pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n", + ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2); + + schedtune_test_nrg(delta_pwr); + return 0; + +nodata: + rcu_read_unlock(); + return -EINVAL; +} +late_initcall(schedtune_init_late); + diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index d756ce7b06e0..f7273a5d994a 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -16,9 +16,16 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #endif /* CONFIG_CGROUP_SCHEDTUNE */ +int schedtune_normalize_energy(int energy); +int schedtune_accept_deltas(int nrg_delta, int cap_delta, + struct task_struct *task); + #else /* CONFIG_SCHED_TUNE */ #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) +#define schedtune_normalize_energy(energy) energy +#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta + #endif /* CONFIG_SCHED_TUNE */ From 8c825e1cbf7bdf109e9d860e8f751db395f1d8fe Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 15 Jan 2016 15:48:03 +0000 Subject: [PATCH 63/71] sched/fair: filter energy_diff() based on energy_payoff value Once the SchedTune support is enabled and the CPU bandwidth demand of a task is boosted, we could expect increased energy consumptions which are balanced by corresponding increases of tasks performance. However, the current implementation of the energy_diff() function accepts all and _only_ the schedule candidates which results into a reduced expected system energy, which works against the boosting strategy. This patch links the energy_diff() function with the "energy payoff" engine provided by SchedTune. The energy variation computed by the energy_diff() function is now filtered using the SchedTune support to evaluated the energy payoff for a boosted task. With that patch, the energy_diff() function is going to reported as "acceptable schedule candidate" only the schedule candidate which corresponds to a positive energy_payoff. Signed-off-by: Patrick Bellasi --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3b80ad01aa8b..1191c4b1b119 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4706,9 +4706,12 @@ struct energy_env { int src_cpu; int dst_cpu; int energy; + int payoff; + struct task_struct *task; struct { int before; int after; + int delta; int diff; } nrg; struct { @@ -4929,6 +4932,44 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +#ifdef CONFIG_SCHED_TUNE +static int energy_diff_evaluate(struct energy_env *eenv) +{ + unsigned int boost; + int nrg_delta; + + /* Return energy diff when boost margin is 0 */ +#ifdef CONFIG_CGROUP_SCHEDTUNE + boost = schedtune_task_boost(eenv->task); +#else + boost = get_sysctl_sched_cfs_boost(); +#endif + if (boost == 0) + return eenv->nrg.diff; + + /* Compute normalized energy diff */ + nrg_delta = schedtune_normalize_energy(eenv->nrg.diff); + eenv->nrg.delta = nrg_delta; + + eenv->payoff = schedtune_accept_deltas( + eenv->nrg.delta, + eenv->cap.delta, + eenv->task); + + /* + * When SchedTune is enabled, the energy_diff() function will return + * the computed energy payoff value. Since the energy_diff() return + * value is expected to be negative by its callers, this evaluation + * function return a negative value each time the evaluation return a + * positive payoff, which is the condition for the acceptance of + * a scheduling decision + */ + return -eenv->payoff; +} +#else /* CONFIG_SCHED_TUNE */ +#define energy_diff_evaluate(eenv) eenv->nrg.diff +#endif + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -4946,7 +4987,7 @@ static int energy_diff(struct energy_env *eenv) .util_delta = 0, .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, - .nrg = { 0, 0, 0 }, + .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, }; @@ -4982,8 +5023,9 @@ static int energy_diff(struct energy_env *eenv) eenv->nrg.before = energy_before; eenv->nrg.after = energy_after; eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before; + eenv->payoff = 0; - return eenv->nrg.diff; + return energy_diff_evaluate(eenv); } /* @@ -5481,6 +5523,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target) .util_delta = task_util(p), .src_cpu = task_cpu(p), .dst_cpu = target_cpu, + .task = p, }; /* Not enough spare capacity on previous cpu */ From 5c6c1646ff57407e5dfc386b47aedcfe22534ebd Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:06:24 +0000 Subject: [PATCH 64/71] DEBUG: sched: add tracepoint for cpu/freq scale invariance Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 24 ++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9b90c57517a9..20a7216f8abb 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -562,6 +562,30 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +TRACE_EVENT(sched_contrib_scale_f, + + TP_PROTO(int cpu, unsigned long freq_scale_factor, + unsigned long cpu_scale_factor), + + TP_ARGS(cpu, freq_scale_factor, cpu_scale_factor), + + TP_STRUCT__entry( + __field(int, cpu) + __field(unsigned long, freq_scale_factor) + __field(unsigned long, cpu_scale_factor) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->freq_scale_factor = freq_scale_factor; + __entry->cpu_scale_factor = cpu_scale_factor; + ), + + TP_printk("cpu=%d freq_scale_factor=%lu cpu_scale_factor=%lu", + __entry->cpu, __entry->freq_scale_factor, + __entry->cpu_scale_factor) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1191c4b1b119..ec2e8aecc4f3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2587,6 +2587,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, scale_freq = arch_scale_freq_capacity(NULL, cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu); /* delta_w is the amount already accumulated against our next period */ delta_w = sa->period_contrib; From 1b79543f25c0047013e8e6b64df9f01e248de8ea Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:27 +0000 Subject: [PATCH 65/71] DEBUG: sched: add tracepoint for task load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 43 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 3 +++ 2 files changed, 46 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 20a7216f8abb..99f3e648c197 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -586,6 +586,49 @@ TRACE_EVENT(sched_contrib_scale_f, __entry->cpu, __entry->freq_scale_factor, __entry->cpu_scale_factor) ); + +/* + * Tracepoint for accounting sched averages for tasks. + */ +TRACE_EVENT(sched_load_avg_task, + + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + __field( u64, load_sum ) + __field( u32, util_sum ) + __field( u32, period_contrib ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->util_avg = avg->util_avg; + __entry->load_sum = avg->load_sum; + __entry->util_sum = avg->util_sum; + __entry->period_contrib = avg->period_contrib; + ), + + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu util_avg=%lu load_sum=%llu" + " util_sum=%u period_contrib=%u", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->load_avg, + __entry->util_avg, + (u64)__entry->load_sum, + (u32)__entry->util_sum, + (u32)__entry->period_contrib) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ec2e8aecc4f3..8cdb4e5592f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2731,6 +2731,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + trace_sched_load_avg_task(task_of(se), &se->avg); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From d39b657251494af6718b38aa0e5343b499cd6459 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 9 Nov 2015 12:07:48 +0000 Subject: [PATCH 66/71] DEBUG: sched: add tracepoint for CPU load/util signals Signed-off-by: Juri Lelli --- include/trace/events/sched.h | 25 +++++++++++++++++++++++++ kernel/sched/fair.c | 1 + 2 files changed, 26 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 99f3e648c197..f58760c2f712 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -629,6 +629,31 @@ TRACE_EVENT(sched_load_avg_task, (u32)__entry->util_sum, (u32)__entry->period_contrib) ); + +/* + * Tracepoint for accounting sched averages for cpus. + */ +TRACE_EVENT(sched_load_avg_cpu, + + TP_PROTO(int cpu, struct cfs_rq *cfs_rq), + + TP_ARGS(cpu, cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( unsigned long, util_avg ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->load_avg = cfs_rq->avg.load_avg; + __entry->util_avg = cfs_rq->avg.util_avg; + ), + + TP_printk("cpu=%d load_avg=%lu util_avg=%lu", + __entry->cpu, __entry->load_avg, __entry->util_avg) +); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8cdb4e5592f9..b5ea9ec335d8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2734,6 +2734,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (entity_is_task(se)) trace_sched_load_avg_task(task_of(se), &se->avg); + trace_sched_load_avg_cpu(cpu, cfs_rq); } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) From d4ba5dad7e722c9b89f8a13efd746b291b85f1c0 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 30 Apr 2015 17:35:23 +0100 Subject: [PATCH 67/71] DEBUG: sched,cpufreq: add cpu_capacity change tracepoint This is useful when we want to compare cpu utilization and cpu curr capacity side by side. Signed-off-by: Juri Lelli --- drivers/cpufreq/cpufreq.c | 4 ++++ include/linux/sched.h | 2 ++ include/trace/events/power.h | 7 +++++++ kernel/sched/fair.c | 11 +++++++++++ kernel/sched/sched.h | 11 ----------- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5ca8e9517aae..0f30d11fb528 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include static LIST_HEAD(cpufreq_policy_list); @@ -473,6 +474,7 @@ static void cpufreq_notify_post_transition(struct cpufreq_policy *policy, void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs) { + int cpu; /* * Catch double invocations of _begin() which lead to self-deadlock. @@ -501,6 +503,8 @@ void cpufreq_freq_transition_begin(struct cpufreq_policy *policy, spin_unlock(&policy->transition_lock); scale_freq_capacity(policy, freqs); + for_each_cpu(cpu, policy->cpus) + trace_cpu_capacity(capacity_curr_of(cpu), cpu); cpufreq_notify_transition(policy, freqs, CPUFREQ_PRECHANGE); } diff --git a/include/linux/sched.h b/include/linux/sched.h index 38f082659ba6..1456269f8d40 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1047,6 +1047,8 @@ struct sched_group_energy { struct capacity_state *cap_states; /* ptr to capacity state array */ }; +unsigned long capacity_curr_of(int cpu); + struct sched_group; struct sched_domain { diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 284244ebfe8d..f4be04e44252 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -120,6 +120,13 @@ DEFINE_EVENT(cpu, cpu_frequency, TP_ARGS(frequency, cpu_id) ); +DEFINE_EVENT(cpu, cpu_capacity, + + TP_PROTO(unsigned int capacity, unsigned int cpu_id), + + TP_ARGS(capacity, cpu_id) +); + TRACE_EVENT(device_pm_callback_start, TP_PROTO(struct device *dev, const char *pm_ops, int event), diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5ea9ec335d8..9b74b86aba3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4698,6 +4698,17 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) #endif +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig * + arch_scale_freq_capacity(NULL, cpu) + >> SCHED_CAPACITY_SHIFT; +} + static inline bool energy_aware(void) { return sched_feat(ENERGY_AWARE); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 983c132af11b..db3f3db9849c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1512,17 +1512,6 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } -/* - * Returns the current capacity of cpu after applying both - * cpu and freq scaling. - */ -static inline unsigned long capacity_curr_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig * - arch_scale_freq_capacity(NULL, cpu) - >> SCHED_CAPACITY_SHIFT; -} - #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From d5d6e149f5ca9210318b9b9b7d5bba92a838b8e0 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 14 Nov 2014 16:25:50 +0000 Subject: [PATCH 68/71] DEBUG: sched: add energy procfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the energy data available via procfs. The related files are placed as sub-directory named 'energy' inside the /proc/sys/kernel/sched_domain/cpuX/domainY/groupZ directory for those cpu/domain/group tuples which have energy information. The following example depicts the contents of /proc/sys/kernel/sched_domain/cpu0/domain0/group[01] for a system which has energy information attached to domain level 0. ├── cpu0 │ ├── domain0 │ │ ├── busy_factor │ │ ├── busy_idx │ │ ├── cache_nice_tries │ │ ├── flags │ │ ├── forkexec_idx │ │ ├── group0 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── group1 │ │ │ └── energy │ │ │ ├── cap_states │ │ │ ├── idle_states │ │ │ ├── nr_cap_states │ │ │ └── nr_idle_states │ │ ├── idle_idx │ │ ├── imbalance_pct │ │ ├── max_interval │ │ ├── max_newidle_lb_cost │ │ ├── min_interval │ │ ├── name │ │ ├── newidle_idx │ │ └── wake_idx │ └── domain1 │ ├── busy_factor │ ├── busy_idx │ ├── cache_nice_tries │ ├── flags │ ├── forkexec_idx │ ├── idle_idx │ ├── imbalance_pct │ ├── max_interval │ ├── max_newidle_lb_cost │ ├── min_interval │ ├── name │ ├── newidle_idx │ └── wake_idx The files 'nr_idle_states' and 'nr_cap_states' contain a scalar value whereas 'idle_states' and 'cap_states' contain a vector of power consumption at this idle state respectively (compute capacity, power consumption) at this capacity state. Signed-off-by: Dietmar Eggemann --- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b37b3b5ae63e..9b560a1f3d65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5412,10 +5412,61 @@ set_table_entry(struct ctl_table *entry, } } +static struct ctl_table * +sd_alloc_ctl_energy_table(struct sched_group_energy *sge) +{ + struct ctl_table *table = sd_alloc_ctl_entry(5); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power, + sge->nr_idle_states*sizeof(struct idle_state), 0644, + proc_doulongvec_minmax, false); + set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap, + sge->nr_cap_states*sizeof(struct capacity_state), 0644, + proc_doulongvec_minmax, false); + + return table; +} + +static struct ctl_table * +sd_alloc_ctl_group_table(struct sched_group *sg) +{ + struct ctl_table *table = sd_alloc_ctl_entry(2); + + if (table == NULL) + return NULL; + + table->procname = kstrdup("energy", GFP_KERNEL); + table->mode = 0555; + table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge); + + return table; +} + static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(14); + struct ctl_table *table; + unsigned int nr_entries = 14; + + int i = 0; + struct sched_group *sg = sd->groups; + + if (sg->sge) { + int nr_sgs = 0; + + do {} while (nr_sgs++, sg = sg->next, sg != sd->groups); + + nr_entries += nr_sgs; + } + + table = sd_alloc_ctl_entry(nr_entries); if (table == NULL) return NULL; @@ -5448,7 +5499,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(long), 0644, proc_doulongvec_minmax, false); set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ + sg = sd->groups; + if (sg->sge) { + char buf[32]; + struct ctl_table *entry = &table[13]; + + do { + snprintf(buf, 32, "group%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_group_table(sg); + } while (entry++, i++, sg = sg->next, sg != sd->groups); + } + /* &table[nr_entries-1] is terminator */ return table; } From e7de7e0c0423335ca4877282402c52676521bcd0 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:49:07 +0100 Subject: [PATCH 69/71] DEBUG: schedtune: add tracepoint for SchedTune configuration update Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 21 +++++++++++++++++++++ kernel/sched/tune.c | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f58760c2f712..8ae43a2ebfda 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -654,6 +654,27 @@ TRACE_EVENT(sched_load_avg_cpu, TP_printk("cpu=%d load_avg=%lu util_avg=%lu", __entry->cpu, __entry->load_avg, __entry->util_avg) ); + +/* + * Tracepoint for sched_tune_config settings + */ +TRACE_EVENT(sched_tune_config, + + TP_PROTO(int boost), + + TP_ARGS(boost), + + TP_STRUCT__entry( + __field( int, boost ) + ), + + TP_fast_assign( + __entry->boost = boost; + ), + + TP_printk("boost=%d ", __entry->boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index f4fbbcd28373..3920baad104f 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -7,6 +7,8 @@ #include #include +#include + #include "sched.h" unsigned int sysctl_sched_cfs_boost __read_mostly; @@ -392,6 +394,8 @@ boost_write(struct cgroup_subsys_state *css, struct cftype *cft, /* Update CPU boost */ schedtune_boostgroup_update(st->idx, st->boost); + trace_sched_tune_config(st->boost); + return 0; } From f9f68de79c8ed0b669dd683d41b72885fc4aef06 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Mon, 22 Jun 2015 13:51:07 +0100 Subject: [PATCH 70/71] DEBUG: schedtune: add tracepoint for CPU boost signal Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 29 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8ae43a2ebfda..eb812071458b 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -675,6 +675,33 @@ TRACE_EVENT(sched_tune_config, TP_printk("boost=%d ", __entry->boost) ); +/* + * Tracepoint for accounting CPU boosted utilization + */ +TRACE_EVENT(sched_boost_cpu, + + TP_PROTO(int cpu, unsigned long util, unsigned long margin), + + TP_ARGS(cpu, util, margin), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + __field( unsigned long, margin ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("cpu=%d util=%lu margin=%lu", + __entry->cpu, + __entry->util, + __entry->margin) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b74b86aba3a..3bc56e0ea1ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5272,6 +5272,8 @@ boosted_cpu_util(int cpu) unsigned long util = cpu_util(cpu); unsigned long margin = schedtune_cpu_margin(util, cpu); + trace_sched_boost_cpu(cpu, util, margin); + return util + margin; } From 47f5ae6dc1512c991100a9691d120fbd122069c4 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 24 Jun 2015 15:36:08 +0100 Subject: [PATCH 71/71] DEBUG: schedtune: add tracepoint for schedtune_tasks_update() values Signed-off-by: Patrick Bellasi --- include/trace/events/sched.h | 62 ++++++++++++++++++++++++++++++++++++ kernel/sched/tune.c | 12 ++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index eb812071458b..7ec9dcfc701a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -702,6 +702,68 @@ TRACE_EVENT(sched_boost_cpu, __entry->margin) ); +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_tasks_update, + + TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, + unsigned int boost, unsigned int max_boost), + + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, tasks ) + __field( int, idx ) + __field( unsigned int, boost ) + __field( unsigned int, max_boost ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->tasks = tasks; + __entry->idx = idx; + __entry->boost = boost; + __entry->max_boost = max_boost; + ), + + TP_printk("pid=%d comm=%s " + "cpu=%d tasks=%d idx=%d boost=%u max_boost=%u", + __entry->pid, __entry->comm, + __entry->cpu, __entry->tasks, __entry->idx, + __entry->boost, __entry->max_boost) +); + +/* + * Tracepoint for schedtune_boostgroup_update + */ +TRACE_EVENT(sched_tune_boostgroup_update, + + TP_PROTO(int cpu, int variation, int max_boost), + + TP_ARGS(cpu, variation, max_boost), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( int, variation ) + __field( int, max_boost ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->variation = variation; + __entry->max_boost = max_boost; + ), + + TP_printk("cpu=%d variation=%d max_boost=%d", + __entry->cpu, __entry->variation, __entry->max_boost) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 3920baad104f..9d8eeb438140 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -264,12 +264,18 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ - if (cur_boost_max == old_boost && old_boost > boost) + if (cur_boost_max == old_boost && old_boost > boost) { schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); + continue; + } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); } return 0; @@ -293,6 +299,10 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) tasks = bg->group[idx].tasks; if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + } /*