From f2d1d0fea1d5e0b3395babab9c2ce185898f432b Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:31 +0100 Subject: [PATCH 01/75] UPSTREAM: sched/topology: Add SD_ASYM_CPUCAPACITY flag detection The SD_ASYM_CPUCAPACITY sched_domain flag is supposed to mark the sched_domain in the hierarchy where all CPU capacities are visible for any CPU's point of view on asymmetric CPU capacity systems. The scheduler can then take to take capacity asymmetry into account when balancing at this level. It also serves as an indicator for how wide task placement heuristics have to search to consider all available CPU capacities as asymmetric systems might often appear symmetric at smallest level(s) of the sched_domain hierarchy. The flag has been around for while but so far only been set by out-of-tree code in Android kernels. One solution is to let each architecture provide the flag through a custom sched_domain topology array and associated mask and flag functions. However, SD_ASYM_CPUCAPACITY is special in the sense that it depends on the capacity and presence of all CPUs in the system, i.e. when hotplugging all CPUs out except those with one particular CPU capacity the flag should disappear even if the sched_domains don't collapse. Similarly, the flag is affected by cpusets where load-balancing is turned off. Detecting when the flags should be set therefore depends not only on topology information but also the cpuset configuration and hotplug state. The arch code doesn't have easy access to the cpuset configuration. Instead, this patch implements the flag detection in generic code where cpusets and hotplug state is already taken care of. All the arch is responsible for is to implement arch_scale_cpu_capacity() and force a full rebuild of the sched_domain hierarchy if capacities are updated, e.g. later in the boot process when cpufreq has initialized. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-2-git-send-email-morten.rasmussen@arm.com [ Fixed 'CPU' capitalization. ] Signed-off-by: Ingo Molnar (cherry picked from commit 05484e0984487d42e97c417cbb0697fa9d16e7e9) Signed-off-by: Quentin Perret Change-Id: I1d5f695a95f8d023f1ecf14ecb71a558ceb67ed6 --- include/linux/sched/topology.h | 6 +-- kernel/sched/topology.c | 81 +++++++++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 26347741ba50..6b9976180c1e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -23,10 +23,10 @@ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ -#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 505a41c42b96..5c4d583d53ee 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1061,7 +1061,6 @@ static struct cpumask ***sched_domains_numa_masks; * SD_SHARE_PKG_RESOURCES - describes shared caches * SD_NUMA - describes NUMA topologies * SD_SHARE_POWERDOMAIN - describes shared power domain - * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies * * Odd one out, which beside describing the topology has a quirk also * prescribes the desired behaviour that goes along with it: @@ -1073,13 +1072,12 @@ static struct cpumask ***sched_domains_numa_masks; SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING | \ - SD_ASYM_CPUCAPACITY | \ SD_SHARE_POWERDOMAIN) static struct sched_domain * sd_init(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { struct sd_data *sdd = &tl->data; struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); @@ -1100,6 +1098,9 @@ sd_init(struct sched_domain_topology_level *tl, "wrong sd_flags in topology description\n")) sd_flags &= ~TOPOLOGY_SD_FLAGS; + /* Apply detected topology flags */ + sd_flags |= dflags; + *sd = (struct sched_domain){ .min_interval = sd_weight, .max_interval = 2*sd_weight, @@ -1604,9 +1605,9 @@ static void __sdt_free(const struct cpumask *cpu_map) static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, - struct sched_domain *child, int cpu) + struct sched_domain *child, int dflags, int cpu) { - struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); + struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu); if (child) { sd->level = child->level + 1; @@ -1632,6 +1633,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve return sd; } +/* + * Find the sched_domain_topology_level where all CPU capacities are visible + * for all CPUs. + */ +static struct sched_domain_topology_level +*asym_cpu_capacity_level(const struct cpumask *cpu_map) +{ + int i, j, asym_level = 0; + bool asym = false; + struct sched_domain_topology_level *tl, *asym_tl = NULL; + unsigned long cap; + + /* Is there any asymmetry? */ + cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); + + for_each_cpu(i, cpu_map) { + if (arch_scale_cpu_capacity(NULL, i) != cap) { + asym = true; + break; + } + } + + if (!asym) + return NULL; + + /* + * Examine topology from all CPU's point of views to detect the lowest + * sched_domain_topology_level where a highest capacity CPU is visible + * to everyone. + */ + for_each_cpu(i, cpu_map) { + unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); + int tl_id = 0; + + for_each_sd_topology(tl) { + if (tl_id < asym_level) + goto next_level; + + for_each_cpu_and(j, tl->mask(i), cpu_map) { + unsigned long capacity; + + capacity = arch_scale_cpu_capacity(NULL, j); + + if (capacity <= max_capacity) + continue; + + max_capacity = capacity; + asym_level = tl_id; + asym_tl = tl; + } +next_level: + tl_id++; + } + } + + return asym_tl; +} + + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -1644,18 +1704,27 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct s_data d; struct rq *rq = NULL; int i, ret = -ENOMEM; + struct sched_domain_topology_level *tl_asym; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) goto error; + tl_asym = asym_cpu_capacity_level(cpu_map); + /* Set up domains for CPUs specified by the cpu_map: */ for_each_cpu(i, cpu_map) { struct sched_domain_topology_level *tl; sd = NULL; for_each_sd_topology(tl) { - sd = build_sched_domain(tl, cpu_map, attr, sd, i); + int dflags = 0; + + if (tl == tl_asym) + dflags |= SD_ASYM_CPUCAPACITY; + + sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); + if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; if (tl->flags & SDTL_OVERLAP) From 9e986d2718fde5c0ca0cd201b3cab0c94b860ac2 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:32 +0100 Subject: [PATCH 02/75] UPSTREAM: sched/topology, drivers/base/arch_topology: Rebuild the sched_domain hierarchy when capacities change The setting of SD_ASYM_CPUCAPACITY depends on the per-CPU capacities. These might not have their final values when the hierarchy is initially built as the values depend on cpufreq to be initialized or the values being set through sysfs. To ensure that the flags are set correctly we need to rebuild the sched_domain hierarchy whenever the reported per-CPU capacity (arch_scale_cpu_capacity()) changes. This patch ensure that a full sched_domain rebuild happens when CPU capacity changes occur. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit bb1fbdd3c3fd12b612c7d8cdf13bd6bfeebdefa3) Signed-off-by: Quentin Perret Change-Id: I30c38e43371827767f22ae6706f2fcecfbfca103 --- drivers/base/arch_topology.c | 26 ++++++++++++++++++++++++++ include/linux/arch_topology.h | 1 + 2 files changed, 27 insertions(+) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index e7cb0c6ade81..edfcf8d982e4 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -15,6 +15,7 @@ #include #include #include +#include DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; @@ -47,6 +48,9 @@ static ssize_t cpu_capacity_show(struct device *dev, return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); } +static void update_topology_flags_workfn(struct work_struct *work); +static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn); + static ssize_t cpu_capacity_store(struct device *dev, struct device_attribute *attr, const char *buf, @@ -72,6 +76,8 @@ static ssize_t cpu_capacity_store(struct device *dev, topology_set_cpu_scale(i, new_capacity); mutex_unlock(&cpu_scale_mutex); + schedule_work(&update_topology_flags_work); + return count; } @@ -96,6 +102,25 @@ static int register_cpu_capacity_sysctl(void) } subsys_initcall(register_cpu_capacity_sysctl); +static int update_topology; + +int topology_update_cpu_topology(void) +{ + return update_topology; +} + +/* + * Updating the sched_domains can't be done directly from cpufreq callbacks + * due to locking, so queue the work for later. + */ +static void update_topology_flags_workfn(struct work_struct *work) +{ + update_topology = 1; + rebuild_sched_domains(); + pr_debug("sched_domain hierarchy rebuilt, flags updated\n"); + update_topology = 0; +} + static u32 capacity_scale; static u32 *raw_capacity; @@ -201,6 +226,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); + schedule_work(&update_topology_flags_work); free_raw_capacity(); pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 2b709416de05..d9bdc1a7f4e7 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -9,6 +9,7 @@ #include void topology_normalize_cpu_scale(void); +int topology_update_cpu_topology(void); struct device_node; bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu); From d647aa9f4155a111d5bf915f1628e2e5fe1e11cd Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:33 +0100 Subject: [PATCH 03/75] UPSTREAM: sched/topology, arch/arm64: Rebuild the sched_domain hierarchy when the CPU capacity changes Asymmetric CPU capacity can not necessarily be determined accurately at the time the initial sched_domain hierarchy is built during boot. It is therefore necessary to be able to force a full rebuild of the hierarchy later triggered by the arch_topology driver. A full rebuild requires the arch-code to implement arch_update_cpu_topology() which isn't yet implemented for arm64. This patch points the arm64 implementation to arch_topology driver to ensure that full hierarchy rebuild happens when needed. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3ba09df4b8b6e3f01ed6381e8fb890840fd0bca3) Signed-off-by: Quentin Perret Change-Id: Iaeb47eca5aace999c93b5e3928824d8327ab2152 --- arch/arm64/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 49a0fee4f89b..0524f2438649 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -45,6 +45,9 @@ int pcibus_to_node(struct pci_bus *bus); /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + #include #endif /* _ASM_ARM_TOPOLOGY_H */ From 8ea7fca65f4e85850d08ec3b40b300664303fbb0 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 20 Jul 2018 14:32:34 +0100 Subject: [PATCH 04/75] UPSTREAM: sched/topology, arch/arm: Rebuild sched_domain hierarchy when CPU capacity changes Asymmetric CPU capacity can not necessarily be determined accurately at the time the initial sched_domain hierarchy is built during boot. It is therefore necessary to be able to force a full rebuild of the hierarchy later triggered by the arch_topology driver. A full rebuild requires the arch-code to implement arch_update_cpu_topology() which isn't yet implemented for arm. This patch points the arm implementation to arch_topology driver to ensure that full hierarchy rebuild happens when needed. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1532093554-30504-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e1799a80a4f5a463f252b7325da8bb66dfd55471) Signed-off-by: Quentin Perret Change-Id: Ic5043190dfebf10547a9625530a8e1158002b243 --- arch/arm/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 5d88d2f22b2c..2a786f54d8b8 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -33,6 +33,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu); /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + #else static inline void init_cpu_topology(void) { } From b1c506a01410ec07845aa7a7449ae4859b2fe8e7 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:39 +0100 Subject: [PATCH 05/75] UPSTREAM: sched/topology: Add static_key for asymmetric CPU capacity optimizations The existing asymmetric CPU capacity code should cause minimal overhead for others. Putting it behind a static_key, it has been done for SMT optimizations, would make it easier to extend and improve without causing harm to others moving forward. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-2-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit df054e8445a4011e3d693c2268129c0456108663) Signed-off-by: Quentin Perret Change-Id: I6f8de25a9bb593a0032ad0ad7977c269b180ec51 --- kernel/sched/fair.c | 3 +++ kernel/sched/sched.h | 1 + kernel/sched/topology.c | 9 ++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 908c9cdae2f0..392096b4c12a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6280,6 +6280,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) { long min_cap, max_cap; + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return 0; + min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9683f458aec7..34f79bbd07a2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1188,6 +1188,7 @@ DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); DECLARE_PER_CPU(struct sched_domain *, sd_asym); +extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { atomic_t ref; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5c4d583d53ee..b0cdf5e95bda 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) { @@ -1705,6 +1706,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; + bool has_asym = false; alloc_state = __visit_domain_allocation_hell(&d, cpu_map); if (alloc_state != sa_rootdomain) @@ -1720,8 +1722,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_sd_topology(tl) { int dflags = 0; - if (tl == tl_asym) + if (tl == tl_asym) { dflags |= SD_ASYM_CPUCAPACITY; + has_asym = true; + } sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); @@ -1773,6 +1777,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } rcu_read_unlock(); + if (has_asym) + static_branch_enable_cpuslocked(&sched_asym_cpucapacity); + if (rq && sched_debug_enabled) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); From bca5744aee54211539b1cb6990306479872a2b59 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:40 +0100 Subject: [PATCH 06/75] UPSTREAM: sched/fair: Add 'group_misfit_task' load-balance type To maximize throughput in systems with asymmetric CPU capacities (e.g. ARM big.LITTLE) load-balancing has to consider task and CPU utilization as well as per-CPU compute capacity when load-balancing in addition to the current average load based load-balancing policy. Tasks with high utilization that are scheduled on a lower capacity CPU need to be identified and migrated to a higher capacity CPU if possible to maximize throughput. To implement this additional policy an additional group_type (load-balance scenario) is added: 'group_misfit_task'. This represents scenarios where a sched_group has one or more tasks that are not suitable for its per-CPU capacity. 'group_misfit_task' is only considered if the system is not overloaded or imbalanced ('group_imbalanced' or 'group_overloaded'). Identifying misfit tasks requires the rq lock to be held. To avoid taking remote rq locks to examine source sched_groups for misfit tasks, each CPU is responsible for tracking misfit tasks themselves and update the rq->misfit_task flag. This means checking task utilization when tasks are scheduled and on sched_tick. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-3-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 3b1baa6496e6b7ad016342a9d256bdfb072ce902) Signed-off-by: Quentin Perret Change-Id: If859d4e275a25e20ca004ed980c6e563e68d74ca --- kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 2 ++ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 392096b4c12a..e4f39170dd8d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); +static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) @@ -1456,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, static unsigned long weighted_cpuload(struct rq *rq); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); -static unsigned long capacity_of(int cpu); /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3723,6 +3723,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) WRITE_ONCE(p->se.avg.util_est, ue); } +static inline int task_fits_capacity(struct task_struct *p, long capacity) +{ + return capacity * 1024 > task_util_est(p) * capacity_margin; +} + +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) +{ + if (!static_branch_unlikely(&sched_asym_cpucapacity)) + return; + + if (!p) { + rq->misfit_task_load = 0; + return; + } + + if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + rq->misfit_task_load = 0; + return; + } + + rq->misfit_task_load = task_h_load(p); +} + #else /* CONFIG_SMP */ #define UPDATE_TG 0x0 @@ -3752,6 +3775,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} +static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -6293,7 +6317,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) /* Bring task utilization in sync with prev_cpu */ sync_entity_load_avg(&p->se); - return min_cap * 1024 < task_util(p) * capacity_margin; + return !task_fits_capacity(p, min_cap); } /* @@ -6712,9 +6736,12 @@ done: __maybe_unused; if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); + update_misfit_status(p, rq); + return p; idle: + update_misfit_status(NULL, rq); new_tasks = idle_balance(rq, rf); /* @@ -6920,6 +6947,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +enum group_type { + group_other = 0, + group_misfit_task, + group_imbalanced, + group_overloaded, +}; + #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 #define LBF_DST_PINNED 0x04 @@ -7493,12 +7527,6 @@ static unsigned long task_h_load(struct task_struct *p) /********** Helpers for find_busiest_group ************************/ -enum group_type { - group_other = 0, - group_imbalanced, - group_overloaded, -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -7514,6 +7542,7 @@ struct sg_lb_stats { unsigned int group_weight; enum group_type group_type; int group_no_capacity; + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -7806,6 +7835,9 @@ group_type group_classify(struct sched_group *group, if (sg_imbalanced(group)) return group_imbalanced; + if (sgs->group_misfit_task_load) + return group_misfit_task; + return group_other; } @@ -7880,6 +7912,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, */ if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; + + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load = rq->misfit_task_load; } /* Adjust by relative CPU capacity of the group */ @@ -9661,6 +9697,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) if (static_branch_unlikely(&sched_numa_balancing)) task_tick_numa(rq, curr); + + update_misfit_status(curr, rq); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 34f79bbd07a2..8ba7d2403caf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -845,6 +845,8 @@ struct rq { unsigned char idle_balance; + unsigned long misfit_task_load; + /* For active balancing */ int active_balance; int push_cpu; From 92819a68c9fa72686f8166735b8d575888620abf Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:41 +0100 Subject: [PATCH 07/75] UPSTREAM: sched/fair: Add sched_group per-CPU max capacity The current sg->min_capacity tracks the lowest per-CPU compute capacity available in the sched_group when rt/irq pressure is taken into account. Minimum capacity isn't the ideal metric for tracking if a sched_group needs offloading to another sched_group for some scenarios, e.g. a sched_group with multiple CPUs if only one is under heavy pressure. Tracking maximum capacity isn't perfect either but a better choice for some situations as it indicates that the sched_group definitely compute capacity constrained either due to rt/irq pressure on all CPUs or asymmetric CPU capacities (e.g. big.LITTLE). Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-4-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e3d6d0cb66f2351cbfd09fbae04eb9804afe9577) Signed-off-by: Quentin Perret Change-Id: I56324d1543d8d0d052a66cd94c7f8e5083bd9c28 --- kernel/sched/fair.c | 24 ++++++++++++++++++++---- kernel/sched/sched.h | 1 + kernel/sched/topology.c | 2 ++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e4f39170dd8d..96027ee1ab56 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7651,13 +7651,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; + sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, min_capacity; + unsigned long capacity, min_capacity, max_capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -7671,6 +7672,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity = 0; min_capacity = ULONG_MAX; + max_capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -7701,6 +7703,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } min_capacity = min(capacity, min_capacity); + max_capacity = max(capacity, max_capacity); } } else { /* @@ -7714,12 +7717,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu) capacity += sgc->capacity; min_capacity = min(sgc->min_capacity, min_capacity); + max_capacity = max(sgc->max_capacity, max_capacity); group = group->next; } while (group != child->groups); } sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = min_capacity; + sdg->sgc->max_capacity = max_capacity; } /* @@ -7815,16 +7820,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* - * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller + * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool -group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * capacity_margin < ref->sgc->min_capacity * 1024; } +/* + * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller + * per-CPU capacity_orig than sched_group ref. + */ +static inline bool +group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) +{ + return sg->sgc->max_capacity * capacity_margin < + ref->sgc->max_capacity * 1024; +} + static inline enum group_type group_classify(struct sched_group *group, struct sg_lb_stats *sgs) @@ -7970,7 +7986,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && - group_smaller_cpu_capacity(sds->local, sg)) + group_smaller_min_cpu_capacity(sds->local, sg)) return false; asym_packing: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8ba7d2403caf..38127177ffbf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1200,6 +1200,7 @@ struct sched_group_capacity { */ unsigned long capacity; unsigned long min_capacity; /* Min per-CPU capacity in group */ + unsigned long max_capacity; /* Max per-CPU capacity in group */ unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b0cdf5e95bda..2536e1b938f9 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -693,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd, sg_span = sched_group_span(sg); sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; } static int @@ -852,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; + sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } From fa4d08f31dedd8fa999deee5d146fef01189d4d1 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:42 +0100 Subject: [PATCH 08/75] UPSTREAM: sched/fair: Consider misfit tasks when load-balancing On asymmetric CPU capacity systems load intensive tasks can end up on CPUs that don't suit their compute demand. In this scenarios 'misfit' tasks should be migrated to CPUs with higher compute capacity to ensure better throughput. group_misfit_task indicates this scenario, but tweaks to the load-balance code are needed to make the migrations happen. Misfit balancing only makes sense between a source group of lower per-CPU capacity and destination group of higher compute capacity. Otherwise, misfit balancing is ignored. group_misfit_task has lowest priority so any imbalance due to overload is dealt with first. The modifications are: 1. Only pick a group containing misfit tasks as the busiest group if the destination group has higher capacity and has spare capacity. 2. When the busiest group is a 'misfit' group, skip the usual average load and group capacity checks. 3. Set the imbalance for 'misfit' balancing sufficiently high for a task to be pulled ignoring average load. 4. Pick the CPU with the highest misfit load as the source CPU. 5. If the misfit task is alone on the source CPU, go for active balancing. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-5-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit cad68e552e7774b68ae6a2c5fedb792936098b72) Signed-off-by: Quentin Perret Change-Id: If41e108f3200ade85b4a130e21eaee62bb3860fd --- kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96027ee1ab56..fcdf8a6b357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6984,6 +6984,7 @@ struct lb_env { unsigned int loop_max; enum fbq_type fbq_type; + enum group_type src_grp_type; struct list_head tasks; }; @@ -7967,6 +7968,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, { struct sg_lb_stats *busiest = &sds->busiest_stat; + /* + * Don't try to pull misfit tasks we can't help. + * We can use max_capacity here as reduction in capacity on some + * CPUs in the group should either be possible to resolve + * internally or be covered by avg_load imbalance (eventually). + */ + if (sgs->group_type == group_misfit_task && + (!group_smaller_max_cpu_capacity(sg, sds->local) || + !group_has_capacity(env, &sds->local_stat))) + return false; + if (sgs->group_type > busiest->group_type) return true; @@ -7989,6 +8001,13 @@ static bool update_sd_pick_busiest(struct lb_env *env, group_smaller_min_cpu_capacity(sds->local, sg)) return false; + /* + * If we have more than one misfit sg go with the biggest misfit. + */ + if (sgs->group_type == group_misfit_task && + sgs->group_misfit_task_load < busiest->group_misfit_task_load) + return false; + asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) @@ -8286,8 +8305,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ - if (busiest->avg_load <= sds->avg_load || - local->avg_load >= sds->avg_load) { + if (busiest->group_type != group_misfit_task && + (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load)) { env->imbalance = 0; return fix_small_imbalance(env, sds); } @@ -8321,6 +8341,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; + /* Boost imbalance to allow misfit task to be balanced. */ + if (busiest->group_type == group_misfit_task) { + env->imbalance = max_t(long, env->imbalance, + busiest->group_misfit_task_load); + } + /* * if *imbalance is less than the average load per runnable task * there is no guarantee that any tasks will be moved so we'll have @@ -8387,6 +8413,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; + /* Misfit tasks should be dealt with regardless of the avg load */ + if (busiest->group_type == group_misfit_task) + goto force_balance; + /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -8424,6 +8454,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) force_balance: /* Looks like there is an imbalance. Compute it */ + env->src_grp_type = busiest->group_type; calculate_imbalance(env, &sds); return env->imbalance ? sds.busiest : NULL; @@ -8471,6 +8502,19 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; + /* + * For ASYM_CPUCAPACITY domains with misfit tasks we simply + * seek the "biggest" misfit task. + */ + if (env->src_grp_type == group_misfit_task) { + if (rq->misfit_task_load > busiest_load) { + busiest_load = rq->misfit_task_load; + busiest = rq; + } + + continue; + } + capacity = capacity_of(i); wl = weighted_cpuload(rq); @@ -8540,6 +8584,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_misfit_task) + return 1; + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From d66ab02a3c6d82093e09c82c9030df487327e5c6 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:43 +0100 Subject: [PATCH 09/75] UPSTREAM: sched/fair: Kick nohz balance if rq->misfit_task_load There already are a few conditions in nohz_kick_needed() to ensure a nohz kick is triggered, but they are not enough for some misfit task scenarios. Excluding asym packing, those are: - rq->nr_running >=2: Not relevant here because we are running a misfit task, it needs to be migrated regardless and potentially through active balance. - sds->nr_busy_cpus > 1: If there is only the misfit task being run on a group of low capacity CPUs, this will be evaluated to False. - rq->cfs.h_nr_running >=1 && check_cpu_capacity(): Not relevant here, misfit task needs to be migrated regardless of rt/IRQ pressure As such, this commit adds an rq->misfit_task_load condition to trigger a nohz kick. The idea to kick a nohz balance for misfit tasks originally came from Leo Yan , and a similar patch was submitted for the Android Common Kernel - see: https://lists.linaro.org/pipermail/eas-dev/2016-September/000551.html Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-6-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 5fbdfae5221a5208ed8e7653fc1c4b31de420f74) Signed-off-by: Quentin Perret Change-Id: I1ad94a5688987950355b4fe0190cd5965aec035e --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fcdf8a6b357e..762bb0f1a7df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9229,7 +9229,7 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; - if (rq->nr_running >= 2) { + if (rq->nr_running >= 2 || rq->misfit_task_load) { flags = NOHZ_KICK_MASK; goto out; } From e45affc45535186c3f787bd7aabb889b5fdb8392 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:44 +0100 Subject: [PATCH 10/75] UPSTREAM: sched/fair: Change 'prefer_sibling' type to bool This variable is entirely local to update_sd_lb_stats, so we can safely change its type and slightly clean up its initialisation. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-7-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit dbbad719449e06d73db21598d6eee178f7a54b3b) Signed-off-by: Quentin Perret Change-Id: Ida430fc3f4c5acbc996da93f886b821d488281e2 --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 762bb0f1a7df..8706a7b2a559 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8076,11 +8076,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx, prefer_sibling = 0; + int load_idx; bool overload = false; - - if (child && child->flags & SD_PREFER_SIBLING) - prefer_sibling = 1; + bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) From f68d8febf638aad9a19174e6d2fa80869b8ab9f7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:45 +0100 Subject: [PATCH 11/75] UPSTREAM: sched/core: Change root_domain->overload type to int sizeof(_Bool) is implementation defined, so let's just go with 'int' as is done for other structures e.g. sched_domain_shared->has_idle_cores. The local 'overload' variable used in update_sd_lb_stats can remain bool, as it won't impact any struct layout and can be assigned to the root_domain field. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-8-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 575638d1047eb057a5cdf95cc0b3c084e1279508) Signed-off-by: Quentin Perret Change-Id: I05f2778c80b0c47a9f336b9d2baf1c9daae1f8f6 --- kernel/sched/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 38127177ffbf..c655b4feffcf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -718,7 +718,7 @@ struct root_domain { cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ - bool overload; + int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more @@ -1701,7 +1701,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP if (!rq->rd->overload) - rq->rd->overload = true; + rq->rd->overload = 1; #endif } From 6497830fe9dfa6f0375631a03dfc68ffd6c402c1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:46 +0100 Subject: [PATCH 12/75] UPSTREAM: sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE() This variable can be read and set locklessly within update_sd_lb_stats(). As such, READ/WRITE_ONCE() are added to make sure nothing terribly wrong can happen because of the compiler. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-9-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit e90c8fe15a3bf93a23088bcf1a56a0fa391d4e50) Signed-off-by: Quentin Perret Change-Id: Id46db398099d15a051566806f7ca0092d3973788 --- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8706a7b2a559..55a7c2bcb094 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8152,8 +8152,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (env->dst_rq->rd->overload != overload) - env->dst_rq->rd->overload = overload; + if (READ_ONCE(env->dst_rq->rd->overload) != overload) + WRITE_ONCE(env->dst_rq->rd->overload, overload); } } @@ -9596,7 +9596,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || - !this_rq->rd->overload) { + !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c655b4feffcf..47ced388d473 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1700,8 +1700,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP - if (!rq->rd->overload) - rq->rd->overload = 1; + if (!READ_ONCE(rq->rd->overload)) + WRITE_ONCE(rq->rd->overload, 1); #endif } From 9551c952a2ef6dd8b96d0391005be54a29554761 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 4 Jul 2018 11:17:47 +0100 Subject: [PATCH 13/75] UPSTREAM: sched/fair: Set rq->rd->overload when misfit Idle balance is a great opportunity to pull a misfit task. However, there are scenarios where misfit tasks are present but idle balance is prevented by the overload flag. A good example of this is a workload of n identical tasks. Let's suppose we have a 2+2 Arm big.LITTLE system. We then spawn 4 fairly CPU-intensive tasks - for the sake of simplicity let's say they are just CPU hogs, even when running on big CPUs. They are identical tasks, so on an SMP system they should all end at (roughly) the same time. However, in our case the LITTLE CPUs are less performing than the big CPUs, so tasks running on the LITTLEs will have a longer completion time. This means that the big CPUs will complete their work earlier, at which point they should pull the tasks from the LITTLEs. What we want to happen is summarized as follows: a,b,c,d are our CPU-hogging tasks _ signifies idling LITTLE_0 | a a a a _ _ LITTLE_1 | b b b b _ _ ---------|------------- big_0 | c c c c a a big_1 | d d d d b b ^ ^ Tasks end on the big CPUs, idle balance happens and the misfit tasks are pulled straight away This however won't happen, because currently the overload flag is only set when there is any CPU that has more than one runnable task - which may very well not be the case here if our CPU-hogging workload is all there is to run. As such, this commit sets the overload flag in update_sg_lb_stats when a group is flagged as having a misfit task. Signed-off-by: Valentin Schneider Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-10-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 757ffdd705ee942fc8150b17942d968601d2a15b) Signed-off-by: Quentin Perret Change-Id: I608765f3dd4238202bce5e6996c2711b28cbf4ea --- kernel/sched/fair.c | 6 ++++-- kernel/sched/sched.h | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55a7c2bcb094..95af63ada3fa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7887,7 +7887,7 @@ static bool update_nohz_stats(struct rq *rq, bool force) * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate more than one runnable task for any CPU. + * @overload: Indicate pullable load (e.g. >1 runnable task). */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, @@ -7931,8 +7931,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; if (env->sd->flags & SD_ASYM_CPUCAPACITY && - sgs->group_misfit_task_load < rq->misfit_task_load) + sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; + *overload = 1; + } } /* Adjust by relative CPU capacity of the group */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47ced388d473..b6c4c8619700 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -717,7 +717,11 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; - /* Indicate more than one runnable task for any CPU */ + /* + * Indicate pullable load on at least one CPU, e.g: + * - More than one runnable task + * - Running task is misfit + */ int overload; /* From cf203a200b47054ba2baa334ca9199558bdb5bde Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Wed, 4 Jul 2018 11:17:48 +0100 Subject: [PATCH 14/75] UPSTREAM: sched/fair: Don't move tasks to lower capacity CPUs unless necessary When lower capacity CPUs are load balancing and considering to pull something from a higher capacity group, we should not pull tasks from a CPU with only one task running as this is guaranteed to impede progress for that task. If there is more than one task running, load balance in the higher capacity group would have already made any possible moves to resolve imbalance and we should make better use of system compute capacity by moving a task if we still have more than one running. Signed-off-by: Chris Redpath Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-11-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 4ad3831a9d4af5e36da5d44a3b9c6522d0353cee) Signed-off-by: Quentin Perret Change-Id: I49e78e10920364ad94d3bc4f1407208cd1e60aba --- kernel/sched/fair.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 95af63ada3fa..4f623d62f1ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8517,6 +8517,17 @@ static struct rq *find_busiest_queue(struct lb_env *env, capacity = capacity_of(i); + /* + * For ASYM_CPUCAPACITY domains, don't pick a CPU that could + * eventually lead to active_balancing high->low capacity. + * Higher per-CPU capacity is considered better than balancing + * average load. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + capacity_of(env->dst_cpu) < capacity && + rq->nr_running == 1) + continue; + wl = weighted_cpuload(rq); /* From d9634ab459ac4ded9333e7de238255ba2cd9924c Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Jul 2018 11:17:50 +0100 Subject: [PATCH 15/75] UPSTREAM: sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains The 'prefer sibling' sched_domain flag is intended to encourage spreading tasks to sibling sched_domain to take advantage of more caches and core for SMT systems. It has recently been changed to be on all non-NUMA topology level. However, spreading across domains with CPU capacity asymmetry isn't desirable, e.g. spreading from high capacity to low capacity CPUs even if high capacity CPUs aren't overutilized might give access to more cache but the CPU will be slower and possibly lead to worse overall throughput. To prevent this, we need to remove SD_PREFER_SIBLING on the sched_domain level immediately below SD_ASYM_CPUCAPACITY. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dietmar.eggemann@arm.com Cc: gaku.inami.xh@renesas.com Cc: valentin.schneider@arm.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/1530699470-29808-13-git-send-email-morten.rasmussen@arm.com Signed-off-by: Ingo Molnar (cherry picked from commit 9c63e84db29bcf584040931ad97c2edd11e35f6c) Signed-off-by: Quentin Perret Change-Id: Iad5af7e95aedf555f4a8ffffaed7bc430fb8a169 --- kernel/sched/topology.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2536e1b938f9..7ffad0d3a4eb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1126,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE - | 0*SD_PREFER_SIBLING + | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , @@ -1152,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; + /* + * Don't attempt to spread across CPUs of different capacities. + */ + if (sd->child) + sd->child->flags &= ~SD_PREFER_SIBLING; + for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { - sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 117; sd->cache_nice_tries = 1; sd->busy_idx = 2; @@ -1173,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; + sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | @@ -1182,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { - sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; From 30f6e1b5c90712afd918e622a26e7a2bd7f72cc7 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:14:59 +0100 Subject: [PATCH 16/75] FROMLIST: sched: Relocate arch_scale_cpu_capacity By default, arch_scale_cpu_capacity() is only visible from within the kernel/sched folder. Relocate it to include/linux/sched/topology.h to make it visible to other clients needing to know about the capacity of CPUs, such as the Energy Model framework. Change-Id: I144c7299e122201dbcadc431d55d0a6d24d90005 Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-2-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- include/linux/sched/topology.h | 19 +++++++++++++++++++ kernel/sched/sched.h | 21 --------------------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 6b9976180c1e..5e56d6b1e217 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -202,6 +202,17 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); # define SD_INIT_NAME(type) #endif +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) + return sd->smt_gain / sd->span_weight; + + return SCHED_CAPACITY_SCALE; +} +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -217,6 +228,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + #endif /* !CONFIG_SMP */ static inline int task_node(const struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b6c4c8619700..b2dc595308ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1764,27 +1764,6 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif -#ifdef CONFIG_SMP -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) -{ - if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) - return sd->smt_gain / sd->span_weight; - - return SCHED_CAPACITY_SCALE; -} -#endif -#else -#ifndef arch_scale_cpu_capacity -static __always_inline -unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} -#endif -#endif - struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); From f3e45428b75019460b2c0fb3134b0ea7d4741c41 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:00 +0100 Subject: [PATCH 17/75] FROMLIST: sched/cpufreq: Prepare schedutil for Energy Aware Scheduling Schedutil requests frequency by aggregating utilization signals from the scheduler (CFS, RT, DL, IRQ) and applying a 25% margin on top of them. Since Energy Aware Scheduling (EAS) needs to be able to predict the frequency requests, it needs to forecast the decisions made by the governor. In order to prepare the introduction of EAS, introduce schedutil_freq_util() to centralize the aforementioned signal aggregation and make it available to both schedutil and EAS. Since frequency selection and energy estimation still need to deal with RT and DL signals slightly differently, schedutil_freq_util() is called with a different 'type' parameter in those two contexts, and returns an aggregated utilization signal accordingly. While at it, introduce the map_util_freq() function which is designed to make schedutil's 25% margin usable easily for both sugov and EAS. As EAS will be able to predict schedutil's frequency requests more accurately than any other governor by design, it'd be sensible to make sure EAS cannot be used without schedutil. This will be done later, once EAS has actually been introduced. Change-Id: Idbeeb00926045507b73f9cba37630b38ae0816c0 Cc: Ingo Molnar Cc: Peter Zijlstra Suggested-by: Peter Zijlstra Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-3-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- include/linux/sched/cpufreq.h | 6 +++ kernel/sched/cpufreq_schedutil.c | 92 +++++++++++++++++++++----------- kernel/sched/sched.h | 30 +++++++++++ 3 files changed, 97 insertions(+), 31 deletions(-) diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 59667444669f..afa940cd50dc 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -20,6 +20,12 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, void (*func)(struct update_util_data *data, u64 time, unsigned int flags)); void cpufreq_remove_update_util_hook(int cpu); + +static inline unsigned long map_util_freq(unsigned long util, + unsigned long freq, unsigned long cap) +{ + return (freq + (freq >> 2)) * util / cap; +} #endif /* CONFIG_CPU_FREQ */ #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3fffad3bc8a8..105cf70fcb69 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -13,6 +13,7 @@ #include "sched.h" +#include #include struct sugov_tunables { @@ -167,7 +168,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - freq = (freq + (freq >> 2)) * util / max; + freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; @@ -197,15 +198,13 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type) { - struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util, irq, max; + struct rq *rq = cpu_rq(cpu); + unsigned long util, irq; - sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); - sg_cpu->bw_dl = cpu_bw_dl(rq); - - if (rt_rq_is_runnable(&rq->rt)) + if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) return max; /* @@ -223,20 +222,33 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) * utilization (PELT windows are synchronized) we can directly add them * to obtain the CPU's actual utilization. */ - util = cpu_util_cfs(rq); + util = util_cfs; util += cpu_util_rt(rq); - /* - * We do not make cpu_util_dl() a permanent part of this sum because we - * want to use cpu_bw_dl() later on, but we need to check if the - * CFS+RT+DL sum is saturated (ie. no idle time) such that we select - * f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. - */ - if ((util + cpu_util_dl(rq)) >= max) - return max; + if (type == FREQUENCY_UTIL) { + /* + * For frequency selection we do not make cpu_util_dl() a + * permanent part of this sum because we want to use + * cpu_bw_dl() later on, but we need to check if the + * CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us + * to not quite hit saturation when we should -- + * something for later. + */ + + if ((util + cpu_util_dl(rq)) >= max) + return max; + } else { + /* + * OTOH, for energy computation we need the estimated + * running time, so include util_dl and ignore dl_bw. + */ + util += cpu_util_dl(rq); + if (util >= max) + return max; + } /* * There is still idle time; further improve the number by using the @@ -250,17 +262,35 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) util = scale_irq_capacity(util, irq, max); util += irq; - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. - * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - return min(max, util + sg_cpu->bw_dl); + if (type == FREQUENCY_UTIL) { + /* + * Bandwidth required by DEADLINE must always be granted + * while, for FAIR and RT, we use blocked utilization of + * IDLE CPUs as a mechanism to gracefully reduce the + * frequency when no tasks show up for longer periods of + * time. + * + * Ideally we would like to set bw_dl as min/guaranteed + * freq and util + bw_dl as requested freq. However, + * cpufreq is not yet ready for such an interface. So, + * we only do the latter for now. + */ + util += cpu_bw_dl(rq); + } + + return min(max, util); +} + +static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +{ + struct rq *rq = cpu_rq(sg_cpu->cpu); + unsigned long util = cpu_util_cfs(rq); + unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + + sg_cpu->max = max; + sg_cpu->bw_dl = cpu_bw_dl(rq); + + return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); } /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b2dc595308ea..15fbfab2bcad 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2176,6 +2176,31 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL +/** + * enum schedutil_type - CPU utilization type + * @FREQUENCY_UTIL: Utilization used to select frequency + * @ENERGY_UTIL: Utilization used during energy calculation + * + * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time + * need to be aggregated differently depending on the usage made of them. This + * enum is used within schedutil_freq_util() to differentiate the types of + * utilization expected by the callers, and adjust the aggregation accordingly. + */ +enum schedutil_type { + FREQUENCY_UTIL, + ENERGY_UTIL, +}; + +unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type); + +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + unsigned long max = arch_scale_cpu_capacity(NULL, cpu); + + return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2202,6 +2227,11 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + return cfs; +} #endif #ifdef HAVE_SCHED_AVG_IRQ From 9cbf2b68cabf4660c97674d0a74d5f191c561c2f Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:01 +0100 Subject: [PATCH 18/75] FROMLIST: PM: Introduce an Energy Model management framework Several subsystems in the kernel (task scheduler and/or thermal at the time of writing) can benefit from knowing about the energy consumed by CPUs. Yet, this information can come from different sources (DT or firmware for example), in different formats, hence making it hard to exploit without a standard API. As an attempt to address this, introduce a centralized Energy Model (EM) management framework which aggregates the power values provided by drivers into a table for each performance domain in the system. The power cost tables are made available to interested clients (e.g. task scheduler or thermal) via platform-agnostic APIs. The overall design is represented by the diagram below (focused on Arm-related drivers as an example, but applicable to any architecture): +---------------+ +-----------------+ +-------------+ | Thermal (IPA) | | Scheduler (EAS) | | Other | +---------------+ +-----------------+ +-------------+ | | em_pd_energy() | | | em_cpu_get() | +-----------+ | +--------+ | | | v v v +---------------------+ | | | Energy Model | | | | Framework | | | +---------------------+ ^ ^ ^ | | | em_register_perf_domain() +----------+ | +---------+ | | | +---------------+ +---------------+ +--------------+ | cpufreq-dt | | arm_scmi | | Other | +---------------+ +---------------+ +--------------+ ^ ^ ^ | | | +--------------+ +---------------+ +--------------+ | Device Tree | | Firmware | | ? | +--------------+ +---------------+ +--------------+ Drivers (typically, but not limited to, CPUFreq drivers) can register data in the EM framework using the em_register_perf_domain() API. The calling driver must provide a callback function with a standardized signature that will be used by the EM framework to build the power cost tables of the performance domain. This design should offer a lot of flexibility to calling drivers which are free of reading information from any location and to use any technique to compute power costs. Moreover, the capacity states registered by drivers in the EM framework are not required to match real performance states of the target. This is particularly important on targets where the performance states are not known by the OS. The power cost coefficients managed by the EM framework are specified in milli-watts. Although the two potential users of those coefficients (IPA and EAS) only need relative correctness, IPA specifically needs to compare the power of CPUs with the power of other components (GPUs, for example), which are still expressed in absolute terms in their respective subsystems. Hence, specifying the power of CPUs in milli-watts should help transitioning IPA to using the EM framework without introducing new problems by keeping units comparable across sub-systems. On the longer term, the EM of other devices than CPUs could also be managed by the EM framework, which would enable to remove the absolute unit. However, this is not absolutely required as a first step, so this extension of the EM framework is left for later. On the client side, the EM framework offers APIs to access the power cost tables of a CPU (em_cpu_get()), and to estimate the energy consumed by the CPUs of a performance domain (em_pd_energy()). Clients such as the task scheduler can then use these APIs to access the shared data structures holding the Energy Model of CPUs. Change-Id: I384cb3d28f37fe82c2943d7208a4cf5dcca2b6bd Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-4-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- include/linux/energy_model.h | 187 ++++++++++++++++++++++++++++++++ kernel/power/Kconfig | 15 +++ kernel/power/Makefile | 2 + kernel/power/energy_model.c | 201 +++++++++++++++++++++++++++++++++++ 4 files changed, 405 insertions(+) create mode 100644 include/linux/energy_model.h create mode 100644 kernel/power/energy_model.c diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h new file mode 100644 index 000000000000..aa027f7bcb3e --- /dev/null +++ b/include/linux/energy_model.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ENERGY_MODEL_H +#define _LINUX_ENERGY_MODEL_H +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_ENERGY_MODEL +/** + * em_cap_state - Capacity state of a performance domain + * @frequency: The CPU frequency in KHz, for consistency with CPUFreq + * @power: The power consumed by 1 CPU at this level, in milli-watts + * @cost: The cost coefficient associated with this level, used during + * energy calculation. Equal to: power * max_frequency / frequency + */ +struct em_cap_state { + unsigned long frequency; + unsigned long power; + unsigned long cost; +}; + +/** + * em_perf_domain - Performance domain + * @table: List of capacity states, in ascending order + * @nr_cap_states: Number of capacity states + * @cpus: Cpumask covering the CPUs of the domain + * + * A "performance domain" represents a group of CPUs whose performance is + * scaled together. All CPUs of a performance domain must have the same + * micro-architecture. Performance domains often have a 1-to-1 mapping with + * CPUFreq policies. + */ +struct em_perf_domain { + struct em_cap_state *table; + int nr_cap_states; + unsigned long cpus[0]; +}; + +#define EM_CPU_MAX_POWER 0xFFFF + +struct em_data_callback { + /** + * active_power() - Provide power at the next capacity state of a CPU + * @power : Active power at the capacity state in mW (modified) + * @freq : Frequency at the capacity state in kHz (modified) + * @cpu : CPU for which we do this operation + * + * active_power() must find the lowest capacity state of 'cpu' above + * 'freq' and update 'power' and 'freq' to the matching active power + * and frequency. + * + * The power is the one of a single CPU in the domain, expressed in + * milli-watts. It is expected to fit in the [0, EM_CPU_MAX_POWER] + * range. + * + * Return 0 on success. + */ + int (*active_power)(unsigned long *power, unsigned long *freq, int cpu); +}; +#define EM_DATA_CB(_active_power_cb) { .active_power = &_active_power_cb } + +struct em_perf_domain *em_cpu_get(int cpu); +int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb); + +/** + * em_pd_energy() - Estimates the energy consumed by the CPUs of a perf. domain + * @pd : performance domain for which energy has to be estimated + * @max_util : highest utilization among CPUs of the domain + * @sum_util : sum of the utilization of all CPUs in the domain + * + * Return: the sum of the energy consumed by the CPUs of the domain assuming + * a capacity state satisfying the max utilization of the domain. + */ +static inline unsigned long em_pd_energy(struct em_perf_domain *pd, + unsigned long max_util, unsigned long sum_util) +{ + unsigned long freq, scale_cpu; + struct em_cap_state *cs; + int i, cpu; + + /* + * In order to predict the capacity state, map the utilization of the + * most utilized CPU of the performance domain to a requested frequency, + * like schedutil. + */ + cpu = cpumask_first(to_cpumask(pd->cpus)); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + cs = &pd->table[pd->nr_cap_states - 1]; + freq = map_util_freq(max_util, cs->frequency, scale_cpu); + + /* + * Find the lowest capacity state of the Energy Model above the + * requested frequency. + */ + for (i = 0; i < pd->nr_cap_states; i++) { + cs = &pd->table[i]; + if (cs->frequency >= freq) + break; + } + + /* + * The capacity of a CPU in the domain at that capacity state (cs) + * can be computed as: + * + * cs->freq * scale_cpu + * cs->cap = -------------------- (1) + * cpu_max_freq + * + * So, ignoring the costs of idle states (which are not available in + * the EM), the energy consumed by this CPU at that capacity state is + * estimated as: + * + * cs->power * cpu_util + * cpu_nrg = -------------------- (2) + * cs->cap + * + * since 'cpu_util / cs->cap' represents its percentage of busy time. + * + * NOTE: Although the result of this computation actually is in + * units of power, it can be manipulated as an energy value + * over a scheduling period, since it is assumed to be + * constant during that interval. + * + * By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product + * of two terms: + * + * cs->power * cpu_max_freq cpu_util + * cpu_nrg = ------------------------ * --------- (3) + * cs->freq scale_cpu + * + * The first term is static, and is stored in the em_cap_state struct + * as 'cs->cost'. + * + * Since all CPUs of the domain have the same micro-architecture, they + * share the same 'cs->cost', and the same CPU capacity. Hence, the + * total energy of the domain (which is the simple sum of the energy of + * all of its CPUs) can be factorized as: + * + * cs->cost * \Sum cpu_util + * pd_nrg = ------------------------ (4) + * scale_cpu + */ + return cs->cost * sum_util / scale_cpu; +} + +/** + * em_pd_nr_cap_states() - Get the number of capacity states of a perf. domain + * @pd : performance domain for which this must be done + * + * Return: the number of capacity states in the performance domain table + */ +static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) +{ + return pd->nr_cap_states; +} + +#else +struct em_perf_domain {}; +struct em_data_callback {}; +#define EM_DATA_CB(_active_power_cb) { } + +static inline int em_register_perf_domain(cpumask_t *span, + unsigned int nr_states, struct em_data_callback *cb) +{ + return -EINVAL; +} +static inline struct em_perf_domain *em_cpu_get(int cpu) +{ + return NULL; +} +static inline unsigned long em_pd_energy(struct em_perf_domain *pd, + unsigned long max_util, unsigned long sum_util) +{ + return 0; +} +static inline int em_pd_nr_cap_states(struct em_perf_domain *pd) +{ + return 0; +} +#endif + +#endif diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3a6c2f87699e..f8fe57d1022e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -298,3 +298,18 @@ config PM_GENERIC_DOMAINS_OF config CPU_PM bool + +config ENERGY_MODEL + bool "Energy Model for CPUs" + depends on SMP + depends on CPU_FREQ + default n + help + Several subsystems (thermal and/or the task scheduler for example) + can leverage information about the energy consumed by CPUs to make + smarter decisions. This config option enables the framework from + which subsystems can access the energy models. + + The exact usage of the energy model is subsystem-dependent. + + If in doubt, say N. diff --git a/kernel/power/Makefile b/kernel/power/Makefile index a3f79f0eef36..e7e47d9be1e5 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o + +obj-$(CONFIG_ENERGY_MODEL) += energy_model.o diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c new file mode 100644 index 000000000000..d9dc2c38764a --- /dev/null +++ b/kernel/power/energy_model.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Energy Model of CPUs + * + * Copyright (c) 2018, Arm ltd. + * Written by: Quentin Perret, Arm ltd. + */ + +#define pr_fmt(fmt) "energy_model: " fmt + +#include +#include +#include +#include +#include + +/* Mapping of each CPU to the performance domain to which it belongs. */ +static DEFINE_PER_CPU(struct em_perf_domain *, em_data); + +/* + * Mutex serializing the registrations of performance domains and letting + * callbacks defined by drivers sleep. + */ +static DEFINE_MUTEX(em_pd_mutex); + +static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, + struct em_data_callback *cb) +{ + unsigned long opp_eff, prev_opp_eff = ULONG_MAX; + unsigned long power, freq, prev_freq = 0; + int i, ret, cpu = cpumask_first(span); + struct em_cap_state *table; + struct em_perf_domain *pd; + u64 fmax; + + if (!cb->active_power) + return NULL; + + pd = kzalloc(sizeof(*pd) + cpumask_size(), GFP_KERNEL); + if (!pd) + return NULL; + + table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL); + if (!table) + goto free_pd; + + /* Build the list of capacity states for this performance domain */ + for (i = 0, freq = 0; i < nr_states; i++, freq++) { + /* + * active_power() is a driver callback which ceils 'freq' to + * lowest capacity state of 'cpu' above 'freq' and updates + * 'power' and 'freq' accordingly. + */ + ret = cb->active_power(&power, &freq, cpu); + if (ret) { + pr_err("pd%d: invalid cap. state: %d\n", cpu, ret); + goto free_cs_table; + } + + /* + * We expect the driver callback to increase the frequency for + * higher capacity states. + */ + if (freq <= prev_freq) { + pr_err("pd%d: non-increasing freq: %lu\n", cpu, freq); + goto free_cs_table; + } + + /* + * The power returned by active_state() is expected to be + * positive, in milli-watts and to fit into 16 bits. + */ + if (!power || power > EM_CPU_MAX_POWER) { + pr_err("pd%d: invalid power: %lu\n", cpu, power); + goto free_cs_table; + } + + table[i].power = power; + table[i].frequency = prev_freq = freq; + + /* + * The hertz/watts efficiency ratio should decrease as the + * frequency grows on sane platforms. But this isn't always + * true in practice so warn the user if a higher OPP is more + * power efficient than a lower one. + */ + opp_eff = freq / power; + if (opp_eff >= prev_opp_eff) + pr_warn("pd%d: hertz/watts ratio non-monotonically decreasing: em_cap_state %d >= em_cap_state%d\n", + cpu, i, i - 1); + prev_opp_eff = opp_eff; + } + + /* Compute the cost of each capacity_state. */ + fmax = (u64) table[nr_states - 1].frequency; + for (i = 0; i < nr_states; i++) { + table[i].cost = div64_u64(fmax * table[i].power, + table[i].frequency); + } + + pd->table = table; + pd->nr_cap_states = nr_states; + cpumask_copy(to_cpumask(pd->cpus), span); + + return pd; + +free_cs_table: + kfree(table); +free_pd: + kfree(pd); + + return NULL; +} + +/** + * em_cpu_get() - Return the performance domain for a CPU + * @cpu : CPU to find the performance domain for + * + * Return: the performance domain to which 'cpu' belongs, or NULL if it doesn't + * exist. + */ +struct em_perf_domain *em_cpu_get(int cpu) +{ + return READ_ONCE(per_cpu(em_data, cpu)); +} +EXPORT_SYMBOL_GPL(em_cpu_get); + +/** + * em_register_perf_domain() - Register the Energy Model of a performance domain + * @span : Mask of CPUs in the performance domain + * @nr_states : Number of capacity states to register + * @cb : Callback functions providing the data of the Energy Model + * + * Create Energy Model tables for a performance domain using the callbacks + * defined in cb. + * + * If multiple clients register the same performance domain, all but the first + * registration will be ignored. + * + * Return 0 on success + */ +int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, + struct em_data_callback *cb) +{ + unsigned long cap, prev_cap = 0; + struct em_perf_domain *pd; + int cpu, ret = 0; + + if (!span || !nr_states || !cb) + return -EINVAL; + + /* + * Use a mutex to serialize the registration of performance domains and + * let the driver-defined callback functions sleep. + */ + mutex_lock(&em_pd_mutex); + + for_each_cpu(cpu, span) { + /* Make sure we don't register again an existing domain. */ + if (READ_ONCE(per_cpu(em_data, cpu))) { + ret = -EEXIST; + goto unlock; + } + + /* + * All CPUs of a domain must have the same micro-architecture + * since they all share the same table. + */ + cap = arch_scale_cpu_capacity(NULL, cpu); + if (prev_cap && prev_cap != cap) { + pr_err("CPUs of %*pbl must have the same capacity\n", + cpumask_pr_args(span)); + ret = -EINVAL; + goto unlock; + } + prev_cap = cap; + } + + /* Create the performance domain and add it to the Energy Model. */ + pd = em_create_pd(span, nr_states, cb); + if (!pd) { + ret = -EINVAL; + goto unlock; + } + + for_each_cpu(cpu, span) { + /* + * The per-cpu array can be read concurrently from em_cpu_get(). + * The barrier enforces the ordering needed to make sure readers + * can only access well formed em_perf_domain structs. + */ + smp_store_release(per_cpu_ptr(&em_data, cpu), pd); + } + + pr_debug("Created perf domain %*pbl\n", cpumask_pr_args(span)); +unlock: + mutex_unlock(&em_pd_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(em_register_perf_domain); From b8c80a0f421142b3b410162d7ced6f32b5fd14bf Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:02 +0100 Subject: [PATCH 19/75] FROMLIST: PM / EM: Expose the Energy Model in sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Expose the Energy Model (read-only) of all performance domains in sysfs for convenience. To do so, add a kobject to the CPU subsystem under the umbrella of which a kobject for each performance domain is attached. The resulting hierarchy is as follows for a platform with two performance domains for example: /sys/devices/system/cpu/energy_model ├── pd0 │   ├── cost │   ├── cpus │   ├── frequency │   └── power └── pd4 ├── cost ├── cpus ├── frequency └── power In this implementation, the kobject abstraction is only used as a convenient way of exposing data to sysfs. However, it could also be used in the future to allocate and release performance domains in a more dynamic way using reference counting. Change-Id: Ia98bcae21c3578e385be9c6b030c9adff8210909 Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-5-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- include/linux/energy_model.h | 2 + kernel/power/energy_model.c | 90 ++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index aa027f7bcb3e..55deab2b38dc 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -27,6 +27,7 @@ struct em_cap_state { * em_perf_domain - Performance domain * @table: List of capacity states, in ascending order * @nr_cap_states: Number of capacity states + * @kobj: Kobject used to expose the domain in sysfs * @cpus: Cpumask covering the CPUs of the domain * * A "performance domain" represents a group of CPUs whose performance is @@ -37,6 +38,7 @@ struct em_cap_state { struct em_perf_domain { struct em_cap_state *table; int nr_cap_states; + struct kobject kobj; unsigned long cpus[0]; }; diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index d9dc2c38764a..5ec376d4f2f3 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -23,6 +23,82 @@ static DEFINE_PER_CPU(struct em_perf_domain *, em_data); */ static DEFINE_MUTEX(em_pd_mutex); +static struct kobject *em_kobject; + +/* Getters for the attributes of em_perf_domain objects */ +struct em_pd_attr { + struct attribute attr; + ssize_t (*show)(struct em_perf_domain *pd, char *buf); + ssize_t (*store)(struct em_perf_domain *pd, const char *buf, size_t s); +}; + +#define EM_ATTR_LEN 13 +#define show_table_attr(_attr) \ +static ssize_t show_##_attr(struct em_perf_domain *pd, char *buf) \ +{ \ + ssize_t cnt = 0; \ + int i; \ + for (i = 0; i < pd->nr_cap_states; i++) { \ + if (cnt >= (ssize_t) (PAGE_SIZE / sizeof(char) \ + - (EM_ATTR_LEN + 2))) \ + goto out; \ + cnt += scnprintf(&buf[cnt], EM_ATTR_LEN + 1, "%lu ", \ + pd->table[i]._attr); \ + } \ +out: \ + cnt += sprintf(&buf[cnt], "\n"); \ + return cnt; \ +} + +show_table_attr(power); +show_table_attr(frequency); +show_table_attr(cost); + +static ssize_t show_cpus(struct em_perf_domain *pd, char *buf) +{ + return sprintf(buf, "%*pbl\n", cpumask_pr_args(to_cpumask(pd->cpus))); +} + +#define pd_attr(_name) em_pd_##_name##_attr +#define define_pd_attr(_name) static struct em_pd_attr pd_attr(_name) = \ + __ATTR(_name, 0444, show_##_name, NULL) + +define_pd_attr(power); +define_pd_attr(frequency); +define_pd_attr(cost); +define_pd_attr(cpus); + +static struct attribute *em_pd_default_attrs[] = { + &pd_attr(power).attr, + &pd_attr(frequency).attr, + &pd_attr(cost).attr, + &pd_attr(cpus).attr, + NULL +}; + +#define to_pd(k) container_of(k, struct em_perf_domain, kobj) +#define to_pd_attr(a) container_of(a, struct em_pd_attr, attr) + +static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + struct em_perf_domain *pd = to_pd(kobj); + struct em_pd_attr *pd_attr = to_pd_attr(attr); + ssize_t ret; + + ret = pd_attr->show(pd, buf); + + return ret; +} + +static const struct sysfs_ops em_pd_sysfs_ops = { + .show = show, +}; + +static struct kobj_type ktype_em_pd = { + .sysfs_ops = &em_pd_sysfs_ops, + .default_attrs = em_pd_default_attrs, +}; + static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, struct em_data_callback *cb) { @@ -102,6 +178,11 @@ static struct em_perf_domain *em_create_pd(cpumask_t *span, int nr_states, pd->nr_cap_states = nr_states; cpumask_copy(to_cpumask(pd->cpus), span); + ret = kobject_init_and_add(&pd->kobj, &ktype_em_pd, em_kobject, + "pd%u", cpu); + if (ret) + pr_err("pd%d: failed kobject_init_and_add(): %d\n", cpu, ret); + return pd; free_cs_table: @@ -155,6 +236,15 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, */ mutex_lock(&em_pd_mutex); + if (!em_kobject) { + em_kobject = kobject_create_and_add("energy_model", + &cpu_subsys.dev_root->kobj); + if (!em_kobject) { + ret = -ENODEV; + goto unlock; + } + } + for_each_cpu(cpu, span) { /* Make sure we don't register again an existing domain. */ if (READ_ONCE(per_cpu(em_data, cpu))) { From 2b73358bce6f54b614c970d159fb558f333b53b4 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:03 +0100 Subject: [PATCH 20/75] FROMLIST: sched/topology: Reference the Energy Model of CPUs when available The existing scheduling domain hierarchy is defined to map to the cache topology of the system. However, Energy Aware Scheduling (EAS) requires more knowledge about the platform, and specifically needs to know about the span of Performance Domains (PD), which do not always align with caches. To address this issue, use the Energy Model (EM) of the system to extend the scheduler topology code with a representation of the PDs, alongside the scheduling domains. More specifically, a linked list of PDs is attached to each root domain. When multiple root domains are in use, each list contains only the PDs covering the CPUs of its root domain. If a PD spans over CPUs of multiple different root domains, it will be duplicated in all lists. The lists are fully maintained by the scheduler from partition_sched_domains() in order to cope with hotplug and cpuset changes. As for scheduling domains, the list are protected by RCU to ensure safe concurrent updates. Change-Id: I27195ab35072210bdef91e78944d1407ff61f644 Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-6-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- kernel/sched/sched.h | 21 +++++++ kernel/sched/topology.c | 134 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 15fbfab2bcad..bd47fb56b09c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -702,6 +703,12 @@ static inline bool sched_asym_prefer(int a, int b) return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); } +struct perf_domain { + struct em_perf_domain *em_pd; + struct perf_domain *next; + struct rcu_head rcu; +}; + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -754,6 +761,12 @@ struct root_domain { struct cpupri cpupri; unsigned long max_cpu_capacity; + + /* + * NULL-terminated list of performance domains intersecting with the + * CPUs of the rd. Protected by RCU. + */ + struct perf_domain *pd; }; extern struct root_domain def_root_domain; @@ -2261,3 +2274,11 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned return util; } #endif + +#ifdef CONFIG_SMP +#ifdef CONFIG_ENERGY_MODEL +#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) +#else +#define perf_domain_span(pd) NULL +#endif +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7ffad0d3a4eb..d0f31a476fcf 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,6 +201,116 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +#ifdef CONFIG_ENERGY_MODEL +static void free_pd(struct perf_domain *pd) +{ + struct perf_domain *tmp; + + while (pd) { + tmp = pd->next; + kfree(pd); + pd = tmp; + } +} + +static struct perf_domain *find_pd(struct perf_domain *pd, int cpu) +{ + while (pd) { + if (cpumask_test_cpu(cpu, perf_domain_span(pd))) + return pd; + pd = pd->next; + } + + return NULL; +} + +static struct perf_domain *pd_init(int cpu) +{ + struct em_perf_domain *obj = em_cpu_get(cpu); + struct perf_domain *pd; + + if (!obj) { + if (sched_debug()) + pr_info("%s: no EM found for CPU%d\n", __func__, cpu); + return NULL; + } + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) + return NULL; + pd->em_pd = obj; + + return pd; +} + +static void perf_domain_debug(const struct cpumask *cpu_map, + struct perf_domain *pd) +{ + if (!sched_debug() || !pd) + return; + + printk(KERN_DEBUG "root_domain %*pbl: ", cpumask_pr_args(cpu_map)); + + while (pd) { + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", + cpumask_first(perf_domain_span(pd)), + cpumask_pr_args(perf_domain_span(pd)), + em_pd_nr_cap_states(pd->em_pd)); + pd = pd->next; + } + + printk(KERN_CONT "\n"); +} + +static void destroy_perf_domain_rcu(struct rcu_head *rp) +{ + struct perf_domain *pd; + + pd = container_of(rp, struct perf_domain, rcu); + free_pd(pd); +} + +static void build_perf_domains(const struct cpumask *cpu_map) +{ + struct perf_domain *pd = NULL, *tmp; + int cpu = cpumask_first(cpu_map); + struct root_domain *rd = cpu_rq(cpu)->rd; + int i; + + for_each_cpu(i, cpu_map) { + /* Skip already covered CPUs. */ + if (find_pd(pd, i)) + continue; + + /* Create the new pd and add it to the local list. */ + tmp = pd_init(i); + if (!tmp) + goto free; + tmp->next = pd; + pd = tmp; + } + + perf_domain_debug(cpu_map, pd); + + /* Attach the new list of performance domains to the root domain. */ + tmp = rd->pd; + rcu_assign_pointer(rd->pd, pd); + if (tmp) + call_rcu(&tmp->rcu, destroy_perf_domain_rcu); + + return; + +free: + free_pd(pd); + tmp = rd->pd; + rcu_assign_pointer(rd->pd, NULL); + if (tmp) + call_rcu(&tmp->rcu, destroy_perf_domain_rcu); +} +#else +static void free_pd(struct perf_domain *pd) { } +#endif /* CONFIG_ENERGY_MODEL */ + static void free_rootdomain(struct rcu_head *rcu) { struct root_domain *rd = container_of(rcu, struct root_domain, rcu); @@ -211,6 +321,7 @@ static void free_rootdomain(struct rcu_head *rcu) free_cpumask_var(rd->rto_mask); free_cpumask_var(rd->online); free_cpumask_var(rd->span); + free_pd(rd->pd); kfree(rd); } @@ -1961,8 +2072,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], /* Destroy deleted domains: */ for (i = 0; i < ndoms_cur; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_cur[i], doms_new[j]) - && dattrs_equal(dattr_cur, i, dattr_new, j)) + if (cpumask_equal(doms_cur[i], doms_new[j]) && + dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; } /* No match - a current sched domain not in new doms_new[] */ @@ -1982,8 +2093,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], /* Build new domains: */ for (i = 0; i < ndoms_new; i++) { for (j = 0; j < n && !new_topology; j++) { - if (cpumask_equal(doms_new[i], doms_cur[j]) - && dattrs_equal(dattr_new, i, dattr_cur, j)) + if (cpumask_equal(doms_new[i], doms_cur[j]) && + dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; } /* No match - add a new doms_new */ @@ -1992,6 +2103,21 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ; } +#ifdef CONFIG_ENERGY_MODEL + /* Build perf. domains: */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < n; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) && + cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) + goto match3; + } + /* No match - add perf. domains for a new rd */ + build_perf_domains(doms_new[i]); +match3: + ; + } +#endif + /* Remember the new sched domains: */ if (doms_cur != &fallback_doms) free_sched_domains(doms_cur, ndoms_cur); From a4dffa6a326a710b5c287d1302027f48cd240f2a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:04 +0100 Subject: [PATCH 21/75] FROMLIST: sched/topology: Lowest CPU asymmetry sched_domain level pointer Add another member to the family of per-cpu sched_domain shortcut pointers. This one, sd_asym_cpucapacity, points to the lowest level at which the SD_ASYM_CPUCAPACITY flag is set. While at it, rename the sd_asym shortcut to sd_asym_packing to avoid confusions. Generally speaking, the largest opportunity to save energy via scheduling comes from a smarter exploitation of heterogeneous platforms (i.e. big.LITTLE). Consequently, the sd_asym_cpucapacity shortcut will be used at first as the lowest domain where Energy-Aware Scheduling (EAS) should be applied. For example, it is possible to apply EAS within a socket on a multi-socket system, as long as each socket has an asymmetric topology. Energy-aware cross-sockets wake-up balancing can only happen if this_cpu and prev_cpu are in different sockets. Change-Id: Ie777a1733991d40ce063b318e915199ba3c5416a cc: Ingo Molnar cc: Peter Zijlstra Suggested-by: Morten Rasmussen Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-7-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 3 ++- kernel/sched/topology.c | 8 ++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4f623d62f1ee..789ab597747e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9269,7 +9269,7 @@ static void nohz_balancer_kick(struct rq *rq) } } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); if (sd) { for_each_cpu(i, sched_domain_span(sd)) { if (i == cpu || diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bd47fb56b09c..fc2fad28b0d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1206,7 +1206,8 @@ DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain *, sd_numa); -DECLARE_PER_CPU(struct sched_domain *, sd_asym); +DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); +DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; struct sched_group_capacity { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d0f31a476fcf..59421c0f1b54 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -508,7 +508,8 @@ DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_asym); +DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); +DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); static void update_top_cache_domain(int cpu) @@ -534,7 +535,10 @@ static void update_top_cache_domain(int cpu) rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); sd = highest_flag_domain(cpu, SD_ASYM_PACKING); - rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); + rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); + + sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY); + rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); } /* From 7aeb1244ecf7e17d2ee7bc518fe4936ab5c1edb3 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:05 +0100 Subject: [PATCH 22/75] FROMLIST: sched/topology: Disable EAS on inappropriate platforms Energy Aware Scheduling (EAS) in its current form is most relevant on platforms with asymmetric CPU topologies (e.g. Arm big.LITTLE) since this is where there is a lot of potential for saving energy through scheduling. This is particularly true since the Energy Model only includes the active power costs of CPUs, hence not providing enough data to compare packing-vs-spreading strategies. As such, disable EAS on root domains where the SD_ASYM_CPUCAPACITY flag is not set. While at it, disable EAS on systems where the complexity of the Energy Model is too high since that could lead to unacceptable scheduling overhead. All in all, EAS can be used on a root domain if and only if: 1. an Energy Model is available; 2. the root domain has an asymmetric CPU capacity topology; 3. the complexity of the root domain's EM is low enough to keep scheduling overheads low. Change-Id: Ia557fbb226be44ed40d7d22661773326276bf9c8 cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-8-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- kernel/sched/topology.c | 49 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 59421c0f1b54..bd082454f319 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -270,12 +270,45 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) free_pd(pd); } +/* + * EAS can be used on a root domain if it meets all the following conditions: + * 1. an Energy Model (EM) is available; + * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. + * 3. the EM complexity is low enough to keep scheduling overheads low; + * + * The complexity of the Energy Model is defined as: + * + * C = nr_pd * (nr_cpus + nr_cs) + * + * with parameters defined as: + * - nr_pd: the number of performance domains + * - nr_cpus: the number of CPUs + * - nr_cs: the sum of the number of capacity states of all performance + * domains (for example, on a system with 2 performance domains, + * with 10 capacity states each, nr_cs = 2 * 10 = 20). + * + * It is generally not a good idea to use such a model in the wake-up path on + * very complex platforms because of the associated scheduling overheads. The + * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs + * with per-CPU DVFS and less than 8 capacity states each, for example. + */ +#define EM_MAX_COMPLEXITY 2048 + static void build_perf_domains(const struct cpumask *cpu_map) { + int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - int i; + + /* EAS is enabled for asymmetric CPU capacity topologies. */ + if (!per_cpu(sd_asym_cpucapacity, cpu)) { + if (sched_debug()) { + pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_map)); + } + goto free; + } for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ @@ -288,6 +321,20 @@ static void build_perf_domains(const struct cpumask *cpu_map) goto free; tmp->next = pd; pd = tmp; + + /* + * Count performance domains and capacity states for the + * complexity check. + */ + nr_pd++; + nr_cs += em_pd_nr_cap_states(pd->em_pd); + } + + /* Bail out if the Energy Model complexity is too high. */ + if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { + WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", + cpumask_pr_args(cpu_map)); + goto free; } perf_domain_debug(cpu_map, pd); From 1e6b1214f10750396bef23b0d43a2ec8e0c32e2e Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:06 +0100 Subject: [PATCH 23/75] FROMLIST: sched/topology: Make Energy Aware Scheduling depend on schedutil Energy Aware Scheduling (EAS) is designed with the assumption that frequencies of CPUs follow their utilization value. When using a CPUFreq governor other than schedutil, the chances of this assumption being true are small, if any. When schedutil is being used, EAS' predictions are at least consistent with the frequency requests. Although those requests have no guarantees to be honored by the hardware, they should at least guide DVFS in the right direction and provide some hope in regards to the EAS model being accurate. To make sure EAS is only used in a sane configuration, create a strong dependency on schedutil being used. Since having sugov compiled-in does not provide that guarantee, make CPUFreq call a scheduler function on governor changes hence letting it rebuild the scheduling domains, check the governors of the online CPUs, and enable/disable EAS accordingly. Change-Id: I872949134f97d2772fc681b7393eaed7f0e224f2 Cc: Ingo Molnar Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-9-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- drivers/cpufreq/cpufreq.c | 2 ++ include/linux/sched/cpufreq.h | 9 ++++++++ kernel/sched/cpufreq_schedutil.c | 37 ++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 4 +--- kernel/sched/topology.c | 28 ++++++++++++++++++++---- 5 files changed, 71 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f53fb41efb7b..bde9606c84c1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -2277,6 +2278,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, ret = cpufreq_start_governor(policy); if (!ret) { pr_debug("cpufreq: governor change\n"); + sched_cpufreq_governor_change(policy, old_gov); return 0; } cpufreq_exit_governor(policy); diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index afa940cd50dc..a2ead52feb17 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -2,6 +2,7 @@ #ifndef _LINUX_SCHED_CPUFREQ_H #define _LINUX_SCHED_CPUFREQ_H +#include #include /* @@ -28,4 +29,12 @@ static inline unsigned long map_util_freq(unsigned long util, } #endif /* CONFIG_CPU_FREQ */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + struct cpufreq_governor *old_gov); +#else +static inline void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + struct cpufreq_governor *old_gov) { } +#endif + #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 105cf70fcb69..804eb7ae944a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -631,7 +631,7 @@ static struct kobj_type sugov_tunables_ktype = { /********************** cpufreq governor interface *********************/ -static struct cpufreq_governor schedutil_gov; +struct cpufreq_governor schedutil_gov; static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) { @@ -890,7 +890,7 @@ static void sugov_limits(struct cpufreq_policy *policy) sg_policy->need_freq_update = true; } -static struct cpufreq_governor schedutil_gov = { +struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, .dynamic_switching = true, @@ -913,3 +913,36 @@ static int __init sugov_register(void) return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); + +#ifdef CONFIG_ENERGY_MODEL +extern bool sched_energy_update; +extern struct mutex sched_energy_mutex; + +static void rebuild_sd_workfn(struct work_struct *work) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} +static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); + +/* + * EAS shouldn't be attempted without sugov, so rebuild the sched_domains + * on governor changes to make sure the scheduler knows about it. + */ +void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + struct cpufreq_governor *old_gov) +{ + if (old_gov == &schedutil_gov || policy->governor == &schedutil_gov) { + /* + * When called from the cpufreq_register_driver() path, the + * cpu_hotplug_lock is already held, so use a work item to + * avoid nested locking in rebuild_sched_domains(). + */ + schedule_work(&rebuild_sd_work); + } + +} +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fc2fad28b0d4..19d288e8f575 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2276,10 +2276,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned } #endif -#ifdef CONFIG_SMP -#ifdef CONFIG_ENERGY_MODEL +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) #define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) #else #define perf_domain_span(pd) NULL #endif -#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index bd082454f319..8f9e1fab67cb 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,7 +201,10 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } -#ifdef CONFIG_ENERGY_MODEL +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +DEFINE_MUTEX(sched_energy_mutex); +bool sched_energy_update; + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -275,6 +278,7 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. the EM complexity is low enough to keep scheduling overheads low; + * 4. schedutil is driving the frequency of all CPUs of the rd; * * The complexity of the Energy Model is defined as: * @@ -294,12 +298,15 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) */ #define EM_MAX_COMPLEXITY 2048 +extern struct cpufreq_governor schedutil_gov; static void build_perf_domains(const struct cpumask *cpu_map) { int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { @@ -315,6 +322,19 @@ static void build_perf_domains(const struct cpumask *cpu_map) if (find_pd(pd, i)) continue; + /* Do not attempt EAS if schedutil is not being used. */ + policy = cpufreq_cpu_get(i); + if (!policy) + goto free; + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (rd->pd) + pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_map)); + goto free; + } + /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) @@ -356,7 +376,7 @@ static void build_perf_domains(const struct cpumask *cpu_map) } #else static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL */ +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ static void free_rootdomain(struct rcu_head *rcu) { @@ -2154,10 +2174,10 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ; } -#ifdef CONFIG_ENERGY_MODEL +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) /* Build perf. domains: */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < n; j++) { + for (j = 0; j < n && !sched_energy_update; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) goto match3; From 78814285a99b1e3f8851e45af0d4c6ee7076f1d9 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:07 +0100 Subject: [PATCH 24/75] FROMLIST: sched: Introduce sched_energy_present static key In order to make sure Energy Aware Scheduling (EAS) will not impact systems where no Energy Model is available, introduce a static key guarding the access to EAS code. Since EAS is enabled on a per-root-domain basis, the static key is enabled when at least one root domain meets all conditions for EAS. Change-Id: Ifa3e490e023d3f57b2f1b1272d5ea58d6ae726ab Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-10-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- kernel/sched/sched.h | 4 ++++ kernel/sched/topology.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 19d288e8f575..5e2b8ac22298 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2281,3 +2281,7 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned #else #define perf_domain_span(pd) NULL #endif + +#ifdef CONFIG_SMP +extern struct static_key_false sched_energy_present; +#endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8f9e1fab67cb..7a142bd4d161 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -201,6 +201,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; } +DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; @@ -273,6 +274,35 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) free_pd(pd); } +static void sched_energy_start(int ndoms_new, cpumask_var_t doms_new[]) +{ + /* + * The conditions for EAS to start are checked during the creation of + * root domains. If one of them meets all conditions, it will have a + * non-null list of performance domains. + */ + while (ndoms_new) { + if (cpu_rq(cpumask_first(doms_new[ndoms_new - 1]))->rd->pd) + goto enable; + ndoms_new--; + } + + if (static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: stopping EAS\n", __func__); + static_branch_disable_cpuslocked(&sched_energy_present); + } + + return; + +enable: + if (!static_branch_unlikely(&sched_energy_present)) { + if (sched_debug()) + pr_info("%s: starting EAS\n", __func__); + static_branch_enable_cpuslocked(&sched_energy_present); + } +} + /* * EAS can be used on a root domain if it meets all the following conditions: * 1. an Energy Model (EM) is available; @@ -2187,6 +2217,7 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], match3: ; } + sched_energy_start(ndoms_new, doms_new); #endif /* Remember the new sched domains: */ From b78eec5fb7c2bc0b77bbf2fe9711a107cfe6c20d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:08 +0100 Subject: [PATCH 25/75] FROMLIST: sched: Introduce a sysctl for Energy Aware Scheduling In its current state, Energy Aware Scheduling (EAS) starts automatically on asymmetric platforms having an Energy Model (EM). However, there are users who want to have an EM (for thermal management for example), but don't want EAS with it. In order to let users disable EAS explicitly, introduce a new sysctl called 'sched_energy_aware'. It is enabled by default so that EAS can start automatically on platforms where it makes sense. Flipping it to 0 rebuilds the scheduling domains and disables EAS. Change-Id: I55764e70bf5e90795d2269ec9135ae6e82794a2b Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-11-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- include/linux/sched/sysctl.h | 7 +++++++ kernel/sched/topology.c | 29 +++++++++++++++++++++++++++++ kernel/sysctl.c | 11 +++++++++++ 3 files changed, 47 insertions(+) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index a9c32daeb9d8..99ce6d728df7 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -83,4 +83,11 @@ extern int sysctl_schedstats(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern unsigned int sysctl_sched_energy_aware; +extern int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 7a142bd4d161..db7dbeeeef49 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -203,9 +203,35 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) DEFINE_STATIC_KEY_FALSE(sched_energy_present); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +#ifdef CONFIG_PROC_SYSCTL +int sched_energy_aware_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, state; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + state = static_branch_unlikely(&sched_energy_present); + if (state != sysctl_sched_energy_aware) { + mutex_lock(&sched_energy_mutex); + sched_energy_update = 1; + rebuild_sched_domains(); + sched_energy_update = 0; + mutex_unlock(&sched_energy_mutex); + } + } + + return ret; +} +#endif + static void free_pd(struct perf_domain *pd) { struct perf_domain *tmp; @@ -338,6 +364,9 @@ static void build_perf_domains(const struct cpumask *cpu_map) struct cpufreq_policy *policy; struct cpufreq_governor *gov; + if (!sysctl_sched_energy_aware) + goto free; + /* EAS is enabled for asymmetric CPU capacity topologies. */ if (!per_cpu(sd_asym_cpucapacity, cpu)) { if (sched_debug()) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cc02050fd0c4..481309a11174 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -466,6 +466,17 @@ static struct ctl_table kern_table[] = { .extra1 = &one, }, #endif +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", + .data = &sysctl_sched_energy_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_energy_aware_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking", From e201c56b96cf84424d9f71e703d7e1bbabae9003 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:09 +0100 Subject: [PATCH 26/75] FROMLIST: sched/fair: Clean-up update_sg_lb_stats parameters In preparation for the introduction of a new root domain flag which can be set during load balance (the 'overutilized' flag), clean-up the set of parameters passed to update_sg_lb_stats(). More specifically, the 'local_group' and 'local_idx' parameters can be removed since they can easily be reconstructed from within the function. While at it, transform the 'overload' parameter into a flag stored in the 'sg_status' parameter hence facilitating the definition of new flags when needed. Change-Id: Ic2ccb51fdc08d7da0f8cc0442ef97cbcb4a52c86 Cc: Ingo Molnar Cc: Peter Zijlstra Suggested-by: Peter Zijlstra Suggested-by: Valentin Schneider Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-12-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 27 +++++++++++---------------- kernel/sched/sched.h | 3 +++ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 789ab597747e..1c1e036e3eab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7884,16 +7884,16 @@ static bool update_nohz_stats(struct rq *rq, bool force) * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @group: sched_group whose statistics are to be updated. - * @load_idx: Load index of sched_domain of this_cpu for load calc. - * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. - * @overload: Indicate pullable load (e.g. >1 runnable task). + * @sg_status: Holds flag indicating the status of the sched_group */ static inline void update_sg_lb_stats(struct lb_env *env, - struct sched_group *group, int load_idx, - int local_group, struct sg_lb_stats *sgs, - bool *overload) + struct sched_group *group, + struct sg_lb_stats *sgs, + int *sg_status) { + int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); + int load_idx = get_sd_load_idx(env->sd, env->idle); unsigned long load; int i, nr_running; @@ -7917,7 +7917,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, nr_running = rq->nr_running; if (nr_running > 1) - *overload = true; + *sg_status |= SG_OVERLOAD; #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; @@ -7933,7 +7933,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; - *overload = 1; + *sg_status |= SG_OVERLOAD; } } @@ -8078,17 +8078,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; - int load_idx; - bool overload = false; bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; + int sg_status = 0; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) env->flags |= LBF_NOHZ_STATS; #endif - load_idx = get_sd_load_idx(env->sd, env->idle); - do { struct sg_lb_stats *sgs = &tmp_sgs; int local_group; @@ -8103,8 +8100,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sg, load_idx, local_group, sgs, - &overload); + update_sg_lb_stats(env, sg, sgs, &sg_status); if (local_group) goto next_group; @@ -8154,8 +8150,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!env->sd->parent) { /* update overload indicator if we are at root domain */ - if (READ_ONCE(env->dst_rq->rd->overload) != overload) - WRITE_ONCE(env->dst_rq->rd->overload, overload); + WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5e2b8ac22298..5dba713b938e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -709,6 +709,9 @@ struct perf_domain { struct rcu_head rcu; }; +/* Scheduling group status flags */ +#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ + /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by From 4f37ec6ac1f09d89f1d5a9d15695adf04eb34465 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 16 Oct 2018 11:15:10 +0100 Subject: [PATCH 27/75] FROMLIST: sched: Add over-utilization/tipping point indicator Energy-aware scheduling is only meant to be active while the system is _not_ over-utilized. That is, there are spare cycles available to shift tasks around based on their actual utilization to get a more energy-efficient task distribution without depriving any tasks. When above the tipping point task placement is done the traditional way based on load_avg, spreading the tasks across as many cpus as possible based on priority scaled load to preserve smp_nice. Below the tipping point we want to use util_avg instead. We need to define a criteria for when we make the switch. The util_avg for each cpu converges towards 100% regardless of how many additional tasks we may put on it. If we define over-utilized as: sum_{cpus}(rq.cfs.avg.util_avg) + margin > sum_{cpus}(rq.capacity) some individual cpus may be over-utilized running multiple tasks even when the above condition is false. That should be okay as long as we try to spread the tasks out to avoid per-cpu over-utilization as much as possible and if all tasks have the _same_ priority. If the latter isn't true, we have to consider priority to preserve smp_nice. For example, we could have n_cpus nice=-10 util_avg=55% tasks and n_cpus/2 nice=0 util_avg=60% tasks. Balancing based on util_avg we are likely to end up with nice=-10 tasks sharing cpus and nice=0 tasks getting their own as we 1.5*n_cpus tasks in total and 55%+55% is less over-utilized than 55%+60% for those cpus that have to be shared. The system utilization is only 85% of the system capacity, but we are breaking smp_nice. To be sure not to break smp_nice, we have defined over-utilization conservatively as when any cpu in the system is fully utilized at its highest frequency instead: cpu_rq(any).cfs.avg.util_avg + margin > cpu_rq(any).capacity IOW, as soon as one cpu is (nearly) 100% utilized, we switch to load_avg to factor in priority to preserve smp_nice. With this definition, we can skip periodic load-balance as no cpu has an always-running task when the system is not over-utilized. All tasks will be periodic and we can balance them at wake-up. This conservative condition does however mean that some scenarios that could benefit from energy-aware decisions even if one cpu is fully utilized would not get those benefits. For systems where some cpus might have reduced capacity on some cpus (RT-pressure and/or big.LITTLE), we want periodic load-balance checks as soon a just a single cpu is fully utilized as it might one of those with reduced capacity and in that case we want to migrate it. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen [ Added a comment explaining why new tasks are not accounted during overutilization detection ] Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-13-quentin.perret@arm.com> Signed-off-by: Quentin Perret Change-Id: I19f816054adfd2dfa9a69fa92c1589f62794a218 --- kernel/sched/fair.c | 59 ++++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 4 +++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c1e036e3eab..f3cf26b86042 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5094,6 +5094,24 @@ static inline void hrtick_update(struct rq *rq) } #endif +#ifdef CONFIG_SMP +static inline unsigned long cpu_util(int cpu); +static unsigned long capacity_of(int cpu); + +static inline bool cpu_overutilized(int cpu) +{ + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); +} + +static inline void update_overutilized_status(struct rq *rq) +{ + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); +} +#else +static inline void update_overutilized_status(struct rq *rq) { } +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -5151,8 +5169,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); } - if (!se) + if (!se) { add_nr_running(rq, 1); + /* + * Since new tasks are assigned an initial util_avg equal to + * half of the spare capacity of their CPU, tiny tasks have the + * ability to cross the overutilized threshold, which will + * result in the load balancer ruining all the task placement + * done by EAS. As a way to mitigate that effect, do not account + * for the first enqueue operation of new tasks during the + * overutilized flag detection. + * + * A better way of solving this problem would be to wait for + * the PELT signals of tasks to converge before taking them + * into account, but that is not straightforward to implement, + * and the following generally works well enough in practice. + */ + if (flags & ENQUEUE_WAKEUP) + update_overutilized_status(rq); + + } hrtick_update(rq); } @@ -7919,6 +7955,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, if (nr_running > 1) *sg_status |= SG_OVERLOAD; + if (cpu_overutilized(i)) + *sg_status |= SG_OVERUTILIZED; + #ifdef CONFIG_NUMA_BALANCING sgs->nr_numa_running += rq->nr_numa_running; sgs->nr_preferred_running += rq->nr_preferred_running; @@ -8149,8 +8188,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd env->fbq_type = fbq_classify_group(&sds->busiest_stat); if (!env->sd->parent) { + struct root_domain *rd = env->dst_rq->rd; + /* update overload indicator if we are at root domain */ - WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD); + WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD); + + /* Update over-utilization (tipping point, U >= 0) indicator */ + WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); + } else if (sg_status & SG_OVERUTILIZED) { + WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); } } @@ -8377,6 +8423,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + + if (static_branch_unlikely(&sched_energy_present)) { + struct root_domain *rd = env->dst_rq->rd; + + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + goto out_balanced; + } + local = &sds.local_stat; busiest = &sds.busiest_stat; @@ -9768,6 +9822,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) task_tick_numa(rq, curr); update_misfit_status(curr, rq); + update_overutilized_status(task_rq(curr)); } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5dba713b938e..4cdf52e832a1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -711,6 +711,7 @@ struct perf_domain { /* Scheduling group status flags */ #define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ +#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ /* * We add the notion of a root-domain which will be used to define per-domain @@ -734,6 +735,9 @@ struct root_domain { */ int overload; + /* Indicate one or more cpus over-utilized (tipping point) */ + int overutilized; + /* * The bit corresponding to a CPU gets set here if such CPU has more * than one runnable -deadline task (as it is below for RT tasks). From 0d548be9f8815fee7420b844de6f0a47a6e2d478 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:11 +0100 Subject: [PATCH 28/75] FROMLIST: sched/fair: Introduce an energy estimation helper function In preparation for the definition of an energy-aware wakeup path, introduce a helper function to estimate the consequence on system energy when a specific task wakes-up on a specific CPU. compute_energy() estimates the capacity state to be reached by all performance domains and estimates the consumption of each online CPU according to its Energy Model and its percentage of busy time. Change-Id: Ia291deb16ec9a75f3c0252abbb5d864e3300562d Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-14-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 76 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f3cf26b86042..82924d2b2460 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6356,6 +6356,82 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return !task_fits_capacity(p, min_cap); } +/* + * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) + * to @dst_cpu. + */ +static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +{ + struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; + unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + + /* + * If @p migrates from @cpu to another, remove its contribution. Or, + * if @p migrates from another CPU to @cpu, add its contribution. In + * the other cases, @cpu is not impacted by the migration, so the + * util_avg should already be correct. + */ + if (task_cpu(p) == cpu && dst_cpu != cpu) + sub_positive(&util, task_util(p)); + else if (task_cpu(p) != cpu && dst_cpu == cpu) + util += task_util(p); + + if (sched_feat(UTIL_EST)) { + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + + /* + * During wake-up, the task isn't enqueued yet and doesn't + * appear in the cfs_rq->avg.util_est.enqueued of any rq, + * so just add it (if needed) to "simulate" what will be + * cpu_util() after the task has been enqueued. + */ + if (dst_cpu == cpu) + util_est += _task_util_est(p); + + util = max(util, util_est); + } + + return min(util, capacity_orig_of(cpu)); +} + +/* + * compute_energy(): Estimates the energy that would be consumed if @p was + * migrated to @dst_cpu. compute_energy() predicts what will be the utilization + * landscape of the * CPUs after the task migration, and uses the Energy Model + * to compute what would be the energy if we decided to actually migrate that + * task. + */ +static long +compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +{ + long util, max_util, sum_util, energy = 0; + int cpu; + + for (; pd; pd = pd->next) { + max_util = sum_util = 0; + /* + * The capacity state of CPUs of the current rd can be driven by + * CPUs of another rd if they belong to the same performance + * domain. So, account for the utilization of these CPUs too + * by masking pd with cpu_online_mask instead of the rd span. + * + * If an entire performance domain is outside of the current rd, + * it will not appear in its pd list and will not be accounted + * by compute_energy(). + */ + for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { + util = cpu_util_next(cpu, p, dst_cpu); + util = schedutil_energy_util(cpu, util); + max_util = max(util, max_util); + sum_util += util; + } + + energy += em_pd_energy(pd->em_pd, max_util, sum_util); + } + + return energy; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, From 54655c9f85a92530929a49caccfa9ba51aabb638 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 16 Oct 2018 11:15:12 +0100 Subject: [PATCH 29/75] FROMLIST: sched/fair: Select an energy-efficient CPU on task wake-up If an Energy Model (EM) is available and if the system isn't overutilized, re-route waking tasks into an energy-aware placement algorithm. The selection of an energy-efficient CPU for a task is achieved by estimating the impact on system-level active energy resulting from the placement of the task on the CPU with the highest spare capacity in each performance domain. This strategy spreads tasks in a performance domain and avoids overly aggressive task packing. The best CPU energy-wise is then selected if it saves a large enough amount of energy with respect to prev_cpu. Although it has already shown significant benefits on some existing targets, this approach cannot scale to platforms with numerous CPUs. This is an attempt to do something useful as writing a fast heuristic that performs reasonably well on a broad spectrum of architectures isn't an easy task. As such, the scope of usability of the energy-aware wake-up path is restricted to systems with the SD_ASYM_CPUCAPACITY flag set, and where the EM isn't too complex. Change-Id: I8c6384af904668f405319ed4e05054a7fa449192 Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Quentin Perret Message-Id: <20181016101513.26919-15-quentin.perret@arm.com> Signed-off-by: Quentin Perret --- kernel/sched/fair.c | 143 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 141 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82924d2b2460..5c51ecbb824d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6432,6 +6432,137 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) return energy; } +/* + * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the + * waking task. find_energy_efficient_cpu() looks for the CPU with maximum + * spare capacity in each performance domain and uses it as a potential + * candidate to execute the task. Then, it uses the Energy Model to figure + * out which of the CPU candidates is the most energy-efficient. + * + * The rationale for this heuristic is as follows. In a performance domain, + * all the most energy efficient CPU candidates (according to the Energy + * Model) are those for which we'll request a low frequency. When there are + * several CPUs for which the frequency request will be the same, we don't + * have enough data to break the tie between them, because the Energy Model + * only includes active power costs. With this model, if we assume that + * frequency requests follow utilization (e.g. using schedutil), the CPU with + * the maximum spare capacity in a performance domain is guaranteed to be among + * the best candidates of the performance domain. + * + * In practice, it could be preferable from an energy standpoint to pack + * small tasks on a CPU in order to let other CPUs go in deeper idle states, + * but that could also hurt our chances to go cluster idle, and we have no + * ways to tell with the current Energy Model if this is actually a good + * idea or not. So, find_energy_efficient_cpu() basically favors + * cluster-packing, and spreading inside a cluster. That should at least be + * a good thing for latency, and this is consistent with the idea that most + * of the energy savings of EAS come from the asymmetry of the system, and + * not so much from breaking the tie between identical CPUs. That's also the + * reason why EAS is enabled in the topology code only for systems where + * SD_ASYM_CPUCAPACITY is set. + * + * NOTE: Forkees are not accepted in the energy-aware wake-up path because + * they don't have any useful utilization data yet and it's not possible to + * forecast their impact on energy consumption. Consequently, they will be + * placed by find_idlest_cpu() on the least loaded CPU, which might turn out + * to be energy-inefficient in some use-cases. The alternative would be to + * bias new tasks towards specific types of CPUs first, or to try to infer + * their util_avg from the parent task, but those heuristics could hurt + * other use-cases too. So, until someone finds a better way to solve this, + * let's keep things simple by re-using the existing slow path. + */ + +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) +{ + unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int cpu, best_energy_cpu = prev_cpu; + struct perf_domain *head, *pd; + unsigned long cpu_cap, util; + struct sched_domain *sd; + + rcu_read_lock(); + pd = rcu_dereference(rd->pd); + if (!pd || READ_ONCE(rd->overutilized)) + goto fail; + head = pd; + + /* + * Energy-aware wake-up happens on the lowest sched_domain starting + * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. + */ + sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); + while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + sd = sd->parent; + if (!sd) + goto fail; + + sync_entity_load_avg(&p->se); + if (!task_util_est(p)) + goto unlock; + + for (; pd; pd = pd->next) { + unsigned long cur_energy, spare_cap, max_spare_cap = 0; + int max_spare_cap_cpu = -1; + + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + continue; + + /* Skip CPUs that will be overutilized. */ + util = cpu_util_next(cpu, p, cpu); + cpu_cap = capacity_of(cpu); + if (cpu_cap * 1024 < util * capacity_margin) + continue; + + /* Always use prev_cpu as a candidate. */ + if (cpu == prev_cpu) { + prev_energy = compute_energy(p, prev_cpu, head); + best_energy = min(best_energy, prev_energy); + continue; + } + + /* + * Find the CPU with the maximum spare capacity in + * the performance domain + */ + spare_cap = cpu_cap - util; + if (spare_cap > max_spare_cap) { + max_spare_cap = spare_cap; + max_spare_cap_cpu = cpu; + } + } + + /* Evaluate the energy impact of using this CPU. */ + if (max_spare_cap_cpu >= 0) { + cur_energy = compute_energy(p, max_spare_cap_cpu, head); + if (cur_energy < best_energy) { + best_energy = cur_energy; + best_energy_cpu = max_spare_cap_cpu; + } + } + } +unlock: + rcu_read_unlock(); + + /* + * Pick the best CPU if prev_cpu cannot be used, or if it saves at + * least 6% of the energy used by prev_cpu. + */ + if (prev_energy == ULONG_MAX) + return best_energy_cpu; + + if ((prev_energy - best_energy) > (prev_energy >> 4)) + return best_energy_cpu; + + return prev_cpu; + +fail: + rcu_read_unlock(); + + return -1; +} + /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, @@ -6455,8 +6586,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) - && cpumask_test_cpu(cpu, &p->cpus_allowed); + + if (static_branch_unlikely(&sched_energy_present)) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) + return new_cpu; + new_cpu = prev_cpu; + } + + want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && + cpumask_test_cpu(cpu, &p->cpus_allowed); } rcu_read_lock(); From 97f2533053ebb81b420ea00af435657d97550004 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Thu, 2 Jul 2015 17:16:34 +0100 Subject: [PATCH 30/75] ANDROID: sched: Prevent unnecessary active balance of single task in sched group Scenarios with the busiest group having just one task and the local being idle on topologies with sched groups with different numbers of cpus manage to dodge all load-balance bailout conditions resulting the nr_balance_failed counter to be incremented. This eventually causes a pointless active migration of the task. This patch prevents this by not incrementing the counter when the busiest group only has one task. ASYM_PACKING migrations and migrations due to reduced capacity should still take place as these are explicitly captured by need_active_balance(). A better solution would be to not attempt the load-balance in the first place, but that requires significant changes to the order of bailout conditions and statistics gathering. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Change-Id: I28f69c72febe0211decbe77b7bc3e48839d3d7b3 --- kernel/sched/fair.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c51ecbb824d..735982e07147 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7225,6 +7225,7 @@ struct lb_env { int new_dst_cpu; enum cpu_idle_type idle; long imbalance; + unsigned int src_grp_nr_running; /* The set of CPUs under consideration for load-balancing */ struct cpumask *cpus; @@ -8402,6 +8403,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (env->sd->flags & SD_NUMA) env->fbq_type = fbq_classify_group(&sds->busiest_stat); + env->src_grp_nr_running = sds->busiest_stat.sum_nr_running; + if (!env->sd->parent) { struct root_domain *rd = env->dst_rq->rd; @@ -9080,7 +9083,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * excessive cache_hot migrations and active balances. */ if (idle != CPU_NEWLY_IDLE) - sd->nr_balance_failed++; + if (env.src_grp_nr_running > 1) + sd->nr_balance_failed++; if (need_active_balance(&env)) { unsigned long flags; From af31057250f3937ca217dcf8ebf7c5f8d16d10ff Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 26 Jan 2015 19:47:28 +0000 Subject: [PATCH 31/75] ANDROID: sched: Enable idle balance to pull single task towards cpu with higher capacity We do not want to miss out on the ability to pull a single remaining task from a potential source cpu towards an idle destination cpu. Add an extra criteria to need_active_balance() to kick off active load balance if the source cpu is over-utilized and has lower capacity than the destination cpu. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Signed-off-by: Dietmar Eggemann Change-Id: Iea3b42b2a0f8d8a4252e42ba67cc33381a4a1075 --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 735982e07147..e61c553a4736 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8865,6 +8865,13 @@ static int need_active_balance(struct lb_env *env) if (env->src_grp_type == group_misfit_task) return 1; + if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) && + env->src_rq->cfs.h_nr_running == 1 && + cpu_overutilized(env->src_cpu) && + !cpu_overutilized(env->dst_cpu)) { + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From a3f0c668be8e57594cd98f1552d4d46446aea27b Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 3 Jul 2018 10:35:03 +0100 Subject: [PATCH 32/75] ANDROID: arm, arm64: Enable kernel config options required for EAS arm and arm64: Add Cgroups support Add Energy Model Add CpuFreq governors and make schedutil default for arm: Add Cpuset support Add Scheduler autogroups Add DIE sched domain level Signed-off-by: Dietmar Eggemann Change-Id: Ib52a0bd27702c3f2c3d692e49c9c8e2fbbea2cf7 --- arch/arm/configs/multi_v7_defconfig | 16 ++++++++++++---- arch/arm64/configs/defconfig | 10 ++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index fc33444e94f0..2721877d5a11 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -2,6 +2,12 @@ CONFIG_SYSVIPC=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y @@ -116,6 +122,7 @@ CONFIG_PCI_ENDPOINT=y CONFIG_PCI_ENDPOINT_CONFIGFS=y CONFIG_PCI_EPF_TEST=m CONFIG_SMP=y +CONFIG_SCHED_MC=y CONFIG_NR_CPUS=16 CONFIG_SECCOMP=y CONFIG_ARM_APPENDED_DTB=y @@ -124,10 +131,10 @@ CONFIG_KEXEC=y CONFIG_EFI=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_STAT=y -CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=m -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y CONFIG_CPUFREQ_DT=y CONFIG_ARM_IMX6Q_CPUFREQ=y @@ -137,6 +144,7 @@ CONFIG_ARM_CPUIDLE=y CONFIG_ARM_ZYNQ_CPUIDLE=y CONFIG_ARM_EXYNOS_CPUIDLE=y CONFIG_KERNEL_MODE_NEON=y +CONFIG_ENERGY_MODEL=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index db8d364f8476..31fd24bff688 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -19,9 +19,13 @@ CONFIG_BLK_CGROUP=y CONFIG_CGROUP_PIDS=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y +CONFIG_CGROUPS=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_FREEZER=y CONFIG_USER_NS=y CONFIG_SCHED_AUTOGROUP=y CONFIG_BLK_DEV_INITRD=y @@ -101,13 +105,15 @@ CONFIG_XEN=y CONFIG_COMPAT=y CONFIG_HIBERNATION=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_ENERGY_MODEL=y CONFIG_ARM_CPUIDLE=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_STAT=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y CONFIG_CPUFREQ_DT=y CONFIG_ACPI_CPPC_CPUFREQ=m From e105a959176ce34db3df5496361be66298eea960 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sun, 18 Feb 2018 14:27:49 +0100 Subject: [PATCH 33/75] ANDROID: arm64: dts: juno: Add dynamic-power-coefficient properties Taken from commit cadf54148974 "arm64: dts: Add IPA parameters to soc thermal zone" wich also sets up SoC thermal zones and bind them to cpufreq cooling devices. We don't want this functionality right now. The commit is for example part of: git.linaro.org/landing-teams/working/arm/kernel-release.git lt_arm/ack-4.9-armlt-18.01 Signed-off-by: Dietmar Eggemann Change-Id: I7c23a58fa49b281ed5df2f60db0514a9b3b50c7b --- arch/arm64/boot/dts/arm/juno.dts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/arm/juno.dts b/arch/arm64/boot/dts/arm/juno.dts index 1fb5c5a0f32e..e3069e286256 100644 --- a/arch/arm64/boot/dts/arm/juno.dts +++ b/arch/arm64/boot/dts/arm/juno.dts @@ -98,6 +98,7 @@ A57_0: cpu@0 { clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <1024>; + dynamic-power-coefficient = <530>; }; A57_1: cpu@1 { @@ -115,6 +116,7 @@ A57_1: cpu@1 { clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <1024>; + dynamic-power-coefficient = <530>; }; A53_0: cpu@100 { @@ -132,6 +134,7 @@ A53_0: cpu@100 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <578>; + dynamic-power-coefficient = <140>; }; A53_1: cpu@101 { @@ -149,6 +152,7 @@ A53_1: cpu@101 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <578>; + dynamic-power-coefficient = <140>; }; A53_2: cpu@102 { @@ -166,6 +170,7 @@ A53_2: cpu@102 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <578>; + dynamic-power-coefficient = <140>; }; A53_3: cpu@103 { @@ -183,6 +188,7 @@ A53_3: cpu@103 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <578>; + dynamic-power-coefficient = <140>; }; A57_L2: l2-cache0 { From 9b249ed13d82ca547f1bf5b0d5420ba90a011041 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 May 2018 17:01:09 +0100 Subject: [PATCH 34/75] ANDROID: arm64: dts: juno-r2: Add dynamic-power-coefficient properties Taken from commit cadf54148974 "arm64: dts: Add IPA parameters to soc thermal zone" wich also sets up SoC thermal zones and bind them to cpufreq cooling devices. We don't want this functionality right now. The commit is for example part of: git.linaro.org/landing-teams/working/arm/kernel-release.git lt_arm/ack-4.9-armlt-18.01 Signed-off-by: Dietmar Eggemann Change-Id: Id1a44fb7d222d59f7d44b5f55797d407513eb7e7 --- arch/arm64/boot/dts/arm/juno-r2.dts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/boot/dts/arm/juno-r2.dts b/arch/arm64/boot/dts/arm/juno-r2.dts index ab77adb4f3c2..66f0ec79c864 100644 --- a/arch/arm64/boot/dts/arm/juno-r2.dts +++ b/arch/arm64/boot/dts/arm/juno-r2.dts @@ -99,6 +99,7 @@ A72_0: cpu@0 { clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <1024>; + dynamic-power-coefficient = <450>; }; A72_1: cpu@1 { @@ -116,6 +117,7 @@ A72_1: cpu@1 { clocks = <&scpi_dvfs 0>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <1024>; + dynamic-power-coefficient = <450>; }; A53_0: cpu@100 { @@ -133,6 +135,7 @@ A53_0: cpu@100 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <485>; + dynamic-power-coefficient = <140>; }; A53_1: cpu@101 { @@ -150,6 +153,7 @@ A53_1: cpu@101 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <485>; + dynamic-power-coefficient = <140>; }; A53_2: cpu@102 { @@ -167,6 +171,7 @@ A53_2: cpu@102 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <485>; + dynamic-power-coefficient = <140>; }; A53_3: cpu@103 { @@ -184,6 +189,7 @@ A53_3: cpu@103 { clocks = <&scpi_dvfs 1>; cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>; capacity-dmips-mhz = <485>; + dynamic-power-coefficient = <140>; }; A72_L2: l2-cache0 { From 6ed388bcd602b6e00d1d7a37f3710ad3222755f2 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 5 Jun 2018 10:35:18 +0100 Subject: [PATCH 35/75] ANDROID: arm: dts: vexpress-v2p-ca15_a7: Add dynamic-power-coefficient properties The values are computed from measuring energy over a 10 secs sysbench workload running at each frequency and affine to 1 or 2 A15's as well as 1 or 2 or 3 A7's. The Power values for individual cpus are calculated from the Energy values divided by workload runtime by taking the difference of the energy values between n+1 and n cpus. P [mW] = C * freq [Mhz] * voltage [mV] * voltage [mV] / 1000000000 C = P [mW] / (freq [Mhz] * voltage [mV] * voltage [mV]) * 1000000000 The actual C (dynamic-power-coefficient) value is the mean value out of all the C values of the OPP's. A15: freq power voltage dyn_pwr_coef [MhZ] [mW] [mV] 0 500.0 534.57550 900 1319.939506 1 600.0 547.15468 900 1125.832675 2 700.0 572.22060 900 1009.207407 3 800.0 607.76592 900 937.910370 4 900.0 648.50552 900 889.582332 5 1000.0 693.86776 900 856.626864 6 1100.0 916.51314 975 876.469442 7 1200.0 1198.57566 1050 905.952880 mean: 990 A7: freq power voltage dyn_pwr_coef [MhZ] [mW] [mV] 0 350.0 40.17430 900 141.708289 1 400.0 42.68700 900 131.750000 2 500.0 54.25716 900 133.968296 3 600.0 64.09914 900 131.891235 4 700.0 74.09736 900 130.683175 5 800.0 82.69694 900 127.618735 6 900.0 113.71386 975 132.911225 7 1000.0 144.94124 1050 131.465977 mean: 133 The ratio between A15 and A7 is 990/113 = 7.44 This value (7.44) is very close to mean ratio between the power value of A15 an A7 of the per sched-domain Energy Model (7.96): mV Mhz MhZ old EM ratio A7 A15 core power 900 350 500 6997/1024 6.83 900 400 600 5177/761 6.80 900 500 700 3846/549 7.01 900 600 800 3524/447 7.88 900 700 900 3125/407 7.68 900 800 1000 2756/334 8.25 975 900 1100 2312/275 8.40 1050 1000 1200 2021/187 10.80 mean: 7.96 Signed-off-by: Dietmar Eggemann Change-Id: I93ef375d05ff769481a07f2c74f061e307cb14d4 --- arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts index ac6b90e9d806..75663ed4817f 100644 --- a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts +++ b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts @@ -42,6 +42,7 @@ cpu0: cpu@0 { cci-control-port = <&cci_control1>; cpu-idle-states = <&CLUSTER_SLEEP_BIG>; capacity-dmips-mhz = <1024>; + dynamic-power-coefficient = <990>; }; cpu1: cpu@1 { @@ -51,6 +52,7 @@ cpu1: cpu@1 { cci-control-port = <&cci_control1>; cpu-idle-states = <&CLUSTER_SLEEP_BIG>; capacity-dmips-mhz = <1024>; + dynamic-power-coefficient = <990>; }; cpu2: cpu@2 { @@ -60,6 +62,7 @@ cpu2: cpu@2 { cci-control-port = <&cci_control2>; cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>; capacity-dmips-mhz = <516>; + dynamic-power-coefficient = <133>; }; cpu3: cpu@3 { @@ -69,6 +72,7 @@ cpu3: cpu@3 { cci-control-port = <&cci_control2>; cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>; capacity-dmips-mhz = <516>; + dynamic-power-coefficient = <133>; }; cpu4: cpu@4 { @@ -78,6 +82,7 @@ cpu4: cpu@4 { cci-control-port = <&cci_control2>; cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>; capacity-dmips-mhz = <516>; + dynamic-power-coefficient = <133>; }; idle-states { From 23977789b537f521292a1544966732c68977191b Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 18 May 2018 11:07:59 +0100 Subject: [PATCH 36/75] FROMLIST: cpufreq: dt: Register an Energy Model The Energy Model framework provides an API to register the active power of CPUs. Call this API from the cpufreq-dt driver with an estimation of the power as P = C * V^2 * f with C, V, and f respectively the capacitance of the CPU and the voltage and frequency of the OPP. The CPU capacitance is read from the "dynamic-power-coefficient" DT binding (originally introduced for thermal/IPA), and the voltage and frequency values from PM_OPP. Change-Id: Id7b79ae3aafcc53574b850cb91a25240ebffbdd4 Cc: "Rafael J. Wysocki" Cc: Viresh Kumar (Removed "OPTIONAL" label from title. Applied from https://lore.kernel.org/lkml/20180912091309.7551-15-quentin.perret@arm.com/) Signed-off-by: Quentin Perret --- drivers/cpufreq/cpufreq-dt.c | 48 +++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 0a9ebf00be46..15ac9754afa2 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -149,8 +150,50 @@ static int resources_available(void) return 0; } +static int __maybe_unused of_est_power(unsigned long *mW, unsigned long *KHz, + int cpu) +{ + unsigned long mV, Hz, MHz; + struct device *cpu_dev; + struct dev_pm_opp *opp; + struct device_node *np; + u32 cap; + u64 tmp; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return -ENODEV; + + np = of_node_get(cpu_dev->of_node); + if (!np) + return -EINVAL; + + if (of_property_read_u32(np, "dynamic-power-coefficient", &cap)) + return -EINVAL; + + Hz = *KHz * 1000; + opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz); + if (IS_ERR(opp)) + return -EINVAL; + + mV = dev_pm_opp_get_voltage(opp) / 1000; + dev_pm_opp_put(opp); + if (!mV) + return -EINVAL; + + MHz = Hz / 1000000; + tmp = (u64)cap * mV * mV * MHz; + do_div(tmp, 1000000000); + + *mW = (unsigned long)tmp; + *KHz = Hz / 1000; + + return 0; +} + static int cpufreq_init(struct cpufreq_policy *policy) { + struct em_data_callback em_cb = EM_DATA_CB(of_est_power); struct cpufreq_frequency_table *freq_table; struct opp_table *opp_table = NULL; struct private_data *priv; @@ -159,7 +202,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) unsigned int transition_latency; bool fallback = false; const char *name; - int ret; + int ret, nr_opp; cpu_dev = get_cpu_device(policy->cpu); if (!cpu_dev) { @@ -226,6 +269,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) ret = -EPROBE_DEFER; goto out_free_opp; } + nr_opp = ret; if (fallback) { cpumask_setall(policy->cpus); @@ -278,6 +322,8 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = transition_latency; policy->dvfs_possible_from_any_cpu = true; + em_register_perf_domain(policy->cpus, nr_opp, &em_cb); + return 0; out_free_cpufreq_table: From d326aa36e63a814891aef3160d81cc8d7d48bf0d Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 12 Sep 2018 14:49:58 +0100 Subject: [PATCH 37/75] ANDROID: PM / OPP: cpufreq-dt: Move power estimation function The cpufreq-dt driver has a function estimating the power of CPUs using the dynamic-power-coefficient DT binding and the parameters of PM_OPP. Since this function can be useful to other drivers, relocate it to PM_OPP which is already a dependency anyway. Change-Id: I34f8f9cd9433c622c82f23f32ae9968a096a4390 Signed-off-by: Quentin Perret --- drivers/cpufreq/cpufreq-dt.c | 43 +----------------------------------- drivers/opp/of.c | 41 ++++++++++++++++++++++++++++++++++ include/linux/pm_opp.h | 5 +++++ 3 files changed, 47 insertions(+), 42 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 15ac9754afa2..83ad2a60991c 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -150,50 +150,9 @@ static int resources_available(void) return 0; } -static int __maybe_unused of_est_power(unsigned long *mW, unsigned long *KHz, - int cpu) -{ - unsigned long mV, Hz, MHz; - struct device *cpu_dev; - struct dev_pm_opp *opp; - struct device_node *np; - u32 cap; - u64 tmp; - - cpu_dev = get_cpu_device(cpu); - if (!cpu_dev) - return -ENODEV; - - np = of_node_get(cpu_dev->of_node); - if (!np) - return -EINVAL; - - if (of_property_read_u32(np, "dynamic-power-coefficient", &cap)) - return -EINVAL; - - Hz = *KHz * 1000; - opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz); - if (IS_ERR(opp)) - return -EINVAL; - - mV = dev_pm_opp_get_voltage(opp) / 1000; - dev_pm_opp_put(opp); - if (!mV) - return -EINVAL; - - MHz = Hz / 1000000; - tmp = (u64)cap * mV * mV * MHz; - do_div(tmp, 1000000000); - - *mW = (unsigned long)tmp; - *KHz = Hz / 1000; - - return 0; -} - static int cpufreq_init(struct cpufreq_policy *policy) { - struct em_data_callback em_cb = EM_DATA_CB(of_est_power); + struct em_data_callback em_cb = EM_DATA_CB(of_dev_pm_opp_get_cpu_power); struct cpufreq_frequency_table *freq_table; struct opp_table *opp_table = NULL; struct private_data *priv; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 7af0ddec936b..34157eac789d 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -777,3 +777,44 @@ struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp) return of_node_get(opp->np); } EXPORT_SYMBOL_GPL(dev_pm_opp_get_of_node); + +int of_dev_pm_opp_get_cpu_power(unsigned long *mW, unsigned long *KHz, int cpu) +{ + unsigned long mV, Hz, MHz; + struct device *cpu_dev; + struct dev_pm_opp *opp; + struct device_node *np; + u32 cap; + u64 tmp; + + cpu_dev = get_cpu_device(cpu); + if (!cpu_dev) + return -ENODEV; + + np = of_node_get(cpu_dev->of_node); + if (!np) + return -EINVAL; + + if (of_property_read_u32(np, "dynamic-power-coefficient", &cap)) + return -EINVAL; + + Hz = *KHz * 1000; + opp = dev_pm_opp_find_freq_ceil(cpu_dev, &Hz); + if (IS_ERR(opp)) + return -EINVAL; + + mV = dev_pm_opp_get_voltage(opp) / 1000; + dev_pm_opp_put(opp); + if (!mV) + return -EINVAL; + + MHz = Hz / 1000000; + tmp = (u64)cap * mV * mV * MHz; + do_div(tmp, 1000000000); + + *mW = (unsigned long)tmp; + *KHz = Hz / 1000; + + return 0; +} +EXPORT_SYMBOL_GPL(of_dev_pm_opp_get_cpu_power); diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index 099b31960dec..11dbffc7f889 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -301,6 +301,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpuma struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev); struct dev_pm_opp *of_dev_pm_opp_find_required_opp(struct device *dev, struct device_node *np); struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp); +int of_dev_pm_opp_get_cpu_power(unsigned long *mW, unsigned long *KHz, int cpu); #else static inline int dev_pm_opp_of_add_table(struct device *dev) { @@ -343,6 +344,10 @@ static inline struct device_node *dev_pm_opp_get_of_node(struct dev_pm_opp *opp) { return NULL; } +static inline int of_dev_pm_opp_get_cpu_power(unsigned long *mW, unsigned long *KHz, int cpu) +{ + return -ENOTSUPP; +} #endif #endif /* __LINUX_OPP_H__ */ From 61dcd890536c4433480fb7c5278b7eb864b783b5 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 18 May 2018 11:08:19 +0100 Subject: [PATCH 38/75] ANDROID: cpufreq: scpi: Register an Energy Model The Energy Model framework provides an API to register the active power of CPUs. This commit calls this API from the scpi-cpufreq driver which uses the power estimation helper from PM_OPP. Change-Id: I113fa2edf8201c1272c9fb5a0c6c39622ae53f94 Signed-off-by: Quentin Perret --- drivers/cpufreq/scpi-cpufreq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index 87a98ec77773..05fc7448f5cb 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -98,11 +99,12 @@ scpi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) static int scpi_cpufreq_init(struct cpufreq_policy *policy) { - int ret; + int ret, nr_opp; unsigned int latency; struct device *cpu_dev; struct scpi_data *priv; struct cpufreq_frequency_table *freq_table; + struct em_data_callback em_cb = EM_DATA_CB(of_dev_pm_opp_get_cpu_power); cpu_dev = get_cpu_device(policy->cpu); if (!cpu_dev) { @@ -135,6 +137,7 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy) ret = -EPROBE_DEFER; goto out_free_opp; } + nr_opp = ret; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { @@ -170,6 +173,9 @@ static int scpi_cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = latency; policy->fast_switch_possible = false; + + em_register_perf_domain(policy->cpus, nr_opp, &em_cb); + return 0; out_free_cpufreq_table: From 354f0b0c8da9659cc191af5b3b6708cdd9d75774 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 1 Jun 2018 14:27:24 +0100 Subject: [PATCH 39/75] ANDROID: cpufreq: arm_big_little: Register an Energy Model The Energy Model framework provides an API to register the active power of CPUs. This commit calls this API from the scpi-cpufreq driver which uses the power estimation helper from PM_OPP. Todo: Check if driver can handle -EPROBE_DEFER and if the call to dev_pm_opp_get_opp_count() id realy necessary. Change-Id: Ia808262ef6c9f2cc7819a83e8eb2f602454edfa3 Signed-off-by: Dietmar Eggemann Signed-off-by: Quentin Perret --- drivers/cpufreq/arm_big_little.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/cpufreq/arm_big_little.c b/drivers/cpufreq/arm_big_little.c index cf62a1f64dd7..803d41c629c3 100644 --- a/drivers/cpufreq/arm_big_little.c +++ b/drivers/cpufreq/arm_big_little.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -456,6 +457,7 @@ static int get_cluster_clk_and_freq_table(struct device *cpu_dev, /* Per-CPU initialization */ static int bL_cpufreq_init(struct cpufreq_policy *policy) { + struct em_data_callback em_cb = EM_DATA_CB(of_dev_pm_opp_get_cpu_power); u32 cur_cluster = cpu_to_cluster(policy->cpu); struct device *cpu_dev; int ret; @@ -487,6 +489,14 @@ static int bL_cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = arm_bL_ops->get_transition_latency(cpu_dev); + ret = dev_pm_opp_get_opp_count(cpu_dev); + if (ret <= 0) { + dev_dbg(cpu_dev, "OPP table is not ready, deferring probe\n"); + return -EPROBE_DEFER; + } + + em_register_perf_domain(policy->cpus, ret, &em_cb); + if (is_bL_switching_enabled()) per_cpu(cpu_last_req_freq, policy->cpu) = clk_get_cpu_rate(policy->cpu); From b085f79aeeccdbc4401b501091e279f6a6bc1c9e Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 10 Sep 2018 17:28:10 +0100 Subject: [PATCH 40/75] UPSTREAM: firmware: arm_scmi: add a getter for power of performance states The SCMI protocol can be used to get power estimates from firmware corresponding to each performance state of a device. Although these power costs are already managed by the SCMI firmware driver, they are not exposed to any external subsystem yet. Fix this by adding a new get_power() interface to the exisiting perf_ops defined for the SCMI protocol. Change-Id: Iae7b1b60c1955f6590764f3de459a32320eba448 Signed-off-by: Quentin Perret Signed-off-by: Sudeep Holla (cherry picked from commit 1a63fe9a2b1f47af5b2b7436b41824b14999c17a in git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux.git) Signed-off-by: Quentin Perret --- drivers/firmware/arm_scmi/perf.c | 28 ++++++++++++++++++++++++++++ include/linux/scmi_protocol.h | 4 ++++ 2 files changed, 32 insertions(+) diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c index 64342944d917..c8024a39171b 100644 --- a/drivers/firmware/arm_scmi/perf.c +++ b/drivers/firmware/arm_scmi/perf.c @@ -427,6 +427,33 @@ static int scmi_dvfs_freq_get(const struct scmi_handle *handle, u32 domain, return ret; } +static int scmi_dvfs_est_power_get(const struct scmi_handle *handle, u32 domain, + unsigned long *freq, unsigned long *power) +{ + struct scmi_perf_info *pi = handle->perf_priv; + struct perf_dom_info *dom; + unsigned long opp_freq; + int idx, ret = -EINVAL; + struct scmi_opp *opp; + + dom = pi->dom_info + domain; + if (!dom) + return -EIO; + + for (opp = dom->opp, idx = 0; idx < dom->opp_count; idx++, opp++) { + opp_freq = opp->perf * dom->mult_factor; + if (opp_freq < *freq) + continue; + + *freq = opp_freq; + *power = opp->power; + ret = 0; + break; + } + + return ret; +} + static struct scmi_perf_ops perf_ops = { .limits_set = scmi_perf_limits_set, .limits_get = scmi_perf_limits_get, @@ -437,6 +464,7 @@ static struct scmi_perf_ops perf_ops = { .device_opps_add = scmi_dvfs_device_opps_add, .freq_set = scmi_dvfs_freq_set, .freq_get = scmi_dvfs_freq_get, + .est_power_get = scmi_dvfs_est_power_get, }; static int scmi_perf_protocol_init(struct scmi_handle *handle) diff --git a/include/linux/scmi_protocol.h b/include/linux/scmi_protocol.h index f4c9fc0fc755..3105055c00a7 100644 --- a/include/linux/scmi_protocol.h +++ b/include/linux/scmi_protocol.h @@ -91,6 +91,8 @@ struct scmi_clk_ops { * to sustained performance level mapping * @freq_get: gets the frequency for a given device using sustained frequency * to sustained performance level mapping + * @est_power_get: gets the estimated power cost for a given performance domain + * at a given frequency */ struct scmi_perf_ops { int (*limits_set)(const struct scmi_handle *handle, u32 domain, @@ -110,6 +112,8 @@ struct scmi_perf_ops { unsigned long rate, bool poll); int (*freq_get)(const struct scmi_handle *handle, u32 domain, unsigned long *rate, bool poll); + int (*est_power_get)(const struct scmi_handle *handle, u32 domain, + unsigned long *rate, unsigned long *power); }; /** From a0912a31bdaee031c80b842056a83441e774dbb0 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 18 May 2018 11:10:02 +0100 Subject: [PATCH 41/75] ANDROID: cpufreq: scmi: Register an Energy Model The Energy Model framework provides an API to register the active power of CPUs. This commit calls this API from the scmi-cpufreq driver which uses the power costs provided by the firmware. Change-Id: I2e6036acbf004d41f921e1396983b07e022a5399 Signed-off-by: Quentin Perret --- drivers/cpufreq/scmi-cpufreq.c | 36 +++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 50b1551ba894..80a7f8da7e74 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -103,13 +104,42 @@ scmi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) return 0; } +static int __maybe_unused +scmi_get_cpu_power(unsigned long *power, unsigned long *KHz, int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + unsigned long Hz; + int ret, domain; + + if (!cpu_dev) { + pr_err("failed to get cpu%d device\n", cpu); + return -ENODEV; + } + + domain = handle->perf_ops->device_domain_id(cpu_dev); + if (domain < 0) + return domain; + + /* Get the power cost of the performance domain. */ + Hz = *KHz * 1000; + ret = handle->perf_ops->est_power_get(handle, domain, &Hz, power); + if (ret) + return ret; + + /* The EM framework specifies the frequency in KHz. */ + *KHz = Hz / 1000; + + return 0; +} + static int scmi_cpufreq_init(struct cpufreq_policy *policy) { - int ret; + int ret, nr_opp; unsigned int latency; struct device *cpu_dev; struct scmi_data *priv; struct cpufreq_frequency_table *freq_table; + struct em_data_callback em_cb = EM_DATA_CB(scmi_get_cpu_power); cpu_dev = get_cpu_device(policy->cpu); if (!cpu_dev) { @@ -142,6 +172,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) ret = -EPROBE_DEFER; goto out_free_opp; } + nr_opp = ret; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { @@ -171,6 +202,9 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = latency; policy->fast_switch_possible = true; + + em_register_perf_domain(policy->cpus, nr_opp, &em_cb); + return 0; out_free_priv: From 99f3cc6e0581a4f2df8b0c66cf472865ecc93470 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 12 Jul 2018 16:58:37 +0100 Subject: [PATCH 42/75] ANDROID: drivers: Introduce a legacy Energy Model loading driver The Energy Aware Scheduler (EAS) used to rely on statically defined Energy Models (EMs) in the device tree. Now that EAS uses the EM framework, the old-style EMs are not usable by default. To address this issue, introduce a driver able to read DT-based EMs and to load them in the EM framework, hence making them available to EAS. Since EAS now uses only the active costs of CPUs, the idle cost and cluster cost of the old EM are ignored. The driver can be compiled in using the CONFIG_LEGACY_ENERGY_MODEL_DT Kconfig option (off by default). The implementation of the driver is highly inspired by the EM loading code from android-4.14 and before (written by Robin Randhawa ), and the arch_topology driver (Juri Lelli ). Signed-off-by: Quentin Perret Change-Id: I4f525dfb45113ba63f01aaf8e1e809ae6b34dd52 --- drivers/Kconfig | 1 + drivers/Makefile | 2 + drivers/energy_model/Kconfig | 16 +++ drivers/energy_model/Makefile | 3 + drivers/energy_model/legacy_em_dt.c | 193 ++++++++++++++++++++++++++++ 5 files changed, 215 insertions(+) create mode 100644 drivers/energy_model/Kconfig create mode 100644 drivers/energy_model/Makefile create mode 100644 drivers/energy_model/legacy_em_dt.c diff --git a/drivers/Kconfig b/drivers/Kconfig index ab4d43923c4d..681051a96e67 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -219,4 +219,5 @@ source "drivers/siox/Kconfig" source "drivers/slimbus/Kconfig" +source "drivers/energy_model/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 578f469f72fb..1e847e29661c 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -157,6 +157,8 @@ obj-$(CONFIG_REMOTEPROC) += remoteproc/ obj-$(CONFIG_RPMSG) += rpmsg/ obj-$(CONFIG_SOUNDWIRE) += soundwire/ +obj-$(CONFIG_ENERGY_MODEL) += energy_model/ + # Virtualization drivers obj-$(CONFIG_VIRT_DRIVERS) += virt/ obj-$(CONFIG_HYPERV) += hv/ diff --git a/drivers/energy_model/Kconfig b/drivers/energy_model/Kconfig new file mode 100644 index 000000000000..3fbf968926d5 --- /dev/null +++ b/drivers/energy_model/Kconfig @@ -0,0 +1,16 @@ +config LEGACY_ENERGY_MODEL_DT + bool "Legacy DT-based Energy Model of CPUs" + default n + help + The Energy Aware Scheduler (EAS) used to rely on Energy Models + (EMs) statically defined in the Device Tree. More recent + versions of EAS now rely on the EM framework to get the power + costs of CPUs. + + This driver reads old-style static EMs in DT and feeds them in + the EM framework, hence enabling to use EAS on platforms with + old DT files. Since EAS now uses only the active costs of CPUs, + the cluster-related costs and idle-costs of the old EM are + ignored. + + If in doubt, say N. diff --git a/drivers/energy_model/Makefile b/drivers/energy_model/Makefile new file mode 100644 index 000000000000..7bc0a7e502ea --- /dev/null +++ b/drivers/energy_model/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_LEGACY_ENERGY_MODEL_DT) += legacy_em_dt.o diff --git a/drivers/energy_model/legacy_em_dt.c b/drivers/energy_model/legacy_em_dt.c new file mode 100644 index 000000000000..b608790fcc19 --- /dev/null +++ b/drivers/energy_model/legacy_em_dt.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Legacy Energy Model loading driver + * + * Copyright (C) 2018, ARM Ltd. + * Written by: Quentin Perret, ARM Ltd. + */ + +#define pr_fmt(fmt) "legacy-dt-em: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static cpumask_var_t cpus_to_visit; + +static DEFINE_PER_CPU(unsigned long, nr_states) = 0; + +struct em_state { + unsigned long frequency; + unsigned long power; + unsigned long capacity; +}; +static DEFINE_PER_CPU(struct em_state*, cpu_em) = NULL; + +static void finish_em_loading_workfn(struct work_struct *work); +static DECLARE_WORK(finish_em_loading_work, finish_em_loading_workfn); + +static DEFINE_MUTEX(em_loading_mutex); + +/* + * Callback given to the EM framework. All this does is browse the table + * created by legacy_em_dt(). + */ +static int get_power(unsigned long *mW, unsigned long *KHz, int cpu) +{ + unsigned long nstates = per_cpu(nr_states, cpu); + struct em_state *em = per_cpu(cpu_em, cpu); + int i; + + if (!nstates || !em) + return -ENODEV; + + for (i = 0; i < nstates - 1; i++) { + if (em[i].frequency > *KHz) + break; + } + + *KHz = em[i].frequency; + *mW = em[i].power; + + return 0; +} + +static int init_em_dt_callback(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct em_data_callback em_cb = EM_DATA_CB(get_power); + unsigned long nstates, scale_cpu, max_freq; + struct cpufreq_policy *policy = data; + const struct property *prop; + struct device_node *cn, *cp; + struct em_state *em; + int cpu, i, ret = 0; + const __be32 *tmp; + + if (val != CPUFREQ_NOTIFY) + return 0; + + mutex_lock(&em_loading_mutex); + + /* Do not register twice an energy model */ + for_each_cpu(cpu, policy->cpus) { + if (per_cpu(nr_states, cpu) || per_cpu(cpu_em, cpu)) { + pr_err("EM of CPU%d already loaded\n", cpu); + ret = -EEXIST; + goto unlock; + } + } + + max_freq = policy->cpuinfo.max_freq; + if (!max_freq) { + pr_err("No policy->max for CPU%d\n", cpu); + ret = -EINVAL; + goto unlock; + } + + cpu = cpumask_first(policy->cpus); + cn = of_get_cpu_node(cpu, NULL); + if (!cn) { + pr_err("No device_node for CPU%d\n", cpu); + ret = -ENODEV; + goto unlock; + } + + cp = of_parse_phandle(cn, "sched-energy-costs", 0); + if (!cp) { + pr_err("CPU%d node has no sched-energy-costs\n", cpu); + ret = -ENODEV; + goto unlock; + } + + prop = of_find_property(cp, "busy-cost-data", NULL); + if (!prop || !prop->value) { + pr_err("No busy-cost-data for CPU%d\n", cpu); + ret = -ENODEV; + goto unlock; + } + + nstates = (prop->length / sizeof(u32)) / 2; + em = kcalloc(nstates, sizeof(struct em_cap_state), GFP_KERNEL); + if (!em) { + ret = -ENOMEM; + goto unlock; + } + + /* Copy the capacity and power cost to the table. */ + for (i = 0, tmp = prop->value; i < nstates; i++) { + em[i].capacity = be32_to_cpup(tmp++); + em[i].power = be32_to_cpup(tmp++); + } + + /* Get the CPU capacity (according to the EM) */ + scale_cpu = em[nstates - 1].capacity; + if (!scale_cpu) { + pr_err("CPU%d: capacity cannot be 0\n", cpu); + kfree(em); + ret = -EINVAL; + goto unlock; + } + + /* Re-compute the intermediate frequencies based on the EM. */ + for (i = 0; i < nstates; i++) + em[i].frequency = em[i].capacity * max_freq / scale_cpu; + + /* Assign the table to all CPUs of this policy. */ + for_each_cpu(i, policy->cpus) { + per_cpu(nr_states, i) = nstates; + per_cpu(cpu_em, i) = em; + } + + pr_info("Registering EM of %*pbl\n", cpumask_pr_args(policy->cpus)); + em_register_perf_domain(policy->cpus, nstates, &em_cb); + + /* Finish the work when all possible CPUs have been registered. */ + cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->cpus); + if (cpumask_empty(cpus_to_visit)) + schedule_work(&finish_em_loading_work); + +unlock: + mutex_unlock(&em_loading_mutex); + + return ret; +} + +static struct notifier_block init_em_dt_notifier = { + .notifier_call = init_em_dt_callback, +}; + +static void finish_em_loading_workfn(struct work_struct *work) +{ + cpufreq_unregister_notifier(&init_em_dt_notifier, + CPUFREQ_POLICY_NOTIFIER); + free_cpumask_var(cpus_to_visit); + + /* Let the scheduler know the Energy Model is ready. */ + rebuild_sched_domains(); +} + +static int __init register_cpufreq_notifier(void) +{ + int ret; + + if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(cpus_to_visit, cpu_possible_mask); + + ret = cpufreq_register_notifier(&init_em_dt_notifier, + CPUFREQ_POLICY_NOTIFIER); + + if (ret) + free_cpumask_var(cpus_to_visit); + + return ret; +} +core_initcall(register_cpufreq_notifier); From 68dbff9ce92ae2f923321d150fb72a6d8b05e640 Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Sat, 21 Oct 2017 18:07:35 +0100 Subject: [PATCH 43/75] ANDROID: sched: fair/tune: Add schedtune with cgroups interface Schedtune is the framework we use in Android to allow userspace task classification and provides a CGroup controller which has two attributes per group. * schedtune.boost * schedtune.prefer_idle Schedtune itself provides task and CPU utilization boosting. EAS in the fair scheduler uses boosted utilization and prefer_idle status to control the algorithm used for wakeup task placement. Boosting: The task utilization signal, which is derived from PELT signals and properly scaled to be architecture and frequency invariant, is used by EAS as an estimation of the task requirements in terms of CPU bandwidth. Schedtune allows userspace to assign a percentage boost to each group and this boost is used to calculate an additional utilization margin. The margin added to the original utilization is: 1. computed based on the "boosting strategy" in use 2. proportional to boost value defined by the "taskgroup" value The boosted signal is used by EAS for task placement, and boosted CPU utilization (if boosted tasks are running) is given when schedutil requests utilization. Prefer_idle: When this attribute is 1 for a group, this is used as a signal from userspace that tasks in this group need to be serviced with the minimum latency possible. Previous versions of schedtune had much more functionality around allowing a more tuneable tradeoff between performand and energy, however this has not been used a lot up until now. If necessary, we can easily resurrect it based upon old code. Signed-off-by: Patrick Bellasi Signed-off-by: Chris Redpath (cherry-picked from commit 159c14f0397790405b9e8435184366b31f2ed15b) [ - Removed tracepoints (to be added in a separate patch) - Integrated boosted_cpu_util() with cpu_util_cfs() - Backported Patrick's util_est related fixes from android-4.14 ] Signed-off-by: Quentin Perret Change-Id: Ie2fd63d82f604f34bcbc7e1ca9b5af1bdcc037e0 --- Documentation/scheduler/sched-tune.txt | 413 ++++++++++++++++ include/linux/cgroup_subsys.h | 4 + init/Kconfig | 23 + kernel/sched/Makefile | 1 + kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/fair.c | 118 +++++ kernel/sched/sched.h | 2 + kernel/sched/tune.c | 622 +++++++++++++++++++++++++ kernel/sched/tune.h | 37 ++ 9 files changed, 1221 insertions(+), 1 deletion(-) create mode 100644 Documentation/scheduler/sched-tune.txt create mode 100644 kernel/sched/tune.c create mode 100644 kernel/sched/tune.h diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt new file mode 100644 index 000000000000..5df0ea361311 --- /dev/null +++ b/Documentation/scheduler/sched-tune.txt @@ -0,0 +1,413 @@ + Central, scheduler-driven, power-performance control + (EXPERIMENTAL) + +Abstract +======== + +The topic of a single simple power-performance tunable, that is wholly +scheduler centric, and has well defined and predictable properties has come up +on several occasions in the past [1,2]. With techniques such as a scheduler +driven DVFS [3], we now have a good framework for implementing such a tunable. +This document describes the overall ideas behind its design and implementation. + + +Table of Contents +================= + +1. Motivation +2. Introduction +3. Signal Boosting Strategy +4. OPP selection using boosted CPU utilization +5. Per task group boosting +6. Per-task wakeup-placement-strategy Selection +7. Question and Answers + - What about "auto" mode? + - What about boosting on a congested system? + - How CPUs are boosted when we have tasks with multiple boost values? +8. References + + +1. Motivation +============= + +Sched-DVFS [3] was a new event-driven cpufreq governor which allows the +scheduler to select the optimal DVFS operating point (OPP) for running a task +allocated to a CPU. Later, the cpufreq maintainers introduced a similar +governor, schedutil. The introduction of schedutil also enables running +workloads at the most energy efficient OPPs. + +However, sometimes it may be desired to intentionally boost the performance of +a workload even if that could imply a reasonable increase in energy +consumption. For example, in order to reduce the response time of a task, we +may want to run the task at a higher OPP than the one that is actually required +by it's CPU bandwidth demand. + +This last requirement is especially important if we consider that one of the +main goals of the utilization-driven governor component is to replace all +currently available CPUFreq policies. Since sched-DVFS and schedutil are event +based, as opposed to the sampling driven governors we currently have, they are +already more responsive at selecting the optimal OPP to run tasks allocated to +a CPU. However, just tracking the actual task load demand may not be enough +from a performance standpoint. For example, it is not possible to get +behaviors similar to those provided by the "performance" and "interactive" +CPUFreq governors. + +This document describes an implementation of a tunable, stacked on top of the +utilization-driven governors which extends their functionality to support task +performance boosting. + +By "performance boosting" we mean the reduction of the time required to +complete a task activation, i.e. the time elapsed from a task wakeup to its +next deactivation (e.g. because it goes back to sleep or it terminates). For +example, if we consider a simple periodic task which executes the same workload +for 5[s] every 20[s] while running at a certain OPP, a boosted execution of +that task must complete each of its activations in less than 5[s]. + +A previous attempt [5] to introduce such a boosting feature has not been +successful mainly because of the complexity of the proposed solution. Previous +versions of the approach described in this document exposed a single simple +interface to user-space. This single tunable knob allowed the tuning of +system wide scheduler behaviours ranging from energy efficiency at one end +through to incremental performance boosting at the other end. This first +tunable affects all tasks. However, that is not useful for Android products +so in this version only a more advanced extension of the concept is provided +which uses CGroups to boost the performance of only selected tasks while using +the energy efficient default for all others. + +The rest of this document introduces in more details the proposed solution +which has been named SchedTune. + + +2. Introduction +=============== + +SchedTune exposes a simple user-space interface provided through a new +CGroup controller 'stune' which provides two power-performance tunables +per group: + + //schedtune.prefer_idle + //schedtune.boost + +The CGroup implementation permits arbitrary user-space defined task +classification to tune the scheduler for different goals depending on the +specific nature of the task, e.g. background vs interactive vs low-priority. + +More details are given in section 5. + +2.1 Boosting +============ + +The boost value is expressed as an integer in the range [-100..0..100]. + +A value of 0 (default) configures the CFS scheduler for maximum energy +efficiency. This means that sched-DVFS runs the tasks at the minimum OPP +required to satisfy their workload demand. + +A value of 100 configures scheduler for maximum performance, which translates +to the selection of the maximum OPP on that CPU. + +A value of -100 configures scheduler for minimum performance, which translates +to the selection of the minimum OPP on that CPU. + +The range between -100, 0 and 100 can be set to satisfy other scenarios suitably. +For example to satisfy interactive response or depending on other system events +(battery level etc). + +The overall design of the SchedTune module is built on top of "Per-Entity Load +Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating +Performance Point (OPP) selection. + +Each time a task is allocated on a CPU, cpufreq is given the opportunity to tune +the operating frequency of that CPU to better match the workload demand. The +selection of the actual OPP being activated is influenced by the boost value +for the task CGroup. + +This simple biasing approach leverages existing frameworks, which means minimal +modifications to the scheduler, and yet it allows to achieve a range of +different behaviours all from a single simple tunable knob. + +In EAS schedulers, we use boosted task and CPU utilization for energy +calculation and energy-aware task placement. + +2.2 prefer_idle +=============== + +This is a flag which indicates to the scheduler that userspace would like +the scheduler to focus on energy or to focus on performance. + +A value of 0 (default) signals to the CFS scheduler that tasks in this group +can be placed according to the energy-aware wakeup strategy. + +A value of 1 signals to the CFS scheduler that tasks in this group should be +placed to minimise wakeup latency. + +The value is combined with the boost value - task placement will not be +boost aware however CPU OPP selection is still boost aware. + +Android platforms typically use this flag for application tasks which the +user is currently interacting with. + + +3. Signal Boosting Strategy +=========================== + +The whole PELT machinery works based on the value of a few load tracking signals +which basically track the CPU bandwidth requirements for tasks and the capacity +of CPUs. The basic idea behind the SchedTune knob is to artificially inflate +some of these load tracking signals to make a task or RQ appears more demanding +that it actually is. + +Which signals have to be inflated depends on the specific "consumer". However, +independently from the specific (signal, consumer) pair, it is important to +define a simple and possibly consistent strategy for the concept of boosting a +signal. + +A boosting strategy defines how the "abstract" user-space defined +sched_cfs_boost value is translated into an internal "margin" value to be added +to a signal to get its inflated value: + + margin := boosting_strategy(sched_cfs_boost, signal) + boosted_signal := signal + margin + +Different boosting strategies were identified and analyzed before selecting the +one found to be most effective. + +Signal Proportional Compensation (SPC) +-------------------------------------- + +In this boosting strategy the sched_cfs_boost value is used to compute a +margin which is proportional to the complement of the original signal. +When a signal has a maximum possible value, its complement is defined as +the delta from the actual value and its possible maximum. + +Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as +the maximum possible value, the margin becomes: + + margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal) + +Using this boosting strategy: +- a 100% sched_cfs_boost means that the signal is scaled to the maximum value +- each value in the range of sched_cfs_boost effectively inflates the signal in + question by a quantity which is proportional to the maximum value. + +For example, by applying the SPC boosting strategy to the selection of the OPP +to run a task it is possible to achieve these behaviors: + +- 0% boosting: run the task at the minimum OPP required by its workload +- 100% boosting: run the task at the maximum OPP available for the CPU +- 50% boosting: run at the half-way OPP between minimum and maximum + +Which means that, at 50% boosting, a task will be scheduled to run at half of +the maximum theoretically achievable performance on the specific target +platform. + +A graphical representation of an SPC boosted signal is represented in the +following figure where: + a) "-" represents the original signal + b) "b" represents a 50% boosted signal + c) "p" represents a 100% boosted signal + + + ^ + | SCHED_LOAD_SCALE + +-----------------------------------------------------------------+ + |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp + | + | boosted_signal + | bbbbbbbbbbbbbbbbbbbbbbbb + | + | original signal + | bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+ + | | + |bbbbbbbbbbbbbbbbbb | + | | + | | + | | + | +-----------------------+ + | | + | | + | | + |------------------+ + | + | + +-----------------------------------------------------------------------> + +The plot above shows a ramped load signal (titled 'original_signal') and it's +boosted equivalent. For each step of the original signal the boosted signal +corresponding to a 50% boost is midway from the original signal and the upper +bound. Boosting by 100% generates a boosted signal which is always saturated to +the upper bound. + + +4. OPP selection using boosted CPU utilization +============================================== + +It is worth calling out that the implementation does not introduce any new load +signals. Instead, it provides an API to tune existing signals. This tuning is +done on demand and only in scheduler code paths where it is sensible to do so. +The new API calls are defined to return either the default signal or a boosted +one, depending on the value of sched_cfs_boost. This is a clean an non invasive +modification of the existing existing code paths. + +The signal representing a CPU's utilization is boosted according to the +previously described SPC boosting strategy. To sched-DVFS, this allows a CPU +(ie CFS run-queue) to appear more used then it actually is. + +Thus, with the sched_cfs_boost enabled we have the following main functions to +get the current utilization of a CPU: + + cpu_util() + boosted_cpu_util() + +The new boosted_cpu_util() is similar to the first but returns a boosted +utilization signal which is a function of the sched_cfs_boost value. + +This function is used in the CFS scheduler code paths where sched-DVFS needs to +decide the OPP to run a CPU at. +For example, this allows selecting the highest OPP for a CPU which has +the boost value set to 100%. + + +5. Per task group boosting +========================== + +On battery powered devices there usually are many background services which are +long running and need energy efficient scheduling. On the other hand, some +applications are more performance sensitive and require an interactive +response and/or maximum performance, regardless of the energy cost. + +To better service such scenarios, the SchedTune implementation has an extension +that provides a more fine grained boosting interface. + +A new CGroup controller, namely "schedtune", can be enabled which allows to +defined and configure task groups with different boosting values. +Tasks that require special performance can be put into separate CGroups. +The value of the boost associated with the tasks in this group can be specified +using a single knob exposed by the CGroup controller: + + schedtune.boost + +This knob allows the definition of a boost value that is to be used for +SPC boosting of all tasks attached to this group. + +The current schedtune controller implementation is really simple and has these +main characteristics: + + 1) It is only possible to create 1 level depth hierarchies + + The root control groups define the system-wide boost value to be applied + by default to all tasks. Its direct subgroups are named "boost groups" and + they define the boost value for specific set of tasks. + Further nested subgroups are not allowed since they do not have a sensible + meaning from a user-space standpoint. + + 2) It is possible to define only a limited number of "boost groups" + + This number is defined at compile time and by default configured to 16. + This is a design decision motivated by two main reasons: + a) In a real system we do not expect utilization scenarios with more then few + boost groups. For example, a reasonable collection of groups could be + just "background", "interactive" and "performance". + b) It simplifies the implementation considerably, especially for the code + which has to compute the per CPU boosting once there are multiple + RUNNABLE tasks with different boost values. + +Such a simple design should allow servicing the main utilization scenarios identified +so far. It provides a simple interface which can be used to manage the +power-performance of all tasks or only selected tasks. +Moreover, this interface can be easily integrated by user-space run-times (e.g. +Android, ChromeOS) to implement a QoS solution for task boosting based on tasks +classification, which has been a long standing requirement. + +Setup and usage +--------------- + +0. Use a kernel with CONFIG_SCHED_TUNE support enabled + +1. Check that the "schedtune" CGroup controller is available: + + root@linaro-nano:~# cat /proc/cgroups + #subsys_name hierarchy num_cgroups enabled + cpuset 0 1 1 + cpu 0 1 1 + schedtune 0 1 1 + +2. Mount a tmpfs to create the CGroups mount point (Optional) + + root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup + +3. Mount the "schedtune" controller + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune + root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune + +4. Create task groups and configure their specific boost value (Optional) + + For example here we create a "performance" boost group configure to boost + all its tasks to 100% + + root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance + root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost + +5. Move tasks into the boost group + + For example, the following moves the tasks with PID $TASKPID (and all its + threads) into the "performance" boost group. + + root@linaro-nano:~# echo "TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs + +This simple configuration allows only the threads of the $TASKPID task to run, +when needed, at the highest OPP in the most capable CPU of the system. + + +6. Per-task wakeup-placement-strategy Selection +=============================================== + +Many devices have a number of CFS tasks in use which require an absolute +minimum wakeup latency, and many tasks for which wakeup latency is not +important. + +For touch-driven environments, removing additional wakeup latency can be +critical. + +When you use the Schedtume CGroup controller, you have access to a second +parameter which allows a group to be marked such that energy_aware task +placement is bypassed for tasks belonging to that group. + +prefer_idle=0 (default - use energy-aware task placement if available) +prefer_idle=1 (never use energy-aware task placement for these tasks) + +Since the regular wakeup task placement algorithm in CFS is biased for +performance, this has the effect of restoring minimum wakeup latency +for the desired tasks whilst still allowing energy-aware wakeup placement +to save energy for other tasks. + + +7. Question and Answers +======================= + +What about "auto" mode? +----------------------- + +The 'auto' mode as described in [5] can be implemented by interfacing SchedTune +with some suitable user-space element. This element could use the exposed +system-wide or cgroup based interface. + +How are multiple groups of tasks with different boost values managed? +--------------------------------------------------------------------- + +The current SchedTune implementation keeps track of the boosted RUNNABLE tasks +on a CPU. The CPU utilization seen by the scheduler-driven cpufreq governors +(and used to select an appropriate OPP) is boosted with a value which is the +maximum of the boost values of the currently RUNNABLE tasks in its RQ. + +This allows cpufreq to boost a CPU only while there are boosted tasks ready +to run and switch back to the energy efficient mode as soon as the last boosted +task is dequeued. + + +8. References +============= +[1] http://lwn.net/Articles/552889 +[2] http://lkml.org/lkml/2012/5/18/91 +[3] http://lkml.org/lkml/2015/6/26/620 diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index acb77dcff3b4..8996c092568b 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -21,6 +21,10 @@ SUBSYS(cpu) SUBSYS(cpuacct) #endif +#if IS_ENABLED(CONFIG_SCHED_TUNE) +SUBSYS(schedtune) +#endif + #if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(io) #endif diff --git a/init/Kconfig b/init/Kconfig index 1e234e2f1cba..a19a72ae92cb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -991,6 +991,29 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TUNE + bool "Boosting for CFS tasks (EXPERIMENTAL)" + depends on SMP + help + This option enables support for task classification using a new + cgroup controller, schedtune. Schedtune allows tasks to be given + a boost value and marked as latency-sensitive or not. This option + provides the "schedtune" controller. + + This new controller: + 1. allows only a two layers hierarchy, where the root defines the + system-wide boost value and its direct childrens define each one a + different "class of tasks" to be boosted with a different value + 2. supports up to 16 different task classes, each one which could be + configured with a different boost value + + Latency-sensitive tasks are not subject to energy-aware wakeup + task placement. The boost value assigned to tasks is used to + influence task placement and CPU frequency selection (if + utilization-driven frequency selection is in use). + + If unsure, say N. + config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c38..2389350a8268 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_SCHED_TUNE) += tune.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 804eb7ae944a..20ad939e143d 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -284,7 +284,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util = cpu_util_cfs(rq); + unsigned long util = boosted_cpu_util(sg_cpu->cpu); unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); sg_cpu->max = max; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e61c553a4736..460237bda9e2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5131,6 +5131,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ util_est_enqueue(&rq->cfs, p); + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's update schedtune here to ensure the boost value of the + * current task is accounted for in the selection of the OPP. + * + * We do it also in the case where we enqueue a throttled task; + * we could argue that a throttled task should not boost a CPU, + * however: + * a) properly implementing CPU boosting considering throttled + * tasks will increase a lot the complexity of the solution + * b) it's not easy to quantify the benefits introduced by + * such a more complex solution. + * Thus, for the time being we go for the simple solution and boost + * also for throttled RQs. + */ + schedtune_enqueue_task(p, cpu_of(rq)); + /* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag @@ -5206,6 +5224,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's update schedtune here to ensure the boost value of the + * current task is not more accounted for in the selection of the OPP. + */ + schedtune_dequeue_task(p, cpu_of(rq)); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -5732,6 +5758,98 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } +#ifdef CONFIG_SCHED_TUNE +struct reciprocal_value schedtune_spc_rdiv; + +static long +schedtune_margin(unsigned long signal, long boost) +{ + long long margin = 0; + + /* + * Signal proportional compensation (SPC) + * + * The Boost (B) value is used to compute a Margin (M) which is + * proportional to the complement of the original Signal (S): + * M = B * (SCHED_CAPACITY_SCALE - S) + * The obtained M could be used by the caller to "boost" S. + */ + if (boost >= 0) { + margin = SCHED_CAPACITY_SCALE - signal; + margin *= boost; + } else + margin = -signal * boost; + + margin = reciprocal_divide(margin, schedtune_spc_rdiv); + + if (boost < 0) + margin *= -1; + return margin; +} + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + int boost = schedtune_cpu_boost(cpu); + + if (boost == 0) + return 0; + + return schedtune_margin(util, boost); +} + +static inline long +schedtune_task_margin(struct task_struct *task) +{ + int boost = schedtune_task_boost(task); + unsigned long util; + long margin; + + if (boost == 0) + return 0; + + util = task_util_est(task); + margin = schedtune_margin(util, boost); + + return margin; +} + +unsigned long +boosted_cpu_util(int cpu) +{ + unsigned long util = cpu_util_cfs(cpu_rq(cpu)); + long margin = schedtune_cpu_margin(util, cpu); + + return util + margin; +} + +#else /* CONFIG_SCHED_TUNE */ + +static inline int +schedtune_cpu_margin(unsigned long util, int cpu) +{ + return 0; +} + +static inline int +schedtune_task_margin(struct task_struct *task) +{ + return 0; +} + +#endif /* CONFIG_SCHED_TUNE */ + + + +static inline unsigned long +boosted_task_util(struct task_struct *task) +{ + unsigned long util = task_util_est(task); + long margin = schedtune_task_margin(task); + + return util + margin; +} + static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4cdf52e832a1..6245f808e411 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -80,6 +80,8 @@ # define SCHED_WARN_ON(x) ({ (void)(x), 0; }) #endif +#include "tune.h" + struct rq; struct cpuidle_state; diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c new file mode 100644 index 000000000000..af6de6d3df6d --- /dev/null +++ b/kernel/sched/tune.c @@ -0,0 +1,622 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "sched.h" + +bool schedtune_initialized = false; +extern struct reciprocal_value schedtune_spc_rdiv; + +/* + * EAS scheduler tunables for task groups. + * + * When CGroup support is enabled, we have to synchronize two different + * paths: + * - slow path: where CGroups are created/updated/removed + * - fast path: where tasks in a CGroups are accounted + * + * The slow path tracks (a limited number of) CGroups and maps each on a + * "boost_group" index. The fastpath accounts tasks currently RUNNABLE on each + * "boost_group". + * + * Once a new CGroup is created, a boost group idx is assigned and the + * corresponding "boost_group" marked as valid on each CPU. + * Once a CGroup is release, the corresponding "boost_group" is marked as + * invalid on each CPU. The CPU boost value (boost_max) is aggregated by + * considering only valid boost_groups with a non null tasks counter. + * + * .:: Locking strategy + * + * The fast path uses a spin lock for each CPU boost_group which protects the + * tasks counter. + * + * The "valid" and "boost" values of each CPU boost_group is instead + * protected by the RCU lock provided by the CGroups callbacks. Thus, only the + * slow path can access and modify the boost_group attribtues of each CPU. + * The fast path will catch up the most updated values at the next scheduling + * event (i.e. enqueue/dequeue). + * + * | + * SLOW PATH | FAST PATH + * CGroup add/update/remove | Scheduler enqueue/dequeue events + * | + * | + * | DEFINE_PER_CPU(struct boost_groups) + * | +--------------+----+---+----+----+ + * | | idle | | | | | + * | | boost_max | | | | | + * | +---->lock | | | | | + * struct schedtune allocated_groups | | | group[ ] | | | | | + * +------------------------------+ +-------+ | | +--+---------+-+----+---+----+----+ + * | idx | | | | | | valid | + * | boots / prefer_idle | | | | | | boost | + * | perf_{boost/constraints}_idx | <---------+(*) | | | | tasks | <------------+ + * | css | +-------+ | | +---------+ | + * +-+----------------------------+ | | | | | | | + * ^ | | | | | | | + * | +-------+ | | +---------+ | + * | | | | | | | | + * | | | | | | | | + * | +-------+ | | +---------+ | + * | zmalloc | | | | | | | + * | | | | | | | | + * | +-------+ | | +---------+ | + * + BOOSTGROUPS_COUNT | | BOOSTGROUPS_COUNT | + * schedtune_boostgroup_init() | + | + * | schedtune_{en,de}queue_task() | + * | + + * | schedtune_tasks_update() + * | + */ + +/* SchdTune tunables for a group of tasks */ +struct schedtune { + /* SchedTune CGroup subsystem */ + struct cgroup_subsys_state css; + + /* Boost group allocated ID */ + int idx; + + /* Boost value for tasks on that SchedTune CGroup */ + int boost; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards idle CPUs */ + int prefer_idle; +}; + +static inline struct schedtune *css_st(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct schedtune, css) : NULL; +} + +static inline struct schedtune *task_schedtune(struct task_struct *tsk) +{ + return css_st(task_css(tsk, schedtune_cgrp_id)); +} + +static inline struct schedtune *parent_st(struct schedtune *st) +{ + return css_st(st->css.parent); +} + +/* + * SchedTune root control group + * The root control group is used to defined a system-wide boosting tuning, + * which is applied to all tasks in the system. + * Task specific boost tuning could be specified by creating and + * configuring a child control group under the root one. + * By default, system-wide boosting is disabled, i.e. no boosting is applied + * to tasks which are not into a child control group. + */ +static struct schedtune +root_schedtune = { + .boost = 0, + .prefer_idle = 0, +}; + +/* + * Maximum number of boost groups to support + * When per-task boosting is used we still allow only limited number of + * boost groups for two main reasons: + * 1. on a real system we usually have only few classes of workloads which + * make sense to boost with different values (e.g. background vs foreground + * tasks, interactive vs low-priority tasks) + * 2. a limited number allows for a simpler and more memory/time efficient + * implementation especially for the computation of the per-CPU boost + * value + */ +#define BOOSTGROUPS_COUNT 5 + +/* Array of configured boostgroups */ +static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { + &root_schedtune, + NULL, +}; + +/* SchedTune boost groups + * Keep track of all the boost groups which impact on CPU, for example when a + * CPU has two RUNNABLE tasks belonging to two different boost groups and thus + * likely with different boost values. + * Since on each system we expect only a limited number of boost groups, here + * we use a simple array to keep track of the metrics required to compute the + * maximum per-CPU boosting value. + */ +struct boost_groups { + /* Maximum boost value for all RUNNABLE tasks on a CPU */ + int boost_max; + struct { + /* True when this boost group maps an actual cgroup */ + bool valid; + /* The boost for tasks on that boost group */ + int boost; + /* Count of RUNNABLE tasks on that boost group */ + unsigned tasks; + } group[BOOSTGROUPS_COUNT]; + /* CPU's boost group locking */ + raw_spinlock_t lock; +}; + +/* Boost groups affecting each CPU in the system */ +DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); + +static void +schedtune_cpu_update(int cpu) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int boost_max; + int idx; + + /* The root boost group is always active */ + boost_max = bg->group[0].boost; + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { + + /* Ignore non boostgroups not mapping a cgroup */ + if (!bg->group[idx].valid) + continue; + + /* + * A boost group affects a CPU only if it has + * RUNNABLE tasks on that CPU + */ + if (bg->group[idx].tasks == 0) + continue; + + boost_max = max(boost_max, bg->group[idx].boost); + } + + /* Ensures boost_max is non-negative when all cgroup boost values + * are neagtive. Avoids under-accounting of cpu capacity which may cause + * task stacking and frequency spikes.*/ + boost_max = max(boost_max, 0); + bg->boost_max = boost_max; +} + +static int +schedtune_boostgroup_update(int idx, int boost) +{ + struct boost_groups *bg; + int cur_boost_max; + int old_boost; + int cpu; + + /* Update per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + + /* CGroups are never associated to non active cgroups */ + BUG_ON(!bg->group[idx].valid); + + /* + * Keep track of current boost values to compute the per CPU + * maximum only when it has been affected by the new value of + * the updated boost group + */ + cur_boost_max = bg->boost_max; + old_boost = bg->group[idx].boost; + + /* Update the boost value of this boost group */ + bg->group[idx].boost = boost; + + /* Check if this update increase current max */ + if (boost > cur_boost_max && bg->group[idx].tasks) { + bg->boost_max = boost; + continue; + } + + /* Check if this update has decreased current max */ + if (cur_boost_max == old_boost && old_boost > boost) { + schedtune_cpu_update(cpu); + continue; + } + } + + return 0; +} + +#define ENQUEUE_TASK 1 +#define DEQUEUE_TASK -1 + +static inline void +schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + int tasks = bg->group[idx].tasks + task_count; + + /* Update boosted tasks count while avoiding to make it negative */ + bg->group[idx].tasks = max(0, tasks); + + /* Boost group activation or deactivation on that RQ */ + if (tasks == 1 || tasks == 0) + schedtune_cpu_update(cpu); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_enqueue_task(struct task_struct *p, int cpu) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; + struct schedtune *st; + int idx; + + if (unlikely(!schedtune_initialized)) + return; + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions for example on + * do_exit()::cgroup_exit() and task migration. + */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); + rcu_read_lock(); + + st = task_schedtune(p); + idx = st->idx; + + schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +int schedtune_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + struct boost_groups *bg; + struct rq_flags rq_flags; + unsigned int cpu; + struct rq *rq; + int src_bg; /* Source boost group index */ + int dst_bg; /* Destination boost group index */ + int tasks; + + if (unlikely(!schedtune_initialized)) + return 0; + + + cgroup_taskset_for_each(task, css, tset) { + + /* + * Lock the CPU's RQ the task is enqueued to avoid race + * conditions with migration code while the task is being + * accounted + */ + rq = task_rq_lock(task, &rq_flags); + + if (!task->on_rq) { + task_rq_unlock(rq, task, &rq_flags); + continue; + } + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... + */ + cpu = cpu_of(rq); + bg = &per_cpu(cpu_boost_groups, cpu); + raw_spin_lock(&bg->lock); + + dst_bg = css_st(css)->idx; + src_bg = task_schedtune(task)->idx; + + /* + * Current task is not changing boostgroup, which can + * happen when the new hierarchy is in use. + */ + if (unlikely(dst_bg == src_bg)) { + raw_spin_unlock(&bg->lock); + task_rq_unlock(rq, task, &rq_flags); + continue; + } + + /* + * This is the case of a RUNNABLE task which is switching its + * current boost group. + */ + + /* Move task from src to dst boost group */ + tasks = bg->group[src_bg].tasks - 1; + bg->group[src_bg].tasks = max(0, tasks); + bg->group[dst_bg].tasks += 1; + + raw_spin_unlock(&bg->lock); + task_rq_unlock(rq, task, &rq_flags); + + /* Update CPU boost group */ + if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) + schedtune_cpu_update(task_cpu(task)); + + } + + return 0; +} + +void schedtune_cancel_attach(struct cgroup_taskset *tset) +{ + /* This can happen only if SchedTune controller is mounted with + * other hierarchies ane one of them fails. Since usually SchedTune is + * mouted on its own hierarcy, for the time being we do not implement + * a proper rollback mechanism */ + WARN(1, "SchedTune cancel attach not implemented"); +} + +/* + * NOTE: This function must be called while holding the lock on the CPU RQ + */ +void schedtune_dequeue_task(struct task_struct *p, int cpu) +{ + struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); + unsigned long irq_flags; + struct schedtune *st; + int idx; + + if (unlikely(!schedtune_initialized)) + return; + + /* + * Boost group accouting is protected by a per-cpu lock and requires + * interrupt to be disabled to avoid race conditions on... + */ + raw_spin_lock_irqsave(&bg->lock, irq_flags); + rcu_read_lock(); + + st = task_schedtune(p); + idx = st->idx; + + schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK); + + rcu_read_unlock(); + raw_spin_unlock_irqrestore(&bg->lock, irq_flags); +} + +int schedtune_cpu_boost(int cpu) +{ + struct boost_groups *bg; + + bg = &per_cpu(cpu_boost_groups, cpu); + return bg->boost_max; +} + +int schedtune_task_boost(struct task_struct *p) +{ + struct schedtune *st; + int task_boost; + + if (unlikely(!schedtune_initialized)) + return 0; + + /* Get task boost value */ + rcu_read_lock(); + st = task_schedtune(p); + task_boost = st->boost; + rcu_read_unlock(); + + return task_boost; +} + +int schedtune_prefer_idle(struct task_struct *p) +{ + struct schedtune *st; + int prefer_idle; + + if (unlikely(!schedtune_initialized)) + return 0; + + /* Get prefer_idle value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_idle = st->prefer_idle; + rcu_read_unlock(); + + return prefer_idle; +} + +static u64 +prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_idle; +} + +static int +prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_idle) +{ + struct schedtune *st = css_st(css); + st->prefer_idle = !!prefer_idle; + + return 0; +} + +static s64 +boost_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->boost; +} + +static int +boost_write(struct cgroup_subsys_state *css, struct cftype *cft, + s64 boost) +{ + struct schedtune *st = css_st(css); + + if (boost < 0 || boost > 100) + return -EINVAL; + + st->boost = boost; + + /* Update CPU boost */ + schedtune_boostgroup_update(st->idx, st->boost); + + return 0; +} + +static struct cftype files[] = { + { + .name = "boost", + .read_s64 = boost_read, + .write_s64 = boost_write, + }, + { + .name = "prefer_idle", + .read_u64 = prefer_idle_read, + .write_u64 = prefer_idle_write, + }, + { } /* terminate */ +}; + +static void +schedtune_boostgroup_init(struct schedtune *st, int idx) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize per CPUs boost group support */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[idx].boost = 0; + bg->group[idx].valid = true; + } + + /* Keep track of allocated boost groups */ + allocated_group[idx] = st; + st->idx = idx; +} + +static struct cgroup_subsys_state * +schedtune_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct schedtune *st; + int idx; + + if (!parent_css) + return &root_schedtune.css; + + /* Allow only single level hierachies */ + if (parent_css != &root_schedtune.css) { + pr_err("Nested SchedTune boosting groups not allowed\n"); + return ERR_PTR(-ENOMEM); + } + + /* Allow only a limited number of boosting groups */ + for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) + if (!allocated_group[idx]) + break; + if (idx == BOOSTGROUPS_COUNT) { + pr_err("Trying to create more than %d SchedTune boosting groups\n", + BOOSTGROUPS_COUNT); + return ERR_PTR(-ENOSPC); + } + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto out; + + /* Initialize per CPUs boost group support */ + schedtune_boostgroup_init(st, idx); + + return &st->css; + +out: + return ERR_PTR(-ENOMEM); +} + +static void +schedtune_boostgroup_release(struct schedtune *st) +{ + struct boost_groups *bg; + int cpu; + + /* Reset per CPUs boost group support */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + bg->group[st->idx].valid = false; + bg->group[st->idx].boost = 0; + } + + /* Keep track of allocated boost groups */ + allocated_group[st->idx] = NULL; +} + +static void +schedtune_css_free(struct cgroup_subsys_state *css) +{ + struct schedtune *st = css_st(css); + + /* Release per CPUs boost group support */ + schedtune_boostgroup_release(st); + kfree(st); +} + +struct cgroup_subsys schedtune_cgrp_subsys = { + .css_alloc = schedtune_css_alloc, + .css_free = schedtune_css_free, + .can_attach = schedtune_can_attach, + .cancel_attach = schedtune_cancel_attach, + .legacy_cftypes = files, + .early_init = 1, +}; + +static inline void +schedtune_init_cgroups(void) +{ + struct boost_groups *bg; + int cpu; + + /* Initialize the per CPU boost groups */ + for_each_possible_cpu(cpu) { + bg = &per_cpu(cpu_boost_groups, cpu); + memset(bg, 0, sizeof(struct boost_groups)); + bg->group[0].valid = true; + raw_spin_lock_init(&bg->lock); + } + + pr_info("schedtune: configured to support %d boost groups\n", + BOOSTGROUPS_COUNT); + + schedtune_initialized = true; +} + +/* + * Initialize the cgroup structures + */ +static int +schedtune_init(void) +{ + schedtune_spc_rdiv = reciprocal_value(100); + schedtune_init_cgroups(); + return 0; +} +postcore_initcall(schedtune_init); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h new file mode 100644 index 000000000000..bb187c6112a7 --- /dev/null +++ b/kernel/sched/tune.h @@ -0,0 +1,37 @@ + +#ifdef CONFIG_SCHED_TUNE + +#include + +/* + * System energy normalization constants + */ +struct target_nrg { + unsigned long min_power; + unsigned long max_power; + struct reciprocal_value rdiv; +}; + +int schedtune_cpu_boost(int cpu); +int schedtune_task_boost(struct task_struct *tsk); + +int schedtune_prefer_idle(struct task_struct *tsk); + +void schedtune_enqueue_task(struct task_struct *p, int cpu); +void schedtune_dequeue_task(struct task_struct *p, int cpu); + +unsigned long boosted_cpu_util(int cpu); + +#else /* CONFIG_SCHED_TUNE */ + +#define schedtune_cpu_boost(cpu) 0 +#define schedtune_task_boost(tsk) 0 + +#define schedtune_prefer_idle(tsk) 0 + +#define schedtune_enqueue_task(task, cpu) do { } while (0) +#define schedtune_dequeue_task(task, cpu) do { } while (0) + +#define boosted_cpu_util(cpu) cpu_util_cfs(cpu_rq(cpu)) + +#endif /* CONFIG_SCHED_TUNE */ From 3e63fb2c9136fe919893b3ac3622b61b4eb5c836 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 27 Jan 2015 13:48:07 +0000 Subject: [PATCH 44/75] ANDROID: sched, cpuidle: Track cpuidle state index in the scheduler The idle-state of each cpu is currently pointed to by rq->idle_state but there isn't any information in the struct cpuidle_state that can used to look up the idle-state energy model data stored in struct sched_group_energy. For this purpose is necessary to store the idle state index as well. Ideally, the idle-state data should be unified. cc: Ingo Molnar cc: Peter Zijlstra Signed-off-by: Morten Rasmussen Change-Id: Ib3d1178512735b0e314881f73fb8ccff5a69319f Signed-off-by: Chris Redpath (cherry picked from commit a732c97420e109956c20f34c70b91e6d06f5df31) [ Fixed trivial cherry-pick conflict in kernel/sched/sched.h ] Signed-off-by: Quentin Perret --- drivers/cpuidle/cpuidle.c | 4 ++-- include/linux/cpuidle.h | 2 +- kernel/sched/idle.c | 3 ++- kernel/sched/sched.h | 21 +++++++++++++++++++++ 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 6df894d65d9e..96a3a9bf8b12 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -221,7 +221,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, } /* Take note of the planned idle state. */ - sched_idle_set_state(target_state); + sched_idle_set_state(target_state, index); trace_cpu_idle_rcuidle(index, dev->cpu); time_start = ns_to_ktime(local_clock()); @@ -235,7 +235,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* The cpu is no longer idle or about to enter idle. */ - sched_idle_set_state(NULL); + sched_idle_set_state(NULL, -1); if (broadcast) { if (WARN_ON_ONCE(!irqs_disabled())) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4325d6fdde9b..35b067126e9a 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -219,7 +219,7 @@ static inline void cpuidle_use_deepest_state(bool enable) #endif /* kernel/sched/idle.c */ -extern void sched_idle_set_state(struct cpuidle_state *idle_state); +extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index); extern void default_idle_call(void); #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 16f84142f2f4..2d3e2804c08b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -16,9 +16,10 @@ extern char __cpuidle_text_start[], __cpuidle_text_end[]; * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. */ -void sched_idle_set_state(struct cpuidle_state *idle_state) +void sched_idle_set_state(struct cpuidle_state *idle_state, int index) { idle_set_state(this_rq(), idle_state); + idle_set_state_idx(this_rq(), index); } static int __read_mostly cpu_idle_force_poll; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6245f808e411..2fd3ad74c95b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -944,6 +944,7 @@ struct rq { #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; + int idle_state_idx; #endif }; @@ -1644,6 +1645,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) return rq->idle_state; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ + rq->idle_state_idx = idle_state_idx; +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + WARN_ON(!rcu_read_lock_held()); + return rq->idle_state_idx; +} #else static inline void idle_set_state(struct rq *rq, struct cpuidle_state *idle_state) @@ -1654,6 +1666,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) { return NULL; } + +static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx) +{ +} + +static inline int idle_get_state_idx(struct rq *rq) +{ + return -1; +} #endif extern void schedule_idle(void); From 2e88529cf886868901daeed15388a2f54faad6d1 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Mon, 23 Jul 2018 15:14:59 +0100 Subject: [PATCH 45/75] ANDROID: sched: Introduce sysctl_sched_cstate_aware Introduce a new sysctl for this option, 'sched_cstate_aware'. When this is enabled, the scheduler can make use of the idle state indexes in order to break the tie between potential CPU candidates. This patch is based on 7f6fb825d6bc ("ANDROID: sched: EAS: take cstate into account when selecting idle core") from android-4.14. All the credits goes to the authors. Change-Id: Ia076cf32faff91e90905291fa6f7924dc3dd6458 Signed-off-by: Quentin Perret --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 5 +++++ kernel/sysctl.c | 7 +++++++ 3 files changed, 13 insertions(+) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 99ce6d728df7..64a547ecc67e 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -22,6 +22,7 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_cstate_aware; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 460237bda9e2..ac2252bd7c99 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -40,6 +40,11 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +/* + * Enable/disable using cstate knowledge in idle sibling selection + */ +unsigned int sysctl_sched_cstate_aware = 1; + /* * The initial- and re-scaling of tunables is configurable * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 481309a11174..71da17cfd242 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -320,6 +320,13 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_cstate_aware", + .data = &sysctl_sched_cstate_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_min_granularity_ns", .data = &sysctl_sched_min_granularity, From 9378697a9fb8443bd6406f7430bfee9321472270 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 31 Aug 2018 11:24:44 +0100 Subject: [PATCH 46/75] ANDROID: sched/fair: Factor out CPU selection from find_energy_efficient_cpu find_energy_efficient_cpu() is composed of two steps; we first look for the CPU with the max spare capacity in each frequency domain, and then the impact on energy of each candidate is estimated. In order to make it easier to implement other CPU selection policies, let's factor the candidate selection algorithm out of find_energy_efficient_cpu(), and mark the candidates in a mask. This should result in no functional difference. Signed-off-by: Quentin Perret Change-Id: I85a28880f01fcd11d7af28f9fbc1fe0cf4f197cf --- kernel/sched/fair.c | 102 +++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 40 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ac2252bd7c99..fc0d92409249 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6555,6 +6555,44 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) return energy; } +static void select_max_spare_cap_cpus(struct sched_domain *sd, cpumask_t *cpus, + struct perf_domain *pd, struct task_struct *p) +{ + unsigned long spare_cap, max_spare_cap, util, cpu_cap; + int cpu, max_spare_cap_cpu; + + for (; pd; pd = pd->next) { + max_spare_cap_cpu = -1; + max_spare_cap = 0; + + for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { + if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) + continue; + + /* Skip CPUs that will be overutilized. */ + util = cpu_util_next(cpu, p, cpu); + cpu_cap = capacity_of(cpu); + if (cpu_cap * 1024 < util * capacity_margin) + continue; + + /* + * Find the CPU with the maximum spare capacity in + * the performance domain + */ + spare_cap = cpu_cap - util; + if (spare_cap > max_spare_cap) { + max_spare_cap = spare_cap; + max_spare_cap_cpu = cpu; + } + } + + if (max_spare_cap_cpu >= 0) + cpumask_set_cpu(max_spare_cap_cpu, cpus); + } +} + +static DEFINE_PER_CPU(cpumask_t, energy_cpus); + /* * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the * waking task. find_energy_efficient_cpu() looks for the CPU with maximum @@ -6599,16 +6637,16 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - int cpu, best_energy_cpu = prev_cpu; - struct perf_domain *head, *pd; - unsigned long cpu_cap, util; + int weight, cpu, best_energy_cpu = prev_cpu; + unsigned long cur_energy; + struct perf_domain *pd; struct sched_domain *sd; + cpumask_t *candidates; rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) goto fail; - head = pd; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -6624,45 +6662,29 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (!task_util_est(p)) goto unlock; - for (; pd; pd = pd->next) { - unsigned long cur_energy, spare_cap, max_spare_cap = 0; - int max_spare_cap_cpu = -1; + /* Pre-select a set of candidate CPUs. */ + candidates = this_cpu_ptr(&energy_cpus); + cpumask_clear(candidates); + select_max_spare_cap_cpus(sd, candidates, pd, p); - for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) - continue; + /* Bail out if there is no candidate, or if the only one is prev_cpu */ + weight = cpumask_weight(candidates); + if (!weight || (weight == 1 && cpumask_first(candidates) == prev_cpu)) + goto unlock; - /* Skip CPUs that will be overutilized. */ - util = cpu_util_next(cpu, p, cpu); - cpu_cap = capacity_of(cpu); - if (cpu_cap * 1024 < util * capacity_margin) - continue; + if (cpumask_test_cpu(prev_cpu, &p->cpus_allowed)) + prev_energy = best_energy = compute_energy(p, prev_cpu, pd); + else + prev_energy = best_energy = ULONG_MAX; - /* Always use prev_cpu as a candidate. */ - if (cpu == prev_cpu) { - prev_energy = compute_energy(p, prev_cpu, head); - best_energy = min(best_energy, prev_energy); - continue; - } - - /* - * Find the CPU with the maximum spare capacity in - * the performance domain - */ - spare_cap = cpu_cap - util; - if (spare_cap > max_spare_cap) { - max_spare_cap = spare_cap; - max_spare_cap_cpu = cpu; - } - } - - /* Evaluate the energy impact of using this CPU. */ - if (max_spare_cap_cpu >= 0) { - cur_energy = compute_energy(p, max_spare_cap_cpu, head); - if (cur_energy < best_energy) { - best_energy = cur_energy; - best_energy_cpu = max_spare_cap_cpu; - } + /* Select the best candidate energy-wise. */ + for_each_cpu(cpu, candidates) { + if (cpu == prev_cpu) + continue; + cur_energy = compute_energy(p, cpu, pd); + if (cur_energy < best_energy) { + best_energy = cur_energy; + best_energy_cpu = cpu; } } unlock: From c27c56105dcaaae54ecc39ef33fbfac87a1486fc Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 19 Dec 2017 19:43:01 +0000 Subject: [PATCH 47/75] ANDROID: Add find_best_target to minimise energy calculation overhead find_best_target started life as an optimised energy cpu selection algorithm designed to be efficient and high performance and integrate well with schedtune and userspace cpuset configuration. It has had many many small tweaks over time, and the version added here is forward ported from the version used in android-4.4 and android-4.9. The linkage to the rest of EAS is slightly different, however. This version is split into the three main use-cases and addresses them in priority order: A) latency sensitive tasks B) non latency sensitive tasks on IDLE CPUs C) non latency sensitive tasks on ACTIVE CPUs Case A) Latency sensitive tasks Unconditionally favoring tasks that prefer idle CPU to improve latency. When we do enter here, we are looking for: - an idle CPU, whatever its idle_state is, since the first CPUs we explore are more likely to be reserved for latency sensitive tasks. - a non idle CPU where the task fits in its current capacity and has the maximum spare capacity. - a non idle CPU with lower contention from other tasks and running at the lowest possible OPP. The last two goals try to favor a non idle CPU where the task can run as if it is "almost alone". A maximum spare capacity CPU is favored since the task already fits into that CPU's capacity without waiting for an OPP change. For any case other than case A, we avoid CPUs which would become overutilized if we placed the task there. Case B) Non latency sensitive tasks on IDLE CPUs. Find an optimal backup IDLE CPU for non latency sensitive tasks. Here we are looking for: - minimizing the capacity_orig, i.e. preferring LITTLE CPUs If IDLE cpus are available, we prefer to choose one in order to spread tasks and improve performance. Case C) Non latency sensitive tasks on ACTIVE CPUs. Pack tasks in the most energy efficient capacities. This task packing strategy prefers more energy efficient CPUs (i.e. pack on smaller maximum capacity CPUs) while also trying to spread tasks to run them all at the lower OPP. This assumes for example that it's more energy efficient to run two tasks on two CPUs at a lower OPP than packing both on a single CPU but running that CPU at an higher OPP. This code has had many other contributors over the development listed here as Cc. Cc: Ke Wang Cc: Joel Fernandes Cc: Patrick Bellasi Cc: Valentin Schneider Cc: Dietmar Eggemann Cc: Srinath Sridharan Cc: Todd Kjos Cc: Juri Lelli Signed-off-by: Chris Redpath (cherry picked from commit f240e44406558b17ff7765f252b0bcdcbc15126f) [ - Removed tracepoints - Took capacity_curr_of() from "7f6fb82 ANDROID: sched: EAS: take cstate into account when selecting idle core" - Re-use sd_ea from find_energy_efficient_cpu() / removed start_cpu() - Mark candidates with a cpumask given by feec() - Squashed Ionela's tri-gear fbt fixes from android-4.14 - Squashed Patrick's changes related to util_est from android-4.14 ] Signed-off-by: Quentin Perret Change-Id: I9500d308c879dd53b08adeda8f988238e39cc392 --- kernel/sched/fair.c | 318 +++++++++++++++++++++++++++++++++++++++- kernel/sched/features.h | 5 + 2 files changed, 322 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fc0d92409249..97ca8d2dd483 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6452,6 +6452,318 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p) return min_t(unsigned long, util, capacity_orig_of(cpu)); } +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + + return cap_scale(max_cap, scale_freq); +} + +static void find_best_target(struct sched_domain *sd, cpumask_t *cpus, + struct task_struct *p) +{ + unsigned long min_util = boosted_task_util(p); + unsigned long target_capacity = ULONG_MAX; + unsigned long min_wake_util = ULONG_MAX; + unsigned long target_max_spare_cap = 0; + unsigned long target_util = ULONG_MAX; + bool prefer_idle = schedtune_prefer_idle(p); + bool boosted = schedtune_task_boost(p) > 0; + /* Initialise with deepest possible cstate (INT_MAX) */ + int shallowest_idle_cstate = INT_MAX; + struct sched_group *sg; + int best_active_cpu = -1; + int best_idle_cpu = -1; + int target_cpu = -1; + int backup_cpu = -1; + int i; + + /* + * In most cases, target_capacity tracks capacity_orig of the most + * energy efficient CPU candidate, thus requiring to minimise + * target_capacity. For these cases target_capacity is already + * initialized to ULONG_MAX. + * However, for prefer_idle and boosted tasks we look for a high + * performance CPU, thus requiring to maximise target_capacity. In this + * case we initialise target_capacity to 0. + */ + if (prefer_idle && boosted) + target_capacity = 0; + + /* Scan CPUs in all SDs */ + sg = sd->groups; + do { + for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) { + unsigned long capacity_curr = capacity_curr_of(i); + unsigned long capacity_orig = capacity_orig_of(i); + unsigned long wake_util, new_util; + long spare_cap; + int idle_idx = INT_MAX; + + if (!cpu_online(i)) + continue; + + /* + * p's blocked utilization is still accounted for on prev_cpu + * so prev_cpu will receive a negative bias due to the double + * accounting. However, the blocked utilization may be zero. + */ + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util_est(p); + + /* + * Ensure minimum capacity to grant the required boost. + * The target CPU can be already at a capacity level higher + * than the one required to boost the task. + */ + new_util = max(min_util, new_util); + if (new_util > capacity_orig) + continue; + + /* + * Pre-compute the maximum possible capacity we expect + * to have available on this CPU once the task is + * enqueued here. + */ + spare_cap = capacity_orig - new_util; + + if (idle_cpu(i)) + idle_idx = idle_get_state_idx(cpu_rq(i)); + + + /* + * Case A) Latency sensitive tasks + * + * Unconditionally favoring tasks that prefer idle CPU to + * improve latency. + * + * Looking for: + * - an idle CPU, whatever its idle_state is, since + * the first CPUs we explore are more likely to be + * reserved for latency sensitive tasks. + * - a non idle CPU where the task fits in its current + * capacity and has the maximum spare capacity. + * - a non idle CPU with lower contention from other + * tasks and running at the lowest possible OPP. + * + * The last two goals tries to favor a non idle CPU + * where the task can run as if it is "almost alone". + * A maximum spare capacity CPU is favoured since + * the task already fits into that CPU's capacity + * without waiting for an OPP chance. + * + * The following code path is the only one in the CPUs + * exploration loop which is always used by + * prefer_idle tasks. It exits the loop with wither a + * best_active_cpu or a target_cpu which should + * represent an optimal choice for latency sensitive + * tasks. + */ + if (prefer_idle) { + + /* + * Case A.1: IDLE CPU + * Return the best IDLE CPU we find: + * - for boosted tasks: the CPU with the highest + * performance (i.e. biggest capacity_orig) + * - for !boosted tasks: the most energy + * efficient CPU (i.e. smallest capacity_orig) + */ + if (idle_cpu(i)) { + if (boosted && + capacity_orig < target_capacity) + continue; + if (!boosted && + capacity_orig > target_capacity) + continue; + /* + * Minimise value of idle state: skip + * deeper idle states and pick the + * shallowest. + */ + if (capacity_orig == target_capacity && + sysctl_sched_cstate_aware && + idle_idx >= shallowest_idle_cstate) + continue; + + target_capacity = capacity_orig; + shallowest_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + if (best_idle_cpu != -1) + continue; + + /* + * Case A.2: Target ACTIVE CPU + * Favor CPUs with max spare capacity. + */ + if (capacity_curr > new_util && + spare_cap > target_max_spare_cap) { + target_max_spare_cap = spare_cap; + target_cpu = i; + continue; + } + if (target_cpu != -1) + continue; + + + /* + * Case A.3: Backup ACTIVE CPU + * Favor CPUs with: + * - lower utilization due to other tasks + * - lower utilization with the task in + */ + if (wake_util > min_wake_util) + continue; + min_wake_util = wake_util; + best_active_cpu = i; + continue; + } + + /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + + /* + * Favor CPUs with smaller capacity for non latency + * sensitive tasks. + */ + if (capacity_orig > target_capacity) + continue; + + /* + * Case B) Non latency sensitive tasks on IDLE CPUs. + * + * Find an optimal backup IDLE CPU for non latency + * sensitive tasks. + * + * Looking for: + * - minimizing the capacity_orig, + * i.e. preferring LITTLE CPUs + * - favoring shallowest idle states + * i.e. avoid to wakeup deep-idle CPUs + * + * The following code path is used by non latency + * sensitive tasks if IDLE CPUs are available. If at + * least one of such CPUs are available it sets the + * best_idle_cpu to the most suitable idle CPU to be + * selected. + * + * If idle CPUs are available, favour these CPUs to + * improve performances by spreading tasks. + * Indeed, the energy_diff() computed by the caller + * will take care to ensure the minimization of energy + * consumptions without affecting performance. + */ + if (idle_cpu(i)) { + /* + * Skip CPUs in deeper idle state, but only + * if they are also less energy efficient. + * IOW, prefer a deep IDLE LITTLE CPU vs a + * shallow idle big CPU. + */ + if (capacity_orig == target_capacity && + sysctl_sched_cstate_aware && + idle_idx >= shallowest_idle_cstate) + continue; + + target_capacity = capacity_orig; + shallowest_idle_cstate = idle_idx; + best_idle_cpu = i; + continue; + } + + /* + * Case C) Non latency sensitive tasks on ACTIVE CPUs. + * + * Pack tasks in the most energy efficient capacities. + * + * This task packing strategy prefers more energy + * efficient CPUs (i.e. pack on smaller maximum + * capacity CPUs) while also trying to spread tasks to + * run them all at the lower OPP. + * + * This assumes for example that it's more energy + * efficient to run two tasks on two CPUs at a lower + * OPP than packing both on a single CPU but running + * that CPU at an higher OPP. + * + * Thus, this case keep track of the CPU with the + * smallest maximum capacity and highest spare maximum + * capacity. + */ + + /* Favor CPUs with maximum spare capacity */ + if (capacity_orig == target_capacity && + spare_cap < target_max_spare_cap) + continue; + + target_max_spare_cap = spare_cap; + target_capacity = capacity_orig; + target_util = new_util; + target_cpu = i; + } + + } while (sg = sg->next, sg != sd->groups); + + /* + * For non latency sensitive tasks, cases B and C in the previous loop, + * we pick the best IDLE CPU only if we was not able to find a target + * ACTIVE CPU. + * + * Policies priorities: + * + * - prefer_idle tasks: + * + * a) IDLE CPU available: best_idle_cpu + * b) ACTIVE CPU where task fits and has the bigger maximum spare + * capacity (i.e. target_cpu) + * c) ACTIVE CPU with less contention due to other tasks + * (i.e. best_active_cpu) + * + * - NON prefer_idle tasks: + * + * a) ACTIVE CPU: target_cpu + * b) IDLE CPU: best_idle_cpu + */ + + if (prefer_idle && (best_idle_cpu != -1)) { + target_cpu = best_idle_cpu; + goto target; + } + + if (target_cpu == -1) + target_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + else + backup_cpu = prefer_idle + ? best_active_cpu + : best_idle_cpu; + + if (backup_cpu >= 0) + cpumask_set_cpu(backup_cpu, cpus); + if (target_cpu >= 0) { +target: + cpumask_set_cpu(target_cpu, cpus); + } +} + /* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. @@ -6665,7 +6977,11 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) /* Pre-select a set of candidate CPUs. */ candidates = this_cpu_ptr(&energy_cpus); cpumask_clear(candidates); - select_max_spare_cap_cpus(sd, candidates, pd, p); + + if (sched_feat(FIND_BEST_TARGET)) + find_best_target(sd, candidates, p); + else + select_max_spare_cap_cpus(sd, candidates, pd, p); /* Bail out if there is no candidate, or if the only one is prev_cpu */ weight = cpumask_weight(candidates); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 85ae8488039c..531ebc5c266c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -90,3 +90,8 @@ SCHED_FEAT(WA_BIAS, true) * UtilEstimation. Use estimated CPU utilization. */ SCHED_FEAT(UTIL_EST, true) + +/* + * Fast pre-selection of CPU candidates for EAS. + */ +SCHED_FEAT(FIND_BEST_TARGET, true) From af362094dfe83c5134b95d3a7112ad49303a400e Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Sat, 21 Oct 2017 15:56:50 +0100 Subject: [PATCH 48/75] ANDROID: sched: Unconditionally honor sync flag for energy-aware wakeups Since we don't do energy-aware wakeups when we are overutilized, always honoring sync wakeups in this state does not prevent wake-wide mechanics overruling the flag as normal. This patch is based upon previous work to build EAS for android products. sync-hint code taken from commit 4a5e890ec60d "sched/fair: add tunable to force selection at cpu granularity" written by Juri Lelli Signed-off-by: Chris Redpath (cherry-picked from commit f1ec666a62dec1083ed52fe1ddef093b84373aaf) [ Moved the feature to find_energy_efficient_cpu() ] Signed-off-by: Quentin Perret Change-Id: I4b3d79141fc8e53dc51cd63ac11096c2e3cb10f5 --- include/linux/sched/sysctl.h | 1 + kernel/sched/fair.c | 15 +++++++++++++-- kernel/sysctl.c | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 64a547ecc67e..3c0f82df2c0a 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -22,6 +22,7 @@ enum { sysctl_hung_task_timeout_secs = 0 }; extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; +extern unsigned int sysctl_sched_sync_hint_enable; extern unsigned int sysctl_sched_cstate_aware; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97ca8d2dd483..e52abce08c76 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -40,6 +40,11 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +/* + * Enable/disable honoring sync flag in energy-aware wakeups. + */ +unsigned int sysctl_sched_sync_hint_enable = 1; + /* * Enable/disable using cstate knowledge in idle sibling selection */ @@ -6945,7 +6950,7 @@ static DEFINE_PER_CPU(cpumask_t, energy_cpus); * let's keep things simple by re-using the existing slow path. */ -static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) +static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync) { unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -6955,6 +6960,12 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct sched_domain *sd; cpumask_t *candidates; + if (sysctl_sched_sync_hint_enable && sync) { + cpu = smp_processor_id(); + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) + return cpu; + } + rcu_read_lock(); pd = rcu_dereference(rd->pd); if (!pd || READ_ONCE(rd->overutilized)) @@ -7049,7 +7060,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f record_wakee(p); if (static_branch_unlikely(&sched_energy_present)) { - new_cpu = find_energy_efficient_cpu(p, prev_cpu); + new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync); if (new_cpu >= 0) return new_cpu; new_cpu = prev_cpu; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71da17cfd242..e0eb3c5b1df6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -345,6 +345,13 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_sync_hint_enable", + .data = &sysctl_sched_sync_hint_enable, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity, From 88d05f9257a770f4ca4545232804f7f154c07e3a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 11 Aug 2017 10:45:58 +0100 Subject: [PATCH 49/75] FROMLIST: sched/fair: Use wake_q length as a hint for wake_wide This patch adds a parameter to select_task_rq, sibling_count_hint allowing the caller, where it has this information, to inform the sched_class the number of tasks that are being woken up as part of the same event. The wake_q mechanism is one case where this information is available. select_task_rq_fair can then use the information to detect that it needs to widen the search space for task placement in order to avoid overloading the last-level cache domain's CPUs. * * * The reason I am investigating this change is the following use case on ARM big.LITTLE (asymmetrical CPU capacity): 1 task per CPU, which all repeatedly do X amount of work then pthread_barrier_wait (i.e. sleep until the last task finishes its X and hits the barrier). On big.LITTLE, the tasks which get a "big" CPU finish faster, and then those CPUs pull over the tasks that are still running: v CPU v ->time-> ------------- 0 (big) 11111 /333 ------------- 1 (big) 22222 /444| ------------- 2 (LITTLE) 333333/ ------------- 3 (LITTLE) 444444/ ------------- Now when task 4 hits the barrier (at |) and wakes the others up, there are 4 tasks with prev_cpu= and 0 tasks with prev_cpu=. want_affine therefore means that we'll only look in CPUs 0 and 1 (sd_llc), so tasks will be unnecessarily coscheduled on the bigs until the next load balance, something like this: v CPU v ->time-> ------------------------ 0 (big) 11111 /333 31313\33333 ------------------------ 1 (big) 22222 /444|424\4444444 ------------------------ 2 (LITTLE) 333333/ \222222 ------------------------ 3 (LITTLE) 444444/ \1111 ------------------------ ^^^ underutilization So, I'm trying to get want_affine = 0 for these tasks. I don't _think_ any incarnation of the wakee_flips mechanism can help us here because which task is waker and which tasks are wakees generally changes with each iteration. However pthread_barrier_wait (or more accurately FUTEX_WAKE) has the nice property that we know exactly how many tasks are being woken, so we can cheat. It might be a disadvantage that we "widen" _every_ task that's woken in an event, while select_idle_sibling would work fine for the first sd_llc_size - 1 tasks. IIUC, if wake_affine() behaves correctly this trick wouldn't be necessary on SMP systems, so it might be best guarded by the presence of SD_ASYM_CPUCAPACITY? * * * Final note.. In order to observe "perfect" behaviour for this use case, I also had to disable the TTWU_QUEUE sched feature. Suppose during the wakeup above we are working through the work queue and have placed tasks 3 and 2, and are about to place task 1: v CPU v ->time-> -------------- 0 (big) 11111 /333 3 -------------- 1 (big) 22222 /444|4 -------------- 2 (LITTLE) 333333/ 2 -------------- 3 (LITTLE) 444444/ <- Task 1 should go here -------------- If TTWU_QUEUE is enabled, we will not yet have enqueued task 2 (having instead sent a reschedule IPI) or attached its load to CPU 2. So we are likely to also place task 1 on cpu 2. Disabling TTWU_QUEUE means that we enqueue task 2 before placing task 1, solving this issue. TTWU_QUEUE is there to minimise rq lock contention, and I guess that this contention is less of an issue on big.LITTLE systems since they have relatively few CPUs, which suggests the trade-off makes sense here. Signed-off-by: Brendan Jackman Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Josef Bacik Cc: Joel Fernandes Cc: Mike Galbraith Cc: Matt Fleming ( - Applied from https://patchwork.kernel.org/patch/9895261/ - Fixed trivial conflict in kernel/sched/core.c - Fixed select_task_rq_idle, now in kernel/sched/idle.c - Fixed trivial conflict in select_task_rq_fair ) Signed-off-by: Quentin Perret Change-Id: I3cfc4bf48c3d7feef969db4d22449f4fbb4f795d --- include/linux/sched/wake_q.h | 2 ++ kernel/sched/core.c | 34 +++++++++++++++++++++++----------- kernel/sched/deadline.c | 3 ++- kernel/sched/fair.c | 15 ++++++++++----- kernel/sched/idle.c | 3 ++- kernel/sched/rt.c | 3 ++- kernel/sched/sched.h | 3 ++- kernel/sched/stop_task.c | 3 ++- 8 files changed, 45 insertions(+), 21 deletions(-) diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 10b19a192b2d..a3661e93da6f 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -34,6 +34,7 @@ struct wake_q_head { struct wake_q_node *first; struct wake_q_node **lastp; + int count; }; #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) @@ -45,6 +46,7 @@ static inline void wake_q_init(struct wake_q_head *head) { head->first = WAKE_Q_TAIL; head->lastp = &head->first; + head->count = 0; } extern void wake_q_add(struct wake_q_head *head, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ad97f3ba5ec5..a30556569187 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -412,6 +412,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) return; + head->count++; + get_task_struct(task); /* @@ -421,6 +423,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) head->lastp = &node->next; } +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint); + void wake_up_q(struct wake_q_head *head) { struct wake_q_node *node = head->first; @@ -435,10 +441,10 @@ void wake_up_q(struct wake_q_head *head) task->wake_q.next = NULL; /* - * wake_up_process() executes a full barrier, which pairs with + * try_to_wake_up() executes a full barrier, which pairs with * the queueing in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + try_to_wake_up(task, TASK_NORMAL, 0, head->count); put_task_struct(task); } } @@ -1523,12 +1529,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags, + int sibling_count_hint) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags, + sibling_count_hint); else cpu = cpumask_any(&p->cpus_allowed); @@ -1931,6 +1939,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * @p: the thread to be awakened * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) + * @sibling_count_hint: A hint at the number of threads that are being woken up + * in this event. * * If (@state & @p->state) @p->state = TASK_RUNNING. * @@ -1946,7 +1956,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * %false otherwise. */ static int -try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags, + int sibling_count_hint) { unsigned long flags; int cpu, success = 0; @@ -2033,7 +2044,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) atomic_dec(&task_rq(p)->nr_iowait); } - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags, + sibling_count_hint); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -2120,13 +2132,13 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_NORMAL, 0); + return try_to_wake_up(p, TASK_NORMAL, 0, 1); } EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) { - return try_to_wake_up(p, state, 0); + return try_to_wake_up(p, state, 0, 1); } /* @@ -2408,7 +2420,7 @@ void wake_up_new_task(struct task_struct *p) * as we're not fully set-up yet. */ p->recent_used_cpu = task_cpu(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -2947,7 +2959,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3708,7 +3720,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, void *key) { - return try_to_wake_up(curr->private, mode, wake_flags); + return try_to_wake_up(curr->private, mode, wake_flags, 1); } EXPORT_SYMBOL(default_wake_function); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 91e4202b0634..921685482cc0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1567,7 +1567,8 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e52abce08c76..61764a9daf04 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5656,15 +5656,18 @@ static void record_wakee(struct task_struct *p) * whatever is irrelevant, spread criteria is apparent partner count exceeds * socket size. */ -static int wake_wide(struct task_struct *p) +static int wake_wide(struct task_struct *p, int sibling_count_hint) { unsigned int master = current->wakee_flips; unsigned int slave = p->wakee_flips; - int factor = this_cpu_read(sd_llc_size); + int llc_size = this_cpu_read(sd_llc_size); + + if (sibling_count_hint >= llc_size) + return 1; if (master < slave) swap(master, slave); - if (slave < factor || master < slave * factor) + if (slave < llc_size || master < slave * llc_size) return 0; return 1; } @@ -7048,7 +7051,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sy * preempt must be disabled. */ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, + int sibling_count_hint) { struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); @@ -7066,7 +7070,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = prev_cpu; } - want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && + want_affine = !wake_wide(p, sibling_count_hint) && + !wake_cap(p, cpu, prev_cpu) && cpumask_test_cpu(cpu, &p->cpus_allowed); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2d3e2804c08b..2c8719f1de0a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -375,7 +375,8 @@ void cpu_startup_entry(enum cpuhp_state state) #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2e2955a8cf8f..0be707d9c2db 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1386,7 +1386,8 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { struct task_struct *curr; struct rq *rq; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2fd3ad74c95b..c004d8ce5990 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1557,7 +1557,8 @@ struct sched_class { void (*put_prev_task)(struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags, + int subling_count_hint); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c183b790ca54..6446d6130c5d 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,8 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags, + int sibling_count_hint) { return task_cpu(p); /* stop tasks as never migrate */ } From 1d00c33d8bd9c57a4cf6197390285ed364898ae5 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 19 Dec 2017 19:32:02 +0000 Subject: [PATCH 50/75] ANDROID: sched: fair: Bypass energy-aware wakeup for prefer-idle tasks Use the upstream slow path to find an idle cpu for prefer-idle tasks. This slow-path is actually faster than the EAS path we are currently going through (compute_energy()) which is really slow. No performance degradation is seen with this and it reduces the delta quite a bit between upstream and out of tree code. It's not clear yet if using the mainline slow path task placement when a task has the schedtune attribute prefer_idle=1 is the right thing to do for products. Put the option to disable this behind a sched feature so we can try out both options. Signed-off-by: Joel Fernandes (refactored for 4.14 version) Signed-off-by: Chris Redpath (cherry picked from commit c0ff131c88f68e4985793663144b6f9cf77be9d3) [ - Refactored for 4.17 version - Adjusted the commit header to the new function names ] Signed-off-by: Quentin Perret Change-Id: Icf762a101c92c0e3f9e61df0370247fa15455581 --- kernel/sched/fair.c | 4 ++++ kernel/sched/features.h | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61764a9daf04..a8b48ab4b4e1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7064,6 +7064,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f record_wakee(p); if (static_branch_unlikely(&sched_energy_present)) { + if (schedtune_prefer_idle(p) && !sched_feat(EAS_PREFER_IDLE) && !sync) + goto sd_loop; + new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync); if (new_cpu >= 0) return new_cpu; @@ -7075,6 +7078,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f cpumask_test_cpu(cpu, &p->cpus_allowed); } +sd_loop: rcu_read_lock(); for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 531ebc5c266c..0232a4570b78 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -95,3 +95,12 @@ SCHED_FEAT(UTIL_EST, true) * Fast pre-selection of CPU candidates for EAS. */ SCHED_FEAT(FIND_BEST_TARGET, true) + +/* + * Energy aware scheduling algorithm choices: + * EAS_PREFER_IDLE + * Direct tasks in a schedtune.prefer_idle=1 group through + * the EAS path for wakeup task placement. Otherwise, put + * those tasks through the mainline slow path. + */ +SCHED_FEAT(EAS_PREFER_IDLE, true) From c06013206c840b1c3f0ec45acb8a54fc06ea0be5 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 24 May 2018 17:35:02 +0100 Subject: [PATCH 51/75] ANDROID: sched/fair: Bypass energy computation for prefer_idle tasks If the only pre-selected candidate CPU in find_energy_efficient_cpu() happens to be prev_cpu, there is not point in computing the system energy since we have nothing to compare it against, so we currently bail out early. The same logic can be extended when prefer_idle tasks are routed in the energy-aware wake-up path: if the only candidate is idle for a prefer_idle task, just select it no matter what the energy impact is. That should help speeding-up wake-ups of prefer_idle tasks, at least when find_best_target() is used for them. Signed-off-by: Quentin Perret Change-Id: Idd0e387e4a766061cc05d2584df3a31e4dabfd09 --- kernel/sched/fair.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a8b48ab4b4e1..31f3c747916a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6997,11 +6997,19 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sy else select_max_spare_cap_cpus(sd, candidates, pd, p); - /* Bail out if there is no candidate, or if the only one is prev_cpu */ + /* Bail out if no candidate was found. */ weight = cpumask_weight(candidates); - if (!weight || (weight == 1 && cpumask_first(candidates) == prev_cpu)) + if (!weight) goto unlock; + /* If there is only one sensible candidate, select it now. */ + cpu = cpumask_first(candidates); + if (weight == 1 && ((schedtune_prefer_idle(p) && idle_cpu(cpu)) || + (cpu == prev_cpu))) { + best_energy_cpu = cpu; + goto unlock; + } + if (cpumask_test_cpu(prev_cpu, &p->cpus_allowed)) prev_energy = best_energy = compute_energy(p, prev_cpu, pd); else From 6c401bf9b0cabf1c4fa4a70237b16e5349e0918a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 15:48:06 +0100 Subject: [PATCH 52/75] ANDROID: sched/fair: add arch scaling function for max frequency capping To be able to scale the cpu capacity by this factor introduce a call to the new arch scaling function arch_scale_max_freq_capacity() in update_cpu_capacity() and provide a default implementation which returns SCHED_CAPACITY_SCALE. Another subsystem (e.g. cpufreq) or architectural or platform specific code can overwrite this default implementation, exactly as for frequency and cpu invariance. It has to be enabled by the arch by defining arch_scale_max_freq_capacity to the actual implementation. Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann ( Fixed conflict with scaling against the PELT-based scale_rt_capacity ) Signed-off-by: Quentin Perret Change-Id: I770a8b1f4f7340e9e314f71c64a765bf880f4b4d --- kernel/sched/fair.c | 12 ++++++++---- kernel/sched/sched.h | 9 +++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 31f3c747916a..a7ebc2ebaecf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8357,10 +8357,9 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) +static unsigned long scale_rt_capacity(int cpu, unsigned long max) { struct rq *rq = cpu_rq(cpu); - unsigned long max = arch_scale_cpu_capacity(sd, cpu); unsigned long used, free; unsigned long irq; @@ -8382,10 +8381,15 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity = scale_rt_capacity(sd, cpu); + unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); + cpu_rq(cpu)->cpu_capacity_orig = capacity; + + capacity *= arch_scale_max_freq_capacity(sd, cpu); + capacity >>= SCHED_CAPACITY_SHIFT; + + capacity = scale_rt_capacity(cpu, capacity); if (!capacity) capacity = 1; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c004d8ce5990..fde3dc0629c9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1809,6 +1809,15 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifndef arch_scale_max_freq_capacity +struct sched_domain; +static __always_inline +unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); From c4ed8aa44d2449778149d98bc445d875abae5c9c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 16:52:33 +0100 Subject: [PATCH 53/75] ANDROID: implement max frequency capping Implements the Max Frequency Capping Engine (MFCE) getter function topology_get_max_freq_scale() to provide the scheduler with a maximum frequency scaling correction factor for more accurate cpu capacity handling by being able to deal with max frequency capping. This scaling factor describes the influence of running a cpu with a current maximum frequency (policy) lower than the maximum possible frequency (cpuinfo). The factor is: policy_max_freq(cpu) << SCHED_CAPACITY_SHIFT / cpuinfo_max_freq(cpu) It also implements the MFCE setter function arch_set_max_freq_scale() which is called from cpufreq_set_policy(). Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann [Trivial cherry-pick issue in cpufreq.c] Signed-off-by: Quentin Perret Change-Id: I59e52861ee260755ab0518fe1f7183a2e4e3d0fc --- drivers/base/arch_topology.c | 25 ++++++++++++++++++++++++- drivers/cpufreq/cpufreq.c | 8 ++++++++ include/linux/arch_topology.h | 8 ++++++++ include/linux/cpufreq.h | 2 ++ 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index edfcf8d982e4..b5f61f2840d0 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -18,6 +18,8 @@ #include DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; +DEFINE_PER_CPU(unsigned long, max_cpu_freq); +DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE; void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq) @@ -27,8 +29,29 @@ void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; - for_each_cpu(i, cpus) + for_each_cpu(i, cpus) { per_cpu(freq_scale, i) = scale; + per_cpu(max_cpu_freq, i) = max_freq; + } +} + +void arch_set_max_freq_scale(struct cpumask *cpus, + unsigned long policy_max_freq) +{ + unsigned long scale, max_freq; + int cpu = cpumask_first(cpus); + + if (cpu > nr_cpu_ids) + return; + + max_freq = per_cpu(max_cpu_freq, cpu); + if (!max_freq) + return; + + scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq; + + for_each_cpu(cpu, cpus) + per_cpu(max_freq_scale, cpu) = scale; } static DEFINE_MUTEX(cpu_scale_mutex); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index bde9606c84c1..d4cf9dcadc55 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -159,6 +159,12 @@ __weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, } EXPORT_SYMBOL_GPL(arch_set_freq_scale); +__weak void arch_set_max_freq_scale(struct cpumask *cpus, + unsigned long policy_max_freq) +{ +} +EXPORT_SYMBOL_GPL(arch_set_max_freq_scale); + /* * This is a generic cpufreq init() routine which can be used by cpufreq * drivers of SMP systems. It will do following: @@ -2244,6 +2250,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, policy->max = new_policy->max; trace_cpu_frequency_limits(policy); + arch_set_max_freq_scale(policy->cpus, policy->max); + policy->cached_target_freq = UINT_MAX; pr_debug("new min and max freqs are %u - %u kHz\n", diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d9bdc1a7f4e7..7e5a33ea8df0 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -33,4 +33,12 @@ unsigned long topology_get_freq_scale(int cpu) return per_cpu(freq_scale, cpu); } +DECLARE_PER_CPU(unsigned long, max_freq_scale); + +static inline +unsigned long topology_get_max_freq_scale(struct sched_domain *sd, int cpu) +{ + return per_cpu(max_freq_scale, cpu); +} + #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 882a9b9e34bc..eb7b26f53f55 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -955,6 +955,8 @@ extern unsigned int arch_freq_get_on_cpu(int cpu); extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq); +extern void arch_set_max_freq_scale(struct cpumask *cpus, + unsigned long policy_max_freq); /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; From 9ac9491eb2cffaa9f51f227e04db18f80da4f7e2 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 16:54:16 +0100 Subject: [PATCH 54/75] ANDROID: arm64: enable max frequency capping Defines arch_scale_max_freq_capacity() to use the topology driver scale function. Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann Signed-off-by: Quentin Perret Change-Id: If7565747ec862e42ac55196240522ef8d22ca67d --- arch/arm64/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 0524f2438649..8e0a96d71bd6 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -42,6 +42,9 @@ int pcibus_to_node(struct pci_bus *bus); /* Replace task scheduler's default frequency-invariant accounting */ #define arch_scale_freq_capacity topology_get_freq_scale +/* Replace task scheduler's default max-frequency-invariant accounting */ +#define arch_scale_max_freq_capacity topology_get_max_freq_scale + /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale From 95247bf7cce7e3434c831096a0b33e64d4052c02 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 10 May 2018 16:58:04 +0100 Subject: [PATCH 55/75] ANDROID: arm: enable max frequency capping Defines arch_scale_max_freq_capacity() to use the topology driver scale function. Signed-off-by: Ionela Voinescu Signed-off-by: Dietmar Eggemann Signed-off-by: Quentin Perret Change-Id: I79f444399ea3b2948364fde80ccee52a9ece5b9a --- arch/arm/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 2a786f54d8b8..201dc2011c16 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -30,6 +30,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu); /* Replace task scheduler's default frequency-invariant accounting */ #define arch_scale_freq_capacity topology_get_freq_scale +/* Replace task scheduler's default max-frequency-invariant accounting */ +#define arch_scale_max_freq_capacity topology_get_max_freq_scale + /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale From 9c83d8435a0b07d16b0bda669b13a9d87fef3ad3 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Sat, 26 Sep 2015 18:19:54 +0100 Subject: [PATCH 56/75] ANDROID: sched: Update max cpu capacity in case of max frequency constraints Wakeup balancing uses cpu capacity awareness and needs to know the system-wide maximum cpu capacity. Patch "sched: Store system-wide maximum cpu capacity in root domain" finds the system-wide maximum cpu capacity during scheduler domain hierarchy setup. This is sufficient as long as maximum frequency invariance is not enabled. If it is enabled, the system-wide maximum cpu capacity can change between scheduler domain hierarchy setups due to frequency capping. The cpu capacity is changed in update_cpu_capacity() which is called in load balance on the lowest scheduler domain hierarchy level. To be able to know if a change in cpu capacity for a certain cpu also has an effect on the system-wide maximum cpu capacity it is normally necessary to iterate over all cpus. This would be way too costly. That's why this patch follows a different approach. The unsigned long max_cpu_capacity value in struct root_domain is replaced with a struct max_cpu_capacity, containing value (the max_cpu_capacity) and cpu (the cpu index of the cpu providing the maximum cpu_capacity). Changes to the system-wide maximum cpu capacity and the cpu index are made if: 1 System-wide maximum cpu capacity < cpu capacity 2 System-wide maximum cpu capacity > cpu capacity and cpu index == cpu There are no changes to the system-wide maximum cpu capacity in all other cases. Atomic read and write access to the pair (max_cpu_capacity.val, max_cpu_capacity.cpu) is enforced by max_cpu_capacity.lock. The access to max_cpu_capacity.val in task_fits_max() is still performed without taking the max_cpu_capacity.lock. The code to set max cpu capacity in build_sched_domains() has been removed because the whole functionality is now provided by update_cpu_capacity() instead. This approach can introduce errors temporarily, e.g. in case the cpu currently providing the max cpu capacity has its cpu capacity lowered due to frequency capping and calls update_cpu_capacity() before any cpu which might provide the max cpu now. Signed-off-by: Ionela Voinescu * Signed-off-by: Dietmar Eggemann ( Fixed cherry-pick issues ) Signed-off-by: Quentin Perret Change-Id: Idaa7a16723001e222e476de34df332558e48dd13 --- kernel/sched/fair.c | 31 ++++++++++++++++++++++++++++++- kernel/sched/sched.h | 10 +++++++++- kernel/sched/topology.c | 15 +++------------ 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a7ebc2ebaecf..fad840752bdb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6787,7 +6787,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) return 0; min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); - max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; + max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val; /* Minimum capacity is close to max, no need to abort wake_affine */ if (max_cap - min_cap < max_cap >> 3) @@ -8379,16 +8379,45 @@ static unsigned long scale_rt_capacity(int cpu, unsigned long max) return scale_irq_capacity(free, irq, max); } +void init_max_cpu_capacity(struct max_cpu_capacity *mcc) { + raw_spin_lock_init(&mcc->lock); + mcc->val = 0; + mcc->cpu = -1; +} + static void update_cpu_capacity(struct sched_domain *sd, int cpu) { unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); struct sched_group *sdg = sd->groups; + struct max_cpu_capacity *mcc; + unsigned long max_capacity; + int max_cap_cpu; + unsigned long flags; cpu_rq(cpu)->cpu_capacity_orig = capacity; capacity *= arch_scale_max_freq_capacity(sd, cpu); capacity >>= SCHED_CAPACITY_SHIFT; + mcc = &cpu_rq(cpu)->rd->max_cpu_capacity; + + raw_spin_lock_irqsave(&mcc->lock, flags); + max_capacity = mcc->val; + max_cap_cpu = mcc->cpu; + + if ((max_capacity > capacity && max_cap_cpu == cpu) || + (max_capacity < capacity)) { + mcc->val = capacity; + mcc->cpu = cpu; +#ifdef CONFIG_SCHED_DEBUG + raw_spin_unlock_irqrestore(&mcc->lock, flags); + pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); + goto skip_unlock; +#endif + } + raw_spin_unlock_irqrestore(&mcc->lock, flags); + +skip_unlock: __attribute__ ((unused)); capacity = scale_rt_capacity(cpu, capacity); if (!capacity) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fde3dc0629c9..2fd06be206ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -711,6 +711,12 @@ struct perf_domain { struct rcu_head rcu; }; +struct max_cpu_capacity { + raw_spinlock_t lock; + unsigned long val; + int cpu; +}; + /* Scheduling group status flags */ #define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ #define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ @@ -769,7 +775,8 @@ struct root_domain { cpumask_var_t rto_mask; struct cpupri cpupri; - unsigned long max_cpu_capacity; + /* Maximum cpu capacity in the system. */ + struct max_cpu_capacity max_cpu_capacity; /* * NULL-terminated list of performance domains intersecting with the @@ -782,6 +789,7 @@ extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void init_defrootdomain(void); +extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index db7dbeeeef49..63ee0513af3f 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -524,6 +524,9 @@ static int init_rootdomain(struct root_domain *rd) if (cpupri_init(&rd->cpupri) != 0) goto free_cpudl; + + init_max_cpu_capacity(&rd->max_cpu_capacity); + return 0; free_cpudl: @@ -1950,7 +1953,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att enum s_alloc alloc_state; struct sched_domain *sd; struct s_data d; - struct rq *rq = NULL; int i, ret = -ENOMEM; struct sched_domain_topology_level *tl_asym; bool has_asym = false; @@ -2013,13 +2015,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { - rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); - - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); - cpu_attach_domain(sd, d.rd, i); } rcu_read_unlock(); @@ -2027,11 +2023,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (has_asym) static_branch_enable_cpuslocked(&sched_asym_cpucapacity); - if (rq && sched_debug_enabled) { - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); - } - ret = 0; error: __free_domain_allocs(&d, alloc_state, cpu_map); From ad271f2d0d7e222b5720931ffc73c4e45cc06c71 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 10 Jul 2018 15:44:09 +0100 Subject: [PATCH 57/75] ANDROID: cpufreq/schedutil: Select frequency using util_avg for RT Schedutil always requests max frequency whenever a RT task is running. Now that we have a better estimate of the utilization of RT runqueues, it is possible to make a less conservative decision and scale frequency according to the needs of the RT tasks. To do so, protect the RT-go-to-max code with a new sched_feature. The sched_feature is disabled by default, hence favoring energy savings as required in mobile environments. Signed-off-by: Quentin Perret Change-Id: Ic9f01c8703d4f843addaa0d684012a422fe9f3b8 --- kernel/sched/cpufreq_schedutil.c | 3 ++- kernel/sched/features.h | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 20ad939e143d..56881d416449 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -204,7 +204,8 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, struct rq *rq = cpu_rq(cpu); unsigned long util, irq; - if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) + if (sched_feat(SUGOV_RT_MAX_FREQ) && type == FREQUENCY_UTIL && + rt_rq_is_runnable(&rq->rt)) return max; /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 0232a4570b78..5590f47a6e78 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -104,3 +104,8 @@ SCHED_FEAT(FIND_BEST_TARGET, true) * those tasks through the mainline slow path. */ SCHED_FEAT(EAS_PREFER_IDLE, true) + +/* + * Request max frequency from schedutil whenever a RT task is running. + */ +SCHED_FEAT(SUGOV_RT_MAX_FREQ, false) From 61ac7019606374cbc541781dec72bc9ba3805e14 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Thu, 17 Nov 2016 10:48:45 +0530 Subject: [PATCH 58/75] ANDROID: cpufreq/schedutil: add up/down frequency transition rate limits The rate-limit tunable in the schedutil governor applies to transitions to both lower and higher frequencies. On several platforms it is not the ideal tunable though, as it is difficult to get best power/performance figures using the same limit in both directions. It is common on mobile platforms with demanding user interfaces to want to increase frequency rapidly for example but decrease slowly. One of the example can be a case where we have short busy periods followed by similar or longer idle periods. If we keep the rate-limit high enough, we will not go to higher frequencies soon enough. On the other hand, if we keep it too low, we will have too many frequency transitions, as we will always reduce the frequency after the busy period. It would be very useful if we can set low rate-limit while increasing the frequency (so that we can respond to the short busy periods quickly) and high rate-limit while decreasing frequency (so that we don't reduce the frequency immediately after the short busy period and that may avoid frequency transitions before the next busy period). Implement separate up/down transition rate limits. Note that the governor avoids frequency recalculations for a period equal to minimum of up and down rate-limit. A global mutex is also defined to protect updates to min_rate_limit_us via two separate sysfs files. Note that this wouldn't change behavior of the schedutil governor for the platforms which wish to keep same values for both up and down rate limits. This is tested with the rt-app [1] on ARM Exynos, dual A15 processor platform. Testcase: Run a SCHED_OTHER thread on CPU0 which will emulate work-load for X ms of busy period out of the total period of Y ms, i.e. Y - X ms of idle period. The values of X/Y taken were: 20/40, 20/50, 20/70, i.e idle periods of 20, 30 and 50 ms respectively. These were tested against values of up/down rate limits as: 10/10 ms and 10/40 ms. For every test we noticed a performance increase of 5-10% with the schedutil governor, which was very much expected. [Viresh]: Simplified user interface and introduced min_rate_limit_us + mutex, rewrote commit log and included test results. [1] https://github.com/scheduler-tools/rt-app/ Signed-off-by: Steve Muckle Signed-off-by: Viresh Kumar (applied from https://marc.info/?l=linux-kernel&m=147936011103832&w=2) [trivial adaptations] Signed-off-by: Juri Lelli [updated rate limiting & fixed conflicts] Signed-off-by: Chris Redpath (cherry picked from commit 50c26fdb74563ec0cb4a83373d42667f4e83a23e) [Trivial cherry-pick conflicts] Signed-off-by: Quentin Perret Change-Id: I18720a83855b196b8e21dcdc8deae79131635b84 --- kernel/sched/cpufreq_schedutil.c | 103 ++++++++++++++++++++++++++----- 1 file changed, 89 insertions(+), 14 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 56881d416449..b01cd99ffbd0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -18,7 +18,8 @@ struct sugov_tunables { struct gov_attr_set attr_set; - unsigned int rate_limit_us; + unsigned int up_rate_limit_us; + unsigned int down_rate_limit_us; }; struct sugov_policy { @@ -29,7 +30,9 @@ struct sugov_policy { raw_spinlock_t update_lock; /* For shared policies */ u64 last_freq_update_time; - s64 freq_update_delay_ns; + s64 min_rate_limit_ns; + s64 up_rate_delay_ns; + s64 down_rate_delay_ns; unsigned int next_freq; unsigned int cached_raw_freq; @@ -94,9 +97,32 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) if (unlikely(sg_policy->need_freq_update)) return true; + /* No need to recalculate next freq for min_rate_limit_us + * at least. However we might still decide to further rate + * limit once frequency change direction is decided, according + * to the separate rate limits. + */ + + delta_ns = time - sg_policy->last_freq_update_time; + return delta_ns >= sg_policy->min_rate_limit_ns; +} + +static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time, + unsigned int next_freq) +{ + s64 delta_ns; + delta_ns = time - sg_policy->last_freq_update_time; - return delta_ns >= sg_policy->freq_update_delay_ns; + if (next_freq > sg_policy->next_freq && + delta_ns < sg_policy->up_rate_delay_ns) + return true; + + if (next_freq < sg_policy->next_freq && + delta_ns < sg_policy->down_rate_delay_ns) + return true; + + return false; } static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, @@ -105,6 +131,9 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, if (sg_policy->next_freq == next_freq) return false; + if (sugov_up_down_rate_limit(sg_policy, time, next_freq)) + return false; + sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -593,15 +622,32 @@ static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr return container_of(attr_set, struct sugov_tunables, attr_set); } -static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +static DEFINE_MUTEX(min_rate_lock); + +static void update_min_rate_limit_ns(struct sugov_policy *sg_policy) +{ + mutex_lock(&min_rate_lock); + sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns, + sg_policy->down_rate_delay_ns); + mutex_unlock(&min_rate_lock); +} + +static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); - return sprintf(buf, "%u\n", tunables->rate_limit_us); + return sprintf(buf, "%u\n", tunables->up_rate_limit_us); } -static ssize_t -rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) +static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + + return sprintf(buf, "%u\n", tunables->down_rate_limit_us); +} + +static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) { struct sugov_tunables *tunables = to_sugov_tunables(attr_set); struct sugov_policy *sg_policy; @@ -610,18 +656,42 @@ rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count if (kstrtouint(buf, 10, &rate_limit_us)) return -EINVAL; - tunables->rate_limit_us = rate_limit_us; + tunables->up_rate_limit_us = rate_limit_us; - list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) - sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC; + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_ns(sg_policy); + } return count; } -static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us); +static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set, + const char *buf, size_t count) +{ + struct sugov_tunables *tunables = to_sugov_tunables(attr_set); + struct sugov_policy *sg_policy; + unsigned int rate_limit_us; + + if (kstrtouint(buf, 10, &rate_limit_us)) + return -EINVAL; + + tunables->down_rate_limit_us = rate_limit_us; + + list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) { + sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_ns(sg_policy); + } + + return count; +} + +static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us); +static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us); static struct attribute *sugov_attributes[] = { - &rate_limit_us.attr, + &up_rate_limit_us.attr, + &down_rate_limit_us.attr, NULL }; @@ -777,7 +847,8 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy); + tunables->up_rate_limit_us = cpufreq_policy_transition_delay_us(policy); + tunables->down_rate_limit_us = cpufreq_policy_transition_delay_us(policy); policy->governor_data = sg_policy; sg_policy->tunables = tunables; @@ -835,7 +906,11 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; - sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; + sg_policy->up_rate_delay_ns = + sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC; + sg_policy->down_rate_delay_ns = + sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC; + update_min_rate_limit_ns(sg_policy); sg_policy->last_freq_update_time = 0; sg_policy->next_freq = 0; sg_policy->work_in_progress = false; From a5ece57c96f52d541def647d6b72929fa2a1b14e Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 1 Jun 2018 20:34:10 +0100 Subject: [PATCH 59/75] ANDROID: sched/fair: Attempt to improve throughput for asym cap systems In some systems the capacity and group weights line up to defeat all the small imbalance correction conditions in fix_small_imbalance, which can cause bad task placement. Add a new condition if the existing code can't see anything to fix: If we have asymmetric capacity, and there are more tasks than CPUs in the busiest group *and* there are less tasks than CPUs in the local group then we try to pull something. There could be transient small tasks which prevent this from working, but on the whole it is beneficial for those systems with inconvenient capacity/cluster size relationships. Signed-off-by: Chris Redpath Change-Id: Icf81cde215c082a61f816534b7990ccb70aee409 --- kernel/sched/fair.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fad840752bdb..48ba40686331 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9053,7 +9053,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) capa_move /= SCHED_CAPACITY_SCALE; /* Move if we gain throughput */ - if (capa_move > capa_now) + if (capa_move > capa_now) { + env->imbalance = busiest->load_per_task; + return; + } + + /* We can't see throughput improvement with the load-based + * method, but it is possible depending upon group size and + * capacity range that there might still be an underutilized + * cpu available in an asymmetric capacity system. Do one last + * check just in case. + */ + if (env->sd->flags & SD_ASYM_CPUCAPACITY && + busiest->group_type == group_overloaded && + busiest->sum_nr_running > busiest->group_weight && + local->sum_nr_running < local->group_weight && + local->group_capacity < busiest->group_capacity) env->imbalance = busiest->load_per_task; } From de526a3400c6a566d266ef576d3eeb8fe3184ec7 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 5 Jun 2018 11:47:57 +0100 Subject: [PATCH 60/75] ANDROID: sched/fair: Don't balance misfits if it would overload local group When load balancing in a system with misfit tasks present, if we always pull a misfit task to the local group this can lead to pulling a running task from a smaller capacity CPUs to a bigger CPU which is busy. In this situation, the pulled task is likely not to get a chance to run before an idle balance on another small CPU pulls it back. This penalises the pulled task as it is stopped for a short amount of time and then likely relocated to a different CPU (since the original CPU just did a NEWLY_IDLE balance and reset the periodic interval). If we only do this unconditionally for NEWLY_IDLE balance, we can be sure that any tasks and load which are present on the local group are related to short-running tasks which we are happy to displace for a longer running task in a system with misfit tasks present. However, other balance types should only pull a task if we think that the local group is underutilized - checking the number of tasks gives us a conservative estimate here since if they were short tasks we would have been doing NEWLY_IDLE balances instead. Signed-off-by: Chris Redpath Change-Id: I710add1ab1139482620b6addc8370ad194791beb --- kernel/sched/fair.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 48ba40686331..6761e0aec0e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9137,8 +9137,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; - /* Boost imbalance to allow misfit task to be balanced. */ - if (busiest->group_type == group_misfit_task) { + /* Boost imbalance to allow misfit task to be balanced. + * Always do this if we are doing a NEWLY_IDLE balance + * on the assumption that any tasks we have must not be + * long-running (and hence we cannot rely upon load). + * However if we are not idle, we should assume the tasks + * we have are longer running and not override load-based + * calculations above unless we are sure that the local + * group is underutilized. + */ + if (busiest->group_type == group_misfit_task && + (env->idle == CPU_NEWLY_IDLE || + local->sum_nr_running < local->group_weight)) { env->imbalance = max_t(long, env->imbalance, busiest->group_misfit_task_load); } From df6cd51a542ab5e20e42d94b5b960b9b96e845a1 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Tue, 5 Jun 2018 12:21:33 +0100 Subject: [PATCH 61/75] ANDROID: sched/fair: Also do misfit in overloaded groups If we can classify the group as overloaded, that overrides any classification as misfit but we may still have misfit tasks present. Check the rq we're looking at to see if this is the case. Signed-off-by: Chris Redpath [Removed stray reference to rq_has_misfit] Signed-off-by: Valentin Schneider Change-Id: Ida8eb66aa625e34de3fe2ee1b0dd8a78926273d8 --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6761e0aec0e5..226eec4d6910 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9419,6 +9419,10 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load) + return 1; + + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } From 31d01ca5d412864e5a7f3be8cd98e23d00538e88 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 17 Mar 2017 19:09:03 +0000 Subject: [PATCH 62/75] ANDROID: sched/autogroup: Define autogroup_path() for !CONFIG_SCHED_DEBUG Define autogroup_path() even in the !CONFIG_SCHED_DEBUG case. If CONFIG_SCHED_AUTOGROUP is enabled the path of an autogroup has to be available to be printed in the load tracking trace events provided by this patch-stack regardless whether CONFIG_SCHED_DEBUG is set or not. Signed-off-by: Dietmar Eggemann Cc: Peter Zijlstra Cc: Ingo Molnar Change-Id: Iecf5466fc837f428ee545ddabe70024c152ff38d --- kernel/sched/autogroup.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2d4ff5353ded..2067080bb235 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -259,7 +259,6 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) } #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_SCHED_DEBUG int autogroup_path(struct task_group *tg, char *buf, int buflen) { if (!task_group_is_autogroup(tg)) @@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } -#endif From b540360a9cb119b84c978e8904c2d62a37ba5acc Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 17 Mar 2017 20:27:06 +0000 Subject: [PATCH 63/75] ANDROID: sched/events: Introduce cfs_rq load tracking trace event The following trace event keys are mapped to: (1) load : cfs_rq->avg.load_avg (2) rbl_load : cfs_rq->avg.runnable_load_avg (2) util : cfs_rq->avg.util_avg To let this trace event work for configurations w/ and w/o group scheduling support for cfs (CONFIG_FAIR_GROUP_SCHED) the following special handling is necessary for a non-existent key=value pair: path = "(null)" : In case of !CONFIG_FAIR_GROUP_SCHED. The following list shows examples of the key=value pairs in different configurations for: (1) a root task_group: cpu=4 path=/ load=6 rbl_load=6 util=331 (2) a task_group: cpu=1 path=/tg1/tg11/tg111 load=538 rbl_load=538 util=522 (3) an autogroup: cpu=3 path=/autogroup-18 load=997 rbl_load=997 util=517 (4) w/o CONFIG_FAIR_GROUP_SCHED: cpu=0 path=(null) load=314 rbl_load=314 util=289 The trace event is only defined for CONFIG_SMP. The helper function __trace_sched_path() can be used to get the length parameter of the dynamic array (path == NULL) and to copy the path into it (path != NULL). Signed-off-by: Dietmar Eggemann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt [ Fixed issues related to the new pelt.c file ] Signed-off-by: Quentin Perret Change-Id: I1107044c52b74ecb3df69f3a45c1e530f0e59b1b --- include/trace/events/sched.h | 66 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 6 ++++ kernel/sched/pelt.c | 5 +++ 3 files changed, 77 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 0be866c91f62..8c79c661772f 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -572,6 +572,72 @@ TRACE_EVENT(sched_wake_idle_without_ipi, TP_printk("cpu=%d", __entry->cpu) ); + +#ifdef CONFIG_SMP +#ifdef CREATE_TRACE_POINTS +static inline +int __trace_sched_cpu(struct cfs_rq *cfs_rq) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq = cfs_rq->rq; +#else + struct rq *rq = container_of(cfs_rq, struct rq, cfs); +#endif + return cpu_of(rq); +} + +static inline +int __trace_sched_path(struct cfs_rq *cfs_rq, char *path, int len) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + int l = path ? len : 0; + + if (task_group_is_autogroup(cfs_rq->tg)) + return autogroup_path(cfs_rq->tg, path, l) + 1; + else + return cgroup_path(cfs_rq->tg->css.cgroup, path, l) + 1; +#else + if (path) + strcpy(path, "(null)"); + + return strlen("(null)"); +#endif +} + +#endif /* CREATE_TRACE_POINTS */ + +/* + * Tracepoint for cfs_rq load tracking: + */ +TRACE_EVENT(sched_load_cfs_rq, + + TP_PROTO(struct cfs_rq *cfs_rq), + + TP_ARGS(cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __dynamic_array(char, path, + __trace_sched_path(cfs_rq, NULL, 0) ) + __field( unsigned long, load ) + __field( unsigned long, rbl_load ) + __field( unsigned long, util ) + ), + + TP_fast_assign( + __entry->cpu = __trace_sched_cpu(cfs_rq); + __trace_sched_path(cfs_rq, __get_dynamic_array(path), + __get_dynamic_array_len(path)); + __entry->load = cfs_rq->avg.load_avg; + __entry->rbl_load = cfs_rq->avg.runnable_load_avg; + __entry->util = cfs_rq->avg.util_avg; + ), + + TP_printk("cpu=%d path=%s load=%lu rbl_load=%lu util=%lu", + __entry->cpu, __get_str(path), __entry->load, + __entry->rbl_load,__entry->util) +); +#endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 226eec4d6910..eeb977be8c3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3338,6 +3338,8 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_util(cfs_rq, se, gcfs_rq); update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); + trace_sched_load_cfs_rq(cfs_rq); + return 1; } @@ -3490,6 +3492,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); cfs_rq_util_change(cfs_rq, flags); + + trace_sched_load_cfs_rq(cfs_rq); } /** @@ -3509,6 +3513,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); cfs_rq_util_change(cfs_rq, 0); + + trace_sched_load_cfs_rq(cfs_rq); } /* diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 35475c0c5419..f042ee0f7b08 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -29,6 +29,8 @@ #include "sched-pelt.h" #include "pelt.h" +#include + /* * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) @@ -304,6 +306,9 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1, 1); + + trace_sched_load_cfs_rq(cfs_rq); + return 1; } From 42903694913697da88a4ac627a92bbfdf44f0a2e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 20 Mar 2017 17:26:47 +0000 Subject: [PATCH 64/75] ANDROID: sched/events: Introduce sched_entity load tracking trace event The following trace event keys are mapped to: (1) load : se->avg.load_avg (2) rbl_load : se->avg.runnable_load_avg (3) util : se->avg.util_avg To let this trace event work for configurations w/ and w/o group scheduling support for cfs (CONFIG_FAIR_GROUP_SCHED) the following special handling is necessary for non-existent key=value pairs: path = "(null)" : In case of !CONFIG_FAIR_GROUP_SCHED or the sched_entity represents a task. comm = "(null)" : In case sched_entity represents a task_group. pid = -1 : In case sched_entity represents a task_group. The following list shows examples of the key=value pairs in different configurations for: (1) a task: cpu=0 path=(null) comm=sshd pid=2206 load=102 rbl_load=102 util=102 (2) a taskgroup: cpu=1 path=/tg1/tg11/tg111 comm=(null) pid=-1 load=882 rbl_load=882 util=510 (3) an autogroup: cpu=0 path=/autogroup-13 comm=(null) pid=-1 load=49 rbl_load=49 util=48 (4) w/o CONFIG_FAIR_GROUP_SCHED: cpu=0 path=(null) comm=sshd pid=2211 load=301 rbl_load=301 util=265 The trace event is only defined for CONFIG_SMP. The helper functions __trace_sched_cpu(), __trace_sched_path() and __trace_sched_id() are extended to deal with sched_entities as well. Signed-off-by: Dietmar Eggemann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt [ Fixed issues related to the new pelt.c file ] Signed-off-by: Quentin Perret Change-Id: Id2e4d1ddb79c13412c80e4fa4147b9df3b1e212a --- include/trace/events/sched.h | 67 +++++++++++++++++++++++++++++++----- kernel/sched/fair.c | 1 + kernel/sched/pelt.c | 6 ++++ 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8c79c661772f..989494d99541 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -576,14 +576,15 @@ TRACE_EVENT(sched_wake_idle_without_ipi, #ifdef CONFIG_SMP #ifdef CREATE_TRACE_POINTS static inline -int __trace_sched_cpu(struct cfs_rq *cfs_rq) +int __trace_sched_cpu(struct cfs_rq *cfs_rq, struct sched_entity *se) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq = cfs_rq->rq; + struct rq *rq = cfs_rq ? cfs_rq->rq : NULL; #else - struct rq *rq = container_of(cfs_rq, struct rq, cfs); + struct rq *rq = cfs_rq ? container_of(cfs_rq, struct rq, cfs) : NULL; #endif - return cpu_of(rq); + return rq ? cpu_of(rq) + : task_cpu((container_of(se, struct task_struct, se))); } static inline @@ -592,18 +593,26 @@ int __trace_sched_path(struct cfs_rq *cfs_rq, char *path, int len) #ifdef CONFIG_FAIR_GROUP_SCHED int l = path ? len : 0; - if (task_group_is_autogroup(cfs_rq->tg)) + if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) return autogroup_path(cfs_rq->tg, path, l) + 1; - else + else if (cfs_rq && cfs_rq->tg->css.cgroup) return cgroup_path(cfs_rq->tg->css.cgroup, path, l) + 1; -#else +#endif if (path) strcpy(path, "(null)"); return strlen("(null)"); -#endif } +static inline +struct cfs_rq *__trace_sched_group_cfs_rq(struct sched_entity *se) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + return se->my_q; +#else + return NULL; +#endif +} #endif /* CREATE_TRACE_POINTS */ /* @@ -625,7 +634,7 @@ TRACE_EVENT(sched_load_cfs_rq, ), TP_fast_assign( - __entry->cpu = __trace_sched_cpu(cfs_rq); + __entry->cpu = __trace_sched_cpu(cfs_rq, NULL); __trace_sched_path(cfs_rq, __get_dynamic_array(path), __get_dynamic_array_len(path)); __entry->load = cfs_rq->avg.load_avg; @@ -637,6 +646,46 @@ TRACE_EVENT(sched_load_cfs_rq, __entry->cpu, __get_str(path), __entry->load, __entry->rbl_load,__entry->util) ); + +/* + * Tracepoint for sched_entity load tracking: + */ +TRACE_EVENT(sched_load_se, + + TP_PROTO(struct sched_entity *se), + + TP_ARGS(se), + + TP_STRUCT__entry( + __field( int, cpu ) + __dynamic_array(char, path, + __trace_sched_path(__trace_sched_group_cfs_rq(se), NULL, 0) ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, load ) + __field( unsigned long, rbl_load ) + __field( unsigned long, util ) + ), + + TP_fast_assign( + struct cfs_rq *gcfs_rq = __trace_sched_group_cfs_rq(se); + struct task_struct *p = gcfs_rq ? NULL + : container_of(se, struct task_struct, se); + + __entry->cpu = __trace_sched_cpu(gcfs_rq, se); + __trace_sched_path(gcfs_rq, __get_dynamic_array(path), + __get_dynamic_array_len(path)); + memcpy(__entry->comm, p ? p->comm : "(null)", TASK_COMM_LEN); + __entry->pid = p ? p->pid : -1; + __entry->load = se->avg.load_avg; + __entry->rbl_load = se->avg.runnable_load_avg; + __entry->util = se->avg.util_avg; + ), + + TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu rbl_load=%lu util=%lu", + __entry->cpu, __get_str(path), __entry->comm, __entry->pid, + __entry->load, __entry->rbl_load, __entry->util) +); #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eeb977be8c3a..66560e3707fe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3339,6 +3339,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); trace_sched_load_cfs_rq(cfs_rq); + trace_sched_load_se(se); return 1; } diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index f042ee0f7b08..95923939c027 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -276,6 +276,9 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) { ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); + + trace_sched_load_se(se); + return 1; } @@ -292,6 +295,9 @@ int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_e ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); cfs_se_util_change(&se->avg); + + trace_sched_load_se(se); + return 1; } From 915679307f7d3a59638b0300c44adc163a5c3e0c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 17 Mar 2017 21:23:35 +0000 Subject: [PATCH 65/75] ANDROID: sched/events: Introduce task_group load tracking trace event The trace event key load is mapped to: (1) load : cfs_rq->tg->load_avg The cfs_rq owned by the task_group is used as the only parameter for the trace event because it has a reference to the taskgroup and the cpu. Using the taskgroup as a parameter instead would require the cpu as a second parameter. A task_group is global and not per-cpu data. The cpu key only tells on which cpu the value was gathered. The following list shows examples of the key=value pairs for: (1) a task group: cpu=1 path=/tg1/tg11/tg111 load=517 (2) an autogroup: cpu=1 path=/autogroup-10 load=1050 We don't maintain a load signal for a root task group. The trace event is only defined if cfs group scheduling support (CONFIG_FAIR_GROUP_SCHED) is enabled. Signed-off-by: Dietmar Eggemann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Dietmar Eggemann Change-Id: I7de38e6b30a99d7c9887c94c707ded26b383b5f8 --- include/trace/events/sched.h | 29 +++++++++++++++++++++++++++++ kernel/sched/fair.c | 2 ++ 2 files changed, 31 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 989494d99541..566d66d26643 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -686,6 +686,35 @@ TRACE_EVENT(sched_load_se, __entry->cpu, __get_str(path), __entry->comm, __entry->pid, __entry->load, __entry->rbl_load, __entry->util) ); + +/* + * Tracepoint for task_group load tracking: + */ +#ifdef CONFIG_FAIR_GROUP_SCHED +TRACE_EVENT(sched_load_tg, + + TP_PROTO(struct cfs_rq *cfs_rq), + + TP_ARGS(cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __dynamic_array(char, path, + __trace_sched_path(cfs_rq, NULL, 0) ) + __field( long, load ) + ), + + TP_fast_assign( + __entry->cpu = cfs_rq->rq->cpu; + __trace_sched_path(cfs_rq, __get_dynamic_array(path), + __get_dynamic_array_len(path)); + __entry->load = atomic_long_read(&cfs_rq->tg->load_avg); + ), + + TP_printk("cpu=%d path=%s load=%ld", __entry->cpu, __get_str(path), + __entry->load) +); +#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 66560e3707fe..aeb1ce31589c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3096,6 +3096,8 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; + + trace_sched_load_tg(cfs_rq); } } From 8eb64d5f73e6764b61e596fda87bdc4ff308eaab Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Fri, 27 Oct 2017 16:12:51 +0100 Subject: [PATCH 66/75] ANDROID: sched/events: Introduce util_est trace events Signed-off-by: Patrick Bellasi Change-Id: I65e294c454369cbc15a29370d8a13ce358a95c39 --- include/trace/events/sched.h | 64 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 10 ++++++ 2 files changed, 74 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 566d66d26643..1b73c549bb90 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -715,6 +715,70 @@ TRACE_EVENT(sched_load_tg, __entry->load) ); #endif /* CONFIG_FAIR_GROUP_SCHED */ + +/* + * Tracepoint for tasks' estimated utilization. + */ +TRACE_EVENT(sched_util_est_task, + + TP_PROTO(struct task_struct *tsk, struct sched_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned int, util_avg ) + __field( unsigned int, est_enqueued ) + __field( unsigned int, est_ewma ) + + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->util_avg = avg->util_avg; + __entry->est_enqueued = avg->util_est.enqueued; + __entry->est_ewma = avg->util_est.ewma; + ), + + TP_printk("comm=%s pid=%d cpu=%d util_avg=%u util_est_ewma=%u util_est_enqueued=%u", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->util_avg, + __entry->est_ewma, + __entry->est_enqueued) +); + +/* + * Tracepoint for root cfs_rq's estimated utilization. + */ +TRACE_EVENT(sched_util_est_cpu, + + TP_PROTO(int cpu, struct cfs_rq *cfs_rq), + + TP_ARGS(cpu, cfs_rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned int, util_avg ) + __field( unsigned int, util_est_enqueued ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util_avg = cfs_rq->avg.util_avg; + __entry->util_est_enqueued = cfs_rq->avg.util_est.enqueued; + ), + + TP_printk("cpu=%d util_avg=%u util_est_enqueued=%u", + __entry->cpu, + __entry->util_avg, + __entry->util_est_enqueued) +); #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aeb1ce31589c..a9abb574c3f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3665,6 +3665,10 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, enqueued = cfs_rq->avg.util_est.enqueued; enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + /* Update plots for Task and CPU estimated utilization */ + trace_sched_util_est_task(p, &p->se.avg); + trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); } /* @@ -3695,6 +3699,9 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) (_task_util_est(p) | UTIL_AVG_UNCHANGED)); WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); + /* Update plots for CPU's estimated utilization */ + trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq); + /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -3740,6 +3747,9 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; WRITE_ONCE(p->se.avg.util_est, ue); + + /* Update plots for Task's estimated utilization */ + trace_sched_util_est_task(p, &p->se.avg); } static inline int task_fits_capacity(struct task_struct *p, long capacity) From 171db7dcaedd04039a7697ad51241ffb5eb75793 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 31 May 2018 11:15:26 +0100 Subject: [PATCH 67/75] ANDROID: sched/events: Introduce find_best_target trace event Adapated from the existing trace event from android-4.14. Change-Id: I9785e692fb0af087c236906d7f47fed1b20690f5 Signed-off-by: Quentin Perret --- include/trace/events/sched.h | 41 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 3 +++ 2 files changed, 44 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 1b73c549bb90..2a81858f7b8d 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -779,6 +779,47 @@ TRACE_EVENT(sched_util_est_cpu, __entry->util_avg, __entry->util_est_enqueued) ); + +/* + * Tracepoint for find_best_target + */ +TRACE_EVENT(sched_find_best_target, + + TP_PROTO(struct task_struct *tsk, bool prefer_idle, + unsigned long min_util, int best_idle, int best_active, + int target, int backup), + + TP_ARGS(tsk, prefer_idle, min_util, best_idle, + best_active, target, backup), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, min_util ) + __field( bool, prefer_idle ) + __field( int, best_idle ) + __field( int, best_active ) + __field( int, target ) + __field( int, backup ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->min_util = min_util; + __entry->prefer_idle = prefer_idle; + __entry->best_idle = best_idle; + __entry->best_active = best_active; + __entry->target = target; + __entry->backup = backup; + ), + + TP_printk("pid=%d comm=%s prefer_idle=%d " + "best_idle=%d best_active=%d target=%d backup=%d", + __entry->pid, __entry->comm, __entry->prefer_idle, + __entry->best_idle, __entry->best_active, + __entry->target, __entry->backup) +); #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a9abb574c3f1..57cf2bc3a5c9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6789,6 +6789,9 @@ static void find_best_target(struct sched_domain *sd, cpumask_t *cpus, target: cpumask_set_cpu(target_cpu, cpus); } + + trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu, + best_active_cpu, target_cpu, backup_cpu); } /* From 8c850534bb054fadc36cd50d2ee52cf98227b663 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 31 May 2018 11:26:52 +0100 Subject: [PATCH 68/75] ANDROID: sched/events: Introduce schedtune trace events Suggested-by: Patrick Bellasi Signed-off-by: Quentin Perret Change-Id: I2c43bcb37f57844a07aa36e339da00180e65b6c2 --- include/trace/events/sched.h | 120 +++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 4 ++ kernel/sched/tune.c | 7 ++ 3 files changed, 131 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 2a81858f7b8d..ddef64bf6eaa 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -820,6 +820,126 @@ TRACE_EVENT(sched_find_best_target, __entry->best_idle, __entry->best_active, __entry->target, __entry->backup) ); + +/* + * Tracepoint for accounting CPU boosted utilization + */ +TRACE_EVENT(sched_boost_cpu, + + TP_PROTO(int cpu, unsigned long util, long margin), + + TP_ARGS(cpu, util, margin), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + __field(long, margin ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("cpu=%d util=%lu margin=%ld", + __entry->cpu, + __entry->util, + __entry->margin) +); + +/* + * Tracepoint for schedtune_tasks_update + */ +TRACE_EVENT(sched_tune_tasks_update, + + TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, + int boost, int max_boost), + + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, tasks ) + __field( int, idx ) + __field( int, boost ) + __field( int, max_boost ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->tasks = tasks; + __entry->idx = idx; + __entry->boost = boost; + __entry->max_boost = max_boost; + ), + + TP_printk("pid=%d comm=%s " + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", + __entry->pid, __entry->comm, + __entry->cpu, __entry->tasks, __entry->idx, + __entry->boost, __entry->max_boost) +); + +/* + * Tracepoint for schedtune_boostgroup_update + */ +TRACE_EVENT(sched_tune_boostgroup_update, + + TP_PROTO(int cpu, int variation, int max_boost), + + TP_ARGS(cpu, variation, max_boost), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( int, variation ) + __field( int, max_boost ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->variation = variation; + __entry->max_boost = max_boost; + ), + + TP_printk("cpu=%d variation=%d max_boost=%d", + __entry->cpu, __entry->variation, __entry->max_boost) +); + +/* + * Tracepoint for accounting task boosted utilization + */ +TRACE_EVENT(sched_boost_task, + + TP_PROTO(struct task_struct *tsk, unsigned long util, long margin), + + TP_ARGS(tsk, util, margin), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, util ) + __field( long, margin ) + + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->util = util; + __entry->margin = margin; + ), + + TP_printk("comm=%s pid=%d util=%lu margin=%ld", + __entry->comm, __entry->pid, + __entry->util, + __entry->margin) +); + #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57cf2bc3a5c9..2cea88b3ea4f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5852,6 +5852,8 @@ boosted_cpu_util(int cpu) unsigned long util = cpu_util_cfs(cpu_rq(cpu)); long margin = schedtune_cpu_margin(util, cpu); + trace_sched_boost_cpu(cpu, util, margin); + return util + margin; } @@ -5879,6 +5881,8 @@ boosted_task_util(struct task_struct *task) unsigned long util = task_util_est(task); long margin = schedtune_task_margin(task); + trace_sched_boost_task(task, util, margin); + return util + margin; } diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index af6de6d3df6d..7e0630c52b81 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -227,14 +227,18 @@ schedtune_boostgroup_update(int idx, int boost) /* Check if this update increase current max */ if (boost > cur_boost_max && bg->group[idx].tasks) { bg->boost_max = boost; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ if (cur_boost_max == old_boost && old_boost > boost) { schedtune_cpu_update(cpu); + trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); continue; } + + trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max); } return 0; @@ -252,6 +256,9 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) /* Update boosted tasks count while avoiding to make it negative */ bg->group[idx].tasks = max(0, tasks); + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max); + /* Boost group activation or deactivation on that RQ */ if (tasks == 1 || tasks == 0) schedtune_cpu_update(cpu); From 3124a5b9d0d17bc6f097d35d20c285b7397c6c7a Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Fri, 27 Oct 2017 16:52:17 +0100 Subject: [PATCH 69/75] ANDROID: sched/events: Introduce rt_rq load tracking trace event We want to be able to track rt_rq signals same as we do for other RQs. Signed-off-by: Chris Redpath (cherry-picked from commit 574a2d189695c334ae290f522b098f05398a3765) [ - Fixed conflicts with the refactored RT util_avg tracking - Changed commit title for consistency with other tracepoint patches ] Signed-off-by: Quentin Perret Change-Id: I38b64baa50e1ff86019ca4b8b0a04af994880b35 --- include/trace/events/sched.h | 24 ++++++++++++++++++++++++ kernel/sched/pelt.c | 3 +++ 2 files changed, 27 insertions(+) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ddef64bf6eaa..2b642a4c8813 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -647,6 +647,30 @@ TRACE_EVENT(sched_load_cfs_rq, __entry->rbl_load,__entry->util) ); +/* + * Tracepoint for rt_rq load tracking: + */ +struct rq; +TRACE_EVENT(sched_load_rt_rq, + + TP_PROTO(struct rq *rq), + + TP_ARGS(rq), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, util ) + ), + + TP_fast_assign( + __entry->cpu = rq->cpu; + __entry->util = rq->avg_rt.util_avg; + ), + + TP_printk("cpu=%d util=%lu", __entry->cpu, + __entry->util) +); + /* * Tracepoint for sched_entity load tracking: */ diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 95923939c027..c116744ec44b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -340,6 +340,9 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) running)) { ___update_load_avg(&rq->avg_rt, 1, 1); + + trace_sched_load_rt_rq(rq); + return 1; } From 6dfaed989ea4ca223f0913dfc11cdafd9664fc1c Mon Sep 17 00:00:00 2001 From: Patrick Bellasi Date: Wed, 10 Feb 2016 09:24:36 +0000 Subject: [PATCH 70/75] ANDROID: sched/events: Introduce overutilized trace event Signed-off-by: Patrick Bellasi Signed-off-by: Andres Oportus (cherry picked from commit 8e45d941282039d5379f4e286e5bd0a2044e105c) [ - Trivial cherry pick issues - Changed commit title for consistency ] Signed-off-by: Quentin Perret Change-Id: I78fb5e82223558def0cf16105c233591cda81d5c --- include/trace/events/sched.h | 21 +++++++++++++++++++++ kernel/sched/fair.c | 7 ++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 2b642a4c8813..947dbcae17a7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -964,6 +964,27 @@ TRACE_EVENT(sched_boost_task, __entry->margin) ); +/* + * Tracepoint for system overutilized flag +*/ +TRACE_EVENT(sched_overutilized, + + TP_PROTO(int overutilized), + + TP_ARGS(overutilized), + + TP_STRUCT__entry( + __field( int, overutilized ) + ), + + TP_fast_assign( + __entry->overutilized = overutilized; + ), + + TP_printk("overutilized=%d", + __entry->overutilized) +); + #endif /* CONFIG_SMP */ #endif /* _TRACE_SCHED_H */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2cea88b3ea4f..36d2f3bde904 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5134,8 +5134,10 @@ static inline bool cpu_overutilized(int cpu) static inline void update_overutilized_status(struct rq *rq) { - if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) + if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized(1); + } } #else static inline void update_overutilized_status(struct rq *rq) { } @@ -8961,9 +8963,12 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* Update over-utilization (tipping point, U >= 0) indicator */ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); + trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED)); } else if (sg_status & SG_OVERUTILIZED) { WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); + trace_sched_overutilized(1); } + } /** From 55679f6419d4a55b7a924737d1a1e11f4e6e56c1 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 12 Sep 2018 15:34:46 +0100 Subject: [PATCH 71/75] ANDROID: sched: Make the cpu_util* accessors available without sugov In preparation for the introduction of the boost hold feature in SchedTune, make the cpu_util_* signals available even without sugov, hence making the stune integration a lot easier without breaking other platforms Change-Id: I77c14b6fd470a41ffb3ac510b76395ebc116ddfd Signed-off-by: Quentin Perret --- kernel/sched/sched.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2fd06be206ea..3997a6908afd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2262,7 +2262,14 @@ static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); } +#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +{ + return cfs; +} +#endif +#ifdef CONFIG_SMP static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -2289,11 +2296,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) -{ - return cfs; -} #endif #ifdef HAVE_SCHED_AVG_IRQ From cc2c83709c5ca27548491d9b7a415c51e05e3600 Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 9 Jul 2018 16:54:02 +0100 Subject: [PATCH 72/75] ANDROID: sched/rt: Add schedtune accounting to rt task enqueue/dequeue rt tasks are currently not eligible for schedtune boosting. Make it so by adding enqueue/dequeue hooks. For rt tasks, schedtune only acts as a frequency boosting framework, it has no impact on placement decisions and the prefer_idle attribute is not used. Also prepare schedutil use of boosted util for rt task boosting With this change, schedtune accounting will include rt class tasks, however boosting currently only applies to the utilization provided by fair class tasks. Sum up the tracked CPU utilization applying boost to the aggregate util instead - this includes RT task util in the boosting if any tasks are runnable. Scenario 1, considering one CPU: 1x rt task running, util 250, boost 0 1x cfs task runnable, util 250, boost 50 previous util=250+(50pct_boosted_250) = 887 new util=50_pct_boosted_500 = 762 Scenario 2, considering one CPU: 1x rt task running, util 250, boost 50 1x cfs task runnable, util 250, boost 0 previous util=250+250 = 500 new util=50_pct_boosted_500 = 762 Scenario 3, considering one CPU: 1x rt task running, util 250, boost 50 1x cfs task runnable, util 250, boost 50 previous util=250+(50pct_boosted_250) = 887 new util=50_pct_boosted_500 = 762 Scenario 4: 1x rt task running, util 250, boost 50 previous util=250 = 250 new util=50_pct_boosted_250 = 637 Change-Id: Ie287cbd0692468525095b5024db9faac8b2f4878 Signed-off-by: Chris Redpath (cherry picked from commit 8e266aebf737262aeca9662254a3e61ccc7f8dec) [ - Fixed conflicts in sugov related to PELT signals of all classes - Fixed conflicts related to boosted_cpu_util being in a different location - Moved the CFS/RT util aggregation out of schedutil_freq_util to ease the integration with SchedTune ] Signed-off-by: Quentin Perret --- kernel/sched/cpufreq_schedutil.c | 21 ++++++++++----------- kernel/sched/fair.c | 7 +++---- kernel/sched/rt.c | 4 ++++ kernel/sched/sched.h | 10 +++++----- kernel/sched/tune.h | 4 ++-- 5 files changed, 24 insertions(+), 22 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b01cd99ffbd0..f1841ce4555f 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -219,6 +219,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * Where the cfs,rt and dl util numbers are tracked with the same metric and * synchronized windows and are thus directly comparable. * + * The @util parameter passed to this function is assumed to be the aggregation + * of RT and CFS util numbers. The cases of DL and IRQ are managed here. + * * The cfs,rt,dl utilization are the running times measured with rq->clock_task * which excludes things like IRQ and steal-time. These latter are then accrued * in the irq utilization. @@ -227,11 +230,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, +unsigned long schedutil_freq_util(int cpu, unsigned long util, unsigned long max, enum schedutil_type type) { struct rq *rq = cpu_rq(cpu); - unsigned long util, irq; + unsigned long irq; if (sched_feat(SUGOV_RT_MAX_FREQ) && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) @@ -247,14 +250,11 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, return max; /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. + * The function is called with @util defined as the aggregation (the + * sum) of RT and CFS signals, hence leaving the special case of DL + * to be delt with. The exact way of doing things depend on the calling + * context. */ - util = util_cfs; - util += cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) { /* * For frequency selection we do not make cpu_util_dl() a @@ -267,7 +267,6 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, * to not quite hit saturation when we should -- * something for later. */ - if ((util + cpu_util_dl(rq)) >= max) return max; } else { @@ -314,7 +313,7 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util = boosted_cpu_util(sg_cpu->cpu); + unsigned long util = boosted_cpu_util(sg_cpu->cpu, cpu_util_rt(rq)); unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); sg_cpu->max = max; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 36d2f3bde904..8f3573d7165d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5849,9 +5849,9 @@ schedtune_task_margin(struct task_struct *task) } unsigned long -boosted_cpu_util(int cpu) +boosted_cpu_util(int cpu, unsigned long other_util) { - unsigned long util = cpu_util_cfs(cpu_rq(cpu)); + unsigned long util = cpu_util_cfs(cpu_rq(cpu)) + other_util; long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); @@ -5875,8 +5875,6 @@ schedtune_task_margin(struct task_struct *task) #endif /* CONFIG_SCHED_TUNE */ - - static inline unsigned long boosted_task_util(struct task_struct *task) { @@ -6892,6 +6890,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) */ for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { util = cpu_util_next(cpu, p, dst_cpu); + util += cpu_util_rt(cpu_rq(cpu)); util = schedutil_energy_util(cpu, util); max_util = max(util, max_util); sum_util += util; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0be707d9c2db..3eed85fc86db 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1329,6 +1329,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; + schedtune_enqueue_task(p, cpu_of(rq)); + if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; @@ -1342,6 +1344,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) { struct sched_rt_entity *rt_se = &p->rt; + schedtune_dequeue_task(p, cpu_of(rq)); + update_curr_rt(rq); dequeue_rt_entity(rt_se, flags); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3997a6908afd..33911f3f74ac 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2253,19 +2253,19 @@ enum schedutil_type { ENERGY_UTIL, }; -unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, +unsigned long schedutil_freq_util(int cpu, unsigned long util, unsigned long max, enum schedutil_type type); -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +static inline unsigned long schedutil_energy_util(int cpu, unsigned long util) { unsigned long max = arch_scale_cpu_capacity(NULL, cpu); - return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); + return schedutil_freq_util(cpu, util, max, ENERGY_UTIL); } #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) +static inline unsigned long schedutil_energy_util(int cpu, unsigned long util) { - return cfs; + return util; } #endif diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index bb187c6112a7..821f026b510f 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -20,7 +20,7 @@ int schedtune_prefer_idle(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); -unsigned long boosted_cpu_util(int cpu); +unsigned long boosted_cpu_util(int cpu, unsigned long other_util); #else /* CONFIG_SCHED_TUNE */ @@ -32,6 +32,6 @@ unsigned long boosted_cpu_util(int cpu); #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) -#define boosted_cpu_util(cpu) cpu_util_cfs(cpu_rq(cpu)) +#define boosted_cpu_util(cpu, other_util) cpu_util_cfs(cpu_rq(cpu)) #endif /* CONFIG_SCHED_TUNE */ From 4cb65352a390105e5a09a0f0a49e41eae907bc1d Mon Sep 17 00:00:00 2001 From: Chris Redpath Date: Mon, 9 Jul 2018 15:44:00 +0100 Subject: [PATCH 73/75] ANDROID: Add hold functionality to schedtune CPU boost When tasks come and go from a runqueue quickly, this can lead to boost being applied and removed quickly which sometimes means we cannot raise the CPU frequency again when we need to (due to the rate limit on frequency updates). This has proved to be a particular issue for RT tasks and alternative methods have been used in the past to work around it. This is an attempt to solve the issue for all task classes and cpufreq governors by introducing a generic mechanism in schedtune to retain the max boost level from task enqueue for a minimum period - defined here as 50ms. This timeout was determined experimentally and is not configurable. A sched_feat guards the application of this to tasks - in the default configuration, task boosting only applied to tasks which have RT policy. Change SCHEDTUNE_BOOST_HOLD_ALL to true to apply it to all tasks regardless of class. It works like so: Every task enqueue (in an allowed class) stores a cpu-local timestamp. If the task is not a member of an allowed class (all or RT depending upon feature selection), the timestamp is not updated. The boost group will stay active regardless of tasks present until 50ms beyond the last timestamp stored. We also store the timestamp of the active boost group to avoid unneccesarily revisiting the boost groups when checking CPU boost level. If the timestamp is more than 50ms in the past when we check boost then we re-evaluate the boost groups for that CPU, taking into account the timestamps associated with each group. Idea based on rt-boost-retention patches from Joel. Change-Id: I52cc2d2e82d1c5aa03550378c8836764f41630c1 Suggested-by: Joel Fernandes Reviewed-by: Patrick Bellasi Signed-off-by: Chris Redpath [forward ported from android-4.9-eas-dev proposal] (cherry picked from commit a485e8b7bf8e95759e600396feeb7bfb400b6e46) [ - Trivial cherry-pick conflicts in include/trace/events/sched.h ] Signed-off-by: Quentin Perret --- include/trace/events/sched.h | 11 +++-- kernel/sched/features.h | 11 +++++ kernel/sched/tune.c | 95 ++++++++++++++++++++++++++++++------ 3 files changed, 97 insertions(+), 20 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 947dbcae17a7..23cee9056d8e 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -878,9 +878,9 @@ TRACE_EVENT(sched_boost_cpu, TRACE_EVENT(sched_tune_tasks_update, TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx, - int boost, int max_boost), + int boost, int max_boost, u64 group_ts), - TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost), + TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost, group_ts), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -890,6 +890,7 @@ TRACE_EVENT(sched_tune_tasks_update, __field( int, idx ) __field( int, boost ) __field( int, max_boost ) + __field( u64, group_ts ) ), TP_fast_assign( @@ -900,13 +901,15 @@ TRACE_EVENT(sched_tune_tasks_update, __entry->idx = idx; __entry->boost = boost; __entry->max_boost = max_boost; + __entry->group_ts = group_ts; ), TP_printk("pid=%d comm=%s " - "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d", + "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d timeout=%llu", __entry->pid, __entry->comm, __entry->cpu, __entry->tasks, __entry->idx, - __entry->boost, __entry->max_boost) + __entry->boost, __entry->max_boost, + __entry->group_ts) ); /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 5590f47a6e78..50bdfd7c3321 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -109,3 +109,14 @@ SCHED_FEAT(EAS_PREFER_IDLE, true) * Request max frequency from schedutil whenever a RT task is running. */ SCHED_FEAT(SUGOV_RT_MAX_FREQ, false) + +/* + * Apply schedtune boost hold to tasks of all sched classes. + * If enabled, schedtune will hold the boost applied to a CPU + * for 50ms regardless of task activation - if the task is + * still running 50ms later, the boost hold expires and schedtune + * boost will expire immediately the task stops. + * If disabled, this behaviour will only apply to tasks of the + * RT class. + */ +SCHED_FEAT(SCHEDTUNE_BOOST_HOLD_ALL, false) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 7e0630c52b81..3b231c639fe4 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -13,6 +13,9 @@ bool schedtune_initialized = false; extern struct reciprocal_value schedtune_spc_rdiv; +/* We hold schedtune boost in effect for at least this long */ +#define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL + /* * EAS scheduler tunables for task groups. * @@ -151,6 +154,7 @@ static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = { struct boost_groups { /* Maximum boost value for all RUNNABLE tasks on a CPU */ int boost_max; + u64 boost_ts; struct { /* True when this boost group maps an actual cgroup */ bool valid; @@ -158,6 +162,8 @@ struct boost_groups { int boost; /* Count of RUNNABLE tasks on that boost group */ unsigned tasks; + /* Timestamp of boost activation */ + u64 ts; } group[BOOSTGROUPS_COUNT]; /* CPU's boost group locking */ raw_spinlock_t lock; @@ -166,15 +172,31 @@ struct boost_groups { /* Boost groups affecting each CPU in the system */ DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups); +static inline bool schedtune_boost_timeout(u64 now, u64 ts) +{ + return ((now - ts) > SCHEDTUNE_BOOST_HOLD_NS); +} + +static inline bool +schedtune_boost_group_active(int idx, struct boost_groups* bg, u64 now) +{ + if (bg->group[idx].tasks) + return true; + + return !schedtune_boost_timeout(now, bg->group[idx].ts); +} + static void -schedtune_cpu_update(int cpu) +schedtune_cpu_update(int cpu, u64 now) { struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu); int boost_max; + u64 boost_ts; int idx; /* The root boost group is always active */ boost_max = bg->group[0].boost; + boost_ts = now; for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) { /* Ignore non boostgroups not mapping a cgroup */ @@ -183,12 +205,18 @@ schedtune_cpu_update(int cpu) /* * A boost group affects a CPU only if it has - * RUNNABLE tasks on that CPU + * RUNNABLE tasks on that CPU or it has hold + * in effect from a previous task. */ - if (bg->group[idx].tasks == 0) + if (!schedtune_boost_group_active(idx, bg, now)) continue; - boost_max = max(boost_max, bg->group[idx].boost); + /* This boost group is active */ + if (boost_max > bg->group[idx].boost) + continue; + + boost_max = bg->group[idx].boost; + boost_ts = bg->group[idx].ts; } /* Ensures boost_max is non-negative when all cgroup boost values @@ -196,6 +224,7 @@ schedtune_cpu_update(int cpu) * task stacking and frequency spikes.*/ boost_max = max(boost_max, 0); bg->boost_max = boost_max; + bg->boost_ts = boost_ts; } static int @@ -205,6 +234,7 @@ schedtune_boostgroup_update(int idx, int boost) int cur_boost_max; int old_boost; int cpu; + u64 now; /* Update per CPU boost groups */ for_each_possible_cpu(cpu) { @@ -225,15 +255,19 @@ schedtune_boostgroup_update(int idx, int boost) bg->group[idx].boost = boost; /* Check if this update increase current max */ - if (boost > cur_boost_max && bg->group[idx].tasks) { + now = sched_clock_cpu(cpu); + if (boost > cur_boost_max && + schedtune_boost_group_active(idx, bg, now)) { bg->boost_max = boost; + bg->boost_ts = bg->group[idx].ts; + trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max); continue; } /* Check if this update has decreased current max */ if (cur_boost_max == old_boost && old_boost > boost) { - schedtune_cpu_update(cpu); + schedtune_cpu_update(cpu, now); trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max); continue; } @@ -247,6 +281,15 @@ schedtune_boostgroup_update(int idx, int boost) #define ENQUEUE_TASK 1 #define DEQUEUE_TASK -1 +static inline bool +schedtune_update_timestamp(struct task_struct *p) +{ + if (sched_feat(SCHEDTUNE_BOOST_HOLD_ALL)) + return true; + + return task_has_rt_policy(p); +} + static inline void schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) { @@ -256,12 +299,21 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count) /* Update boosted tasks count while avoiding to make it negative */ bg->group[idx].tasks = max(0, tasks); - trace_sched_tune_tasks_update(p, cpu, tasks, idx, - bg->group[idx].boost, bg->boost_max); + /* Update timeout on enqueue */ + if (task_count > 0) { + u64 now = sched_clock_cpu(cpu); - /* Boost group activation or deactivation on that RQ */ - if (tasks == 1 || tasks == 0) - schedtune_cpu_update(cpu); + if (schedtune_update_timestamp(p)) + bg->group[idx].ts = now; + + /* Boost group activation or deactivation on that RQ */ + if (bg->group[idx].tasks == 1) + schedtune_cpu_update(cpu, now); + } + + trace_sched_tune_tasks_update(p, cpu, tasks, idx, + bg->group[idx].boost, bg->boost_max, + bg->group[idx].ts); } /* @@ -305,6 +357,7 @@ int schedtune_can_attach(struct cgroup_taskset *tset) int src_bg; /* Source boost group index */ int dst_bg; /* Destination boost group index */ int tasks; + u64 now; if (unlikely(!schedtune_initialized)) return 0; @@ -355,13 +408,15 @@ int schedtune_can_attach(struct cgroup_taskset *tset) bg->group[src_bg].tasks = max(0, tasks); bg->group[dst_bg].tasks += 1; + /* Update boost hold start for this group */ + now = sched_clock_cpu(cpu); + bg->group[dst_bg].ts = now; + + /* Force boost group re-evaluation at next boost check */ + bg->boost_ts = now - SCHEDTUNE_BOOST_HOLD_NS; + raw_spin_unlock(&bg->lock); task_rq_unlock(rq, task, &rq_flags); - - /* Update CPU boost group */ - if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1) - schedtune_cpu_update(task_cpu(task)); - } return 0; @@ -408,8 +463,15 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu) int schedtune_cpu_boost(int cpu) { struct boost_groups *bg; + u64 now; bg = &per_cpu(cpu_boost_groups, cpu); + now = sched_clock_cpu(cpu); + + /* Check to see if we have a hold in effect */ + if (schedtune_boost_timeout(now, bg->boost_ts)) + schedtune_cpu_update(cpu, now); + return bg->boost_max; } @@ -515,6 +577,7 @@ schedtune_boostgroup_init(struct schedtune *st, int idx) bg = &per_cpu(cpu_boost_groups, cpu); bg->group[idx].boost = 0; bg->group[idx].valid = true; + bg->group[idx].ts = 0; } /* Keep track of allocated boost groups */ From 2054105d167cef7d3916c33f73e1d35595205277 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 4 Sep 2018 12:51:08 +0100 Subject: [PATCH 74/75] ANDROID: thermal: cpu_cooling: Migrate to using the EM framework The newly introduced Energy Model framework manages power cost tables in a generic way. Moreover, it supports a wide variety of models since the tables can come from DT, firmware, etc. On the other hand, the cpu_cooling subsystem manages its own power cost tables using only DT information. In order to avoid the duplication of data in the kernel, and in order to enable IPA with EMs comming from more than just DT, remove the private tables from cpu_cooling.c and migrate it to using the centralized EM framework. The case where the thermal subsystem is used without an Energy Model (cpufreq_cooling_ops) is handled by looking at CPUFreq's frequency table which is already a dependency for cpu_cooling.c anyway. Signed-off-by: Quentin Perret Change-Id: I064e4b0e56d1c52b82a4dc800a1fa90456429911 --- drivers/thermal/cpu_cooling.c | 262 +++++++++++----------------------- 1 file changed, 83 insertions(+), 179 deletions(-) diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index dfd23245f778..eb0d87f32bc7 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -48,19 +49,6 @@ * ... */ -/** - * struct freq_table - frequency table along with power entries - * @frequency: frequency in KHz - * @power: power in mW - * - * This structure is built when the cooling device registers and helps - * in translating frequency to power and vice versa. - */ -struct freq_table { - u32 frequency; - u32 power; -}; - /** * struct time_in_idle - Idle time stats * @time: previous reading of the absolute time that this cpu was idle @@ -82,7 +70,7 @@ struct time_in_idle { * frequency. * @max_level: maximum cooling level. One less than total number of valid * cpufreq frequencies. - * @freq_table: Freq table in descending order of frequencies + * @em: Reference on the Energy Model of the device * @cdev: thermal_cooling_device pointer to keep track of the * registered cooling device. * @policy: cpufreq policy. @@ -98,7 +86,7 @@ struct cpufreq_cooling_device { unsigned int cpufreq_state; unsigned int clipped_freq; unsigned int max_level; - struct freq_table *freq_table; /* In descending order */ + struct em_perf_domain *em; struct thermal_cooling_device *cdev; struct cpufreq_policy *policy; struct list_head node; @@ -111,26 +99,6 @@ static LIST_HEAD(cpufreq_cdev_list); /* Below code defines functions to be used for cpufreq as cooling device */ -/** - * get_level: Find the level for a particular frequency - * @cpufreq_cdev: cpufreq_cdev for which the property is required - * @freq: Frequency - * - * Return: level corresponding to the frequency. - */ -static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev, - unsigned int freq) -{ - struct freq_table *freq_table = cpufreq_cdev->freq_table; - unsigned long level; - - for (level = 1; level <= cpufreq_cdev->max_level; level++) - if (freq > freq_table[level].frequency) - break; - - return level - 1; -} - /** * cpufreq_thermal_notifier - notifier callback for cpufreq policy change. * @nb: struct notifier_block * with callback info. @@ -184,105 +152,52 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb, return NOTIFY_OK; } +#ifdef CONFIG_ENERGY_MODEL /** - * update_freq_table() - Update the freq table with power numbers - * @cpufreq_cdev: the cpufreq cooling device in which to update the table - * @capacitance: dynamic power coefficient for these cpus + * get_level: Find the level for a particular frequency + * @cpufreq_cdev: cpufreq_cdev for which the property is required + * @freq: Frequency * - * Update the freq table with power numbers. This table will be used in - * cpu_power_to_freq() and cpu_freq_to_power() to convert between power and - * frequency efficiently. Power is stored in mW, frequency in KHz. The - * resulting table is in descending order. - * - * Return: 0 on success, -EINVAL if there are no OPPs for any CPUs, - * or -ENOMEM if we run out of memory. + * Return: level corresponding to the frequency. */ -static int update_freq_table(struct cpufreq_cooling_device *cpufreq_cdev, - u32 capacitance) +static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev, + unsigned int freq) { - struct freq_table *freq_table = cpufreq_cdev->freq_table; - struct dev_pm_opp *opp; - struct device *dev = NULL; - int num_opps = 0, cpu = cpufreq_cdev->policy->cpu, i; + int i; - dev = get_cpu_device(cpu); - if (unlikely(!dev)) { - dev_warn(&cpufreq_cdev->cdev->device, - "No cpu device for cpu %d\n", cpu); - return -ENODEV; + for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) { + if (freq > cpufreq_cdev->em->table[i].frequency) + break; } - num_opps = dev_pm_opp_get_opp_count(dev); - if (num_opps < 0) - return num_opps; - - /* - * The cpufreq table is also built from the OPP table and so the count - * should match. - */ - if (num_opps != cpufreq_cdev->max_level + 1) { - dev_warn(dev, "Number of OPPs not matching with max_levels\n"); - return -EINVAL; - } - - for (i = 0; i <= cpufreq_cdev->max_level; i++) { - unsigned long freq = freq_table[i].frequency * 1000; - u32 freq_mhz = freq_table[i].frequency / 1000; - u64 power; - u32 voltage_mv; - - /* - * Find ceil frequency as 'freq' may be slightly lower than OPP - * freq due to truncation while converting to kHz. - */ - opp = dev_pm_opp_find_freq_ceil(dev, &freq); - if (IS_ERR(opp)) { - dev_err(dev, "failed to get opp for %lu frequency\n", - freq); - return -EINVAL; - } - - voltage_mv = dev_pm_opp_get_voltage(opp) / 1000; - dev_pm_opp_put(opp); - - /* - * Do the multiplication with MHz and millivolt so as - * to not overflow. - */ - power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv; - do_div(power, 1000000000); - - /* power is stored in mW */ - freq_table[i].power = power; - } - - return 0; + return cpufreq_cdev->max_level - i - 1; } + static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev, u32 freq) { int i; - struct freq_table *freq_table = cpufreq_cdev->freq_table; - for (i = 1; i <= cpufreq_cdev->max_level; i++) - if (freq > freq_table[i].frequency) + for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) { + if (freq > cpufreq_cdev->em->table[i].frequency) break; + } - return freq_table[i - 1].power; + return cpufreq_cdev->em->table[i + 1].power; } static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, u32 power) { int i; - struct freq_table *freq_table = cpufreq_cdev->freq_table; - for (i = 1; i <= cpufreq_cdev->max_level; i++) - if (power > freq_table[i].power) + for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) { + if (power > cpufreq_cdev->em->table[i].power) break; + } - return freq_table[i - 1].frequency; + return cpufreq_cdev->em->table[i + 1].frequency; } /** @@ -332,6 +247,7 @@ static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev, raw_cpu_power = cpu_freq_to_power(cpufreq_cdev, freq); return (raw_cpu_power * cpufreq_cdev->last_load) / 100; } +#endif /* cpufreq cooling device callback functions are defined below */ @@ -374,6 +290,30 @@ static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev, return 0; } +static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev, + unsigned long state) +{ + struct cpufreq_policy *policy; + unsigned long idx; + +#ifdef CONFIG_ENERGY_MODEL + /* Use the Energy Model table if available */ + if (cpufreq_cdev->em) { + idx = cpufreq_cdev->max_level - state; + return cpufreq_cdev->em->table[idx].frequency; + } +#endif + + /* Otherwise, fallback on the CPUFreq table */ + policy = cpufreq_cdev->policy; + if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING) + idx = cpufreq_cdev->max_level - state; + else + idx = state; + + return policy->freq_table[idx].frequency; +} + /** * cpufreq_set_cur_state - callback function to set the current cooling state. * @cdev: thermal cooling device pointer. @@ -398,7 +338,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, if (cpufreq_cdev->cpufreq_state == state) return 0; - clip_freq = cpufreq_cdev->freq_table[state].frequency; + clip_freq = get_state_freq(cpufreq_cdev, state); cpufreq_cdev->cpufreq_state = state; cpufreq_cdev->clipped_freq = clip_freq; @@ -407,6 +347,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, return 0; } +#ifdef CONFIG_ENERGY_MODEL /** * cpufreq_get_requested_power() - get the current power * @cdev: &thermal_cooling_device pointer @@ -497,7 +438,7 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev, struct thermal_zone_device *tz, unsigned long state, u32 *power) { - unsigned int freq, num_cpus; + unsigned int freq, num_cpus, idx; struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; /* Request state should be less than max_level */ @@ -506,7 +447,8 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev, num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus); - freq = cpufreq_cdev->freq_table[state].frequency; + idx = cpufreq_cdev->max_level - state; + freq = cpufreq_cdev->em->table[idx].frequency; *power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus; return 0; @@ -553,14 +495,6 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev, return 0; } -/* Bind cpufreq callbacks to thermal cooling device ops */ - -static struct thermal_cooling_device_ops cpufreq_cooling_ops = { - .get_max_state = cpufreq_get_max_state, - .get_cur_state = cpufreq_get_cur_state, - .set_cur_state = cpufreq_set_cur_state, -}; - static struct thermal_cooling_device_ops cpufreq_power_cooling_ops = { .get_max_state = cpufreq_get_max_state, .get_cur_state = cpufreq_get_cur_state, @@ -569,32 +503,27 @@ static struct thermal_cooling_device_ops cpufreq_power_cooling_ops = { .state2power = cpufreq_state2power, .power2state = cpufreq_power2state, }; +#endif + +/* Bind cpufreq callbacks to thermal cooling device ops */ + +static struct thermal_cooling_device_ops cpufreq_cooling_ops = { + .get_max_state = cpufreq_get_max_state, + .get_cur_state = cpufreq_get_cur_state, + .set_cur_state = cpufreq_set_cur_state, +}; /* Notifier for cpufreq policy change */ static struct notifier_block thermal_cpufreq_notifier_block = { .notifier_call = cpufreq_thermal_notifier, }; -static unsigned int find_next_max(struct cpufreq_frequency_table *table, - unsigned int prev_max) -{ - struct cpufreq_frequency_table *pos; - unsigned int max = 0; - - cpufreq_for_each_valid_entry(pos, table) { - if (pos->frequency > max && pos->frequency < prev_max) - max = pos->frequency; - } - - return max; -} - /** * __cpufreq_cooling_register - helper function to create cpufreq cooling device * @np: a valid struct device_node to the cooling device device tree node * @policy: cpufreq policy * Normally this should be same as cpufreq policy->related_cpus. - * @capacitance: dynamic power coefficient for these cpus + * @try_model: true if a power model should be used * * This interface function registers the cpufreq cooling device with the name * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq @@ -606,12 +535,12 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table, */ static struct thermal_cooling_device * __cpufreq_cooling_register(struct device_node *np, - struct cpufreq_policy *policy, u32 capacitance) + struct cpufreq_policy *policy, bool try_model) { struct thermal_cooling_device *cdev; struct cpufreq_cooling_device *cpufreq_cdev; char dev_name[THERMAL_NAME_LENGTH]; - unsigned int freq, i, num_cpus; + unsigned int i, num_cpus; int ret; struct thermal_cooling_device_ops *cooling_ops; bool first; @@ -645,54 +574,36 @@ __cpufreq_cooling_register(struct device_node *np, /* max_level is an index, not a counter */ cpufreq_cdev->max_level = i - 1; - cpufreq_cdev->freq_table = kmalloc_array(i, - sizeof(*cpufreq_cdev->freq_table), - GFP_KERNEL); - if (!cpufreq_cdev->freq_table) { - cdev = ERR_PTR(-ENOMEM); - goto free_idle_time; - } +#ifdef CONFIG_ENERGY_MODEL + if (try_model) { + struct em_perf_domain *em = em_cpu_get(policy->cpu); + + if (!em || !cpumask_equal(policy->cpus, to_cpumask(em->cpus))) { + cdev = ERR_PTR(-EINVAL); + goto free_idle_time; + } + cpufreq_cdev->em = em; + cooling_ops = &cpufreq_power_cooling_ops; + } else +#endif + cooling_ops = &cpufreq_cooling_ops; ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL); if (ret < 0) { cdev = ERR_PTR(ret); - goto free_table; + goto free_idle_time; } cpufreq_cdev->id = ret; snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d", cpufreq_cdev->id); - /* Fill freq-table in descending order of frequencies */ - for (i = 0, freq = -1; i <= cpufreq_cdev->max_level; i++) { - freq = find_next_max(policy->freq_table, freq); - cpufreq_cdev->freq_table[i].frequency = freq; - - /* Warn for duplicate entries */ - if (!freq) - pr_warn("%s: table has duplicate entries\n", __func__); - else - pr_debug("%s: freq:%u KHz\n", __func__, freq); - } - - if (capacitance) { - ret = update_freq_table(cpufreq_cdev, capacitance); - if (ret) { - cdev = ERR_PTR(ret); - goto remove_ida; - } - - cooling_ops = &cpufreq_power_cooling_ops; - } else { - cooling_ops = &cpufreq_cooling_ops; - } - cdev = thermal_of_cooling_device_register(np, dev_name, cpufreq_cdev, cooling_ops); if (IS_ERR(cdev)) goto remove_ida; - cpufreq_cdev->clipped_freq = cpufreq_cdev->freq_table[0].frequency; + cpufreq_cdev->clipped_freq = get_state_freq(cpufreq_cdev, 0); cpufreq_cdev->cdev = cdev; mutex_lock(&cooling_list_lock); @@ -709,8 +620,6 @@ __cpufreq_cooling_register(struct device_node *np, remove_ida: ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); -free_table: - kfree(cpufreq_cdev->freq_table); free_idle_time: kfree(cpufreq_cdev->idle_time); free_cdev: @@ -732,7 +641,7 @@ __cpufreq_cooling_register(struct device_node *np, struct thermal_cooling_device * cpufreq_cooling_register(struct cpufreq_policy *policy) { - return __cpufreq_cooling_register(NULL, policy, 0); + return __cpufreq_cooling_register(NULL, policy, false); } EXPORT_SYMBOL_GPL(cpufreq_cooling_register); @@ -760,7 +669,6 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy) { struct device_node *np = of_get_cpu_node(policy->cpu, NULL); struct thermal_cooling_device *cdev = NULL; - u32 capacitance = 0; if (!np) { pr_err("cpu_cooling: OF node not available for cpu%d\n", @@ -769,10 +677,7 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy) } if (of_find_property(np, "#cooling-cells", NULL)) { - of_property_read_u32(np, "dynamic-power-coefficient", - &capacitance); - - cdev = __cpufreq_cooling_register(np, policy, capacitance); + cdev = __cpufreq_cooling_register(np, policy, true); if (IS_ERR(cdev)) { pr_err("cpu_cooling: cpu%d is not running as cooling device: %ld\n", policy->cpu, PTR_ERR(cdev)); @@ -814,7 +719,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) thermal_cooling_device_unregister(cpufreq_cdev->cdev); ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); kfree(cpufreq_cdev->idle_time); - kfree(cpufreq_cdev->freq_table); kfree(cpufreq_cdev); } EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister); From 834fd1abeb5a8632f9fa2e2f04893fb5e76712eb Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 10 Jul 2018 16:57:36 +0100 Subject: [PATCH 75/75] ANDROID: arm64: defconfig: Enable CONFIG_SCHED_TUNE Signed-off-by: Quentin Perret Change-Id: Id31c1a0811a1ee45e5533d948dc2faef50624a5a --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 31fd24bff688..2fc45d5505a4 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -106,6 +106,7 @@ CONFIG_COMPAT=y CONFIG_HIBERNATION=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y CONFIG_ENERGY_MODEL=y +CONFIG_SCHED_TUNE=y CONFIG_ARM_CPUIDLE=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_STAT=y