From 17a8b0b6dde67f8561cf2ccbe945d5089cd70e08 Mon Sep 17 00:00:00 2001 From: Sibi Sankar Date: Wed, 12 May 2021 13:41:22 +0530 Subject: [PATCH 01/11] cpufreq: blacklist SC7280 in cpufreq-dt-platdev Add SC7280 to cpufreq-dt-platdev blacklist since the actual scaling is handled by the 'qcom-cpufreq-hw' driver. Reviewed-by: Douglas Anderson Signed-off-by: Sibi Sankar Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 5e07065ec22f..345418b8250e 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -137,6 +137,7 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "qcom,msm8996", }, { .compatible = "qcom,qcs404", }, { .compatible = "qcom,sc7180", }, + { .compatible = "qcom,sc7280", }, { .compatible = "qcom,sdm845", }, { .compatible = "st,stih407", }, From 88bf5a85fe9840c9b49c5f6c625cdccd11233943 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Mon, 17 May 2021 16:54:58 +0100 Subject: [PATCH 02/11] dt-bindings: dvfs: Add support for generic performance domains The CLKSCREW attack [0] exposed security vulnerabilities in energy management implementations where untrusted software had direct access to clock and voltage hardware controls. In this attack, the malicious software was able to place the platform into unsafe overclocked or undervolted configurations. Such configurations then enabled the injection of predictable faults to reveal secrets. Many Arm-based systems used to or still use voltage regulator and clock frameworks in the kernel. These frameworks allow callers to independently manipulate frequency and voltage settings. Such implementations can render systems susceptible to this form of attack. 
Attacks such as CLKSCREW are now being mitigated by not having direct and independent control of clock and voltage in the kernel and moving that control to a trusted entity, such as the SCP firmware or secure world firmware/software which are to perform sanity checking on the requested performance levels, thereby preventing any attempted malicious programming. With the advent of such an abstraction, there is a need to replace the generic clock and regulator bindings used by such devices with a generic performance domains bindings. [0] https://www.usenix.org/conference/usenixsecurity17/technical-sessions/presentation/tang Cc: Rob Herring Acked-by: Viresh Kumar Signed-off-by: Sudeep Holla Reviewed-by: Rob Herring Signed-off-by: Viresh Kumar --- .../devicetree/bindings/arm/cpus.yaml | 7 ++ .../bindings/dvfs/performance-domain.yaml | 74 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 Documentation/devicetree/bindings/dvfs/performance-domain.yaml diff --git a/Documentation/devicetree/bindings/arm/cpus.yaml b/Documentation/devicetree/bindings/arm/cpus.yaml index f3c7249c73d6..9a2432a88074 100644 --- a/Documentation/devicetree/bindings/arm/cpus.yaml +++ b/Documentation/devicetree/bindings/arm/cpus.yaml @@ -257,6 +257,13 @@ properties: where voltage is in V, frequency is in MHz. + performance-domains: + maxItems: 1 + description: + List of phandles and performance domain specifiers, as defined by + bindings of the performance domain provider. See also + dvfs/performance-domain.yaml. 
+ power-domains: description: List of phandles and PM domain specifiers, as defined by bindings of the diff --git a/Documentation/devicetree/bindings/dvfs/performance-domain.yaml b/Documentation/devicetree/bindings/dvfs/performance-domain.yaml new file mode 100644 index 000000000000..c8b91207f34d --- /dev/null +++ b/Documentation/devicetree/bindings/dvfs/performance-domain.yaml @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/dvfs/performance-domain.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Generic performance domains + +maintainers: + - Sudeep Holla + +description: |+ + This binding is intended for performance management of groups of devices or + CPUs that run in the same performance domain. Performance domains must not + be confused with power domains. A performance domain is defined by a set + of devices that always have to run at the same performance level. For a given + performance domain, there is a single point of control that affects all the + devices in the domain, making it impossible to set the performance level of + an individual device in the domain independently from other devices in + that domain. For example, a set of CPUs that share a voltage domain, and + have a common frequency control, is said to be in the same performance + domain. + + This device tree binding can be used to bind performance domain consumer + devices with their performance domains provided by performance domain + providers. A performance domain provider can be represented by any node in + the device tree and can provide one or more performance domains. A consumer + node can refer to the provider by a phandle and a set of phandle arguments + (so called performance domain specifiers) of length specified by the + \#performance-domain-cells property in the performance domain provider node. 
+ +select: true + +properties: + "#performance-domain-cells": + description: + Number of cells in a performance domain specifier. Typically 0 for nodes + representing a single performance domain and 1 for nodes providing + multiple performance domains (e.g. performance controllers), but can be + any value as specified by device tree binding documentation of particular + provider. + enum: [ 0, 1 ] + + performance-domains: + $ref: '/schemas/types.yaml#/definitions/phandle-array' + maxItems: 1 + description: + A phandle and performance domain specifier as defined by bindings of the + performance controller/provider specified by phandle. + +additionalProperties: true + +examples: + - | + performance: performance-controller@12340000 { + compatible = "qcom,cpufreq-hw"; + reg = <0x12340000 0x1000>; + #performance-domain-cells = <1>; + }; + + // The node above defines a performance controller that is a performance + // domain provider and expects one cell as its phandle argument. + + cpus { + #address-cells = <2>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <0x0 0x0>; + performance-domains = <&performance 1>; + }; + }; From 70d99a8f0442bbc5abfa34ea27ce1fcacff57f90 Mon Sep 17 00:00:00 2001 From: Fabien Parent Date: Wed, 19 May 2021 18:25:50 +0200 Subject: [PATCH 03/11] cpufreq: mediatek: add support for mt8365 Add compatible string for MediaTek MT8365 SoC. Also add the compatible string to the blacklist of the cpufreq-dt-platdev driver. 
Signed-off-by: Fabien Parent Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 1 + drivers/cpufreq/mediatek-cpufreq.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 345418b8250e..0bb10402f02c 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -126,6 +126,7 @@ static const struct of_device_id blacklist[] __initconst = { { .compatible = "mediatek,mt8173", }, { .compatible = "mediatek,mt8176", }, { .compatible = "mediatek,mt8183", }, + { .compatible = "mediatek,mt8365", }, { .compatible = "mediatek,mt8516", }, { .compatible = "nvidia,tegra20", }, diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index f2e491b25b07..87019d5a9547 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -537,6 +537,7 @@ static const struct of_device_id mtk_cpufreq_machines[] __initconst = { { .compatible = "mediatek,mt8173", }, { .compatible = "mediatek,mt8176", }, { .compatible = "mediatek,mt8183", }, + { .compatible = "mediatek,mt8365", }, { .compatible = "mediatek,mt8516", }, { } From b791c7f94680ba9b60b0c0786b1d0eb4393053d6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 6 May 2021 21:09:48 +0200 Subject: [PATCH 04/11] cpufreq: scmi: Fix an error message 'ret' is known to be 0 here. The last error code is stored in 'nr_opp', so use it in the error message. 
Fixes: 71a37cd6a59d ("scmi-cpufreq: Remove deferred probe") Signed-off-by: Christophe JAILLET Reviewed-by: Sudeep Holla Signed-off-by: Viresh Kumar --- drivers/cpufreq/scmi-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index c8a4364ad3c2..ec9a87ca2dbb 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -174,7 +174,7 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) nr_opp = dev_pm_opp_get_opp_count(cpu_dev); if (nr_opp <= 0) { dev_err(cpu_dev, "%s: No OPPs for this device: %d\n", - __func__, ret); + __func__, nr_opp); ret = -ENODEV; goto out_free_opp; From 4814d9c5d3b956c5a8f47acbb6b98fdd4dfe334f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 20 May 2021 09:56:18 +0530 Subject: [PATCH 05/11] cpufreq: dt: Rename black/white-lists Rename them in accordance with the coding guidelines. Reviewed-by: Rafael J. Wysocki Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 0bb10402f02c..bef7528aecd3 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -15,7 +15,7 @@ * Machines for which the cpufreq device is *always* created, mostly used for * platforms using "operating-points" (V1) property. */ -static const struct of_device_id whitelist[] __initconst = { +static const struct of_device_id allowlist[] __initconst = { { .compatible = "allwinner,sun4i-a10", }, { .compatible = "allwinner,sun5i-a10s", }, { .compatible = "allwinner,sun5i-a13", }, @@ -100,7 +100,7 @@ static const struct of_device_id whitelist[] __initconst = { * Machines for which the cpufreq device is *not* created, mostly used for * platforms using "operating-points-v2" property. 
*/ -static const struct of_device_id blacklist[] __initconst = { +static const struct of_device_id blocklist[] __initconst = { { .compatible = "allwinner,sun50i-h6", }, { .compatible = "arm,vexpress", }, @@ -179,13 +179,13 @@ static int __init cpufreq_dt_platdev_init(void) if (!np) return -ENODEV; - match = of_match_node(whitelist, np); + match = of_match_node(allowlist, np); if (match) { data = match->data; goto create_pdev; } - if (cpu0_node_has_opp_v2_prop() && !of_match_node(blacklist, np)) + if (cpu0_node_has_opp_v2_prop() && !of_match_node(blocklist, np)) goto create_pdev; of_node_put(np); From eed828895b2426a286717c1ddea8af45fa08bfc3 Mon Sep 17 00:00:00 2001 From: Seiya Wang Date: Tue, 1 Jun 2021 15:10:41 +0800 Subject: [PATCH 06/11] clk: mediatek: remove deprecated CLK_INFRA_CA57SEL for MT8173 SoC Remove CLK_INFRA_CA57SEL for MT8173 since it's no longer used. Acked-by: Rob Herring Reviewed-by: Matthias Brugger Signed-off-by: Seiya Wang Signed-off-by: Viresh Kumar --- include/dt-bindings/clock/mt8173-clk.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/dt-bindings/clock/mt8173-clk.h b/include/dt-bindings/clock/mt8173-clk.h index 3acebe937bfc..3d00c98b9654 100644 --- a/include/dt-bindings/clock/mt8173-clk.h +++ b/include/dt-bindings/clock/mt8173-clk.h @@ -186,7 +186,6 @@ #define CLK_INFRA_PMICWRAP 11 #define CLK_INFRA_CLK_13M 12 #define CLK_INFRA_CA53SEL 13 -#define CLK_INFRA_CA57SEL 14 /* Deprecated. Don't use it. */ #define CLK_INFRA_CA72SEL 14 #define CLK_INFRA_NR_CLK 15 From 9821a195d4e263801884b105554e801642c59f2a Mon Sep 17 00:00:00 2001 From: Seiya Wang Date: Tue, 1 Jun 2021 15:10:42 +0800 Subject: [PATCH 07/11] dt-bindings: cpufreq: update cpu type and clock name for MT8173 SoC Update the cpu type of cpu2 and cpu3 since MT8173 used Cortex-a72. 
Acked-by: Viresh Kumar Acked-by: Rob Herring Reviewed-by: Matthias Brugger Signed-off-by: Seiya Wang Signed-off-by: Viresh Kumar --- .../devicetree/bindings/cpufreq/cpufreq-mediatek.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt b/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt index ea4994b35207..ef68711716fb 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt @@ -202,11 +202,11 @@ Example 2 (MT8173 SoC): cpu2: cpu@100 { device_type = "cpu"; - compatible = "arm,cortex-a57"; + compatible = "arm,cortex-a72"; reg = <0x100>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; - clocks = <&infracfg CLK_INFRA_CA57SEL>, + clocks = <&infracfg CLK_INFRA_CA72SEL>, <&apmixedsys CLK_APMIXED_MAINPLL>; clock-names = "cpu", "intermediate"; operating-points-v2 = <&cpu_opp_table_b>; @@ -214,11 +214,11 @@ Example 2 (MT8173 SoC): cpu3: cpu@101 { device_type = "cpu"; - compatible = "arm,cortex-a57"; + compatible = "arm,cortex-a72"; reg = <0x101>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; - clocks = <&infracfg CLK_INFRA_CA57SEL>, + clocks = <&infracfg CLK_INFRA_CA72SEL>, <&apmixedsys CLK_APMIXED_MAINPLL>; clock-names = "cpu", "intermediate"; operating-points-v2 = <&cpu_opp_table_b>; From fe2535a44904a77615a3af8e8fd7dafb98fb0e1b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Jun 2021 13:31:27 +0530 Subject: [PATCH 08/11] cpufreq: CPPC: Fix potential memleak in cppc_cpufreq_cpu_init It's a classic example of memleak, we allocate something, we fail and never free the resources. Make sure we free all resources on policy ->init() failures. 
Fixes: a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list") Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar --- drivers/cpufreq/cppc_cpufreq.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index be4f62e2c5f1..945ab4942c1c 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -256,6 +256,16 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) return NULL; } +static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data = policy->driver_data; + + list_del(&cpu_data->node); + free_cpumask_var(cpu_data->shared_cpu_map); + kfree(cpu_data); + policy->driver_data = NULL; +} + static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; @@ -309,7 +319,8 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) default: pr_debug("Unsupported CPU co-ord type: %d\n", policy->shared_type); - return -EFAULT; + ret = -EFAULT; + goto out; } /* @@ -324,10 +335,16 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) cpu_data->perf_ctrls.desired_perf = caps->highest_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) + if (ret) { pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", caps->highest_perf, cpu, ret); + goto out; + } + return 0; + +out: + cppc_cpufreq_put_cpu_data(policy); return ret; } @@ -345,12 +362,7 @@ static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) pr_debug("Err setting perf value:%d on CPU:%d. 
ret:%d\n", caps->lowest_perf, cpu, ret); - /* Remove CPU node from list and free driver data for policy */ - free_cpumask_var(cpu_data->shared_cpu_map); - list_del(&cpu_data->node); - kfree(policy->driver_data); - policy->driver_data = NULL; - + cppc_cpufreq_put_cpu_data(policy); return 0; } From eead1840cbd31e553bf8ccdefbd5b065bf596b71 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Jun 2021 13:42:23 +0530 Subject: [PATCH 09/11] cpufreq: CPPC: Pass structure instance by reference Don't pass structure instance by value, pass it by reference instead. Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar --- drivers/cpufreq/cppc_cpufreq.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 945ab4942c1c..4a7f0f9b8c60 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -375,18 +375,18 @@ static inline u64 get_delta(u64 t1, u64 t0) } static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs fb_ctrs_t0, - struct cppc_perf_fb_ctrs fb_ctrs_t1) + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; u64 reference_perf, delivered_perf; - reference_perf = fb_ctrs_t0.reference_perf; + reference_perf = fb_ctrs_t0->reference_perf; - delta_reference = get_delta(fb_ctrs_t1.reference, - fb_ctrs_t0.reference); - delta_delivered = get_delta(fb_ctrs_t1.delivered, - fb_ctrs_t0.delivered); + delta_reference = get_delta(fb_ctrs_t1->reference, + fb_ctrs_t0->reference); + delta_delivered = get_delta(fb_ctrs_t1->delivered, + fb_ctrs_t0->delivered); /* Check to avoid divide-by zero */ if (delta_reference || delta_delivered) @@ -417,7 +417,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) if (ret) return ret; - return cppc_get_rate_from_fbctrs(cpu_data, fb_ctrs_t0, fb_ctrs_t1); + return 
cppc_get_rate_from_fbctrs(cpu_data, &fb_ctrs_t0, &fb_ctrs_t1); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) From 83150f5d05f065fb5c12c612f119015cabdcc124 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 15 Jun 2021 14:27:50 +0530 Subject: [PATCH 10/11] arch_topology: Avoid use-after-free for scale_freq_data Currently topology_scale_freq_tick() (which gets called from scheduler_tick()) may end up using a pointer to "struct scale_freq_data", which was previously cleared by topology_clear_scale_freq_source(), as there is no protection in place here. The users of topology_clear_scale_freq_source(), though, need a guarantee that the previously cleared scale_freq_data isn't used anymore, so they can free the related resources. Since topology_scale_freq_tick() is called from scheduler tick, we don't want to add locking in there. Use the RCU update mechanism instead (which is already used by the scheduler's utilization update path) to guarantee race-free updates here. synchronize_rcu() makes sure that all RCU critical sections that started before it is called will finish before it returns. And so the callers of topology_clear_scale_freq_source() don't need to worry about their callback getting called anymore. Cc: Paul E. 
McKenney Fixes: 01e055c120a4 ("arch_topology: Allow multiple entities to provide sched_freq_tick() callback") Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar --- drivers/base/arch_topology.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index c1179edc0f3b..921312a8d957 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -18,10 +18,11 @@ #include #include #include +#include #include #include -static DEFINE_PER_CPU(struct scale_freq_data *, sft_data); +static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; @@ -66,16 +67,20 @@ void topology_set_scale_freq_source(struct scale_freq_data *data, if (cpumask_empty(&scale_freq_counters_mask)) scale_freq_invariant = topology_scale_freq_invariant(); + rcu_read_lock(); + for_each_cpu(cpu, cpus) { - sfd = per_cpu(sft_data, cpu); + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); /* Use ARCH provided counters whenever possible */ if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) { - per_cpu(sft_data, cpu) = data; + rcu_assign_pointer(per_cpu(sft_data, cpu), data); cpumask_set_cpu(cpu, &scale_freq_counters_mask); } } + rcu_read_unlock(); + update_scale_freq_invariant(true); } EXPORT_SYMBOL_GPL(topology_set_scale_freq_source); @@ -86,22 +91,32 @@ void topology_clear_scale_freq_source(enum scale_freq_source source, struct scale_freq_data *sfd; int cpu; + rcu_read_lock(); + for_each_cpu(cpu, cpus) { - sfd = per_cpu(sft_data, cpu); + sfd = rcu_dereference(*per_cpu_ptr(&sft_data, cpu)); if (sfd && sfd->source == source) { - per_cpu(sft_data, cpu) = NULL; + rcu_assign_pointer(per_cpu(sft_data, cpu), NULL); cpumask_clear_cpu(cpu, &scale_freq_counters_mask); } } + rcu_read_unlock(); + + /* + * Make sure all references to previous sft_data are dropped to 
avoid + * use-after-free races. + */ + synchronize_rcu(); + update_scale_freq_invariant(false); } EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source); void topology_scale_freq_tick(void) { - struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data); + struct scale_freq_data *sfd = rcu_dereference_sched(*this_cpu_ptr(&sft_data)); if (sfd) sfd->set_freq_scale(); From 1eb5dde674f57b1a1918dab33f09e35cdd64eb07 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 23 Jun 2020 15:49:40 +0530 Subject: [PATCH 11/11] cpufreq: CPPC: Add support for frequency invariance The Frequency Invariance Engine (FIE) is providing a frequency scaling correction factor that helps achieve more accurate load-tracking. Normally, this scaling factor can be obtained directly with the help of the cpufreq drivers as they know the exact frequency the hardware is running at. But that isn't the case for CPPC cpufreq driver. Another way of obtaining that is using the arch specific counter support, which is already present in kernel, but that hardware is optional for platforms. This patch updates the CPPC driver to register itself with the topology core to provide its own implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which gets called by the scheduler on every tick. Note that the arch specific counters have higher priority than CPPC counters, if available, though the CPPC driver doesn't need to have any special handling for that. On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we reach here from hard-irq context), which then schedules a normal work item and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable based on the counter updates since the last tick. To allow platforms to disable this CPPC counter-based frequency invariance support, this is all done under CONFIG_ACPI_CPPC_CPUFREQ_FIE, which is enabled by default. This also exports sched_setattr_nocheck() as the CPPC driver can be built as a module. 
Cc: linux-acpi@vger.kernel.org Tested-by: Vincent Guittot Reviewed-by: Ionela Voinescu Tested-by: Qian Cai Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 10 ++ drivers/cpufreq/cppc_cpufreq.c | 252 +++++++++++++++++++++++++++++++-- include/linux/arch_topology.h | 1 + kernel/sched/core.c | 1 + 4 files changed, 251 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index e65e0a43be64..a5c5f70acfc9 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ If in doubt, say N. +config ACPI_CPPC_CPUFREQ_FIE + bool "Frequency Invariance support for CPPC cpufreq driver" + depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY + default y + help + This extends frequency invariance support in the CPPC cpufreq driver, + by using CPPC delivered and reference performance counters. + + If in doubt, say N. + config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM tristate "Allwinner nvmem based SUN50I CPUFreq driver" depends on ARCH_SUNXI diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 4a7f0f9b8c60..d4c27022b9c9 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -10,14 +10,18 @@ #define pr_fmt(fmt) "CPPC Cpufreq:" fmt +#include #include #include #include #include #include #include +#include +#include #include #include +#include #include @@ -57,6 +61,216 @@ static struct cppc_workaround_oem_info wa_info[] = { } }; +#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE + +/* Frequency invariance support */ +struct cppc_freq_invariance { + int cpu; + struct irq_work irq_work; + struct kthread_work work; + struct cppc_perf_fb_ctrs prev_perf_fb_ctrs; + struct cppc_cpudata *cpu_data; +}; + +static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); +static struct kthread_worker *kworker_fie; + +static struct cpufreq_driver cppc_cpufreq_driver; +static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu); +static 
int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1); + +/** + * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance + * @work: The work item. + * + * The CPPC driver register itself with the topology core to provide its own + * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which + * gets called by the scheduler on every tick. + * + * Note that the arch specific counters have higher priority than CPPC counters, + * if available, though the CPPC driver doesn't need to have any special + * handling for that. + * + * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we + * reach here from hard-irq context), which then schedules a normal work item + * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable + * based on the counter updates since the last tick. + */ +static void cppc_scale_freq_workfn(struct kthread_work *work) +{ + struct cppc_freq_invariance *cppc_fi; + struct cppc_perf_fb_ctrs fb_ctrs = {0}; + struct cppc_cpudata *cpu_data; + unsigned long local_freq_scale; + u64 perf; + + cppc_fi = container_of(work, struct cppc_freq_invariance, work); + cpu_data = cppc_fi->cpu_data; + + if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) { + pr_warn("%s: failed to read perf counters\n", __func__); + return; + } + + perf = cppc_perf_from_fbctrs(cpu_data, &cppc_fi->prev_perf_fb_ctrs, + &fb_ctrs); + cppc_fi->prev_perf_fb_ctrs = fb_ctrs; + + perf <<= SCHED_CAPACITY_SHIFT; + local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf); + + /* This can happen due to counter's overflow */ + if (unlikely(local_freq_scale > 1024)) + local_freq_scale = 1024; + + per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale; +} + +static void cppc_irq_work(struct irq_work *irq_work) +{ + struct cppc_freq_invariance *cppc_fi; + + cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work); + 
kthread_queue_work(kworker_fie, &cppc_fi->work); +} + +static void cppc_scale_freq_tick(void) +{ + struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id()); + + /* + * cppc_get_perf_ctrs() can potentially sleep, call that from the right + * context. + */ + irq_work_queue(&cppc_fi->irq_work); +} + +static struct scale_freq_data cppc_sftd = { + .source = SCALE_FREQ_SOURCE_CPPC, + .set_freq_scale = cppc_scale_freq_tick, +}; + +static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ + struct cppc_freq_invariance *cppc_fi; + int cpu, ret; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + for_each_cpu(cpu, policy->cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + cppc_fi->cpu = cpu; + cppc_fi->cpu_data = policy->driver_data; + kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn); + init_irq_work(&cppc_fi->irq_work, cppc_irq_work); + + ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs); + if (ret) { + pr_warn("%s: failed to read perf counters for cpu:%d: %d\n", + __func__, cpu, ret); + + /* + * Don't abort if the CPU was offline while the driver + * was getting registered. + */ + if (cpu_online(cpu)) + return; + } + } + + /* Register for freq-invariance */ + topology_set_scale_freq_source(&cppc_sftd, policy->cpus); +} + +/* + * We free all the resources on policy's removal and not on CPU removal as the + * irq-work are per-cpu and the hotplug core takes care of flushing the pending + * irq-works (hint: smpcfd_dying_cpu()) on CPU hotplug. Even if the kthread-work + * fires on another CPU after the concerned CPU is removed, it won't harm. + * + * We just need to make sure to remove them all on policy->exit(). 
+ */ +static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ + struct cppc_freq_invariance *cppc_fi; + int cpu; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + /* policy->cpus will be empty here, use related_cpus instead */ + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, policy->related_cpus); + + for_each_cpu(cpu, policy->related_cpus) { + cppc_fi = &per_cpu(cppc_freq_inv, cpu); + irq_work_sync(&cppc_fi->irq_work); + kthread_cancel_work_sync(&cppc_fi->work); + } +} + +static void __init cppc_freq_invariance_init(void) +{ + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; + int ret; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kworker_fie = kthread_create_worker(0, "cppc_fie"); + if (IS_ERR(kworker_fie)) + return; + + ret = sched_setattr_nocheck(kworker_fie->task, &attr); + if (ret) { + pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__, + ret); + kthread_destroy_worker(kworker_fie); + return; + } +} + +static void cppc_freq_invariance_exit(void) +{ + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kthread_destroy_worker(kworker_fie); + kworker_fie = NULL; +} + +#else +static inline void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_freq_invariance_init(void) +{ +} + +static inline void cppc_freq_invariance_exit(void) +{ +} +#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */ + /* Callback function used to retrieve the max frequency from DMI */ static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) { @@ -341,6 +555,7 @@ static int 
cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) goto out; } + cppc_cpufreq_cpu_fie_init(policy); return 0; out: @@ -355,6 +570,8 @@ static int cppc_cpufreq_cpu_exit(struct cpufreq_policy *policy) unsigned int cpu = policy->cpu; int ret; + cppc_cpufreq_cpu_fie_exit(policy); + cpu_data->perf_ctrls.desired_perf = caps->lowest_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); @@ -374,12 +591,12 @@ static inline u64 get_delta(u64 t1, u64 t0) return (u32)t1 - (u32)t0; } -static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs *fb_ctrs_t0, - struct cppc_perf_fb_ctrs *fb_ctrs_t1) +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; - u64 reference_perf, delivered_perf; + u64 reference_perf; reference_perf = fb_ctrs_t0->reference_perf; @@ -388,14 +605,11 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, delta_delivered = get_delta(fb_ctrs_t1->delivered, fb_ctrs_t0->delivered); - /* Check to avoid divide-by zero */ - if (delta_reference || delta_delivered) - delivered_perf = (reference_perf * delta_delivered) / - delta_reference; - else - delivered_perf = cpu_data->perf_ctrls.desired_perf; + /* Check to avoid divide-by zero and invalid delivered_perf */ + if (!delta_reference || !delta_delivered) + return cpu_data->perf_ctrls.desired_perf; - return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); + return (reference_perf * delta_delivered) / delta_reference; } static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) @@ -403,6 +617,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0}; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct cppc_cpudata *cpu_data = policy->driver_data; + u64 delivered_perf; int ret; cpufreq_cpu_put(policy); @@ -417,7 +632,10 @@ static unsigned int 
cppc_cpufreq_get_rate(unsigned int cpu) if (ret) return ret; - return cppc_get_rate_from_fbctrs(cpu_data, &fb_ctrs_t0, &fb_ctrs_t1); + delivered_perf = cppc_perf_from_fbctrs(cpu_data, &fb_ctrs_t0, + &fb_ctrs_t1); + + return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) @@ -518,14 +736,21 @@ static void cppc_check_hisi_workaround(void) static int __init cppc_cpufreq_init(void) { + int ret; + if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; INIT_LIST_HEAD(&cpu_data_list); cppc_check_hisi_workaround(); + cppc_freq_invariance_init(); - return cpufreq_register_driver(&cppc_cpufreq_driver); + ret = cpufreq_register_driver(&cppc_cpufreq_driver); + if (ret) + cppc_freq_invariance_exit(); + + return ret; } static inline void free_cpu_data(void) @@ -543,6 +768,7 @@ static inline void free_cpu_data(void) static void __exit cppc_cpufreq_exit(void) { cpufreq_unregister_driver(&cppc_cpufreq_driver); + cppc_freq_invariance_exit(); free_cpu_data(); } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 11e555cfaecb..f180240dc95f 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -37,6 +37,7 @@ bool topology_scale_freq_invariant(void); enum scale_freq_source { SCALE_FREQ_SOURCE_CPUFREQ = 0, SCALE_FREQ_SOURCE_ARCH, + SCALE_FREQ_SOURCE_CPPC, }; struct scale_freq_data { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf16f8fda9a6..2d9ff40f4661 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7182,6 +7182,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.