From 4742da9774a416908ef8e3916164192c15c0e2d1 Mon Sep 17 00:00:00 2001 From: zuoqian Date: Sat, 25 Jan 2025 08:49:49 +0000 Subject: [PATCH 001/139] cpufreq: scpi: compare kHz instead of Hz The CPU rate from clk_get_rate() may not be divisible by 1000 (e.g., 133333333). But the rate calculated from frequency(kHz) is always divisible by 1000 (e.g., 133333000). Comparing the rate causes a warning during CPU scaling: "cpufreq: __target_index: Failed to change cpu frequency: -5". When we choose to compare kHz here, the issue does not occur. Fixes: 343a8d17fa8d ("cpufreq: scpi: remove arm_big_little dependency") Signed-off-by: zuoqian Reviewed-by: Dan Carpenter Signed-off-by: Viresh Kumar --- drivers/cpufreq/scpi-cpufreq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index cd89c1b9832c..9e09565e41c0 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -39,8 +39,9 @@ static unsigned int scpi_cpufreq_get_rate(unsigned int cpu) static int scpi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index) { - u64 rate = policy->freq_table[index].frequency * 1000; + unsigned long freq_khz = policy->freq_table[index].frequency; struct scpi_data *priv = policy->driver_data; + unsigned long rate = freq_khz * 1000; int ret; ret = clk_set_rate(priv->clk, rate); @@ -48,7 +49,7 @@ scpi_cpufreq_set_target(struct cpufreq_policy *policy, unsigned int index) if (ret) return ret; - if (clk_get_rate(priv->clk) != rate) + if (clk_get_rate(priv->clk) / 1000 != freq_khz) return -EIO; return 0; From db1cafc77aaaf871509da06f4a864e9af6d6791f Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Thu, 30 Jan 2025 08:52:52 +0000 Subject: [PATCH 002/139] cpufreq: amd-pstate: Remove unnecessary driver_lock in set_boost set_boost is a per-policy function call, hence a driver wide lock is unnecessary. 
Also this mutex_acquire can collide with the mutex_acquire from the mode-switch path in status_store(), which can lead to a deadlock. So, remove it. Signed-off-by: Dhananjay Ugwekar Acked-by: Mario Limonciello Signed-off-by: Viresh Kumar --- drivers/cpufreq/amd-pstate.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index dd9b8d6993d6..a7e70fe0c57d 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -744,7 +744,6 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) pr_err("Boost mode is not supported by this processor or SBIOS\n"); return -EOPNOTSUPP; } - guard(mutex)(&amd_pstate_driver_lock); ret = amd_pstate_cpu_boost_update(policy, state); policy->boost_enabled = !ret ? state : false; From 4ba6d37ccca1e4ac07ad660006bf130726a11ff7 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Fri, 7 Feb 2025 11:59:53 +0800 Subject: [PATCH 003/139] cpufreq: Use str_enable_disable() helper Commit f994c1cb6c43 ("cpufreq: Use str_enable_disable()-like helpers") has already introduced helpers from string_choices.h and replaced ternary syntax with it. Use str_enable_disable() helper in this line to stay consistent. Signed-off-by: Lifeng Zheng Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 30ffbddc7ece..a12e1da89163 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1578,7 +1578,7 @@ static int cpufreq_online(unsigned int cpu) if (ret) { /* If the set_boost fails, the online operation is not affected */ pr_info("%s: CPU%d: Cannot %s BOOST\n", __func__, policy->cpu, - policy->boost_enabled ? 
"enable" : "disable"); + str_enable_disable(policy->boost_enabled)); policy->boost_enabled = !policy->boost_enabled; } } From dc47f23f1df65a6b61b5d1f8d25cc4ff30a67a00 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 15:31:49 +0530 Subject: [PATCH 004/139] cpufreq: Always create freq-table related sysfs file Currently it is left for the individual drivers to set the available and boost frequencies related attributes in the cpufreq_driver->attr field. Some drivers provide them, while others don't. A quick search revealed that only the drivers that set the policy->freq_table field, enable these attributes. Which makes sense as well, since the show_available_freqs() helper works only if the freq_table is present. In order to simplify drivers, create the relevant sysfs files forcefully from cpufreq core. For now, skip adding them twice. This can be removed once all the drivers are updated. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a12e1da89163..71ecf6a004e8 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1059,9 +1059,31 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) struct freq_attr **drv_attr; int ret = 0; + /* Attributes that need freq_table */ + if (policy->freq_table) { + ret = sysfs_create_file(&policy->kobj, + &cpufreq_freq_attr_scaling_available_freqs.attr); + if (ret) + return ret; + + if (cpufreq_boost_supported()) { + ret = sysfs_create_file(&policy->kobj, + &cpufreq_freq_attr_scaling_boost_freqs.attr); + if (ret) + return ret; + } + } + /* set up files for this cpu device */ drv_attr = cpufreq_driver->attr; while (drv_attr && *drv_attr) { + /* These are already added, skip them */ + if (*drv_attr == &cpufreq_freq_attr_scaling_available_freqs || + *drv_attr == &cpufreq_freq_attr_scaling_boost_freqs) { + drv_attr++; + 
continue; + } + ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); if (ret) return ret; From 991e0a064bf39f8d3da08eacfaa1e72cd6cde0d3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 005/139] cpufreq: dt: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 3a7c3372bda7..072ccf0c2e41 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -36,12 +36,6 @@ struct private_data { static LIST_HEAD(priv_list); -static struct freq_attr *cpufreq_dt_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, /* Extra space for boost-attr if required */ - NULL, -}; - static struct private_data *cpufreq_dt_find_data(int cpu) { struct private_data *priv; @@ -126,7 +120,6 @@ static int cpufreq_init(struct cpufreq_policy *policy) ret = cpufreq_enable_boost_support(); if (ret) goto out_clk_put; - cpufreq_dt_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs; } return 0; @@ -169,7 +162,6 @@ static struct cpufreq_driver dt_cpufreq_driver = { .offline = cpufreq_offline, .register_em = cpufreq_register_em_with_opp, .name = "cpufreq-dt", - .attr = cpufreq_dt_attr, .suspend = cpufreq_generic_suspend, }; From 5c840223abc58dcc143713384e5efbae0a6dc050 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 006/139] cpufreq: acpi: Stop setting common freq attributes The core handles this now, the driver can skip setting it. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 463b69a2dff5..c598295d1c52 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -949,7 +949,6 @@ static int acpi_cpufreq_resume(struct cpufreq_policy *policy) } static struct freq_attr *acpi_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, &freqdomain_cpus, #ifdef CONFIG_X86_ACPI_CPUFREQ_CPB &cpb, From 8b04d1435ffe110411a868c3ea3b9f1eb78b72bc Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 007/139] cpufreq: apple: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/apple-soc-cpufreq.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/cpufreq/apple-soc-cpufreq.c b/drivers/cpufreq/apple-soc-cpufreq.c index 269b18c62d04..6ff604a0fe79 100644 --- a/drivers/cpufreq/apple-soc-cpufreq.c +++ b/drivers/cpufreq/apple-soc-cpufreq.c @@ -229,12 +229,6 @@ static int apple_soc_cpufreq_find_cluster(struct cpufreq_policy *policy, return 0; } -static struct freq_attr *apple_soc_cpufreq_hw_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, /* Filled in below if boost is enabled */ - NULL, -}; - static int apple_soc_cpufreq_init(struct cpufreq_policy *policy) { int ret, i; @@ -321,7 +315,6 @@ static int apple_soc_cpufreq_init(struct cpufreq_policy *policy) if (ret) { dev_warn(cpu_dev, "failed to enable boost: %d\n", ret); } else { - apple_soc_cpufreq_hw_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs; apple_soc_cpufreq_driver.boost_enabled = true; } } @@ -360,7 +353,6 @@ static struct cpufreq_driver apple_soc_cpufreq_driver = { .target_index = apple_soc_cpufreq_set_target, .fast_switch = apple_soc_cpufreq_fast_switch, 
.register_em = cpufreq_register_em_with_opp, - .attr = apple_soc_cpufreq_hw_attr, .suspend = cpufreq_generic_suspend, }; From 818c3748ade6b32a6b174bb504e115c9b40ee631 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 008/139] cpufreq: bmips: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Reviewed-by: Florian Fainelli Acked-by: Rafael J. Wysocki --- drivers/cpufreq/bmips-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/bmips-cpufreq.c b/drivers/cpufreq/bmips-cpufreq.c index 17a4c174553d..36051880640b 100644 --- a/drivers/cpufreq/bmips-cpufreq.c +++ b/drivers/cpufreq/bmips-cpufreq.c @@ -150,7 +150,6 @@ static struct cpufreq_driver bmips_cpufreq_driver = { .get = bmips_cpufreq_get, .init = bmips_cpufreq_init, .exit = bmips_cpufreq_exit, - .attr = cpufreq_generic_attr, .name = BMIPS_CPUFREQ_PREFIX, }; From 80f9f241bb2a39faa72165559b42ad1101a9a3b1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 009/139] cpufreq: brcmstb: Stop setting common freq attributes The cpufreq core handles this now, the driver can skip setting it. Signed-off-by: Viresh Kumar Reviewed-by: Florian Fainelli Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/brcmstb-avs-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index 2fd0f6be6fa3..7b841a086acc 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -720,7 +720,6 @@ cpufreq_freq_attr_ro(brcm_avs_voltage); cpufreq_freq_attr_ro(brcm_avs_frequency); static struct freq_attr *brcm_avs_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, &brcm_avs_pstate, &brcm_avs_mode, &brcm_avs_pmap, From 6f80f75511fec7a86847f6913ab8d9a02e7be767 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 010/139] cpufreq: davinci: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/davinci-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c index 8736be3a06ce..2c277eb3795a 100644 --- a/drivers/cpufreq/davinci-cpufreq.c +++ b/drivers/cpufreq/davinci-cpufreq.c @@ -101,7 +101,6 @@ static struct cpufreq_driver davinci_driver = { .get = cpufreq_generic_get, .init = davinci_cpu_init, .name = "davinci", - .attr = cpufreq_generic_attr, }; static int __init davinci_cpufreq_probe(struct platform_device *pdev) From b9b60007e6439ffe843b5a9fec96012c985e4f84 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 011/139] cpufreq: e_powersaver: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/e_powersaver.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/e_powersaver.c b/drivers/cpufreq/e_powersaver.c index 6e958b09e1b5..d23a97ba6478 100644 --- a/drivers/cpufreq/e_powersaver.c +++ b/drivers/cpufreq/e_powersaver.c @@ -376,7 +376,6 @@ static struct cpufreq_driver eps_driver = { .exit = eps_cpu_exit, .get = eps_get, .name = "e_powersaver", - .attr = cpufreq_generic_attr, }; From 32ada732b629a3024a51ede7ec988af5f130b839 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 012/139] cpufreq: elanfreq: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/elanfreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/elanfreq.c b/drivers/cpufreq/elanfreq.c index 4ce5eb35dc46..36494b855e41 100644 --- a/drivers/cpufreq/elanfreq.c +++ b/drivers/cpufreq/elanfreq.c @@ -194,7 +194,6 @@ static struct cpufreq_driver elanfreq_driver = { .target_index = elanfreq_target, .init = elanfreq_cpu_init, .name = "elanfreq", - .attr = cpufreq_generic_attr, }; static const struct x86_cpu_id elan_id[] = { From 03973e997fc4b63aa0305c08b1ec1945fc745824 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 013/139] cpufreq: imx6q: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/imx6q-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index f3c99f378ad6..db1c88e9d3f9 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -207,7 +207,6 @@ static struct cpufreq_driver imx6q_cpufreq_driver = { .init = imx6q_cpufreq_init, .register_em = cpufreq_register_em_with_opp, .name = "imx6q-cpufreq", - .attr = cpufreq_generic_attr, .suspend = cpufreq_generic_suspend, }; From 25e4d8c131b28337e3b85341977dae9af416d790 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 014/139] cpufreq: kirkwood: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/kirkwood-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c index 312f2654d1d5..24b285cbeb8d 100644 --- a/drivers/cpufreq/kirkwood-cpufreq.c +++ b/drivers/cpufreq/kirkwood-cpufreq.c @@ -96,7 +96,6 @@ static struct cpufreq_driver kirkwood_cpufreq_driver = { .target_index = kirkwood_cpufreq_target, .init = kirkwood_cpufreq_cpu_init, .name = "kirkwood-cpufreq", - .attr = cpufreq_generic_attr, }; static int kirkwood_cpufreq_probe(struct platform_device *pdev) From d4a3b9572b83c38d1913afbd0aa498ebb916b06d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 015/139] cpufreq: longhaul: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/longhaul.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index bd6fe8638d39..68ccd73c8129 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -906,7 +906,6 @@ static struct cpufreq_driver longhaul_driver = { .get = longhaul_get, .init = longhaul_cpu_init, .name = "longhaul", - .attr = cpufreq_generic_attr, }; static const struct x86_cpu_id longhaul_id[] = { From 06e9a34aa8fcabb37028de0a124ce34c8c951129 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 016/139] cpufreq: loongson: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/loongson2_cpufreq.c | 1 - drivers/cpufreq/loongson3_cpufreq.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/cpufreq/loongson2_cpufreq.c b/drivers/cpufreq/loongson2_cpufreq.c index ed1a6dbad638..39a6c4315a60 100644 --- a/drivers/cpufreq/loongson2_cpufreq.c +++ b/drivers/cpufreq/loongson2_cpufreq.c @@ -91,7 +91,6 @@ static struct cpufreq_driver loongson2_cpufreq_driver = { .verify = cpufreq_generic_frequency_table_verify, .target_index = loongson2_cpufreq_target, .get = cpufreq_generic_get, - .attr = cpufreq_generic_attr, }; static const struct platform_device_id platform_device_ids[] = { diff --git a/drivers/cpufreq/loongson3_cpufreq.c b/drivers/cpufreq/loongson3_cpufreq.c index bd34bf0fafa5..ea516b939c44 100644 --- a/drivers/cpufreq/loongson3_cpufreq.c +++ b/drivers/cpufreq/loongson3_cpufreq.c @@ -337,7 +337,6 @@ static struct cpufreq_driver loongson3_cpufreq_driver = { .offline = loongson3_cpufreq_cpu_offline, .get = loongson3_cpufreq_get, .target_index = loongson3_cpufreq_target, - .attr = cpufreq_generic_attr, .verify = cpufreq_generic_frequency_table_verify, .suspend = 
cpufreq_generic_suspend, }; From 1a867c7ce6d7f617210c0efcdc242f4d7352a799 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 017/139] cpufreq: mediatek: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/mediatek-cpufreq-hw.c | 1 - drivers/cpufreq/mediatek-cpufreq.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index 9252ebd60373..aa209f5527dc 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -293,7 +293,6 @@ static struct cpufreq_driver cpufreq_mtk_hw_driver = { .register_em = mtk_cpufreq_register_em, .fast_switch = mtk_cpufreq_hw_fast_switch, .name = "mtk-cpufreq-hw", - .attr = cpufreq_generic_attr, }; static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index 663f61565cf7..2656b88db378 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -618,7 +618,6 @@ static struct cpufreq_driver mtk_cpufreq_driver = { .exit = mtk_cpufreq_exit, .register_em = cpufreq_register_em_with_opp, .name = "mtk-cpufreq", - .attr = cpufreq_generic_attr, }; static int mtk_cpufreq_probe(struct platform_device *pdev) From ef282f6bef1456266ba7af6f5408e6b4615c1738 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 018/139] cpufreq: omap: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/omap-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 106220c0fd11..bbb01d93b54b 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -147,7 +147,6 @@ static struct cpufreq_driver omap_driver = { .exit = omap_cpu_exit, .register_em = cpufreq_register_em_with_opp, .name = "omap", - .attr = cpufreq_generic_attr, }; static int omap_cpufreq_probe(struct platform_device *pdev) From 047124e431b061f8e6f9e0647454b462d5bdffe1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 019/139] cpufreq: p4: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/p4-clockmod.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c index ef0a3216a386..69c19233fcd4 100644 --- a/drivers/cpufreq/p4-clockmod.c +++ b/drivers/cpufreq/p4-clockmod.c @@ -227,7 +227,6 @@ static struct cpufreq_driver p4clockmod_driver = { .init = cpufreq_p4_cpu_init, .get = cpufreq_p4_get, .name = "p4-clockmod", - .attr = cpufreq_generic_attr, }; static const struct x86_cpu_id cpufreq_p4_id[] = { From d3d57f9d2eeeddfa847b44944b7d3fa283825715 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 020/139] cpufreq: pasemi: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/pasemi-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c index 5fc9cb480516..a3931349360f 100644 --- a/drivers/cpufreq/pasemi-cpufreq.c +++ b/drivers/cpufreq/pasemi-cpufreq.c @@ -245,7 +245,6 @@ static struct cpufreq_driver pas_cpufreq_driver = { .exit = pas_cpufreq_cpu_exit, .verify = cpufreq_generic_frequency_table_verify, .target_index = pas_cpufreq_target, - .attr = cpufreq_generic_attr, }; /* From 6cdc8c3ca954a56e54fc844475c09a0ad8f7f1d0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 021/139] cpufreq: pmac: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/pmac32-cpufreq.c | 1 - drivers/cpufreq/pmac64-cpufreq.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/cpufreq/pmac32-cpufreq.c b/drivers/cpufreq/pmac32-cpufreq.c index 6c9f0888a2a7..a22c22bd693a 100644 --- a/drivers/cpufreq/pmac32-cpufreq.c +++ b/drivers/cpufreq/pmac32-cpufreq.c @@ -439,7 +439,6 @@ static struct cpufreq_driver pmac_cpufreq_driver = { .suspend = pmac_cpufreq_suspend, .resume = pmac_cpufreq_resume, .flags = CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING, - .attr = cpufreq_generic_attr, .name = "powermac", }; diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c index 74ff6c47df29..80897ec8f00e 100644 --- a/drivers/cpufreq/pmac64-cpufreq.c +++ b/drivers/cpufreq/pmac64-cpufreq.c @@ -332,7 +332,6 @@ static struct cpufreq_driver g5_cpufreq_driver = { .verify = cpufreq_generic_frequency_table_verify, .target_index = g5_cpufreq_target, .get = g5_cpufreq_get_speed, - .attr = cpufreq_generic_attr, }; From 5b6fc62eff3dfc38a44f0344b5b8146c0b41b837 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: 
[PATCH 022/139] cpufreq: powernow: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/powernow-k6.c | 1 - drivers/cpufreq/powernow-k7.c | 1 - drivers/cpufreq/powernow-k8.c | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/cpufreq/powernow-k6.c b/drivers/cpufreq/powernow-k6.c index f0a4a6c31204..99d2244e03b0 100644 --- a/drivers/cpufreq/powernow-k6.c +++ b/drivers/cpufreq/powernow-k6.c @@ -253,7 +253,6 @@ static struct cpufreq_driver powernow_k6_driver = { .exit = powernow_k6_cpu_exit, .get = powernow_k6_get, .name = "powernow-k6", - .attr = cpufreq_generic_attr, }; static const struct x86_cpu_id powernow_k6_ids[] = { diff --git a/drivers/cpufreq/powernow-k7.c b/drivers/cpufreq/powernow-k7.c index 4271446c8725..fb2197dc170f 100644 --- a/drivers/cpufreq/powernow-k7.c +++ b/drivers/cpufreq/powernow-k7.c @@ -667,7 +667,6 @@ static struct cpufreq_driver powernow_driver = { .init = powernow_cpu_init, .exit = powernow_cpu_exit, .name = "powernow-k7", - .attr = cpufreq_generic_attr, }; static int __init powernow_init(void) diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c index a01170f7d01c..4e3ba6e68c32 100644 --- a/drivers/cpufreq/powernow-k8.c +++ b/drivers/cpufreq/powernow-k8.c @@ -1143,7 +1143,6 @@ static struct cpufreq_driver cpufreq_amd64_driver = { .exit = powernowk8_cpu_exit, .get = powernowk8_get, .name = "powernow-k8", - .attr = cpufreq_generic_attr, }; static void __request_acpi_cpufreq(void) From 792e6a8ec211690655dedd59ccbf3b24e6505e41 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 023/139] cpufreq: powernv: Stop setting common freq attributes The cpufreq core handles this now, the driver can skip setting it. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/powernv-cpufreq.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index ae79d909943b..0631284c4cfb 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -386,12 +386,8 @@ static ssize_t cpuinfo_nominal_freq_show(struct cpufreq_policy *policy, static struct freq_attr cpufreq_freq_attr_cpuinfo_nominal_freq = __ATTR_RO(cpuinfo_nominal_freq); -#define SCALING_BOOST_FREQS_ATTR_INDEX 2 - static struct freq_attr *powernv_cpu_freq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, &cpufreq_freq_attr_cpuinfo_nominal_freq, - &cpufreq_freq_attr_scaling_boost_freqs, NULL, }; @@ -1129,8 +1125,6 @@ static int __init powernv_cpufreq_init(void) if (powernv_pstate_info.wof_enabled) powernv_cpufreq_driver.boost_enabled = true; - else - powernv_cpu_freq_attr[SCALING_BOOST_FREQS_ATTR_INDEX] = NULL; rc = cpufreq_register_driver(&powernv_cpufreq_driver); if (rc) { From ac0bcf38f336b9bf13b6efc39d2e3195efc4ee37 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 024/139] cpufreq: qcom: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/qcom-cpufreq-hw.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index b2e7e89feaac..7d83d7d2ccc8 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -595,12 +595,6 @@ static void qcom_cpufreq_ready(struct cpufreq_policy *policy) enable_irq(data->throttle_irq); } -static struct freq_attr *qcom_cpufreq_hw_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - &cpufreq_freq_attr_scaling_boost_freqs, - NULL -}; - static struct cpufreq_driver cpufreq_qcom_hw_driver = { .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_HAVE_GOVERNOR_PER_POLICY | @@ -615,7 +609,6 @@ static struct cpufreq_driver cpufreq_qcom_hw_driver = { .register_em = cpufreq_register_em_with_opp, .fast_switch = qcom_cpufreq_hw_fast_switch, .name = "qcom-cpufreq-hw", - .attr = qcom_cpufreq_hw_attr, .ready = qcom_cpufreq_ready, }; From e382146efae229bf96143622bcca689db65ecd41 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 025/139] cpufreq: qoriq: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/qoriq-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/qoriq-cpufreq.c b/drivers/cpufreq/qoriq-cpufreq.c index a37ce051236c..8d1f5ac59132 100644 --- a/drivers/cpufreq/qoriq-cpufreq.c +++ b/drivers/cpufreq/qoriq-cpufreq.c @@ -254,7 +254,6 @@ static struct cpufreq_driver qoriq_cpufreq_driver = { .verify = cpufreq_generic_frequency_table_verify, .target_index = qoriq_cpufreq_target, .get = cpufreq_generic_get, - .attr = cpufreq_generic_attr, }; static const struct of_device_id qoriq_cpufreq_blacklist[] = { From e2079dcc2b63c75cd4a63d0dc6d42105f3e893a3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 026/139] cpufreq: sc520_freq: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/sc520_freq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/sc520_freq.c b/drivers/cpufreq/sc520_freq.c index 330c8d6cf93c..103d2519dff7 100644 --- a/drivers/cpufreq/sc520_freq.c +++ b/drivers/cpufreq/sc520_freq.c @@ -92,7 +92,6 @@ static struct cpufreq_driver sc520_freq_driver = { .target_index = sc520_freq_target, .init = sc520_freq_cpu_init, .name = "sc520_freq", - .attr = cpufreq_generic_attr, }; static const struct x86_cpu_id sc520_ids[] = { From 50b8cd5c91d27167665c51763f539f4291608834 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 027/139] cpufreq: scmi: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Sudeep Holla Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/scmi-cpufreq.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index b8fe758aeb01..4a3ee59cb771 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -171,12 +171,6 @@ scmi_get_rate_limit(u32 domain, bool has_fast_switch) return rate_limit; } -static struct freq_attr *scmi_cpufreq_hw_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, - NULL, -}; - static int scmi_limit_notify_cb(struct notifier_block *nb, unsigned long event, void *data) { struct scmi_data *priv = container_of(nb, struct scmi_data, limit_notify_nb); @@ -309,7 +303,6 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) dev_warn(cpu_dev, "failed to enable boost: %d\n", ret); goto out_free_table; } else { - scmi_cpufreq_hw_attr[1] = &cpufreq_freq_attr_scaling_boost_freqs; scmi_cpufreq_driver.boost_enabled = true; } } @@ -395,7 +388,6 @@ static struct cpufreq_driver scmi_cpufreq_driver = { CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, - .attr = scmi_cpufreq_hw_attr, .target_index = scmi_cpufreq_set_target, .fast_switch = scmi_cpufreq_fast_switch, .get = scmi_cpufreq_get_rate, From ad3f116fe3de520c7ff52ffd36861212a05add10 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 028/139] cpufreq: scpi: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Sudeep Holla Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/scpi-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index 9e09565e41c0..048dc43a9997 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -184,7 +184,6 @@ static struct cpufreq_driver scpi_cpufreq_driver = { CPUFREQ_NEED_INITIAL_FREQ_CHECK | CPUFREQ_IS_COOLING_DEV, .verify = cpufreq_generic_frequency_table_verify, - .attr = cpufreq_generic_attr, .get = scpi_cpufreq_get_rate, .init = scpi_cpufreq_init, .exit = scpi_cpufreq_exit, From 7b748fa7f316b8afe54f5939a1f0d91f6837d44c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 029/139] cpufreq: sh: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/sh-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/sh-cpufreq.c b/drivers/cpufreq/sh-cpufreq.c index aa74036d0420..9c0b01e00508 100644 --- a/drivers/cpufreq/sh-cpufreq.c +++ b/drivers/cpufreq/sh-cpufreq.c @@ -151,7 +151,6 @@ static struct cpufreq_driver sh_cpufreq_driver = { .verify = sh_cpufreq_verify, .init = sh_cpufreq_cpu_init, .exit = sh_cpufreq_cpu_exit, - .attr = cpufreq_generic_attr, }; static int __init sh_cpufreq_module_init(void) From c3245e78b54224b4a42de1b44cc96b47beb53338 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 030/139] cpufreq: spear: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/spear-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c index d8ab5b01d46d..707c71090cc3 100644 --- a/drivers/cpufreq/spear-cpufreq.c +++ b/drivers/cpufreq/spear-cpufreq.c @@ -165,7 +165,6 @@ static struct cpufreq_driver spear_cpufreq_driver = { .target_index = spear_cpufreq_target, .get = cpufreq_generic_get, .init = spear_cpufreq_init, - .attr = cpufreq_generic_attr, }; static int spear_cpufreq_probe(struct platform_device *pdev) From 63c778aa1598c62971e7523c2a540da40cee1d70 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 031/139] cpufreq: speedstep: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/speedstep-centrino.c | 1 - drivers/cpufreq/speedstep-ich.c | 1 - drivers/cpufreq/speedstep-smi.c | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/cpufreq/speedstep-centrino.c b/drivers/cpufreq/speedstep-centrino.c index 3fafedb983b5..3e6e85a92212 100644 --- a/drivers/cpufreq/speedstep-centrino.c +++ b/drivers/cpufreq/speedstep-centrino.c @@ -507,7 +507,6 @@ static struct cpufreq_driver centrino_driver = { .verify = cpufreq_generic_frequency_table_verify, .target_index = centrino_target, .get = get_cur_freq, - .attr = cpufreq_generic_attr, }; /* diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c index f2076d72bf39..262cfbde9ca7 100644 --- a/drivers/cpufreq/speedstep-ich.c +++ b/drivers/cpufreq/speedstep-ich.c @@ -315,7 +315,6 @@ static struct cpufreq_driver speedstep_driver = { .target_index = speedstep_target, .init = speedstep_cpu_init, .get = speedstep_get, - .attr = cpufreq_generic_attr, }; static const struct x86_cpu_id ss_smi_ids[] = { diff --git a/drivers/cpufreq/speedstep-smi.c 
b/drivers/cpufreq/speedstep-smi.c index 0ce9d4b6dfcc..39265884c3f1 100644 --- a/drivers/cpufreq/speedstep-smi.c +++ b/drivers/cpufreq/speedstep-smi.c @@ -295,7 +295,6 @@ static struct cpufreq_driver speedstep_driver = { .init = speedstep_cpu_init, .get = speedstep_get, .resume = speedstep_resume, - .attr = cpufreq_generic_attr, }; static const struct x86_cpu_id ss_smi_ids[] = { From f577fab0cc768a3e5cc3591df6d74a583401d211 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 032/139] cpufreq: tegra: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/tegra186-cpufreq.c | 1 - drivers/cpufreq/tegra194-cpufreq.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index c7761eb99f3c..b54a77be54e6 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -123,7 +123,6 @@ static struct cpufreq_driver tegra186_cpufreq_driver = { .verify = cpufreq_generic_frequency_table_verify, .target_index = tegra186_cpufreq_set_target, .init = tegra186_cpufreq_init, - .attr = cpufreq_generic_attr, }; static struct cpufreq_frequency_table *init_vhint_table( diff --git a/drivers/cpufreq/tegra194-cpufreq.c b/drivers/cpufreq/tegra194-cpufreq.c index 9055dd398e7f..9b4f516f313e 100644 --- a/drivers/cpufreq/tegra194-cpufreq.c +++ b/drivers/cpufreq/tegra194-cpufreq.c @@ -589,7 +589,6 @@ static struct cpufreq_driver tegra194_cpufreq_driver = { .exit = tegra194_cpufreq_exit, .online = tegra194_cpufreq_online, .offline = tegra194_cpufreq_offline, - .attr = cpufreq_generic_attr, }; static struct tegra_cpufreq_ops tegra194_cpufreq_ops = { From 260d6cdc7b69ca0e5080d70d4937427e454183c8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 
Subject: [PATCH 033/139] cpufreq: vexpress: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Sudeep Holla Acked-by: Rafael J. Wysocki --- drivers/cpufreq/vexpress-spc-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/vexpress-spc-cpufreq.c b/drivers/cpufreq/vexpress-spc-cpufreq.c index 0f86cdb7ec8a..65fea47b82e6 100644 --- a/drivers/cpufreq/vexpress-spc-cpufreq.c +++ b/drivers/cpufreq/vexpress-spc-cpufreq.c @@ -471,7 +471,6 @@ static struct cpufreq_driver ve_spc_cpufreq_driver = { .init = ve_spc_cpufreq_init, .exit = ve_spc_cpufreq_exit, .register_em = cpufreq_register_em_with_opp, - .attr = cpufreq_generic_attr, }; #ifdef CONFIG_BL_SWITCHER From 0df09bf56eb2971e12407ba0c08852d5f74e245c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:23:51 +0530 Subject: [PATCH 034/139] cpufreq: virtual: Stop setting cpufreq_driver->attr field The cpufreq core now handles this for basic attributes, including boost frequencies, the driver can skip setting them. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/virtual-cpufreq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c index a050b3a6737f..45becb92aa4a 100644 --- a/drivers/cpufreq/virtual-cpufreq.c +++ b/drivers/cpufreq/virtual-cpufreq.c @@ -265,7 +265,6 @@ static struct cpufreq_driver cpufreq_virt_driver = { .verify = virt_cpufreq_verify_policy, .target = virt_cpufreq_target, .fast_switch = virt_cpufreq_fast_switch, - .attr = cpufreq_generic_attr, }; static int virt_cpufreq_driver_probe(struct platform_device *pdev) From 486729c6012042c486db2a5e4d5dd034fb1d3f3c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 22 Jan 2025 16:40:54 +0530 Subject: [PATCH 035/139] cpufreq: Remove cpufreq_generic_attrs All users of cpufreq_generic_attr are migrated now, remove it. While at it, also stop exporting attributes for available and boost frequencies as they are only used by cpufreq core now. Signed-off-by: Viresh Kumar Acked-by: Rafael J. Wysocki --- drivers/cpufreq/freq_table.c | 8 -------- include/linux/cpufreq.h | 1 - 2 files changed, 9 deletions(-) diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 10e80d912b8d..16e56f2fcee4 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -276,7 +276,6 @@ static ssize_t scaling_available_frequencies_show(struct cpufreq_policy *policy, return show_available_freqs(policy, buf, false); } cpufreq_attr_available_freq(scaling_available); -EXPORT_SYMBOL_GPL(cpufreq_freq_attr_scaling_available_freqs); /* * scaling_boost_frequencies_show - show available boost frequencies for @@ -288,13 +287,6 @@ static ssize_t scaling_boost_frequencies_show(struct cpufreq_policy *policy, return show_available_freqs(policy, buf, true); } cpufreq_attr_available_freq(scaling_boost); -EXPORT_SYMBOL_GPL(cpufreq_freq_attr_scaling_boost_freqs); - -struct freq_attr *cpufreq_generic_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; 
-EXPORT_SYMBOL_GPL(cpufreq_generic_attr); static int set_freq_table_sorted(struct cpufreq_policy *policy) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e46..d237ef91d1f1 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1198,7 +1198,6 @@ void arch_set_freq_scale(const struct cpumask *cpus, /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; -extern struct freq_attr *cpufreq_generic_attr[]; int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy); unsigned int cpufreq_generic_get(unsigned int cpu); From 38bcdb635ac6b837175200b34e5164a256abd27d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 09:36:44 +0530 Subject: [PATCH 036/139] cpufreq: Stop checking for duplicate available/boost freq attributes None of the drivers set these attributes directly now, remove the unnecessary check. Signed-off-by: Viresh Kumar Acked-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 71ecf6a004e8..42b00ca51035 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1077,13 +1077,6 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) /* set up files for this cpu device */ drv_attr = cpufreq_driver->attr; while (drv_attr && *drv_attr) { - /* These are already added, skip them */ - if (*drv_attr == &cpufreq_freq_attr_scaling_available_freqs || - *drv_attr == &cpufreq_freq_attr_scaling_boost_freqs) { - drv_attr++; - continue; - } - ret = sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); if (ret) return ret; From 1f04815057a4c1ca557448b56ad5ab536978540e Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 10:48:34 +0530 Subject: [PATCH 037/139] cpufreq: staticize cpufreq_boost_trigger_state() cpufreq_boost_trigger_state() is only used by cpufreq core, mark it static. 
Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 3 ++- include/linux/cpufreq.h | 5 ----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 42b00ca51035..eca649d062d0 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -88,6 +88,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_governor *new_gov, unsigned int new_pol); static bool cpufreq_boost_supported(void); +static int cpufreq_boost_trigger_state(int state); /* * Two notifier lists: the "policy" list is involved in the @@ -2807,7 +2808,7 @@ static int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) return 0; } -int cpufreq_boost_trigger_state(int state) +static int cpufreq_boost_trigger_state(int state) { struct cpufreq_policy *policy; unsigned long flags; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index d237ef91d1f1..0e708830d30d 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -778,7 +778,6 @@ int cpufreq_frequency_table_get_index(struct cpufreq_policy *policy, ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ -int cpufreq_boost_trigger_state(int state); bool cpufreq_boost_enabled(void); int cpufreq_enable_boost_support(void); bool policy_has_boost_freq(struct cpufreq_policy *policy); @@ -1150,10 +1149,6 @@ static inline int of_perf_domain_get_sharing_cpumask(int pcpu, const char *list_ return 0; } #else -static inline int cpufreq_boost_trigger_state(int state) -{ - return 0; -} static inline bool cpufreq_boost_enabled(void) { return false; From 9a23eb8b2b5d8c5f1129c5a523a786ddd53cd7c9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:09:35 +0530 Subject: [PATCH 038/139] cpufreq: Export cpufreq_boost_set_sw() This will be used directly by cpufreq driver going forward, export it. 
Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 3 ++- include/linux/cpufreq.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index eca649d062d0..8dec9d2a1e6e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2788,7 +2788,7 @@ EXPORT_SYMBOL_GPL(cpufreq_update_limits); /********************************************************************* * BOOST * *********************************************************************/ -static int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) +int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { int ret; @@ -2807,6 +2807,7 @@ static int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) return 0; } +EXPORT_SYMBOL_GPL(cpufreq_boost_set_sw); static int cpufreq_boost_trigger_state(int state) { diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 0e708830d30d..c7d1fe5ebf7a 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -781,6 +781,7 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); bool cpufreq_boost_enabled(void); int cpufreq_enable_boost_support(void); bool policy_has_boost_freq(struct cpufreq_policy *policy); +int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state); /* Find lowest freq at or above target in a table in ascending order */ static inline int cpufreq_table_find_index_al(struct cpufreq_policy *policy, @@ -1164,6 +1165,11 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) return false; } +static inline int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) +{ + return -EOPNOTSUPP; +} + static inline int cpufreq_table_set_inefficient(struct cpufreq_policy *policy, unsigned int frequency) From 1f7d1bab50e6ae517f8b6699e56d709d61ae13e5 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 11:04:25 +0530 Subject: [PATCH 039/139] cpufreq: Introduce 
policy->boost_supported flag It is possible to have a scenario where not all cpufreq policies support boost frequencies. And letting sysfs (or other parts of the kernel) enable boost feature for that policy isn't correct. Add a new flag, boost_supported, which will be set to true by the cpufreq core only if the freq table contains valid boost frequencies. Some cpufreq drivers though don't have boost frequencies in the freq-table, they can set this flag from their ->init() callbacks. Once all the drivers are updated to set the flag correctly, we can check it before enabling boost feature for a policy. Signed-off-by: Viresh Kumar --- drivers/cpufreq/freq_table.c | 4 ++++ include/linux/cpufreq.h | 3 +++ 2 files changed, 7 insertions(+) diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 16e56f2fcee4..185070052b41 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -359,6 +359,10 @@ int cpufreq_table_validate_and_sort(struct cpufreq_policy *policy) if (ret) return ret; + /* Drivers may have set this field already */ + if (policy_has_boost_freq(policy)) + policy->boost_supported = true; + return set_freq_table_sorted(policy); } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index c7d1fe5ebf7a..b017af4398b9 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -144,6 +144,9 @@ struct cpufreq_policy { /* Per policy boost enabled flag. */ bool boost_enabled; + /* Per policy boost supported flag. */ + bool boost_supported; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; unsigned int cached_resolved_idx; From be6b8681a0e4c1477a2c1cb155f7b9188aa88acb Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 15:30:42 +0530 Subject: [PATCH 040/139] cpufreq: acpi: Set policy->boost_supported With a later commit, the cpufreq core will call the ->set_boost() callback only if the policy supports boost frequency.
The boost_supported flag is set by the cpufreq core if policy->freq_table is set and one or more boost frequencies are present. For other drivers, the flag must be set explicitly. Signed-off-by: Viresh Kumar --- drivers/cpufreq/acpi-cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index c598295d1c52..924314cdeebc 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -909,6 +909,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (perf->states[0].core_frequency * 1000 != freq_table[0].frequency) pr_warn(FW_WARN "P-state 0 is not max freq\n"); + if (acpi_cpufreq_driver.set_boost) + policy->boost_supported = true; + return result; err_unreg: From 98f39e93d102af743d173f76ca12fd5fcacbb0ea Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 14:38:03 +0530 Subject: [PATCH 041/139] cpufreq: amd: Set policy->boost_supported With a later commit, the cpufreq core will call the ->set_boost() callback only if the policy supports boost frequency. The boost_supported flag is set by the cpufreq core if policy->freq_table is set and one or more boost frequencies are present. For other drivers, the flag must be set explicitly. The policy->boost_enabled flag is set by the cpufreq core once the policy is initialized, don't set it anymore. 
Signed-off-by: Viresh Kumar --- drivers/cpufreq/amd-pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index a7e70fe0c57d..41d83dd50a71 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1004,7 +1004,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.min_freq = min_freq; policy->cpuinfo.max_freq = max_freq; - policy->boost_enabled = READ_ONCE(cpudata->boost_supported); + policy->boost_supported = READ_ONCE(cpudata->boost_supported); /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; @@ -1497,7 +1497,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->min = policy->cpuinfo.min_freq; policy->max = policy->cpuinfo.max_freq; - policy->boost_enabled = READ_ONCE(cpudata->boost_supported); + policy->boost_supported = READ_ONCE(cpudata->boost_supported); /* * Set the policy to provide a valid fallback value in case From a3f48fb2e5b7db23b4bc5c699baf67c18b50ab4b Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 14:10:25 +0530 Subject: [PATCH 042/139] cpufreq: cppc: Set policy->boost_supported With a later commit, the cpufreq core will call the ->set_boost() callback only if the policy supports boost frequency. The boost_supported flag is set by the cpufreq core if policy->freq_table is set and one or more boost frequencies are present. For other drivers, the flag must be set explicitly. With this, the local variable boost_supported isn't required anymore. 
Signed-off-by: Viresh Kumar --- drivers/cpufreq/cppc_cpufreq.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 8f512448382f..b3d74f9adcf0 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -34,8 +34,6 @@ */ static LIST_HEAD(cpu_data_list); -static bool boost_supported; - static struct cpufreq_driver cppc_cpufreq_driver; #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE @@ -653,7 +651,7 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) * is supported. */ if (caps->highest_perf > caps->nominal_perf) - boost_supported = true; + policy->boost_supported = true; /* Set policy->cur to max now. The governors will adjust later. */ policy->cur = cppc_perf_to_khz(caps, caps->highest_perf); @@ -791,11 +789,6 @@ static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) struct cppc_perf_caps *caps = &cpu_data->perf_caps; int ret; - if (!boost_supported) { - pr_err("BOOST not supported by CPU or firmware\n"); - return -EINVAL; - } - if (state) policy->max = cppc_perf_to_khz(caps, caps->highest_perf); else From 691b321278124e7cab6855dd2992e067013b4198 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 11:04:25 +0530 Subject: [PATCH 043/139] cpufreq: Restrict enabling boost on policies with no boost frequencies It is possible to have a scenario where not all cpufreq policies support boost frequencies. And letting sysfs (or other parts of the kernel) enable boost feature for that policy isn't correct. Now that all drivers (that required a change) are updated to set the policy->boost_supported properly, check this flag before enabling boost feature for a policy. 
Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 8dec9d2a1e6e..08deef6884d6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -632,6 +632,9 @@ static ssize_t store_local_boost(struct cpufreq_policy *policy, if (!cpufreq_driver->boost_enabled) return -EINVAL; + if (!policy->boost_supported) + return -EINVAL; + if (policy->boost_enabled == enable) return count; @@ -1587,7 +1590,7 @@ static int cpufreq_online(unsigned int cpu) policy->cdev = of_cpufreq_cooling_register(policy); /* Let the per-policy boost flag mirror the cpufreq_driver boost during init */ - if (cpufreq_driver->set_boost && + if (cpufreq_driver->set_boost && policy->boost_supported && policy->boost_enabled != cpufreq_boost_enabled()) { policy->boost_enabled = cpufreq_boost_enabled(); ret = cpufreq_driver->set_boost(policy, policy->boost_enabled); @@ -2824,6 +2827,9 @@ static int cpufreq_boost_trigger_state(int state) cpus_read_lock(); for_each_active_policy(policy) { + if (!policy->boost_supported) + continue; + policy->boost_enabled = state; ret = cpufreq_driver->set_boost(policy, state); if (ret) { From ddef17bb869858eb610f34b681fccde9fbbe4539 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:14:20 +0530 Subject: [PATCH 044/139] cpufreq: apple: Set .set_boost directly The boost feature can be controlled at two levels currently, driver level (applies to all policies) and per-policy. Currently the driver enables driver level boost support from the per-policy ->init() callback, which isn't really efficient as that gets called for each policy and then there is online/offline path too where this gets done unnecessarily. Instead set the .set_boost field directly and always enable the boost support. If a policy doesn't support boost feature, the core will not enable it for that policy. 
Keep the initial state of driver level boost to disabled and let the user enable it if required as ideally the boost frequencies must be used only when really required. Signed-off-by: Viresh Kumar --- drivers/cpufreq/apple-soc-cpufreq.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/cpufreq/apple-soc-cpufreq.c b/drivers/cpufreq/apple-soc-cpufreq.c index 6ff604a0fe79..4994c86feb57 100644 --- a/drivers/cpufreq/apple-soc-cpufreq.c +++ b/drivers/cpufreq/apple-soc-cpufreq.c @@ -310,15 +310,6 @@ static int apple_soc_cpufreq_init(struct cpufreq_policy *policy) policy->fast_switch_possible = true; policy->suspend_freq = freq_table[0].frequency; - if (policy_has_boost_freq(policy)) { - ret = cpufreq_enable_boost_support(); - if (ret) { - dev_warn(cpu_dev, "failed to enable boost: %d\n", ret); - } else { - apple_soc_cpufreq_driver.boost_enabled = true; - } - } - return 0; out_free_cpufreq_table: @@ -353,6 +344,7 @@ static struct cpufreq_driver apple_soc_cpufreq_driver = { .target_index = apple_soc_cpufreq_set_target, .fast_switch = apple_soc_cpufreq_fast_switch, .register_em = cpufreq_register_em_with_opp, + .set_boost = cpufreq_boost_set_sw, .suspend = cpufreq_generic_suspend, }; From 13e92357b6e875c4ad1a4f5edeb460717380e76a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:14:20 +0530 Subject: [PATCH 045/139] cpufreq: loongson: Set .set_boost directly The boost feature can be controlled at two levels currently, driver level (applies to all policies) and per-policy. Currently the driver enables driver level boost support from the per-policy ->init() callback, which isn't really efficient as that gets called for each policy and then there is online/offline path too where this gets done unnecessarily. Instead set the .set_boost field directly and always enable the boost support. If a policy doesn't support boost feature, the core will not enable it for that policy. 
Keep the initial state of driver level boost to disabled and let the user enable it if required as ideally the boost frequencies must be used only when really required. Signed-off-by: Viresh Kumar --- drivers/cpufreq/loongson3_cpufreq.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/cpufreq/loongson3_cpufreq.c b/drivers/cpufreq/loongson3_cpufreq.c index ea516b939c44..1e8715ea1b77 100644 --- a/drivers/cpufreq/loongson3_cpufreq.c +++ b/drivers/cpufreq/loongson3_cpufreq.c @@ -299,15 +299,6 @@ static int loongson3_cpufreq_cpu_init(struct cpufreq_policy *policy) per_cpu(freq_data, i) = per_cpu(freq_data, cpu); } - if (policy_has_boost_freq(policy)) { - ret = cpufreq_enable_boost_support(); - if (ret < 0) { - pr_warn("cpufreq: Failed to enable boost: %d\n", ret); - return ret; - } - loongson3_cpufreq_driver.boost_enabled = true; - } - return 0; } @@ -338,6 +329,7 @@ static struct cpufreq_driver loongson3_cpufreq_driver = { .get = loongson3_cpufreq_get, .target_index = loongson3_cpufreq_target, .verify = cpufreq_generic_frequency_table_verify, + .set_boost = cpufreq_boost_set_sw, .suspend = cpufreq_generic_suspend, }; From 3fd920377884a0ed5b15da0c1f34a90b280b3b3d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:14:20 +0530 Subject: [PATCH 046/139] cpufreq: powernv: Set .set_boost directly The boost feature can be controlled at two levels currently, driver level (applies to all policies) and per-policy. Currently the driver enables driver level boost support from the per-policy ->init() callback, which isn't really efficient as that gets called for each policy and then there is online/offline path too where this gets done unnecessarily. Instead set the .set_boost field directly and always enable the boost support. If a policy doesn't support boost feature, the core will not enable it for that policy. 
Keep the initial state of driver level boost to disabled and let the user enable it if required as ideally the boost frequencies must be used only when really required. Signed-off-by: Viresh Kumar --- drivers/cpufreq/powernv-cpufreq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 0631284c4cfb..6094c530bf57 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -1124,7 +1124,7 @@ static int __init powernv_cpufreq_init(void) goto out; if (powernv_pstate_info.wof_enabled) - powernv_cpufreq_driver.boost_enabled = true; + powernv_cpufreq_driver.set_boost = cpufreq_boost_set_sw; rc = cpufreq_register_driver(&powernv_cpufreq_driver); if (rc) { @@ -1132,9 +1132,6 @@ static int __init powernv_cpufreq_init(void) goto cleanup; } - if (powernv_pstate_info.wof_enabled) - cpufreq_enable_boost_support(); - register_reboot_notifier(&powernv_cpufreq_reboot_nb); opal_message_notifier_register(OPAL_MSG_OCC, &powernv_cpufreq_opal_nb); From 11847a5c1265d08097128f4f7692cf6400e97805 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:14:20 +0530 Subject: [PATCH 047/139] cpufreq: scmi: Set .set_boost directly The boost feature can be controlled at two levels currently, driver level (applies to all policies) and per-policy. Currently the driver enables driver level boost support from the per-policy ->init() callback, which isn't really efficient as that gets called for each policy and then there is online/offline path too where this gets done unnecessarily. Instead set the .set_boost field directly and always enable the boost support. If a policy doesn't support boost feature, the core will not enable it for that policy. Keep the initial state of driver level boost to disabled and let the user enable it if required as ideally the boost frequencies must be used only when really required. 
Signed-off-by: Viresh Kumar Acked-by: Sudeep Holla --- drivers/cpufreq/scmi-cpufreq.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 4a3ee59cb771..ff2897789797 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -297,16 +297,6 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) policy->transition_delay_us = scmi_get_rate_limit(domain, policy->fast_switch_possible); - if (policy_has_boost_freq(policy)) { - ret = cpufreq_enable_boost_support(); - if (ret) { - dev_warn(cpu_dev, "failed to enable boost: %d\n", ret); - goto out_free_table; - } else { - scmi_cpufreq_driver.boost_enabled = true; - } - } - ret = freq_qos_add_request(&policy->constraints, &priv->limits_freq_req, FREQ_QOS_MAX, FREQ_QOS_MAX_DEFAULT_VALUE); if (ret < 0) { @@ -394,6 +384,7 @@ static struct cpufreq_driver scmi_cpufreq_driver = { .init = scmi_cpufreq_init, .exit = scmi_cpufreq_exit, .register_em = scmi_cpufreq_register_em, + .set_boost = cpufreq_boost_set_sw, }; static int scmi_cpufreq_probe(struct scmi_device *sdev) From 707e222314ff9468c4a507ab38c02d190bd5fbac Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:14:20 +0530 Subject: [PATCH 048/139] cpufreq: dt: Set .set_boost directly The boost feature can be controlled at two levels currently, driver level (applies to all policies) and per-policy. Currently the driver enables driver level boost support from the per-policy ->init() callback, which isn't really efficient as that gets called for each policy and then there is online/offline path too where this gets done unnecessarily. Instead set the .set_boost field directly and always enable the boost support. If a policy doesn't support boost feature, the core will not enable it for that policy. 
Keep the initial state of driver level boost to disabled and let the user enable it if required as ideally the boost frequencies must be used only when really required. Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 072ccf0c2e41..778916f89a51 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -114,20 +114,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = transition_latency; policy->dvfs_possible_from_any_cpu = true; - /* Support turbo/boost mode */ - if (policy_has_boost_freq(policy)) { - /* This gets disabled by core on driver unregister */ - ret = cpufreq_enable_boost_support(); - if (ret) - goto out_clk_put; - } - return 0; - -out_clk_put: - clk_put(cpu_clk); - - return ret; } static int cpufreq_online(struct cpufreq_policy *policy) @@ -162,6 +149,7 @@ static struct cpufreq_driver dt_cpufreq_driver = { .offline = cpufreq_offline, .register_em = cpufreq_register_em_with_opp, .name = "cpufreq-dt", + .set_boost = cpufreq_boost_set_sw, .suspend = cpufreq_generic_suspend, }; From e8b08af135b7686640ebb2dc1eaa060e42a41af6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 13:14:20 +0530 Subject: [PATCH 049/139] cpufreq: qcom: Set .set_boost directly The boost feature can be controlled at two levels currently, driver level (applies to all policies) and per-policy. Currently the driver enables driver level boost support from the per-policy ->init() callback, which isn't really efficient as that gets called for each policy and then there is online/offline path too where this gets done unnecessarily. Instead set the .set_boost field directly and always enable the boost support. If a policy doesn't support boost feature, the core will not enable it for that policy. 
Keep the initial state of driver level boost to disabled and let the user enable it if required as ideally the boost frequencies must be used only when really required. Signed-off-by: Viresh Kumar --- drivers/cpufreq/qcom-cpufreq-hw.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 7d83d7d2ccc8..4b3b3dbc7d38 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -566,12 +566,6 @@ static int qcom_cpufreq_hw_cpu_init(struct cpufreq_policy *policy) return -ENODEV; } - if (policy_has_boost_freq(policy)) { - ret = cpufreq_enable_boost_support(); - if (ret) - dev_warn(cpu_dev, "failed to enable boost: %d\n", ret); - } - return qcom_cpufreq_hw_lmh_init(policy, index); } @@ -610,6 +604,7 @@ static struct cpufreq_driver cpufreq_qcom_hw_driver = { .fast_switch = qcom_cpufreq_hw_fast_switch, .name = "qcom-cpufreq-hw", .ready = qcom_cpufreq_ready, + .set_boost = cpufreq_boost_set_sw, }; static unsigned long qcom_cpufreq_hw_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) From c952775a3d72b0505c2f8f4171929c293479f0fd Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 14:27:11 +0530 Subject: [PATCH 050/139] cpufreq: staticize policy_has_boost_freq() policy_has_boost_freq() isn't used outside of freq_table.c now, mark it static. 
Signed-off-by: Viresh Kumar --- drivers/cpufreq/freq_table.c | 3 +-- include/linux/cpufreq.h | 6 ------ 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 185070052b41..c03a91502f84 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -14,7 +14,7 @@ * FREQUENCY TABLE HELPERS * *********************************************************************/ -bool policy_has_boost_freq(struct cpufreq_policy *policy) +static bool policy_has_boost_freq(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *pos, *table = policy->freq_table; @@ -27,7 +27,6 @@ bool policy_has_boost_freq(struct cpufreq_policy *policy) return false; } -EXPORT_SYMBOL_GPL(policy_has_boost_freq); int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, struct cpufreq_frequency_table *table) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b017af4398b9..466d186166da 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -783,7 +783,6 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ bool cpufreq_boost_enabled(void); int cpufreq_enable_boost_support(void); -bool policy_has_boost_freq(struct cpufreq_policy *policy); int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state); /* Find lowest freq at or above target in a table in ascending order */ @@ -1163,11 +1162,6 @@ static inline int cpufreq_enable_boost_support(void) return -EINVAL; } -static inline bool policy_has_boost_freq(struct cpufreq_policy *policy) -{ - return false; -} - static inline int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { return -EOPNOTSUPP; From 0322f3e89b4eb32702321687a7636ee5290fe255 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 23 Jan 2025 14:26:20 +0530 Subject: [PATCH 051/139] cpufreq: Remove cpufreq_enable_boost_support() Remove the now unused helper, cpufreq_enable_boost_support(). 
Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 15 --------------- include/linux/cpufreq.h | 6 ------ 2 files changed, 21 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 08deef6884d6..70237d78d71f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2877,21 +2877,6 @@ static void remove_boost_sysfs_file(void) sysfs_remove_file(cpufreq_global_kobject, &boost.attr); } -int cpufreq_enable_boost_support(void) -{ - if (!cpufreq_driver) - return -EINVAL; - - if (cpufreq_boost_supported()) - return 0; - - cpufreq_driver->set_boost = cpufreq_boost_set_sw; - - /* This will get removed on driver unregister */ - return create_boost_sysfs_file(); -} -EXPORT_SYMBOL_GPL(cpufreq_enable_boost_support); - bool cpufreq_boost_enabled(void) { return cpufreq_driver->boost_enabled; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 466d186166da..cefd853abfd1 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -782,7 +782,6 @@ ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf); #ifdef CONFIG_CPU_FREQ bool cpufreq_boost_enabled(void); -int cpufreq_enable_boost_support(void); int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state); /* Find lowest freq at or above target in a table in ascending order */ @@ -1157,11 +1156,6 @@ static inline bool cpufreq_boost_enabled(void) return false; } -static inline int cpufreq_enable_boost_support(void) -{ - return -EINVAL; -} - static inline int cpufreq_boost_set_sw(struct cpufreq_policy *policy, int state) { return -EOPNOTSUPP; From d42044aad6528e0c9533dbaf836d1b0fbb19fe2d Mon Sep 17 00:00:00 2001 From: David Reaver Date: Sun, 12 Jan 2025 07:26:55 -0800 Subject: [PATCH 052/139] PM: hibernate: Replace deprecated kmap_atomic() with kmap_local_page() kmap_atomic() is deprecated and should be replaced with kmap_local_page() [1][2]. 
kmap_local_page() is faster in kernels with HIGHMEM enabled, can take page faults, and allows preemption. According to [2], this replacement is safe as long as the code between kmap_atomic() and kunmap_atomic() does not implicitly depend on disabling page faults or preemption. In all of the call sites in this patch, the only thing happening between mapping and unmapping pages is copy_page() calls, and I don't suspect they depend on disabling page faults or preemption. Link: https://lwn.net/Articles/836144/ [1] Link: https://docs.kernel.org/mm/highmem.html#temporary-virtual-mappings [2] Signed-off-by: David Reaver Link: https://patch.msgid.link/20250112152658.20132-1-me@davidreaver.com Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c9fb559a6399..4e6e24e8b854 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -2270,9 +2270,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr); + kunmap_local(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2561,9 +2561,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page); + dst = kmap_local_page(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst); + kunmap_local(dst); last_highmem_page = NULL; } } @@ -2881,13 +2881,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2, { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1); - kaddr2 = kmap_atomic(p2); + kaddr1 = kmap_local_page(p1); + kaddr2 = kmap_local_page(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2); - kunmap_atomic(kaddr1); + kunmap_local(kaddr2); + kunmap_local(kaddr1); } /** From 
5fad775d432c6c9158ea12e7e00d8922ef8d3dfc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 Jan 2025 14:37:29 +0100 Subject: [PATCH 053/139] PM: EM: Drop unused parameter from em_adjust_new_capacity() The max_cap parameter is never used in em_adjust_new_capacity(), so drop it. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/2369979.ElGaqSPkdT@rjwysocki.net --- kernel/power/energy_model.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 3874f0e97651..c79bf3c8b0f1 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -728,8 +728,7 @@ static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, * are correctly calculated. */ static void em_adjust_new_capacity(struct device *dev, - struct em_perf_domain *pd, - u64 max_cap) + struct em_perf_domain *pd) { struct em_perf_table __rcu *em_table; @@ -800,7 +799,7 @@ static void em_check_capacity_update(void) cpu, cpu_capacity, em_max_perf); dev = get_cpu_device(cpu); - em_adjust_new_capacity(dev, pd, cpu_capacity); + em_adjust_new_capacity(dev, pd); } free_cpumask_var(cpu_done_mask); From a8e62726ac0dd7b610c87ba1a938a5a9091c34df Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 Jan 2025 14:38:25 +0100 Subject: [PATCH 054/139] PM: EM: Slightly reduce em_check_capacity_update() overhead Every iteration of the loop over all possible CPUs in em_check_capacity_update() causes get_cpu_device() to be called twice for the same CPU, once indirectly via em_cpu_get() and once directly. Get rid of the indirect get_cpu_device() call by moving the direct invocation of it earlier and using em_pd_get() instead of em_cpu_get() to get a pd pointer for the dev one returned by it. This also exposes the fact that dev is needed to get a pd, so the code becomes somewhat easier to follow after it. No functional impact. 
Signed-off-by: Rafael J. Wysocki Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/1925950.tdWV9SEqCh@rjwysocki.net --- kernel/power/energy_model.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index c79bf3c8b0f1..066bcf1c71a1 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -774,7 +774,8 @@ static void em_check_capacity_update(void) } cpufreq_cpu_put(policy); - pd = em_cpu_get(cpu); + dev = get_cpu_device(cpu); + pd = em_pd_get(dev); if (!pd || em_is_artificial(pd)) continue; @@ -798,7 +799,6 @@ static void em_check_capacity_update(void) pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n", cpu, cpu_capacity, em_max_perf); - dev = get_cpu_device(cpu); em_adjust_new_capacity(dev, pd); } From 7802fce7dc18394d041a1310fe4ad76120e08145 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 27 Jan 2025 14:07:12 +0100 Subject: [PATCH 055/139] cpufreq: intel_pstate: Make it possible to avoid enabling CAS Capacity-aware scheduling (CAS) is enabled by default by intel_pstate on hybrid systems without SMT, but in some usage scenarios it may be more attractive to place tasks for maximum CPU performance regardless of the extra cost in terms of energy, which is the case on such systems when CAS is not enabled, so introduce a command line option to forbid intel_pstate to enable CAS. Signed-off-by: Rafael J. 
Wysocki Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/2781262.mvXUDI8C0e@rjwysocki.net --- Documentation/admin-guide/kernel-parameters.txt | 3 +++ Documentation/admin-guide/pm/intel_pstate.rst | 3 +++ drivers/cpufreq/intel_pstate.c | 9 +++++++++ 3 files changed, 15 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index fb8752b42ec8..77e671e55993 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2316,6 +2316,9 @@ per_cpu_perf_limits Allow per-logical-CPU P-State performance control limits using cpufreq sysfs interface + no_cas + Do not enable capacity-aware scheduling (CAS) on + hybrid systems intremap= [X86-64,Intel-IOMMU,EARLY] on enable Interrupt Remapping (default) diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index bf13ad25a32f..78fc83ed2a7e 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -696,6 +696,9 @@ of them have to be prepended with the ``intel_pstate=`` prefix. Use per-logical-CPU P-State limits (see `Coordination of P-state Limits`_ for details). +``no_cas`` + Do not enable capacity-aware scheduling (CAS) which is enabled by + default on hybrid systems.
Diagnostics and Tuning ====================== diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 9c4cc01fd51a..bc31e9b9b660 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -936,6 +936,8 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { NULL, }; +static bool no_cas __ro_after_init; + static struct cpudata *hybrid_max_perf_cpu __read_mostly; /* * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata, @@ -1041,6 +1043,10 @@ static void hybrid_refresh_cpu_capacity_scaling(void) static void hybrid_init_cpu_capacity_scaling(bool refresh) { + /* Bail out if enabling capacity-aware scheduling is prohibited. */ + if (no_cas) + return; + /* * If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity * scaling has been enabled already and the driver is just changing the @@ -3835,6 +3841,9 @@ static int __init intel_pstate_setup(char *str) if (!strcmp(str, "no_hwp")) no_hwp = 1; + if (!strcmp(str, "no_cas")) + no_cas = true; + if (!strcmp(str, "force")) force_load = 1; if (!strcmp(str, "hwp_only")) From 258e231dc29fbd72bc82c16859a8304f71780ba2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 17 Feb 2025 21:03:01 +0100 Subject: [PATCH 056/139] PM: Rearrange documentation related to __pm_runtime_disable() There are only two callers of __pm_runtime_disable(), one of which is device_suspend_late() and the other is pm_runtime_disable() that has its own kerneldoc comment and there are no plans to add any more of them. Since they use different values of the __pm_runtime_disable() second parameter, the actual code behavior is different in each case, but it is all documented in the __pm_runtime_disable() kerneldoc comment which is not particularly straightforward. 
For this reason, move the information from the __pm_runtime_disable() kerneldoc comment to the pm_runtime_disable() one and into a separate comment in device_suspend_late() and remove the __pm_runtime_disable() kerneldoc comment altogether. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/12617588.O9o76ZdvQC@rjwysocki.net --- drivers/base/power/main.c | 4 ++++ drivers/base/power/runtime.c | 14 -------------- include/linux/pm_runtime.h | 15 +++++++++++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 40e1d8d8a589..dffa2aa1ba7d 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1404,6 +1404,10 @@ static int device_suspend_late(struct device *dev, pm_message_t state, bool asyn TRACE_DEVICE(dev); TRACE_SUSPEND(0); + /* + * Disable runtime PM for the device without checking if there is a + * pending resume request for it. + */ __pm_runtime_disable(dev, false); dpm_wait_for_subordinate(dev, async); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 2ee45841486b..a5aed89e1a6b 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1460,20 +1460,6 @@ int pm_runtime_barrier(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_barrier); -/** - * __pm_runtime_disable - Disable runtime PM of a device. - * @dev: Device to handle. - * @check_resume: If set, check if there's a resume request for the device. - * - * Increment power.disable_depth for the device and if it was zero previously, - * cancel all pending runtime PM requests for the device and wait for all - * operations in progress to complete. The device can be either active or - * suspended after its runtime PM has been disabled. 
- * - * If @check_resume is set and there's a resume request pending when - * __pm_runtime_disable() is called and power.disable_depth is zero, the - * function will wake up the device before disabling its runtime PM. - */ void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index d39dc863f612..72c62e1171ca 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -556,11 +556,18 @@ static inline int pm_runtime_set_suspended(struct device *dev) * pm_runtime_disable - Disable runtime PM for a device. * @dev: Target device. * - * Prevent the runtime PM framework from working with @dev (by incrementing its - * "blocking" counter). + * Prevent the runtime PM framework from working with @dev by incrementing its + * "disable" counter. * - * For each invocation of this function for @dev there must be a matching - * pm_runtime_enable() call in order for runtime PM to be enabled for it. + * If the counter is zero when this function runs and there is a pending runtime + * resume request for @dev, it will be resumed. If the counter is still zero at + * that point, all of the pending runtime PM requests for @dev will be canceled + * and all runtime PM operations in progress involving it will be waited for to + * complete. + * + * For each invocation of this function for @dev, there must be a matching + * pm_runtime_enable() call, so that runtime PM is eventually enabled for it + * again. */ static inline void pm_runtime_disable(struct device *dev) { From 3e5eee147b7b0f5a93f56beffe34e81fdd00fa0d Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 18 Feb 2025 21:11:42 +0100 Subject: [PATCH 057/139] PM: Block enabling of runtime PM during system suspend If device_prepare() runs on a device that has never had runtime PM enabled so far, it may reasonably assume that runtime PM will not be enabled for that device during the system suspend-resume cycle currently in progress, but this has never been guaranteed. To verify this assumption, make device_prepare() arrange for triggering a device warning accompanied by a call trace dump if runtime PM is enabled for such a device after it has returned. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/6131109.lOV4Wx5bFT@rjwysocki.net --- drivers/base/power/main.c | 9 +++++++++ drivers/base/power/runtime.c | 24 ++++++++++++++++++++++++ include/linux/pm.h | 1 + include/linux/pm_runtime.h | 4 ++++ 4 files changed, 38 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index dffa2aa1ba7d..acabd9f3e60f 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1109,6 +1109,8 @@ static void device_complete(struct device *dev, pm_message_t state) device_unlock(dev); out: + /* If enabling runtime PM for the device is blocked, unblock it. */ + pm_runtime_unblock(dev); pm_runtime_put(dev); } @@ -1815,6 +1817,13 @@ static int device_prepare(struct device *dev, pm_message_t state) * it again during the complete phase. */ pm_runtime_get_noresume(dev); + /* + * If runtime PM is disabled for the device at this point and it has + * never been enabled so far, it should not be enabled until this system + * suspend-resume cycle is complete, so prepare to trigger a warning on + * subsequent attempts to enable it. 
+ */ + pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index a5aed89e1a6b..797ea38ceba7 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1460,6 +1460,26 @@ int pm_runtime_barrier(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_barrier); +void pm_runtime_block_if_disabled(struct device *dev) +{ + spin_lock_irq(&dev->power.lock); + + if (dev->power.disable_depth && dev->power.last_status == RPM_INVALID) + dev->power.last_status = RPM_BLOCKED; + + spin_unlock_irq(&dev->power.lock); +} + +void pm_runtime_unblock(struct device *dev) +{ + spin_lock_irq(&dev->power.lock); + + if (dev->power.last_status == RPM_BLOCKED) + dev->power.last_status = RPM_INVALID; + + spin_unlock_irq(&dev->power.lock); +} + void __pm_runtime_disable(struct device *dev, bool check_resume) { spin_lock_irq(&dev->power.lock); @@ -1518,6 +1538,10 @@ void pm_runtime_enable(struct device *dev) if (--dev->power.disable_depth > 0) goto out; + if (dev->power.last_status == RPM_BLOCKED) { + dev_warn(dev, "Attempt to enable runtime PM when it is blocked\n"); + dump_stack(); + } dev->power.last_status = RPM_INVALID; dev->power.accounting_timestamp = ktime_get_mono_fast_ns(); diff --git a/include/linux/pm.h b/include/linux/pm.h index 78855d794342..6ca6f34c58c3 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -597,6 +597,7 @@ enum rpm_status { RPM_RESUMING, RPM_SUSPENDED, RPM_SUSPENDING, + RPM_BLOCKED, }; /* diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 72c62e1171ca..10769119867b 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -77,6 +77,8 @@ extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); +extern void 
pm_runtime_block_if_disabled(struct device *dev); +extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); extern void pm_runtime_allow(struct device *dev); @@ -271,6 +273,8 @@ static inline int pm_runtime_get_if_active(struct device *dev) static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } +static inline void pm_runtime_block_if_disabled(struct device *dev) {} +static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} static inline void pm_runtime_allow(struct device *dev) {} From 758cc55ce3d5d79e8f98adbd03ad2cd29133af33 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Feb 2025 21:13:09 +0100 Subject: [PATCH 058/139] PM: runtime: Introduce pm_runtime_blocked() Introduce a new helper function called pm_runtime_blocked() for checking the power.last_status value indicating whether or not enabling runtime PM for the given device has been blocked (which happens in the "prepare" phase of system-wide suspend if runtime PM is disabled for the given device at that point). Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/4632087.LvFx2qVVIh@rjwysocki.net --- drivers/base/power/runtime.c | 17 +++++++++++++++++ include/linux/pm_runtime.h | 2 ++ 2 files changed, 19 insertions(+) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 797ea38ceba7..c0f5a9f89299 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1555,6 +1555,23 @@ void pm_runtime_enable(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_enable); +bool pm_runtime_blocked(struct device *dev) +{ + bool ret; + + /* + * dev->power.last_status is a bit field, so in case it is updated via + * RMW, read it under the spin lock. + */ + spin_lock_irq(&dev->power.lock); + + ret = dev->power.last_status == RPM_BLOCKED; + + spin_unlock_irq(&dev->power.lock); + + return ret; +} + static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 10769119867b..aea0395c10a1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -81,6 +81,7 @@ extern void pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); +extern bool pm_runtime_blocked(struct device *dev); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); @@ -277,6 +278,7 @@ static inline void pm_runtime_block_if_disabled(struct device *dev) {} static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} +static inline bool pm_runtime_blocked(struct device *dev) { return true; } static inline void pm_runtime_allow(struct device *dev) {} static 
inline void pm_runtime_forbid(struct device *dev) {} From f2d32942026c05acc49d5f445dd38931419967aa Mon Sep 17 00:00:00 2001 From: Benjamin Schneider Date: Mon, 25 Nov 2024 13:14:52 -0800 Subject: [PATCH 059/139] cpufreq: enable 1200Mhz clock speed for armada-37xx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This frequency was disabled because of stability problems whose source could not be accurately identified[1]. After seven months of testing, the evidence points to an incorrectly configured bootloader as the source of the historical instability. Testing was performed on two A3720 devices with this frequency enabled and the ondemand policy in use. Marvell merged[2] changes to their bootloader source needed to address the stability issue. This driver should expose this frequency option to users. [1] https://github.com/torvalds/linux/commit/484f2b7c61b9ae58cc00c5127bcbcd9177af8dfe [2] https://github.com/MarvellEmbeddedProcessors/mv-ddr-marvell/pull/44 Signed-off-by: Benjamin Schneider Reviewed-by: Pali Rohár Reviewed-by: Andrew Lunn Acked-by: Gregory CLEMENT Signed-off-by: Viresh Kumar --- drivers/cpufreq/armada-37xx-cpufreq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c index bea41ccabf1f..f28a4435fba7 100644 --- a/drivers/cpufreq/armada-37xx-cpufreq.c +++ b/drivers/cpufreq/armada-37xx-cpufreq.c @@ -102,11 +102,7 @@ struct armada_37xx_dvfs { }; static struct armada_37xx_dvfs armada_37xx_dvfs[] = { - /* - * The cpufreq scaling for 1.2 GHz variant of the SOC is currently - * unstable because we do not know how to configure it properly. 
- */ - /* {.cpu_freq_max = 1200*1000*1000, .divider = {1, 2, 4, 6} }, */ + {.cpu_freq_max = 1200*1000*1000, .divider = {1, 2, 4, 6} }, {.cpu_freq_max = 1000*1000*1000, .divider = {1, 2, 4, 5} }, {.cpu_freq_max = 800*1000*1000, .divider = {1, 2, 3, 4} }, {.cpu_freq_max = 600*1000*1000, .divider = {2, 4, 5, 6} }, From c93d13b661a6ce34b9cd8241f5e410658078d7b1 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 10 Feb 2025 09:12:53 +0200 Subject: [PATCH 060/139] intel_idle: clean up BYT/CHT auto demotion disable Bay Trail (BYT) and Cherry Trail (CHT) platforms have a very specific way of disabling auto-demotion via specific MSR bits. Clean up the code so that BYT/CHT-specifics do not show up in the common 'struct idle_cpu' data structure. Remove the 'byt_auto_demotion_disable_flag' flag from 'struct idle_cpu', because a better coding pattern is to avoid very case-specific fields like 'bool byt_auto_demotion_disable_flag' in a common data structure, which is used for all platforms, not only BYT/CHT. The code is just more readable when common data structures contain only commonly used fields. Instead, match BYT/CHT in the 'intel_idle_init_cstates_icpu()' function, and introduce a small helper to take care of BYT/CHT auto-demotion. This is consistent with how platform-specific things are done for other platforms. No intended functional changes, compile-tested only. Signed-off-by: Artem Bityutskiy Link: https://patch.msgid.link/20250210071253.2991030-1-dedekind1@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 118fe1d37c22..324814dc34fa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -89,7 +89,6 @@ struct idle_cpu { * Indicate which enable bits to clear here. 
*/ unsigned long auto_demotion_disable_flags; - bool byt_auto_demotion_disable_flag; bool disable_promotion_to_c1e; bool use_acpi; }; @@ -1463,13 +1462,11 @@ static const struct idle_cpu idle_cpu_snx __initconst = { static const struct idle_cpu idle_cpu_byt __initconst = { .state_table = byt_cstates, .disable_promotion_to_c1e = true, - .byt_auto_demotion_disable_flag = true, }; static const struct idle_cpu idle_cpu_cht __initconst = { .state_table = cht_cstates, .disable_promotion_to_c1e = true, - .byt_auto_demotion_disable_flag = true, }; static const struct idle_cpu idle_cpu_ivb __initconst = { @@ -2055,6 +2052,15 @@ static void __init spr_idle_state_table_update(void) } } +/** + * byt_cht_auto_demotion_disable - Disable Bay/Cherry Trail auto-demotion. + */ +static void __init byt_cht_auto_demotion_disable(void) +{ + wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0); + wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0); +} + static bool __init intel_idle_verify_cstate(unsigned int mwait_hint) { unsigned int mwait_cstate = (MWAIT_HINT2CSTATE(mwait_hint) + 1) & @@ -2136,6 +2142,10 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) case INTEL_ATOM_GRACEMONT: adl_idle_state_table_update(); break; + case INTEL_ATOM_SILVERMONT: + case INTEL_ATOM_AIRMONT: + byt_cht_auto_demotion_disable(); + break; } for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) { @@ -2178,11 +2188,6 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv) drv->state_count++; } - - if (icpu->byt_auto_demotion_disable_flag) { - wrmsrl(MSR_CC6_DEMOTION_POLICY_CONFIG, 0); - wrmsrl(MSR_MC6_DEMOTION_POLICY_CONFIG, 0); - } } /** From bca84a7b93fdc744d79d94423c2cb905b1832310 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 18 Feb 2025 21:16:48 +0100 Subject: [PATCH 061/139] PM: sleep: Use DPM_FLAG_SMART_SUSPEND conditionally A recent discussion has revealed that using DPM_FLAG_SMART_SUSPEND unconditionally is generally problematic because it may lead to situations in which the device's runtime PM information is internally inconsistent or does not reflect its real state [1]. For this reason, change the handling of DPM_FLAG_SMART_SUSPEND so that it is only taken into account if it is consistently set by the drivers of all devices having any PM callbacks throughout dependency graphs in accordance with the following rules: - The "smart suspend" feature is only enabled for devices whose drivers ask for it (that is, set DPM_FLAG_SMART_SUSPEND) and for devices without PM callbacks unless they have never had runtime PM enabled. - The "smart suspend" feature is not enabled for a device if it has not been enabled for the device's parent unless the parent does not take children into account or it has never had runtime PM enabled. - The "smart suspend" feature is not enabled for a device if it has not been enabled for one of the device's suppliers taking runtime PM into account unless that supplier has never had runtime PM enabled. Namely, introduce a new device PM flag called smart_suspend that is only set if the above conditions are met and update all DPM_FLAG_SMART_SUSPEND users to check power.smart_suspend instead of directly checking the former. At the same time, drop the power.set_active flag introduced recently in commit 3775fc538f53 ("PM: sleep: core: Synchronize runtime PM status of parents and children") because it is now sufficient to check power.smart_suspend along with the dev_pm_skip_resume() return value to decide whether or not pm_runtime_set_active() needs to be called for the device.
Link: https://lore.kernel.org/linux-pm/CAPDyKFroyU3YDSfw_Y6k3giVfajg3NQGwNWeteJWqpW29BojhQ@mail.gmail.com/ [1] Fixes: 7585946243d6 ("PM: sleep: core: Restrict power.set_active propagation") Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Acked-by: Bjorn Helgaas # drivers/pci Link: https://patch.msgid.link/1914558.tdWV9SEqCh@rjwysocki.net --- drivers/acpi/device_pm.c | 4 +-- drivers/base/power/main.c | 63 ++++++++++++++++++++++++++++++--------- drivers/mfd/intel-lpss.c | 2 +- drivers/pci/pci-driver.c | 6 ++-- include/linux/device.h | 9 ++++++ include/linux/pm.h | 2 +- 6 files changed, 64 insertions(+), 22 deletions(-) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 3b4d048c4941..dbd4446025ec 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1161,7 +1161,7 @@ EXPORT_SYMBOL_GPL(acpi_subsys_complete); */ int acpi_subsys_suspend(struct device *dev) { - if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || + if (!dev_pm_smart_suspend(dev) || acpi_dev_needs_resume(dev, ACPI_COMPANION(dev))) pm_runtime_resume(dev); @@ -1320,7 +1320,7 @@ EXPORT_SYMBOL_GPL(acpi_subsys_restore_early); */ int acpi_subsys_poweroff(struct device *dev) { - if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || + if (!dev_pm_smart_suspend(dev) || acpi_dev_needs_resume(dev, ACPI_COMPANION(dev))) pm_runtime_resume(dev); diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index acabd9f3e60f..cc9903065900 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -656,15 +656,13 @@ static void device_resume_noirq(struct device *dev, pm_message_t state, bool asy * so change its status accordingly. * * Otherwise, the device is going to be resumed, so set its PM-runtime - * status to "active" unless its power.set_active flag is clear, in + * status to "active" unless its power.smart_suspend flag is clear, in * which case it is not necessary to update its PM-runtime status. 
*/ - if (skip_resume) { + if (skip_resume) pm_runtime_set_suspended(dev); - } else if (dev->power.set_active) { + else if (dev_pm_smart_suspend(dev)) pm_runtime_set_active(dev); - dev->power.set_active = false; - } if (dev->pm_domain) { info = "noirq power domain "; @@ -1282,14 +1280,8 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool asy dev->power.may_skip_resume)) dev->power.must_resume = true; - if (dev->power.must_resume) { - if (dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) { - dev->power.set_active = true; - if (dev->parent && !dev->parent->power.ignore_children) - dev->parent->power.set_active = true; - } + if (dev->power.must_resume) dpm_superior_set_must_resume(dev); - } Complete: complete_all(&dev->power.completion); @@ -1797,6 +1789,49 @@ int dpm_suspend(pm_message_t state) return error; } +static void device_prepare_smart_suspend(struct device *dev) +{ + struct device_link *link; + int idx; + + /* + * The "smart suspend" feature is enabled for devices whose drivers ask + * for it and for devices without PM callbacks unless runtime PM is + * disabled and enabling it is blocked for them. + * + * However, if "smart suspend" is not enabled for the device's parent + * or any of its suppliers that take runtime PM into account, it cannot + * be enabled for the device either. 
+ */ + dev->power.smart_suspend = (dev->power.no_pm_callbacks || + dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) && + !pm_runtime_blocked(dev); + + if (!dev_pm_smart_suspend(dev)) + return; + + if (dev->parent && !dev_pm_smart_suspend(dev->parent) && + !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) { + dev->power.smart_suspend = false; + return; + } + + idx = device_links_read_lock(); + + list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { + if (!(link->flags & DL_FLAG_PM_RUNTIME)) + continue; + + if (!dev_pm_smart_suspend(link->supplier) && + !pm_runtime_blocked(link->supplier)) { + dev->power.smart_suspend = false; + break; + } + } + + device_links_read_unlock(idx); +} + /** * device_prepare - Prepare a device for system power transition. * @dev: Device to handle. @@ -1858,6 +1893,7 @@ static int device_prepare(struct device *dev, pm_message_t state) pm_runtime_put(dev); return ret; } + device_prepare_smart_suspend(dev); /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is @@ -2033,6 +2069,5 @@ void device_pm_check_callbacks(struct device *dev) bool dev_pm_skip_suspend(struct device *dev) { - return dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) && - pm_runtime_status_suspended(dev); + return dev_pm_smart_suspend(dev) && pm_runtime_status_suspended(dev); } diff --git a/drivers/mfd/intel-lpss.c b/drivers/mfd/intel-lpss.c index 3ba05ebb9035..63d6694f7145 100644 --- a/drivers/mfd/intel-lpss.c +++ b/drivers/mfd/intel-lpss.c @@ -480,7 +480,7 @@ EXPORT_SYMBOL_NS_GPL(intel_lpss_remove, "INTEL_LPSS"); static int resume_lpss_device(struct device *dev, void *data) { - if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) + if (!dev_pm_smart_suspend(dev)) pm_runtime_resume(dev); return 0; diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index f57ea36d125d..02726f36beb5 100644 --- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c @@ -812,8 +812,7 @@ static int pci_pm_suspend(struct device *dev) * suspend callbacks can cope with runtime-suspended devices, it is * better to resume the device from runtime suspend here. */ - if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || - pci_dev_need_resume(pci_dev)) { + if (!dev_pm_smart_suspend(dev) || pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; } else { @@ -1151,8 +1150,7 @@ static int pci_pm_poweroff(struct device *dev) } /* The reason to do that is the same as in pci_pm_suspend(). */ - if (!dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND) || - pci_dev_need_resume(pci_dev)) { + if (!dev_pm_smart_suspend(dev) || pci_dev_need_resume(pci_dev)) { pm_runtime_resume(dev); pci_dev->state_saved = false; } else { diff --git a/include/linux/device.h b/include/linux/device.h index 80a5b3268986..615282365052 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1025,6 +1025,15 @@ static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) return !!(dev->power.driver_flags & flags); } +static inline bool dev_pm_smart_suspend(struct device *dev) +{ +#ifdef CONFIG_PM_SLEEP + return dev->power.smart_suspend; +#else + return false; +#endif +} + static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); diff --git a/include/linux/pm.h b/include/linux/pm.h index 6ca6f34c58c3..24647108f0ad 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -680,8 +680,8 @@ struct dev_pm_info { bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ bool async_in_progress:1; /* Owned by the PM core */ + bool smart_suspend:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ - bool set_active:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ #else bool should_wakeup:1; From 208baa3ec9043a664d9acfb8174b332e6b17fb69 Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Wed, 19 Feb 2025 
20:27:15 +0800 Subject: [PATCH 062/139] pm: cpupower: bench: Prevent NULL dereference on malloc failure If malloc returns NULL due to low memory, 'config' pointer can be NULL. Add a check to prevent NULL dereference. Link: https://lore.kernel.org/r/20250219122715.3892223-1-quic_zhonhan@quicinc.com Signed-off-by: Zhongqiu Han Signed-off-by: Shuah Khan --- tools/power/cpupower/bench/parse.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/power/cpupower/bench/parse.c b/tools/power/cpupower/bench/parse.c index 080678d9d74e..bd67c758b33a 100644 --- a/tools/power/cpupower/bench/parse.c +++ b/tools/power/cpupower/bench/parse.c @@ -121,6 +121,10 @@ FILE *prepare_output(const char *dirname) struct config *prepare_default_config() { struct config *config = malloc(sizeof(struct config)); + if (!config) { + perror("malloc"); + return NULL; + } dprintf("loading defaults\n"); From 80d3175a7e073fad281bdc5c7e881f46ab806d97 Mon Sep 17 00:00:00 2001 From: Yiwei Lin Date: Fri, 21 Feb 2025 00:38:46 +0800 Subject: [PATCH 063/139] cpupower: monitor: Exit with error status if execvp() fail In the case that we give an invalid command to idle_monitor for monitoring, the execvp() will fail and thus go to the next line. As a result, we'll see two different monitoring outputs. For example, running `cpupower monitor -i 5 invalidcmd` where `invalidcmd` is not executable.
Link: https://lore.kernel.org/r/20250220163846.2765-1-s921975628@gmail.com Signed-off-by: Yiwei Lin Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c index f746099b5dac..e123aa578881 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -6,6 +6,7 @@ */ +#include <errno.h> #include #include #include @@ -294,7 +295,10 @@ int fork_it(char **argv) if (!child_pid) { /* child */ - execvp(argv[0], argv); + if (execvp(argv[0], argv) == -1) { + printf("Invalid monitor command %s\n", argv[0]); + exit(errno); + } } else { /* parent */ if (child_pid == -1) { From 3698dd6b139dc37b35a9ad83d9330c1f99666c02 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Thu, 13 Feb 2025 11:55:10 +0800 Subject: [PATCH 064/139] cpufreq: governor: Fix negative 'idle_time' handling in dbs_update() We observed an issue that the CPU frequency can't raise up with a 100% CPU load when NOHZ is off and the 'conservative' governor is selected. 'idle_time' can be negative if it's obtained from get_cpu_idle_time_jiffy() when NOHZ is off. This was found and explained in commit 9485e4ca0b48 ("cpufreq: governor: Fix handling of special cases in dbs_update()"). However, commit 7592019634f8 ("cpufreq: governors: Fix long idle detection logic in load calculation") introduced a comparison between 'idle_time' and 'sampling_rate' to detect a long idle interval. While 'idle_time' is converted to int before comparison, it's actually promoted to unsigned again when compared with an unsigned 'sampling_rate'. Hence, this leads to wrong idle interval detection when it's in fact 100% busy and sets policy_dbs->idle_periods to a very large value.
'conservative' adjusts the frequency to minimum because of the large 'idle_periods', such that the frequency can't raise up. 'Ondemand' doesn't use policy_dbs->idle_periods so it fortunately avoids the issue. Correct negative 'idle_time' to 0 before any use of it in dbs_update(). Fixes: 7592019634f8 ("cpufreq: governors: Fix long idle detection logic in load calculation") Signed-off-by: Jie Zhan Reviewed-by: Chen Yu Link: https://patch.msgid.link/20250213035510.2402076-1-zhanjie9@hisilicon.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 45 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index af44ee6a6430..1a7fcaf39cc9 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -145,7 +145,23 @@ unsigned int dbs_update(struct cpufreq_policy *policy) time_elapsed = update_time - j_cdbs->prev_update_time; j_cdbs->prev_update_time = update_time; - idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + /* + * cur_idle_time could be smaller than j_cdbs->prev_cpu_idle if + * it's obtained from get_cpu_idle_time_jiffy() when NOHZ is + * off, where idle_time is calculated by the difference between + * time elapsed in jiffies and "busy time" obtained from CPU + * statistics. If a CPU is 100% busy, the time elapsed and busy + * time should grow with the same amount in two consecutive + * samples, but in practice there could be a tiny difference, + * making the accumulated idle time decrease sometimes. Hence, + * in this case, idle_time should be regarded as 0 in order to + * make the further process correct. 
+ */ + if (cur_idle_time > j_cdbs->prev_cpu_idle) + idle_time = cur_idle_time - j_cdbs->prev_cpu_idle; + else + idle_time = 0; + j_cdbs->prev_cpu_idle = cur_idle_time; if (ignore_nice) { @@ -162,7 +178,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) * calls, so the previous load value can be used then. */ load = j_cdbs->prev_load; - } else if (unlikely((int)idle_time > 2 * sampling_rate && + } else if (unlikely(idle_time > 2 * sampling_rate && j_cdbs->prev_load)) { /* * If the CPU had gone completely idle and a task has @@ -189,30 +205,15 @@ unsigned int dbs_update(struct cpufreq_policy *policy) load = j_cdbs->prev_load; j_cdbs->prev_load = 0; } else { - if (time_elapsed >= idle_time) { + if (time_elapsed > idle_time) load = 100 * (time_elapsed - idle_time) / time_elapsed; - } else { - /* - * That can happen if idle_time is returned by - * get_cpu_idle_time_jiffy(). In that case - * idle_time is roughly equal to the difference - * between time_elapsed and "busy time" obtained - * from CPU statistics. Then, the "busy time" - * can end up being greater than time_elapsed - * (for example, if jiffies_64 and the CPU - * statistics are updated by different CPUs), - * so idle_time may in fact be negative. That - * means, though, that the CPU was busy all - * the time (on the rough average) during the - * last sampling interval and 100 can be - * returned as the load. - */ - load = (int)idle_time < 0 ? 100 : 0; - } + else + load = 0; + j_cdbs->prev_load = load; } - if (unlikely((int)idle_time > 2 * sampling_rate)) { + if (unlikely(idle_time > 2 * sampling_rate)) { unsigned int periods = idle_time / sampling_rate; if (periods < idle_periods) From 1618f635bdf56f3ac158171114e9bf18db234cbf Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 18 Feb 2025 16:20:21 +0800 Subject: [PATCH 065/139] PM: EM: use kfree_rcu() to simplify the code The callback function of call_rcu() just calls kfree(), so use kfree_rcu() instead of call_rcu() + callback function. 
Signed-off-by: Li RongQing Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20250218082021.2766-1-lirongqing@baidu.com Signed-off-by: Rafael J. Wysocki --- kernel/power/energy_model.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 066bcf1c71a1..16f6dcafdb90 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -161,14 +161,6 @@ static void em_debug_create_pd(struct device *dev) {} static void em_debug_remove_pd(struct device *dev) {} #endif -static void em_destroy_table_rcu(struct rcu_head *rp) -{ - struct em_perf_table __rcu *table; - - table = container_of(rp, struct em_perf_table, rcu); - kfree(table); -} - static void em_release_table_kref(struct kref *kref) { struct em_perf_table __rcu *table; @@ -176,7 +168,7 @@ static void em_release_table_kref(struct kref *kref) /* It was the last owner of this table so we can free */ table = container_of(kref, struct em_perf_table, kref); - call_rcu(&table->rcu, em_destroy_table_rcu); + kfree_rcu(table, rcu); } /** From a29ba0023ddfb060473a0f55f2944ccd1c19b408 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 20 Feb 2025 11:40:33 +0000 Subject: [PATCH 066/139] MAINTAINERS: Add Energy Model framework as properly maintained The Energy Model framework had some recent grow and became a bit more complex. Add the proper contact points to maintainers so other developers can get the right support. Signed-off-by: Lukasz Luba Link: https://patch.msgid.link/20250220114103.515278-1-lukasz.luba@arm.com Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index efee40ea589f..2c106088bfbf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8518,6 +8518,15 @@ M: Maxim Levitsky S: Maintained F: drivers/media/rc/ene_ir.* +ENERGY MODEL +M: Lukasz Luba +M: "Rafael J. 
Wysocki" +L: linux-pm@vger.kernel.org +S: Maintained +F: kernel/power/energy_model.c +F: include/linux/energy_model.h +F: Documentation/power/energy-model.rst + EPAPR HYPERVISOR BYTE CHANNEL DEVICE DRIVER M: Laurentiu Tudor L: linuxppc-dev@lists.ozlabs.org From ed7cad0504e38a2a4e1aa6168b6eadee6de531b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Feb 2025 13:48:40 +0100 Subject: [PATCH 067/139] cpufreq: intel_pstate: Relocate platform preference check Move the invocation of intel_pstate_platform_pwr_mgmt_exists() before checking whether or not HWP is enabled because it does not depend on any code running before it except for the vendor check and if CPU performance scaling is going to be carried out by the platform, all of the code that runs before that function (again, except for the vendor check) is redundant. This is not expected to alter any functionality except for the ordering of messages printed by intel_pstate_init() when it is going to return an error before attempting to register the driver. Signed-off-by: Rafael J. Wysocki Acked-by: Srinivas Pandruvada Link: https://patch.msgid.link/2776745.mvXUDI8C0e@rjwysocki.net --- drivers/cpufreq/intel_pstate.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index bc31e9b9b660..fa81054fa411 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3694,6 +3694,15 @@ static int __init intel_pstate_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; + /* + * The Intel pstate driver will be ignored if the platform + * firmware has its own power management modes. 
+ */ + if (intel_pstate_platform_pwr_mgmt_exists()) { + pr_info("P-states controlled by the platform\n"); + return -ENODEV; + } + id = x86_match_cpu(hwp_support_ids); if (id) { hwp_forced = intel_pstate_hwp_is_enabled(); @@ -3749,15 +3758,6 @@ static int __init intel_pstate_init(void) default_driver = &intel_cpufreq; hwp_cpu_matched: - /* - * The Intel pstate driver will be ignored if the platform - * firmware has its own power management modes. - */ - if (intel_pstate_platform_pwr_mgmt_exists()) { - pr_info("P-states controlled by the platform\n"); - return -ENODEV; - } - if (!hwp_active && hwp_only) return -ENOTSUPP; From 6ceb877d5cecd5417d63239bf833a1cd5f8f271c Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:14 +0000 Subject: [PATCH 068/139] cpufreq/amd-pstate: Modify the min_perf calculation in adjust_perf callback Instead of setting a fixed floor at lowest_nonlinear_perf, use the min_limit_perf value, so that it gives the user the freedom to lower the floor further. There are two minimum frequency/perf limits that we need to consider in the adjust_perf callback. One provided by schedutil i.e. the sg_cpu->bw_min value passed in _min_perf arg, another is the effective value of min_freq_qos request that is updated in cpudata->min_limit_perf. Modify the code to use the bigger of these two values. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-4-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 313550fa62d4..17595a2454e1 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -672,7 +672,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, unsigned long capacity) { unsigned long max_perf, min_perf, des_perf, - cap_perf, lowest_nonlinear_perf; + cap_perf, min_limit_perf; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; @@ -684,20 +684,20 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); - cap_perf = READ_ONCE(cpudata->highest_perf); - lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + min_limit_perf = READ_ONCE(cpudata->min_limit_perf); des_perf = cap_perf; if (target_perf < capacity) des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity); - min_perf = READ_ONCE(cpudata->lowest_perf); if (_min_perf < capacity) min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity); + else + min_perf = cap_perf; - if (min_perf < lowest_nonlinear_perf) - min_perf = lowest_nonlinear_perf; + if (min_perf < min_limit_perf) + min_perf = min_limit_perf; max_perf = cpudata->max_limit_perf; if (max_perf < min_perf) From 932da6489669da4d61b711c44af208fa431654fa Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:15 +0000 Subject: [PATCH 069/139] cpufreq/amd-pstate: Remove the redundant des_perf clamping in adjust_perf des_perf is later on clamped between min_perf and max_perf in amd_pstate_update. So, remove the redundant clamping from amd_pstate_adjust_perf. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-5-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 17595a2454e1..0cf24dff55e9 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -703,8 +703,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (max_perf < min_perf) max_perf = min_perf; - des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); - amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, policy->governor->flags); cpufreq_cpu_put(policy); From e9869c836b2a460c48e2d69ae79d786303dbffda Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:16 +0000 Subject: [PATCH 070/139] cpufreq/amd-pstate: Pass min/max_limit_perf as min/max_perf to amd_pstate_update Currently, amd_pstate_update_freq passes the hardware perf limits as min/max_perf to amd_pstate_update, which eventually gets programmed into the min/max_perf fields of the CPPC_REQ register. Instead pass the effective perf limits i.e. min/max_limit_perf values to amd_pstate_update as min/max_perf. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-6-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 0cf24dff55e9..3cb81b826bcb 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -615,7 +615,7 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, { struct cpufreq_freqs freqs; struct amd_cpudata *cpudata = policy->driver_data; - unsigned long max_perf, min_perf, des_perf, cap_perf; + unsigned long des_perf, cap_perf; if (!cpudata->max_freq) return -ENODEV; @@ -624,8 +624,6 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, amd_pstate_update_min_max_limit(policy); cap_perf = READ_ONCE(cpudata->highest_perf); - min_perf = READ_ONCE(cpudata->lowest_perf); - max_perf = cap_perf; freqs.old = policy->cur; freqs.new = target_freq; @@ -642,8 +640,9 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, if (!fast_switch) cpufreq_freq_transition_begin(policy, &freqs); - amd_pstate_update(cpudata, min_perf, des_perf, - max_perf, fast_switch, policy->governor->flags); + amd_pstate_update(cpudata, cpudata->min_limit_perf, des_perf, + cpudata->max_limit_perf, fast_switch, + policy->governor->flags); if (!fast_switch) cpufreq_freq_transition_end(policy, &freqs, false); From 555bbe67a622b297405e246d1868dbda3284bde8 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:17 +0000 Subject: [PATCH 071/139] cpufreq/amd-pstate: Convert all perf values to u8 All perf values are always within 0-255 range, hence convert their datatype to u8 everywhere. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-7-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-trace.h | 46 +++++++++++------------ drivers/cpufreq/amd-pstate.c | 60 +++++++++++++++--------------- drivers/cpufreq/amd-pstate.h | 18 ++++----- 3 files changed, 62 insertions(+), 62 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h index 8d692415d905..f457d4af2c62 100644 --- a/drivers/cpufreq/amd-pstate-trace.h +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -24,9 +24,9 @@ TRACE_EVENT(amd_pstate_perf, - TP_PROTO(unsigned long min_perf, - unsigned long target_perf, - unsigned long capacity, + TP_PROTO(u8 min_perf, + u8 target_perf, + u8 capacity, u64 freq, u64 mperf, u64 aperf, @@ -47,9 +47,9 @@ TRACE_EVENT(amd_pstate_perf, ), TP_STRUCT__entry( - __field(unsigned long, min_perf) - __field(unsigned long, target_perf) - __field(unsigned long, capacity) + __field(u8, min_perf) + __field(u8, target_perf) + __field(u8, capacity) __field(unsigned long long, freq) __field(unsigned long long, mperf) __field(unsigned long long, aperf) @@ -70,10 +70,10 @@ TRACE_EVENT(amd_pstate_perf, __entry->fast_switch = fast_switch; ), - TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s", - (unsigned long)__entry->min_perf, - (unsigned long)__entry->target_perf, - (unsigned long)__entry->capacity, + TP_printk("amd_min_perf=%hhu amd_des_perf=%hhu amd_max_perf=%hhu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s", + (u8)__entry->min_perf, + (u8)__entry->target_perf, + (u8)__entry->capacity, (unsigned long long)__entry->freq, (unsigned long long)__entry->mperf, (unsigned long long)__entry->aperf, @@ -86,10 +86,10 @@ TRACE_EVENT(amd_pstate_perf, TRACE_EVENT(amd_pstate_epp_perf, TP_PROTO(unsigned int cpu_id, - unsigned int highest_perf, - unsigned int epp, - unsigned int min_perf, - unsigned int max_perf, 
+ u8 highest_perf, + u8 epp, + u8 min_perf, + u8 max_perf, bool boost ), @@ -102,10 +102,10 @@ TRACE_EVENT(amd_pstate_epp_perf, TP_STRUCT__entry( __field(unsigned int, cpu_id) - __field(unsigned int, highest_perf) - __field(unsigned int, epp) - __field(unsigned int, min_perf) - __field(unsigned int, max_perf) + __field(u8, highest_perf) + __field(u8, epp) + __field(u8, min_perf) + __field(u8, max_perf) __field(bool, boost) ), @@ -118,12 +118,12 @@ TRACE_EVENT(amd_pstate_epp_perf, __entry->boost = boost; ), - TP_printk("cpu%u: [%u<->%u]/%u, epp=%u, boost=%u", + TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u", (unsigned int)__entry->cpu_id, - (unsigned int)__entry->min_perf, - (unsigned int)__entry->max_perf, - (unsigned int)__entry->highest_perf, - (unsigned int)__entry->epp, + (u8)__entry->min_perf, + (u8)__entry->max_perf, + (u8)__entry->highest_perf, + (u8)__entry->epp, (bool)__entry->boost ) ); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 3cb81b826bcb..69409d43eae8 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -186,7 +186,7 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) static DEFINE_MUTEX(amd_pstate_limits_lock); static DEFINE_MUTEX(amd_pstate_driver_lock); -static s16 msr_get_epp(struct amd_cpudata *cpudata) +static u8 msr_get_epp(struct amd_cpudata *cpudata) { u64 value; int ret; @@ -207,7 +207,7 @@ static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata) return static_call(amd_pstate_get_epp)(cpudata); } -static s16 shmem_get_epp(struct amd_cpudata *cpudata) +static u8 shmem_get_epp(struct amd_cpudata *cpudata) { u64 epp; int ret; @@ -218,11 +218,11 @@ static s16 shmem_get_epp(struct amd_cpudata *cpudata) return ret; } - return (s16)(epp & 0xff); + return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, epp); } -static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, - u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) +static int 
msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, + u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { u64 value, prev; @@ -257,15 +257,15 @@ static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, - u32 min_perf, u32 des_perf, - u32 max_perf, u32 epp, + u8 min_perf, u8 des_perf, + u8 max_perf, u8 epp, bool fast_switch) { return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, max_perf, epp, fast_switch); } -static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) +static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) { u64 value, prev; int ret; @@ -292,12 +292,12 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp); -static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) +static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u8 epp) { return static_call(amd_pstate_set_epp)(cpudata, epp); } -static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) +static int shmem_set_epp(struct amd_cpudata *cpudata, u8 epp) { int ret; struct cppc_perf_ctrls perf_ctrls; @@ -320,7 +320,7 @@ static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, int pref_index) { struct amd_cpudata *cpudata = policy->driver_data; - int epp; + u8 epp; if (!pref_index) epp = cpudata->epp_default; @@ -479,8 +479,8 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) return static_call(amd_pstate_init_perf)(cpudata); } -static int shmem_update_perf(struct amd_cpudata *cpudata, u32 min_perf, - u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) +static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf, + u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { struct cppc_perf_ctrls perf_ctrls; @@ -531,14 +531,14 @@ static inline bool amd_pstate_sample(struct amd_cpudata 
*cpudata) return true; } -static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, - u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) +static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, + u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) { unsigned long max_freq; struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); - u32 nominal_perf = READ_ONCE(cpudata->nominal_perf); + u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); - des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + des_perf = clamp_t(u8, des_perf, min_perf, max_perf); max_freq = READ_ONCE(cpudata->max_limit_freq); policy->cur = div_u64(des_perf * max_freq, max_perf); @@ -550,7 +550,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, /* limit the max perf when core performance boost feature is disabled */ if (!cpudata->boost_supported) - max_perf = min_t(unsigned long, nominal_perf, max_perf); + max_perf = min_t(u8, nominal_perf, max_perf); if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, @@ -591,7 +591,8 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u32 max_limit_perf, min_limit_perf, max_perf, max_freq; + u8 max_limit_perf, min_limit_perf, max_perf; + u32 max_freq; struct amd_cpudata *cpudata = policy->driver_data; max_perf = READ_ONCE(cpudata->highest_perf); @@ -615,7 +616,7 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, { struct cpufreq_freqs freqs; struct amd_cpudata *cpudata = policy->driver_data; - unsigned long des_perf, cap_perf; + u8 des_perf, cap_perf; if (!cpudata->max_freq) return -ENODEV; @@ -670,8 +671,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, unsigned long target_perf, unsigned long capacity) { - unsigned long max_perf, min_perf, des_perf, - cap_perf, 
min_limit_perf; + u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; @@ -905,8 +905,8 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { int ret; u32 min_freq, max_freq; - u32 highest_perf, nominal_perf, nominal_freq; - u32 lowest_nonlinear_perf, lowest_nonlinear_freq; + u8 highest_perf, nominal_perf, lowest_nonlinear_perf; + u32 nominal_freq, lowest_nonlinear_freq; struct cppc_perf_caps cppc_perf; ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); @@ -1113,7 +1113,7 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, char *buf) { - u32 perf; + u8 perf; struct amd_cpudata *cpudata = policy->driver_data; perf = READ_ONCE(cpudata->highest_perf); @@ -1124,7 +1124,7 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, char *buf) { - u32 perf; + u8 perf; struct amd_cpudata *cpudata = policy->driver_data; perf = READ_ONCE(cpudata->prefcore_ranking); @@ -1187,7 +1187,7 @@ static ssize_t show_energy_performance_preference( struct cpufreq_policy *policy, char *buf) { struct amd_cpudata *cpudata = policy->driver_data; - int preference; + u8 preference; switch (cpudata->epp_cached) { case AMD_CPPC_EPP_PERFORMANCE: @@ -1549,7 +1549,7 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - u32 epp; + u8 epp; amd_pstate_update_min_max_limit(policy); @@ -1598,7 +1598,7 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - u64 max_perf; + u8 max_perf; int ret; ret = amd_pstate_cppc_enable(true); 
@@ -1635,7 +1635,7 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - int min_perf; + u8 min_perf; if (cpudata->suspended) return 0; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 9747e3be6cee..19d405c6d805 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -70,13 +70,13 @@ struct amd_cpudata { struct freq_qos_request req[2]; u64 cppc_req_cached; - u32 highest_perf; - u32 nominal_perf; - u32 lowest_nonlinear_perf; - u32 lowest_perf; - u32 prefcore_ranking; - u32 min_limit_perf; - u32 max_limit_perf; + u8 highest_perf; + u8 nominal_perf; + u8 lowest_nonlinear_perf; + u8 lowest_perf; + u8 prefcore_ranking; + u8 min_limit_perf; + u8 max_limit_perf; u32 min_limit_freq; u32 max_limit_freq; @@ -93,11 +93,11 @@ struct amd_cpudata { bool hw_prefcore; /* EPP feature related attributes*/ - s16 epp_cached; + u8 epp_cached; u32 policy; u64 cppc_cap1_cached; bool suspended; - s16 epp_default; + u8 epp_default; }; /* From 620136ced35a9329f4d1ea90e51bee2dfd7ee5b0 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:18 +0000 Subject: [PATCH 072/139] cpufreq/amd-pstate: Modularize perf<->freq conversion Delegate the perf<->frequency conversion to helper functions to reduce code duplication, and improve readability. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-8-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 57 +++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 69409d43eae8..9ab95ec1f828 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -142,6 +142,20 @@ static struct quirk_entry quirk_amd_7k62 = { .lowest_freq = 550, }; +static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val) +{ + u8 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf, + cpudata->nominal_freq); + + return clamp_t(u8, perf_val, cpudata->lowest_perf, cpudata->highest_perf); +} + +static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val) +{ + return DIV_ROUND_UP_ULL((u64)cpudata->nominal_freq * perf_val, + cpudata->nominal_perf); +} + static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) { /** @@ -534,14 +548,12 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) { - unsigned long max_freq; struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); des_perf = clamp_t(u8, des_perf, min_perf, max_perf); - max_freq = READ_ONCE(cpudata->max_limit_freq); - policy->cur = div_u64(des_perf * max_freq, max_perf); + policy->cur = perf_to_freq(cpudata, des_perf); if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { min_perf = des_perf; @@ -591,14 +603,11 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u8 max_limit_perf, min_limit_perf, max_perf; - u32 max_freq; + u8 max_limit_perf, min_limit_perf; struct 
amd_cpudata *cpudata = policy->driver_data; - max_perf = READ_ONCE(cpudata->highest_perf); - max_freq = READ_ONCE(cpudata->max_freq); - max_limit_perf = div_u64(policy->max * max_perf, max_freq); - min_limit_perf = div_u64(policy->min * max_perf, max_freq); + max_limit_perf = freq_to_perf(cpudata, policy->max); + min_limit_perf = freq_to_perf(cpudata, policy->min); if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) min_limit_perf = min(cpudata->nominal_perf, max_limit_perf); @@ -616,21 +625,15 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, { struct cpufreq_freqs freqs; struct amd_cpudata *cpudata = policy->driver_data; - u8 des_perf, cap_perf; - - if (!cpudata->max_freq) - return -ENODEV; + u8 des_perf; if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); - cap_perf = READ_ONCE(cpudata->highest_perf); - freqs.old = policy->cur; freqs.new = target_freq; - des_perf = DIV_ROUND_CLOSEST(target_freq * cap_perf, - cpudata->max_freq); + des_perf = freq_to_perf(cpudata, target_freq); WARN_ON(fast_switch && !policy->fast_switch_enabled); /* @@ -905,7 +908,6 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { int ret; u32 min_freq, max_freq; - u8 highest_perf, nominal_perf, lowest_nonlinear_perf; u32 nominal_freq, lowest_nonlinear_freq; struct cppc_perf_caps cppc_perf; @@ -923,16 +925,17 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) else nominal_freq = cppc_perf.nominal_freq; - highest_perf = READ_ONCE(cpudata->highest_perf); - nominal_perf = READ_ONCE(cpudata->nominal_perf); - max_freq = div_u64((u64)highest_perf * nominal_freq, nominal_perf); + min_freq *= 1000; + nominal_freq *= 1000; - lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); - lowest_nonlinear_freq = div_u64((u64)nominal_freq * lowest_nonlinear_perf, nominal_perf); - WRITE_ONCE(cpudata->min_freq, min_freq * 1000); - WRITE_ONCE(cpudata->lowest_nonlinear_freq, 
lowest_nonlinear_freq * 1000); - WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000); - WRITE_ONCE(cpudata->max_freq, max_freq * 1000); + WRITE_ONCE(cpudata->nominal_freq, nominal_freq); + WRITE_ONCE(cpudata->min_freq, min_freq); + + max_freq = perf_to_freq(cpudata, cpudata->highest_perf); + lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf); + + WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); + WRITE_ONCE(cpudata->max_freq, max_freq); /** * Below values need to be initialized correctly, otherwise driver will fail to load From b899434857b0f1ab460b3c126cbed82ab9b52d43 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:19 +0000 Subject: [PATCH 073/139] cpufreq/amd-pstate: Remove the unnecessary cpufreq_update_policy call The update_limits callback is only called in two conditions. * When the preferred core rankings change. In which case, we just need to change the prefcore ranking in the cpudata struct. As there are no changes to any of the perf values, there is no need to call cpufreq_update_policy() * When the _PPC ACPI object changes, i.e. the highest allowed Pstate changes. The _PPC object is only used for a table based cpufreq driver like acpi-cpufreq, hence is irrelevant for CPPC based amd-pstate. Hence, the cpufreq_update_policy() call becomes unnecessary and can be removed. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-9-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 9ab95ec1f828..9c939be59042 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -853,10 +853,6 @@ static void amd_pstate_update_limits(unsigned int cpu) sched_set_itmt_core_prio((int)cur_high, cpu); } cpufreq_cpu_put(policy); - - if (!highest_perf_changed) - cpufreq_update_policy(cpu); - } /* From 426db24d4db2e4f0d6720aeb7795eafcb9e82640 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:21 +0000 Subject: [PATCH 074/139] cpufreq/amd-pstate: Add missing NULL ptr check in amd_pstate_update Check if policy is NULL before dereferencing it in amd_pstate_update. Fixes: e8f555daacd3 ("cpufreq/amd-pstate: fix setting policy current frequency value") Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-11-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 9c939be59042..6a604f0797d9 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -551,6 +551,9 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); + if (!policy) + return; + des_perf = clamp_t(u8, des_perf, min_perf, max_perf); policy->cur = perf_to_freq(cpudata, des_perf); From 97a705dc1a3654d8d2e466433a897be202a7f0ac Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:22 +0000 Subject: [PATCH 075/139] cpufreq/amd-pstate: Use scope based cleanup for cpufreq_policy refs There have been instances in the past where refcount decrementing is missed while exiting a function. Use automatic scope based cleanup to avoid such errors. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. 
Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-12-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 25 ++++++++----------------- include/linux/cpufreq.h | 3 +++ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 6a604f0797d9..ee7e3f0a4c0a 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -548,7 +548,7 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); if (!policy) @@ -574,8 +574,6 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, } amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); - - cpufreq_cpu_put(policy); } static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) @@ -587,7 +585,8 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) * amd-pstate qos_requests. 
*/ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { - struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = + cpufreq_cpu_get(policy_data->cpu); struct amd_cpudata *cpudata; if (!policy) @@ -595,7 +594,6 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) cpudata = policy->driver_data; policy_data->min = cpudata->lowest_nonlinear_freq; - cpufreq_cpu_put(policy); } cpufreq_verify_within_cpu_limits(policy_data); @@ -678,7 +676,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, unsigned long capacity) { u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf; - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; if (!policy) @@ -710,7 +708,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, policy->governor->flags); - cpufreq_cpu_put(policy); } static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) @@ -824,28 +821,23 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) static void amd_pstate_update_limits(unsigned int cpu) { - struct cpufreq_policy *policy = NULL; + struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; u32 prev_high = 0, cur_high = 0; - int ret; bool highest_perf_changed = false; if (!amd_pstate_prefcore) return; - policy = cpufreq_cpu_get(cpu); if (!policy) return; - cpudata = policy->driver_data; - guard(mutex)(&amd_pstate_driver_lock); - ret = amd_get_highest_perf(cpu, &cur_high); - if (ret) { - cpufreq_cpu_put(policy); + if (amd_get_highest_perf(cpu, &cur_high)) return; - } + + cpudata = policy->driver_data; prev_high = READ_ONCE(cpudata->prefcore_ranking); highest_perf_changed = (prev_high != cur_high); @@ -855,7 +847,6 @@ static void 
amd_pstate_update_limits(unsigned int cpu) if (cur_high < CPPC_MAX_PERF) sched_set_itmt_core_prio((int)cur_high, cpu); } - cpufreq_cpu_put(policy); } /* diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 7fe0981a7e46..dde5212d256c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -210,6 +210,9 @@ static inline struct cpufreq_policy *cpufreq_cpu_get(unsigned int cpu) static inline void cpufreq_cpu_put(struct cpufreq_policy *policy) { } #endif +/* Scope based cleanup macro for cpufreq_policy kobject reference counting */ +DEFINE_FREE(put_cpufreq_policy, struct cpufreq_policy *, if (_T) cpufreq_cpu_put(_T)) + static inline bool policy_is_inactive(struct cpufreq_policy *policy) { return cpumask_empty(policy->cpus); From 3e93edc58a63cf816e6dc853da8e9b0201bd0298 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Wed, 5 Feb 2025 11:25:23 +0000 Subject: [PATCH 076/139] cpufreq/amd-pstate: Remove the unnecessary driver_lock in amd_pstate_update_limits There is no need to take a driver wide lock while updating the highest_perf value in the percpu cpudata struct. Hence remove it. Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20250205112523.201101-13-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ee7e3f0a4c0a..08ae48076812 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -832,8 +832,6 @@ static void amd_pstate_update_limits(unsigned int cpu) if (!policy) return; - guard(mutex)(&amd_pstate_driver_lock); - if (amd_get_highest_perf(cpu, &cur_high)) return; From d2cd195b57cf5ffbe432be01e96f35637e7bd403 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 6 Feb 2025 15:22:59 +0100 Subject: [PATCH 077/139] cpuidle: menu: Drop a redundant local variable Local variable min in get_typical_interval() is updated, but never accessed later, so drop it. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Artem Bityutskiy Tested-by: Christian Loehle Tested-by: Aboorva Devarajan Link: https://patch.msgid.link/13699686.uLZWGnKmhe@rjwysocki.net --- drivers/cpuidle/governors/menu.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 28363bfa3e4c..b19406502b56 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -117,7 +117,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); static unsigned int get_typical_interval(struct menu_device *data) { int i, divisor; - unsigned int min, max, thresh, avg; + unsigned int max, thresh, avg; uint64_t sum, variance; thresh = INT_MAX; /* Discard outliers above this value */ @@ -125,7 +125,6 @@ static unsigned int get_typical_interval(struct menu_device *data) again: /* First calculate the average of past intervals */ - min = UINT_MAX; max = 0; sum = 0; divisor = 0; @@ -136,9 +135,6 @@ static unsigned int get_typical_interval(struct menu_device *data) divisor++; if (value > max) max = value; - - if (value < min) - min = value; } } From 13982929fb08ed4691256072856f50bf7b206b9b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2025 15:24:18 +0100 Subject: [PATCH 078/139] cpuidle: menu: Use one loop for average and variance computations Use the observation that one loop is sufficient to compute the average of an array of values and their variance to eliminate one of the loops from get_typical_interval(). 
While at it, make get_typical_interval() consistently use u64 as the 64-bit unsigned integer data type and rearrange some white space and the declarations of local variables in it (to make them follow the reverse X-mas tree pattern). No intentional functional impact. Signed-off-by: Rafael J. Wysocki Tested-by: Artem Bityutskiy Reviewed-by: Christian Loehle Tested-by: Christian Loehle Tested-by: Aboorva Devarajan Link: https://patch.msgid.link/3339073.aeNJFYEL58@rjwysocki.net --- drivers/cpuidle/governors/menu.c | 61 +++++++++++++++----------------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b19406502b56..ec472af38de6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -116,49 +116,45 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); */ static unsigned int get_typical_interval(struct menu_device *data) { - int i, divisor; - unsigned int max, thresh, avg; - uint64_t sum, variance; - - thresh = INT_MAX; /* Discard outliers above this value */ + unsigned int max, divisor, thresh = INT_MAX; + u64 avg, variance, avg_sq; + int i; again: - - /* First calculate the average of past intervals */ + /* Compute the average and variance of past intervals. */ max = 0; - sum = 0; + avg = 0; + variance = 0; divisor = 0; for (i = 0; i < INTERVALS; i++) { unsigned int value = data->intervals[i]; - if (value <= thresh) { - sum += value; - divisor++; - if (value > max) - max = value; - } + + /* Discard data points above the threshold. 
*/ + if (value > thresh) + continue; + + divisor++; + + avg += value; + variance += (u64)value * value; + + if (value > max) + max = value; } if (!max) return UINT_MAX; - if (divisor == INTERVALS) - avg = sum >> INTERVAL_SHIFT; - else - avg = div_u64(sum, divisor); - - /* Then try to determine variance */ - variance = 0; - for (i = 0; i < INTERVALS; i++) { - unsigned int value = data->intervals[i]; - if (value <= thresh) { - int64_t diff = (int64_t)value - avg; - variance += diff * diff; - } - } - if (divisor == INTERVALS) + if (divisor == INTERVALS) { + avg >>= INTERVAL_SHIFT; variance >>= INTERVAL_SHIFT; - else + } else { + do_div(avg, divisor); do_div(variance, divisor); + } + + avg_sq = avg * avg; + variance -= avg_sq; /* * The typical interval is obtained when standard deviation is @@ -173,10 +169,9 @@ static unsigned int get_typical_interval(struct menu_device *data) * Use this result only if there is no timer to wake us up sooner. */ if (likely(variance <= U64_MAX/36)) { - if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) - || variance <= 400) { + if ((avg_sq > variance * 36 && divisor * 4 >= INTERVALS * 3) || + variance <= 400) return avg; - } } /* From 60256e458e1c29652b2f9e4f2ba71fc7b09bd30c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2025 15:25:18 +0100 Subject: [PATCH 079/139] cpuidle: menu: Tweak threshold use in get_typical_interval() To prepare get_typical_interval() for subsequent changes, rearrange the use of the data point threshold in it a bit and initialize that threshold to UINT_MAX which is more consistent with its data type. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki Tested-by: Artem Bityutskiy Reviewed-by: Christian Loehle Tested-by: Christian Loehle Tested-by: Aboorva Devarajan Link: https://patch.msgid.link/8490144.T7Z3S40VBb@rjwysocki.net --- drivers/cpuidle/governors/menu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index ec472af38de6..680ff20e5528 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -116,7 +116,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); */ static unsigned int get_typical_interval(struct menu_device *data) { - unsigned int max, divisor, thresh = INT_MAX; + unsigned int max, divisor, thresh = UINT_MAX; u64 avg, variance, avg_sq; int i; @@ -129,8 +129,8 @@ static unsigned int get_typical_interval(struct menu_device *data) for (i = 0; i < INTERVALS; i++) { unsigned int value = data->intervals[i]; - /* Discard data points above the threshold. */ - if (value > thresh) + /* Discard data points above or at the threshold. */ + if (value >= thresh) continue; divisor++; @@ -186,7 +186,7 @@ static unsigned int get_typical_interval(struct menu_device *data) if ((divisor * 4) <= INTERVALS * 3) return UINT_MAX; - thresh = max - 1; + thresh = max; goto again; } From 8de7606f0fe2bf5a918fe97d425e16e190a24fe6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2025 15:26:41 +0100 Subject: [PATCH 080/139] cpuidle: menu: Eliminate outliers on both ends of the sample set Currently, get_typical_interval() attempts to eliminate outliers at the high end of the sample set only (probably in order to bias the prediction toward lower values), but this is problematic because if the outliers are present at the low end of the sample set, discarding the highest values will not help to reduce the variance. 
Since the presence of outliers at the low end of the sample set is generally as likely as their presence at the high end of the sample set, modify get_typical_interval() to treat samples at the largest distances from the average (on both ends of the sample set) as outliers. This should increase the likelihood of making a meaningful prediction in some cases. Signed-off-by: Rafael J. Wysocki Reported-by: Artem Bityutskiy Tested-by: Artem Bityutskiy Reviewed-by: Christian Loehle Tested-by: Christian Loehle Tested-by: Aboorva Devarajan Link: https://patch.msgid.link/2301940.iZASKD2KPV@rjwysocki.net --- drivers/cpuidle/governors/menu.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 680ff20e5528..48ebbde750e5 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -116,30 +116,37 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); */ static unsigned int get_typical_interval(struct menu_device *data) { - unsigned int max, divisor, thresh = UINT_MAX; + s64 value, min_thresh = -1, max_thresh = UINT_MAX; + unsigned int max, min, divisor; u64 avg, variance, avg_sq; int i; again: /* Compute the average and variance of past intervals. */ max = 0; + min = UINT_MAX; avg = 0; variance = 0; divisor = 0; for (i = 0; i < INTERVALS; i++) { - unsigned int value = data->intervals[i]; - - /* Discard data points above or at the threshold. */ - if (value >= thresh) + value = data->intervals[i]; + /* + * Discard the samples outside the interval between the min and + * max thresholds. 
+ */ + if (value <= min_thresh || value >= max_thresh) continue; divisor++; avg += value; - variance += (u64)value * value; + variance += value * value; if (value > max) max = value; + + if (value < min) + min = value; } if (!max) @@ -175,10 +182,10 @@ static unsigned int get_typical_interval(struct menu_device *data) } /* - * If we have outliers to the upside in our distribution, discard - * those by setting the threshold to exclude these outliers, then + * If there are outliers, discard them by setting thresholds to exclude + * data points at a large enough distance from the average, then * calculate the average and standard deviation again. Once we get - * down to the bottom 3/4 of our samples, stop excluding samples. + * down to the last 3/4 of our samples, stop excluding samples. * * This can deal with workloads that have long pauses interspersed * with sporadic activity with a bunch of short pauses. @@ -186,7 +193,12 @@ static unsigned int get_typical_interval(struct menu_device *data) if ((divisor * 4) <= INTERVALS * 3) return UINT_MAX; - thresh = max; + /* Update the thresholds for the next round. */ + if (avg - min > max - avg) + min_thresh = min; + else + max_thresh = max; + goto again; } From 85975daeaa4d6ec560bfcd354fc9c08ad7f38888 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2025 15:29:05 +0100 Subject: [PATCH 081/139] cpuidle: menu: Avoid discarding useful information When giving up on making a high-confidence prediction, get_typical_interval() always returns UINT_MAX which means that the next idle interval prediction will be based entirely on the time till the next timer. However, the information represented by the most recent intervals may not be completely useless in those cases. Namely, the largest recent idle interval is an upper bound on the recently observed idle duration, so it is reasonable to assume that the next idle duration is unlikely to exceed it. 
Moreover, this is still true after eliminating the suspected outliers if the sample set still under consideration is at least as large as 50% of the maximum sample set size. Accordingly, make get_typical_interval() return the current maximum recent interval value in that case instead of UINT_MAX. Signed-off-by: Rafael J. Wysocki Reported-by: Artem Bityutskiy Tested-by: Artem Bityutskiy Reviewed-by: Christian Loehle Tested-by: Christian Loehle Tested-by: Aboorva Devarajan Link: https://patch.msgid.link/7770672.EvYhyI6sBW@rjwysocki.net --- drivers/cpuidle/governors/menu.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 48ebbde750e5..30ffb1f69056 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -190,8 +190,19 @@ static unsigned int get_typical_interval(struct menu_device *data) * This can deal with workloads that have long pauses interspersed * with sporadic activity with a bunch of short pauses. */ - if ((divisor * 4) <= INTERVALS * 3) + if (divisor * 4 <= INTERVALS * 3) { + /* + * If there are sufficiently many data points still under + * consideration after the outliers have been eliminated, + * returning without a prediction would be a mistake because it + * is likely that the next interval will not exceed the current + * maximum, so return the latter in that case. + */ + if (divisor >= INTERVALS / 2) + return max; + return UINT_MAX; + } /* Update the thresholds for the next round. */ if (avg - min > max - avg) From 5c350410999653dff8d2975d794088e4c166e8b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 20 Feb 2025 21:13:12 +0100 Subject: [PATCH 082/139] cpuidle: menu: Update documentation after get_typical_interval() changes The documentation of the menu cpuidle governor needs to be updated to match the code behavior after some changes made recently. No functional impact. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/4998484.31r3eYUQgx@rjwysocki.net [ rjw: More specific subject, two typos fixed in the changelog ] Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/cpuidle.rst | 27 +++++++++++++--------- drivers/cpuidle/governors/menu.c | 29 ++++++++---------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/Documentation/admin-guide/pm/cpuidle.rst b/Documentation/admin-guide/pm/cpuidle.rst index eb58d7a5affd..0c090b076224 100644 --- a/Documentation/admin-guide/pm/cpuidle.rst +++ b/Documentation/admin-guide/pm/cpuidle.rst @@ -275,20 +275,25 @@ values and, when predicting the idle duration next time, it computes the average and variance of them. If the variance is small (smaller than 400 square milliseconds) or it is small relative to the average (the average is greater that 6 times the standard deviation), the average is regarded as the "typical -interval" value. Otherwise, the longest of the saved observed idle duration +interval" value. Otherwise, either the longest or the shortest (depending on +which one is farther from the average) of the saved observed idle duration values is discarded and the computation is repeated for the remaining ones. + Again, if the variance of them is small (in the above sense), the average is taken as the "typical interval" value and so on, until either the "typical -interval" is determined or too many data points are disregarded, in which case -the "typical interval" is assumed to equal "infinity" (the maximum unsigned -integer value). +interval" is determined or too many data points are disregarded. In the latter +case, if the size of the set of data points still under consideration is +sufficiently large, the next idle duration is not likely to be above the largest +idle duration value still in that set, so that value is taken as the predicted +next idle duration. 
Finally, if the set of data points still under +consideration is too small, no prediction is made. -If the "typical interval" computed this way is long enough, the governor obtains -the time until the closest timer event with the assumption that the scheduler -tick will be stopped. That time, referred to as the *sleep length* in what follows, -is the upper bound on the time before the next CPU wakeup. It is used to determine -the sleep length range, which in turn is needed to get the sleep length correction -factor. +If the preliminary prediction of the next idle duration computed this way is +long enough, the governor obtains the time until the closest timer event with +the assumption that the scheduler tick will be stopped. That time, referred to +as the *sleep length* in what follows, is the upper bound on the time before the +next CPU wakeup. It is used to determine the sleep length range, which in turn +is needed to get the sleep length correction factor. The ``menu`` governor maintains an array containing several correction factor values that correspond to different sleep length ranges organized so that each @@ -302,7 +307,7 @@ to 1 the correction factor becomes (it must fall between 0 and 1 inclusive). The sleep length is multiplied by the correction factor for the range that it falls into to obtain an approximation of the predicted idle duration that is compared to the "typical interval" determined previously and the minimum of -the two is taken as the idle duration prediction. +the two is taken as the final idle duration prediction. 
If the "typical interval" value is small, which means that the CPU is likely to be woken up soon enough, the sleep length computation is skipped as it may diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 30ffb1f69056..39aa0aea61c6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -41,7 +41,7 @@ * the C state is required to actually break even on this cost. CPUIDLE * provides us this duration in the "target_residency" field. So all that we * need is a good prediction of how long we'll be idle. Like the traditional - * menu governor, we start with the actual known "next timer event" time. + * menu governor, we take the actual known "next timer event" time. * * Since there are other source of wakeups (interrupts for example) than * the next timer event, this estimation is rather optimistic. To get a @@ -50,30 +50,21 @@ * duration always was 50% of the next timer tick, the correction factor will * be 0.5. * - * menu uses a running average for this correction factor, however it uses a - * set of factors, not just a single factor. This stems from the realization - * that the ratio is dependent on the order of magnitude of the expected - * duration; if we expect 500 milliseconds of idle time the likelihood of - * getting an interrupt very early is much higher than if we expect 50 micro - * seconds of idle time. A second independent factor that has big impact on - * the actual factor is if there is (disk) IO outstanding or not. - * (as a special twist, we consider every sleep longer than 50 milliseconds - * as perfect; there are no power gains for sleeping longer than this) - * - * For these two reasons we keep an array of 12 independent factors, that gets - * indexed based on the magnitude of the expected duration as well as the - * "is IO outstanding" property. + * menu uses a running average for this correction factor, but it uses a set of + * factors, not just a single factor. 
This stems from the realization that the + * ratio is dependent on the order of magnitude of the expected duration; if we + * expect 500 milliseconds of idle time the likelihood of getting an interrupt + * very early is much higher than if we expect 50 micro seconds of idle time. + * For this reason, menu keeps an array of 6 independent factors, that gets + * indexed based on the magnitude of the expected duration. * * Repeatable-interval-detector * ---------------------------- * There are some cases where "next timer" is a completely unusable predictor: * Those cases where the interval is fixed, for example due to hardware - * interrupt mitigation, but also due to fixed transfer rate devices such as - * mice. + * interrupt mitigation, but also due to fixed transfer rate devices like mice. * For this, we use a different predictor: We track the duration of the last 8 - * intervals and if the stand deviation of these 8 intervals is below a - * threshold value, we use the average of these intervals as prediction. - * + * intervals and use them to estimate the duration of the next one. */ struct menu_device { From 5e7e39ae15b0ea370e783a9326fdd1d91357fc3e Mon Sep 17 00:00:00 2001 From: David Arcari Date: Thu, 20 Feb 2025 10:11:20 -0500 Subject: [PATCH 083/139] intel_idle: introduce 'no_native' module parameter Since commit 18734958e9bf ("intel_idle: Use ACPI _CST for processor models without C-state tables") the intel_idle driver has had the ability to use the ACPI _CST to populate C-states when the processor model is not recognized. However, even when the processor model is recognized (native mode) there are cases where it is useful to make the driver ignore the per-CPU idle states in lieu of ACPI C-states (such as specific application performance). Add a new 'no_native' module parameter to provide this functionality. 
Signed-off-by: David Arcari Link: https://patch.msgid.link/20250220151120.1131122-1-darcari@redhat.com Reviewed-by: Artem Bityutskiy [ rjw: Spell CPU in capitals ] Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_idle.rst | 18 +++++++++++++----- drivers/idle/intel_idle.c | 14 ++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/pm/intel_idle.rst b/Documentation/admin-guide/pm/intel_idle.rst index 39bd6ecce7de..5940528146eb 100644 --- a/Documentation/admin-guide/pm/intel_idle.rst +++ b/Documentation/admin-guide/pm/intel_idle.rst @@ -192,11 +192,19 @@ even if they have been enumerated (see :ref:`cpu-pm-qos` in Documentation/admin-guide/pm/cpuidle.rst). Setting ``max_cstate`` to 0 causes the ``intel_idle`` initialization to fail. -The ``no_acpi`` and ``use_acpi`` module parameters (recognized by ``intel_idle`` -if the kernel has been configured with ACPI support) can be set to make the -driver ignore the system's ACPI tables entirely or use them for all of the -recognized processor models, respectively (they both are unset by default and -``use_acpi`` has no effect if ``no_acpi`` is set). +The ``no_acpi``, ``use_acpi`` and ``no_native`` module parameters are +recognized by ``intel_idle`` if the kernel has been configured with ACPI +support. In the case that ACPI is not configured these flags have no impact +on functionality. + +``no_acpi`` - Do not use ACPI at all. Only native mode is available, no +ACPI mode. + +``use_acpi`` - No-op in ACPI mode, the driver will consult ACPI tables for +C-states on/off status in native mode. + +``no_native`` - Work only in ACPI mode, no native mode available (ignore +all custom tables). The value of the ``states_off`` module parameter (0 by default) represents a list of idle states to be disabled by default in the form of a bitmask. 
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 324814dc34fa..0f4247e3070c 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1692,6 +1692,10 @@ static bool force_use_acpi __read_mostly; /* No effect if no_acpi is set. */ module_param_named(use_acpi, force_use_acpi, bool, 0444); MODULE_PARM_DESC(use_acpi, "Use ACPI _CST for building the idle states list"); +static bool no_native __read_mostly; /* No effect if no_acpi is set. */ +module_param_named(no_native, no_native, bool, 0444); +MODULE_PARM_DESC(no_native, "Ignore cpu specific (native) idle states in lieu of ACPI idle states"); + static struct acpi_processor_power acpi_state_table __initdata; /** @@ -1831,6 +1835,11 @@ static bool __init intel_idle_off_by_default(unsigned int flags, u32 mwait_hint) } return true; } + +static inline bool ignore_native(void) +{ + return no_native && !no_acpi; +} #else /* !CONFIG_ACPI_PROCESSOR_CSTATE */ #define force_use_acpi (false) @@ -1840,6 +1849,7 @@ static inline bool intel_idle_off_by_default(unsigned int flags, u32 mwait_hint) { return false; } +static inline bool ignore_native(void) { return false; } #endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */ /** @@ -2333,6 +2343,10 @@ static int __init intel_idle_init(void) pr_debug("MWAIT substates: 0x%x\n", mwait_substates); icpu = (const struct idle_cpu *)id->driver_data; + if (icpu && ignore_native()) { + pr_debug("ignoring native CPU idle states\n"); + icpu = NULL; + } if (icpu) { if (icpu->state_table) cpuidle_state_table = icpu->state_table; From 64c66da08d1004f8ff620d32aa0ed3f6168c60d2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 20 Feb 2025 21:11:29 +0100 Subject: [PATCH 084/139] cpuidle: intel_idle: Update MAINTAINERS Update the intel_idle record in MAINTAINERS to reflect the current state of affairs. Signed-off-by: Rafael J. 
Wysocki Link: https://patch.msgid.link/12621866.O9o76ZdvQC@rjwysocki.net --- MAINTAINERS | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index efee40ea589f..a4869cb54442 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11669,12 +11669,14 @@ F: Documentation/driver-api/crypto/iaa/iaa-crypto.rst F: drivers/crypto/intel/iaa/* INTEL IDLE DRIVER -M: Jacob Pan -M: Len Brown +M: Rafael J. Wysocki +M: Artem Bityutskiy +M: Artem Bityutskiy +R: Len Brown L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git F: drivers/idle/intel_idle.c INTEL IDXD DRIVER From 7ebd85022c0075c4465a51e8ace4e08dfce747b1 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 24 Feb 2025 01:06:10 +0000 Subject: [PATCH 085/139] PM: clk: remove unused of_pm_clk_add_clk() The last use of of_pm_clk_add_clk() was removed by 2019's commit fe00f8900ca7 ("irqchip/gic-pm: Update driver to use clk_bulk APIs") Remove it. Note that the plural version of_pm_clk_add_clks() is still being used and is left. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250224010610.187503-1-linux@treblig.org Signed-off-by: Rafael J. Wysocki --- drivers/base/power/clock_ops.c | 33 --------------------------------- include/linux/pm_clock.h | 1 - 2 files changed, 34 deletions(-) diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index e18ba676cdf6..97a53215a274 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -259,39 +259,6 @@ int pm_clk_add_clk(struct device *dev, struct clk *clk) } EXPORT_SYMBOL_GPL(pm_clk_add_clk); - -/** - * of_pm_clk_add_clk - Start using a device clock for power management. - * @dev: Device whose clock is going to be used for power management. 
- * @name: Name of clock that is going to be used for power management. - * - * Add the clock described in the 'clocks' device-tree node that matches - * with the 'name' provided, to the list of clocks used for the power - * management of @dev. On success, returns 0. Returns a negative error - * code if the clock is not found or cannot be added. - */ -int of_pm_clk_add_clk(struct device *dev, const char *name) -{ - struct clk *clk; - int ret; - - if (!dev || !dev->of_node || !name) - return -EINVAL; - - clk = of_clk_get_by_name(dev->of_node, name); - if (IS_ERR(clk)) - return PTR_ERR(clk); - - ret = pm_clk_add_clk(dev, clk); - if (ret) { - clk_put(clk); - return ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(of_pm_clk_add_clk); - /** * of_pm_clk_add_clks - Start using device clock(s) for power management. * @dev: Device whose clock(s) is going to be used for power management. diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 68669ce18720..45c3f3ccbaf8 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -41,7 +41,6 @@ extern int pm_clk_create(struct device *dev); extern void pm_clk_destroy(struct device *dev); extern int pm_clk_add(struct device *dev, const char *con_id); extern int pm_clk_add_clk(struct device *dev, struct clk *clk); -extern int of_pm_clk_add_clk(struct device *dev, const char *name); extern int of_pm_clk_add_clks(struct device *dev); extern void pm_clk_remove(struct device *dev, const char *con_id); extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); From 520a552f19d55825108ab83da093084c9afb50e9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 18 Feb 2025 21:20:46 +0100 Subject: [PATCH 086/139] PM: sleep: Avoid unnecessary checks in device_prepare_smart_suspend() Add an optimization (on top of previous changes) to avoid calling pm_runtime_blocked(), which involves acquiring the device's PM spinlock, for devices with no PM callbacks and runtime PM "blocked". 
Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/2978873.e9J7NaK4W3@rjwysocki.net --- drivers/base/power/main.c | 16 +++++++++------- drivers/base/power/runtime.c | 9 +++++++-- include/linux/pm_runtime.h | 4 ++-- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index cc9903065900..a06ef91fbdb9 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1796,16 +1796,14 @@ static void device_prepare_smart_suspend(struct device *dev) /* * The "smart suspend" feature is enabled for devices whose drivers ask - * for it and for devices without PM callbacks unless runtime PM is - * disabled and enabling it is blocked for them. + * for it and for devices without PM callbacks. * * However, if "smart suspend" is not enabled for the device's parent * or any of its suppliers that take runtime PM into account, it cannot * be enabled for the device either. */ - dev->power.smart_suspend = (dev->power.no_pm_callbacks || - dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) && - !pm_runtime_blocked(dev); + dev->power.smart_suspend = dev->power.no_pm_callbacks || + dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND); if (!dev_pm_smart_suspend(dev)) return; @@ -1843,6 +1841,7 @@ static void device_prepare_smart_suspend(struct device *dev) static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; + bool no_runtime_pm; int ret = 0; /* @@ -1858,7 +1857,7 @@ static int device_prepare(struct device *dev, pm_message_t state) * suspend-resume cycle is complete, so prepare to trigger a warning on * subsequent attempts to enable it. 
*/ - pm_runtime_block_if_disabled(dev); + no_runtime_pm = pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; @@ -1893,7 +1892,10 @@ static int device_prepare(struct device *dev, pm_message_t state) pm_runtime_put(dev); return ret; } - device_prepare_smart_suspend(dev); + /* Do not enable "smart suspend" for devices without runtime PM. */ + if (!no_runtime_pm) + device_prepare_smart_suspend(dev); + /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index c0f5a9f89299..e772e45d30f3 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1460,14 +1460,19 @@ int pm_runtime_barrier(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_barrier); -void pm_runtime_block_if_disabled(struct device *dev) +bool pm_runtime_block_if_disabled(struct device *dev) { + bool ret; + spin_lock_irq(&dev->power.lock); - if (dev->power.disable_depth && dev->power.last_status == RPM_INVALID) + ret = dev->power.disable_depth && dev->power.last_status == RPM_INVALID; + if (ret) dev->power.last_status = RPM_BLOCKED; spin_unlock_irq(&dev->power.lock); + + return ret; } void pm_runtime_unblock(struct device *dev) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index aea0395c10a1..01ead602aedd 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -77,7 +77,7 @@ extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); extern int pm_runtime_barrier(struct device *dev); -extern void pm_runtime_block_if_disabled(struct device *dev); +extern bool pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void 
__pm_runtime_disable(struct device *dev, bool check_resume); @@ -274,7 +274,7 @@ static inline int pm_runtime_get_if_active(struct device *dev) static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } static inline int pm_runtime_barrier(struct device *dev) { return 0; } -static inline void pm_runtime_block_if_disabled(struct device *dev) {} +static inline bool pm_runtime_block_if_disabled(struct device *dev) { return true; } static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} static inline void __pm_runtime_disable(struct device *dev, bool c) {} From 52323ed1444ea5c2a5f1754ea0a2d9c8c216ccdf Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Mon, 24 Feb 2025 09:31:39 +0800 Subject: [PATCH 087/139] PM: hibernate: Avoid deadlock in hibernate_compressor_param_set() syzbot reported a deadlock in lock_system_sleep() (see below). The write operation to "/sys/module/hibernate/parameters/compressor" conflicts with the registration of ieee80211 device, resulting in a deadlock when attempting to acquire system_transition_mutex under param_lock. To avoid this deadlock, change hibernate_compressor_param_set() to use mutex_trylock() for attempting to acquire system_transition_mutex and return -EBUSY when it fails. Task flags need not be saved or adjusted before calling mutex_trylock(&system_transition_mutex) because the caller is not going to end up waiting for this mutex and if it runs concurrently with system suspend in progress, it will be frozen properly when it returns to user space. 
syzbot report: syz-executor895/5833 is trying to acquire lock: ffffffff8e0828c8 (system_transition_mutex){+.+.}-{4:4}, at: lock_system_sleep+0x87/0xa0 kernel/power/main.c:56 but task is already holding lock: ffffffff8e07dc68 (param_lock){+.+.}-{4:4}, at: kernel_param_lock kernel/params.c:607 [inline] ffffffff8e07dc68 (param_lock){+.+.}-{4:4}, at: param_attr_store+0xe6/0x300 kernel/params.c:586 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (param_lock){+.+.}-{4:4}: __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x19b/0xb10 kernel/locking/mutex.c:730 ieee80211_rate_control_ops_get net/mac80211/rate.c:220 [inline] rate_control_alloc net/mac80211/rate.c:266 [inline] ieee80211_init_rate_ctrl_alg+0x18d/0x6b0 net/mac80211/rate.c:1015 ieee80211_register_hw+0x20cd/0x4060 net/mac80211/main.c:1531 mac80211_hwsim_new_radio+0x304e/0x54e0 drivers/net/wireless/virtual/mac80211_hwsim.c:5558 init_mac80211_hwsim+0x432/0x8c0 drivers/net/wireless/virtual/mac80211_hwsim.c:6910 do_one_initcall+0x128/0x700 init/main.c:1257 do_initcall_level init/main.c:1319 [inline] do_initcalls init/main.c:1335 [inline] do_basic_setup init/main.c:1354 [inline] kernel_init_freeable+0x5c7/0x900 init/main.c:1568 kernel_init+0x1c/0x2b0 init/main.c:1457 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 -> #2 (rtnl_mutex){+.+.}-{4:4}: __mutex_lock_common kernel/locking/mutex.c:585 [inline] __mutex_lock+0x19b/0xb10 kernel/locking/mutex.c:730 wg_pm_notification drivers/net/wireguard/device.c:80 [inline] wg_pm_notification+0x49/0x180 drivers/net/wireguard/device.c:64 notifier_call_chain+0xb7/0x410 kernel/notifier.c:85 notifier_call_chain_robust kernel/notifier.c:120 [inline] blocking_notifier_call_chain_robust kernel/notifier.c:345 [inline] blocking_notifier_call_chain_robust+0xc9/0x170 kernel/notifier.c:333 pm_notifier_call_chain_robust+0x27/0x60 kernel/power/main.c:102 
snapshot_open+0x189/0x2b0 kernel/power/user.c:77 misc_open+0x35a/0x420 drivers/char/misc.c:179 chrdev_open+0x237/0x6a0 fs/char_dev.c:414 do_dentry_open+0x735/0x1c40 fs/open.c:956 vfs_open+0x82/0x3f0 fs/open.c:1086 do_open fs/namei.c:3830 [inline] path_openat+0x1e88/0x2d80 fs/namei.c:3989 do_filp_open+0x20c/0x470 fs/namei.c:4016 do_sys_openat2+0x17a/0x1e0 fs/open.c:1428 do_sys_open fs/open.c:1443 [inline] __do_sys_openat fs/open.c:1459 [inline] __se_sys_openat fs/open.c:1454 [inline] __x64_sys_openat+0x175/0x210 fs/open.c:1454 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #1 ((pm_chain_head).rwsem){++++}-{4:4}: down_read+0x9a/0x330 kernel/locking/rwsem.c:1524 blocking_notifier_call_chain_robust kernel/notifier.c:344 [inline] blocking_notifier_call_chain_robust+0xa9/0x170 kernel/notifier.c:333 pm_notifier_call_chain_robust+0x27/0x60 kernel/power/main.c:102 snapshot_open+0x189/0x2b0 kernel/power/user.c:77 misc_open+0x35a/0x420 drivers/char/misc.c:179 chrdev_open+0x237/0x6a0 fs/char_dev.c:414 do_dentry_open+0x735/0x1c40 fs/open.c:956 vfs_open+0x82/0x3f0 fs/open.c:1086 do_open fs/namei.c:3830 [inline] path_openat+0x1e88/0x2d80 fs/namei.c:3989 do_filp_open+0x20c/0x470 fs/namei.c:4016 do_sys_openat2+0x17a/0x1e0 fs/open.c:1428 do_sys_open fs/open.c:1443 [inline] __do_sys_openat fs/open.c:1459 [inline] __se_sys_openat fs/open.c:1454 [inline] __x64_sys_openat+0x175/0x210 fs/open.c:1454 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (system_transition_mutex){+.+.}-{4:4}: check_prev_add kernel/locking/lockdep.c:3163 [inline] check_prevs_add kernel/locking/lockdep.c:3282 [inline] validate_chain kernel/locking/lockdep.c:3906 [inline] __lock_acquire+0x249e/0x3c40 kernel/locking/lockdep.c:5228 lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5851 __mutex_lock_common 
kernel/locking/mutex.c:585 [inline] __mutex_lock+0x19b/0xb10 kernel/locking/mutex.c:730 lock_system_sleep+0x87/0xa0 kernel/power/main.c:56 hibernate_compressor_param_set+0x1c/0x210 kernel/power/hibernate.c:1452 param_attr_store+0x18f/0x300 kernel/params.c:588 module_attr_store+0x55/0x80 kernel/params.c:924 sysfs_kf_write+0x117/0x170 fs/sysfs/file.c:139 kernfs_fop_write_iter+0x33d/0x500 fs/kernfs/file.c:334 new_sync_write fs/read_write.c:586 [inline] vfs_write+0x5ae/0x1150 fs/read_write.c:679 ksys_write+0x12b/0x250 fs/read_write.c:731 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Chain exists of: system_transition_mutex --> rtnl_mutex --> param_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(param_lock); lock(rtnl_mutex); lock(param_lock); lock(system_transition_mutex); *** DEADLOCK *** Reported-by: syzbot+ace60642828c074eb913@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=ace60642828c074eb913 Signed-off-by: Lizhi Xu Link: https://patch.msgid.link/20250224013139.3994500-1-lizhi.xu@windriver.com [ rjw: New subject matching the code changes, changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/hibernate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 10a01af63a80..b129ed1d25a8 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1446,10 +1446,10 @@ static const char * const comp_alg_enabled[] = { static int hibernate_compressor_param_set(const char *compressor, const struct kernel_param *kp) { - unsigned int sleep_flags; int index, ret; - sleep_flags = lock_system_sleep(); + if (!mutex_trylock(&system_transition_mutex)) + return -EBUSY; index = sysfs_match_string(comp_alg_enabled, compressor); if (index >= 0) { @@ -1461,7 +1461,7 @@ static int hibernate_compressor_param_set(const char *compressor, ret = index; } - unlock_system_sleep(sleep_flags); + mutex_unlock(&system_transition_mutex); if (ret) pr_debug("Cannot set specified compressor %s\n", From e8195f0630f1c4c2465074fe81b5fda19efd3148 Mon Sep 17 00:00:00 2001 From: Xu Yang Date: Mon, 24 Feb 2025 15:00:49 +0800 Subject: [PATCH 088/139] PM: sleep: Suppress sleeping parent warning in special case Currently, if power.no_callbacks is set, device_prepare() will also set power.direct_complete for the device. If power.direct_complete is set in device_resume(), the clearing of power.is_prepared will be skipped and if new children appear under the device at that point, a warning will be printed. After commit (f76b168b6f11 PM: Rename dev_pm_info.in_suspend to is_prepared), power.is_prepared is generally cleared in device_resume() before invoking the resume callback for the device which allows that callback to add new children without triggering the warning, but this does not happen for devices with power.direct_complete set. 
This problem is visible in USB where usb_set_interface() can be called before device_complete() clears power.is_prepared for interface devices and since ep devices are added then, the warning is printed: usb 1-1: reset high-speed USB device number 3 using ci_hdrc ep_81: PM: parent 1-1:1.1 should not be sleeping PM: resume devices took 0.936 seconds Since it is legitimate to add the ep devices at that point, the warning above is not particularly useful, so get rid of it by clearing power.is_prepared in device_resume() for devices with power.direct_complete set if they have no PM callbacks, in which case they need not actually resume for the new children to work. Suggested-by: Rafael J. Wysocki Signed-off-by: Xu Yang Link: https://patch.msgid.link/20250224070049.3338646-1-xu.yang_2@nxp.com [ rjw: New subject, changelog edits, rephrased new code comment ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index a06ef91fbdb9..e4103d29a21a 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -928,6 +928,13 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) goto Complete; if (dev->power.direct_complete) { + /* + * Allow new children to be added under the device after this + * point if it has no PM callbacks. + */ + if (dev->power.no_pm_callbacks) + dev->power.is_prepared = false; + /* Match the pm_runtime_disable() in device_suspend(). */ pm_runtime_enable(dev); goto Complete; From 630d55e038728f6f5917da051984a5ec515d152e Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Fri, 21 Feb 2025 05:02:19 -0800 Subject: [PATCH 089/139] PM: wakeup: Remove needless return in three void APIs Remove needless 'return' in the following void APIs: __pm_wakeup_event() pm_wakeup_event() pm_wakeup_hard_event() Since both the API and callee involved are void functions. 
Signed-off-by: Zijun Hu Link: https://patch.msgid.link/20250221-rmv_return-v1-14-cc8dff275827@quicinc.com Signed-off-by: Rafael J. Wysocki --- include/linux/pm_wakeup.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index d501c09c60cd..51e0e8dd5f9e 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -205,17 +205,17 @@ static inline void device_set_awake_path(struct device *dev) static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) { - return pm_wakeup_ws_event(ws, msec, false); + pm_wakeup_ws_event(ws, msec, false); } static inline void pm_wakeup_event(struct device *dev, unsigned int msec) { - return pm_wakeup_dev_event(dev, msec, false); + pm_wakeup_dev_event(dev, msec, false); } static inline void pm_wakeup_hard_event(struct device *dev) { - return pm_wakeup_dev_event(dev, 0, true); + pm_wakeup_dev_event(dev, 0, true); } /** From d2677d57d4b8ec63da8f51357bcf855c4cf47c78 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Feb 2025 11:56:12 +0100 Subject: [PATCH 090/139] PM: runtime: Drop status check from pm_runtime_force_resume() Since pm_runtime_force_resume() requires pm_runtime_force_suspend() to be called before it on the same device, the runtime PM status of the device is RPM_SUSPENDED when it is called unless the device's runtime PM status is changed somewhere else in the meantime. However, even if that happens, the power.needs_force_resume check is still required to pass and that flag is only set by pm_runtime_force_suspend() once and it is cleared at the end of pm_runtime_force_resume(), so it cannot be taken into account twice in a row. According to the above, the pm_runtime_status_suspended(dev) check in pm_runtime_force_resume() is redundant, so drop it. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/2309120.iZASKD2KPV@rjwysocki.net --- drivers/base/power/runtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index a5aed89e1a6b..1714358b541e 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1945,7 +1945,7 @@ int pm_runtime_force_resume(struct device *dev) int (*callback)(struct device *); int ret = 0; - if (!pm_runtime_status_suspended(dev) || !dev->power.needs_force_resume) + if (!dev->power.needs_force_resume) goto out; /* From eeb87d17aceab7803a5a5bcb6cf2817b745157cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Feb 2025 11:53:50 +0100 Subject: [PATCH 091/139] PM: sleep: Adjust check before setting power.must_resume The check before setting power.must_resume in device_suspend_noirq() does not take power.child_count into account, but it should do that, so use pm_runtime_need_not_resume() in it for this purpose and adjust the comment next to it accordingly. Fixes: 107d47b2b95e ("PM: sleep: core: Simplify the SMART_SUSPEND flag handling") Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/3353728.44csPzL39Z@rjwysocki.net --- drivers/base/power/main.c | 13 ++++++------- drivers/base/power/runtime.c | 2 +- include/linux/pm_runtime.h | 2 ++ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e4103d29a21a..b03cdbc75b6d 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1277,14 +1277,13 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool asy dev->power.is_noirq_suspended = true; /* - * Skipping the resume of devices that were in use right before the - * system suspend (as indicated by their PM-runtime usage counters) - * would be suboptimal. Also resume them if doing that is not allowed - * to be skipped. 
+ * Devices must be resumed unless they are explicitly allowed to be left + * in suspend, but even in that case skipping the resume of devices that + * were in use right before the system suspend (as indicated by their + * runtime PM usage counters and child counters) would be suboptimal. */ - if (atomic_read(&dev->power.usage_count) > 1 || - !(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && - dev->power.may_skip_resume)) + if (!(dev_pm_test_driver_flags(dev, DPM_FLAG_MAY_SKIP_RESUME) && + dev->power.may_skip_resume) || !pm_runtime_need_not_resume(dev)) dev->power.must_resume = true; if (dev->power.must_resume) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index e772e45d30f3..3d08eb677177 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1906,7 +1906,7 @@ void pm_runtime_drop_link(struct device_link *link) pm_request_idle(link->supplier); } -static bool pm_runtime_need_not_resume(struct device *dev) +bool pm_runtime_need_not_resume(struct device *dev) { return atomic_read(&dev->power.usage_count) <= 1 && (atomic_read(&dev->power.child_count) == 0 || diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 01ead602aedd..4c5b74b745d5 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -66,6 +66,7 @@ static inline bool queue_pm_work(struct work_struct *work) extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); +extern bool pm_runtime_need_not_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); extern int pm_runtime_force_resume(struct device *dev); @@ -244,6 +245,7 @@ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } +static inline bool pm_runtime_need_not_resume(struct device *dev) 
{return true; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } static inline int pm_runtime_force_resume(struct device *dev) { return 0; } From cb88c229fe77ea16cef2b9c8154cf44d331818a6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Feb 2025 11:45:52 +0100 Subject: [PATCH 092/139] PM: sleep: Update power.smart_suspend under PM spinlock Put the update of the power.smart_suspend device flag under the PM spinlock of the device in case multiple bit fields in struct dev_pm_info occupy one memory location which needs to be updated via RMW every time any of these bit fields is updated. The lock in question is already held around the power.direct_complete flag update in device_prepare() for the same reason, so this change does not add locking-related overhead to the code. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/2368159.ElGaqSPkdT@rjwysocki.net --- drivers/base/power/main.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index b03cdbc75b6d..2f86d7cfdbbc 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1795,9 +1795,10 @@ int dpm_suspend(pm_message_t state) return error; } -static void device_prepare_smart_suspend(struct device *dev) +static bool device_prepare_smart_suspend(struct device *dev) { struct device_link *link; + bool ret = true; int idx; /* @@ -1808,17 +1809,13 @@ static void device_prepare_smart_suspend(struct device *dev) * or any of its suppliers that take runtime PM into account, it cannot * be enabled for the device either. 
*/ - dev->power.smart_suspend = dev->power.no_pm_callbacks || - dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND); - - if (!dev_pm_smart_suspend(dev)) - return; + if (!dev->power.no_pm_callbacks && + !dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) + return false; if (dev->parent && !dev_pm_smart_suspend(dev->parent) && - !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) { - dev->power.smart_suspend = false; - return; - } + !dev->parent->power.ignore_children && !pm_runtime_blocked(dev->parent)) + return false; idx = device_links_read_lock(); @@ -1828,12 +1825,14 @@ static void device_prepare_smart_suspend(struct device *dev) if (!dev_pm_smart_suspend(link->supplier) && !pm_runtime_blocked(link->supplier)) { - dev->power.smart_suspend = false; + ret = false; break; } } device_links_read_unlock(idx); + + return ret; } /** @@ -1847,7 +1846,7 @@ static void device_prepare_smart_suspend(struct device *dev) static int device_prepare(struct device *dev, pm_message_t state) { int (*callback)(struct device *) = NULL; - bool no_runtime_pm; + bool smart_suspend; int ret = 0; /* @@ -1863,7 +1862,7 @@ static int device_prepare(struct device *dev, pm_message_t state) * suspend-resume cycle is complete, so prepare to trigger a warning on * subsequent attempts to enable it. */ - no_runtime_pm = pm_runtime_block_if_disabled(dev); + smart_suspend = !pm_runtime_block_if_disabled(dev); if (dev->power.syscore) return 0; @@ -1899,9 +1898,12 @@ static int device_prepare(struct device *dev, pm_message_t state) return ret; } /* Do not enable "smart suspend" for devices without runtime PM. 
*/ - if (!no_runtime_pm) - device_prepare_smart_suspend(dev); + if (smart_suspend) + smart_suspend = device_prepare_smart_suspend(dev); + spin_lock_irq(&dev->power.lock); + + dev->power.smart_suspend = smart_suspend; /* * A positive return value from ->prepare() means "this device appears * to be runtime-suspended and its state is fine, so if it really is @@ -1909,11 +1911,12 @@ static int device_prepare(struct device *dev, pm_message_t state) * will do the same thing with all of its descendants". This only * applies to suspend transitions, however. */ - spin_lock_irq(&dev->power.lock); dev->power.direct_complete = state.event == PM_EVENT_SUSPEND && (ret > 0 || dev->power.no_pm_callbacks) && !dev_pm_test_driver_flags(dev, DPM_FLAG_NO_DIRECT_COMPLETE); + spin_unlock_irq(&dev->power.lock); + return 0; } From 1476bb20eec33bd68b67c7bb7a5d62063af8148d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Feb 2025 11:47:33 +0100 Subject: [PATCH 093/139] PM: runtime: Convert pm_runtime_blocked() to static inline The comment in pm_runtime_blocked() is actually wrong: power.last_status is not a bit field. Its data type is an enum and so one can reasonably assume that partial updates of it will not be observed. Accordingly, pm_runtime_blocked() can be converted to a static inline function and the related locking overhead can be eliminated, so long as it is only used in system suspend/resume code paths because power.last_status is not expected to be updated concurrently while that code is running. Signed-off-by: Rafael J.
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/1923449.tdWV9SEqCh@rjwysocki.net --- drivers/base/power/runtime.c | 17 ----------------- include/linux/pm_runtime.h | 12 +++++++++++- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 3d08eb677177..42a58ed45a08 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1560,23 +1560,6 @@ void pm_runtime_enable(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_enable); -bool pm_runtime_blocked(struct device *dev) -{ - bool ret; - - /* - * dev->power.last_status is a bit field, so in case it is updated via - * RMW, read it under the spin lock. - */ - spin_lock_irq(&dev->power.lock); - - ret = dev->power.last_status == RPM_BLOCKED; - - spin_unlock_irq(&dev->power.lock); - - return ret; -} - static void pm_runtime_disable_action(void *data) { pm_runtime_dont_use_autosuspend(data); diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 4c5b74b745d5..7fb5a459847e 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -82,7 +82,6 @@ extern bool pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); extern void __pm_runtime_disable(struct device *dev, bool check_resume); -extern bool pm_runtime_blocked(struct device *dev); extern void pm_runtime_allow(struct device *dev); extern void pm_runtime_forbid(struct device *dev); extern void pm_runtime_no_callbacks(struct device *dev); @@ -200,6 +199,17 @@ static inline bool pm_runtime_enabled(struct device *dev) return !dev->power.disable_depth; } +/** + * pm_runtime_blocked - Check if runtime PM enabling is blocked. + * @dev: Target device. + * + * Do not call this function outside system suspend/resume code paths. 
+ */ +static inline bool pm_runtime_blocked(struct device *dev) +{ + return dev->power.last_status == RPM_BLOCKED; +} + /** * pm_runtime_has_no_callbacks - Check if runtime PM callbacks may be present. * @dev: Target device. From a84c2a885bc62a61e08fbcd9976a2a40400470c0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 27 Feb 2025 11:49:12 +0100 Subject: [PATCH 094/139] PM: core: Tweak pm_runtime_block_if_disabled() return value Modify pm_runtime_block_if_disabled() to return true when runtime PM is disabled for the device, regardless of the power.last_status value. This effectively prevents "smart suspend" from being enabled for devices with runtime PM disabled in device_prepare(), even transiently, so update the related comment in that function accordingly. If a device has runtime PM disabled in device_prepare(), it is not actually known whether or not runtime PM will be enabled for that device going forward, so it is more appropriate to postpone the "smart suspend" optimization for the device in the given system suspend-resume cycle than to enable it and get confused going forward. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/13718674.uLZWGnKmhe@rjwysocki.net --- drivers/base/power/main.c | 2 +- drivers/base/power/runtime.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2f86d7cfdbbc..9215ec9f326b 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1897,7 +1897,7 @@ static int device_prepare(struct device *dev, pm_message_t state) pm_runtime_put(dev); return ret; } - /* Do not enable "smart suspend" for devices without runtime PM. */ + /* Do not enable "smart suspend" for devices with disabled runtime PM. 
*/ if (smart_suspend) smart_suspend = device_prepare_smart_suspend(dev); diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 42a58ed45a08..18e40dce460a 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1466,8 +1466,8 @@ bool pm_runtime_block_if_disabled(struct device *dev) spin_lock_irq(&dev->power.lock); - ret = dev->power.disable_depth && dev->power.last_status == RPM_INVALID; - if (ret) + ret = !pm_runtime_enabled(dev); + if (ret && dev->power.last_status == RPM_INVALID) dev->power.last_status = RPM_BLOCKED; spin_unlock_irq(&dev->power.lock); From 3038b22bc098565b93cfb3cc8933b89701feb407 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Feb 2025 17:38:58 +0100 Subject: [PATCH 095/139] PM: sleep: Rename power.async_in_progress to power.work_in_progress Rename the async_in_progress field in struct dev_pm_info to work_in_progress as after subsequent changes it will mean work in general rather than just async work. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/3338693.aeNJFYEL58@rjwysocki.net --- drivers/base/power/main.c | 12 ++++++------ include/linux/pm.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9215ec9f326b..edf5a4af8b03 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -602,7 +602,7 @@ static bool dpm_async_fn(struct device *dev, async_func_t func) reinit_completion(&dev->power.completion); if (is_async(dev)) { - dev->power.async_in_progress = true; + dev->power.work_in_progress = true; get_device(dev); @@ -614,9 +614,9 @@ static bool dpm_async_fn(struct device *dev, async_func_t func) /* * Because async_schedule_dev_nocall() above has returned false or it * has not been called at all, func() is not running and it is safe to - * update the async_in_progress flag without extra synchronization. 
+ * update the work_in_progress flag without extra synchronization. */ - dev->power.async_in_progress = false; + dev->power.work_in_progress = false; return false; } @@ -736,7 +736,7 @@ static void dpm_noirq_resume_devices(pm_message_t state) dev = to_device(dpm_noirq_list.next); list_move_tail(&dev->power.entry, &dpm_late_early_list); - if (!dev->power.async_in_progress) { + if (!dev->power.work_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); @@ -876,7 +876,7 @@ void dpm_resume_early(pm_message_t state) dev = to_device(dpm_late_early_list.next); list_move_tail(&dev->power.entry, &dpm_suspended_list); - if (!dev->power.async_in_progress) { + if (!dev->power.work_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); @@ -1049,7 +1049,7 @@ void dpm_resume(pm_message_t state) dev = to_device(dpm_suspended_list.next); list_move_tail(&dev->power.entry, &dpm_prepared_list); - if (!dev->power.async_in_progress) { + if (!dev->power.work_in_progress) { get_device(dev); mutex_unlock(&dpm_list_mtx); diff --git a/include/linux/pm.h b/include/linux/pm.h index 24647108f0ad..63a8dffda787 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -679,7 +679,7 @@ struct dev_pm_info { bool wakeup_path:1; bool syscore:1; bool no_pm_callbacks:1; /* Owned by the PM core */ - bool async_in_progress:1; /* Owned by the PM core */ + bool work_in_progress:1; /* Owned by the PM core */ bool smart_suspend:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ From 628ccd80529223c19d22e06086be6dd87d064e0c Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 25 Feb 2025 17:39:40 +0100 Subject: [PATCH 096/139] PM: sleep: Rearrange dpm_async_fn() and async state clearing In preparation for subsequent changes, move the power.completion reinitialization along with clearing power.work_in_progress into a separate function called dpm_clear_async_state() and rearrange dpm_async_fn() to get rid of unnecessary indentation. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/8494650.T7Z3S40VBb@rjwysocki.net --- drivers/base/power/main.c | 44 ++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index edf5a4af8b03..d240dc352b1f 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -599,27 +599,34 @@ static bool is_async(struct device *dev) static bool dpm_async_fn(struct device *dev, async_func_t func) { - reinit_completion(&dev->power.completion); + if (!is_async(dev)) + return false; - if (is_async(dev)) { - dev->power.work_in_progress = true; + dev->power.work_in_progress = true; - get_device(dev); + get_device(dev); - if (async_schedule_dev_nocall(func, dev)) - return true; + if (async_schedule_dev_nocall(func, dev)) + return true; + + put_device(dev); - put_device(dev); - } /* - * Because async_schedule_dev_nocall() above has returned false or it - * has not been called at all, func() is not running and it is safe to - * update the work_in_progress flag without extra synchronization. + * async_schedule_dev_nocall() above has returned false, so func() is + * not running and it is safe to update power.work_in_progress without + * extra synchronization. 
*/ dev->power.work_in_progress = false; + return false; } +static void dpm_clear_async_state(struct device *dev) +{ + reinit_completion(&dev->power.completion); + dev->power.work_in_progress = false; +} + /** * device_resume_noirq - Execute a "noirq resume" callback for given device. * @dev: Device to handle. @@ -729,8 +736,10 @@ static void dpm_noirq_resume_devices(pm_message_t state) * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ - list_for_each_entry(dev, &dpm_noirq_list, power.entry) + list_for_each_entry(dev, &dpm_noirq_list, power.entry) { + dpm_clear_async_state(dev); dpm_async_fn(dev, async_resume_noirq); + } while (!list_empty(&dpm_noirq_list)) { dev = to_device(dpm_noirq_list.next); @@ -869,8 +878,10 @@ void dpm_resume_early(pm_message_t state) * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. */ - list_for_each_entry(dev, &dpm_late_early_list, power.entry) + list_for_each_entry(dev, &dpm_late_early_list, power.entry) { + dpm_clear_async_state(dev); dpm_async_fn(dev, async_resume_early); + } while (!list_empty(&dpm_late_early_list)) { dev = to_device(dpm_late_early_list.next); @@ -1042,8 +1053,10 @@ void dpm_resume(pm_message_t state) * Trigger the resume of "async" devices upfront so they don't have to * wait for the "non-async" ones they don't depend on. 
*/ - list_for_each_entry(dev, &dpm_suspended_list, power.entry) + list_for_each_entry(dev, &dpm_suspended_list, power.entry) { + dpm_clear_async_state(dev); dpm_async_fn(dev, async_resume); + } while (!list_empty(&dpm_suspended_list)) { dev = to_device(dpm_suspended_list.next); @@ -1320,6 +1333,7 @@ static int dpm_noirq_suspend_devices(pm_message_t state) list_move(&dev->power.entry, &dpm_noirq_list); + dpm_clear_async_state(dev); if (dpm_async_fn(dev, async_suspend_noirq)) continue; @@ -1497,6 +1511,7 @@ int dpm_suspend_late(pm_message_t state) list_move(&dev->power.entry, &dpm_late_early_list); + dpm_clear_async_state(dev); if (dpm_async_fn(dev, async_suspend_late)) continue; @@ -1764,6 +1779,7 @@ int dpm_suspend(pm_message_t state) list_move(&dev->power.entry, &dpm_suspended_list); + dpm_clear_async_state(dev); if (dpm_async_fn(dev, async_suspend)) continue; From 72263869656d09a5f9727504bb1f3cb7b010f0e5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Feb 2025 18:06:18 +0100 Subject: [PATCH 097/139] PM: runtime: Unify error handling during suspend and resume There is a confusing difference in error handling between rpm_suspend() and rpm_resume() related to the special way in which -EAGAIN and -EBUSY error values are treated by the former. Also, converting -EACCES coming from the callback to I/O error, which it quite likely is not, may confuse runtime PM users. To address the above, modify rpm_callback() to convert -EACCES coming from the driver to -EAGAIN and to set power.runtime_error only if the return value is not -EAGAIN or -EBUSY. This will cause the error handling in rpm_resume() and rpm_suspend() to work consistently, so drop the no longer needed -EAGAIN or -EBUSY special case from the latter and make it retry autosuspend if power.runtime_error is unset. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/12620037.O9o76ZdvQC@rjwysocki.net --- drivers/base/power/runtime.c | 40 +++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 1714358b541e..da74e1c69f7a 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -448,8 +448,19 @@ static int rpm_callback(int (*cb)(struct device *), struct device *dev) retval = __rpm_callback(cb, dev); } - dev->power.runtime_error = retval; - return retval != -EACCES ? retval : -EIO; + /* + * Since -EACCES means that runtime PM is disabled for the given device, + * it should not be returned by runtime PM callbacks. If it is returned + * nevertheless, assume it to be a transient error and convert it to + * -EAGAIN. + */ + if (retval == -EACCES) + retval = -EAGAIN; + + if (retval != -EAGAIN && retval != -EBUSY) + dev->power.runtime_error = retval; + + return retval; } /** @@ -725,21 +736,18 @@ static int rpm_suspend(struct device *dev, int rpmflags) dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); - if (retval == -EAGAIN || retval == -EBUSY) { - dev->power.runtime_error = 0; + /* + * On transient errors, if the callback routine failed an autosuspend, + * and if the last_busy time has been updated so that there is a new + * autosuspend expiration time, automatically reschedule another + * autosuspend. + */ + if (!dev->power.runtime_error && (rpmflags & RPM_AUTO) && + pm_runtime_autosuspend_expiration(dev) != 0) + goto repeat; + + pm_runtime_cancel_pending(dev); - /* - * If the callback routine failed an autosuspend, and - * if the last_busy time has been updated so that there - * is a new autosuspend expiration time, automatically - * reschedule another autosuspend. 
- */ - if ((rpmflags & RPM_AUTO) && - pm_runtime_autosuspend_expiration(dev) != 0) - goto repeat; - } else { - pm_runtime_cancel_pending(dev); - } goto out; } From 4a1e3bf61fc78ad100018adb573355303915dca3 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Wed, 26 Feb 2025 12:51:59 -0600 Subject: [PATCH 098/139] cpufreq: tegra194: Allow building for Tegra234 Support was added for Tegra234 in the referenced commit, but the Kconfig was not updated to allow building for the arch. Fixes: 273bc890a2a8 ("cpufreq: tegra194: Add support for Tegra234") Signed-off-by: Aaron Kling Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 9e46960f6a86..4f9cb943d945 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -254,7 +254,7 @@ config ARM_TEGRA186_CPUFREQ config ARM_TEGRA194_CPUFREQ tristate "Tegra194 CPUFreq support" - depends on ARCH_TEGRA_194_SOC || (64BIT && COMPILE_TEST) + depends on ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC || (64BIT && COMPILE_TEST) depends on TEGRA_BPMP default y help From a1d1d8fb653532638cfb3ee0b7e67ebd5327a3d6 Mon Sep 17 00:00:00 2001 From: Dhananjay Ugwekar Date: Sat, 22 Feb 2025 03:32:22 +0000 Subject: [PATCH 099/139] cpufreq/amd-pstate: Fix the clamping of perf values The clamping in freq_to_perf() is broken right now, as we first typecast (read wraparound) the overflowing value into a u8 and then clamp it down. So, use a u32 to store the >255 value in certain edge cases and then clamp it down into a u8. Also, use an "explicit typecast + clamp" instead of just a "clamp_t" as the latter typecasts first and then clamps between the limits, which defeats our purpose.
Fixes: 620136ced35a ("cpufreq/amd-pstate: Modularize perf<->freq conversion") Signed-off-by: Dhananjay Ugwekar Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250222033221.554976-1-dhananjay.ugwekar@amd.com Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 08ae48076812..56930424c9fa 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -144,10 +144,10 @@ static struct quirk_entry quirk_amd_7k62 = { static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val) { - u8 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf, + u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf, cpudata->nominal_freq); - return clamp_t(u8, perf_val, cpudata->lowest_perf, cpudata->highest_perf); + return (u8)clamp(perf_val, cpudata->lowest_perf, cpudata->highest_perf); } static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val) From b7a41156588ad03757bf0a2f0e05d6cbcebeaa9e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 17 Feb 2025 13:28:51 -0600 Subject: [PATCH 100/139] cpufreq/amd-pstate: Invalidate cppc_req_cached during suspend During resume it's possible the firmware didn't restore the CPPC request MSR but the kernel thinks the values line up. This leads to incorrect performance after resume from suspend. To fix the issue invalidate the cached value at suspend. During resume use the saved values programmed as cached limits. Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Reported-by: Miroslav Pavleski Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217931 Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 56930424c9fa..44318eb33463 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1605,7 +1605,7 @@ static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) max_perf, policy->boost_enabled); } - return amd_pstate_update_perf(cpudata, 0, 0, max_perf, cpudata->epp_cached, false); + return amd_pstate_epp_update_limit(policy); } static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) @@ -1654,6 +1654,9 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) if (cppc_state != AMD_PSTATE_ACTIVE) return 0; + /* invalidate to ensure it's rewritten during resume */ + cpudata->cppc_req_cached = 0; + /* set this flag to avoid setting core offline*/ cpudata->suspended = true; From a9ba0fd452d82ca0da170eb6291aac01075a17d5 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 11 Dec 2024 14:57:57 -0600 Subject: [PATCH 101/139] cpufreq/amd-pstate: Show a warning when a CPU fails to setup I came across a system that MSR_AMD_CPPC_CAP1 for some CPUs isn't populated. This is an unexpected behavior that is most likely a BIOS bug. In the event it happens I'd like users to report bugs to properly root cause and get this fixed. Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 44318eb33463..29250638a2ac 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1028,6 +1028,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) free_cpudata2: freq_qos_remove_request(&cpudata->req[0]); free_cpudata1: + pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret); kfree(cpudata); return ret; } @@ -1521,6 +1522,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) return 0; free_cpudata1: + pr_warn("Failed to initialize CPU %d: %d\n", policy->cpu, ret); kfree(cpudata); return ret; } From a9b9b4c2a4cdd00428d14914e3c18be3856aba71 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 23 Jan 2025 16:16:01 -0600 Subject: [PATCH 102/139] cpufreq/amd-pstate: Drop min and max cached frequencies Use the perf_to_freq helpers to calculate this on the fly. As the members are no longer cached add an extra check into amd_pstate_epp_update_limit() to avoid unnecessary calls in amd_pstate_update_min_max_limit(). Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 14 +++++------ drivers/cpufreq/amd-pstate.c | 43 +++++++++------------------------ drivers/cpufreq/amd-pstate.h | 9 ++----- 3 files changed, 20 insertions(+), 46 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 3a0a380c3590..445278cf40b6 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -214,14 +214,14 @@ static void amd_pstate_ut_check_freq(u32 index) break; cpudata = policy->driver_data; - if (!((cpudata->max_freq >= cpudata->nominal_freq) && + if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) && (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && - (cpudata->lowest_nonlinear_freq > cpudata->min_freq) && - (cpudata->min_freq > 0))) { + (cpudata->lowest_nonlinear_freq > policy->cpuinfo.min_freq) && + (policy->cpuinfo.min_freq > 0))) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", - __func__, cpu, cpudata->max_freq, cpudata->nominal_freq, - cpudata->lowest_nonlinear_freq, cpudata->min_freq); + __func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq, + cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq); goto skip_test; } @@ -233,13 +233,13 @@ static void amd_pstate_ut_check_freq(u32 index) } if (cpudata->boost_supported) { - if ((policy->max == cpudata->max_freq) || + if ((policy->max == policy->cpuinfo.max_freq) || (policy->max == cpudata->nominal_freq)) amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; else { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", - __func__, cpu, policy->max, cpudata->max_freq, + __func__, cpu, policy->max, policy->cpuinfo.max_freq, cpudata->nominal_freq); goto skip_test; } diff --git 
a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 29250638a2ac..ac5a6fc61db7 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -717,7 +717,7 @@ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) int ret = 0; nominal_freq = READ_ONCE(cpudata->nominal_freq); - max_freq = READ_ONCE(cpudata->max_freq); + max_freq = perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf)); if (on) policy->cpuinfo.max_freq = max_freq; @@ -917,13 +917,10 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) nominal_freq *= 1000; WRITE_ONCE(cpudata->nominal_freq, nominal_freq); - WRITE_ONCE(cpudata->min_freq, min_freq); max_freq = perf_to_freq(cpudata, cpudata->highest_perf); lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf); - WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); - WRITE_ONCE(cpudata->max_freq, max_freq); /** * Below values need to be initialized correctly, otherwise driver will fail to load @@ -948,9 +945,9 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { - int min_freq, max_freq, ret; - struct device *dev; struct amd_cpudata *cpudata; + struct device *dev; + int ret; /* * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, @@ -981,17 +978,11 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; - min_freq = READ_ONCE(cpudata->min_freq); - max_freq = READ_ONCE(cpudata->max_freq); - policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu); policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu); - policy->min = min_freq; - policy->max = max_freq; - - policy->cpuinfo.min_freq = min_freq; - policy->cpuinfo.max_freq = max_freq; + policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf); + policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, 
cpudata->highest_perf); policy->boost_enabled = READ_ONCE(cpudata->boost_supported); @@ -1015,9 +1006,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) goto free_cpudata2; } - cpudata->max_limit_freq = max_freq; - cpudata->min_limit_freq = min_freq; - policy->driver_data = cpudata; if (!current_pstate_driver->adjust_perf) @@ -1075,14 +1063,10 @@ static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy) static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, char *buf) { - int max_freq; struct amd_cpudata *cpudata = policy->driver_data; - max_freq = READ_ONCE(cpudata->max_freq); - if (max_freq < 0) - return max_freq; - return sysfs_emit(buf, "%u\n", max_freq); + return sysfs_emit(buf, "%u\n", perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf))); } static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, @@ -1440,10 +1424,10 @@ static bool amd_pstate_acpi_pm_profile_undefined(void) static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) { - int min_freq, max_freq, ret; struct amd_cpudata *cpudata; struct device *dev; u64 value; + int ret; /* * Resetting PERF_CTL_MSR will put the CPU in P0 frequency, @@ -1474,19 +1458,13 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; - min_freq = READ_ONCE(cpudata->min_freq); - max_freq = READ_ONCE(cpudata->max_freq); - - policy->cpuinfo.min_freq = min_freq; - policy->cpuinfo.max_freq = max_freq; + policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf); + policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf); /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; policy->driver_data = cpudata; - policy->min = policy->cpuinfo.min_freq; - policy->max = policy->cpuinfo.max_freq; - policy->boost_enabled = READ_ONCE(cpudata->boost_supported); /* @@ -1544,7 +1522,8 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy 
*policy) struct amd_cpudata *cpudata = policy->driver_data; u8 epp; - amd_pstate_update_min_max_limit(policy); + if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) + amd_pstate_update_min_max_limit(policy); if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) epp = 0; diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 19d405c6d805..014993369245 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -46,8 +46,6 @@ struct amd_aperf_mperf { * @max_limit_perf: Cached value of the performance corresponding to policy->max * @min_limit_freq: Cached value of policy->min (in khz) * @max_limit_freq: Cached value of policy->max (in khz) - * @max_freq: the frequency (in khz) that mapped to highest_perf - * @min_freq: the frequency (in khz) that mapped to lowest_perf * @nominal_freq: the frequency (in khz) that mapped to nominal_perf * @lowest_nonlinear_freq: the frequency (in khz) that mapped to lowest_nonlinear_perf * @cur: Difference of Aperf/Mperf/tsc count between last and current sample @@ -77,11 +75,8 @@ struct amd_cpudata { u8 prefcore_ranking; u8 min_limit_perf; u8 max_limit_perf; - u32 min_limit_freq; - u32 max_limit_freq; - - u32 max_freq; - u32 min_freq; + u32 min_limit_freq; + u32 max_limit_freq; u32 nominal_freq; u32 lowest_nonlinear_freq; From 009d1c29a45194212e9310ccd91a19a673908a5c Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 17 Jan 2025 18:34:38 -0600 Subject: [PATCH 103/139] cpufreq/amd-pstate: Move perf values into a union By storing perf values in a union all the writes and reads can be done atomically, removing the need for some concurrency protections. While making this change, also drop the cached frequency values, using inline helpers to calculate them on demand from perf value. Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 18 +-- drivers/cpufreq/amd-pstate.c | 213 ++++++++++++++++++-------------- drivers/cpufreq/amd-pstate.h | 51 +++++--- 3 files changed, 162 insertions(+), 120 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 445278cf40b6..5f6a92a816e6 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -129,6 +129,7 @@ static void amd_pstate_ut_check_perf(u32 index) struct cppc_perf_caps cppc_perf; struct cpufreq_policy *policy = NULL; struct amd_cpudata *cpudata = NULL; + union perf_cached cur_perf; for_each_possible_cpu(cpu) { policy = cpufreq_cpu_get(cpu); @@ -162,19 +163,20 @@ static void amd_pstate_ut_check_perf(u32 index) lowest_perf = AMD_CPPC_LOWEST_PERF(cap1); } - if (highest_perf != READ_ONCE(cpudata->highest_perf) && !cpudata->hw_prefcore) { + cur_perf = READ_ONCE(cpudata->perf); + if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) { pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n", - __func__, cpu, highest_perf, cpudata->highest_perf); + __func__, cpu, highest_perf, cur_perf.highest_perf); goto skip_test; } - if ((nominal_perf != READ_ONCE(cpudata->nominal_perf)) || - (lowest_nonlinear_perf != READ_ONCE(cpudata->lowest_nonlinear_perf)) || - (lowest_perf != READ_ONCE(cpudata->lowest_perf))) { + if (nominal_perf != cur_perf.nominal_perf || + (lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) || + (lowest_perf != cur_perf.lowest_perf)) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n", - __func__, cpu, nominal_perf, cpudata->nominal_perf, - lowest_nonlinear_perf, cpudata->lowest_nonlinear_perf, - lowest_perf, cpudata->lowest_perf); + __func__, cpu, nominal_perf, cur_perf.nominal_perf, + lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf, + 
lowest_perf, cur_perf.lowest_perf); goto skip_test; } diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index ac5a6fc61db7..983c8728701e 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -142,18 +142,17 @@ static struct quirk_entry quirk_amd_7k62 = { .lowest_freq = 550, }; -static inline u8 freq_to_perf(struct amd_cpudata *cpudata, unsigned int freq_val) +static inline u8 freq_to_perf(union perf_cached perf, u32 nominal_freq, unsigned int freq_val) { - u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * cpudata->nominal_perf, - cpudata->nominal_freq); + u32 perf_val = DIV_ROUND_UP_ULL((u64)freq_val * perf.nominal_perf, nominal_freq); - return (u8)clamp(perf_val, cpudata->lowest_perf, cpudata->highest_perf); + return (u8)clamp(perf_val, perf.lowest_perf, perf.highest_perf); } -static inline u32 perf_to_freq(struct amd_cpudata *cpudata, u8 perf_val) +static inline u32 perf_to_freq(union perf_cached perf, u32 nominal_freq, u8 perf_val) { - return DIV_ROUND_UP_ULL((u64)cpudata->nominal_freq * perf_val, - cpudata->nominal_perf); + return DIV_ROUND_UP_ULL((u64)nominal_freq * perf_val, + perf.nominal_perf); } static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) @@ -347,7 +346,9 @@ static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, } if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, + union perf_cached perf = READ_ONCE(cpudata->perf); + + trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp, FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), @@ -425,6 +426,7 @@ static inline int amd_pstate_cppc_enable(bool enable) static int msr_init_perf(struct amd_cpudata *cpudata) { + union perf_cached perf = READ_ONCE(cpudata->perf); u64 cap1, numerator; int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, @@ -436,19 +438,21 @@ static int 
msr_init_perf(struct amd_cpudata *cpudata) if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, numerator); - WRITE_ONCE(cpudata->max_limit_perf, numerator); - WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); + perf.highest_perf = numerator; + perf.max_limit_perf = numerator; + perf.min_limit_perf = AMD_CPPC_LOWEST_PERF(cap1); + perf.nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1); + perf.lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1); + perf.lowest_perf = AMD_CPPC_LOWEST_PERF(cap1); + WRITE_ONCE(cpudata->perf, perf); WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1)); return 0; } static int shmem_init_perf(struct amd_cpudata *cpudata) { struct cppc_perf_caps cppc_perf; + union perf_cached perf = READ_ONCE(cpudata->perf); u64 numerator; int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); @@ -459,14 +463,14 @@ static int shmem_init_perf(struct amd_cpudata *cpudata) if (ret) return ret; - WRITE_ONCE(cpudata->highest_perf, numerator); - WRITE_ONCE(cpudata->max_limit_perf, numerator); - WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); - WRITE_ONCE(cpudata->lowest_nonlinear_perf, - cppc_perf.lowest_nonlinear_perf); - WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); + perf.highest_perf = numerator; + perf.max_limit_perf = numerator; + perf.min_limit_perf = cppc_perf.lowest_perf; + perf.nominal_perf = cppc_perf.nominal_perf; + perf.lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; + perf.lowest_perf = cppc_perf.lowest_perf; + WRITE_ONCE(cpudata->perf, perf); WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf); if (cppc_state == AMD_PSTATE_ACTIVE) return 0; @@ -549,14 +553,14 @@ static void amd_pstate_update(struct amd_cpudata 
*cpudata, u8 min_perf, u8 des_perf, u8 max_perf, bool fast_switch, int gov_flags) { struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpudata->cpu); - u8 nominal_perf = READ_ONCE(cpudata->nominal_perf); + union perf_cached perf = READ_ONCE(cpudata->perf); if (!policy) return; des_perf = clamp_t(u8, des_perf, min_perf, max_perf); - policy->cur = perf_to_freq(cpudata, des_perf); + policy->cur = perf_to_freq(perf, cpudata->nominal_freq, des_perf); if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { min_perf = des_perf; @@ -565,7 +569,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, /* limit the max perf when core performance boost feature is disabled */ if (!cpudata->boost_supported) - max_perf = min_t(u8, nominal_perf, max_perf); + max_perf = min_t(u8, perf.nominal_perf, max_perf); if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, @@ -602,39 +606,41 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) return 0; } -static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) +static void amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) { - u8 max_limit_perf, min_limit_perf; struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); - max_limit_perf = freq_to_perf(cpudata, policy->max); - min_limit_perf = freq_to_perf(cpudata, policy->min); + perf.max_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->max); + perf.min_limit_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->min); if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) - min_limit_perf = min(cpudata->nominal_perf, max_limit_perf); + perf.min_limit_perf = min(perf.nominal_perf, perf.max_limit_perf); - WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); - WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); 
WRITE_ONCE(cpudata->max_limit_freq, policy->max); WRITE_ONCE(cpudata->min_limit_freq, policy->min); - - return 0; + WRITE_ONCE(cpudata->perf, perf); } static int amd_pstate_update_freq(struct cpufreq_policy *policy, unsigned int target_freq, bool fast_switch) { struct cpufreq_freqs freqs; - struct amd_cpudata *cpudata = policy->driver_data; + struct amd_cpudata *cpudata; + union perf_cached perf; u8 des_perf; + cpudata = policy->driver_data; + if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); + perf = READ_ONCE(cpudata->perf); + freqs.old = policy->cur; freqs.new = target_freq; - des_perf = freq_to_perf(cpudata, target_freq); + des_perf = freq_to_perf(perf, cpudata->nominal_freq, target_freq); WARN_ON(fast_switch && !policy->fast_switch_enabled); /* @@ -645,8 +651,8 @@ static int amd_pstate_update_freq(struct cpufreq_policy *policy, if (!fast_switch) cpufreq_freq_transition_begin(policy, &freqs); - amd_pstate_update(cpudata, cpudata->min_limit_perf, des_perf, - cpudata->max_limit_perf, fast_switch, + amd_pstate_update(cpudata, perf.min_limit_perf, des_perf, + perf.max_limit_perf, fast_switch, policy->governor->flags); if (!fast_switch) @@ -675,9 +681,10 @@ static void amd_pstate_adjust_perf(unsigned int cpu, unsigned long target_perf, unsigned long capacity) { - u8 max_perf, min_perf, des_perf, cap_perf, min_limit_perf; + u8 max_perf, min_perf, des_perf, cap_perf; struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu); struct amd_cpudata *cpudata; + union perf_cached perf; if (!policy) return; @@ -687,8 +694,8 @@ static void amd_pstate_adjust_perf(unsigned int cpu, if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); - cap_perf = READ_ONCE(cpudata->highest_perf); - min_limit_perf = READ_ONCE(cpudata->min_limit_perf); + perf = READ_ONCE(cpudata->perf); + cap_perf = perf.highest_perf; des_perf 
= cap_perf; if (target_perf < capacity) @@ -699,10 +706,10 @@ static void amd_pstate_adjust_perf(unsigned int cpu, else min_perf = cap_perf; - if (min_perf < min_limit_perf) - min_perf = min_limit_perf; + if (min_perf < perf.min_limit_perf) + min_perf = perf.min_limit_perf; - max_perf = cpudata->max_limit_perf; + max_perf = perf.max_limit_perf; if (max_perf < min_perf) max_perf = min_perf; @@ -713,11 +720,12 @@ static void amd_pstate_adjust_perf(unsigned int cpu, static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) { struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf = READ_ONCE(cpudata->perf); u32 nominal_freq, max_freq; int ret = 0; nominal_freq = READ_ONCE(cpudata->nominal_freq); - max_freq = perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf)); + max_freq = perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf); if (on) policy->cpuinfo.max_freq = max_freq; @@ -882,44 +890,44 @@ static u32 amd_pstate_get_transition_latency(unsigned int cpu) } /* - * amd_pstate_init_freq: Initialize the max_freq, min_freq, - * nominal_freq and lowest_nonlinear_freq for - * the @cpudata object. + * amd_pstate_init_freq: Initialize the nominal_freq and lowest_nonlinear_freq + * for the @cpudata object. * - * Requires: highest_perf, lowest_perf, nominal_perf and - * lowest_nonlinear_perf members of @cpudata to be - * initialized. + * Requires: all perf members of @cpudata to be initialized. * - * Returns 0 on success, non-zero value on failure. + * Returns 0 on success, non-zero value on failure. 
*/ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) { - int ret; - u32 min_freq, max_freq; - u32 nominal_freq, lowest_nonlinear_freq; + u32 min_freq, max_freq, nominal_freq, lowest_nonlinear_freq; struct cppc_perf_caps cppc_perf; + union perf_cached perf; + int ret; ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); if (ret) return ret; - - if (quirks && quirks->lowest_freq) - min_freq = quirks->lowest_freq; - else - min_freq = cppc_perf.lowest_freq; + perf = READ_ONCE(cpudata->perf); if (quirks && quirks->nominal_freq) nominal_freq = quirks->nominal_freq; else nominal_freq = cppc_perf.nominal_freq; + nominal_freq *= 1000; + + if (quirks && quirks->lowest_freq) { + min_freq = quirks->lowest_freq; + perf.lowest_perf = freq_to_perf(perf, nominal_freq, min_freq); + WRITE_ONCE(cpudata->perf, perf); + } else + min_freq = cppc_perf.lowest_freq; min_freq *= 1000; - nominal_freq *= 1000; WRITE_ONCE(cpudata->nominal_freq, nominal_freq); - max_freq = perf_to_freq(cpudata, cpudata->highest_perf); - lowest_nonlinear_freq = perf_to_freq(cpudata, cpudata->lowest_nonlinear_perf); + max_freq = perf_to_freq(perf, nominal_freq, perf.highest_perf); + lowest_nonlinear_freq = perf_to_freq(perf, nominal_freq, perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); /** @@ -946,6 +954,7 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata; + union perf_cached perf; struct device *dev; int ret; @@ -981,8 +990,14 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu); policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu); - policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf); - policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf); + perf = 
READ_ONCE(cpudata->perf); + + policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf, + cpudata->nominal_freq, + perf.lowest_perf); + policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, + cpudata->nominal_freq, + perf.highest_perf); policy->boost_enabled = READ_ONCE(cpudata->boost_supported); @@ -1063,23 +1078,27 @@ static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy) static ssize_t show_amd_pstate_max_freq(struct cpufreq_policy *policy, char *buf) { - struct amd_cpudata *cpudata = policy->driver_data; + struct amd_cpudata *cpudata; + union perf_cached perf; + cpudata = policy->driver_data; + perf = READ_ONCE(cpudata->perf); - return sysfs_emit(buf, "%u\n", perf_to_freq(cpudata, READ_ONCE(cpudata->highest_perf))); + return sysfs_emit(buf, "%u\n", + perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf)); } static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *policy, char *buf) { - int freq; - struct amd_cpudata *cpudata = policy->driver_data; + struct amd_cpudata *cpudata; + union perf_cached perf; - freq = READ_ONCE(cpudata->lowest_nonlinear_freq); - if (freq < 0) - return freq; + cpudata = policy->driver_data; + perf = READ_ONCE(cpudata->perf); - return sysfs_emit(buf, "%u\n", freq); + return sysfs_emit(buf, "%u\n", + perf_to_freq(perf, cpudata->nominal_freq, perf.lowest_nonlinear_perf)); } /* @@ -1089,12 +1108,11 @@ static ssize_t show_amd_pstate_lowest_nonlinear_freq(struct cpufreq_policy *poli static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, char *buf) { - u8 perf; - struct amd_cpudata *cpudata = policy->driver_data; + struct amd_cpudata *cpudata; - perf = READ_ONCE(cpudata->highest_perf); + cpudata = policy->driver_data; - return sysfs_emit(buf, "%u\n", perf); + return sysfs_emit(buf, "%u\n", cpudata->perf.highest_perf); } static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, @@ -1425,6 +1443,7 @@ static bool amd_pstate_acpi_pm_profile_undefined(void) 
static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata; + union perf_cached perf; struct device *dev; u64 value; int ret; @@ -1458,8 +1477,15 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; - policy->cpuinfo.min_freq = policy->min = perf_to_freq(cpudata, cpudata->lowest_perf); - policy->cpuinfo.max_freq = policy->max = perf_to_freq(cpudata, cpudata->highest_perf); + perf = READ_ONCE(cpudata->perf); + + policy->cpuinfo.min_freq = policy->min = perf_to_freq(perf, + cpudata->nominal_freq, + perf.lowest_perf); + policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, + cpudata->nominal_freq, + perf.highest_perf); + /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; @@ -1520,6 +1546,7 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; + union perf_cached perf; u8 epp; if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) @@ -1530,15 +1557,16 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) else epp = READ_ONCE(cpudata->epp_cached); + perf = READ_ONCE(cpudata->perf); if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp, - cpudata->min_limit_perf, - cpudata->max_limit_perf, + trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp, + perf.min_limit_perf, + perf.max_limit_perf, policy->boost_enabled); } - return amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, - cpudata->max_limit_perf, epp, false); + return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U, + perf.max_limit_perf, epp, false); } static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) @@ -1570,20 +1598,18 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) static int 
amd_pstate_epp_reenable(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - u8 max_perf; + union perf_cached perf = READ_ONCE(cpudata->perf); int ret; ret = amd_pstate_cppc_enable(true); if (ret) pr_err("failed to enable amd pstate during resume, return %d\n", ret); - max_perf = READ_ONCE(cpudata->highest_perf); - if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, + trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, cpudata->epp_cached, FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), - max_perf, policy->boost_enabled); + perf.highest_perf, policy->boost_enabled); } return amd_pstate_epp_update_limit(policy); @@ -1607,22 +1633,21 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - u8 min_perf; + union perf_cached perf = READ_ONCE(cpudata->perf); if (cpudata->suspended) return 0; - min_perf = READ_ONCE(cpudata->lowest_perf); - guard(mutex)(&amd_pstate_limits_lock); if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, + trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, AMD_CPPC_EPP_BALANCE_POWERSAVE, - min_perf, min_perf, policy->boost_enabled); + perf.lowest_perf, perf.lowest_perf, + policy->boost_enabled); } - return amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, + return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf, AMD_CPPC_EPP_BALANCE_POWERSAVE, false); } diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 014993369245..83532a0079a8 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -13,6 +13,36 @@ /********************************************************************* * AMD P-state INTERFACE * *********************************************************************/ 
+ +/** + * union perf_cached - A union to cache performance-related data. + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions + * For platforms that support the preferred core feature, the highest_perf value may be + * configured to any value in the range 166-255 by the firmware (because the preferred + * core ranking is encoded in the highest_perf value). To maintain consistency across + * all platforms, we split the highest_perf and preferred core ranking values into + * cpudata->perf.highest_perf and cpudata->prefcore_ranking. + * @nominal_perf: the maximum sustained performance level of the processor, + * assuming ideal operating conditions + * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power + * savings are achieved + * @lowest_perf: the absolute lowest performance level of the processor + * @min_limit_perf: Cached value of the performance corresponding to policy->min + * @max_limit_perf: Cached value of the performance corresponding to policy->max + */ +union perf_cached { + struct { + u8 highest_perf; + u8 nominal_perf; + u8 lowest_nonlinear_perf; + u8 lowest_perf; + u8 min_limit_perf; + u8 max_limit_perf; + }; + u64 val; +}; + /** * struct amd_aperf_mperf * @aperf: actual performance frequency clock count @@ -30,20 +60,9 @@ struct amd_aperf_mperf { * @cpu: CPU number * @req: constraint request to apply * @cppc_req_cached: cached performance request hints - * @highest_perf: the maximum performance an individual processor may reach, - * assuming ideal conditions - * For platforms that do not support the preferred core feature, the - * highest_pef may be configured with 166 or 255, to avoid max frequency - * calculated wrongly. we take the fixed value as the highest_perf. 
- * @nominal_perf: the maximum sustained performance level of the processor, - * assuming ideal operating conditions - * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power - * savings are achieved - * @lowest_perf: the absolute lowest performance level of the processor + * @perf: cached performance-related data * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher * priority. - * @min_limit_perf: Cached value of the performance corresponding to policy->min - * @max_limit_perf: Cached value of the performance corresponding to policy->max * @min_limit_freq: Cached value of policy->min (in khz) * @max_limit_freq: Cached value of policy->max (in khz) * @nominal_freq: the frequency (in khz) that mapped to nominal_perf @@ -68,13 +87,9 @@ struct amd_cpudata { struct freq_qos_request req[2]; u64 cppc_req_cached; - u8 highest_perf; - u8 nominal_perf; - u8 lowest_nonlinear_perf; - u8 lowest_perf; + union perf_cached perf; + u8 prefcore_ranking; - u8 min_limit_perf; - u8 max_limit_perf; u32 min_limit_freq; u32 max_limit_freq; u32 nominal_freq; From 6f0b13f16f7a214996a8a125080a6a99bda5d1f7 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 6 Dec 2024 23:20:04 -0600 Subject: [PATCH 104/139] cpufreq/amd-pstate: Overhaul locking amd_pstate_cpu_boost_update() and refresh_frequency_limits() both update the policy state and have nothing to do with the amd-pstate driver itself. A global "limits" lock doesn't make sense because each CPU can have policies changed independently. Each time a CPU changes values they will atomically be written to the per-CPU perf member. Drop per CPU locking cases. The remaining "global" driver lock is used to ensure that only one entity can change driver modes at a given time. Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 983c8728701e..65714d510ce3 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -196,7 +196,6 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) return -EINVAL; } -static DEFINE_MUTEX(amd_pstate_limits_lock); static DEFINE_MUTEX(amd_pstate_driver_lock); static u8 msr_get_epp(struct amd_cpudata *cpudata) @@ -752,7 +751,6 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) pr_err("Boost mode is not supported by this processor or SBIOS\n"); return -EOPNOTSUPP; } - guard(mutex)(&amd_pstate_driver_lock); ret = amd_pstate_cpu_boost_update(policy, state); refresh_frequency_limits(policy); @@ -1170,8 +1168,6 @@ static ssize_t store_energy_performance_preference( if (ret < 0) return -EINVAL; - guard(mutex)(&amd_pstate_limits_lock); - ret = amd_pstate_set_energy_pref_index(policy, ret); return ret ? ret : count; @@ -1344,8 +1340,10 @@ int amd_pstate_update_status(const char *buf, size_t size) if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) return -EINVAL; - if (mode_state_machine[cppc_state][mode_idx]) + if (mode_state_machine[cppc_state][mode_idx]) { + guard(mutex)(&amd_pstate_driver_lock); return mode_state_machine[cppc_state][mode_idx](mode_idx); + } return 0; } @@ -1366,7 +1364,6 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, char *p = memchr(buf, '\n', count); int ret; - guard(mutex)(&amd_pstate_driver_lock); ret = amd_pstate_update_status(buf, p ? p - buf : count); return ret < 0 ? 
ret : count; @@ -1638,8 +1635,6 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) if (cpudata->suspended) return 0; - guard(mutex)(&amd_pstate_limits_lock); - if (trace_amd_pstate_epp_perf_enabled()) { trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, AMD_CPPC_EPP_BALANCE_POWERSAVE, @@ -1679,8 +1674,6 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->suspended) { - guard(mutex)(&amd_pstate_limits_lock); - /* enable amd pstate from suspend state*/ amd_pstate_epp_reenable(policy); From f458cf79d73b144d900e0c274b9eb8a2f3641a0e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sat, 7 Dec 2024 00:28:41 -0600 Subject: [PATCH 105/139] cpufreq/amd-pstate: Drop `cppc_cap1_cached` The `cppc_cap1_cached` variable isn't used at all, there is no need to read it at initialization for each CPU. Reviewed-by: Gautham R. Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 5 ----- drivers/cpufreq/amd-pstate.h | 2 -- 2 files changed, 7 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 65714d510ce3..6f1a3056bcbd 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1508,11 +1508,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) if (ret) return ret; WRITE_ONCE(cpudata->cppc_req_cached, value); - - ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, &value); - if (ret) - return ret; - WRITE_ONCE(cpudata->cppc_cap1_cached, value); } ret = amd_pstate_set_epp(cpudata, cpudata->epp_default); if (ret) diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 83532a0079a8..1557e1afea79 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -76,7 +76,6 @@ struct amd_aperf_mperf { * AMD P-State driver supports preferred core featue. 
* @epp_cached: Cached CPPC energy-performance preference value * @policy: Cpufreq policy value - * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value * * The amd_cpudata is key private data for each CPU thread in AMD P-State, and * represents all the attributes and goals that AMD P-State requests at runtime. @@ -105,7 +104,6 @@ struct amd_cpudata { /* EPP feature related attributes*/ u8 epp_cached; u32 policy; - u64 cppc_cap1_cached; bool suspended; u8 epp_default; }; From 93984d3cea8ab5f2631ec5ba491ce4e47218ab7b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sat, 7 Dec 2024 23:42:48 -0600 Subject: [PATCH 106/139] cpufreq/amd-pstate-ut: Use _free macro to free put policy Using a scoped cleanup macro simplifies cleanup code. Reviewed-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 5f6a92a816e6..e02672e67380 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -127,11 +128,12 @@ static void amd_pstate_ut_check_perf(u32 index) u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0; u64 cap1 = 0; struct cppc_perf_caps cppc_perf; - struct cpufreq_policy *policy = NULL; struct amd_cpudata *cpudata = NULL; union perf_cached cur_perf; for_each_possible_cpu(cpu) { + struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + policy = cpufreq_cpu_get(cpu); if (!policy) break; @@ -142,7 +144,7 @@ static void amd_pstate_ut_check_perf(u32 index) if (ret) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cppc_get_perf_caps ret=%d error!\n", __func__, ret); - goto skip_test; + return; } highest_perf = cppc_perf.highest_perf; @@ -154,7 +156,7 @@ static void 
amd_pstate_ut_check_perf(u32 index) if (ret) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s read CPPC_CAP1 ret=%d error!\n", __func__, ret); - goto skip_test; + return; } highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); @@ -167,7 +169,7 @@ static void amd_pstate_ut_check_perf(u32 index) if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) { pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n", __func__, cpu, highest_perf, cur_perf.highest_perf); - goto skip_test; + return; } if (nominal_perf != cur_perf.nominal_perf || (lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) || @@ -177,7 +179,7 @@ static void amd_pstate_ut_check_perf(u32 index) __func__, cpu, nominal_perf, cur_perf.nominal_perf, lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf, lowest_perf, cur_perf.lowest_perf); - goto skip_test; + return; } if (!((highest_perf >= nominal_perf) && @@ -188,15 +190,11 @@ static void amd_pstate_ut_check_perf(u32 index) pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n", __func__, cpu, highest_perf, nominal_perf, lowest_nonlinear_perf, lowest_perf); - goto skip_test; + return; } - cpufreq_cpu_put(policy); } amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; - return; -skip_test: - cpufreq_cpu_put(policy); } /* @@ -207,10 +205,11 @@ static void amd_pstate_ut_check_perf(u32 index) static void amd_pstate_ut_check_freq(u32 index) { int cpu = 0; - struct cpufreq_policy *policy = NULL; struct amd_cpudata *cpudata = NULL; for_each_possible_cpu(cpu) { + struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + policy = cpufreq_cpu_get(cpu); if (!policy) break; @@ -224,14 +223,14 @@ static void amd_pstate_ut_check_freq(u32 index) pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", __func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq, cpudata->lowest_nonlinear_freq, 
policy->cpuinfo.min_freq); - goto skip_test; + return; } if (cpudata->lowest_nonlinear_freq != policy->min) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n", __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min); - goto skip_test; + return; } if (cpudata->boost_supported) { @@ -243,20 +242,16 @@ static void amd_pstate_ut_check_freq(u32 index) pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", __func__, cpu, policy->max, policy->cpuinfo.max_freq, cpudata->nominal_freq); - goto skip_test; + return; } } else { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d must support boost!\n", __func__, cpu); - goto skip_test; + return; } - cpufreq_cpu_put(policy); } amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; - return; -skip_test: - cpufreq_cpu_put(policy); } static int amd_pstate_set_mode(enum amd_pstate_mode mode) From 66030cc1c533e26bb4dbe8a356090f24dd734801 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 14 Feb 2025 12:59:40 -0600 Subject: [PATCH 107/139] cpufreq/amd-pstate-ut: Allow lowest nonlinear and lowest to be the same Several Ryzen AI processors support the exact same value for lowest nonlinear perf and lowest perf. Loosen up the unit tests to allow this scenario. Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index e02672e67380..b3693a4c28d2 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -184,7 +184,7 @@ static void amd_pstate_ut_check_perf(u32 index) if (!((highest_perf >= nominal_perf) && (nominal_perf > lowest_nonlinear_perf) && - (lowest_nonlinear_perf > lowest_perf) && + (lowest_nonlinear_perf >= lowest_perf) && (lowest_perf > 0))) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n", @@ -217,7 +217,7 @@ static void amd_pstate_ut_check_freq(u32 index) if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) && (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && - (cpudata->lowest_nonlinear_freq > policy->cpuinfo.min_freq) && + (cpudata->lowest_nonlinear_freq >= policy->cpuinfo.min_freq) && (policy->cpuinfo.min_freq > 0))) { amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", From a7875346c6891063b83cd59dc3bd23113e4debf5 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 14 Feb 2025 16:31:25 -0600 Subject: [PATCH 108/139] cpufreq/amd-pstate-ut: Drop SUCCESS and FAIL enums Enums are effectively used as a boolean and don't show the return value of the failing call. Instead of using enums switch to returning the actual return code from the unit test. Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 149 +++++++++++++------------------- 1 file changed, 58 insertions(+), 91 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index b3693a4c28d2..cd9a472e8dc3 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -32,30 +32,20 @@ #include "amd-pstate.h" -/* - * Abbreviations: - * amd_pstate_ut: used as a shortform for AMD P-State unit test. - * It helps to keep variable names smaller, simpler - */ -enum amd_pstate_ut_result { - AMD_PSTATE_UT_RESULT_PASS, - AMD_PSTATE_UT_RESULT_FAIL, -}; struct amd_pstate_ut_struct { const char *name; - void (*func)(u32 index); - enum amd_pstate_ut_result result; + int (*func)(u32 index); }; /* * Kernel module for testing the AMD P-State unit test */ -static void amd_pstate_ut_acpi_cpc_valid(u32 index); -static void amd_pstate_ut_check_enabled(u32 index); -static void amd_pstate_ut_check_perf(u32 index); -static void amd_pstate_ut_check_freq(u32 index); -static void amd_pstate_ut_check_driver(u32 index); +static int amd_pstate_ut_acpi_cpc_valid(u32 index); +static int amd_pstate_ut_check_enabled(u32 index); +static int amd_pstate_ut_check_perf(u32 index); +static int amd_pstate_ut_check_freq(u32 index); +static int amd_pstate_ut_check_driver(u32 index); static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { {"amd_pstate_ut_acpi_cpc_valid", amd_pstate_ut_acpi_cpc_valid }, @@ -78,51 +68,46 @@ static bool get_shared_mem(void) /* * check the _CPC object is present in SBIOS. 
*/ -static void amd_pstate_ut_acpi_cpc_valid(u32 index) +static int amd_pstate_ut_acpi_cpc_valid(u32 index) { - if (acpi_cpc_valid()) - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; - else { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + if (!acpi_cpc_valid()) { pr_err("%s the _CPC object is not present in SBIOS!\n", __func__); + return -EINVAL; } -} -static void amd_pstate_ut_pstate_enable(u32 index) -{ - int ret = 0; - u64 cppc_enable = 0; - - ret = rdmsrl_safe(MSR_AMD_CPPC_ENABLE, &cppc_enable); - if (ret) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; - pr_err("%s rdmsrl_safe MSR_AMD_CPPC_ENABLE ret=%d error!\n", __func__, ret); - return; - } - if (cppc_enable) - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; - else { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; - pr_err("%s amd pstate must be enabled!\n", __func__); - } + return 0; } /* * check if amd pstate is enabled */ -static void amd_pstate_ut_check_enabled(u32 index) +static int amd_pstate_ut_check_enabled(u32 index) { + u64 cppc_enable = 0; + int ret; + if (get_shared_mem()) - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; - else - amd_pstate_ut_pstate_enable(index); + return 0; + + ret = rdmsrl_safe(MSR_AMD_CPPC_ENABLE, &cppc_enable); + if (ret) { + pr_err("%s rdmsrl_safe MSR_AMD_CPPC_ENABLE ret=%d error!\n", __func__, ret); + return ret; + } + + if (!cppc_enable) { + pr_err("%s amd pstate must be enabled!\n", __func__); + return -EINVAL; + } + + return 0; } /* * check if performance values are reasonable. 
* highest_perf >= nominal_perf > lowest_nonlinear_perf > lowest_perf > 0 */ -static void amd_pstate_ut_check_perf(u32 index) +static int amd_pstate_ut_check_perf(u32 index) { int cpu = 0, ret = 0; u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0; @@ -142,9 +127,8 @@ static void amd_pstate_ut_check_perf(u32 index) if (get_shared_mem()) { ret = cppc_get_perf_caps(cpu, &cppc_perf); if (ret) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cppc_get_perf_caps ret=%d error!\n", __func__, ret); - return; + return ret; } highest_perf = cppc_perf.highest_perf; @@ -154,9 +138,8 @@ static void amd_pstate_ut_check_perf(u32 index) } else { ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); if (ret) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s read CPPC_CAP1 ret=%d error!\n", __func__, ret); - return; + return ret; } highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); @@ -169,32 +152,30 @@ static void amd_pstate_ut_check_perf(u32 index) if (highest_perf != cur_perf.highest_perf && !cpudata->hw_prefcore) { pr_err("%s cpu%d highest=%d %d highest perf doesn't match\n", __func__, cpu, highest_perf, cur_perf.highest_perf); - return; + return -EINVAL; } if (nominal_perf != cur_perf.nominal_perf || (lowest_nonlinear_perf != cur_perf.lowest_nonlinear_perf) || (lowest_perf != cur_perf.lowest_perf)) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d nominal=%d %d lowest_nonlinear=%d %d lowest=%d %d, they should be equal!\n", __func__, cpu, nominal_perf, cur_perf.nominal_perf, lowest_nonlinear_perf, cur_perf.lowest_nonlinear_perf, lowest_perf, cur_perf.lowest_perf); - return; + return -EINVAL; } if (!((highest_perf >= nominal_perf) && (nominal_perf > lowest_nonlinear_perf) && (lowest_nonlinear_perf >= lowest_perf) && (lowest_perf > 0))) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d 
> lowest=%d > 0, the formula is incorrect!\n", __func__, cpu, highest_perf, nominal_perf, lowest_nonlinear_perf, lowest_perf); - return; + return -EINVAL; } } - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; + return 0; } /* @@ -202,7 +183,7 @@ static void amd_pstate_ut_check_perf(u32 index) * max_freq >= nominal_freq > lowest_nonlinear_freq > min_freq > 0 * check max freq when set support boost mode. */ -static void amd_pstate_ut_check_freq(u32 index) +static int amd_pstate_ut_check_freq(u32 index) { int cpu = 0; struct amd_cpudata *cpudata = NULL; @@ -219,39 +200,33 @@ static void amd_pstate_ut_check_freq(u32 index) (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && (cpudata->lowest_nonlinear_freq >= policy->cpuinfo.min_freq) && (policy->cpuinfo.min_freq > 0))) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", __func__, cpu, policy->cpuinfo.max_freq, cpudata->nominal_freq, cpudata->lowest_nonlinear_freq, policy->cpuinfo.min_freq); - return; + return -EINVAL; } if (cpudata->lowest_nonlinear_freq != policy->min) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n", __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min); - return; + return -EINVAL; } if (cpudata->boost_supported) { - if ((policy->max == policy->cpuinfo.max_freq) || - (policy->max == cpudata->nominal_freq)) - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; - else { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + if ((policy->max != policy->cpuinfo.max_freq) && + (policy->max != cpudata->nominal_freq)) { pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", __func__, cpu, policy->max, policy->cpuinfo.max_freq, cpudata->nominal_freq); - return; + return -EINVAL; } } else { - 
amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; pr_err("%s cpu%d must support boost!\n", __func__, cpu); - return; + return -EINVAL; } } - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; + return 0; } static int amd_pstate_set_mode(enum amd_pstate_mode mode) @@ -263,32 +238,28 @@ static int amd_pstate_set_mode(enum amd_pstate_mode mode) return amd_pstate_update_status(mode_str, strlen(mode_str)); } -static void amd_pstate_ut_check_driver(u32 index) +static int amd_pstate_ut_check_driver(u32 index) { enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE; - int ret; for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) { - ret = amd_pstate_set_mode(mode1); + int ret = amd_pstate_set_mode(mode1); if (ret) - goto out; + return ret; for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) { if (mode1 == mode2) continue; ret = amd_pstate_set_mode(mode2); - if (ret) - goto out; + if (ret) { + pr_err("%s: failed to update status for %s->%s\n", __func__, + amd_pstate_get_mode_string(mode1), + amd_pstate_get_mode_string(mode2)); + return ret; + } } } -out: - if (ret) - pr_warn("%s: failed to update status for %s->%s: %d\n", __func__, - amd_pstate_get_mode_string(mode1), - amd_pstate_get_mode_string(mode2), ret); - amd_pstate_ut_cases[index].result = ret ? 
- AMD_PSTATE_UT_RESULT_FAIL : - AMD_PSTATE_UT_RESULT_PASS; + return 0; } static int __init amd_pstate_ut_init(void) @@ -296,16 +267,12 @@ static int __init amd_pstate_ut_init(void) u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases); for (i = 0; i < arr_size; i++) { - amd_pstate_ut_cases[i].func(i); - switch (amd_pstate_ut_cases[i].result) { - case AMD_PSTATE_UT_RESULT_PASS: + int ret = amd_pstate_ut_cases[i].func(i); + + if (ret) + pr_err("%-4d %-20s\t fail: %d!\n", i+1, amd_pstate_ut_cases[i].name, ret); + else pr_info("%-4d %-20s\t success!\n", i+1, amd_pstate_ut_cases[i].name); - break; - case AMD_PSTATE_UT_RESULT_FAIL: - default: - pr_info("%-4d %-20s\t fail!\n", i+1, amd_pstate_ut_cases[i].name); - break; - } } return 0; From 2aac38ac06cb751bd3e672a4fb1eb3073f1866ab Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 14 Feb 2025 16:33:12 -0600 Subject: [PATCH 109/139] cpufreq/amd-pstate-ut: Run on all of the correct CPUs If a CPU is missing a policy or one has been offlined then the unit test is skipped for the rest of the CPUs on the system. Instead, iterate online CPUs and skip any missing policies to allow continuing to test the rest of them. Reviewed-by: Gautham R.
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index cd9a472e8dc3..2ab3017d7a0b 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -116,12 +116,12 @@ static int amd_pstate_ut_check_perf(u32 index) struct amd_cpudata *cpudata = NULL; union perf_cached cur_perf; - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; policy = cpufreq_cpu_get(cpu); if (!policy) - break; + continue; cpudata = policy->driver_data; if (get_shared_mem()) { @@ -188,12 +188,12 @@ static int amd_pstate_ut_check_freq(u32 index) int cpu = 0; struct amd_cpudata *cpudata = NULL; - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; policy = cpufreq_cpu_get(cpu); if (!policy) - break; + continue; cpudata = policy->driver_data; if (!((policy->cpuinfo.max_freq >= cpudata->nominal_freq) && From c630458c7a4b990742905e312af20a7c3260ca14 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 14 Feb 2025 16:34:23 -0600 Subject: [PATCH 110/139] cpufreq/amd-pstate-ut: Adjust variable scope In amd_pstate_ut_check_freq() and amd_pstate_ut_check_perf() the cpudata variable is only needed in the scope of the for loop. Move it there. Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-ut.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index 2ab3017d7a0b..edc1475989e3 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -113,11 +113,11 @@ static int amd_pstate_ut_check_perf(u32 index) u32 highest_perf = 0, nominal_perf = 0, lowest_nonlinear_perf = 0, lowest_perf = 0; u64 cap1 = 0; struct cppc_perf_caps cppc_perf; - struct amd_cpudata *cpudata = NULL; union perf_cached cur_perf; for_each_online_cpu(cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + struct amd_cpudata *cpudata; policy = cpufreq_cpu_get(cpu); if (!policy) @@ -186,10 +186,10 @@ static int amd_pstate_ut_check_perf(u32 index) static int amd_pstate_ut_check_freq(u32 index) { int cpu = 0; - struct amd_cpudata *cpudata = NULL; for_each_online_cpu(cpu) { struct cpufreq_policy *policy __free(put_cpufreq_policy) = NULL; + struct amd_cpudata *cpudata; policy = cpufreq_cpu_get(cpu); if (!policy) From b4cc466b973590114b4bf49025d362ad9618a23e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sat, 7 Dec 2024 22:38:06 -0600 Subject: [PATCH 111/139] cpufreq/amd-pstate: Replace all AMD_CPPC_* macros with masks Bitfield masks are easier to follow and less error prone. Reviewed-by: Dhananjay Ugwekar Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello --- arch/x86/include/asm/msr-index.h | 18 ++++++++++-------- arch/x86/kernel/acpi/cppc.c | 4 +++- drivers/cpufreq/amd-pstate-ut.c | 9 +++++---- drivers/cpufreq/amd-pstate.c | 16 ++++++---------- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 72765b2fe0d8..fc2634cc48fd 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -701,15 +701,17 @@ #define MSR_AMD_CPPC_REQ 0xc00102b3 #define MSR_AMD_CPPC_STATUS 0xc00102b4 -#define AMD_CPPC_LOWEST_PERF(x) (((x) >> 0) & 0xff) -#define AMD_CPPC_LOWNONLIN_PERF(x) (((x) >> 8) & 0xff) -#define AMD_CPPC_NOMINAL_PERF(x) (((x) >> 16) & 0xff) -#define AMD_CPPC_HIGHEST_PERF(x) (((x) >> 24) & 0xff) +/* Masks for use with MSR_AMD_CPPC_CAP1 */ +#define AMD_CPPC_LOWEST_PERF_MASK GENMASK(7, 0) +#define AMD_CPPC_LOWNONLIN_PERF_MASK GENMASK(15, 8) +#define AMD_CPPC_NOMINAL_PERF_MASK GENMASK(23, 16) +#define AMD_CPPC_HIGHEST_PERF_MASK GENMASK(31, 24) -#define AMD_CPPC_MAX_PERF(x) (((x) & 0xff) << 0) -#define AMD_CPPC_MIN_PERF(x) (((x) & 0xff) << 8) -#define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) -#define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) +/* Masks for use with MSR_AMD_CPPC_REQ */ +#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) +#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8) +#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) +#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) /* AMD Performance Counter Global Status and Control MSRs */ #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index d745dd586303..77bfb846490c 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -4,6 +4,8 @@ * Copyright (c) 2016, Intel Corporation. 
*/ +#include + #include #include #include @@ -149,7 +151,7 @@ int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) if (ret) goto out; - val = AMD_CPPC_HIGHEST_PERF(val); + val = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, val); } else { ret = cppc_get_highest_perf(cpu, &val); if (ret) diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c index edc1475989e3..e671bc7d1550 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c @@ -22,6 +22,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -142,10 +143,10 @@ static int amd_pstate_ut_check_perf(u32 index) return ret; } - highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); - nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1); - lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1); - lowest_perf = AMD_CPPC_LOWEST_PERF(cap1); + highest_perf = FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1); + nominal_perf = FIELD_GET(AMD_CPPC_NOMINAL_PERF_MASK, cap1); + lowest_nonlinear_perf = FIELD_GET(AMD_CPPC_LOWNONLIN_PERF_MASK, cap1); + lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); } cur_perf = READ_ONCE(cpudata->perf); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 6f1a3056bcbd..5c439b14caae 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -89,11 +89,6 @@ static bool cppc_enabled; static bool amd_pstate_prefcore = true; static struct quirk_entry *quirks; -#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) -#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8) -#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) -#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) - /* * AMD Energy Preference Performance (EPP) * The EPP is used in the CCLK DPM controller to drive @@ -439,12 +434,13 @@ static int msr_init_perf(struct amd_cpudata *cpudata) perf.highest_perf = numerator; perf.max_limit_perf = numerator; - perf.min_limit_perf = AMD_CPPC_LOWEST_PERF(cap1); - perf.nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1); - 
perf.lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1); - perf.lowest_perf = AMD_CPPC_LOWEST_PERF(cap1); + perf.min_limit_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); + perf.nominal_perf = FIELD_GET(AMD_CPPC_NOMINAL_PERF_MASK, cap1); + perf.lowest_nonlinear_perf = FIELD_GET(AMD_CPPC_LOWNONLIN_PERF_MASK, cap1); + perf.lowest_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); WRITE_ONCE(cpudata->perf, perf); - WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); + WRITE_ONCE(cpudata->prefcore_ranking, FIELD_GET(AMD_CPPC_HIGHEST_PERF_MASK, cap1)); + return 0; } From 9f5daa2f2f6dddd2a847d077bfddc7dc0d9dab10 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 11:54:32 -0600 Subject: [PATCH 112/139] cpufreq/amd-pstate: Cache CPPC request in shared mem case too In order to prevent a potential write for shmem_update_perf() cache the request into the cppc_req_cached variable normally only used for the MSR case. This adds symmetry into the code and potentially avoids extra writes. Reviewed-by: Dhananjay Ugwekar Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5c439b14caae..06bf0d888be6 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -496,6 +496,8 @@ static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf, u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { struct cppc_perf_ctrls perf_ctrls; + u64 value, prev; + int ret; if (cppc_state == AMD_PSTATE_ACTIVE) { int ret = shmem_set_epp(cpudata, epp); @@ -504,11 +506,29 @@ static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf, return ret; } + value = prev = READ_ONCE(cpudata->cppc_req_cached); + + value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | + AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK); + value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); + value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf); + value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); + value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + + if (value == prev) + return 0; + perf_ctrls.max_perf = max_perf; perf_ctrls.min_perf = min_perf; perf_ctrls.desired_perf = des_perf; - return cppc_set_perf(cpudata->cpu, &perf_ctrls); + ret = cppc_set_perf(cpudata->cpu, &perf_ctrls); + if (ret) + return ret; + + WRITE_ONCE(cpudata->cppc_req_cached, value); + + return 0; } static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) From 77fbea69b0ffad0e46c7018d07e04b6628482e14 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 11:57:38 -0600 Subject: [PATCH 113/139] cpufreq/amd-pstate: Move all EPP tracing into *_update_perf and *_set_epp functions The EPP tracing is done by the caller today, but this precludes the information about whether the CPPC request has changed. 
Move it into the update_perf and set_epp functions and include information about whether the request has changed from the last one. amd_pstate_update_perf() and amd_pstate_set_epp() now require the policy as an argument instead of the cpudata. Reviewed-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate-trace.h | 13 +++- drivers/cpufreq/amd-pstate.c | 118 +++++++++++++++++------------ 2 files changed, 80 insertions(+), 51 deletions(-) diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h index f457d4af2c62..32e1bdc588c5 100644 --- a/drivers/cpufreq/amd-pstate-trace.h +++ b/drivers/cpufreq/amd-pstate-trace.h @@ -90,7 +90,8 @@ TRACE_EVENT(amd_pstate_epp_perf, u8 epp, u8 min_perf, u8 max_perf, - bool boost + bool boost, + bool changed ), TP_ARGS(cpu_id, @@ -98,7 +99,8 @@ TRACE_EVENT(amd_pstate_epp_perf, epp, min_perf, max_perf, - boost), + boost, + changed), TP_STRUCT__entry( __field(unsigned int, cpu_id) @@ -107,6 +109,7 @@ TRACE_EVENT(amd_pstate_epp_perf, __field(u8, min_perf) __field(u8, max_perf) __field(bool, boost) + __field(bool, changed) ), TP_fast_assign( @@ -116,15 +119,17 @@ TRACE_EVENT(amd_pstate_epp_perf, __entry->min_perf = min_perf; __entry->max_perf = max_perf; __entry->boost = boost; + __entry->changed = changed; ), - TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u", + TP_printk("cpu%u: [%hhu<->%hhu]/%hhu, epp=%hhu, boost=%u, changed=%u", (unsigned int)__entry->cpu_id, (u8)__entry->min_perf, (u8)__entry->max_perf, (u8)__entry->highest_perf, (u8)__entry->epp, - (bool)__entry->boost + (bool)__entry->boost, + (bool)__entry->changed ) ); diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 06bf0d888be6..e5db731618e8 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -228,9 +228,10 @@ static u8 shmem_get_epp(struct amd_cpudata *cpudata) return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, epp); } -static int 
msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, +static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { + struct amd_cpudata *cpudata = policy->driver_data; u64 value, prev; value = prev = READ_ONCE(cpudata->cppc_req_cached); @@ -242,6 +243,18 @@ static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = READ_ONCE(cpudata->perf); + + trace_amd_pstate_epp_perf(cpudata->cpu, + perf.highest_perf, + epp, + min_perf, + max_perf, + policy->boost_enabled, + value != prev); + } + if (value == prev) return 0; @@ -256,24 +269,26 @@ static int msr_update_perf(struct amd_cpudata *cpudata, u8 min_perf, } WRITE_ONCE(cpudata->cppc_req_cached, value); - WRITE_ONCE(cpudata->epp_cached, epp); + if (epp != cpudata->epp_cached) + WRITE_ONCE(cpudata->epp_cached, epp); return 0; } DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); -static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, +static inline int amd_pstate_update_perf(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { - return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, + return static_call(amd_pstate_update_perf)(policy, min_perf, des_perf, max_perf, epp, fast_switch); } -static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) +static int msr_set_epp(struct cpufreq_policy *policy, u8 epp) { + struct amd_cpudata *cpudata = policy->driver_data; u64 value, prev; int ret; @@ -281,6 +296,19 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) value &= ~AMD_CPPC_EPP_PERF_MASK; value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = cpudata->perf; + + trace_amd_pstate_epp_perf(cpudata->cpu, 
perf.highest_perf, + epp, + FIELD_GET(AMD_CPPC_MIN_PERF_MASK, + cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, + cpudata->cppc_req_cached), + policy->boost_enabled, + value != prev); + } + if (value == prev) return 0; @@ -299,15 +327,29 @@ static int msr_set_epp(struct amd_cpudata *cpudata, u8 epp) DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp); -static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u8 epp) +static inline int amd_pstate_set_epp(struct cpufreq_policy *policy, u8 epp) { - return static_call(amd_pstate_set_epp)(cpudata, epp); + return static_call(amd_pstate_set_epp)(policy, epp); } -static int shmem_set_epp(struct amd_cpudata *cpudata, u8 epp) +static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) { - int ret; + struct amd_cpudata *cpudata = policy->driver_data; struct cppc_perf_ctrls perf_ctrls; + int ret; + + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = cpudata->perf; + + trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, + epp, + FIELD_GET(AMD_CPPC_MIN_PERF_MASK, + cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, + cpudata->cppc_req_cached), + policy->boost_enabled, + epp != cpudata->epp_cached); + } if (epp == cpudata->epp_cached) return 0; @@ -339,17 +381,7 @@ static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, return -EBUSY; } - if (trace_amd_pstate_epp_perf_enabled()) { - union perf_cached perf = READ_ONCE(cpudata->perf); - - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, - epp, - FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), - FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), - policy->boost_enabled); - } - - return amd_pstate_set_epp(cpudata, epp); + return amd_pstate_set_epp(policy, epp); } static inline int msr_cppc_enable(bool enable) @@ -492,15 +524,16 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) return static_call(amd_pstate_init_perf)(cpudata); } -static int 
shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf, +static int shmem_update_perf(struct cpufreq_policy *policy, u8 min_perf, u8 des_perf, u8 max_perf, u8 epp, bool fast_switch) { + struct amd_cpudata *cpudata = policy->driver_data; struct cppc_perf_ctrls perf_ctrls; u64 value, prev; int ret; if (cppc_state == AMD_PSTATE_ACTIVE) { - int ret = shmem_set_epp(cpudata, epp); + int ret = shmem_set_epp(policy, epp); if (ret) return ret; @@ -515,6 +548,18 @@ static int shmem_update_perf(struct amd_cpudata *cpudata, u8 min_perf, value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + if (trace_amd_pstate_epp_perf_enabled()) { + union perf_cached perf = READ_ONCE(cpudata->perf); + + trace_amd_pstate_epp_perf(cpudata->cpu, + perf.highest_perf, + epp, + min_perf, + max_perf, + policy->boost_enabled, + value != prev); + } + if (value == prev) return 0; @@ -592,7 +637,7 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, cpudata->cpu, fast_switch); } - amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); + amd_pstate_update_perf(policy, min_perf, des_perf, max_perf, 0, fast_switch); } static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) @@ -1525,7 +1570,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) return ret; WRITE_ONCE(cpudata->cppc_req_cached, value); } - ret = amd_pstate_set_epp(cpudata, cpudata->epp_default); + ret = amd_pstate_set_epp(policy, cpudata->epp_default); if (ret) return ret; @@ -1566,14 +1611,8 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) epp = READ_ONCE(cpudata->epp_cached); perf = READ_ONCE(cpudata->perf); - if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, epp, - perf.min_limit_perf, - perf.max_limit_perf, - policy->boost_enabled); - } - return amd_pstate_update_perf(cpudata, perf.min_limit_perf, 0U, + return 
amd_pstate_update_perf(policy, perf.min_limit_perf, 0U, perf.max_limit_perf, epp, false); } @@ -1605,20 +1644,12 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) { - struct amd_cpudata *cpudata = policy->driver_data; - union perf_cached perf = READ_ONCE(cpudata->perf); int ret; ret = amd_pstate_cppc_enable(true); if (ret) pr_err("failed to enable amd pstate during resume, return %d\n", ret); - if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, - cpudata->epp_cached, - FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), - perf.highest_perf, policy->boost_enabled); - } return amd_pstate_epp_update_limit(policy); } @@ -1646,14 +1677,7 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) if (cpudata->suspended) return 0; - if (trace_amd_pstate_epp_perf_enabled()) { - trace_amd_pstate_epp_perf(cpudata->cpu, perf.highest_perf, - AMD_CPPC_EPP_BALANCE_POWERSAVE, - perf.lowest_perf, perf.lowest_perf, - policy->boost_enabled); - } - - return amd_pstate_update_perf(cpudata, perf.lowest_perf, 0, perf.lowest_perf, + return amd_pstate_update_perf(policy, perf.lowest_perf, 0, perf.lowest_perf, AMD_CPPC_EPP_BALANCE_POWERSAVE, false); } From 1905fac6f9e08e34135e089704a0733ce711eb83 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 12:02:14 -0600 Subject: [PATCH 114/139] cpufreq/amd-pstate: Update cppc_req_cached for shared mem EPP writes On EPP only writes update the cached variable so that the min/max performance controls don't need to be updated again. Reviewed-by: Dhananjay Ugwekar Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index e5db731618e8..df42a2d22225 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -336,6 +336,7 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) { struct amd_cpudata *cpudata = policy->driver_data; struct cppc_perf_ctrls perf_ctrls; + u64 value; int ret; if (trace_amd_pstate_epp_perf_enabled()) { @@ -362,6 +363,11 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) } WRITE_ONCE(cpudata->epp_cached, epp); + value = READ_ONCE(cpudata->cppc_req_cached); + value &= ~AMD_CPPC_EPP_PERF_MASK; + value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + WRITE_ONCE(cpudata->cppc_req_cached, value); + return ret; } From 93039a60fb28f72196769869aa4b502f1849a373 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 9 Dec 2024 14:44:15 -0600 Subject: [PATCH 115/139] cpufreq/amd-pstate: Drop debug statements for policy setting There are trace events that exist now for all amd-pstate modes that will output information right before programming to the hardware. This makes the existing debug statements unnecessary remaining overhead. Drop them. Reviewed-by: Dhananjay Ugwekar Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index df42a2d22225..c70ed41a2448 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -667,7 +667,6 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) } cpufreq_verify_within_cpu_limits(policy_data); - pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min); return 0; } @@ -1630,9 +1629,6 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) if (!policy->cpuinfo.max_freq) return -ENODEV; - pr_debug("set_policy: cpuinfo.max %u policy->max %u\n", - policy->cpuinfo.max_freq, policy->max); - cpudata->policy = policy->policy; ret = amd_pstate_epp_update_limit(policy); From 2064543f5ba0d2929e3e9b3a616c3262a57c7925 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Mon, 16 Dec 2024 10:41:23 -0600 Subject: [PATCH 116/139] cpufreq/amd-pstate: Rework CPPC enabling The CPPC enable register is configured as "write once". That is, any future writes don't actually do anything. Because of this, all the cleanup paths that currently exist for CPPC disable are ineffective. Rework CPPC enable to only enable after all the CAP registers have been read to avoid enabling CPPC on CPUs with invalid _CPC or unpopulated MSRs. As the register is write once, remove all cleanup paths as well.
Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 179 +++++++---------------------------- 1 file changed, 35 insertions(+), 144 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index c70ed41a2448..7802f37bde02 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -85,7 +85,6 @@ static struct cpufreq_driver *current_pstate_driver; static struct cpufreq_driver amd_pstate_driver; static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; -static bool cppc_enabled; static bool amd_pstate_prefcore = true; static struct quirk_entry *quirks; @@ -371,89 +370,21 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) return ret; } -static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, - int pref_index) +static inline int msr_cppc_enable(struct cpufreq_policy *policy) { - struct amd_cpudata *cpudata = policy->driver_data; - u8 epp; - - if (!pref_index) - epp = cpudata->epp_default; - else - epp = epp_values[pref_index]; - - if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { - pr_debug("EPP cannot be set under performance policy\n"); - return -EBUSY; - } - - return amd_pstate_set_epp(policy, epp); + return wrmsrl_safe_on_cpu(policy->cpu, MSR_AMD_CPPC_ENABLE, 1); } -static inline int msr_cppc_enable(bool enable) +static int shmem_cppc_enable(struct cpufreq_policy *policy) { - int ret, cpu; - unsigned long logical_proc_id_mask = 0; - - /* - * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared. 
- */ - if (!enable) - return 0; - - if (enable == cppc_enabled) - return 0; - - for_each_present_cpu(cpu) { - unsigned long logical_id = topology_logical_package_id(cpu); - - if (test_bit(logical_id, &logical_proc_id_mask)) - continue; - - set_bit(logical_id, &logical_proc_id_mask); - - ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE, - enable); - if (ret) - return ret; - } - - cppc_enabled = enable; - return 0; -} - -static int shmem_cppc_enable(bool enable) -{ - int cpu, ret = 0; - struct cppc_perf_ctrls perf_ctrls; - - if (enable == cppc_enabled) - return 0; - - for_each_present_cpu(cpu) { - ret = cppc_set_enable(cpu, enable); - if (ret) - return ret; - - /* Enable autonomous mode for EPP */ - if (cppc_state == AMD_PSTATE_ACTIVE) { - /* Set desired perf as zero to allow EPP firmware control */ - perf_ctrls.desired_perf = 0; - ret = cppc_set_perf(cpu, &perf_ctrls); - if (ret) - return ret; - } - } - - cppc_enabled = enable; - return ret; + return cppc_set_enable(policy->cpu, 1); } DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable); -static inline int amd_pstate_cppc_enable(bool enable) +static inline int amd_pstate_cppc_enable(struct cpufreq_policy *policy) { - return static_call(amd_pstate_cppc_enable)(enable); + return static_call(amd_pstate_cppc_enable)(policy); } static int msr_init_perf(struct amd_cpudata *cpudata) @@ -1063,6 +994,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) cpudata->nominal_freq, perf.highest_perf); + ret = amd_pstate_cppc_enable(policy); + if (ret) + goto free_cpudata1; + policy->boost_enabled = READ_ONCE(cpudata->boost_supported); /* It will be updated by governor */ @@ -1110,28 +1045,6 @@ static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) kfree(cpudata); } -static int amd_pstate_cpu_resume(struct cpufreq_policy *policy) -{ - int ret; - - ret = amd_pstate_cppc_enable(true); - if (ret) - pr_err("failed to enable amd-pstate during resume, return %d\n", ret); - - return ret; -} - -static int 
amd_pstate_cpu_suspend(struct cpufreq_policy *policy) -{ - int ret; - - ret = amd_pstate_cppc_enable(false); - if (ret) - pr_err("failed to disable amd-pstate during suspend, return %d\n", ret); - - return ret; -} - /* Sysfs attributes */ /* @@ -1223,8 +1136,10 @@ static ssize_t show_energy_performance_available_preferences( static ssize_t store_energy_performance_preference( struct cpufreq_policy *policy, const char *buf, size_t count) { + struct amd_cpudata *cpudata = policy->driver_data; char str_preference[21]; ssize_t ret; + u8 epp; ret = sscanf(buf, "%20s", str_preference); if (ret != 1) @@ -1234,7 +1149,17 @@ static ssize_t store_energy_performance_preference( if (ret < 0) return -EINVAL; - ret = amd_pstate_set_energy_pref_index(policy, ret); + if (!ret) + epp = cpudata->epp_default; + else + epp = epp_values[ret]; + + if (epp > 0 && policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + pr_debug("EPP cannot be set under performance policy\n"); + return -EBUSY; + } + + ret = amd_pstate_set_epp(policy, epp); return ret ? 
ret : count; } @@ -1267,7 +1192,6 @@ static ssize_t show_energy_performance_preference( static void amd_pstate_driver_cleanup(void) { - amd_pstate_cppc_enable(false); cppc_state = AMD_PSTATE_DISABLE; current_pstate_driver = NULL; } @@ -1301,14 +1225,6 @@ static int amd_pstate_register_driver(int mode) cppc_state = mode; - ret = amd_pstate_cppc_enable(true); - if (ret) { - pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", - ret); - amd_pstate_driver_cleanup(); - return ret; - } - /* at least one CPU supports CPB */ current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); @@ -1548,11 +1464,15 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.max_freq = policy->max = perf_to_freq(perf, cpudata->nominal_freq, perf.highest_perf); + policy->driver_data = cpudata; + + ret = amd_pstate_cppc_enable(policy); + if (ret) + goto free_cpudata1; /* It will be updated by governor */ policy->cur = policy->cpuinfo.min_freq; - policy->driver_data = cpudata; policy->boost_enabled = READ_ONCE(cpudata->boost_supported); @@ -1644,31 +1564,11 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) return 0; } -static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) -{ - int ret; - - ret = amd_pstate_cppc_enable(true); - if (ret) - pr_err("failed to enable amd pstate during resume, return %d\n", ret); - - - return amd_pstate_epp_update_limit(policy); -} - static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) { - struct amd_cpudata *cpudata = policy->driver_data; - int ret; + pr_debug("AMD CPU Core %d going online\n", policy->cpu); - pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); - - ret = amd_pstate_epp_reenable(policy); - if (ret) - return ret; - cpudata->suspended = false; - - return 0; + return amd_pstate_cppc_enable(policy); } static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) @@ -1686,11 +1586,6 @@ static int 
amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) { struct amd_cpudata *cpudata = policy->driver_data; - int ret; - - /* avoid suspending when EPP is not enabled */ - if (cppc_state != AMD_PSTATE_ACTIVE) - return 0; /* invalidate to ensure it's rewritten during resume */ cpudata->cppc_req_cached = 0; @@ -1698,11 +1593,6 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) /* set this flag to avoid setting core offline*/ cpudata->suspended = true; - /* disable CPPC in lowlevel firmware */ - ret = amd_pstate_cppc_enable(false); - if (ret) - pr_err("failed to suspend, return %d\n", ret); - return 0; } @@ -1711,8 +1601,12 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) struct amd_cpudata *cpudata = policy->driver_data; if (cpudata->suspended) { + int ret; + /* enable amd pstate from suspend state*/ - amd_pstate_epp_reenable(policy); + ret = amd_pstate_epp_update_limit(policy); + if (ret) + return ret; cpudata->suspended = false; } @@ -1727,8 +1621,6 @@ static struct cpufreq_driver amd_pstate_driver = { .fast_switch = amd_pstate_fast_switch, .init = amd_pstate_cpu_init, .exit = amd_pstate_cpu_exit, - .suspend = amd_pstate_cpu_suspend, - .resume = amd_pstate_cpu_resume, .set_boost = amd_pstate_set_boost, .update_limits = amd_pstate_update_limits, .name = "amd-pstate", @@ -1895,7 +1787,6 @@ static int __init amd_pstate_init(void) global_attr_free: cpufreq_unregister_driver(current_pstate_driver); - amd_pstate_cppc_enable(false); return ret; } device_initcall(amd_pstate_init); From 4e16c1175238f2b9d86199b427c5db1a24c9a85f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Sun, 19 Jan 2025 07:05:43 -0600 Subject: [PATCH 117/139] cpufreq/amd-pstate: Stop caching EPP EPP values are cached in the cpudata structure per CPU. This is needless though because they are also cached in the CPPC request variable. 
Drop the separate cache for EPP values and always reference the CPPC request variable when needed. Reviewed-by: Dhananjay Ugwekar Reviewed-by: Gautham R. Shenoy Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 19 ++++++++++--------- drivers/cpufreq/amd-pstate.h | 1 - 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 7802f37bde02..8c5f4449adfa 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -268,8 +268,6 @@ static int msr_update_perf(struct cpufreq_policy *policy, u8 min_perf, } WRITE_ONCE(cpudata->cppc_req_cached, value); - if (epp != cpudata->epp_cached) - WRITE_ONCE(cpudata->epp_cached, epp); return 0; } @@ -318,7 +316,6 @@ static int msr_set_epp(struct cpufreq_policy *policy, u8 epp) } /* update both so that msr_update_perf() can effectively check */ - WRITE_ONCE(cpudata->epp_cached, epp); WRITE_ONCE(cpudata->cppc_req_cached, value); return ret; @@ -335,9 +332,12 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) { struct amd_cpudata *cpudata = policy->driver_data; struct cppc_perf_ctrls perf_ctrls; + u8 epp_cached; u64 value; int ret; + + epp_cached = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); if (trace_amd_pstate_epp_perf_enabled()) { union perf_cached perf = cpudata->perf; @@ -348,10 +348,10 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), policy->boost_enabled, - epp != cpudata->epp_cached); + epp != epp_cached); } - if (epp == cpudata->epp_cached) + if (epp == epp_cached) return 0; perf_ctrls.energy_perf = epp; @@ -360,7 +360,6 @@ static int shmem_set_epp(struct cpufreq_policy *policy, u8 epp) pr_debug("failed to set energy perf value (%d)\n", ret); return ret; } - WRITE_ONCE(cpudata->epp_cached, epp); value = READ_ONCE(cpudata->cppc_req_cached); value &= ~AMD_CPPC_EPP_PERF_MASK; @@ -1168,9 +1167,11 @@ static ssize_t 
show_energy_performance_preference( struct cpufreq_policy *policy, char *buf) { struct amd_cpudata *cpudata = policy->driver_data; - u8 preference; + u8 preference, epp; - switch (cpudata->epp_cached) { + epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); + + switch (epp) { case AMD_CPPC_EPP_PERFORMANCE: preference = EPP_INDEX_PERFORMANCE; break; @@ -1533,7 +1534,7 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) epp = 0; else - epp = READ_ONCE(cpudata->epp_cached); + epp = FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached); perf = READ_ONCE(cpudata->perf); diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h index 1557e1afea79..fbe1c08d3f06 100644 --- a/drivers/cpufreq/amd-pstate.h +++ b/drivers/cpufreq/amd-pstate.h @@ -102,7 +102,6 @@ struct amd_cpudata { bool hw_prefcore; /* EPP feature related attributes*/ - u8 epp_cached; u32 policy; bool suspended; u8 epp_default; From efb758c8c803217e58248f03db372c5e23827dae Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 19 Feb 2025 14:08:14 -0600 Subject: [PATCH 118/139] cpufreq/amd-pstate: Drop actions in amd_pstate_epp_cpu_offline() When the CPU goes offline there is no need to change the CPPC request because the CPU will go into the deepest C-state it supports already. Actually changing the CPPC request when it goes offline messes up the cached values and can lead to the wrong values being restored when it comes back. Instead drop the actions and if the CPU comes back online let amd_pstate_epp_set_policy() restore it to expected values. 
Reviewed-by: Dhananjay Ugwekar Signed-off-by: Mario Limonciello --- drivers/cpufreq/amd-pstate.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 8c5f4449adfa..024d33d5e367 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1574,14 +1574,7 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) { - struct amd_cpudata *cpudata = policy->driver_data; - union perf_cached perf = READ_ONCE(cpudata->perf); - - if (cpudata->suspended) - return 0; - - return amd_pstate_update_perf(policy, perf.lowest_perf, 0, perf.lowest_perf, - AMD_CPPC_EPP_BALANCE_POWERSAVE, false); + return 0; } static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) From 860a731f52f83309c213b943bac8f4ea70a88805 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 5 Mar 2025 22:08:21 +0100 Subject: [PATCH 119/139] PM: EM: Constify two parameters of em_dev_register_perf_domain() Notice that em_dev_register_perf_domain() and the functions called by it do not update objects pointed to by its cb and cpus parameters, so the const modifier can be added to them. This allows the return value of cpumask_of() or a pointer to a struct em_data_callback declared as const to be passed to em_dev_register_perf_domain() directly without explicit type casting which is rather handy. No intentional functional impact. Signed-off-by: Rafael J.
Wysocki Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/4648962.LvFx2qVVIh@rjwysocki.net --- include/linux/energy_model.h | 8 ++++---- kernel/power/energy_model.c | 11 ++++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 78318d49276d..b23c8c798dac 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -169,8 +169,8 @@ struct em_perf_domain *em_pd_get(struct device *dev); int em_dev_update_perf_domain(struct device *dev, struct em_perf_table __rcu *new_table); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span, - bool microwatts); + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table __rcu *table); @@ -346,8 +346,8 @@ struct em_data_callback {}; static inline int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { return -EINVAL; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 16f6dcafdb90..1e3caa96c271 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -231,7 +231,7 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd, } static int em_compute_costs(struct device *dev, struct em_perf_state *table, - struct em_data_callback *cb, int nr_states, + const struct em_data_callback *cb, int nr_states, unsigned long flags) { unsigned long prev_cost = ULONG_MAX; @@ -333,7 +333,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain); static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, struct em_perf_state *table, - struct 
em_data_callback *cb, + const struct em_data_callback *cb, unsigned long flags) { unsigned long power, freq, prev_freq = 0; @@ -388,7 +388,8 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, } static int em_create_pd(struct device *dev, int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, + const struct em_data_callback *cb, + const cpumask_t *cpus, unsigned long flags) { struct em_perf_table __rcu *em_table; @@ -548,8 +549,8 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus, - bool microwatts) + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { unsigned long cap, prev_cap = 0; unsigned long flags = 0; From d52d2b311a5a251ddd723d1dac881dc46e8803ab Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 5 Mar 2025 15:53:39 -0700 Subject: [PATCH 120/139] pm: cpupower: Fix cmd_monitor() error legs to free cpu_topology cmd_monitor() calls get_cpu_topology() to allocate memory for cpu topology and fails to release in error legs. Fix it to call cpu_topology_release() from error legs. 
Link: https://lore.kernel.org/r/20250305225342.19447-2-skhan@linuxfoundation.org Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c index e123aa578881..0380d2e70016 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -427,11 +427,13 @@ int cmd_monitor(int argc, char **argv) if (avail_monitors == 0) { printf(_("No HW Cstate monitors found\n")); + cpu_topology_release(cpu_top); return 1; } if (mode == list) { list_monitors(); + cpu_topology_release(cpu_top); exit(EXIT_SUCCESS); } From 0014f65e3df04ef15df3e8212be7b14fd26a2ef4 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 5 Mar 2025 15:53:40 -0700 Subject: [PATCH 121/139] pm: cpupower: remove hard-coded topology depth values Remove hard-coded topology depth values and replace them with defines to improve code readability and maintainability in cpupower-monitor code. 
Link: https://lore.kernel.org/r/20250305225342.19447-3-skhan@linuxfoundation.org Signed-off-by: Shuah Khan --- .../utils/idle_monitor/cpupower-monitor.c | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c index 0380d2e70016..ad493157f826 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -92,7 +92,11 @@ int fill_string_with_spaces(char *s, int n) return 0; } -#define MAX_COL_WIDTH 6 +#define MAX_COL_WIDTH 6 +#define TOPOLOGY_DEPTH_PKG 3 +#define TOPOLOGY_DEPTH_CORE 2 +#define TOPOLOGY_DEPTH_CPU 1 + void print_header(int topology_depth) { int unsigned mon; @@ -114,12 +118,19 @@ void print_header(int topology_depth) } printf("\n"); - if (topology_depth > 2) + switch (topology_depth) { + case TOPOLOGY_DEPTH_PKG: printf(" PKG|"); - if (topology_depth > 1) + break; + case TOPOLOGY_DEPTH_CORE: printf("CORE|"); - if (topology_depth > 0) + break; + case TOPOLOGY_DEPTH_CPU: printf(" CPU|"); + break; + default: + return; + } for (mon = 0; mon < avail_monitors; mon++) { if (mon != 0) @@ -153,12 +164,19 @@ void print_results(int topology_depth, int cpu) cpu_top.core_info[cpu].pkg == -1) return; - if (topology_depth > 2) + switch (topology_depth) { + case TOPOLOGY_DEPTH_PKG: printf("%4d|", cpu_top.core_info[cpu].pkg); - if (topology_depth > 1) + break; + case TOPOLOGY_DEPTH_CORE: printf("%4d|", cpu_top.core_info[cpu].core); - if (topology_depth > 0) + break; + case TOPOLOGY_DEPTH_CPU: printf("%4d|", cpu_top.core_info[cpu].cpu); + break; + default: + return; + } for (mon = 0; mon < avail_monitors; mon++) { if (mon != 0) @@ -454,15 +472,15 @@ int cmd_monitor(int argc, char **argv) /* ToDo: Topology parsing needs fixing first to do this more generically */ if (cpu_top.pkgs > 1) - print_header(3); + print_header(TOPOLOGY_DEPTH_PKG); else - 
print_header(1); + print_header(TOPOLOGY_DEPTH_CPU); for (cpu = 0; cpu < cpu_count; cpu++) { if (cpu_top.pkgs > 1) - print_results(3, cpu); + print_results(TOPOLOGY_DEPTH_PKG, cpu); else - print_results(1, cpu); + print_results(TOPOLOGY_DEPTH_CPU, cpu); } for (num = 0; num < avail_monitors; num++) { From f89cb9cba7a2557db676b507dd181d9751c7d11d Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Wed, 5 Mar 2025 16:08:59 -0500 Subject: [PATCH 122/139] cpupower: Implement CPU physical core querying This patch is also an issue report. get_cpu_topology will always save into cpupower_topology a cores size of 0. The code to handle this looks like it was commented out, and what is commented out is missing a curly bracket. https://elixir.bootlin.com/linux/v6.13.5/source/tools/power/cpupower/lib/cpupower.c#L206-L212 Inspiration was taken from psutil to implement this by querying core_cpu_list. Instead of using a hashmap, I used a sorted array, and counted the number of valid unique strings. The counting of this takes place before the qsort for .pkg as the following code says it is dependent on the order of that sort. The previous code claimed Intel CPUs are not numbered correctly. I was not able to reproduce that issue and removed that comment and the code. This commit was tested with the libcpupower SWIG Python bindings and performed correctly on 4 different setups. The most notable is the Framework Intel laptop; a hybrid system of 4 P cores (8 threads) and 8 E cores (8 threads). The 4 setups: A 4 core virt-manager VM running Fedora 41 4c/4t (specs not listed) was tested as a sanity test for VMs. A Lenovo Ryzen 7 Pro 7840HS 8c/16t. A Supermicro Intel(R) Xeon(R) Gold 6330 CPU w/ 56c/112t with 2 CPU sockets. A Framework 12th Gen Intel(R) Core(TM) i5-1240P with hybrid cores.
CPU(s): 16 On-line CPU(s) list: 0-15 Vendor ID: AuthenticAMD Model name: AMD Ryzen 7 PRO 7840HS w/ Radeon 780M Graphics CPU family: 25 Model: 116 Thread(s) per core: 2 Core(s) per socket: 8 Socket(s): 1 Stepping: 1 CPU(s): 112 On-line CPU(s) list: 0-111 Vendor ID: GenuineIntel BIOS Vendor ID: Intel(R) Corporation Model name: Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz BIOS Model name: Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz CPU @ 2.0GHz BIOS CPU family: 179 CPU family: 6 Model: 106 Thread(s) per core: 2 Core(s) per socket: 28 Socket(s): 2 Stepping: 6 CPU(s): 16 On-line CPU(s) list: 0-15 Vendor ID: GenuineIntel Model name: 12th Gen Intel(R) Core(TM) i5-1240P CPU family: 6 Model: 154 Thread(s) per core: 2 Core(s) per socket: 12 Socket(s): 1 Stepping: 3 Link: https://lore.kernel.org/r/20250305210901.24177-1-jwyatt@redhat.com Signed-off-by: "John B. Wyatt IV" Signed-off-by: "John B. Wyatt IV" Signed-off-by: Shuah Khan --- tools/power/cpupower/lib/cpupower.c | 48 ++++++++++++++++++++++++----- tools/power/cpupower/lib/cpupower.h | 3 ++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/tools/power/cpupower/lib/cpupower.c b/tools/power/cpupower/lib/cpupower.c index 7a2ef691b20e..ce8dfb8e46ab 100644 --- a/tools/power/cpupower/lib/cpupower.c +++ b/tools/power/cpupower/lib/cpupower.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "cpupower.h" #include "cpupower_intern.h" @@ -150,15 +151,25 @@ static int __compare(const void *t1, const void *t2) return 0; } +static int __compare_core_cpu_list(const void *t1, const void *t2) +{ + struct cpuid_core_info *top1 = (struct cpuid_core_info *)t1; + struct cpuid_core_info *top2 = (struct cpuid_core_info *)t2; + + return strcmp(top1->core_cpu_list, top2->core_cpu_list); +} + /* * Returns amount of cpus, negative on error, cpu_top must be * passed to cpu_topology_release to free resources * - * Array is sorted after ->pkg, ->core, then ->cpu + * Array is sorted after ->cpu_smt_list ->pkg, ->core */ int 
get_cpu_topology(struct cpupower_topology *cpu_top) { int cpu, last_pkg, cpus = sysconf(_SC_NPROCESSORS_CONF); + char path[SYSFS_PATH_MAX]; + char *last_cpu_list; cpu_top->core_info = malloc(sizeof(struct cpuid_core_info) * cpus); if (cpu_top->core_info == NULL) @@ -183,6 +194,34 @@ int get_cpu_topology(struct cpupower_topology *cpu_top) cpu_top->core_info[cpu].core = -1; continue; } + if (cpu_top->core_info[cpu].core == -1) { + strncpy(cpu_top->core_info[cpu].core_cpu_list, "-1", CPULIST_BUFFER); + continue; + } + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/topology/%s", + cpu, "core_cpus_list"); + if (cpupower_read_sysfs( + path, + cpu_top->core_info[cpu].core_cpu_list, + CPULIST_BUFFER) < 1) { + printf("Warning CPU%u has a 0 size core_cpus_list string", cpu); + } + } + + /* Count the number of distinct cpu lists to get the physical core + * count. + */ + qsort(cpu_top->core_info, cpus, sizeof(struct cpuid_core_info), + __compare_core_cpu_list); + + last_cpu_list = cpu_top->core_info[0].core_cpu_list; + cpu_top->cores = 1; + for (cpu = 1; cpu < cpus; cpu++) { + if (strcmp(cpu_top->core_info[cpu].core_cpu_list, last_cpu_list) != 0 && + cpu_top->core_info[cpu].pkg != -1) { + last_cpu_list = cpu_top->core_info[cpu].core_cpu_list; + cpu_top->cores++; + } } qsort(cpu_top->core_info, cpus, sizeof(struct cpuid_core_info), @@ -203,13 +242,6 @@ int get_cpu_topology(struct cpupower_topology *cpu_top) if (!(cpu_top->core_info[0].pkg == -1)) cpu_top->pkgs++; - /* Intel's cores count is not consecutively numbered, there may - * be a core_id of 3, but none of 2. 
Assume there always is 0 - * Get amount of cores by counting duplicates in a package - for (cpu = 0; cpu_top->core_info[cpu].pkg = 0 && cpu < cpus; cpu++) { - if (cpu_top->core_info[cpu].core == 0) - cpu_top->cores++; - */ return cpus; } diff --git a/tools/power/cpupower/lib/cpupower.h b/tools/power/cpupower/lib/cpupower.h index e4e4292eacec..2e67a080f203 100644 --- a/tools/power/cpupower/lib/cpupower.h +++ b/tools/power/cpupower/lib/cpupower.h @@ -2,6 +2,8 @@ #ifndef __CPUPOWER_CPUPOWER_H__ #define __CPUPOWER_CPUPOWER_H__ +#define CPULIST_BUFFER 5 + struct cpupower_topology { /* Amount of CPU cores, packages and threads per core in the system */ unsigned int cores; @@ -16,6 +18,7 @@ struct cpuid_core_info { int pkg; int core; int cpu; + char core_cpu_list[CPULIST_BUFFER]; /* flags */ unsigned int is_online:1; From 3ee7be9e10dd5f79448788b899591d4bd2bf0c19 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Mar 2025 17:49:20 +0100 Subject: [PATCH 123/139] PM: EM: Address RCU-related sparse warnings The usage of __rcu in the Energy Model code is quite inconsistent which causes the following sparse warnings to trigger: kernel/power/energy_model.c:169:15: warning: incorrect type in assignment (different address spaces) kernel/power/energy_model.c:169:15: expected struct em_perf_table [noderef] __rcu *table kernel/power/energy_model.c:169:15: got struct em_perf_table * kernel/power/energy_model.c:171:9: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:171:9: expected struct callback_head *head kernel/power/energy_model.c:171:9: got struct callback_head [noderef] __rcu * kernel/power/energy_model.c:171:9: warning: cast removes address space '__rcu' of expression kernel/power/energy_model.c:182:19: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:182:19: expected struct kref *kref kernel/power/energy_model.c:182:19: got struct kref [noderef] __rcu * 
kernel/power/energy_model.c:200:15: warning: incorrect type in assignment (different address spaces) kernel/power/energy_model.c:200:15: expected struct em_perf_table [noderef] __rcu *table kernel/power/energy_model.c:200:15: got void *[assigned] _res kernel/power/energy_model.c:204:20: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:204:20: expected struct kref *kref kernel/power/energy_model.c:204:20: got struct kref [noderef] __rcu * kernel/power/energy_model.c:320:19: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:320:19: expected struct kref *kref kernel/power/energy_model.c:320:19: got struct kref [noderef] __rcu * kernel/power/energy_model.c:325:45: warning: incorrect type in argument 2 (different address spaces) kernel/power/energy_model.c:325:45: expected struct em_perf_state *table kernel/power/energy_model.c:325:45: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:425:45: warning: incorrect type in argument 3 (different address spaces) kernel/power/energy_model.c:425:45: expected struct em_perf_state *table kernel/power/energy_model.c:425:45: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:442:15: warning: incorrect type in argument 1 (different address spaces) kernel/power/energy_model.c:442:15: expected void const *objp kernel/power/energy_model.c:442:15: got struct em_perf_table [noderef] __rcu *[assigned] em_table kernel/power/energy_model.c:626:55: warning: incorrect type in argument 2 (different address spaces) kernel/power/energy_model.c:626:55: expected struct em_perf_state *table kernel/power/energy_model.c:626:55: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:681:16: warning: incorrect type in assignment (different address spaces) kernel/power/energy_model.c:681:16: expected struct em_perf_state *new_ps kernel/power/energy_model.c:681:16: got struct em_perf_state [noderef] __rcu * 
kernel/power/energy_model.c:699:37: warning: incorrect type in argument 2 (different address spaces) kernel/power/energy_model.c:699:37: expected struct em_perf_state *table kernel/power/energy_model.c:699:37: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:733:38: warning: incorrect type in argument 3 (different address spaces) kernel/power/energy_model.c:733:38: expected struct em_perf_state *table kernel/power/energy_model.c:733:38: got struct em_perf_state [noderef] __rcu * kernel/power/energy_model.c:855:53: warning: dereference of noderef expression kernel/power/energy_model.c:864:32: warning: dereference of noderef expression This is because the __rcu annotation for sparse is only applicable to pointers that need rcu_dereference() or equivalent for protection, which basically means pointers assigned with rcu_assign_pointer(). Make all of the above sparse warnings go away by cleaning up the usage of __rcu and using rcu_dereference_protected() where applicable. Cc: All applicable Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/5885405.DvuYhMxLoT@rjwysocki.net --- include/linux/energy_model.h | 12 +++++------ kernel/power/energy_model.c | 39 ++++++++++++++++++------------------ 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b23c8c798dac..ddd09debfc7d 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -167,13 +167,13 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table); + struct em_perf_table *new_table); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd); -void em_table_free(struct em_perf_table __rcu *table); +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd); +void em_table_free(struct em_perf_table *table); int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, int nr_states); int em_dev_update_chip_binning(struct device *dev); @@ -373,14 +373,14 @@ static inline int em_pd_nr_perf_states(struct em_perf_domain *pd) return 0; } static inline -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { return NULL; } -static inline void em_table_free(struct em_perf_table __rcu *table) {} +static inline void em_table_free(struct em_perf_table *table) {} static inline int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { return -EINVAL; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 1e3caa96c271..d9b7e2b38c7a 100644 --- 
a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -163,12 +163,8 @@ static void em_debug_remove_pd(struct device *dev) {} static void em_release_table_kref(struct kref *kref) { - struct em_perf_table __rcu *table; - /* It was the last owner of this table so we can free */ - table = container_of(kref, struct em_perf_table, kref); - - kfree_rcu(table, rcu); + kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu); } /** @@ -177,7 +173,7 @@ static void em_release_table_kref(struct kref *kref) * * No return values. */ -void em_table_free(struct em_perf_table __rcu *table) +void em_table_free(struct em_perf_table *table) { kref_put(&table->kref, em_release_table_kref); } @@ -190,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table) * has a user. * Returns allocated table or NULL. */ -struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd) +struct em_perf_table *em_table_alloc(struct em_perf_domain *pd) { - struct em_perf_table __rcu *table; + struct em_perf_table *table; int table_size; table_size = sizeof(struct em_perf_state) * pd->nr_perf_states; @@ -300,9 +296,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table, * Return 0 on success or an error code on failure. 
*/ int em_dev_update_perf_domain(struct device *dev, - struct em_perf_table __rcu *new_table) + struct em_perf_table *new_table) { - struct em_perf_table __rcu *old_table; + struct em_perf_table *old_table; struct em_perf_domain *pd; if (!dev) @@ -319,7 +315,8 @@ int em_dev_update_perf_domain(struct device *dev, kref_get(&new_table->kref); - old_table = pd->em_table; + old_table = rcu_dereference_protected(pd->em_table, + lockdep_is_held(&em_pd_mutex)); rcu_assign_pointer(pd->em_table, new_table); em_cpufreq_update_efficiencies(dev, new_table->state); @@ -392,7 +389,7 @@ static int em_create_pd(struct device *dev, int nr_states, const cpumask_t *cpus, unsigned long flags) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; struct device *cpu_dev; int cpu, ret, num_cpus; @@ -552,6 +549,7 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts) { + struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; unsigned long flags = 0; int cpu, ret; @@ -624,7 +622,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, dev->em_pd->min_perf_state = 0; dev->em_pd->max_perf_state = nr_states - 1; - em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state); + em_table = rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex)); + em_cpufreq_update_efficiencies(dev, em_table->state); em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); @@ -661,7 +661,8 @@ void em_dev_unregister_perf_domain(struct device *dev) mutex_lock(&em_pd_mutex); em_debug_remove_pd(dev); - em_table_free(dev->em_pd->em_table); + em_table_free(rcu_dereference_protected(dev->em_pd->em_table, + lockdep_is_held(&em_pd_mutex))); kfree(dev->em_pd); dev->em_pd = NULL; @@ -669,9 +670,9 @@ void em_dev_unregister_perf_domain(struct device *dev) } EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain); 
-static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) +static struct em_perf_table *em_table_dup(struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_state *ps, *new_ps; int ps_size; @@ -693,7 +694,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd) } static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, - struct em_perf_table __rcu *em_table) + struct em_perf_table *em_table) { int ret; @@ -723,7 +724,7 @@ static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd, static void em_adjust_new_capacity(struct device *dev, struct em_perf_domain *pd) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; em_table = em_table_dup(pd); if (!em_table) { @@ -814,7 +815,7 @@ static void em_update_workfn(struct work_struct *work) */ int em_dev_update_chip_binning(struct device *dev) { - struct em_perf_table __rcu *em_table; + struct em_perf_table *em_table; struct em_perf_domain *pd; int i, ret; From 17f08280cf89baf5e4620fc7af300082bcee7e24 Mon Sep 17 00:00:00 2001 From: Jeson Gao Date: Fri, 7 Mar 2025 13:23:49 +0000 Subject: [PATCH 124/139] PM: EM: Rework the depends on for CONFIG_ENERGY_MODEL Energy models are no longer used only by CPUs; GPUs can use them as well. Moreover, even with only one CPU, the energy model can still be used to align control in thermal. So remove the dependency on SMP and allow PM_DEVFREQ as an alternative to CPU_FREQ. Signed-off-by: Jeson Gao [Added missing SMP config option in DTPM_CPU dependency] Signed-off-by: Lukasz Luba Link: https://patch.msgid.link/20250307132649.4056210-1-lukasz.luba@arm.com [ rjw: Subject edits ] Signed-off-by: Rafael J.
Wysocki --- drivers/powercap/Kconfig | 2 +- kernel/power/Kconfig | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index 69ef8d081c98..03c4c796d993 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -82,7 +82,7 @@ config DTPM config DTPM_CPU bool "Add CPU power capping based on the energy model" - depends on DTPM && ENERGY_MODEL + depends on DTPM && ENERGY_MODEL && SMP help This enables support for CPU power limitation based on energy model. diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ca947ed32e3d..54a623680019 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -380,8 +380,7 @@ config CPU_PM config ENERGY_MODEL bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" - depends on SMP - depends on CPU_FREQ + depends on CPU_FREQ || PM_DEVFREQ help Several subsystems (thermal and/or the task scheduler for example) can leverage information about the energy consumed by devices to From fffadbdd6b5acdb6390d6d0bc3ad6d3da5d2bd53 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Fri, 7 Mar 2025 10:43:34 +0100 Subject: [PATCH 125/139] cpupower: Make lib versioning scheme more obvious and fix version link library versioning was broken: libcpupower.so.0.0.1 libcpupower.so -> libcpupower.so.0.0.1 libcpupower.so.1 -> libcpupower.so.0.0.1 and is fixed by this patch to: libcpupower.so.1.0.1 libcpupower.so -> libcpupower.so.1.0.1 libcpupower.so.1 -> libcpupower.so.1.0.1 Link: https://lore.kernel.org/r/20250307094334.39587-1-trenn@suse.de Signed-off-by: Thomas Renninger Signed-off-by: Shuah Khan --- tools/power/cpupower/Makefile | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tools/power/cpupower/Makefile b/tools/power/cpupower/Makefile index 51a95239fe06..835123add0ed 100644 --- a/tools/power/cpupower/Makefile +++ b/tools/power/cpupower/Makefile @@ -52,8 +52,11 @@ DESTDIR ?= # and _should_ modify the 
PACKAGE_BUGREPORT definition VERSION:= $(shell ./utils/version-gen.sh) -LIB_MAJ= 0.0.1 -LIB_MIN= 1 +LIB_FIX= 1 +LIB_MIN= 0 +LIB_MAJ= 1 +LIB_VER= $(LIB_MAJ).$(LIB_MIN).$(LIB_FIX) + PACKAGE = cpupower PACKAGE_BUGREPORT = linux-pm@vger.kernel.org @@ -200,14 +203,14 @@ $(OUTPUT)lib/%.o: $(LIB_SRC) $(LIB_HEADERS) $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -fPIC -o $@ -c lib/$*.c -$(OUTPUT)libcpupower.so.$(LIB_MAJ): $(LIB_OBJS) +$(OUTPUT)libcpupower.so.$(LIB_VER): $(LIB_OBJS) $(ECHO) " LD " $@ $(QUIET) $(CC) -shared $(CFLAGS) $(LDFLAGS) -o $@ \ - -Wl,-soname,libcpupower.so.$(LIB_MIN) $(LIB_OBJS) + -Wl,-soname,libcpupower.so.$(LIB_MAJ) $(LIB_OBJS) @ln -sf $(@F) $(OUTPUT)libcpupower.so - @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MIN) + @ln -sf $(@F) $(OUTPUT)libcpupower.so.$(LIB_MAJ) -libcpupower: $(OUTPUT)libcpupower.so.$(LIB_MAJ) +libcpupower: $(OUTPUT)libcpupower.so.$(LIB_VER) # Let all .o files depend on its .c file and all headers # Might be worth to put this into utils/Makefile at some point of time @@ -217,7 +220,7 @@ $(OUTPUT)%.o: %.c $(ECHO) " CC " $@ $(QUIET) $(CC) $(CFLAGS) -I./lib -I ./utils -o $@ -c $*.c -$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_MAJ) +$(OUTPUT)cpupower: $(UTIL_OBJS) $(OUTPUT)libcpupower.so.$(LIB_VER) $(ECHO) " CC " $@ ifeq ($(strip $(STATIC)),true) $(QUIET) $(CC) $(CFLAGS) $(LDFLAGS) $(UTIL_OBJS) -lrt -lpci -L$(OUTPUT) -o $@ @@ -262,7 +265,7 @@ update-po: $(OUTPUT)po/$(PACKAGE).pot done; endif -compile-bench: $(OUTPUT)libcpupower.so.$(LIB_MAJ) +compile-bench: $(OUTPUT)libcpupower.so.$(LIB_VER) @V=$(V) confdir=$(confdir) $(MAKE) -C bench O=$(OUTPUT) # we compile into subdirectories. 
if the target directory is not the From be4ae8c19492cd6d5de61ccb34ffb3f5ede5eec8 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Mon, 10 Mar 2025 00:28:48 -0500 Subject: [PATCH 126/139] cpufreq: tegra186: Share policy per cluster This functionally brings tegra186 in line with tegra210 and tegra194, sharing a cpufreq policy between all cores in a cluster. Reviewed-by: Sumit Gupta Acked-by: Thierry Reding Signed-off-by: Aaron Kling Signed-off-by: Viresh Kumar --- drivers/cpufreq/tegra186-cpufreq.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index b54a77be54e6..cbabb726c664 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -73,11 +73,18 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy) { struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id; + u32 cpu; policy->freq_table = data->clusters[cluster].table; policy->cpuinfo.transition_latency = 300 * 1000; policy->driver_data = NULL; + /* set same policy for all cpus in a cluster */ + for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) { + if (data->cpus[cpu].bpmp_cluster_id == cluster) + cpumask_set_cpu(cpu, policy->cpus); + } + return 0; } From 13b4f9e126cb55b65430bae7e704cea23c78620c Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 7 Mar 2025 02:17:50 +0000 Subject: [PATCH 127/139] PM: sleep: Remove unused pm_generic_ wrappers pm_generic_thaw_early() has been unused since 2016's commit 294f47ffd55c ("PM / Domains: Remove redundant system PM callbacks") pm_generic_freeze_late() has been unused since 2019's commit 3cd7957e85e6 ("ACPI: PM: Simplify and fix PM domain hibernation callbacks") Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250307021750.457600-1-linux@treblig.org Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/generic_ops.c | 24 ------------------------ include/linux/pm.h | 4 ---- 2 files changed, 28 deletions(-) diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c index 4fa525668cb7..6502720bb564 100644 --- a/drivers/base/power/generic_ops.c +++ b/drivers/base/power/generic_ops.c @@ -114,18 +114,6 @@ int pm_generic_freeze_noirq(struct device *dev) } EXPORT_SYMBOL_GPL(pm_generic_freeze_noirq); -/** - * pm_generic_freeze_late - Generic freeze_late callback for subsystems. - * @dev: Device to freeze. - */ -int pm_generic_freeze_late(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->freeze_late ? pm->freeze_late(dev) : 0; -} -EXPORT_SYMBOL_GPL(pm_generic_freeze_late); - /** * pm_generic_freeze - Generic freeze callback for subsystems. * @dev: Device to freeze. @@ -186,18 +174,6 @@ int pm_generic_thaw_noirq(struct device *dev) } EXPORT_SYMBOL_GPL(pm_generic_thaw_noirq); -/** - * pm_generic_thaw_early - Generic thaw_early callback for subsystems. - * @dev: Device to thaw. - */ -int pm_generic_thaw_early(struct device *dev) -{ - const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; - - return pm && pm->thaw_early ? pm->thaw_early(dev) : 0; -} -EXPORT_SYMBOL_GPL(pm_generic_thaw_early); - /** * pm_generic_thaw - Generic thaw callback for subsystems. * @dev: Device to thaw. 
diff --git a/include/linux/pm.h b/include/linux/pm.h index 63a8dffda787..f0bd8fbae4f2 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -839,10 +839,8 @@ extern int pm_generic_resume_early(struct device *dev); extern int pm_generic_resume_noirq(struct device *dev); extern int pm_generic_resume(struct device *dev); extern int pm_generic_freeze_noirq(struct device *dev); -extern int pm_generic_freeze_late(struct device *dev); extern int pm_generic_freeze(struct device *dev); extern int pm_generic_thaw_noirq(struct device *dev); -extern int pm_generic_thaw_early(struct device *dev); extern int pm_generic_thaw(struct device *dev); extern int pm_generic_restore_noirq(struct device *dev); extern int pm_generic_restore_early(struct device *dev); @@ -884,10 +882,8 @@ static inline void dpm_for_each_dev(void *data, void (*fn)(struct device *, void #define pm_generic_resume_noirq NULL #define pm_generic_resume NULL #define pm_generic_freeze_noirq NULL -#define pm_generic_freeze_late NULL #define pm_generic_freeze NULL #define pm_generic_thaw_noirq NULL -#define pm_generic_thaw_early NULL #define pm_generic_thaw NULL #define pm_generic_restore_noirq NULL #define pm_generic_restore_early NULL From 0f42194c6b22d687fd53c8aea5413cf976366672 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 11 Mar 2025 17:08:22 +0100 Subject: [PATCH 128/139] PM: s2idle: Drop redundant locks when entering s2idle The calls to cpus_read_lock|unlock() protects us from getting CPUS hotplugged, while entering suspend-to-idle. However, when s2idle_enter() is called we should be far beyond the point when CPUs may be hotplugged. Let's therefore simplify the code and drop the use of the lock. Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20250311160827.1129643-2-ulf.hansson@linaro.org [ rjw: Rewrote the new comment ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 09f8397bae15..1876abf1be15 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -91,6 +91,12 @@ static void s2idle_enter(void) { trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true); + /* + * The correctness of the code below depends on the number of online + * CPUs being stable, but CPUs cannot be taken offline or put online + * while it is running. + */ + raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; @@ -98,8 +104,6 @@ static void s2idle_enter(void) s2idle_state = S2IDLE_STATE_ENTER; raw_spin_unlock_irq(&s2idle_lock); - cpus_read_lock(); - /* Push all the CPUs into the idle loop. */ wake_up_all_idle_cpus(); /* Make the current CPU wait so it can enter the idle loop too. */ @@ -112,8 +116,6 @@ static void s2idle_enter(void) */ wake_up_all_idle_cpus(); - cpus_read_unlock(); - raw_spin_lock_irq(&s2idle_lock); out: From 4b7d654258e0dd69a7d00be387b388f9d7544912 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 11 Mar 2025 17:08:23 +0100 Subject: [PATCH 129/139] PM: s2idle: Extend comment in s2idle_enter() The s2idle_lock must be held while checking for a pending wakeup and while moving into S2IDLE_STATE_ENTER, to make sure a wakeup doesn't get lost. Let's extend the comment in the code to make this clear. Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20250311160827.1129643-3-ulf.hansson@linaro.org [ rjw: Rewrote the new comment ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/suspend.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 1876abf1be15..6fae1e0a331c 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -95,8 +95,12 @@ static void s2idle_enter(void) * The correctness of the code below depends on the number of online * CPUs being stable, but CPUs cannot be taken offline or put online * while it is running. + * + * The s2idle_lock must be acquired before the pending wakeup check to + * prevent pm_system_wakeup() from running as a whole between that check + * and the subsequent s2idle_state update in which case a wakeup event + * would get lost. */ - raw_spin_lock_irq(&s2idle_lock); if (pm_wakeup_pending()) goto out; From 956af869a2b7a1ab1f67bf8a74c51897f6f6715d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 12 Mar 2025 11:47:19 +0100 Subject: [PATCH 130/139] PM: sleep: core: Fix indentation in dpm_wait_for_children() The body of dpm_wait_for_children() is indented by 7 spaces instead of a single TAB. Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/9c8ff2b103c3ba7b0d27bdc8248b05e3b1dc9551.1741776430.git.geert+renesas@glider.be Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d240dc352b1f..1b0eb83f7061 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -249,7 +249,7 @@ static int dpm_wait_fn(struct device *dev, void *async_ptr) static void dpm_wait_for_children(struct device *dev, bool async) { - device_for_each_child(dev, &async, dpm_wait_fn); + device_for_each_child(dev, &async, dpm_wait_fn); } static void dpm_wait_for_suppliers(struct device *dev, bool async) From 3efeeaf85f5cab84aa05003308eddb62e2acf5bd Mon Sep 17 00:00:00 2001 From: "Dr. 
David Alan Gilbert" Date: Fri, 7 Mar 2025 21:23:47 +0000 Subject: [PATCH 131/139] PM: clk: Remove unused pm_clk_remove() pm_clk_remove() is currently unused. It hasn't been used since at least 2011 when it was renamed from pm_runtime_clk_remove() by commit 3d5c30367cbc ("PM: Rename clock management functions") Remove it. Note that the __pm_clk_remove() is still used and is left in. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20250307212347.68785-1-linux@treblig.org Signed-off-by: Rafael J. Wysocki --- drivers/base/power/clock_ops.c | 40 ---------------------------------- include/linux/pm_clock.h | 4 ---- 2 files changed, 44 deletions(-) diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index 97a53215a274..b69bcb37c830 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -343,46 +343,6 @@ static void __pm_clk_remove(struct pm_clock_entry *ce) kfree(ce); } -/** - * pm_clk_remove - Stop using a device clock for power management. - * @dev: Device whose clock should not be used for PM any more. - * @con_id: Connection ID of the clock. - * - * Remove the clock represented by @con_id from the list of clocks used for - * the power management of @dev. - */ -void pm_clk_remove(struct device *dev, const char *con_id) -{ - struct pm_subsys_data *psd = dev_to_psd(dev); - struct pm_clock_entry *ce; - - if (!psd) - return; - - pm_clk_list_lock(psd); - - list_for_each_entry(ce, &psd->clock_list, node) { - if (!con_id && !ce->con_id) - goto remove; - else if (!con_id || !ce->con_id) - continue; - else if (!strcmp(con_id, ce->con_id)) - goto remove; - } - - pm_clk_list_unlock(psd); - return; - - remove: - list_del(&ce->node); - if (ce->enabled_when_prepared) - psd->clock_op_might_sleep--; - pm_clk_list_unlock(psd); - - __pm_clk_remove(ce); -} -EXPORT_SYMBOL_GPL(pm_clk_remove); - /** * pm_clk_remove_clk - Stop using a device clock for power management. 
* @dev: Device whose clock should not be used for PM any more. diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 45c3f3ccbaf8..c3b46fa358d3 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -42,7 +42,6 @@ extern void pm_clk_destroy(struct device *dev); extern int pm_clk_add(struct device *dev, const char *con_id); extern int pm_clk_add_clk(struct device *dev, struct clk *clk); extern int of_pm_clk_add_clks(struct device *dev); -extern void pm_clk_remove(struct device *dev, const char *con_id); extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); extern int pm_clk_suspend(struct device *dev); extern int pm_clk_resume(struct device *dev); @@ -75,9 +74,6 @@ static inline int of_pm_clk_add_clks(struct device *dev) { return -EINVAL; } -static inline void pm_clk_remove(struct device *dev, const char *con_id) -{ -} #define pm_clk_suspend NULL #define pm_clk_resume NULL static inline void pm_clk_remove_clk(struct device *dev, struct clk *clk) From 68cb0139fec8e05b93978dc0ef1bc8df90a86419 Mon Sep 17 00:00:00 2001 From: Jacky Bai Date: Fri, 7 Mar 2025 22:55:47 +0800 Subject: [PATCH 132/139] cpuidle: Init cpuidle only for present CPUs for_each_possible_cpu() is currently used to initialize cpuidle in the following cpuidle drivers: drivers/cpuidle/cpuidle-arm.c drivers/cpuidle/cpuidle-big_little.c drivers/cpuidle/cpuidle-psci.c drivers/cpuidle/cpuidle-qcom-spm.c drivers/cpuidle/cpuidle-riscv-sbi.c However, in cpu_dev_register_generic(), for_each_present_cpu() is used to register CPU devices, which means the CPU devices are only registered for present CPUs and not all possible CPUs. With nosmp or maxcpus=0, only the boot CPU is present, which leads to the failure: | Failed to register cpuidle device for cpu1 after which the cpuidle registration of all CPUs is rolled back. Change for_each_possible_cpu() to for_each_present_cpu() in the above cpuidle drivers to ensure that they only register cpuidle devices for CPUs that are actually present.
Fixes: b0c69e1214bc ("drivers: base: Use present CPUs in GENERIC_CPU_DEVICES") Reviewed-by: Dhruva Gole Reviewed-by: Sudeep Holla Tested-by: Yuanjie Yang Signed-off-by: Jacky Bai Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/20250307145547.2784821-1-ping.bai@nxp.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-arm.c | 8 ++++---- drivers/cpuidle/cpuidle-big_little.c | 2 +- drivers/cpuidle/cpuidle-psci.c | 4 ++-- drivers/cpuidle/cpuidle-qcom-spm.c | 2 +- drivers/cpuidle/cpuidle-riscv-sbi.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c index caba6f4bb1b7..e044fefdb816 100644 --- a/drivers/cpuidle/cpuidle-arm.c +++ b/drivers/cpuidle/cpuidle-arm.c @@ -137,9 +137,9 @@ static int __init arm_idle_init_cpu(int cpu) /* * arm_idle_init - Initializes arm cpuidle driver * - * Initializes arm cpuidle driver for all CPUs, if any CPU fails - * to register cpuidle driver then rollback to cancel all CPUs - * registration. + * Initializes arm cpuidle driver for all present CPUs, if any + * CPU fails to register cpuidle driver then rollback to cancel + * all CPUs registration. 
*/ static int __init arm_idle_init(void) { @@ -147,7 +147,7 @@ static int __init arm_idle_init(void) struct cpuidle_driver *drv; struct cpuidle_device *dev; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { ret = arm_idle_init_cpu(cpu); if (ret) goto out_fail; diff --git a/drivers/cpuidle/cpuidle-big_little.c b/drivers/cpuidle/cpuidle-big_little.c index 74972deda0ea..4abba42fcc31 100644 --- a/drivers/cpuidle/cpuidle-big_little.c +++ b/drivers/cpuidle/cpuidle-big_little.c @@ -148,7 +148,7 @@ static int __init bl_idle_driver_init(struct cpuidle_driver *drv, int part_id) if (!cpumask) return -ENOMEM; - for_each_possible_cpu(cpu) + for_each_present_cpu(cpu) if (smp_cpuid_part(cpu) == part_id) cpumask_set_cpu(cpu, cpumask); diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c index 2562dc001fc1..a4594c3d6562 100644 --- a/drivers/cpuidle/cpuidle-psci.c +++ b/drivers/cpuidle/cpuidle-psci.c @@ -400,7 +400,7 @@ static int psci_idle_init_cpu(struct device *dev, int cpu) /* * psci_idle_probe - Initializes PSCI cpuidle driver * - * Initializes PSCI cpuidle driver for all CPUs, if any CPU fails + * Initializes PSCI cpuidle driver for all present CPUs, if any CPU fails * to register cpuidle driver then rollback to cancel all CPUs * registration. 
*/ @@ -410,7 +410,7 @@ static int psci_cpuidle_probe(struct platform_device *pdev) struct cpuidle_driver *drv; struct cpuidle_device *dev; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { ret = psci_idle_init_cpu(&pdev->dev, cpu); if (ret) goto out_fail; diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c index 3ab240e0e122..5f386761b156 100644 --- a/drivers/cpuidle/cpuidle-qcom-spm.c +++ b/drivers/cpuidle/cpuidle-qcom-spm.c @@ -135,7 +135,7 @@ static int spm_cpuidle_drv_probe(struct platform_device *pdev) if (ret) return dev_err_probe(&pdev->dev, ret, "set warm boot addr failed"); - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { ret = spm_cpuidle_register(&pdev->dev, cpu); if (ret && ret != -ENODEV) { dev_err(&pdev->dev, diff --git a/drivers/cpuidle/cpuidle-riscv-sbi.c b/drivers/cpuidle/cpuidle-riscv-sbi.c index 0c92a628bbd4..0fe1ece9fbdc 100644 --- a/drivers/cpuidle/cpuidle-riscv-sbi.c +++ b/drivers/cpuidle/cpuidle-riscv-sbi.c @@ -529,8 +529,8 @@ static int sbi_cpuidle_probe(struct platform_device *pdev) return ret; } - /* Initialize CPU idle driver for each CPU */ - for_each_possible_cpu(cpu) { + /* Initialize CPU idle driver for each present CPU */ + for_each_present_cpu(cpu) { ret = sbi_cpuidle_init_cpu(&pdev->dev, cpu); if (ret) { pr_debug("HART%ld: idle driver init failed\n", From 03f1444016b71feffa1dfb8a51f15ba592f94b13 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Mar 2025 17:00:00 +0100 Subject: [PATCH 133/139] PM: sleep: Fix handling devices with direct_complete set on errors When dpm_suspend() fails, some devices with power.direct_complete set may not have been handled by device_suspend() yet, so runtime PM has not been disabled for them yet even though power.direct_complete is set. 
Since device_resume() expects that runtime PM has been disabled for all devices with power.direct_complete set, it will attempt to reenable runtime PM for the devices that have not been processed by device_suspend(), which does not make sense. Had those devices had runtime PM disabled before device_suspend() had run, device_resume() would have inadvertently enabled runtime PM for them, but this is not expected to happen because it would require ->prepare() callbacks to return positive values for devices with runtime PM disabled, which would be invalid. In practice, this issue is most likely benign because pm_runtime_enable() will not allow the "disable depth" counter to underflow, but it causes a warning message to be printed for each affected device. To allow device_resume() to distinguish the "direct complete" devices that have been processed by device_suspend() from those which have not been handled by it, make device_suspend() set power.is_suspended for "direct complete" devices. Next, move the power.is_suspended check in device_resume() before the power.direct_complete check in it to make it skip the "direct complete" devices that have not been handled by device_suspend(). This change is based on a preliminary patch from Saravana Kannan. Fixes: aae4518b3124 ("PM / sleep: Mechanism to avoid resuming runtime-suspended devices unnecessarily") Link: https://lore.kernel.org/linux-pm/20241114220921.2529905-2-saravanak@google.com/ Reported-by: Saravana Kannan Signed-off-by: Rafael J.
Wysocki Reviewed-by: Saravana Kannan Link: https://patch.msgid.link/12627587.O9o76ZdvQC@rjwysocki.net --- drivers/base/power/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 1b0eb83f7061..ad50018b8047 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -938,6 +938,9 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) if (dev->power.syscore) goto Complete; + if (!dev->power.is_suspended) + goto Complete; + if (dev->power.direct_complete) { /* * Allow new children to be added under the device after this @@ -963,9 +966,6 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) */ dev->power.is_prepared = false; - if (!dev->power.is_suspended) - goto Unlock; - if (dev->pm_domain) { info = "power domain "; callback = pm_op(&dev->pm_domain->ops, state); @@ -1005,7 +1005,6 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) error = dpm_run_callback(callback, dev, state, info); dev->power.is_suspended = false; - Unlock: device_unlock(dev); dpm_watchdog_clear(&wd); @@ -1669,6 +1668,7 @@ static int device_suspend(struct device *dev, pm_message_t state, bool async) pm_runtime_disable(dev); if (pm_runtime_status_suspended(dev)) { pm_dev_dbg(dev, state, "direct-complete "); + dev->power.is_suspended = true; goto Complete; } From 45f589b7167f36290d29c79e3a442dc0b13c086a Mon Sep 17 00:00:00 2001 From: Jacky Bai Date: Thu, 13 Mar 2025 09:39:28 +0800 Subject: [PATCH 134/139] cpufreq: Init cpufreq only for present CPUs for_each_possible_cpu() is currently used to initialize cpufreq. However, in cpu_dev_register_generic(), for_each_present_cpu() is used to register CPU devices which means the CPU devices are only registered for present CPUs and not all possible CPUs. 
With nosmp or maxcpus=0, only the boot CPU is present, which leads to a cpufreq probe failure or a deferred probe because no CPU device is available for the CPUs that are not present. Change for_each_possible_cpu() to for_each_present_cpu() in the affected cpufreq drivers to ensure that they only register cpufreq for CPUs that are actually present. Fixes: b0c69e1214bc ("drivers: base: Use present CPUs in GENERIC_CPU_DEVICES") Reviewed-by: Sudeep Holla Signed-off-by: Jacky Bai Signed-off-by: Viresh Kumar --- drivers/cpufreq/armada-8k-cpufreq.c | 2 +- drivers/cpufreq/cpufreq-dt.c | 2 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 2 +- drivers/cpufreq/mediatek-cpufreq.c | 2 +- drivers/cpufreq/mvebu-cpufreq.c | 2 +- drivers/cpufreq/qcom-cpufreq-hw.c | 2 +- drivers/cpufreq/qcom-cpufreq-nvmem.c | 8 ++++---- drivers/cpufreq/scmi-cpufreq.c | 2 +- drivers/cpufreq/scpi-cpufreq.c | 2 +- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 6 +++--- drivers/cpufreq/virtual-cpufreq.c | 2 +- 11 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c index 7a979db81f09..5a3545bd0d8d 100644 --- a/drivers/cpufreq/armada-8k-cpufreq.c +++ b/drivers/cpufreq/armada-8k-cpufreq.c @@ -47,7 +47,7 @@ static void __init armada_8k_get_sharing_cpus(struct clk *cur_clk, { int cpu; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { struct device *cpu_dev; struct clk *clk; diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 778916f89a51..e80dd982a3e2 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -283,7 +283,7 @@ static int dt_cpufreq_probe(struct platform_device *pdev) int ret, cpu; /* Request resources early so we can return in case of -EPROBE_DEFER */ - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { ret = dt_cpufreq_early_init(&pdev->dev, cpu); if (ret) goto err; diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index aa209f5527dc..74f1b4c796e4
100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c +++ b/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -303,7 +303,7 @@ static int mtk_cpufreq_hw_driver_probe(struct platform_device *pdev) struct regulator *cpu_reg; /* Make sure that all CPU supplies are available before proceeding. */ - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { cpu_dev = get_cpu_device(cpu); if (!cpu_dev) return dev_err_probe(&pdev->dev, -EPROBE_DEFER, diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c index 2656b88db378..f3f02c4b6888 100644 --- a/drivers/cpufreq/mediatek-cpufreq.c +++ b/drivers/cpufreq/mediatek-cpufreq.c @@ -631,7 +631,7 @@ static int mtk_cpufreq_probe(struct platform_device *pdev) return dev_err_probe(&pdev->dev, -ENODEV, "failed to get mtk cpufreq platform data\n"); - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { info = mtk_cpu_dvfs_info_lookup(cpu); if (info) continue; diff --git a/drivers/cpufreq/mvebu-cpufreq.c b/drivers/cpufreq/mvebu-cpufreq.c index 7f3cfe668f30..2aad4c04673c 100644 --- a/drivers/cpufreq/mvebu-cpufreq.c +++ b/drivers/cpufreq/mvebu-cpufreq.c @@ -56,7 +56,7 @@ static int __init armada_xp_pmsu_cpufreq_init(void) * it), and registers the clock notifier that will take care * of doing the PMSU part of a frequency transition. 
*/ - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { struct device *cpu_dev; struct clk *clk; int ret; diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c index 4b3b3dbc7d38..8422704a3b10 100644 --- a/drivers/cpufreq/qcom-cpufreq-hw.c +++ b/drivers/cpufreq/qcom-cpufreq-hw.c @@ -306,7 +306,7 @@ static void qcom_get_related_cpus(int index, struct cpumask *m) struct of_phandle_args args; int cpu, ret; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { cpu_np = of_cpu_device_node_get(cpu); if (!cpu_np) continue; diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c index 3a8ed723a23e..54f8117103c8 100644 --- a/drivers/cpufreq/qcom-cpufreq-nvmem.c +++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c @@ -489,7 +489,7 @@ static int qcom_cpufreq_probe(struct platform_device *pdev) nvmem_cell_put(speedbin_nvmem); } - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { struct dev_pm_opp_config config = { .supported_hw = NULL, }; @@ -543,7 +543,7 @@ static int qcom_cpufreq_probe(struct platform_device *pdev) dev_err(cpu_dev, "Failed to register platform device\n"); free_opp: - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { dev_pm_domain_detach_list(drv->cpus[cpu].pd_list); dev_pm_opp_clear_config(drv->cpus[cpu].opp_token); } @@ -557,7 +557,7 @@ static void qcom_cpufreq_remove(struct platform_device *pdev) platform_device_unregister(cpufreq_dt_pdev); - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { dev_pm_domain_detach_list(drv->cpus[cpu].pd_list); dev_pm_opp_clear_config(drv->cpus[cpu].opp_token); } @@ -568,7 +568,7 @@ static int qcom_cpufreq_suspend(struct device *dev) struct qcom_cpufreq_drv *drv = dev_get_drvdata(dev); unsigned int cpu; - for_each_possible_cpu(cpu) + for_each_present_cpu(cpu) qcom_cpufreq_suspend_pd_devs(drv, cpu); return 0; diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index ff2897789797..c310aeebc8f3 100644 --- 
a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -104,7 +104,7 @@ scmi_get_sharing_cpus(struct device *cpu_dev, int domain, int cpu, tdomain; struct device *tcpu_dev; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { if (cpu == cpu_dev->id) continue; diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index 048dc43a9997..17cda84f00df 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -65,7 +65,7 @@ scpi_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask) if (domain < 0) return domain; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { if (cpu == cpu_dev->id) continue; diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index 17d6a149f580..47d6840b3489 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -262,7 +262,7 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) snprintf(name, sizeof(name), "speed%d", speed); config.prop_name = name; - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { struct device *cpu_dev = get_cpu_device(cpu); if (!cpu_dev) { @@ -288,7 +288,7 @@ static int sun50i_cpufreq_nvmem_probe(struct platform_device *pdev) pr_err("Failed to register platform device\n"); free_opp: - for_each_possible_cpu(cpu) + for_each_present_cpu(cpu) dev_pm_opp_clear_config(opp_tokens[cpu]); kfree(opp_tokens); @@ -302,7 +302,7 @@ static void sun50i_cpufreq_nvmem_remove(struct platform_device *pdev) platform_device_unregister(cpufreq_dt_pdev); - for_each_possible_cpu(cpu) + for_each_present_cpu(cpu) dev_pm_opp_clear_config(opp_tokens[cpu]); kfree(opp_tokens); diff --git a/drivers/cpufreq/virtual-cpufreq.c b/drivers/cpufreq/virtual-cpufreq.c index 45becb92aa4a..7dd1b0c263c7 100644 --- a/drivers/cpufreq/virtual-cpufreq.c +++ b/drivers/cpufreq/virtual-cpufreq.c @@ -138,7 +138,7 @@ static int virt_cpufreq_get_sharing_cpus(struct 
cpufreq_policy *policy) cur_perf_domain = readl_relaxed(base + policy->cpu * PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET); - for_each_possible_cpu(cpu) { + for_each_present_cpu(cpu) { cpu_dev = get_cpu_device(cpu); if (!cpu_dev) continue; From c6d5df70004fb12e774ef107f1465387d8e260c5 Mon Sep 17 00:00:00 2001 From: Imran Shaik Date: Thu, 13 Mar 2025 11:33:39 +0530 Subject: [PATCH 135/139] dt-bindings: cpufreq: cpufreq-qcom-hw: Add QCS8300 compatible Document compatible for cpufreq hardware on Qualcomm QCS8300 platform. Signed-off-by: Imran Shaik Reviewed-by: Krzysztof Kozlowski Signed-off-by: Viresh Kumar --- Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml index e937eb7355e7..90d9e7f43406 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml @@ -34,6 +34,7 @@ properties: - description: v2 of CPUFREQ HW (EPSS) items: - enum: + - qcom,qcs8300-cpufreq-epss - qcom,qdu1000-cpufreq-epss - qcom,sa8255p-cpufreq-epss - qcom,sa8775p-cpufreq-epss @@ -166,6 +167,7 @@ allOf: compatible: contains: enum: + - qcom,qcs8300-cpufreq-epss - qcom,sc7280-cpufreq-epss - qcom,sm8250-cpufreq-epss - qcom,sm8350-cpufreq-epss From a298c20f39dea74d1edaa50a1c37856b0f5f55c3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 13 Mar 2025 10:08:20 +0100 Subject: [PATCH 136/139] dt-bindings: cpufreq: cpufreq-qcom-hw: Add missing constraint for interrupt-names When narrowing properties per variant, the 'interrupt-names' should have the same constraints as 'interrupts'. Add missing upper bound on the property. 
Fixes: e69003202434 ("dt-bindings: cpufreq: cpufreq-qcom-hw: Add QCM2290") Fixes: 7ae24e054f75 ("dt-bindings: cpufreq: cpufreq-qcom-hw: Sanitize data per compatible") Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring (Arm) Signed-off-by: Viresh Kumar --- .../devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml index 90d9e7f43406..ee2a7eff1657 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml @@ -128,6 +128,7 @@ allOf: interrupt-names: minItems: 1 + maxItems: 1 - if: properties: @@ -161,6 +162,7 @@ allOf: interrupt-names: minItems: 2 + maxItems: 2 - if: properties: @@ -189,6 +191,7 @@ allOf: interrupt-names: minItems: 3 + maxItems: 3 - if: properties: @@ -213,6 +216,7 @@ allOf: interrupt-names: minItems: 2 + maxItems: 2 examples: From 684ab6f7ec5de57e4c349bb6a2207b1e5ed2fd71 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 13 Mar 2025 10:08:21 +0100 Subject: [PATCH 137/139] dt-bindings: cpufreq: cpufreq-qcom-hw: Drop redundant minItems:1 List cannot have 0 items, so 'minItems: 1' in each if:then: is redundant. 
Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring (Arm) Signed-off-by: Viresh Kumar --- .../devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml index ee2a7eff1657..475331dc71bc 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml @@ -115,19 +115,15 @@ allOf: then: properties: reg: - minItems: 1 maxItems: 1 reg-names: - minItems: 1 maxItems: 1 interrupts: - minItems: 1 maxItems: 1 interrupt-names: - minItems: 1 maxItems: 1 - if: From 169b9b1db893eca4f008b665d304eee372b6a627 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 13 Mar 2025 10:08:22 +0100 Subject: [PATCH 138/139] dt-bindings: cpufreq: cpufreq-qcom-hw: Narrow properties on SDX75, SA8775p and SM8650 Add SDX75 and SA8775p compatibles to respective if:then: blocks to narrow their properties and add a new section for SM8650 with four 'reg' and 'interrupts' (top-level already allows four). SA8775p DTS comes without interrupts, but only because they might not be available for OS under given firmware.
Signed-off-by: Krzysztof Kozlowski Acked-by: Rob Herring (Arm) Signed-off-by: Viresh Kumar --- .../bindings/cpufreq/cpufreq-qcom-hw.yaml | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml index 475331dc71bc..e0242bed3342 100644 --- a/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml +++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-qcom-hw.yaml @@ -112,6 +112,7 @@ allOf: enum: - qcom,qcm2290-cpufreq-hw - qcom,sar2130p-cpufreq-epss + - qcom,sdx75-cpufreq-epss then: properties: reg: @@ -133,6 +134,7 @@ allOf: enum: - qcom,qdu1000-cpufreq-epss - qcom,sa8255p-cpufreq-epss + - qcom,sa8775p-cpufreq-epss - qcom,sc7180-cpufreq-hw - qcom,sc8180x-cpufreq-hw - qcom,sc8280xp-cpufreq-epss @@ -214,6 +216,29 @@ allOf: minItems: 2 maxItems: 2 + - if: + properties: + compatible: + contains: + enum: + - qcom,sm8650-cpufreq-epss + then: + properties: + reg: + minItems: 4 + maxItems: 4 + + reg-names: + minItems: 4 + maxItems: 4 + + interrupts: + minItems: 4 + maxItems: 4 + + interrupt-names: + minItems: 4 + maxItems: 4 examples: - | From 3860cbe239639503e56bd4365c6bf4cb957ef04e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 19 Mar 2025 11:43:24 +0000 Subject: [PATCH 139/139] PM: sleep: Fix bit masking operation The mask operation link->flags | DL_FLAG_PM_RUNTIME is always true which is incorrect. The mask operation should be using the bit-wise & operator. Fix this. Fixes: bca84a7b93fd ("PM: sleep: Use DPM_FLAG_SMART_SUSPEND conditionally") Signed-off-by: Colin Ian King Link: https://patch.msgid.link/20250319114324.791829-1-colin.i.king@gmail.com Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index ad50018b8047..ac2a197c1234 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1836,7 +1836,7 @@ static bool device_prepare_smart_suspend(struct device *dev) idx = device_links_read_lock(); list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { - if (!(link->flags | DL_FLAG_PM_RUNTIME)) + if (!(link->flags & DL_FLAG_PM_RUNTIME)) continue; if (!dev_pm_smart_suspend(link->supplier) &&