From d80a75624051b817043431f847470fb4680f2582 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 26 May 2025 19:30:55 +0800 Subject: [PATCH 01/75] cpufreq: CPPC: Remove cpu_data_list After commit a28b2bfc099c ("cppc_cpufreq: replace per-cpu data array with a list"), cpu_data can be got from policy->driver_data, so cpu_data_list is not actually needed and can be removed. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-2-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cppc_cpufreq.c | 25 ------------------------- include/acpi/cppc_acpi.h | 1 - 2 files changed, 26 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index b7c688a5659c..f3b5ea9fcbf5 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -26,14 +26,6 @@ #include -/* - * This list contains information parsed from per CPU ACPI _CPC and _PSD - * structures: e.g. the highest and lowest supported performance, capabilities, - * desired performance, level requested etc. Depending on the share_type, not - * all CPUs will have an entry in the list. 
- */ -static LIST_HEAD(cpu_data_list); - static struct cpufreq_driver cppc_cpufreq_driver; #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE @@ -567,8 +559,6 @@ static struct cppc_cpudata *cppc_cpufreq_get_cpu_data(unsigned int cpu) goto free_mask; } - list_add(&cpu_data->node, &cpu_data_list); - return cpu_data; free_mask: @@ -583,7 +573,6 @@ static void cppc_cpufreq_put_cpu_data(struct cpufreq_policy *policy) { struct cppc_cpudata *cpu_data = policy->driver_data; - list_del(&cpu_data->node); free_cpumask_var(cpu_data->shared_cpu_map); kfree(cpu_data); policy->driver_data = NULL; @@ -954,24 +943,10 @@ static int __init cppc_cpufreq_init(void) return ret; } -static inline void free_cpu_data(void) -{ - struct cppc_cpudata *iter, *tmp; - - list_for_each_entry_safe(iter, tmp, &cpu_data_list, node) { - free_cpumask_var(iter->shared_cpu_map); - list_del(&iter->node); - kfree(iter); - } - -} - static void __exit cppc_cpufreq_exit(void) { cpufreq_unregister_driver(&cppc_cpufreq_driver); cppc_freq_invariance_exit(); - - free_cpu_data(); } module_exit(cppc_cpufreq_exit); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 325e9543e08f..20f3d62e7a16 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -139,7 +139,6 @@ struct cppc_perf_fb_ctrs { /* Per CPU container for runtime CPPC management. */ struct cppc_cpudata { - struct list_head node; struct cppc_perf_caps perf_caps; struct cppc_perf_ctrls perf_ctrls; struct cppc_perf_fb_ctrs perf_fb_ctrs; From 3d5978ea6cbc4df192d0ea1800ef5d55b28b965e Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 26 May 2025 19:30:56 +0800 Subject: [PATCH 02/75] cpufreq: CPPC: Do not return a value from populate_efficiency_class() The return value of populate_efficiency_class() is never needed and the result of it doesn't affect the initialization of cppc_cpufreq. It makes more sense to change it into a void function. 
Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-3-zhenglifeng1@huawei.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cppc_cpufreq.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index f3b5ea9fcbf5..c2be4b188a23 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -480,7 +480,7 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, return 0; } -static int populate_efficiency_class(void) +static void populate_efficiency_class(void) { struct acpi_madt_generic_interrupt *gicc; DECLARE_BITMAP(used_classes, 256) = {}; @@ -495,7 +495,7 @@ static int populate_efficiency_class(void) if (bitmap_weight(used_classes, 256) <= 1) { pr_debug("Efficiency classes are all equal (=%d). " "No EM registered", class); - return -EINVAL; + return; } /* @@ -512,8 +512,6 @@ static int populate_efficiency_class(void) index++; } cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; - - return 0; } static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) @@ -529,9 +527,8 @@ static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) } #else -static int populate_efficiency_class(void) +static void populate_efficiency_class(void) { - return 0; } #endif From c83a92df2fc60bf0b3130cdf0bc2104d61750317 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 26 May 2025 19:30:57 +0800 Subject: [PATCH 03/75] cpufreq: CPPC: Remove forward declaration of cppc_cpufreq_register_em() cppc_cpufreq_register_em() is only used in populate_efficiency_class(). A forward declaration of it is not necessary. Move cppc_cpufreq_register_em() in front of populate_efficiency_class() and remove the forward declaration of cppc_cpufreq_register_em(). No functional change. 
Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250526113057.3086513-4-zhenglifeng1@huawei.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cppc_cpufreq.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index c2be4b188a23..a1fd0ff22bc5 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -344,7 +344,6 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu) #if defined(CONFIG_ARM64) && defined(CONFIG_ENERGY_MODEL) static DEFINE_PER_CPU(unsigned int, efficiency_class); -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy); /* Create an artificial performance state every CPPC_EM_CAP_STEP capacity unit. */ #define CPPC_EM_CAP_STEP (20) @@ -480,6 +479,18 @@ static int cppc_get_cpu_cost(struct device *cpu_dev, unsigned long KHz, return 0; } +static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) +{ + struct cppc_cpudata *cpu_data; + struct em_data_callback em_cb = + EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); + + cpu_data = policy->driver_data; + em_dev_register_perf_domain(get_cpu_device(policy->cpu), + get_perf_level_count(policy), &em_cb, + cpu_data->shared_cpu_map, 0); +} + static void populate_efficiency_class(void) { struct acpi_madt_generic_interrupt *gicc; @@ -514,18 +525,6 @@ static void populate_efficiency_class(void) cppc_cpufreq_driver.register_em = cppc_cpufreq_register_em; } -static void cppc_cpufreq_register_em(struct cpufreq_policy *policy) -{ - struct cppc_cpudata *cpu_data; - struct em_data_callback em_cb = - EM_ADV_DATA_CB(cppc_get_cpu_power, cppc_get_cpu_cost); - - cpu_data = policy->driver_data; - em_dev_register_perf_domain(get_cpu_device(policy->cpu), - get_perf_level_count(policy), &em_cb, - cpu_data->shared_cpu_map, 0); -} - #else static void populate_efficiency_class(void) { From 
2e1185c9d7b4f890614e891ae1414653b35b88e3 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Tue, 27 May 2025 21:59:09 +0900 Subject: [PATCH 04/75] cpufreq: userspace: set CPUFREQ_GOV_STRICT_TARGET flag When the userspace governor is used, the user intends to set a fixed CPU frequency for a policy, for whatever reason. The CPUFREQ_GOV_STRICT_TARGET flag is the required behaviour. Without this flag, the intel_pstate driver, with HWP enabled, will set HWP_MIN_PERF to the target frequency and HWP_MAX_PERF to the policy maximum, when configuring the HWP_REQUEST MSR. This lets the hardware choose any frequency between the target frequency and the policy maximum, which is not the intended behaviour. To fix this, `cat scaling_setspeed > scaling_max_freq` had to be done. With this patch, that is no longer necessary. Setting scaling_setspeed is sufficient, as expected. Signed-off-by: Shashank Balaji Link: https://patch.msgid.link/20250527-userspace-governor-doc-v2-1-0e22c69920f2@sony.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_userspace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 2c42fee76daa..77d62152cd38 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -134,6 +134,7 @@ static struct cpufreq_governor cpufreq_gov_userspace = { .store_setspeed = cpufreq_set, .show_setspeed = show_speed, .owner = THIS_MODULE, + .flags = CPUFREQ_GOV_STRICT_TARGET, }; MODULE_AUTHOR("Dominik Brodowski , " From 221504a63419bc3ef6e72deb42f9f8476eb83538 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Tue, 27 May 2025 21:59:10 +0900 Subject: [PATCH 05/75] cpufreq: docs: userspace: Explain HW coordination influence Extend the "scaling_setspeed" sysfs attribute description in the userspace governor documentation to cover possible differences between the requested and actual CPU frequency. 
Signed-off-by: Shashank Balaji Link: https://patch.msgid.link/20250527-userspace-governor-doc-v2-2-0e22c69920f2@sony.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/cpufreq.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst index 2d74af7f0efe..cacb9f0307dd 100644 --- a/Documentation/admin-guide/pm/cpufreq.rst +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -398,7 +398,9 @@ policy limits change after that. This governor does not do anything by itself. Instead, it allows user space to set the CPU frequency for the policy it is attached to by writing to the -``scaling_setspeed`` attribute of that policy. +``scaling_setspeed`` attribute of that policy. Though the intention may be to +set an exact frequency for the policy, the actual frequency may vary depending +on hardware coordination, thermal and power limits, and other factors. ``schedutil`` ------------- From 271ff96d6066347cd267ac3bcd6021bd4d38913d Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 16 Jun 2025 09:12:07 +0300 Subject: [PATCH 06/75] PM: runtime: Document return values of suspend-related API functions Document return values for device suspend and idle related API functions. Signed-off-by: Sakari Ailus Link: https://patch.msgid.link/20250616061212.2286741-2-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_runtime.h | 147 ++++++++++++++++++++++++++++++++++--- 1 file changed, 138 insertions(+), 9 deletions(-) diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index e7cb70fcc0af..9dd2e4031a27 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -337,6 +337,20 @@ static inline void pm_runtime_release_supplier(struct device_link *link) {} * Invoke the "idle check" callback of @dev and, depending on its return value, * set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend has been enabled for it). + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing + * or device not in %RPM_ACTIVE state. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -EINPROGRESS: Suspend already in progress. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. + * Other values and conditions for the above values are possible as returned by + * Runtime PM idle and suspend callbacks. */ static inline int pm_runtime_idle(struct device *dev) { @@ -346,6 +360,18 @@ static inline int pm_runtime_idle(struct device *dev) /** * pm_runtime_suspend - Suspend a device synchronously. * @dev: Target device. + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. + * Other values and conditions for the above values are possible as returned by + * Runtime PM suspend callbacks. 
*/ static inline int pm_runtime_suspend(struct device *dev) { @@ -358,6 +384,18 @@ static inline int pm_runtime_suspend(struct device *dev) * * Set up autosuspend of @dev or suspend it (depending on whether or not * autosuspend is enabled for it) without engaging its "idle check" callback. + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. + * Other values and conditions for the above values are possible as returned by + * Runtime PM suspend callbacks. */ static inline int pm_runtime_autosuspend(struct device *dev) { @@ -379,6 +417,18 @@ static inline int pm_runtime_resume(struct device *dev) * * Queue up a work item to run an equivalent of pm_runtime_idle() for @dev * asynchronously. + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero, Runtime PM status change ongoing + * or device not in %RPM_ACTIVE state. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -EINPROGRESS: Suspend already in progress. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. */ static inline int pm_request_idle(struct device *dev) { @@ -400,6 +450,17 @@ static inline int pm_request_resume(struct device *dev) * * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev * asynchronously. + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -EINPROGRESS: Suspend already in progress. 
+ * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. */ static inline int pm_request_autosuspend(struct device *dev) { @@ -464,6 +525,17 @@ static inline int pm_runtime_resume_and_get(struct device *dev) * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_idle(). + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -EINPROGRESS: Suspend already in progress. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. */ static inline int pm_runtime_put(struct device *dev) { @@ -478,6 +550,17 @@ DEFINE_FREE(pm_runtime_put, struct device *, if (_T) pm_runtime_put(_T)) * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -EINPROGRESS: Suspend already in progress. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. */ static inline int __pm_runtime_put_autosuspend(struct device *dev) { @@ -490,6 +573,17 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev) * * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. 
+ * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -EINPROGRESS: Suspend already in progress. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. */ static inline int pm_runtime_put_autosuspend(struct device *dev) { @@ -506,9 +600,20 @@ static inline int pm_runtime_put_autosuspend(struct device *dev) * return value, set up autosuspend of @dev or suspend it (depending on whether * or not autosuspend has been enabled for it). * - * The possible return values of this function are the same as for - * pm_runtime_idle() and the runtime PM usage counter of @dev remains - * decremented in all cases, even if it returns an error code. + * The runtime PM usage counter of @dev remains decremented in all cases, even + * if it returns an error code. + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. + * Other values and conditions for the above values are possible as returned by + * Runtime PM suspend callbacks. */ static inline int pm_runtime_put_sync(struct device *dev) { @@ -522,9 +627,21 @@ static inline int pm_runtime_put_sync(struct device *dev) * Decrement the runtime PM usage counter of @dev and if it turns out to be * equal to 0, carry out runtime-suspend of @dev synchronously. * - * The possible return values of this function are the same as for - * pm_runtime_suspend() and the runtime PM usage counter of @dev remains - * decremented in all cases, even if it returns an error code. + * The runtime PM usage counter of @dev remains decremented in all cases, even + * if it returns an error code. + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. 
+ * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change + * ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. + * Other values and conditions for the above values are possible as returned by + * Runtime PM suspend callbacks. */ static inline int pm_runtime_put_sync_suspend(struct device *dev) { @@ -539,9 +656,21 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev) * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending * on whether or not autosuspend has been enabled for it). * - * The possible return values of this function are the same as for - * pm_runtime_autosuspend() and the runtime PM usage counter of @dev remains - * decremented in all cases, even if it returns an error code. + * The runtime PM usage counter of @dev remains decremented in all cases, even + * if it returns an error code. + * + * Return: + * * 0: Success. + * * -EINVAL: Runtime PM error. + * * -EACCES: Runtime PM disabled. + * * -EAGAIN: Runtime PM usage_count non-zero or Runtime PM status change ongoing. + * * -EBUSY: Runtime PM child_count non-zero. + * * -EPERM: Device PM QoS resume latency 0. + * * -EINPROGRESS: Suspend already in progress. + * * -ENOSYS: CONFIG_PM not enabled. + * * 1: Device already suspended. + * Other values and conditions for the above values are possible as returned by + * Runtime PM suspend callbacks. */ static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { From b3db492e8335417dfd66c1fa2ea08e1d2f7b6736 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 16 Jun 2025 09:12:08 +0300 Subject: [PATCH 07/75] PM: runtime: Mark last busy stamp in pm_runtime_put_autosuspend() Set device's last busy timestamp to current time in pm_runtime_put_autosuspend(). 
Callers wishing not to do that will need to use __pm_runtime_put_autosuspend(). Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Link: https://patch.msgid.link/20250616061212.2286741-3-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 23 ++++++++++------------- include/linux/pm_runtime.h | 12 +++++++----- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 63344bea8393..e7bbdc66d64c 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -411,8 +411,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: pm_request_idle(dev) and return its result `int pm_runtime_put_autosuspend(struct device *dev);` - - does the same as __pm_runtime_put_autosuspend() for now, but in the - future, will also call pm_runtime_mark_last_busy() as well, DO NOT USE! + - set the power.last_busy field to the current time and decrement the + device's usage counter; if the result is 0 then run + pm_request_autosuspend(dev) and return its result `int __pm_runtime_put_autosuspend(struct device *dev);` - decrement the device's usage counter; if the result is 0 then run @@ -870,11 +871,9 @@ device is automatically suspended (the subsystem or driver still has to call the appropriate PM routines); rather it means that runtime suspends will automatically be delayed until the desired period of inactivity has elapsed. -Inactivity is determined based on the power.last_busy field. Drivers should -call pm_runtime_mark_last_busy() to update this field after carrying out I/O, -typically just before calling __pm_runtime_put_autosuspend(). The desired -length of the inactivity period is a matter of policy. Subsystems can set this -length initially by calling pm_runtime_set_autosuspend_delay(), but after device +Inactivity is determined based on the power.last_busy field. 
The desired length +of the inactivity period is a matter of policy. Subsystems can set this length +initially by calling pm_runtime_set_autosuspend_delay(), but after device registration the length should be controlled by user space, using the /sys/devices/.../power/autosuspend_delay_ms attribute. @@ -885,7 +884,7 @@ instead of the non-autosuspend counterparts:: Instead of: pm_runtime_suspend use: pm_runtime_autosuspend; Instead of: pm_schedule_suspend use: pm_request_autosuspend; - Instead of: pm_runtime_put use: __pm_runtime_put_autosuspend; + Instead of: pm_runtime_put use: pm_runtime_put_autosuspend; Instead of: pm_runtime_put_sync use: pm_runtime_put_sync_autosuspend. Drivers may also continue to use the non-autosuspend helper functions; they @@ -922,12 +921,10 @@ Here is a schematic pseudo-code example:: foo_io_completion(struct foo_priv *foo, void *req) { lock(&foo->private_lock); - if (--foo->num_pending_requests == 0) { - pm_runtime_mark_last_busy(&foo->dev); - __pm_runtime_put_autosuspend(&foo->dev); - } else { + if (--foo->num_pending_requests == 0) + pm_runtime_put_autosuspend(&foo->dev); + else foo_process_next_request(foo); - } unlock(&foo->private_lock); /* Send req result back to the user ... */ } diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 9dd2e4031a27..14ca7be96686 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -568,11 +568,13 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev) } /** - * pm_runtime_put_autosuspend - Drop device usage counter and queue autosuspend if 0. + * pm_runtime_put_autosuspend - Update the last access time of a device, drop + * its usage counter and queue autosuspend if the usage counter becomes 0. * @dev: Target device. * - * Decrement the runtime PM usage counter of @dev and if it turns out to be - * equal to 0, queue up a work item for @dev like in pm_request_autosuspend(). 
+ * Update the last access time of @dev, decrement runtime PM usage counter of + * @dev and if it turns out to be equal to 0, queue up a work item for @dev like + * in pm_request_autosuspend(). * * Return: * * 0: Success. @@ -587,8 +589,8 @@ static inline int __pm_runtime_put_autosuspend(struct device *dev) */ static inline int pm_runtime_put_autosuspend(struct device *dev) { - return __pm_runtime_suspend(dev, - RPM_GET_PUT | RPM_ASYNC | RPM_AUTO); + pm_runtime_mark_last_busy(dev); + return __pm_runtime_put_autosuspend(dev); } /** From e24e0630b5ba13e83f65905becde9945518efa0b Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 16 Jun 2025 09:12:09 +0300 Subject: [PATCH 08/75] PM: runtime: Mark last busy stamp in pm_runtime_put_sync_autosuspend() Set device's last busy timestamp to current time in pm_runtime_put_sync_autosuspend(). Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Link: https://patch.msgid.link/20250616061212.2286741-4-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- Documentation/power/runtime_pm.rst | 3 ++- include/linux/pm_runtime.h | 11 +++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index e7bbdc66d64c..9c21c913f9cf 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -428,7 +428,8 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: pm_runtime_suspend(dev) and return its result `int pm_runtime_put_sync_autosuspend(struct device *dev);` - - decrement the device's usage counter; if the result is 0 then run + - set the power.last_busy field to the current time and decrement the + device's usage counter; if the result is 0 then run pm_runtime_autosuspend(dev) and return its result `void pm_runtime_enable(struct device *dev);` diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 14ca7be96686..3a0d5f0ea471 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -651,12 +651,14 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev) } /** - * pm_runtime_put_sync_autosuspend - Drop device usage counter and autosuspend if 0. + * pm_runtime_put_sync_autosuspend - Update the last access time of a device, + * drop device usage counter and autosuspend if 0. * @dev: Target device. * - * Decrement the runtime PM usage counter of @dev and if it turns out to be - * equal to 0, set up autosuspend of @dev or suspend it synchronously (depending - * on whether or not autosuspend has been enabled for it). + * Update the last access time of @dev, decrement the runtime PM usage counter + * of @dev and if it turns out to be equal to 0, set up autosuspend of @dev or + * suspend it synchronously (depending on whether or not autosuspend has been + * enabled for it). * * The runtime PM usage counter of @dev remains decremented in all cases, even * if it returns an error code. 
@@ -676,6 +678,7 @@ static inline int pm_runtime_put_sync_suspend(struct device *dev) */ static inline int pm_runtime_put_sync_autosuspend(struct device *dev) { + pm_runtime_mark_last_busy(dev); return __pm_runtime_suspend(dev, RPM_GET_PUT | RPM_AUTO); } From 08071e64cb642ae19ebd6ffeb13b4f3d130b5860 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 16 Jun 2025 09:12:10 +0300 Subject: [PATCH 09/75] PM: runtime: Mark last busy stamp in pm_runtime_autosuspend() Set device's last busy timestamp to current time in pm_runtime_autosuspend(). Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Link: https://patch.msgid.link/20250616061212.2286741-5-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 15 ++++++--------- include/linux/pm_runtime.h | 9 ++++++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 9c21c913f9cf..39a0b62f6648 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -154,11 +154,9 @@ suspending the device are satisfied) and to queue up a suspend request for the device in that case. If there is no idle callback, or if the callback returns 0, then the PM core will attempt to carry out a runtime suspend of the device, also respecting devices configured for autosuspend. In essence this means a -call to pm_runtime_autosuspend() (do note that drivers needs to update the -device last busy mark, pm_runtime_mark_last_busy(), to control the delay under -this circumstance). To prevent this (for example, if the callback routine has -started a delayed suspend), the routine must return a non-zero value. Negative -error return codes are ignored by the PM core. +call to pm_runtime_autosuspend(). To prevent this (for example, if the callback +routine has started a delayed suspend), the routine must return a non-zero +value. Negative error return codes are ignored by the PM core. 
The helper functions provided by the PM core, described in Section 4, guarantee that the following constraints are met with respect to runtime PM callbacks for @@ -330,10 +328,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: 'power.disable_depth' is different from 0 `int pm_runtime_autosuspend(struct device *dev);` - - same as pm_runtime_suspend() except that the autosuspend delay is taken - `into account;` if pm_runtime_autosuspend_expiration() says the delay has - not yet expired then an autosuspend is scheduled for the appropriate time - and 0 is returned + - same as pm_runtime_suspend() except that a call to + pm_runtime_mark_last_busy() is made and an autosuspend is scheduled for + the appropriate time and 0 is returned `int pm_runtime_resume(struct device *dev);` - execute the subsystem-level resume callback for the device; returns 0 on diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 3a0d5f0ea471..566a07b60f63 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -379,11 +379,13 @@ static inline int pm_runtime_suspend(struct device *dev) } /** - * pm_runtime_autosuspend - Set up autosuspend of a device or suspend it. + * pm_runtime_autosuspend - Update the last access time and set up autosuspend + * of a device. * @dev: Target device. * - * Set up autosuspend of @dev or suspend it (depending on whether or not - * autosuspend is enabled for it) without engaging its "idle check" callback. + * First update the last access time, then set up autosuspend of @dev or suspend + * it (depending on whether or not autosuspend is enabled for it) without + * engaging its "idle check" callback. * * Return: * * 0: Success. 
@@ -399,6 +401,7 @@ static inline int pm_runtime_suspend(struct device *dev) */ static inline int pm_runtime_autosuspend(struct device *dev) { + pm_runtime_mark_last_busy(dev); return __pm_runtime_suspend(dev, RPM_AUTO); } From 18c1fe53d186867243f4cf17f4eef60737a16c4c Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 16 Jun 2025 09:12:11 +0300 Subject: [PATCH 10/75] PM: runtime: Mark last busy stamp in pm_request_autosuspend() Set device's last busy timestamp to current time in pm_request_autosuspend(). Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Link: https://patch.msgid.link/20250616061212.2286741-6-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 6 +++--- include/linux/pm_runtime.h | 8 +++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 39a0b62f6648..91bc93422262 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -354,9 +354,9 @@ drivers/base/power/runtime.c and include/linux/pm_runtime.h: success or error code if the request has not been queued up `int pm_request_autosuspend(struct device *dev);` - - schedule the execution of the subsystem-level suspend callback for the - device when the autosuspend delay has expired; if the delay has already - expired then the work item is queued up immediately + - Call pm_runtime_mark_last_busy() and schedule the execution of the + subsystem-level suspend callback for the device when the autosuspend delay + expires `int pm_schedule_suspend(struct device *dev, unsigned int delay);` - schedule the execution of the subsystem-level suspend callback for the diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 566a07b60f63..778d5988f35e 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -448,11 +448,12 @@ static inline int pm_request_resume(struct device *dev) } /** - * 
pm_request_autosuspend - Queue up autosuspend of a device. + * pm_request_autosuspend - Update the last access time and queue up autosuspend + * of a device. * @dev: Target device. * - * Queue up a work item to run an equivalent pm_runtime_autosuspend() for @dev - * asynchronously. + * Update the last access time of a device and queue up a work item to run an + * equivalent pm_runtime_autosuspend() for @dev asynchronously. * * Return: * * 0: Success. @@ -467,6 +468,7 @@ static inline int pm_request_resume(struct device *dev) */ static inline int pm_request_autosuspend(struct device *dev) { + pm_runtime_mark_last_busy(dev); return __pm_runtime_suspend(dev, RPM_ASYNC | RPM_AUTO); } From cd4da713f99651e99fbce8ed6b6ec8f686c029a8 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 16 Jun 2025 09:12:12 +0300 Subject: [PATCH 11/75] Documentation: PM: *_autosuspend() functions update last busy time Document that the *_autosuspend() variants of the Runtime PM functions update the last busy timestamp. Signed-off-by: Sakari Ailus Reviewed-by: Laurent Pinchart Link: https://patch.msgid.link/20250616061212.2286741-7-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. Wysocki --- Documentation/power/runtime_pm.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/power/runtime_pm.rst b/Documentation/power/runtime_pm.rst index 91bc93422262..c8dbdb8595e5 100644 --- a/Documentation/power/runtime_pm.rst +++ b/Documentation/power/runtime_pm.rst @@ -887,7 +887,8 @@ instead of the non-autosuspend counterparts:: Drivers may also continue to use the non-autosuspend helper functions; they will behave normally, which means sometimes taking the autosuspend delay into -account (see pm_runtime_idle). +account (see pm_runtime_idle). The autosuspend variants of the functions also +call pm_runtime_mark_last_busy(). 
Under some circumstances a driver or subsystem may want to prevent a device from autosuspending immediately, even though the usage counter is zero and the From cda7ac8ce7de84cf32a3871ba5f318aa3b79381e Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 12 Jun 2025 17:53:54 +0530 Subject: [PATCH 12/75] pm: cpupower: Fix the snapshot-order of tsc,mperf, clock in mperf_stop() In the function mperf_start(), mperf_monitor snapshots the time, tsc and finally the aperf,mperf MSRs. However, this order of snapshotting is reversed in mperf_stop(). As a result, the C0 residency (which is computed as delta_mperf * 100 / delta_tsc) is under-reported on CPUs that are 100% busy. Fix this by snapshotting time, tsc and then aperf,mperf in mperf_stop() in the same order as in mperf_start(). Link: https://lore.kernel.org/r/20250612122355.19629-2-gautham.shenoy@amd.com Signed-off-by: Gautham R. Shenoy Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/mperf_monitor.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c index 73b6b10cbdd2..5ae02c3d5b64 100644 --- a/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/mperf_monitor.c @@ -240,9 +240,9 @@ static int mperf_stop(void) int cpu; for (cpu = 0; cpu < cpu_count; cpu++) { - mperf_measure_stats(cpu); - mperf_get_tsc(&tsc_at_measure_end[cpu]); clock_gettime(CLOCK_REALTIME, &time_end[cpu]); + mperf_get_tsc(&tsc_at_measure_end[cpu]); + mperf_measure_stats(cpu); } return 0; From 14a3318b4ac8ae0ca2e1132a89de167e1030fbdb Mon Sep 17 00:00:00 2001 From: "Gautham R.
Shenoy" Date: Thu, 12 Jun 2025 17:53:55 +0530 Subject: [PATCH 13/75] pm: cpupower: Fix printing of CORE, CPU fields in cpupower-monitor After the commit 0014f65e3df0 ("pm: cpupower: remove hard-coded topology depth values"), "cpupower monitor" output ceased to print the CORE and the CPU fields on a multi-socket platform. The reason for this is that the patch changed the behaviour to break out of the switch-case after printing the PKG details, while prior to the patch, the CORE and the CPU details would also get printed since the "if" condition check would pass for any level whose topology depth was lower than that of a package. Fix this by ensuring all the details below a desired topology depth are printed in the cpupower monitor output. Link: https://lore.kernel.org/r/20250612122355.19629-3-gautham.shenoy@amd.com Fixes: 0014f65e3df0 ("pm: cpupower: remove hard-coded topology depth values") Signed-off-by: Gautham R. Shenoy Signed-off-by: Shuah Khan --- tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c index ad493157f826..e8b3841d5c0f 100644 --- a/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c +++ b/tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c @@ -121,10 +121,8 @@ void print_header(int topology_depth) switch (topology_depth) { case TOPOLOGY_DEPTH_PKG: printf(" PKG|"); - break; case TOPOLOGY_DEPTH_CORE: printf("CORE|"); - break; case TOPOLOGY_DEPTH_CPU: printf(" CPU|"); break; @@ -167,10 +165,8 @@ void print_results(int topology_depth, int cpu) switch (topology_depth) { case TOPOLOGY_DEPTH_PKG: printf("%4d|", cpu_top.core_info[cpu].pkg); - break; case TOPOLOGY_DEPTH_CORE: printf("%4d|", cpu_top.core_info[cpu].core); - break; case TOPOLOGY_DEPTH_CPU: printf("%4d|", cpu_top.core_info[cpu].cpu); break; From 0e18b1b106a29472ad7dab8eb97f4f24da870507 Mon Sep 17 00:00:00
2001 From: Lukas Bulwahn Date: Mon, 16 Jun 2025 16:01:08 +0200 Subject: [PATCH 14/75] MAINTAINERS: adjust file entry in CPU HOTPLUG Commit c7f005f70d22 ("rust: cpu: Add CpuId::current() to retrieve current CPU ID") adds the file rust/helpers/cpu.c, and intends to add a file entry for that file in the MAINTAINERS section CPU HOTPLUG. However, the added file entry is rust/helper/cpu.c; note the subtle difference between the two file names. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. Adjust the file entry to the intended file. Fixes: c7f005f70d22 ("rust: cpu: Add CpuId::current() to retrieve current CPU ID") Signed-off-by: Lukas Bulwahn Reviewed-by: Boqun Feng Signed-off-by: Viresh Kumar --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0c1d245bf7b8..38d6d38beae2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6255,7 +6255,7 @@ F: include/linux/cpuhotplug.h F: include/linux/smpboot.h F: kernel/cpu.c F: kernel/smpboot.* -F: rust/helper/cpu.c +F: rust/helpers/cpu.c F: rust/kernel/cpu.rs CPU IDLE TIME MANAGEMENT FRAMEWORK From 897c0958808ac6b11a9715adef38682d7fa66229 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Thu, 12 Jun 2025 21:11:30 +0000 Subject: [PATCH 15/75] cpufreq: apple: drop default ARCH_APPLE in Kconfig When the first driver for Apple Silicon was upstreamed we accidentally included `default ARCH_APPLE` in its Kconfig which then spread to almost every subsequent driver. As soon as ARCH_APPLE is set to y this will pull in many drivers as built-ins which is not what we want. Thus, drop `default ARCH_APPLE` from Kconfig. 
Signed-off-by: Sven Peter Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 0d46402e3094..4346629d3bc0 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -28,7 +28,6 @@ config ARM_APPLE_SOC_CPUFREQ tristate "Apple Silicon SoC CPUFreq support" depends on ARCH_APPLE || (COMPILE_TEST && 64BIT) select PM_OPP - default ARCH_APPLE help This adds the CPUFreq driver for Apple Silicon machines (e.g. Apple M1). From 43ab245a9ec3837a0ca75c3ed9ba887d8e8b022e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 13 Jun 2025 09:16:44 +0200 Subject: [PATCH 16/75] cpufreq: brcmstb-avs: Fully open-code compatible for grepping It is very useful to find driver implementing compatibles with `git grep compatible`, so driver should not use defines for that string, even if this means string will be effectively duplicated. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Florian Fainelli Signed-off-by: Viresh Kumar --- drivers/cpufreq/brcmstb-avs-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c index 7b841a086acc..5940d262374f 100644 --- a/drivers/cpufreq/brcmstb-avs-cpufreq.c +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -765,7 +765,7 @@ static void brcm_avs_cpufreq_remove(struct platform_device *pdev) } static const struct of_device_id brcm_avs_cpufreq_match[] = { - { .compatible = BRCM_AVS_CPU_DATA }, + { .compatible = "brcm,avs-cpu-data-mem" }, { } }; MODULE_DEVICE_TABLE(of, brcm_avs_cpufreq_match); From 10bb7f09e346f152d2627e0b3619c402d64a50e9 Mon Sep 17 00:00:00 2001 From: Abhinav Ananthu Date: Fri, 13 Jun 2025 15:48:16 +0530 Subject: [PATCH 17/75] rust: cpufreq: Ensure C ABI compatibility in all unsafe Update all `unsafe extern "C"` callback functions in the cpufreq module to use `kernel::ffi` types (`c_int`, `c_uint`, 
etc.) instead of Rust-native types like `i32`, `u32`, or `usize`. This change ensures that all Rust callbacks have signatures that are ABI-compatible with their corresponding C counterparts, which is critical for FFI correctness and safety. Suggested-by: Miguel Ojeda Link: https://github.com/Rust-for-Linux/linux/issues/1170 Signed-off-by: Abhinav Ananthu Signed-off-by: Viresh Kumar --- rust/kernel/cpufreq.rs | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index 11b03e9d7e89..481a6d2dc362 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -1207,8 +1207,8 @@ impl Registration { /// - The pointer arguments must be valid pointers. unsafe extern "C" fn target_callback( ptr: *mut bindings::cpufreq_policy, - target_freq: u32, - relation: u32, + target_freq: c_uint, + relation: c_uint, ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the @@ -1226,7 +1226,7 @@ impl Registration { /// - The pointer arguments must be valid pointers. unsafe extern "C" fn target_index_callback( ptr: *mut bindings::cpufreq_policy, - index: u32, + index: c_uint, ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the @@ -1249,7 +1249,7 @@ impl Registration { /// - The pointer arguments must be valid pointers. unsafe extern "C" fn fast_switch_callback( ptr: *mut bindings::cpufreq_policy, - target_freq: u32, + target_freq: c_uint, ) -> kernel::ffi::c_uint { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1263,10 +1263,10 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. 
unsafe extern "C" fn adjust_perf_callback( - cpu: u32, - min_perf: usize, - target_perf: usize, - capacity: usize, + cpu: c_uint, + min_perf: c_ulong, + target_perf: c_ulong, + capacity: c_ulong, ) { // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; @@ -1284,7 +1284,7 @@ impl Registration { /// - The pointer arguments must be valid pointers. unsafe extern "C" fn get_intermediate_callback( ptr: *mut bindings::cpufreq_policy, - index: u32, + index: c_uint, ) -> kernel::ffi::c_uint { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1305,7 +1305,7 @@ impl Registration { /// - The pointer arguments must be valid pointers. unsafe extern "C" fn target_intermediate_callback( ptr: *mut bindings::cpufreq_policy, - index: u32, + index: c_uint, ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the @@ -1325,7 +1325,7 @@ impl Registration { /// # Safety /// /// - This function may only be called from the cpufreq C infrastructure. - unsafe extern "C" fn get_callback(cpu: u32) -> kernel::ffi::c_uint { + unsafe extern "C" fn get_callback(cpu: c_uint) -> kernel::ffi::c_uint { // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; @@ -1351,7 +1351,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. - unsafe extern "C" fn bios_limit_callback(cpu: i32, limit: *mut u32) -> kernel::ffi::c_int { + unsafe extern "C" fn bios_limit_callback(cpu: c_int, limit: *mut c_uint) -> kernel::ffi::c_int { // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. 
let cpu_id = unsafe { CpuId::from_i32_unchecked(cpu) }; @@ -1371,7 +1371,7 @@ impl Registration { /// - The pointer arguments must be valid pointers. unsafe extern "C" fn set_boost_callback( ptr: *mut bindings::cpufreq_policy, - state: i32, + state: c_int, ) -> kernel::ffi::c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the From b0a86fb0b27f1ae944a6b0a1a9187c332bb2773e Mon Sep 17 00:00:00 2001 From: Abhinav Ananthu Date: Fri, 20 Jun 2025 14:22:30 +0530 Subject: [PATCH 18/75] rust: cpufreq: use c_ types from kernel prelude Update cpufreq FFI callback signatures to use `c_int` from the `kernel::prelude`, rather than accessing it explicitly through `kernel::ffi::c_int`. Although these types are defined in the `ffi` crate, they are re-exported via `kernel::prelude`. This aligns with the Rust-for-Linux coding guidelines and ensures proper C ABI compatibility across platforms. Signed-off-by: Abhinav Ananthu Suggested-by: Viresh Kumar Reviewed-by: Alice Ryhl [ Viresh: Fixed rustfmtcheck errors ] Signed-off-by: Viresh Kumar --- rust/kernel/cpufreq.rs | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs index 481a6d2dc362..d6a14239f4ba 100644 --- a/rust/kernel/cpufreq.rs +++ b/rust/kernel/cpufreq.rs @@ -1061,7 +1061,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. - unsafe extern "C" fn init_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + unsafe extern "C" fn init_callback(ptr: *mut bindings::cpufreq_policy) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1094,7 +1094,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. 
/// - The pointer arguments must be valid pointers. - unsafe extern "C" fn online_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + unsafe extern "C" fn online_callback(ptr: *mut bindings::cpufreq_policy) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1109,9 +1109,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. - unsafe extern "C" fn offline_callback( - ptr: *mut bindings::cpufreq_policy, - ) -> kernel::ffi::c_int { + unsafe extern "C" fn offline_callback(ptr: *mut bindings::cpufreq_policy) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1126,9 +1124,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. - unsafe extern "C" fn suspend_callback( - ptr: *mut bindings::cpufreq_policy, - ) -> kernel::ffi::c_int { + unsafe extern "C" fn suspend_callback(ptr: *mut bindings::cpufreq_policy) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1143,7 +1139,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. - unsafe extern "C" fn resume_callback(ptr: *mut bindings::cpufreq_policy) -> kernel::ffi::c_int { + unsafe extern "C" fn resume_callback(ptr: *mut bindings::cpufreq_policy) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1171,9 +1167,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. 
/// - The pointer arguments must be valid pointers. - unsafe extern "C" fn verify_callback( - ptr: *mut bindings::cpufreq_policy_data, - ) -> kernel::ffi::c_int { + unsafe extern "C" fn verify_callback(ptr: *mut bindings::cpufreq_policy_data) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1188,9 +1182,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. - unsafe extern "C" fn setpolicy_callback( - ptr: *mut bindings::cpufreq_policy, - ) -> kernel::ffi::c_int { + unsafe extern "C" fn setpolicy_callback(ptr: *mut bindings::cpufreq_policy) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1209,7 +1201,7 @@ impl Registration { ptr: *mut bindings::cpufreq_policy, target_freq: c_uint, relation: c_uint, - ) -> kernel::ffi::c_int { + ) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1227,7 +1219,7 @@ impl Registration { unsafe extern "C" fn target_index_callback( ptr: *mut bindings::cpufreq_policy, index: c_uint, - ) -> kernel::ffi::c_int { + ) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1250,7 +1242,7 @@ impl Registration { unsafe extern "C" fn fast_switch_callback( ptr: *mut bindings::cpufreq_policy, target_freq: c_uint, - ) -> kernel::ffi::c_uint { + ) -> c_uint { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. 
let policy = unsafe { Policy::from_raw_mut(ptr) }; @@ -1285,7 +1277,7 @@ impl Registration { unsafe extern "C" fn get_intermediate_callback( ptr: *mut bindings::cpufreq_policy, index: c_uint, - ) -> kernel::ffi::c_uint { + ) -> c_uint { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. let policy = unsafe { Policy::from_raw_mut(ptr) }; @@ -1306,7 +1298,7 @@ impl Registration { unsafe extern "C" fn target_intermediate_callback( ptr: *mut bindings::cpufreq_policy, index: c_uint, - ) -> kernel::ffi::c_int { + ) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. @@ -1325,7 +1317,7 @@ impl Registration { /// # Safety /// /// - This function may only be called from the cpufreq C infrastructure. - unsafe extern "C" fn get_callback(cpu: c_uint) -> kernel::ffi::c_uint { + unsafe extern "C" fn get_callback(cpu: c_uint) -> c_uint { // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) }; @@ -1351,7 +1343,7 @@ impl Registration { /// /// - This function may only be called from the cpufreq C infrastructure. /// - The pointer arguments must be valid pointers. - unsafe extern "C" fn bios_limit_callback(cpu: c_int, limit: *mut c_uint) -> kernel::ffi::c_int { + unsafe extern "C" fn bios_limit_callback(cpu: c_int, limit: *mut c_uint) -> c_int { // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number. let cpu_id = unsafe { CpuId::from_i32_unchecked(cpu) }; @@ -1372,7 +1364,7 @@ impl Registration { unsafe extern "C" fn set_boost_callback( ptr: *mut bindings::cpufreq_policy, state: c_int, - ) -> kernel::ffi::c_int { + ) -> c_int { from_result(|| { // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the // lifetime of `policy`. 
From 22679d807dea5c065d8019acfce48f20e87ba5ca Mon Sep 17 00:00:00 2001 From: Abhinav Ananthu Date: Fri, 20 Jun 2025 15:29:21 +0530 Subject: [PATCH 19/75] rust: opp: use c_* types via kernel prelude Update OPP FFI callback signatures to use `c_int` from the `kernel::prelude`, instead of accessing it via `kernel::ffi::c_int`. Although these types are defined in a crate named `ffi`, they are re-exported via the `kernel::prelude` and should be used from there. This aligns with the Rust-for-Linux coding guidelines and ensures ABI correctness when interfacing with C code. Suggested-by: Viresh Kumar Signed-off-by: Abhinav Ananthu Signed-off-by: Viresh Kumar --- rust/kernel/opp.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/kernel/opp.rs b/rust/kernel/opp.rs index a566fc3e7dcb..846583da9a2f 100644 --- a/rust/kernel/opp.rs +++ b/rust/kernel/opp.rs @@ -514,9 +514,9 @@ extern "C" fn config_clks( dev: *mut bindings::device, opp_table: *mut bindings::opp_table, opp: *mut bindings::dev_pm_opp, - _data: *mut kernel::ffi::c_void, + _data: *mut c_void, scaling_down: bool, - ) -> kernel::ffi::c_int { + ) -> c_int { from_result(|| { // SAFETY: 'dev' is guaranteed by the C code to be valid. let dev = unsafe { Device::get_device(dev) }; @@ -540,8 +540,8 @@ extern "C" fn config_regulators( old_opp: *mut bindings::dev_pm_opp, new_opp: *mut bindings::dev_pm_opp, regulators: *mut *mut bindings::regulator, - count: kernel::ffi::c_uint, - ) -> kernel::ffi::c_int { + count: c_uint, + ) -> c_int { from_result(|| { // SAFETY: 'dev' is guaranteed by the C code to be valid. 
let dev = unsafe { Device::get_device(dev) }; From b1b41bc072baf7301b1ae95fe417de09a5ad47e2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:14:53 +0200 Subject: [PATCH 20/75] cpufreq: armada-8k: make both cpu masks static An earlier patch marked one of the two CPU masks as 'static' to reduce stack usage, but if CONFIG_NR_CPUS is large enough, the function still produces a warning for compile testing: drivers/cpufreq/armada-8k-cpufreq.c: In function 'armada_8k_cpufreq_init': drivers/cpufreq/armada-8k-cpufreq.c:203:1: error: the frame size of 1416 bytes is larger than 1408 bytes [-Werror=frame-larger-than=] Normally this should be done using alloc_cpumask_var(), but since the driver already has a static mask and the probe function is not called concurrently, use the same trick for both. Fixes: 1ffec650d07f ("cpufreq: armada-8k: Avoid excessive stack usage") Signed-off-by: Arnd Bergmann Signed-off-by: Viresh Kumar --- drivers/cpufreq/armada-8k-cpufreq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c index 5a3545bd0d8d..006f4c554dd7 100644 --- a/drivers/cpufreq/armada-8k-cpufreq.c +++ b/drivers/cpufreq/armada-8k-cpufreq.c @@ -132,7 +132,7 @@ static int __init armada_8k_cpufreq_init(void) int ret = 0, opps_index = 0, cpu, nb_cpus; struct freq_table *freq_tables; struct device_node *node; - static struct cpumask cpus; + static struct cpumask cpus, shared_cpus; node = of_find_matching_node_and_match(NULL, armada_8k_cpufreq_of_match, NULL); @@ -154,7 +154,6 @@ static int __init armada_8k_cpufreq_init(void) * divisions of it). 
*/ for_each_cpu(cpu, &cpus) { - struct cpumask shared_cpus; struct device *cpu_dev; struct clk *clk; From afc6a5b12b62c7743fb96fe27864604d7b33a5d2 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Wed, 25 Jun 2025 10:05:22 +0800 Subject: [PATCH 21/75] powercap: intel_rapl_msr: Add PL4 support for Panther Lake Add Panther Lake to the list of processors where PL4 is supported. Signed-off-by: Zhang Rui Link: https://patch.msgid.link/20250625020522.253548-1-rui.zhang@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_msr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 8ad2115d65f6..4ed06c71a3ac 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -150,6 +150,7 @@ static const struct x86_cpu_id pl4_support_ids[] = { X86_MATCH_VFM(INTEL_METEORLAKE_L, NULL), X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), {} }; From 82a7021f5074ff69478b5104739b91ff2ae3bb4a Mon Sep 17 00:00:00 2001 From: Qiao Wei Date: Thu, 12 Jun 2025 15:06:13 +0800 Subject: [PATCH 22/75] powercap: intel_rapl: Add support for Bartlett Lake platform Add Bartlett Lake to the list of supported processors in the RAPL common driver. Acked-by: Zhang Rui Signed-off-by: Qiao Wei [ rjw: Changelog edits ] Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index faa0b6bc5b53..c7e7f9bf5313 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1277,6 +1277,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_RAPTORLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_RAPTORLAKE_P, &rapl_defaults_core), X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_BARTLETTLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_METEORLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_METEORLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &rapl_defaults_spr_server), From b74710eaff314d6afe4fb0bbe9bc7657bf226fd4 Mon Sep 17 00:00:00 2001 From: "John B. Wyatt IV" Date: Tue, 24 Jun 2025 16:41:04 -0400 Subject: [PATCH 23/75] cpupower: Improve Python binding's Makefile Add a few build variables to make it easier for distributions to package the bindings. Allow current variables to be overwritten by environment variables that are passed to make. CCing Thorsten Leemhuis . https://lore.kernel.org/r/20250624204105.457971-1-jwyatt@redhat.com Signed-off-by: John B. Wyatt IV Signed-off-by: John B. Wyatt IV Signed-off-by: Shuah Khan --- tools/power/cpupower/bindings/python/Makefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/power/cpupower/bindings/python/Makefile b/tools/power/cpupower/bindings/python/Makefile index 81db39a03efb..4527cd732b42 100644 --- a/tools/power/cpupower/bindings/python/Makefile +++ b/tools/power/cpupower/bindings/python/Makefile @@ -4,20 +4,22 @@ # This Makefile expects you have already run `make install-lib` in the lib # directory for the bindings to be created. 
-CC := gcc +CC ?= gcc +# CFLAGS ?= +LDFLAGS ?= -lcpupower HAVE_SWIG := $(shell if which swig >/dev/null 2>&1; then echo 1; else echo 0; fi) HAVE_PYCONFIG := $(shell if which python-config >/dev/null 2>&1; then echo 1; else echo 0; fi) -PY_INCLUDE = $(firstword $(shell python-config --includes)) -INSTALL_DIR = $(shell python3 -c "import site; print(site.getsitepackages()[0])") +PY_INCLUDE ?= $(firstword $(shell python-config --includes)) +INSTALL_DIR ?= $(shell python3 -c "import site; print(site.getsitepackages()[0])") all: _raw_pylibcpupower.so _raw_pylibcpupower.so: raw_pylibcpupower_wrap.o - $(CC) -shared -lcpupower raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so + $(CC) -shared $(LDFLAGS) raw_pylibcpupower_wrap.o -o _raw_pylibcpupower.so raw_pylibcpupower_wrap.o: raw_pylibcpupower_wrap.c - $(CC) -fPIC -c raw_pylibcpupower_wrap.c $(PY_INCLUDE) + $(CC) $(CFLAGS) $(PY_INCLUDE) -fPIC -c raw_pylibcpupower_wrap.c raw_pylibcpupower_wrap.c: raw_pylibcpupower.swg ifeq ($(HAVE_SWIG),0) From 4a26df233266a628157d7f0285451d8655defdfc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 1 Jul 2025 17:30:01 -0500 Subject: [PATCH 24/75] cpufreq: armada-8k: Fix off by one in armada_8k_cpufreq_free_table() The freq_tables[] array has num_possible_cpus() elements so, to avoid an out of bounds access, this loop should be capped at "< nb_cpus" instead of "<= nb_cpus". The freq_tables[] array is allocated in armada_8k_cpufreq_init(). 
Cc: stable@vger.kernel.org Fixes: f525a670533d ("cpufreq: ap806: add cpufreq driver for Armada 8K") Signed-off-by: Dan Carpenter Signed-off-by: Viresh Kumar --- drivers/cpufreq/armada-8k-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c index 006f4c554dd7..d96c1718f7f8 100644 --- a/drivers/cpufreq/armada-8k-cpufreq.c +++ b/drivers/cpufreq/armada-8k-cpufreq.c @@ -103,7 +103,7 @@ static void armada_8k_cpufreq_free_table(struct freq_table *freq_tables) { int opps_index, nb_cpus = num_possible_cpus(); - for (opps_index = 0 ; opps_index <= nb_cpus; opps_index++) { + for (opps_index = 0 ; opps_index < nb_cpus; opps_index++) { int i; /* If cpu_dev is NULL then we reached the end of the array */ From 4266e8fa56d3d982bf451d382a410b9db432015c Mon Sep 17 00:00:00 2001 From: tuhaowen Date: Wed, 11 Jun 2025 11:23:45 +0800 Subject: [PATCH 25/75] PM: sleep: console: Fix the black screen issue When the computer enters sleep status without a monitor connected, the system switches the console to the virtual terminal tty63(SUSPEND_CONSOLE). If a monitor is subsequently connected before waking up, the system skips the required VT restoration process during wake-up, leaving the console on tty63 instead of switching back to tty1. To fix this issue, a global flag vt_switch_done is introduced to record whether the system has successfully switched to the suspend console via vt_move_to_console() during suspend. If the switch was completed, vt_switch_done is set to 1. Later during resume, this flag is checked to ensure that the original console is restored properly by calling vt_move_to_console(orig_fgconsole, 0). This prevents scenarios where the resume logic skips console restoration due to incorrect detection of the console state, especially when a monitor is reconnected before waking up. 
Signed-off-by: tuhaowen Link: https://patch.msgid.link/20250611032345.29962-1-tuhaowen@uniontech.com Signed-off-by: Rafael J. Wysocki --- kernel/power/console.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/power/console.c b/kernel/power/console.c index fcdf0e14a47d..19c48aa5355d 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -16,6 +16,7 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; +static bool vt_switch_done; static DEFINE_MUTEX(vt_switch_mutex); @@ -136,17 +137,21 @@ void pm_prepare_console(void) if (orig_fgconsole < 0) return; + vt_switch_done = true; + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); return; } void pm_restore_console(void) { - if (!pm_vt_switch()) + if (!pm_vt_switch() && !vt_switch_done) return; if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); } + + vt_switch_done = false; } From 5e8be76a7c37b98876704cf211ac0ab674304f4f Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Tue, 17 Jun 2025 16:46:50 +0800 Subject: [PATCH 26/75] PM: sleep: Drop superfluous might_sleep() calls Drop superfluous might_sleep() calls from dpm_resume(), dpm_complete(), and dpm_prepare(). These functions already invoke primitives that implicitly check for sleep in atomic context: - dpm_resume() and dpm_complete() invoke mutex_lock(), which internally triggers might_sleep(). - dpm_prepare() calls wait_for_device_probe(), which internally uses flush_work(), and thus might_sleep(). These annotations are unnecessary and can be dropped to reduce clutter. Signed-off-by: Zhongqiu Han Link: https://patch.msgid.link/20250617084650.341262-1-quic_zhonhan@quicinc.com Signed-off-by: Rafael J. 
Wysocki --- drivers/base/power/main.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index bf77d28e959f..e6fc52b85295 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1095,7 +1095,6 @@ void dpm_resume(pm_message_t state) ktime_t starttime = ktime_get(); trace_suspend_resume(TPS("dpm_resume"), state.event, true); - might_sleep(); pm_transition = state; async_error = 0; @@ -1198,7 +1197,6 @@ void dpm_complete(pm_message_t state) struct list_head list; trace_suspend_resume(TPS("dpm_complete"), state.event, true); - might_sleep(); INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); @@ -2110,7 +2108,6 @@ int dpm_prepare(pm_message_t state) int error = 0; trace_suspend_resume(TPS("dpm_prepare"), state.event, true); - might_sleep(); /* * Give a chance for the known devices to complete their probes, before From ed18738fff025df2a424d3b21e895992e6cb230a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Jun 2025 14:54:39 +0200 Subject: [PATCH 27/75] PM: sleep: Make async resume handle consumers like children Avoid starting "async" resume processing upfront for devices that have suppliers and start "async" resume processing for a device's consumers right after resuming the device itself. Suggested-by: Saravana Kannan Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Reviewed-by: Sudeep Holla Link: https://patch.msgid.link/3378088.aeNJFYEL58@rjwysocki.net --- drivers/base/power/main.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e6fc52b85295..d7ef5048a452 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -647,14 +647,27 @@ static void dpm_async_resume_children(struct device *dev, async_func_t func) /* * Start processing "async" children of the device unless it's been * started already for them. 
- * - * This could have been done for the device's "async" consumers too, but - * they either need to wait for their parents or the processing has - * already started for them after their parents were processed. */ device_for_each_child(dev, func, dpm_async_with_cleanup); } +static void dpm_async_resume_subordinate(struct device *dev, async_func_t func) +{ + struct device_link *link; + int idx; + + dpm_async_resume_children(dev, func); + + idx = device_links_read_lock(); + + /* Start processing the device's "async" consumers. */ + list_for_each_entry_rcu(link, &dev->links.consumers, s_node) + if (READ_ONCE(link->status) != DL_STATE_DORMANT) + dpm_async_with_cleanup(link->consumer, func); + + device_links_read_unlock(idx); +} + static void dpm_clear_async_state(struct device *dev) { reinit_completion(&dev->power.completion); @@ -663,7 +676,14 @@ static void dpm_clear_async_state(struct device *dev) static bool dpm_root_device(struct device *dev) { - return !dev->parent; + lockdep_assert_held(&dpm_list_mtx); + + /* + * Since this function is required to run under dpm_list_mtx, the + * list_empty() below will only return true if the device's list of + * consumers is actually empty before calling it. + */ + return !dev->parent && list_empty(&dev->links.suppliers); } static void async_resume_noirq(void *data, async_cookie_t cookie); @@ -752,7 +772,7 @@ static void device_resume_noirq(struct device *dev, pm_message_t state, bool asy pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } - dpm_async_resume_children(dev, async_resume_noirq); + dpm_async_resume_subordinate(dev, async_resume_noirq); } static void async_resume_noirq(void *data, async_cookie_t cookie) @@ -895,7 +915,7 @@ static void device_resume_early(struct device *dev, pm_message_t state, bool asy pm_dev_err(dev, state, async ? 
" async early" : " early", error); } - dpm_async_resume_children(dev, async_resume_early); + dpm_async_resume_subordinate(dev, async_resume_early); } static void async_resume_early(void *data, async_cookie_t cookie) @@ -1071,7 +1091,7 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) pm_dev_err(dev, state, async ? " async" : "", error); } - dpm_async_resume_children(dev, async_resume); + dpm_async_resume_subordinate(dev, async_resume); } static void async_resume(void *data, async_cookie_t cookie) From 06799631d52261162d356623d14381d9f30223dc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 23 Jun 2025 14:55:05 +0200 Subject: [PATCH 28/75] PM: sleep: Make async suspend handle suppliers like parents Avoid starting "async" suspend processing upfront for devices that have consumers and start "async" suspend processing for a device's suppliers right after suspending the device itself. Suggested-by: Saravana Kannan Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Reviewed-by: Sudeep Holla Link: https://patch.msgid.link/3384525.44csPzL39Z@rjwysocki.net --- drivers/base/power/main.c | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d7ef5048a452..04feac1a7059 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1276,10 +1276,15 @@ static bool dpm_leaf_device(struct device *dev) return false; } - return true; + /* + * Since this function is required to run under dpm_list_mtx, the + * list_empty() below will only return true if the device's list of + * consumers is actually empty before calling it. 
+ */ + return list_empty(&dev->links.consumers); } -static void dpm_async_suspend_parent(struct device *dev, async_func_t func) +static bool dpm_async_suspend_parent(struct device *dev, async_func_t func) { guard(mutex)(&dpm_list_mtx); @@ -1291,11 +1296,31 @@ static void dpm_async_suspend_parent(struct device *dev, async_func_t func) * deleted before it. */ if (!device_pm_initialized(dev)) - return; + return false; /* Start processing the device's parent if it is "async". */ if (dev->parent) dpm_async_with_cleanup(dev->parent, func); + + return true; +} + +static void dpm_async_suspend_superior(struct device *dev, async_func_t func) +{ + struct device_link *link; + int idx; + + if (!dpm_async_suspend_parent(dev, func)) + return; + + idx = device_links_read_lock(); + + /* Start processing the device's "async" suppliers. */ + list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) + if (READ_ONCE(link->status) != DL_STATE_DORMANT) + dpm_async_with_cleanup(link->supplier, func); + + device_links_read_unlock(idx); } /** @@ -1419,7 +1444,7 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool asy if (error || async_error) return error; - dpm_async_suspend_parent(dev, async_suspend_noirq); + dpm_async_suspend_superior(dev, async_suspend_noirq); return 0; } @@ -1615,7 +1640,7 @@ static int device_suspend_late(struct device *dev, pm_message_t state, bool asyn if (error || async_error) return error; - dpm_async_suspend_parent(dev, async_suspend_late); + dpm_async_suspend_superior(dev, async_suspend_late); return 0; } @@ -1906,7 +1931,7 @@ static int device_suspend(struct device *dev, pm_message_t state, bool async) if (error || async_error) return error; - dpm_async_suspend_parent(dev, async_suspend); + dpm_async_suspend_superior(dev, async_suspend); return 0; } From 9047685cfd2911c36ce89a16270aafa71057c507 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 26 Jun 2025 18:42:44 +0300 Subject: [PATCH 29/75] PM: Don't use "proxy" headers 
Update header inclusions to follow IWYU (Include What You Use) principle. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250626154244.324265-1-andriy.shevchenko@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/pm.h b/include/linux/pm.h index f0bd8fbae4f2..938b1b446a5d 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -8,14 +8,15 @@ #ifndef _LINUX_PM_H #define _LINUX_PM_H -#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include /* * Callbacks for platform drivers to implement. From 200046d827188f878a61c01539c4315370577c73 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Jun 2025 21:06:34 +0200 Subject: [PATCH 30/75] PM: Use true/false as power.needs_force_resume values Since power.needs_force_resume is a bool field, use true/false as its values instead of 1/0, respectively. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/2254988.irdbgypaU6@rjwysocki.net --- drivers/base/power/runtime.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index c55a7c70bc1a..7dd815ab83a0 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1827,7 +1827,7 @@ void pm_runtime_init(struct device *dev) dev->power.request_pending = false; dev->power.request = RPM_REQ_NONE; dev->power.deferred_resume = false; - dev->power.needs_force_resume = 0; + dev->power.needs_force_resume = false; INIT_WORK(&dev->power.work, pm_runtime_work); dev->power.timer_expires = 0; @@ -1997,7 +1997,7 @@ int pm_runtime_force_suspend(struct device *dev) pm_runtime_set_suspended(dev); } else { __update_runtime_status(dev, RPM_SUSPENDED); - dev->power.needs_force_resume = 1; + dev->power.needs_force_resume = true; } return 0; @@ -2047,7 +2047,7 @@ int pm_runtime_force_resume(struct device *dev) pm_runtime_mark_last_busy(dev); out: - dev->power.needs_force_resume = 0; + dev->power.needs_force_resume = false; pm_runtime_enable(dev); return ret; } From c021c1b38f90d639423c1369625daa703a8472ea Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Jun 2025 21:08:48 +0200 Subject: [PATCH 31/75] PM: Move two sleep-related functions under CONFIG_PM_SLEEP Since pm_runtime_force_resume() and pm_runtime_need_not_resume() are only needed for handling system-wide PM transitions, there is no reason to compile them in if CONFIG_PM_SLEEP is unset. Accordingly, move them under CONFIG_PM_SLEEP and make the static inline stub for pm_runtime_force_resume() return an error to indicate that it should not be used outside CONFIG_PM_SLEEP. Putting pm_runtime_force_resume() also allows subsequent changes to be more straightforward because this function is going to access a device PM flag that is only defined when CONFIG_PM_SLEEP is set. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/3384523.aeNJFYEL58@rjwysocki.net --- drivers/base/power/runtime.c | 18 +++++++++++------- include/linux/pm_runtime.h | 16 ++++++++++++---- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 7dd815ab83a0..fe1f7cc663ac 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1941,13 +1941,6 @@ void pm_runtime_drop_link(struct device_link *link) pm_request_idle(link->supplier); } -bool pm_runtime_need_not_resume(struct device *dev) -{ - return atomic_read(&dev->power.usage_count) <= 1 && - (atomic_read(&dev->power.child_count) == 0 || - dev->power.ignore_children); -} - /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. @@ -2009,6 +2002,8 @@ int pm_runtime_force_suspend(struct device *dev) } EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); +#ifdef CONFIG_PM_SLEEP + /** * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. 
@@ -2052,3 +2047,12 @@ int pm_runtime_force_resume(struct device *dev) return ret; } EXPORT_SYMBOL_GPL(pm_runtime_force_resume); + +bool pm_runtime_need_not_resume(struct device *dev) +{ + return atomic_read(&dev->power.usage_count) <= 1 && + (atomic_read(&dev->power.child_count) == 0 || + dev->power.ignore_children); +} + +#endif /* CONFIG_PM_SLEEP */ diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index e7cb70fcc0af..9bea07f22041 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -66,9 +66,7 @@ static inline bool queue_pm_work(struct work_struct *work) extern int pm_generic_runtime_suspend(struct device *dev); extern int pm_generic_runtime_resume(struct device *dev); -extern bool pm_runtime_need_not_resume(struct device *dev); extern int pm_runtime_force_suspend(struct device *dev); -extern int pm_runtime_force_resume(struct device *dev); extern int __pm_runtime_idle(struct device *dev, int rpmflags); extern int __pm_runtime_suspend(struct device *dev, int rpmflags); @@ -257,9 +255,7 @@ static inline bool queue_pm_work(struct work_struct *work) { return false; } static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; } static inline int pm_generic_runtime_resume(struct device *dev) { return 0; } -static inline bool pm_runtime_need_not_resume(struct device *dev) {return true; } static inline int pm_runtime_force_suspend(struct device *dev) { return 0; } -static inline int pm_runtime_force_resume(struct device *dev) { return 0; } static inline int __pm_runtime_idle(struct device *dev, int rpmflags) { @@ -330,6 +326,18 @@ static inline void pm_runtime_release_supplier(struct device_link *link) {} #endif /* !CONFIG_PM */ +#ifdef CONFIG_PM_SLEEP + +bool pm_runtime_need_not_resume(struct device *dev); +int pm_runtime_force_resume(struct device *dev); + +#else /* !CONFIG_PM_SLEEP */ + +static inline bool pm_runtime_need_not_resume(struct device *dev) {return true; } +static inline int 
pm_runtime_force_resume(struct device *dev) { return -ENXIO; } + +#endif /* CONFIG_PM_SLEEP */ + /** * pm_runtime_idle - Conditionally set up autosuspend of a device or suspend it. * @dev: Target device. From e21bd84c2f1dd2900adb343a796bc88101ce48d0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Jun 2025 21:15:59 +0200 Subject: [PATCH 32/75] PM: Make pm_runtime_force_resume() work with DPM_FLAG_SMART_SUSPEND Currently, drivers using pm_runtime_force_suspend/resume() cannot set DPM_FLAG_SMART_SUSPEND because the devices with that flag set may need to be resumed during system-wide resume regardless of whether or not they have power.needs_force_resume set. That can happen due to a dependency resolved at the beginning of a system-wide resume transition (for instance, a bus type or PM domain has decided to resume a subordinate device with DPM_FLAG_SMART_SUSPEND and its parent and suppliers also need to be resumed). To overcome this limitation, modify pm_runtime_force_resume() to check the device's power.smart_suspend flag (which is set for devices with DPM_FLAG_SMART_SUSPEND set that meet some additional requirements) and the device's runtime PM status in addition to power.needs_force_resume. Also change it to clear power.smart_suspend to ensure that it will not handle the same device twice during one transition. The underlying observation is that there are two cases in which the device needs to be resumed by pm_runtime_force_resume(). One of them is when the device has power.needs_force_resume set, which means that pm_runtime_force_suspend() has suspended it and decided that it should be resumed during the subsequent system resume. The other one is when power.smart_suspend is set and the device's runtime PM status is RPM_ACTIVE. Update kerneldoc comments in accordance with the code changes. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/3662906.iIbC2pHGDl@rjwysocki.net --- drivers/base/power/runtime.c | 38 +++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index fe1f7cc663ac..e08500b72cc3 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1957,10 +1957,6 @@ void pm_runtime_drop_link(struct device_link *link) * sure the device is put into low power state and it should only be used during * system-wide PM transitions to sleep states. It assumes that the analogous * pm_runtime_force_resume() will be used to resume the device. - * - * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent - * state where this function has called the ->runtime_suspend callback but the - * PM core marks the driver as runtime active. */ int pm_runtime_force_suspend(struct device *dev) { @@ -2008,20 +2004,28 @@ EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * - * Prior invoking this function we expect the user to have brought the device - * into low power state by a call to pm_runtime_force_suspend(). Here we reverse - * those actions and bring the device into full power, if it is expected to be - * used on system resume. In the other case, we defer the resume to be managed - * via runtime PM. + * This function expects that either pm_runtime_force_suspend() has put the + * device into a low-power state prior to calling it, or the device had been + * runtime-suspended before the preceding system-wide suspend transition and it + * was left in suspend during that transition. * - * Typically this function may be invoked from a system resume callback. 
+ * The actions carried out by pm_runtime_force_suspend(), or by a runtime + * suspend in general, are reversed and the device is brought back into full + * power if it is expected to be used on system resume, which is the case when + * its needs_force_resume flag is set or when its smart_suspend flag is set and + * its runtime PM status is "active". + * + * In other cases, the resume is deferred to be managed via runtime PM. + * + * Typically, this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { int (*callback)(struct device *); int ret = 0; - if (!dev->power.needs_force_resume) + if (!dev->power.needs_force_resume && (!dev_pm_smart_suspend(dev) || + pm_runtime_status_suspended(dev))) goto out; /* @@ -2041,8 +2045,20 @@ int pm_runtime_force_resume(struct device *dev) } pm_runtime_mark_last_busy(dev); + out: + /* + * The smart_suspend flag can be cleared here because it is not going + * to be necessary until the next system-wide suspend transition that + * will update it again. + */ + dev->power.smart_suspend = false; + /* + * Also clear needs_force_resume to make this function skip devices that + * have been seen by it once. + */ dev->power.needs_force_resume = false; + pm_runtime_enable(dev); return ret; } From 89d9cec3b1e9c49bae9375a2db6dc49bc7468af0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Jun 2025 21:16:05 +0200 Subject: [PATCH 33/75] PM: runtime: Clear power.needs_force_resume in pm_runtime_reinit() Clear power.needs_force_resume in pm_runtime_reinit() in case it has been set by pm_runtime_force_suspend() invoked from a driver remove callback. Suggested-by: Ulf Hansson Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/9495163.CDJkKcVGEf@rjwysocki.net --- drivers/base/power/runtime.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index e08500b72cc3..d863b13c2e05 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1854,6 +1854,11 @@ void pm_runtime_reinit(struct device *dev) pm_runtime_put(dev->parent); } } + /* + * Clear power.needs_force_resume in case it has been set by + * pm_runtime_force_suspend() invoked from a driver remove callback. + */ + dev->power.needs_force_resume = false; } /** From ab5ce09709b5f3cc73124bd1f2d6de06c1a4b6be Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Jun 2025 21:19:31 +0200 Subject: [PATCH 34/75] PM: Check power.needs_force_resume in pm_runtime_force_suspend() Add a power.needs_force_resume check to pm_runtime_force_suspend() so it need not rely on the runtime PM status of the device when deciding whether or not to return early. With the new check in place, pm_runtime_force_suspend() will also skip devices with the runtime PM status equal to RPM_ACTIVE if they have power.needs_force_resume set, so it won't need to change the RPM status of the device to RPM_SUSPENDED in addition to setting power.needs_force_resume in the case when pm_runtime_need_not_resume() returns false. That allows the runtime PM status update to be removed from pm_runtime_force_resume(), so the runtime PM status remains unchanged between the pm_runtime_force_suspend() and pm_runtime_force_resume() calls. 
This change potentially unbreaks drivers that call pm_runtime_force_suspend() from their ->remove() callbacks because currently, if the device being unbound from its driver has a parent with enabled runtime PM and/or (possibly) device links respecting runtime PM to suppliers, and it is RPM_ACTIVE when the remove takes place, pm_runtime_force_suspend() will not drop the parent's child count and the suppliers' runtime PM usage counters after force-suspending the device unless pm_runtime_need_not_resume() returns 'true' for it. Moreover, because pm_runtime_force_suspend() changes the device's runtime PM status to RPM_SUSPENDED, in the above case pm_runtime_reinit() will not cause those counters to drop, so they will remain nonzero forever effectively preventing the devices in question from runtime-suspending going forward. This change is also needed for pm_runtime_force_suspend() to work with PCI PM and ACPI PM after subsequent changes. Namely, say DPM_FLAG_SMART_SUSPEND is set for a PCI device and its driver uses pm_runtime_force_suspend() as its ->suspend() callback. If pm_runtime_force_suspend() changed the runtime PM status of the device to RPM_SUSPENDED, pci_pm_suspend_noirq() would skip the device due to the dev_pm_skip_suspend() check. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/1855933.VLH7GnMWUR@rjwysocki.net --- drivers/base/power/runtime.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index d863b13c2e05..f61b7fa183e0 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1969,7 +1969,7 @@ int pm_runtime_force_suspend(struct device *dev) int ret; pm_runtime_disable(dev); - if (pm_runtime_status_suspended(dev)) + if (pm_runtime_status_suspended(dev) || dev->power.needs_force_resume) return 0; callback = RPM_GET_CALLBACK(dev, runtime_suspend); @@ -1984,15 +1984,16 @@ int pm_runtime_force_suspend(struct device *dev) /* * If the device can stay in suspend after the system-wide transition * to the working state that will follow, drop the children counter of - * its parent, but set its status to RPM_SUSPENDED anyway in case this - * function will be called again for it in the meantime. + * its parent and the usage counters of its suppliers. Otherwise, set + * power.needs_force_resume to let pm_runtime_force_resume() know that + * the device needs to be taken care of and to prevent this function + * from handling the device again in case the device is passed to it + * once more subsequently. */ - if (pm_runtime_need_not_resume(dev)) { + if (pm_runtime_need_not_resume(dev)) pm_runtime_set_suspended(dev); - } else { - __update_runtime_status(dev, RPM_SUSPENDED); + else dev->power.needs_force_resume = true; - } return 0; @@ -2033,12 +2034,6 @@ int pm_runtime_force_resume(struct device *dev) pm_runtime_status_suspended(dev))) goto out; - /* - * The value of the parent's children counter is correct already, so - * just update the status of the device. 
- */ - __update_runtime_status(dev, RPM_ACTIVE); - callback = RPM_GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); From 2b2dcf08116d6e96a446c8f3216b2479701e39aa Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Jun 2025 21:20:40 +0200 Subject: [PATCH 35/75] PM: runtime: Introduce __rpm_get_driver_callback() Add a special function for computing the address of the runtime PM callback given by an offset relative to the start of the device driver's struct dev_pm_ops and use it to obtain the driver callback in __rpm_get_callback(). Also put the shared part of the callback address computation into a separate helper function to avoid code duplication and explicit pointer type casts. The new __rpm_get_driver_callback() will be used subsequently for implementing callback lookup in pm_runtime_force_suspend/resume(). No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/2054356.usQuhbGJ8B@rjwysocki.net --- drivers/base/power/runtime.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index f61b7fa183e0..8cd1a4db5e84 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -19,10 +19,24 @@ typedef int (*pm_callback_t)(struct device *); +static inline pm_callback_t get_callback_ptr(const void *start, size_t offset) +{ + return *(pm_callback_t *)(start + offset); +} + +static pm_callback_t __rpm_get_driver_callback(struct device *dev, + size_t cb_offset) +{ + if (dev->driver && dev->driver->pm) + return get_callback_ptr(dev->driver->pm, cb_offset); + + return NULL; +} + static pm_callback_t __rpm_get_callback(struct device *dev, size_t cb_offset) { - pm_callback_t cb; const struct dev_pm_ops *ops; + pm_callback_t cb = NULL; if (dev->pm_domain) ops = &dev->pm_domain->ops; @@ -36,12 +50,10 @@ static pm_callback_t __rpm_get_callback(struct device 
*dev, size_t cb_offset) ops = NULL; if (ops) - cb = *(pm_callback_t *)((void *)ops + cb_offset); - else - cb = NULL; + cb = get_callback_ptr(ops, cb_offset); - if (!cb && dev->driver && dev->driver->pm) - cb = *(pm_callback_t *)((void *)dev->driver->pm + cb_offset); + if (!cb) + cb = __rpm_get_driver_callback(dev, cb_offset); return cb; } From ffda4ca4608ea811aee2aace211bbf27c68a8853 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Jun 2025 21:23:42 +0200 Subject: [PATCH 36/75] PM: sleep: Add strict_midlayer flag to struct dev_pm_info Add a new flag, called strict_midlayer, to struct dev_pm_info, along with helper functions for updating and reading its value, to allow middle layer code that provides proper callbacks for device suspend-resume during system-wide PM transitions to let pm_runtime_force_suspend() and pm_runtime_force_resume() know that they should only invoke runtime PM callbacks coming from the device's driver. Namely, if this flag is set, pm_runtime_force_suspend() and pm_runtime_force_resume() will invoke runtime PM callbacks provided by the device's driver directly with the assumption that they have been called via a middle layer callback for device suspend or resume, respectively. For instance, acpi_general_pm_domain provides specific callback functions for system suspend, acpi_subsys_suspend(), acpi_subsys_suspend_late() and acpi_subsys_suspend_noirq(), and it does not expect its runtime suspend callback function, acpi_subsys_runtime_suspend(), to be invoked at any point during system suspend. In particular, it does not expect that function to be called from within any of the system suspend callback functions mentioned above which would happen if a device driver collaborating with acpi_general_pm_domain used pm_runtime_force_suspend() as its callback function for any system suspend phase later than "prepare". 
The new flag allows this expectation of acpi_general_pm_domain to be formally expressed, which is going to be done subsequently. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/24017035.6Emhk5qWAg@rjwysocki.net --- drivers/base/power/runtime.c | 21 +++++++++++++++++++-- include/linux/device.h | 27 +++++++++++++++++++++++++++ include/linux/pm.h | 1 + 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 8cd1a4db5e84..05ff3d2209e6 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1958,6 +1958,23 @@ void pm_runtime_drop_link(struct device_link *link) pm_request_idle(link->supplier); } +static pm_callback_t get_callback(struct device *dev, size_t cb_offset) +{ + /* + * Setting power.strict_midlayer means that the middle layer + * code does not want its runtime PM callbacks to be invoked via + * pm_runtime_force_suspend() and pm_runtime_force_resume(), so + * return a direct pointer to the driver callback in that case. + */ + if (dev_pm_strict_midlayer_is_set(dev)) + return __rpm_get_driver_callback(dev, cb_offset); + + return __rpm_get_callback(dev, cb_offset); +} + +#define GET_CALLBACK(dev, callback) \ + get_callback(dev, offsetof(struct dev_pm_ops, callback)) + /** * pm_runtime_force_suspend - Force a device into suspend state if needed. * @dev: Device to suspend. @@ -1984,7 +2001,7 @@ int pm_runtime_force_suspend(struct device *dev) if (pm_runtime_status_suspended(dev) || dev->power.needs_force_resume) return 0; - callback = RPM_GET_CALLBACK(dev, runtime_suspend); + callback = GET_CALLBACK(dev, runtime_suspend); dev_pm_enable_wake_irq_check(dev, true); ret = callback ? 
callback(dev) : 0; @@ -2046,7 +2063,7 @@ int pm_runtime_force_resume(struct device *dev) pm_runtime_status_suspended(dev))) goto out; - callback = RPM_GET_CALLBACK(dev, runtime_resume); + callback = GET_CALLBACK(dev, runtime_resume); dev_pm_disable_wake_irq_check(dev, false); ret = callback ? callback(dev) : 0; diff --git a/include/linux/device.h b/include/linux/device.h index 4940db137fff..5137f9d213ec 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -879,6 +879,33 @@ static inline bool dev_pm_smart_suspend(struct device *dev) #endif } +/* + * dev_pm_set_strict_midlayer - Update the device's power.strict_midlayer flag + * @dev: Target device. + * @val: New flag value. + * + * When set, power.strict_midlayer means that the middle layer power management + * code (typically, a bus type or a PM domain) does not expect its runtime PM + * suspend callback to be invoked at all during system-wide PM transitions and + * it does not expect its runtime PM resume callback to be invoked at any point + * when runtime PM is disabled for the device during system-wide PM transitions. + */ +static inline void dev_pm_set_strict_midlayer(struct device *dev, bool val) +{ +#ifdef CONFIG_PM_SLEEP + dev->power.strict_midlayer = val; +#endif +} + +static inline bool dev_pm_strict_midlayer_is_set(struct device *dev) +{ +#ifdef CONFIG_PM_SLEEP + return dev->power.strict_midlayer; +#else + return false; +#endif +} + static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); diff --git a/include/linux/pm.h b/include/linux/pm.h index f0bd8fbae4f2..4149d45f6f76 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -683,6 +683,7 @@ struct dev_pm_info { bool smart_suspend:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ + bool strict_midlayer:1; #else bool should_wakeup:1; #endif From 325e3778eac3916f3451f8ceccafdc31427ccdd1 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 27 Jun 2025 21:25:22 +0200 Subject: [PATCH 37/75] ACPI: PM: Set/clear power.strict_midlayer in prepare/complete The ACPI general PM domain and the LPSS PM domain do not expect their mid-layer runtime PM suspend callbacks to be invoked at all during system-wide suspend and resume and they do not expect their runtime resume callbacks to be invoked at any point when runtime PM is disabled for the given device during system suspend and resume, so make acpi_subsys_prepare() set power.strict_midlayer for the given device to express that expectation and make acpi_subsys_complete() clear it. Signed-off-by: Rafael J. Wysocki Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/4463062.ejJDZkT8p0@rjwysocki.net --- drivers/acpi/device_pm.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index dbd4446025ec..9a4d059f1d23 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -1119,6 +1119,8 @@ int acpi_subsys_prepare(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); + dev_pm_set_strict_midlayer(dev, true); + if (dev->driver && dev->driver->pm && dev->driver->pm->prepare) { int ret = dev->driver->pm->prepare(dev); @@ -1147,6 +1149,8 @@ void acpi_subsys_complete(struct device *dev) */ if (pm_runtime_suspended(dev) && pm_resume_via_firmware()) pm_request_resume(dev); + + dev_pm_set_strict_midlayer(dev, false); } EXPORT_SYMBOL_GPL(acpi_subsys_complete); From f19dc0489ed52f527e9b198b86b0b807ebbfa4e5 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Fri, 27 Jun 2025 21:29:16 +0200 Subject: [PATCH 38/75] PCI/PM: Set/clear power.strict_midlayer in prepare/complete The PCI bus type does not expect its runtime PM suspend callback function, pci_pm_runtime_suspend(), to be invoked at all during system-wide suspend and resume, and it does not expect its runtime resume callback function, pci_pm_runtime_resume(), to be invoked at any point when runtime PM is disabled for the given device during system-wide suspend and resume, so make it express that expectation by setting power.strict_midlayer for all PCI devices in pci_pm_prepare() and clearing it in pci_pm_complete(). Signed-off-by: Rafael J. Wysocki Acked-by: Bjorn Helgaas Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/1925097.atdPhlSkOF@rjwysocki.net --- drivers/pci/pci-driver.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index 67db34fd10ee..b853585cb1f8 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -708,6 +708,8 @@ static int pci_pm_prepare(struct device *dev) struct pci_dev *pci_dev = to_pci_dev(dev); const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + dev_pm_set_strict_midlayer(dev, true); + if (pm && pm->prepare) { int error = pm->prepare(dev); if (error < 0) @@ -749,6 +751,8 @@ static void pci_pm_complete(struct device *dev) if (pci_dev->current_state < pre_sleep_state) pm_request_resume(dev); } + + dev_pm_set_strict_midlayer(dev, false); } #else /* !CONFIG_PM_SLEEP */ From 46dc57406887dd02565cb264224194a6776d882b Mon Sep 17 00:00:00 2001 From: Sivan Zohar-Kotzer Date: Wed, 2 Jul 2025 01:13:55 +0300 Subject: [PATCH 39/75] powercap: dtpm_cpu: Fix NULL pointer dereference in get_pd_power_uw() The get_pd_power_uw() function can crash with a NULL pointer dereference when em_cpu_get() returns NULL. 
This occurs when a CPU becomes impossible during runtime, causing get_cpu_device() to return NULL, which propagates through em_cpu_get() and leads to a crash when em_span_cpus() dereferences the NULL pointer. Add a NULL check after em_cpu_get() and return 0 if unavailable, matching the existing fallback behavior in __dtpm_cpu_setup(). Fixes: eb82bace8931 ("powercap/drivers/dtpm: Scale the power with the load") Signed-off-by: Sivan Zohar-Kotzer Link: https://patch.msgid.link/20250701221355.96916-1-sivany32@gmail.com [ rjw: Drop an excess empty code line ] Signed-off-by: Rafael J. Wysocki --- drivers/powercap/dtpm_cpu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/powercap/dtpm_cpu.c b/drivers/powercap/dtpm_cpu.c index 6b6f51b21550..99390ec1481f 100644 --- a/drivers/powercap/dtpm_cpu.c +++ b/drivers/powercap/dtpm_cpu.c @@ -96,6 +96,8 @@ static u64 get_pd_power_uw(struct dtpm *dtpm) int i; pd = em_cpu_get(dtpm_cpu->cpu); + if (!pd) + return 0; pd_mask = em_span_cpus(pd); From d42c7c6fd66a6e2a78ae1da666c5df6c2fde8389 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 3 Jul 2025 14:27:06 +0300 Subject: [PATCH 40/75] PM: domains: Add flags to specify power on attach/detach Calling dev_pm_domain_attach()/dev_pm_domain_detach() in bus driver probe/remove functions can affect system behavior when the drivers attached to the bus use devres-managed resources. Since devres actions may need to access device registers, calling dev_pm_domain_detach() too early, i.e., before these actions complete, can cause failures on some systems. One such example is Renesas RZ/G3S SoC-based platforms. If the device clocks are managed via PM domains, invoking dev_pm_domain_detach() in the bus driver's remove function removes the device's clocks from the PM domain, preventing any subsequent pm_runtime_resume*() calls from enabling those clocks. The second argument of dev_pm_domain_attach() specifies whether the PM domain should be powered on during attachment. 
Likewise, the second argument of dev_pm_domain_detach() indicates whether the domain should be powered off during detachment. Upcoming changes address the issue described above (initially for the platform bus only) by deferring the call to dev_pm_domain_detach() until after devres_release_all() in device_unbind_cleanup(). The detach_power_off field in struct dev_pm_info stores the detach power off info from the second argument of dev_pm_domain_attach(). Because there are cases where the device's PM domain power-on/off behavior must be conditional (e.g., in i2c_device_probe()), the patch introduces PD_FLAG_ATTACH_POWER_ON and PD_FLAG_DETACH_POWER_OFF flags to be passed to dev_pm_domain_attach(). Finally, dev_pm_domain_attach() and its users are updated to use the newly introduced PD_FLAG_ATTACH_POWER_ON and PD_FLAG_DETACH_POWER_OFF macros. This change is preparatory. Signed-off-by: Claudiu Beznea Reviewed-by: Mathieu Poirier Acked-by: Wolfram Sang # I2C Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/20250703112708.1621607-2-claudiu.beznea.uj@bp.renesas.com [ rjw: Changelog adjustments ] Signed-off-by: Rafael J. 
Wysocki --- drivers/amba/bus.c | 4 ++-- drivers/base/auxiliary.c | 2 +- drivers/base/platform.c | 2 +- drivers/base/power/common.c | 6 +++--- drivers/clk/qcom/apcs-sdx55.c | 2 +- drivers/gpu/drm/display/drm_dp_aux_bus.c | 2 +- drivers/i2c/i2c-core-base.c | 2 +- drivers/mmc/core/sdio_bus.c | 2 +- drivers/rpmsg/rpmsg_core.c | 2 +- drivers/soundwire/bus_type.c | 2 +- drivers/spi/spi.c | 2 +- drivers/tty/serdev/core.c | 2 +- include/linux/pm_domain.h | 10 ++++++++-- 13 files changed, 23 insertions(+), 17 deletions(-) diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 71482d639a6d..74e34a07ef72 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -138,7 +138,7 @@ static int amba_read_periphid(struct amba_device *dev) void __iomem *tmp; int i, ret; - ret = dev_pm_domain_attach(&dev->dev, true); + ret = dev_pm_domain_attach(&dev->dev, PD_FLAG_ATTACH_POWER_ON); if (ret) { dev_dbg(&dev->dev, "can't get PM domain: %d\n", ret); goto err_out; @@ -291,7 +291,7 @@ static int amba_probe(struct device *dev) if (ret < 0) break; - ret = dev_pm_domain_attach(dev, true); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); if (ret) break; diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index dba7c8e13a53..44cd3f85b659 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -217,7 +217,7 @@ static int auxiliary_bus_probe(struct device *dev) struct auxiliary_device *auxdev = to_auxiliary_dev(dev); int ret; - ret = dev_pm_domain_attach(dev, true); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); if (ret) { dev_warn(dev, "Failed to attach to PM Domain : %d\n", ret); return ret; diff --git a/drivers/base/platform.c b/drivers/base/platform.c index 075ec1d1b73a..df1ec34fdf56 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1396,7 +1396,7 @@ static int platform_probe(struct device *_dev) if (ret < 0) return ret; - ret = dev_pm_domain_attach(_dev, true); + ret = dev_pm_domain_attach(_dev, 
PD_FLAG_ATTACH_POWER_ON); if (ret) goto out; diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c index 781968a128ff..fecb85fa85ac 100644 --- a/drivers/base/power/common.c +++ b/drivers/base/power/common.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data); /** * dev_pm_domain_attach - Attach a device to its PM domain. * @dev: Device to attach. - * @power_on: Used to indicate whether we should power on the device. + * @flags: indicate whether we should power on/off the device on attach/detach * * The @dev may only be attached to a single PM domain. By iterating through * the available alternatives we try to find a valid PM domain for the device. @@ -100,14 +100,14 @@ EXPORT_SYMBOL_GPL(dev_pm_put_subsys_data); * Returns 0 on successfully attached PM domain, or when it is found that the * device doesn't need a PM domain, else a negative error code. */ -int dev_pm_domain_attach(struct device *dev, bool power_on) +int dev_pm_domain_attach(struct device *dev, u32 flags) { int ret; if (dev->pm_domain) return 0; - ret = acpi_dev_pm_attach(dev, power_on); + ret = acpi_dev_pm_attach(dev, !!(flags & PD_FLAG_ATTACH_POWER_ON)); if (!ret) ret = genpd_dev_pm_attach(dev); diff --git a/drivers/clk/qcom/apcs-sdx55.c b/drivers/clk/qcom/apcs-sdx55.c index 3ba01622d8f0..90dd1f1855c2 100644 --- a/drivers/clk/qcom/apcs-sdx55.c +++ b/drivers/clk/qcom/apcs-sdx55.c @@ -111,7 +111,7 @@ static int qcom_apcs_sdx55_clk_probe(struct platform_device *pdev) * driver, there seems to be no better place to do this. So do it here! 
*/ cpu_dev = get_cpu_device(0); - ret = dev_pm_domain_attach(cpu_dev, true); + ret = dev_pm_domain_attach(cpu_dev, PD_FLAG_ATTACH_POWER_ON); if (ret) { dev_err_probe(dev, ret, "can't get PM domain: %d\n", ret); goto err; diff --git a/drivers/gpu/drm/display/drm_dp_aux_bus.c b/drivers/gpu/drm/display/drm_dp_aux_bus.c index ec7eac6b595f..718c9122bc3a 100644 --- a/drivers/gpu/drm/display/drm_dp_aux_bus.c +++ b/drivers/gpu/drm/display/drm_dp_aux_bus.c @@ -57,7 +57,7 @@ static int dp_aux_ep_probe(struct device *dev) container_of(aux_ep, struct dp_aux_ep_device_with_data, aux_ep); int ret; - ret = dev_pm_domain_attach(dev, true); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); if (ret) return dev_err_probe(dev, ret, "Failed to attach to PM Domain\n"); diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index 2ad2b1838f0f..38eabf1173da 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -573,7 +573,7 @@ static int i2c_device_probe(struct device *dev) goto err_clear_wakeup_irq; do_power_on = !i2c_acpi_waive_d0_probe(dev); - status = dev_pm_domain_attach(&client->dev, do_power_on); + status = dev_pm_domain_attach(&client->dev, do_power_on ? 
PD_FLAG_ATTACH_POWER_ON : 0); if (status) goto err_clear_wakeup_irq; diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index b66b637e2d57..656601754966 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -161,7 +161,7 @@ static int sdio_bus_probe(struct device *dev) if (!id) return -ENODEV; - ret = dev_pm_domain_attach(dev, false); + ret = dev_pm_domain_attach(dev, 0); if (ret) return ret; diff --git a/drivers/rpmsg/rpmsg_core.c b/drivers/rpmsg/rpmsg_core.c index 6ee36adcbdba..bece5e635ee9 100644 --- a/drivers/rpmsg/rpmsg_core.c +++ b/drivers/rpmsg/rpmsg_core.c @@ -479,7 +479,7 @@ static int rpmsg_dev_probe(struct device *dev) struct rpmsg_endpoint *ept = NULL; int err; - err = dev_pm_domain_attach(dev, true); + err = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); if (err) goto out; diff --git a/drivers/soundwire/bus_type.c b/drivers/soundwire/bus_type.c index 75d6f16efced..bc1e653080d9 100644 --- a/drivers/soundwire/bus_type.c +++ b/drivers/soundwire/bus_type.c @@ -101,7 +101,7 @@ static int sdw_drv_probe(struct device *dev) /* * attach to power domain but don't turn on (last arg) */ - ret = dev_pm_domain_attach(dev, false); + ret = dev_pm_domain_attach(dev, 0); if (ret) return ret; diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index 1bc0fdbb1bd7..8200b47b2295 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -427,7 +427,7 @@ static int spi_probe(struct device *dev) if (spi->irq < 0) spi->irq = 0; - ret = dev_pm_domain_attach(dev, true); + ret = dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); if (ret) return ret; diff --git a/drivers/tty/serdev/core.c b/drivers/tty/serdev/core.c index 0213381fa358..d16c207a1a9b 100644 --- a/drivers/tty/serdev/core.c +++ b/drivers/tty/serdev/core.c @@ -399,7 +399,7 @@ static int serdev_drv_probe(struct device *dev) const struct serdev_device_driver *sdrv = to_serdev_device_driver(dev->driver); int ret; - ret = dev_pm_domain_attach(dev, true); + ret = 
dev_pm_domain_attach(dev, PD_FLAG_ATTACH_POWER_ON); if (ret) return ret; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 0b18160901a2..62a35a78ce9b 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -36,10 +36,16 @@ * isn't specified, the index just follows the * index for the attached PM domain. * + * PD_FLAG_ATTACH_POWER_ON: Power on the domain during attach. + * + * PD_FLAG_DETACH_POWER_OFF: Power off the domain during detach. + * */ #define PD_FLAG_NO_DEV_LINK BIT(0) #define PD_FLAG_DEV_LINK_ON BIT(1) #define PD_FLAG_REQUIRED_OPP BIT(2) +#define PD_FLAG_ATTACH_POWER_ON BIT(3) +#define PD_FLAG_DETACH_POWER_OFF BIT(4) struct dev_pm_domain_attach_data { const char * const *pd_names; @@ -501,7 +507,7 @@ struct generic_pm_domain *of_genpd_remove_last(struct device_node *np) #endif /* CONFIG_PM_GENERIC_DOMAINS_OF */ #ifdef CONFIG_PM -int dev_pm_domain_attach(struct device *dev, bool power_on); +int dev_pm_domain_attach(struct device *dev, u32 flags); struct device *dev_pm_domain_attach_by_id(struct device *dev, unsigned int index); struct device *dev_pm_domain_attach_by_name(struct device *dev, @@ -518,7 +524,7 @@ int dev_pm_domain_start(struct device *dev); void dev_pm_domain_set(struct device *dev, struct dev_pm_domain *pd); int dev_pm_domain_set_performance_state(struct device *dev, unsigned int state); #else -static inline int dev_pm_domain_attach(struct device *dev, bool power_on) +static inline int dev_pm_domain_attach(struct device *dev, u32 flags) { return 0; } From f99508074e78fea17f06d753d9ef453b174ec98e Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 3 Jul 2025 14:27:07 +0300 Subject: [PATCH 41/75] PM: domains: Detach on device_unbind_cleanup() The dev_pm_domain_attach() function is typically used in bus code alongside dev_pm_domain_detach(), often following patterns like: static int bus_probe(struct device *_dev) { struct bus_driver *drv = to_bus_driver(dev->driver); struct bus_device *dev = 
to_bus_device(_dev); int ret; // ... ret = dev_pm_domain_attach(_dev, true); if (ret) return ret; if (drv->probe) ret = drv->probe(dev); // ... } static void bus_remove(struct device *_dev) { struct bus_driver *drv = to_bus_driver(dev->driver); struct bus_device *dev = to_bus_device(_dev); if (drv->remove) drv->remove(dev); dev_pm_domain_detach(_dev); } When the driver's probe function uses devres-managed resources that depend on the power domain state, those resources are released later during device_unbind_cleanup(). Releasing devres-managed resources that depend on the power domain state after detaching the device from its PM domain can cause failures. For example, if the driver uses devm_pm_runtime_enable() in its probe function, and the device's clocks are managed by the PM domain, then during removal the runtime PM is disabled in device_unbind_cleanup() after the clocks have been removed from the PM domain. It may happen that the devm_pm_runtime_enable() action causes the device to be runtime- resumed. If the driver specific runtime PM APIs access registers directly, this will lead to accessing device registers without clocks being enabled. Similar issues may occur with other devres actions that access device registers. Add detach_power_off member to struct dev_pm_info, to be used later in device_unbind_cleanup() as the power_off argument for dev_pm_domain_detach(). This is a preparatory step toward removing dev_pm_domain_detach() calls from bus remove functions. Since the current PM domain detach functions (genpd_dev_pm_detach() and acpi_dev_pm_detach()) already set dev->pm_domain = NULL, there should be no issues with bus drivers that still call dev_pm_domain_detach() in their remove functions. Signed-off-by: Claudiu Beznea Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/20250703112708.1621607-3-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Rafael J. 
Wysocki --- drivers/base/dd.c | 2 ++ drivers/base/power/common.c | 3 +++ include/linux/pm.h | 1 + 3 files changed, 6 insertions(+) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index b526e0e0f52d..13ab98e033ea 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -552,6 +553,7 @@ static void device_unbind_cleanup(struct device *dev) dev->dma_range_map = NULL; device_set_driver(dev, NULL); dev_set_drvdata(dev, NULL); + dev_pm_domain_detach(dev, dev->power.detach_power_off); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); pm_runtime_reinit(dev); diff --git a/drivers/base/power/common.c b/drivers/base/power/common.c index fecb85fa85ac..6ecf9ce4a4e6 100644 --- a/drivers/base/power/common.c +++ b/drivers/base/power/common.c @@ -111,6 +111,9 @@ int dev_pm_domain_attach(struct device *dev, u32 flags) if (!ret) ret = genpd_dev_pm_attach(dev); + if (dev->pm_domain) + dev->power.detach_power_off = !!(flags & PD_FLAG_DETACH_POWER_OFF); + return ret < 0 ? ret : 0; } EXPORT_SYMBOL_GPL(dev_pm_domain_attach); diff --git a/include/linux/pm.h b/include/linux/pm.h index 938b1b446a5d..14e8370887e3 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -721,6 +721,7 @@ struct dev_pm_info { struct pm_subsys_data *subsys_data; /* Owned by the subsystem. */ void (*set_latency_tolerance)(struct device *, s32); struct dev_pm_qos *qos; + bool detach_power_off:1; /* Owned by the driver core */ }; extern int dev_pm_get_subsys_data(struct device *dev); From ba2ebd52a22eb7306a2093924920a125ad91215a Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 3 Jul 2025 14:27:08 +0300 Subject: [PATCH 42/75] driver core: platform: Drop dev_pm_domain_detach() call On the Renesas RZ/G3S (and other Renesas SoCs, e.g., RZ/G2{L, LC, UL}), clocks are managed through PM domains. 
These PM domains, registered on behalf of the clock controller driver, are configured with GENPD_FLAG_PM_CLK. In most of the Renesas drivers used by RZ SoCs, the clocks are enabled/disabled using runtime PM APIs. The power domains may also have power_on/power_off support implemented. After the device PM domain is powered off, any CPU accesses to these domains lead to system aborts. During probe, devices are attached to the PM domain controlling their clocks and power. Similarly, during removal, devices are detached from the PM domain. The detachment call stack is as follows: device_driver_detach() -> device_release_driver_internal() -> __device_release_driver() -> device_remove() -> platform_remove() -> dev_pm_domain_detach() During driver unbind, after the device is detached from its PM domain, the device_unbind_cleanup() function is called, which subsequently invokes devres_release_all(). This function handles devres resource cleanup. If runtime PM is enabled in driver probe via devm_pm_runtime_enable(), the cleanup process triggers the action or reset function for disabling runtime PM. This function is pm_runtime_disable_action(), which leads to the following call stack of interest when called: pm_runtime_disable_action() -> pm_runtime_dont_use_autosuspend() -> __pm_runtime_use_autosuspend() -> update_autosuspend() -> rpm_idle() The rpm_idle() function attempts to resume the device at runtime. However, at the point it is called, the device is no longer part of a PM domain (which manages clocks and power states). If the driver implements its own runtime PM APIs for specific functionalities - such as the rzg2l_adc driver - while also relying on the power domain subsystem for power management, rpm_idle() will invoke the driver's runtime PM API. However, since the device is no longer part of a PM domain at this point, the PM domain's runtime PM APIs will not be called. This leads to system aborts on Renesas SoCs. 
Another identified case is when a subsystem performs various cleanups using device_unbind_cleanup(), calling driver-specific APIs in the process. A known example is the thermal subsystem, which may call driver- specific APIs to disable the thermal device. The relevant call stack in this case is: device_driver_detach() -> device_release_driver_internal() -> device_unbind_cleanup() -> devres_release_all() -> devm_thermal_of_zone_release() -> thermal_zone_device_disable() -> thermal_zone_device_set_mode() -> struct thermal_zone_device_ops::change_mode() At the moment the driver-specific change_mode() API is called, the device is no longer part of its PM domain. Accessing its registers without proper power management leads to system aborts. Drop the call to dev_pm_domain_detach() from the platform bus remove function and rely on the newly introduced call in device_unbind_cleanup(). This ensures the same effect, but the call now occurs after all driver-specific devres resources have been freed. Signed-off-by: Claudiu Beznea Reviewed-by: Ulf Hansson Link: https://patch.msgid.link/20250703112708.1621607-4-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Rafael J. 
Wysocki --- drivers/base/platform.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index df1ec34fdf56..09450349cf32 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -1396,15 +1396,13 @@ static int platform_probe(struct device *_dev) if (ret < 0) return ret; - ret = dev_pm_domain_attach(_dev, PD_FLAG_ATTACH_POWER_ON); + ret = dev_pm_domain_attach(_dev, PD_FLAG_ATTACH_POWER_ON | + PD_FLAG_DETACH_POWER_OFF); if (ret) goto out; - if (drv->probe) { + if (drv->probe) ret = drv->probe(dev); - if (ret) - dev_pm_domain_detach(_dev, true); - } out: if (drv->prevent_deferred_probe && ret == -EPROBE_DEFER) { @@ -1422,7 +1420,6 @@ static void platform_remove(struct device *_dev) if (drv->remove) drv->remove(dev); - dev_pm_domain_detach(_dev, true); } static void platform_shutdown(struct device *_dev) From 3254f54a3abda7080acfe51d7cbfabd7bba64d5a Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Mon, 7 Jul 2025 16:17:13 -0500 Subject: [PATCH 43/75] cpufreq: Export disable_cpufreq() This is used by the tegra124-cpufreq driver. Signed-off-by: Aaron Kling Acked-by: Rafael J. Wysocki Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d7426e1d8bdd..8487fe528de3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -109,6 +109,8 @@ void disable_cpufreq(void) { off = 1; } +EXPORT_SYMBOL_GPL(disable_cpufreq); + static DEFINE_MUTEX(cpufreq_governor_mutex); bool have_governor_per_policy(void) From d812734842f8ab3100e7cee8716b35be5a31a22a Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Mon, 7 Jul 2025 16:17:14 -0500 Subject: [PATCH 44/75] cpufreq: dt: Add register helper Cpufreq-dt currently exports no functions. This means that drivers that are based on cpufreq-dt have no way of establishing a depmod dependency on it. This helper allows that link. 
Signed-off-by: Aaron Kling Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt.c | 11 +++++++++++ drivers/cpufreq/cpufreq-dt.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index e80dd982a3e2..506437489b4d 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -329,6 +329,17 @@ static struct platform_driver dt_cpufreq_platdrv = { }; module_platform_driver(dt_cpufreq_platdrv); +struct platform_device *cpufreq_dt_pdev_register(struct device *dev) +{ + struct platform_device_info cpufreq_dt_devinfo = {}; + + cpufreq_dt_devinfo.name = "cpufreq-dt"; + cpufreq_dt_devinfo.parent = dev; + + return platform_device_register_full(&cpufreq_dt_devinfo); +} +EXPORT_SYMBOL_GPL(cpufreq_dt_pdev_register); + MODULE_ALIAS("platform:cpufreq-dt"); MODULE_AUTHOR("Viresh Kumar "); MODULE_AUTHOR("Shawn Guo "); diff --git a/drivers/cpufreq/cpufreq-dt.h b/drivers/cpufreq/cpufreq-dt.h index 28c8af7ec5ef..fc1889aeb4f1 100644 --- a/drivers/cpufreq/cpufreq-dt.h +++ b/drivers/cpufreq/cpufreq-dt.h @@ -22,4 +22,6 @@ struct cpufreq_dt_platform_data { int (*resume)(struct cpufreq_policy *policy); }; +struct platform_device *cpufreq_dt_pdev_register(struct device *dev); + #endif /* __CPUFREQ_DT_H__ */ From 0ae93389b6c84fbbc6414a5c78f50d65eea8cf35 Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Mon, 7 Jul 2025 16:17:15 -0500 Subject: [PATCH 45/75] cpufreq: tegra124: Allow building as a module This requires four changes: * Using the cpufreq-dt register helper to establish a hard dependency for depmod to track * Adding a remove routine to remove the cpufreq-dt device * Adding an exit routine to handle cleaning up the driver * Populating module license Signed-off-by: Aaron Kling Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 2 +- drivers/cpufreq/tegra124-cpufreq.c | 44 +++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/Kconfig.arm 
b/drivers/cpufreq/Kconfig.arm index 4346629d3bc0..9be0503df55a 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -237,7 +237,7 @@ config ARM_TEGRA20_CPUFREQ This adds the CPUFreq driver support for Tegra20/30 SOCs. config ARM_TEGRA124_CPUFREQ - bool "Tegra124 CPUFreq support" + tristate "Tegra124 CPUFreq support" depends on ARCH_TEGRA || COMPILE_TEST depends on CPUFREQ_DT default ARCH_TEGRA diff --git a/drivers/cpufreq/tegra124-cpufreq.c b/drivers/cpufreq/tegra124-cpufreq.c index 514146d98bca..b8bccde8b298 100644 --- a/drivers/cpufreq/tegra124-cpufreq.c +++ b/drivers/cpufreq/tegra124-cpufreq.c @@ -16,6 +16,10 @@ #include #include +#include "cpufreq-dt.h" + +static struct platform_device *tegra124_cpufreq_pdev; + struct tegra124_cpufreq_priv { struct clk *cpu_clk; struct clk *pllp_clk; @@ -55,7 +59,6 @@ static int tegra124_cpufreq_probe(struct platform_device *pdev) struct device_node *np __free(device_node) = of_cpu_device_node_get(0); struct tegra124_cpufreq_priv *priv; struct device *cpu_dev; - struct platform_device_info cpufreq_dt_devinfo = {}; int ret; if (!np) @@ -95,11 +98,7 @@ static int tegra124_cpufreq_probe(struct platform_device *pdev) if (ret) goto out_put_pllp_clk; - cpufreq_dt_devinfo.name = "cpufreq-dt"; - cpufreq_dt_devinfo.parent = &pdev->dev; - - priv->cpufreq_dt_pdev = - platform_device_register_full(&cpufreq_dt_devinfo); + priv->cpufreq_dt_pdev = cpufreq_dt_pdev_register(&pdev->dev); if (IS_ERR(priv->cpufreq_dt_pdev)) { ret = PTR_ERR(priv->cpufreq_dt_pdev); goto out_put_pllp_clk; @@ -173,6 +172,21 @@ static int __maybe_unused tegra124_cpufreq_resume(struct device *dev) return err; } +static void tegra124_cpufreq_remove(struct platform_device *pdev) +{ + struct tegra124_cpufreq_priv *priv = dev_get_drvdata(&pdev->dev); + + if (!IS_ERR(priv->cpufreq_dt_pdev)) { + platform_device_unregister(priv->cpufreq_dt_pdev); + priv->cpufreq_dt_pdev = ERR_PTR(-ENODEV); + } + + clk_put(priv->pllp_clk); + clk_put(priv->pllx_clk); + 
clk_put(priv->dfll_clk); + clk_put(priv->cpu_clk); +} + static const struct dev_pm_ops tegra124_cpufreq_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(tegra124_cpufreq_suspend, tegra124_cpufreq_resume) @@ -182,12 +196,12 @@ static struct platform_driver tegra124_cpufreq_platdrv = { .driver.name = "cpufreq-tegra124", .driver.pm = &tegra124_cpufreq_pm_ops, .probe = tegra124_cpufreq_probe, + .remove = tegra124_cpufreq_remove, }; static int __init tegra_cpufreq_init(void) { int ret; - struct platform_device *pdev; if (!(of_machine_is_compatible("nvidia,tegra124") || of_machine_is_compatible("nvidia,tegra210"))) @@ -201,15 +215,25 @@ static int __init tegra_cpufreq_init(void) if (ret) return ret; - pdev = platform_device_register_simple("cpufreq-tegra124", -1, NULL, 0); - if (IS_ERR(pdev)) { + tegra124_cpufreq_pdev = platform_device_register_simple("cpufreq-tegra124", -1, NULL, 0); + if (IS_ERR(tegra124_cpufreq_pdev)) { platform_driver_unregister(&tegra124_cpufreq_platdrv); - return PTR_ERR(pdev); + return PTR_ERR(tegra124_cpufreq_pdev); } return 0; } module_init(tegra_cpufreq_init); +static void __exit tegra_cpufreq_module_exit(void) +{ + if (!IS_ERR_OR_NULL(tegra124_cpufreq_pdev)) + platform_device_unregister(tegra124_cpufreq_pdev); + + platform_driver_unregister(&tegra124_cpufreq_platdrv); +} +module_exit(tegra_cpufreq_module_exit); + MODULE_AUTHOR("Tuomas Tynkkynen "); MODULE_DESCRIPTION("cpufreq driver for NVIDIA Tegra124"); +MODULE_LICENSE("GPL"); From 01d40d3c146449e8538bfffc65e90bfbfc2a99e8 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Thu, 10 Jul 2025 12:26:56 +0200 Subject: [PATCH 46/75] Documentation: power: Remove info about non-existing QoS interfaces cpu_latency_qos_add|remove_notifier() doesn't exist, hence let's drop the documentation of them. Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20250710102656.127654-1-ulf.hansson@linaro.org Signed-off-by: Rafael J. 
Wysocki --- Documentation/power/pm_qos_interface.rst | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Documentation/power/pm_qos_interface.rst b/Documentation/power/pm_qos_interface.rst index 69b0fe3e2542..5019c79c7710 100644 --- a/Documentation/power/pm_qos_interface.rst +++ b/Documentation/power/pm_qos_interface.rst @@ -52,13 +52,6 @@ int cpu_latency_qos_request_active(handle): Returns if the request is still active, i.e. it has not been removed from the CPU latency QoS list. -int cpu_latency_qos_add_notifier(notifier): - Adds a notification callback function to the CPU latency QoS. The callback is - called when the aggregated value for the CPU latency QoS is changed. - -int cpu_latency_qos_remove_notifier(notifier): - Removes the notification callback function from the CPU latency QoS. - From user space: From f747cde5e71b1701a107c3a2e223e5b4a6cb4c52 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Wed, 9 Jul 2025 12:31:16 +0000 Subject: [PATCH 47/75] PM: sleep: add kernel parameter to disable asynchronous suspend/resume On some platforms, device dependencies are not properly represented by device links, which can cause issues when asynchronous power management is enabled. While it is possible to disable this via sysfs, doing so at runtime can race with the first system suspend event. This patch introduces a kernel command-line parameter, "pm_async", which can be set to "off" to globally disable asynchronous suspend and resume operations from early boot. It effectively provides a way to set the initial value of the existing pm_async sysfs knob at boot time. This offers a robust method to fall back to synchronous (sequential) operation, which can stabilize platforms with problematic dependencies and also serve as a useful debugging tool. The default behavior remains unchanged (asynchronous enabled). To disable it, boot the kernel with the "pm_async=off" parameter. 
Signed-off-by: Tudor Ambarus Acked-by: Randy Dunlap Link: https://patch.msgid.link/20250709-pm-async-off-v3-1-cb69a6fc8d04@linaro.org Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/kernel-parameters.txt | 12 ++++++++++++ kernel/power/main.c | 9 +++++++++ 2 files changed, 21 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f1f2c0874da9..06beacf208de 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5000,6 +5000,18 @@ that number, otherwise (e.g., 'pmu_override=on'), MMCR1 remains 0. + pm_async= [PM] + Format: off + This parameter sets the initial value of the + /sys/power/pm_async sysfs knob at boot time. + If set to "off", disables asynchronous suspend and + resume of devices during system-wide power transitions. + This can be useful on platforms where device + dependencies are not well-defined, or for debugging + power management issues. Asynchronous operations are + enabled by default. + + pm_debug_messages [SUSPEND,KNL] Enable suspend/resume debug messages during boot up. diff --git a/kernel/power/main.c b/kernel/power/main.c index 3d484630505a..3cf2d7e72567 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -112,6 +113,14 @@ int pm_notifier_call_chain(unsigned long val) /* If set, devices may be suspended and resumed asynchronously. 
*/ int pm_async_enabled = 1; +static int __init pm_async_setup(char *str) +{ + if (!strcmp(str, "off")) + pm_async_enabled = 0; + return 1; +} +__setup("pm_async=", pm_async_setup); + static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { From ee2736848f1c9b64a9b1321d839f084ce80d4e66 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Wed, 4 Jun 2025 17:39:06 -0400 Subject: [PATCH 48/75] cpuidle: dt: fix opencoded for_each_cpu() in idle_state_valid() The function opencodes the for_each_cpu_from() by using an open for-loop. Fix that for the sake of readability. While there, drop the 'valid' variable as it's pretty useless here. Signed-off-by: Yury Norov [NVIDIA] Link: https://patch.msgid.link/20250604213908.27819-1-yury.norov@gmail.com Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/dt_idle_states.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index 97feb7d8fb23..558d49838990 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -98,7 +98,6 @@ static bool idle_state_valid(struct device_node *state_node, unsigned int idx, { int cpu; struct device_node *cpu_node, *curr_state_node; - bool valid = true; /* * Compare idle state phandles for index idx on all CPUs in the @@ -107,20 +106,17 @@ static bool idle_state_valid(struct device_node *state_node, unsigned int idx, * retrieved from. If a mismatch is found bail out straight * away since we certainly hit a firmware misconfiguration.
*/ - for (cpu = cpumask_next(cpumask_first(cpumask), cpumask); - cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpumask)) { + cpu = cpumask_first(cpumask) + 1; + for_each_cpu_from(cpu, cpumask) { cpu_node = of_cpu_device_node_get(cpu); curr_state_node = of_get_cpu_state_node(cpu_node, idx); - if (state_node != curr_state_node) - valid = false; - of_node_put(curr_state_node); of_node_put(cpu_node); - if (!valid) - break; + if (state_node != curr_state_node) + return false; } - return valid; + return true; } /** From 914cc799b28f17d369d5b4db3b941957d18157e8 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 21 Apr 2025 11:00:17 +0800 Subject: [PATCH 49/75] PM / devfreq: governor: Replace sscanf() with kstrtoul() in set_freq_store() Replace sscanf() with kstrtoul() in set_freq_store() and check the result to avoid invalid input. Signed-off-by: Lifeng Zheng Link: https://lore.kernel.org/lkml/20250421030020.3108405-2-zhenglifeng1@huawei.com/ Signed-off-by: Chanwoo Choi --- drivers/devfreq/governor_userspace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/devfreq/governor_userspace.c b/drivers/devfreq/governor_userspace.c index d1aa6806b683..175de0c0b50e 100644 --- a/drivers/devfreq/governor_userspace.c +++ b/drivers/devfreq/governor_userspace.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -39,10 +40,13 @@ static ssize_t set_freq_store(struct device *dev, struct device_attribute *attr, unsigned long wanted; int err = 0; + err = kstrtoul(buf, 0, &wanted); + if (err) + return err; + mutex_lock(&devfreq->lock); data = devfreq->governor_data; - sscanf(buf, "%lu", &wanted); data->user_frequency = wanted; data->valid = true; err = update_devfreq(devfreq); From 5487f2595bc821348848b0708f42df825d856f9e Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 21 Apr 2025 11:00:18 +0800 Subject: [PATCH 50/75] PM / devfreq: Limit max_freq with scaling_min_freq Limit max_freq in devfreq_get_freq_range() with 
scaling_min_freq to avoid showing an unreachable freq when reading it. Use macro clamp to simplify code. Signed-off-by: Lifeng Zheng Link: https://lore.kernel.org/lkml/20250421030020.3108405-3-zhenglifeng1@huawei.com/ Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 98657d3b9435..2810c84b9f8a 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -152,11 +152,8 @@ void devfreq_get_freq_range(struct devfreq *devfreq, (unsigned long)HZ_PER_KHZ * qos_max_freq); /* Apply constraints from OPP interface */ - *min_freq = max(*min_freq, devfreq->scaling_min_freq); - *max_freq = min(*max_freq, devfreq->scaling_max_freq); - - if (*min_freq > *max_freq) - *min_freq = *max_freq; + *max_freq = clamp(*max_freq, devfreq->scaling_min_freq, devfreq->scaling_max_freq); + *min_freq = clamp(*min_freq, devfreq->scaling_min_freq, *max_freq); } EXPORT_SYMBOL(devfreq_get_freq_range); From a98d36802f677d90333cc431e23f13cd53608a96 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 21 Apr 2025 11:00:19 +0800 Subject: [PATCH 51/75] PM / devfreq: Remove redundant devfreq_get_freq_range() calling in devfreq_add_device() The calling of devfreq_get_freq_range() in devfreq_add_device() is redundant because min_freq and max_freq are never used. Remove it. 
Signed-off-by: Lifeng Zheng Link: https://lore.kernel.org/lkml/20250421030020.3108405-4-zhenglifeng1@huawei.com/ Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 2810c84b9f8a..18e3f7e063a4 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -804,7 +804,6 @@ struct devfreq *devfreq_add_device(struct device *dev, { struct devfreq *devfreq; struct devfreq_governor *governor; - unsigned long min_freq, max_freq; int err = 0; if (!dev || !profile || !governor_name) { @@ -872,8 +871,6 @@ struct devfreq *devfreq_add_device(struct device *dev, goto err_dev; } - devfreq_get_freq_range(devfreq, &min_freq, &max_freq); - devfreq->suspend_freq = dev_pm_opp_get_suspend_opp_freq(dev); devfreq->opp_table = dev_pm_opp_get_opp_table(dev); if (IS_ERR(devfreq->opp_table)) From bab7834c03820eb11269bc48f07c3800192460d2 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 21 Apr 2025 11:00:20 +0800 Subject: [PATCH 52/75] PM / devfreq: Check governor before using governor->name Commit 96ffcdf239de ("PM / devfreq: Remove redundant governor_name from struct devfreq") removes governor_name and uses governor->name to replace it. But devfreq->governor may be NULL and directly using devfreq->governor->name may cause null pointer exception. Move the check of governor to before using governor->name. 
Fixes: 96ffcdf239de ("PM / devfreq: Remove redundant governor_name from struct devfreq") Signed-off-by: Lifeng Zheng Link: https://lore.kernel.org/lkml/20250421030020.3108405-5-zhenglifeng1@huawei.com/ Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 18e3f7e063a4..46f3a8053197 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -1376,15 +1376,11 @@ int devfreq_remove_governor(struct devfreq_governor *governor) int ret; struct device *dev = devfreq->dev.parent; + if (!devfreq->governor) + continue; + if (!strncmp(devfreq->governor->name, governor->name, DEVFREQ_NAME_LEN)) { - /* we should have a devfreq governor! */ - if (!devfreq->governor) { - dev_warn(dev, "%s: Governor %s NOT present\n", - __func__, governor->name); - continue; - /* Fall through */ - } ret = devfreq->governor->event_handler(devfreq, DEVFREQ_GOV_STOP, NULL); if (ret) { From 78c5845fbbf6aaeb9959c5fbaee5cc53ef5f38c2 Mon Sep 17 00:00:00 2001 From: Chanwoo Choi Date: Fri, 7 Feb 2025 16:13:50 -1000 Subject: [PATCH 53/75] PM / devfreq: Fix a index typo in trans_stat Fixes: 4920ee6dcfaf ("PM / devfreq: Convert to use sysfs_emit_at() API") Signed-off-by: pls Link: https://patchwork.kernel.org/project/linux-pm/patch/20250515143100.17849-1-chanwoo@kernel.org/ Signed-off-by: Chanwoo Choi --- drivers/devfreq/devfreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 46f3a8053197..c5f5960e643b 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -1733,7 +1733,7 @@ static ssize_t trans_stat_show(struct device *dev, for (i = 0; i < max_state; i++) { if (len >= PAGE_SIZE - 1) break; - if (df->freq_table[2] == df->previous_freq) + if (df->freq_table[i] == df->previous_freq) len += sysfs_emit_at(buf, len, "*"); else len += sysfs_emit_at(buf, len, " "); 
From c3bc361393b289df3499f5a87276367c71fae7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 13 May 2025 22:39:02 +0200 Subject: [PATCH 54/75] PM / devfreq: sun8i-a33-mbus: Simplify by using more devm functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use devm allocators for enabling the bus clock and clk_rate_exclusive_get(). This simplifies error handling and the remove callback. Reviewed-by: Chen-Yu Tsai Signed-off-by: Uwe Kleine-König Tested-by: Corentin LABBE Link: https://patchwork.kernel.org/project/linux-pm/patch/20250513203908.205060-2-u.kleine-koenig@baylibre.com/ Signed-off-by: Chanwoo Choi --- drivers/devfreq/sun8i-a33-mbus.c | 38 ++++++++------------------------ 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/drivers/devfreq/sun8i-a33-mbus.c b/drivers/devfreq/sun8i-a33-mbus.c index 7c6ae91ede1f..4bd5657558d6 100644 --- a/drivers/devfreq/sun8i-a33-mbus.c +++ b/drivers/devfreq/sun8i-a33-mbus.c @@ -360,7 +360,7 @@ static int sun8i_a33_mbus_probe(struct platform_device *pdev) if (IS_ERR(priv->reg_mbus)) return PTR_ERR(priv->reg_mbus); - priv->clk_bus = devm_clk_get(dev, "bus"); + priv->clk_bus = devm_clk_get_enabled(dev, "bus"); if (IS_ERR(priv->clk_bus)) return dev_err_probe(dev, PTR_ERR(priv->clk_bus), "failed to get bus clock\n"); @@ -375,24 +375,15 @@ static int sun8i_a33_mbus_probe(struct platform_device *pdev) return dev_err_probe(dev, PTR_ERR(priv->clk_mbus), "failed to get mbus clock\n"); - ret = clk_prepare_enable(priv->clk_bus); - if (ret) - return dev_err_probe(dev, ret, - "failed to enable bus clock\n"); - /* Lock the DRAM clock rate to keep priv->nominal_bw in sync. 
*/ - ret = clk_rate_exclusive_get(priv->clk_dram); - if (ret) { - err = "failed to lock dram clock rate\n"; - goto err_disable_bus; - } + ret = devm_clk_rate_exclusive_get(dev, priv->clk_dram); + if (ret) + return dev_err_probe(dev, ret, "failed to lock dram clock rate\n"); /* Lock the MBUS clock rate to keep MBUS_TMR_PERIOD in sync. */ - ret = clk_rate_exclusive_get(priv->clk_mbus); - if (ret) { - err = "failed to lock mbus clock rate\n"; - goto err_unlock_dram; - } + ret = devm_clk_rate_exclusive_get(dev, priv->clk_mbus); + if (ret) + return dev_err_probe(dev, ret, "failed to lock mbus clock rate\n"); priv->gov_data.upthreshold = 10; priv->gov_data.downdifferential = 5; @@ -405,10 +396,8 @@ static int sun8i_a33_mbus_probe(struct platform_device *pdev) priv->profile.max_state = max_state; ret = devm_pm_opp_set_clkname(dev, "dram"); - if (ret) { - err = "failed to add OPP table\n"; - goto err_unlock_mbus; - } + if (ret) + return dev_err_probe(dev, ret, "failed to add OPP table\n"); base_freq = clk_get_rate(clk_get_parent(priv->clk_dram)); for (i = 0; i < max_state; ++i) { @@ -448,12 +437,6 @@ static int sun8i_a33_mbus_probe(struct platform_device *pdev) err_remove_opps: dev_pm_opp_remove_all_dynamic(dev); -err_unlock_mbus: - clk_rate_exclusive_put(priv->clk_mbus); -err_unlock_dram: - clk_rate_exclusive_put(priv->clk_dram); -err_disable_bus: - clk_disable_unprepare(priv->clk_bus); return dev_err_probe(dev, ret, err); } @@ -472,9 +455,6 @@ static void sun8i_a33_mbus_remove(struct platform_device *pdev) dev_warn(dev, "failed to restore DRAM frequency: %d\n", ret); dev_pm_opp_remove_all_dynamic(dev); - clk_rate_exclusive_put(priv->clk_mbus); - clk_rate_exclusive_put(priv->clk_dram); - clk_disable_unprepare(priv->clk_bus); } static const struct sun8i_a33_mbus_variant sun50i_a64_mbus = { From 45b9d1da6ca0d0285140f8779793b537e4560d45 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Mon, 23 Jun 2025 22:34:00 +0800 Subject: [PATCH 55/75] PM / devfreq: Allow devfreq driver to 
add custom sysfs ABIs Extend the devfreq_dev_profile to allow drivers to optionally create device-specific sysfs ABIs together with other common devfreq ABIs under the devfreq device path. Reviewed-by: Jonathan Cameron Reviewed-by: Huisong Li Signed-off-by: Jie Zhan Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20250623143401.4095045-2-zhanjie9@hisilicon.com/ --- drivers/devfreq/devfreq.c | 1 + include/linux/devfreq.h | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index c5f5960e643b..2e8d01d47f69 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -831,6 +831,7 @@ struct devfreq *devfreq_add_device(struct device *dev, mutex_lock(&devfreq->lock); devfreq->dev.parent = dev; devfreq->dev.class = devfreq_class; + devfreq->dev.groups = profile->dev_groups; devfreq->dev.release = devfreq_dev_release; INIT_LIST_HEAD(&devfreq->node); devfreq->profile = profile; diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index d312ffbac4dd..dc1075dc3446 100644 --- a/include/linux/devfreq.h +++ b/include/linux/devfreq.h @@ -103,6 +103,8 @@ struct devfreq_dev_status { * * @is_cooling_device: A self-explanatory boolean giving the device a * cooling effect property. + * @dev_groups: Optional device-specific sysfs attribute groups to + * be attached to the devfreq device. */ struct devfreq_dev_profile { unsigned long initial_freq; @@ -119,6 +121,8 @@ struct devfreq_dev_profile { unsigned int max_state; bool is_cooling_device; + + const struct attribute_group **dev_groups; }; /** From 7da2fdaaa1e6062686ac96a9f096c2d7847533e4 Mon Sep 17 00:00:00 2001 From: Jie Zhan Date: Mon, 23 Jun 2025 22:34:01 +0800 Subject: [PATCH 56/75] PM / devfreq: Add HiSilicon uncore frequency scaling driver Add the HiSilicon uncore frequency scaling driver for Kunpeng SoCs based on the devfreq framework.
The uncore domain contains shared computing resources, including system interconnects and L3 cache. The uncore frequency significantly impacts the system-wide performance as well as power consumption. This driver adds support for runtime management of uncore frequency from kernel and userspace. The main function includes setting and getting frequencies, changing frequency scaling policies, and querying the list of CPUs whose performance is significantly related to this uncore frequency domain, etc. The driver communicates with a platform controller through an ACPI PCC mailbox to take the actual actions of frequency scaling. Co-developed-by: Lifeng Zheng Signed-off-by: Lifeng Zheng Reviewed-by: Jonathan Cameron Reviewed-by: Huisong Li Signed-off-by: Jie Zhan Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20250623143401.4095045-3-zhanjie9@hisilicon.com/ --- Documentation/ABI/testing/sysfs-class-devfreq | 9 + drivers/devfreq/Kconfig | 11 + drivers/devfreq/Makefile | 1 + drivers/devfreq/hisi_uncore_freq.c | 658 ++++++++++++++++++ 4 files changed, 679 insertions(+) create mode 100644 drivers/devfreq/hisi_uncore_freq.c diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq index 1e7e0bb4c14e..df8ba88b9f6a 100644 --- a/Documentation/ABI/testing/sysfs-class-devfreq +++ b/Documentation/ABI/testing/sysfs-class-devfreq @@ -132,3 +132,12 @@ Description: A list of governors that support the node: - simple_ondemand + +What: /sys/class/devfreq/.../related_cpus +Date: June 2025 +Contact: Linux power management list +Description: The list of CPUs whose performance is closely related to the + frequency of this devfreq domain. + + This file is only present if a specific devfreq device is + closely associated with a subset of CPUs. 
diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index 3c4862a752b5..c999c4a1e567 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -90,6 +90,17 @@ config ARM_EXYNOS_BUS_DEVFREQ and adjusts the operating frequencies and voltages with OPP support. This does not yet operate with optimal voltages. +config ARM_HISI_UNCORE_DEVFREQ + tristate "HiSilicon uncore DEVFREQ Driver" + depends on ACPI && ACPI_PPTT && PCC + select DEVFREQ_GOV_PERFORMANCE + select DEVFREQ_GOV_USERSPACE + help + This adds a DEVFREQ driver that manages uncore frequency scaling for + HiSilicon Kunpeng SoCs. This enables runtime management of uncore + frequency scaling from kernel and userspace. The uncore domain + contains system interconnects and L3 cache. + config ARM_IMX_BUS_DEVFREQ tristate "i.MX Generic Bus DEVFREQ Driver" depends on ARCH_MXC || COMPILE_TEST diff --git a/drivers/devfreq/Makefile b/drivers/devfreq/Makefile index bf40d04928d0..404179d79a9d 100644 --- a/drivers/devfreq/Makefile +++ b/drivers/devfreq/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_DEVFREQ_GOV_PASSIVE) += governor_passive.o # DEVFREQ Drivers obj-$(CONFIG_ARM_EXYNOS_BUS_DEVFREQ) += exynos-bus.o +obj-$(CONFIG_ARM_HISI_UNCORE_DEVFREQ) += hisi_uncore_freq.o obj-$(CONFIG_ARM_IMX_BUS_DEVFREQ) += imx-bus.o obj-$(CONFIG_ARM_IMX8M_DDRC_DEVFREQ) += imx8m-ddrc.o obj-$(CONFIG_ARM_MEDIATEK_CCI_DEVFREQ) += mtk-cci-devfreq.o diff --git a/drivers/devfreq/hisi_uncore_freq.c b/drivers/devfreq/hisi_uncore_freq.c new file mode 100644 index 000000000000..96d1815059e3 --- /dev/null +++ b/drivers/devfreq/hisi_uncore_freq.c @@ -0,0 +1,658 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HiSilicon uncore frequency scaling driver + * + * Copyright (c) 2025 HiSilicon Co., Ltd + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "governor.h" + +struct 
hisi_uncore_pcc_data { + u16 status; + u16 resv; + u32 data; +}; + +struct hisi_uncore_pcc_shmem { + struct acpi_pcct_shared_memory head; + struct hisi_uncore_pcc_data pcc_data; +}; + +enum hisi_uncore_pcc_cmd_type { + HUCF_PCC_CMD_GET_CAP = 0, + HUCF_PCC_CMD_GET_FREQ, + HUCF_PCC_CMD_SET_FREQ, + HUCF_PCC_CMD_GET_MODE, + HUCF_PCC_CMD_SET_MODE, + HUCF_PCC_CMD_GET_PLAT_FREQ_NUM, + HUCF_PCC_CMD_GET_PLAT_FREQ_BY_IDX, + HUCF_PCC_CMD_MAX = 256 +}; + +static int hisi_platform_gov_usage; +static DEFINE_MUTEX(hisi_platform_gov_usage_lock); + +enum hisi_uncore_freq_mode { + HUCF_MODE_PLATFORM = 0, + HUCF_MODE_OS, + HUCF_MODE_MAX +}; + +#define HUCF_CAP_PLATFORM_CTRL BIT(0) + +/** + * struct hisi_uncore_freq - hisi uncore frequency scaling device data + * @dev: device of this frequency scaling driver + * @cl: mailbox client object + * @pchan: PCC mailbox channel + * @chan_id: PCC channel ID + * @last_cmd_cmpl_time: timestamp of the last completed PCC command + * @pcc_lock: PCC channel lock + * @devfreq: devfreq data of this hisi_uncore_freq device + * @related_cpus: CPUs whose performance is majorly affected by this + * uncore frequency domain + * @cap: capability flag + */ +struct hisi_uncore_freq { + struct device *dev; + struct mbox_client cl; + struct pcc_mbox_chan *pchan; + int chan_id; + ktime_t last_cmd_cmpl_time; + struct mutex pcc_lock; + struct devfreq *devfreq; + struct cpumask related_cpus; + u32 cap; +}; + +/* PCC channel timeout = PCC nominal latency * NUM */ +#define HUCF_PCC_POLL_TIMEOUT_NUM 1000 +#define HUCF_PCC_POLL_INTERVAL_US 5 + +/* Default polling interval in ms for devfreq governors*/ +#define HUCF_DEFAULT_POLLING_MS 100 + +static void hisi_uncore_free_pcc_chan(struct hisi_uncore_freq *uncore) +{ + guard(mutex)(&uncore->pcc_lock); + pcc_mbox_free_channel(uncore->pchan); + uncore->pchan = NULL; +} + +static void devm_hisi_uncore_free_pcc_chan(void *data) +{ + hisi_uncore_free_pcc_chan(data); +} + +static int hisi_uncore_request_pcc_chan(struct 
hisi_uncore_freq *uncore) +{ + struct device *dev = uncore->dev; + struct pcc_mbox_chan *pcc_chan; + + uncore->cl = (struct mbox_client) { + .dev = dev, + .tx_block = false, + .knows_txdone = true, + }; + + pcc_chan = pcc_mbox_request_channel(&uncore->cl, uncore->chan_id); + if (IS_ERR(pcc_chan)) + return dev_err_probe(dev, PTR_ERR(pcc_chan), + "Failed to request PCC channel %u\n", uncore->chan_id); + + if (!pcc_chan->shmem_base_addr) { + pcc_mbox_free_channel(pcc_chan); + return dev_err_probe(dev, -EINVAL, + "Invalid PCC shared memory address\n"); + } + + if (pcc_chan->shmem_size < sizeof(struct hisi_uncore_pcc_shmem)) { + pcc_mbox_free_channel(pcc_chan); + return dev_err_probe(dev, -EINVAL, + "Invalid PCC shared memory size (%lluB)\n", + pcc_chan->shmem_size); + } + + uncore->pchan = pcc_chan; + + return devm_add_action_or_reset(uncore->dev, + devm_hisi_uncore_free_pcc_chan, uncore); +} + +static acpi_status hisi_uncore_pcc_reg_scan(struct acpi_resource *res, + void *ctx) +{ + struct acpi_resource_generic_register *reg; + struct hisi_uncore_freq *uncore; + + if (!res || res->type != ACPI_RESOURCE_TYPE_GENERIC_REGISTER) + return AE_OK; + + reg = &res->data.generic_reg; + if (reg->space_id != ACPI_ADR_SPACE_PLATFORM_COMM) + return AE_OK; + + if (!ctx) + return AE_ERROR; + + uncore = ctx; + /* PCC subspace ID stored in Access Size */ + uncore->chan_id = reg->access_size; + + return AE_CTRL_TERMINATE; +} + +static int hisi_uncore_init_pcc_chan(struct hisi_uncore_freq *uncore) +{ + acpi_handle handle = ACPI_HANDLE(uncore->dev); + acpi_status status; + int rc; + + uncore->chan_id = -1; + status = acpi_walk_resources(handle, METHOD_NAME__CRS, + hisi_uncore_pcc_reg_scan, uncore); + if (ACPI_FAILURE(status) || uncore->chan_id < 0) + return dev_err_probe(uncore->dev, -ENODEV, + "Failed to get a PCC channel\n"); + + + rc = devm_mutex_init(uncore->dev, &uncore->pcc_lock); + if (rc) + return rc; + + return hisi_uncore_request_pcc_chan(uncore); +} + +static int 
hisi_uncore_cmd_send(struct hisi_uncore_freq *uncore, + u8 cmd, u32 *data) +{ + struct hisi_uncore_pcc_shmem __iomem *addr; + struct hisi_uncore_pcc_shmem shmem; + struct pcc_mbox_chan *pchan; + unsigned int mrtt; + s64 time_delta; + u16 status; + int rc; + + guard(mutex)(&uncore->pcc_lock); + + pchan = uncore->pchan; + if (!pchan) + return -ENODEV; + + addr = (struct hisi_uncore_pcc_shmem __iomem *)pchan->shmem; + if (!addr) + return -EINVAL; + + /* Handle the Minimum Request Turnaround Time (MRTT) */ + mrtt = pchan->min_turnaround_time; + time_delta = ktime_us_delta(ktime_get(), uncore->last_cmd_cmpl_time); + if (mrtt > time_delta) + udelay(mrtt - time_delta); + + /* Copy data */ + shmem.head = (struct acpi_pcct_shared_memory) { + .signature = PCC_SIGNATURE | uncore->chan_id, + .command = cmd, + }; + shmem.pcc_data.data = *data; + memcpy_toio(addr, &shmem, sizeof(shmem)); + + /* Ring doorbell */ + rc = mbox_send_message(pchan->mchan, &cmd); + if (rc < 0) { + dev_err(uncore->dev, "Failed to send mbox message, %d\n", rc); + return rc; + } + + /* Wait status */ + rc = readw_poll_timeout(&addr->head.status, status, + status & (PCC_STATUS_CMD_COMPLETE | + PCC_STATUS_ERROR), + HUCF_PCC_POLL_INTERVAL_US, + pchan->latency * HUCF_PCC_POLL_TIMEOUT_NUM); + if (rc) { + dev_err(uncore->dev, "PCC channel response timeout, cmd=%u\n", cmd); + } else if (status & PCC_STATUS_ERROR) { + dev_err(uncore->dev, "PCC cmd error, cmd=%u\n", cmd); + rc = -EIO; + } + + uncore->last_cmd_cmpl_time = ktime_get(); + + /* Copy data back */ + memcpy_fromio(data, &addr->pcc_data.data, sizeof(*data)); + + /* Clear mailbox active req */ + mbox_client_txdone(pchan->mchan, rc); + + return rc; +} + +static int hisi_uncore_target(struct device *dev, unsigned long *freq, + u32 flags) +{ + struct hisi_uncore_freq *uncore = dev_get_drvdata(dev); + struct dev_pm_opp *opp; + u32 data; + + if (WARN_ON(!uncore || !uncore->pchan)) + return -ENODEV; + + opp = devfreq_recommended_opp(dev, freq, flags); + if
(IS_ERR(opp)) { + dev_err(dev, "Failed to get opp for freq %lu hz\n", *freq); + return PTR_ERR(opp); + } + /* Read the frequency before dropping the OPP reference. */ + data = (u32)(dev_pm_opp_get_freq(opp) / HZ_PER_MHZ); + dev_pm_opp_put(opp); + + return hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_SET_FREQ, &data); +} + +static int hisi_uncore_get_dev_status(struct device *dev, + struct devfreq_dev_status *stat) +{ + /* Not used */ + return 0; +} + +static int hisi_uncore_get_cur_freq(struct device *dev, unsigned long *freq) +{ + struct hisi_uncore_freq *uncore = dev_get_drvdata(dev); + u32 data = 0; + int rc; + + if (WARN_ON(!uncore || !uncore->pchan)) + return -ENODEV; + + rc = hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_GET_FREQ, &data); + + /* + * Upon a failure, 'data' remains 0 and 'freq' is set to 0 rather than a + * random value. devfreq shouldn't use 'freq' in that case though. + */ + *freq = data * HZ_PER_MHZ; + + return rc; +} + +static void devm_hisi_uncore_remove_opp(void *data) +{ + struct hisi_uncore_freq *uncore = data; + + dev_pm_opp_remove_all_dynamic(uncore->dev); +} + +static int hisi_uncore_init_opp(struct hisi_uncore_freq *uncore) +{ + struct device *dev = uncore->dev; + unsigned long freq_mhz; + u32 num, index; + u32 data = 0; + int rc; + + rc = hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_GET_PLAT_FREQ_NUM, + &data); + if (rc) + return dev_err_probe(dev, rc, "Failed to get plat freq num\n"); + + num = data; + + for (index = 0; index < num; index++) { + data = index; + rc = hisi_uncore_cmd_send(uncore, + HUCF_PCC_CMD_GET_PLAT_FREQ_BY_IDX, + &data); + if (rc) { + dev_pm_opp_remove_all_dynamic(dev); + return dev_err_probe(dev, rc, + "Failed to get plat freq at index %u\n", index); + } + freq_mhz = data; + + /* Don't care OPP voltage, take 1V as default */ + rc = dev_pm_opp_add(dev, freq_mhz * HZ_PER_MHZ, 1000000); + if (rc) { + dev_pm_opp_remove_all_dynamic(dev); + return dev_err_probe(dev, rc, + "Add OPP %lu failed\n", freq_mhz); + } + } + + return devm_add_action_or_reset(dev, devm_hisi_uncore_remove_opp, +
uncore); +} + +static int hisi_platform_gov_func(struct devfreq *df, unsigned long *freq) +{ + /* + * Platform-controlled mode doesn't care the frequency issued from + * devfreq, so just pick the max freq. + */ + *freq = DEVFREQ_MAX_FREQ; + + return 0; +} + +static int hisi_platform_gov_handler(struct devfreq *df, unsigned int event, + void *val) +{ + struct hisi_uncore_freq *uncore = dev_get_drvdata(df->dev.parent); + int rc = 0; + u32 data; + + if (WARN_ON(!uncore || !uncore->pchan)) + return -ENODEV; + + switch (event) { + case DEVFREQ_GOV_START: + data = HUCF_MODE_PLATFORM; + rc = hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_SET_MODE, &data); + if (rc) + dev_err(uncore->dev, "Failed to set platform mode (%d)\n", rc); + break; + case DEVFREQ_GOV_STOP: + data = HUCF_MODE_OS; + rc = hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_SET_MODE, &data); + if (rc) + dev_err(uncore->dev, "Failed to set os mode (%d)\n", rc); + break; + default: + break; + } + + return rc; +} + +/* + * In the platform-controlled mode, the platform decides the uncore frequency + * and ignores the frequency issued from the driver. + * Thus, create a pseudo 'hisi_platform' governor that stops devfreq monitor + * from working so as to save meaningless overhead. + */ +static struct devfreq_governor hisi_platform_governor = { + .name = "hisi_platform", + /* + * Set interrupt_driven to skip the devfreq monitor mechanism, though + * this governor is not interrupt-driven. 
+ */ + .flags = DEVFREQ_GOV_FLAG_IRQ_DRIVEN, + .get_target_freq = hisi_platform_gov_func, + .event_handler = hisi_platform_gov_handler, +}; + +static void hisi_uncore_remove_platform_gov(struct hisi_uncore_freq *uncore) +{ + u32 data = HUCF_MODE_PLATFORM; + int rc; + + if (!(uncore->cap & HUCF_CAP_PLATFORM_CTRL)) + return; + + guard(mutex)(&hisi_platform_gov_usage_lock); + + if (--hisi_platform_gov_usage == 0) { + rc = devfreq_remove_governor(&hisi_platform_governor); + if (rc) + dev_err(uncore->dev, "Failed to remove hisi_platform gov (%d)\n", rc); + } + + /* + * Set to the platform-controlled mode on exit if supported, so as to + * have a certain behaviour when the driver is detached. + */ + rc = hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_SET_MODE, &data); + if (rc) + dev_err(uncore->dev, "Failed to set platform mode on exit (%d)\n", rc); +} + +static void devm_hisi_uncore_remove_platform_gov(void *data) +{ + hisi_uncore_remove_platform_gov(data); +} + +static int hisi_uncore_add_platform_gov(struct hisi_uncore_freq *uncore) +{ + if (!(uncore->cap & HUCF_CAP_PLATFORM_CTRL)) + return 0; + + guard(mutex)(&hisi_platform_gov_usage_lock); + + if (hisi_platform_gov_usage == 0) { + int rc = devfreq_add_governor(&hisi_platform_governor); + if (rc) + return rc; + } + hisi_platform_gov_usage++; + + return devm_add_action_or_reset(uncore->dev, + devm_hisi_uncore_remove_platform_gov, + uncore); +} + +/* + * Returns: + * 0 if success, uncore->related_cpus is set. + * -EINVAL if property not found, or property found but without elements in it, + * or invalid arguments received in any of the subroutine. + * Other error codes if it goes wrong. 
+ */ +static int hisi_uncore_mark_related_cpus(struct hisi_uncore_freq *uncore, + char *property, int (*get_topo_id)(int cpu), + const struct cpumask *(*get_cpumask)(int cpu)) +{ + unsigned int i, cpu; + size_t len; + int rc; + + rc = device_property_count_u32(uncore->dev, property); + if (rc < 0) + return rc; + if (rc == 0) + return -EINVAL; + + len = rc; + u32 *num __free(kfree) = kcalloc(len, sizeof(*num), GFP_KERNEL); + if (!num) + return -ENOMEM; + + rc = device_property_read_u32_array(uncore->dev, property, num, len); + if (rc) + return rc; + + for (i = 0; i < len; i++) { + for_each_possible_cpu(cpu) { + if (get_topo_id(cpu) != num[i]) + continue; + + cpumask_or(&uncore->related_cpus, + &uncore->related_cpus, get_cpumask(cpu)); + break; + } + } + + return 0; +} + +static int get_package_id(int cpu) +{ + return topology_physical_package_id(cpu); +} + +static const struct cpumask *get_package_cpumask(int cpu) +{ + return topology_core_cpumask(cpu); +} + +static int get_cluster_id(int cpu) +{ + return topology_cluster_id(cpu); +} + +static const struct cpumask *get_cluster_cpumask(int cpu) +{ + return topology_cluster_cpumask(cpu); +} + +static int hisi_uncore_mark_related_cpus_wrap(struct hisi_uncore_freq *uncore) +{ + int rc; + + cpumask_clear(&uncore->related_cpus); + + rc = hisi_uncore_mark_related_cpus(uncore, "related-package", + get_package_id, + get_package_cpumask); + /* Success, or firmware probably broken */ + if (!rc || rc != -EINVAL) + return rc; + + /* Try another property name if rc == -EINVAL */ + return hisi_uncore_mark_related_cpus(uncore, "related-cluster", + get_cluster_id, + get_cluster_cpumask); +} + +static ssize_t related_cpus_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hisi_uncore_freq *uncore = dev_get_drvdata(dev->parent); + + return cpumap_print_to_pagebuf(true, buf, &uncore->related_cpus); +} + +static DEVICE_ATTR_RO(related_cpus); + +static struct attribute *hisi_uncore_freq_attrs[] = { + 
&dev_attr_related_cpus.attr, + NULL +}; +ATTRIBUTE_GROUPS(hisi_uncore_freq); + +static int hisi_uncore_devfreq_register(struct hisi_uncore_freq *uncore) +{ + struct devfreq_dev_profile *profile; + struct device *dev = uncore->dev; + unsigned long freq; + u32 data; + int rc; + + rc = hisi_uncore_get_cur_freq(dev, &freq); + if (rc) + return dev_err_probe(dev, rc, "Failed to get plat init freq\n"); + + profile = devm_kzalloc(dev, sizeof(*profile), GFP_KERNEL); + if (!profile) + return -ENOMEM; + + *profile = (struct devfreq_dev_profile) { + .initial_freq = freq, + .polling_ms = HUCF_DEFAULT_POLLING_MS, + .timer = DEVFREQ_TIMER_DELAYED, + .target = hisi_uncore_target, + .get_dev_status = hisi_uncore_get_dev_status, + .get_cur_freq = hisi_uncore_get_cur_freq, + .dev_groups = hisi_uncore_freq_groups, + }; + + rc = hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_GET_MODE, &data); + if (rc) + return dev_err_probe(dev, rc, "Failed to get operate mode\n"); + + if (data == HUCF_MODE_PLATFORM) + uncore->devfreq = devm_devfreq_add_device(dev, profile, + hisi_platform_governor.name, NULL); + else + uncore->devfreq = devm_devfreq_add_device(dev, profile, + DEVFREQ_GOV_PERFORMANCE, NULL); + if (IS_ERR(uncore->devfreq)) + return dev_err_probe(dev, PTR_ERR(uncore->devfreq), + "Failed to add devfreq device\n"); + + return 0; +} + +static int hisi_uncore_freq_probe(struct platform_device *pdev) +{ + struct hisi_uncore_freq *uncore; + struct device *dev = &pdev->dev; + u32 cap; + int rc; + + uncore = devm_kzalloc(dev, sizeof(*uncore), GFP_KERNEL); + if (!uncore) + return -ENOMEM; + + uncore->dev = dev; + platform_set_drvdata(pdev, uncore); + + rc = hisi_uncore_init_pcc_chan(uncore); + if (rc) + return dev_err_probe(dev, rc, "Failed to init PCC channel\n"); + + rc = hisi_uncore_init_opp(uncore); + if (rc) + return dev_err_probe(dev, rc, "Failed to init OPP\n"); + + rc = hisi_uncore_cmd_send(uncore, HUCF_PCC_CMD_GET_CAP, &cap); + if (rc) + return dev_err_probe(dev, rc, "Failed to get 
capability\n"); + + uncore->cap = cap; + + rc = hisi_uncore_add_platform_gov(uncore); + if (rc) + return dev_err_probe(dev, rc, "Failed to add hisi_platform governor\n"); + + rc = hisi_uncore_mark_related_cpus_wrap(uncore); + if (rc) + return dev_err_probe(dev, rc, "Failed to mark related cpus\n"); + + rc = hisi_uncore_devfreq_register(uncore); + if (rc) + return dev_err_probe(dev, rc, "Failed to register devfreq\n"); + + return 0; +} + +static const struct acpi_device_id hisi_uncore_freq_acpi_match[] = { + { "HISI04F1", }, + { } +}; +MODULE_DEVICE_TABLE(acpi, hisi_uncore_freq_acpi_match); + +static struct platform_driver hisi_uncore_freq_drv = { + .probe = hisi_uncore_freq_probe, + .driver = { + .name = "hisi_uncore_freq", + .acpi_match_table = hisi_uncore_freq_acpi_match, + }, +}; +module_platform_driver(hisi_uncore_freq_drv); + +MODULE_DESCRIPTION("HiSilicon uncore frequency scaling driver"); +MODULE_AUTHOR("Jie Zhan "); +MODULE_LICENSE("GPL"); From 1cefe495cacba5fb0417da3a75a1a76e3546d176 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 16 Jun 2025 20:19:19 +0200 Subject: [PATCH 57/75] cpufreq: intel_pstate: Always use HWP_DESIRED_PERF in passive mode In the passive mode, intel_cpufreq_update_pstate() sets HWP_MIN_PERF in accordance with the target frequency to ensure delivering adequate performance, but it sets HWP_DESIRED_PERF to 0, so the processor has no indication that the desired performance level is actually equal to the floor one. This may cause it to choose a performance point way above the desired level. Moreover, this is inconsistent with intel_cpufreq_adjust_perf() which actually sets HWP_DESIRED_PERF in accordance with the target performance value. Address this by adjusting intel_cpufreq_update_pstate() to pass target_pstate as both the minimum and the desired performance levels to intel_cpufreq_hwp_update(). Fixes: a365ab6b9dfb ("cpufreq: intel_pstate: Implement the ->adjust_perf() callback") Signed-off-by: Rafael J. 
Wysocki Tested-by: Shashank Balaji Link: https://patch.msgid.link/6173276.lOV4Wx5bFT@rjwysocki.net --- drivers/cpufreq/intel_pstate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 64587d318267..60326ab5475f 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -3249,8 +3249,8 @@ static int intel_cpufreq_update_pstate(struct cpufreq_policy *policy, int max_pstate = policy->strict_target ? target_pstate : cpu->max_perf_ratio; - intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, 0, - fast_switch); + intel_cpufreq_hwp_update(cpu, target_pstate, max_pstate, + target_pstate, fast_switch); } else if (target_pstate != old_pstate) { intel_cpufreq_perf_ctl_update(cpu, target_pstate, fast_switch); } From fc64e0421598aaa87d61184f6777b52614a095be Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 23 Jun 2025 18:56:01 +0800 Subject: [PATCH 58/75] cpufreq: intel_pstate: Add Granite Rapids support in no-HWP mode Users may disable HWP in firmware, in which case intel_pstate wouldn't load unless the CPU model is explicitly supported. Signed-off-by: Li RongQing Link: https://patch.msgid.link/20250623105601.3924-1-lirongqing@baidu.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 60326ab5475f..06a1c7dd081f 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2775,6 +2775,8 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = { X86_MATCH(INTEL_TIGERLAKE, core_funcs), X86_MATCH(INTEL_SAPPHIRERAPIDS_X, core_funcs), X86_MATCH(INTEL_EMERALDRAPIDS_X, core_funcs), + X86_MATCH(INTEL_GRANITERAPIDS_D, core_funcs), + X86_MATCH(INTEL_GRANITERAPIDS_X, core_funcs), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); From 2e554cfa259fe07085a4fcff7d2ec4b7041bbd9c Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 23 Jun 2025 21:33:58 +0800 Subject: [PATCH 59/75] cpufreq: Contain scaling_cur_freq.attr in cpufreq_attrs After commit c034b02e213d ("cpufreq: expose scaling_cur_freq sysfs file for set_policy() drivers"), the file scaling_cur_freq is exposed to all drivers. No need to create this file separately. It's better to be contained in cpufreq_attrs. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250623133402.3120230-4-zhenglifeng1@huawei.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index d7426e1d8bdd..5560c7df0347 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -967,6 +967,7 @@ static struct attribute *cpufreq_attrs[] = { &cpuinfo_min_freq.attr, &cpuinfo_max_freq.attr, &cpuinfo_transition_latency.attr, + &scaling_cur_freq.attr, &scaling_min_freq.attr, &scaling_max_freq.attr, &affected_cpus.attr, @@ -1095,10 +1096,6 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy) return ret; } - ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); - if (ret) - return ret; - if (cpufreq_driver->bios_limit) { ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); if (ret) From 5d6ecaaa922611ec3ca067723ccefafb543010ee Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Mon, 23 Jun 2025 21:33:59 +0800 Subject: [PATCH 60/75] cpufreq: Remove duplicate check in __cpufreq_offline() The has_target() checks in __cpufreq_offline() are duplicate. Remove one of them and put the operations of exiting governor together with storing last governor's name. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250623133402.3120230-5-zhenglifeng1@huawei.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 5560c7df0347..6ea39181e832 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1691,14 +1691,13 @@ static void __cpufreq_offline(unsigned int cpu, struct cpufreq_policy *policy) return; } - if (has_target()) + if (has_target()) { strscpy(policy->last_governor, policy->governor->name, CPUFREQ_NAME_LEN); - else - policy->last_policy = policy->policy; - - if (has_target()) cpufreq_exit_governor(policy); + } else { + policy->last_policy = policy->policy; + } /* * Perform the ->offline() during light-weight tear-down, as From 2a6c727387062a2ea79eb6cf5004820cb1b0afe2 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Wed, 9 Jul 2025 18:41:42 +0800 Subject: [PATCH 61/75] cpufreq: Initialize cpufreq-based frequency-invariance later The cpufreq-based invariance is enabled in cpufreq_register_driver(), but never disabled after registration fails. Move the invariance initialization to where all other initializations have been successfully done to solve this problem. Fixes: 874f63531064 ("cpufreq: report whether cpufreq supports Frequency Invariance (FI)") Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-2-zhenglifeng1@huawei.com [ rjw: New subject ] Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6ea39181e832..eb713d116c34 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2940,15 +2940,6 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) cpufreq_driver = driver_data; write_unlock_irqrestore(&cpufreq_driver_lock, flags); - /* - * Mark support for the scheduler's frequency invariance engine for - * drivers that implement target(), target_index() or fast_switch(). - */ - if (!cpufreq_driver->setpolicy) { - static_branch_enable_cpuslocked(&cpufreq_freq_invariance); - pr_debug("supports frequency invariance"); - } - if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; @@ -2979,6 +2970,15 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) hp_online = ret; ret = 0; + /* + * Mark support for the scheduler's frequency invariance engine for + * drivers that implement target(), target_index() or fast_switch(). + */ + if (!cpufreq_driver->setpolicy) { + static_branch_enable_cpuslocked(&cpufreq_freq_invariance); + pr_debug("supports frequency invariance"); + } + pr_debug("driver %s up and running\n", driver_data->name); goto out; From d1378d1d7edb3a4c4935a44fe834ae135be03564 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Wed, 9 Jul 2025 18:41:43 +0800 Subject: [PATCH 62/75] cpufreq: Init policy->rwsem before it may be possibly used In cpufreq_policy_put_kobj(), policy->rwsem is used. But in cpufreq_policy_alloc(), if freq_qos_add_notifier() returns an error, error path via err_kobj_remove or err_min_qos_notifier will be reached and cpufreq_policy_put_kobj() will be called before policy->rwsem is initialized. Thus, the calling of init_rwsem() should be moved to where before these two error paths can be reached. 
Fixes: 67d874c3b2c6 ("cpufreq: Register notifiers with the PM QoS framework") Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-3-zhenglifeng1@huawei.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index eb713d116c34..2175d2df95b6 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1281,6 +1281,8 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) goto err_free_real_cpus; } + init_rwsem(&policy->rwsem); + freq_constraints_init(&policy->constraints); policy->nb_min.notifier_call = cpufreq_notifier_min; @@ -1303,7 +1305,6 @@ static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu) } INIT_LIST_HEAD(&policy->policy_list); - init_rwsem(&policy->rwsem); spin_lock_init(&policy->transition_lock); init_waitqueue_head(&policy->transition_wait); INIT_WORK(&policy->update, handle_update); From 908981d85f86c5e2b39dfe0b2267c6d44d9c48f7 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Wed, 9 Jul 2025 18:41:44 +0800 Subject: [PATCH 63/75] cpufreq: Move the check of cpufreq_driver->get into cpufreq_verify_current_freq() Move the check of cpufreq_driver->get into cpufreq_verify_current_freq() in case of calling it without check. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-4-zhenglifeng1@huawei.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 2175d2df95b6..84f175a55fc5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1800,6 +1800,9 @@ static unsigned int cpufreq_verify_current_freq(struct cpufreq_policy *policy, b { unsigned int new_freq; + if (!cpufreq_driver->get) + return 0; + new_freq = cpufreq_driver->get(policy->cpu); if (!new_freq) return 0; @@ -1922,10 +1925,7 @@ unsigned int cpufreq_get(unsigned int cpu) guard(cpufreq_policy_read)(policy); - if (cpufreq_driver->get) - return __cpufreq_get(policy); - - return 0; + return __cpufreq_get(policy); } EXPORT_SYMBOL(cpufreq_get); @@ -2479,8 +2479,7 @@ int cpufreq_start_governor(struct cpufreq_policy *policy) pr_debug("%s: for CPU %u\n", __func__, policy->cpu); - if (cpufreq_driver->get) - cpufreq_verify_current_freq(policy, false); + cpufreq_verify_current_freq(policy, false); if (policy->governor->start) { ret = policy->governor->start(policy); From 0ae204405095abfbc2d694ee0fbb49bcbbe55c57 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Wed, 9 Jul 2025 18:41:45 +0800 Subject: [PATCH 64/75] cpufreq: Exit governor when failed to start old governor Detect the result of starting old governor in cpufreq_set_policy(). If it fails, exit the governor and clear policy->governor. Signed-off-by: Lifeng Zheng Link: https://patch.msgid.link/20250709104145.2348017-5-zhenglifeng1@huawei.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/cpufreq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 84f175a55fc5..d24ad67b3f1e 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2711,10 +2711,12 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, pr_debug("starting governor %s failed\n", policy->governor->name); if (old_gov) { policy->governor = old_gov; - if (cpufreq_init_governor(policy)) + if (cpufreq_init_governor(policy)) { policy->governor = NULL; - else - cpufreq_start_governor(policy); + } else if (cpufreq_start_governor(policy)) { + cpufreq_exit_governor(policy); + policy->governor = NULL; + } } return ret; From a507f8230d60d7e21aac390ee83eb625cb6021d9 Mon Sep 17 00:00:00 2001 From: Ritvik Gupta Date: Mon, 14 Jul 2025 00:32:44 +0530 Subject: [PATCH 65/75] rust: cpumask: Replace `MaybeUninit` and `mem::zeroed` with `Opaque` APIs Replace the following unsafe initializations: 1. `MaybeUninit::uninit().assume_init()` with `Opaque::uninit()` 2. `core::mem::zeroed()` with `Opaque::zeroed()` Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1178 Suggested-by: Alice Ryhl Link: https://lore.kernel.org/rust-for-linux/CAH5fLgj0OoCn56OkNUmiPQ=RAVa_VmS-yMZ4TNBSpGPNtZ5D0A@mail.gmail.com/ Reviewed-by: Benno Lossin Reviewed-by: Alice Ryhl Signed-off-by: Ritvik Gupta Signed-off-by: Viresh Kumar --- rust/kernel/cpumask.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/rust/kernel/cpumask.rs b/rust/kernel/cpumask.rs index 19c607709b5f..e07f8ff5e3fd 100644 --- a/rust/kernel/cpumask.rs +++ b/rust/kernel/cpumask.rs @@ -14,9 +14,6 @@ #[cfg(CONFIG_CPUMASK_OFFSTACK)] use core::ptr::{self, NonNull}; -#[cfg(not(CONFIG_CPUMASK_OFFSTACK))] -use core::mem::MaybeUninit; - use core::ops::{Deref, DerefMut}; /// A CPU Mask. 
@@ -239,10 +236,7 @@ pub fn new_zero(_flags: Flags) -> Result { }, #[cfg(not(CONFIG_CPUMASK_OFFSTACK))] - // SAFETY: FFI type is valid to be zero-initialized. - // - // INVARIANT: The associated memory is freed when the `CpumaskVar` goes out of scope. - mask: unsafe { core::mem::zeroed() }, + mask: Cpumask(Opaque::zeroed()), }) } @@ -266,10 +260,7 @@ pub unsafe fn new(_flags: Flags) -> Result { NonNull::new(ptr.cast()).ok_or(AllocError)? }, #[cfg(not(CONFIG_CPUMASK_OFFSTACK))] - // SAFETY: Guaranteed by the safety requirements of the function. - // - // INVARIANT: The associated memory is freed when the `CpumaskVar` goes out of scope. - mask: unsafe { MaybeUninit::uninit().assume_init() }, + mask: Cpumask(Opaque::uninit()), }) } From a7ce9ca1aaf93d55e32e915700d0ef9f69a781c9 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Mon, 14 Jul 2025 11:17:11 +0300 Subject: [PATCH 66/75] drivers: cpufreq: add Tegra114 support Tegra114 is fully compatible with existing Tegra124 cpufreq driver. Signed-off-by: Svyatoslav Ryhel Reviewed-by: Thierry Reding Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 1 + drivers/cpufreq/tegra124-cpufreq.c | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index a010da0f6337..015dd393eaba 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -143,6 +143,7 @@ static const struct of_device_id blocklist[] __initconst = { { .compatible = "nvidia,tegra20", }, { .compatible = "nvidia,tegra30", }, + { .compatible = "nvidia,tegra114", }, { .compatible = "nvidia,tegra124", }, { .compatible = "nvidia,tegra210", }, { .compatible = "nvidia,tegra234", }, diff --git a/drivers/cpufreq/tegra124-cpufreq.c b/drivers/cpufreq/tegra124-cpufreq.c index b8bccde8b298..f8a76bbecef9 100644 --- a/drivers/cpufreq/tegra124-cpufreq.c +++ b/drivers/cpufreq/tegra124-cpufreq.c @@ -203,8 +203,9 @@ static int __init 
tegra_cpufreq_init(void) { int ret; - if (!(of_machine_is_compatible("nvidia,tegra124") || - of_machine_is_compatible("nvidia,tegra210"))) + if (!(of_machine_is_compatible("nvidia,tegra114") || + of_machine_is_compatible("nvidia,tegra124") || + of_machine_is_compatible("nvidia,tegra210"))) return -ENODEV; /* From 80b1516e07c53f0b4df2f53d53f8fac4052d6ac2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 Jul 2025 19:31:07 +0200 Subject: [PATCH 67/75] PM: sleep: Clean up MAINTAINERS entries for suspend and hibernation Since Pavel Machek and Len Brown do not actually maintain the system suspend and hibernation code, change their records in the relevant MAINTAINERS entries to reviewers. While at it, use Len Brown's kernel.org address in the suspend-to-RAM MAINTAINERS record. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/2798682.mvXUDI8C0e@rjwysocki.net Link: https://lore.kernel.org/linux-pm/20250313091403.50077-1-krzysztof.kozlowski@linaro.org/ [ rjw: Add a Link tag relevant to this change, edit changelog ] Signed-off-by: Rafael J. Wysocki --- MAINTAINERS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fad6cb025a19..fd27e1cd5893 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9785,7 +9785,7 @@ F: fs/freevxfs/ FREEZER M: "Rafael J. Wysocki" -M: Pavel Machek +R: Pavel Machek L: linux-pm@vger.kernel.org S: Supported F: Documentation/power/freezing-of-tasks.rst @@ -10661,7 +10661,7 @@ F: drivers/video/fbdev/hgafb.c HIBERNATION (aka Software Suspend, aka swsusp) M: "Rafael J. Wysocki" -M: Pavel Machek +R: Pavel Machek L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org @@ -23943,8 +23943,8 @@ F: drivers/sh/ SUSPEND TO RAM M: "Rafael J. Wysocki" -M: Len Brown -M: Pavel Machek +R: Len Brown +R: Pavel Machek L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org From 996afb6efd1a345736f9a888e4d6c7d4f3752aa5 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 10 Jul 2025 15:10:41 +0200 Subject: [PATCH 68/75] kexec_core: Fix error code path in the KEXEC_JUMP flow If dpm_suspend_start() fails, dpm_resume_end() must be called to recover devices whose suspend callbacks have been called, but this does not happen in the KEXEC_JUMP flow's error path due to a confused goto target label. Address this by using the correct target label in the goto statement in question and drop the Resume_console label that is not used any more. Fixes: 2965faa5e03d ("kexec: split kexec_load syscall from kexec core code") Signed-off-by: Rafael J. Wysocki Acked-by: Baoquan He Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/2396879.ElGaqSPkdT@rjwysocki.net [ rjw: Drop unused label and amend the changelog ] Signed-off-by: Rafael J. Wysocki --- kernel/kexec_core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 3a9a9f240dbc..554369595298 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1080,7 +1080,7 @@ int kernel_kexec(void) console_suspend_all(); error = dpm_suspend_start(PMSG_FREEZE); if (error) - goto Resume_console; + goto Resume_devices; /* * dpm_suspend_end() must be called after dpm_suspend_start() * to complete the transition, like in the hibernation flows @@ -1135,7 +1135,6 @@ int kernel_kexec(void) dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); - Resume_console: pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); From 2096d42d82dc983d9db861bd6585723bd24a0819 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 10 Jul 2025 15:12:20 +0200 Subject: [PATCH 69/75] kexec_core: Drop redundant pm_restore_gfp_mask() call Drop the direct pm_restore_gfp_mask() call from the KEXEC_JUMP flow in kernel_kexec() because it is redundant. Namely, dpm_resume_end() called beforehand in the same code path invokes that function and it is sufficient to invoke it once. Signed-off-by: Rafael J. 
Wysocki Acked-by: Baoquan He Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/1949230.tdWV9SEqCh@rjwysocki.net [ rjw: Rebase after fixing up previous changes ] Signed-off-by: Rafael J. Wysocki --- kernel/kexec_core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 554369595298..351cd7d76dfa 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1135,7 +1135,6 @@ int kernel_kexec(void) dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); - pm_restore_gfp_mask(); console_resume_all(); thaw_processes(); Restore_console: From 51888393cc64dd0462d0b96c13ab94873abbc030 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 9 Jul 2025 12:41:45 +0200 Subject: [PATCH 70/75] PM: runtime: Take active children into account in pm_runtime_get_if_in_use() For all practical purposes, there is no difference between the situation in which a given device is not ignoring children and its active child count is nonzero and the situation in which its runtime PM usage counter is nonzero. However, pm_runtime_get_if_in_use() will only increment the device's usage counter and return 1 in the latter case. For consistency, make it do so in the former case either by adjusting pm_runtime_get_conditional() and update the related kerneldoc comments accordingly. Fixes: c111566bea7c ("PM: runtime: Add pm_runtime_get_if_active()") Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Ulf Hansson Reviewed-by: Sakari Ailus Cc: 5.10+ # 5.10+: c0ef3df8dbae: PM: runtime: Simplify pm_runtime_get_if_active() usage Cc: 5.10+ # 5.10+ Link: https://patch.msgid.link/12700973.O9o76ZdvQC@rjwysocki.net --- drivers/base/power/runtime.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index c55a7c70bc1a..2ba0dfd1de5a 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -1191,10 +1191,12 @@ EXPORT_SYMBOL_GPL(__pm_runtime_resume); * * Return -EINVAL if runtime PM is disabled for @dev. * - * Otherwise, if the runtime PM status of @dev is %RPM_ACTIVE and either - * @ign_usage_count is %true or the runtime PM usage counter of @dev is not - * zero, increment the usage counter of @dev and return 1. Otherwise, return 0 - * without changing the usage counter. + * Otherwise, if its runtime PM status is %RPM_ACTIVE and (1) @ign_usage_count + * is set, or (2) @dev is not ignoring children and its active child count is + * nonzero, or (3) the runtime PM usage counter of @dev is not zero, increment + * the usage counter of @dev and return 1. + * + * Otherwise, return 0 without changing the usage counter. * * If @ign_usage_count is %true, this function can be used to prevent suspending * the device when its runtime PM status is %RPM_ACTIVE. @@ -1216,7 +1218,8 @@ static int pm_runtime_get_conditional(struct device *dev, bool ign_usage_count) retval = -EINVAL; } else if (dev->power.runtime_status != RPM_ACTIVE) { retval = 0; - } else if (ign_usage_count) { + } else if (ign_usage_count || (!dev->power.ignore_children && + atomic_read(&dev->power.child_count) > 0)) { retval = 1; atomic_inc(&dev->power.usage_count); } else { @@ -1249,10 +1252,16 @@ EXPORT_SYMBOL_GPL(pm_runtime_get_if_active); * @dev: Target device. 
* * Increment the runtime PM usage counter of @dev if its runtime PM status is - * %RPM_ACTIVE and its runtime PM usage counter is greater than 0, in which case - * it returns 1. If the device is in a different state or its usage_count is 0, - * 0 is returned. -EINVAL is returned if runtime PM is disabled for the device, - * in which case also the usage_count will remain unmodified. + * %RPM_ACTIVE and its runtime PM usage counter is greater than 0 or it is not + * ignoring children and its active child count is nonzero. 1 is returned in + * this case. + * + * If @dev is in a different state or it is not in use (that is, its usage + * counter is 0, or it is ignoring children, or its active child count is 0), + * 0 is returned. + * + * -EINVAL is returned if runtime PM is disabled for the device, in which case + * also the usage counter of @dev is not updated. */ int pm_runtime_get_if_in_use(struct device *dev) { From efbc5b4ac98e187375bf14c18ecc76988d3bab3c Mon Sep 17 00:00:00 2001 From: shouyeliu Date: Thu, 22 May 2025 15:01:41 +0800 Subject: [PATCH 71/75] Documentation: amd-pstate:fix minimum performance state label error In the AMD P-States Performance Scale diagram, the labels for "Max Perf" and "Lowest Perf" were incorrectly used to define the range for "Desired Perf". The "Desired performance target" should be bounded by the "Maximum requested performance" and the "Minimum requested performance", which corresponds to "Max Perf" and "Min Perf", respectively. 
Signed-off-by: Shouye Liu Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250522070140.17557-1-shouyeliu@gmail.com Signed-off-by: Mario Limonciello --- Documentation/admin-guide/pm/amd-pstate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst index 412423c54f25..e1771f2225d5 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst @@ -72,7 +72,7 @@ to manage each performance update behavior. :: Lowest non- | | | | linear perf ------>+-----------------------+ +-----------------------+ | | | | - | | Lowest perf ---->| | + | | Min perf ---->| | | | | | Lowest perf ------>+-----------------------+ +-----------------------+ | | | | From dbd4bccd96626563d1d811bc121484cd45f964a1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 16 Jul 2025 21:31:23 +0200 Subject: [PATCH 72/75] PM: sleep: Rearrange suspend/resume error handling in the core Notice that device_suspend_noirq(), device_suspend_late() and device_suspend() all set async_error on errors, so they don't really need to return a value. Accordingly, make them all void and use async_error in their callers instead of their return values. Moreover, since async_error is updated concurrently without locking during asynchronous suspend and resume processing, use READ_ONCE() and WRITE_ONCE() for accessing it in those places to ensure that all of the accesses will be carried out as expected. Signed-off-by: Rafael J. 
Wysocki Reviewed-by: Saravana Kannan Link: https://patch.msgid.link/6198088.lOV4Wx5bFT@rjwysocki.net --- drivers/base/power/main.c | 79 +++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 44 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 5d9b3dc9011d..8aa06d59a2ee 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -767,7 +767,7 @@ static void device_resume_noirq(struct device *dev, pm_message_t state, bool asy TRACE_RESUME(error); if (error) { - async_error = error; + WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async noirq" : " noirq", error); } @@ -824,7 +824,7 @@ static void dpm_noirq_resume_devices(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "noirq"); - if (async_error) + if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); trace_suspend_resume(TPS("dpm_resume_noirq"), state.event, false); @@ -910,7 +910,7 @@ static void device_resume_early(struct device *dev, pm_message_t state, bool asy complete_all(&dev->power.completion); if (error) { - async_error = error; + WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async early" : " early", error); } @@ -971,7 +971,7 @@ void dpm_resume_early(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, "early"); - if (async_error) + if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME_EARLY); trace_suspend_resume(TPS("dpm_resume_early"), state.event, false); @@ -1086,7 +1086,7 @@ static void device_resume(struct device *dev, pm_message_t state, bool async) TRACE_RESUME(error); if (error) { - async_error = error; + WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? 
" async" : "", error); } @@ -1150,7 +1150,7 @@ void dpm_resume(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); dpm_show_time(starttime, state, 0, NULL); - if (async_error) + if (READ_ONCE(async_error)) dpm_save_failed_step(SUSPEND_RESUME); cpufreq_resume(); @@ -1387,7 +1387,7 @@ static void async_suspend_noirq(void *data, async_cookie_t cookie); * The driver of @dev will not receive interrupts while this function is being * executed. */ -static int device_suspend_noirq(struct device *dev, pm_message_t state, bool async) +static void device_suspend_noirq(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -1398,7 +1398,7 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool asy dpm_wait_for_subordinate(dev, async); - if (async_error) + if (READ_ONCE(async_error)) goto Complete; if (dev->power.syscore || dev->power.direct_complete) @@ -1431,7 +1431,7 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool asy Run: error = dpm_run_callback(callback, dev, state, info); if (error) { - async_error = error; + WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? 
" async noirq" : " noirq", error); goto Complete; @@ -1457,12 +1457,10 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state, bool asy complete_all(&dev->power.completion); TRACE_SUSPEND(error); - if (error || async_error) - return error; + if (error || READ_ONCE(async_error)) + return; dpm_async_suspend_superior(dev, async_suspend_noirq); - - return 0; } static void async_suspend_noirq(void *data, async_cookie_t cookie) @@ -1477,7 +1475,7 @@ static int dpm_noirq_suspend_devices(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; - int error = 0; + int error; trace_suspend_resume(TPS("dpm_suspend_noirq"), state.event, true); @@ -1508,13 +1506,13 @@ static int dpm_noirq_suspend_devices(pm_message_t state) mutex_unlock(&dpm_list_mtx); - error = device_suspend_noirq(dev, state, false); + device_suspend_noirq(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); - if (error || async_error) { + if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_late_early_list); /* * Move all devices to the target list to resume them @@ -1528,9 +1526,8 @@ static int dpm_noirq_suspend_devices(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); - if (!error) - error = async_error; + error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); @@ -1585,7 +1582,7 @@ static void async_suspend_late(void *data, async_cookie_t cookie); * * Runtime PM is disabled for @dev while this function is being executed. 
*/ -static int device_suspend_late(struct device *dev, pm_message_t state, bool async) +static void device_suspend_late(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -1602,11 +1599,11 @@ static int device_suspend_late(struct device *dev, pm_message_t state, bool asyn dpm_wait_for_subordinate(dev, async); - if (async_error) + if (READ_ONCE(async_error)) goto Complete; if (pm_wakeup_pending()) { - async_error = -EBUSY; + WRITE_ONCE(async_error, -EBUSY); goto Complete; } @@ -1640,7 +1637,7 @@ static int device_suspend_late(struct device *dev, pm_message_t state, bool asyn Run: error = dpm_run_callback(callback, dev, state, info); if (error) { - async_error = error; + WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? " async late" : " late", error); goto Complete; @@ -1654,12 +1651,10 @@ static int device_suspend_late(struct device *dev, pm_message_t state, bool asyn TRACE_SUSPEND(error); complete_all(&dev->power.completion); - if (error || async_error) - return error; + if (error || READ_ONCE(async_error)) + return; dpm_async_suspend_superior(dev, async_suspend_late); - - return 0; } static void async_suspend_late(void *data, async_cookie_t cookie) @@ -1678,7 +1673,7 @@ int dpm_suspend_late(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; - int error = 0; + int error; trace_suspend_resume(TPS("dpm_suspend_late"), state.event, true); @@ -1711,13 +1706,13 @@ int dpm_suspend_late(pm_message_t state) mutex_unlock(&dpm_list_mtx); - error = device_suspend_late(dev, state, false); + device_suspend_late(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); - if (error || async_error) { + if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_suspended_list); /* * Move all devices to the target list to resume them @@ -1731,9 +1726,8 @@ int dpm_suspend_late(pm_message_t state) mutex_unlock(&dpm_list_mtx); 
async_synchronize_full(); - if (!error) - error = async_error; + error = READ_ONCE(async_error); if (error) { dpm_save_failed_step(SUSPEND_SUSPEND_LATE); dpm_resume_early(resume_event(state)); @@ -1822,7 +1816,7 @@ static void async_suspend(void *data, async_cookie_t cookie); * @state: PM transition of the system being carried out. * @async: If true, the device is being suspended asynchronously. */ -static int device_suspend(struct device *dev, pm_message_t state, bool async) +static void device_suspend(struct device *dev, pm_message_t state, bool async) { pm_callback_t callback = NULL; const char *info = NULL; @@ -1834,7 +1828,7 @@ static int device_suspend(struct device *dev, pm_message_t state, bool async) dpm_wait_for_subordinate(dev, async); - if (async_error) { + if (READ_ONCE(async_error)) { dev->power.direct_complete = false; goto Complete; } @@ -1854,7 +1848,7 @@ static int device_suspend(struct device *dev, pm_message_t state, bool async) if (pm_wakeup_pending()) { dev->power.direct_complete = false; - async_error = -EBUSY; + WRITE_ONCE(async_error, -EBUSY); goto Complete; } @@ -1938,7 +1932,7 @@ static int device_suspend(struct device *dev, pm_message_t state, bool async) Complete: if (error) { - async_error = error; + WRITE_ONCE(async_error, error); dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, async ? 
" async" : "", error); } @@ -1946,12 +1940,10 @@ static int device_suspend(struct device *dev, pm_message_t state, bool async) complete_all(&dev->power.completion); TRACE_SUSPEND(error); - if (error || async_error) - return error; + if (error || READ_ONCE(async_error)) + return; dpm_async_suspend_superior(dev, async_suspend); - - return 0; } static void async_suspend(void *data, async_cookie_t cookie) @@ -1970,7 +1962,7 @@ int dpm_suspend(pm_message_t state) { ktime_t starttime = ktime_get(); struct device *dev; - int error = 0; + int error; trace_suspend_resume(TPS("dpm_suspend"), state.event, true); might_sleep(); @@ -2005,13 +1997,13 @@ int dpm_suspend(pm_message_t state) mutex_unlock(&dpm_list_mtx); - error = device_suspend(dev, state, false); + device_suspend(dev, state, false); put_device(dev); mutex_lock(&dpm_list_mtx); - if (error || async_error) { + if (READ_ONCE(async_error)) { dpm_async_suspend_complete_all(&dpm_prepared_list); /* * Move all devices to the target list to resume them @@ -2025,9 +2017,8 @@ int dpm_suspend(pm_message_t state) mutex_unlock(&dpm_list_mtx); async_synchronize_full(); - if (!error) - error = async_error; + error = READ_ONCE(async_error); if (error) dpm_save_failed_step(SUSPEND_SUSPEND); From f633c1a236df95ab927f1919b3be2619c8cc6733 Mon Sep 17 00:00:00 2001 From: Darshan Rathod Date: Wed, 16 Jul 2025 12:42:16 +0000 Subject: [PATCH 73/75] PM: hibernate: Fix up white space that does not follow coding style Fix up white space usage that does not follow the kernel coding style rules in several places in snapshot.c. Signed-off-by: Darshan Rathod Link: https://patch.msgid.link/20250716124216.64329-1-darshanrathod475@gmail.com [ rjw: New subject and changelog ] Signed-off-by: Rafael J. 
Wysocki --- kernel/power/snapshot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2af36cfe35cd..501df0676a61 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1536,7 +1536,7 @@ static unsigned long copy_data_pages(struct memory_bitmap *copy_bm, memory_bm_position_reset(orig_bm); memory_bm_position_reset(copy_bm); copy_pfn = memory_bm_next_pfn(copy_bm); - for(;;) { + for (;;) { pfn = memory_bm_next_pfn(orig_bm); if (unlikely(pfn == BM_END_OF_MAP)) break; @@ -2161,13 +2161,13 @@ static const char *check_image_kernel(struct swsusp_info *info) { if (info->version_code != LINUX_VERSION_CODE) return "kernel version"; - if (strcmp(info->uts.sysname,init_utsname()->sysname)) + if (strcmp(info->uts.sysname, init_utsname()->sysname)) return "system type"; - if (strcmp(info->uts.release,init_utsname()->release)) + if (strcmp(info->uts.release, init_utsname()->release)) return "kernel release"; - if (strcmp(info->uts.version,init_utsname()->version)) + if (strcmp(info->uts.version, init_utsname()->version)) return "version"; - if (strcmp(info->uts.machine,init_utsname()->machine)) + if (strcmp(info->uts.machine, init_utsname()->machine)) return "machine"; return NULL; } @@ -2361,7 +2361,7 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm, struct memory_bitmap *zero_bm) { unsigned long decoded_pfn; - bool zero; + bool zero; int j; for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { From ebf266d0709b9b8eb3df1fde4152cdd329726598 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 19 Jul 2025 14:40:31 +0200 Subject: [PATCH 74/75] PM: docs: Use my kernel.org address in ABI docs and DT bindings For the sake of consistency, use my kernel.org address in all Contact records in sysfs-devices-power and sysfs-power, and in the power-domain DT binding. Signed-off-by: Rafael J. 
Wysocki Link: https://patch.msgid.link/5911353.DvuYhMxLoT@rjwysocki.net --- Documentation/ABI/testing/sysfs-devices-power | 34 +++++++++---------- Documentation/ABI/testing/sysfs-power | 26 +++++++------- .../bindings/power/power-domain.yaml | 2 +- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index 54195530e97a..9330fb3c79e2 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -1,6 +1,6 @@ What: /sys/devices/.../power/ Date: January 2009 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../power directory contains attributes allowing the user space to check and modify some power @@ -8,7 +8,7 @@ Description: What: /sys/devices/.../power/wakeup Date: January 2009 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../power/wakeup attribute allows the user space to check if the device is enabled to wake up the system @@ -34,7 +34,7 @@ Description: What: /sys/devices/.../power/control Date: January 2009 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../power/control attribute allows the user space to control the run-time power management of the device. @@ -53,7 +53,7 @@ Description: What: /sys/devices/.../power/async Date: January 2009 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../async attribute allows the user space to enable or diasble the device's suspend and resume callbacks to @@ -79,7 +79,7 @@ Description: What: /sys/devices/.../power/wakeup_count Date: September 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_count attribute contains the number of signaled wakeup events associated with the device. 
This @@ -90,7 +90,7 @@ Description: What: /sys/devices/.../power/wakeup_active_count Date: September 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_active_count attribute contains the number of times the processing of wakeup events associated with @@ -102,7 +102,7 @@ Description: What: /sys/devices/.../power/wakeup_abort_count Date: February 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_abort_count attribute contains the number of times the processing of a wakeup event associated with @@ -114,7 +114,7 @@ Description: What: /sys/devices/.../power/wakeup_expire_count Date: February 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_expire_count attribute contains the number of times a wakeup event associated with the device has @@ -126,7 +126,7 @@ Description: What: /sys/devices/.../power/wakeup_active Date: September 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_active attribute contains either 1, or 0, depending on whether or not a wakeup event associated with @@ -138,7 +138,7 @@ Description: What: /sys/devices/.../power/wakeup_total_time_ms Date: September 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_total_time_ms attribute contains the total time of processing wakeup events associated with the @@ -149,7 +149,7 @@ Description: What: /sys/devices/.../power/wakeup_max_time_ms Date: September 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_max_time_ms attribute contains the maximum time of processing a single wakeup event associated @@ -161,7 +161,7 @@ Description: What: /sys/devices/.../power/wakeup_last_time_ms Date: September 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. 
Wysocki Description: The /sys/devices/.../wakeup_last_time_ms attribute contains the value of the monotonic clock corresponding to the time of @@ -173,7 +173,7 @@ Description: What: /sys/devices/.../power/wakeup_prevent_sleep_time_ms Date: February 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../wakeup_prevent_sleep_time_ms attribute contains the total time the device has been preventing @@ -203,7 +203,7 @@ Description: What: /sys/devices/.../power/pm_qos_resume_latency_us Date: March 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../power/pm_qos_resume_latency_us attribute contains the PM QoS resume latency limit for the given device, @@ -223,7 +223,7 @@ Description: What: /sys/devices/.../power/pm_qos_latency_tolerance_us Date: January 2014 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../power/pm_qos_latency_tolerance_us attribute contains the PM QoS active state latency tolerance limit for the @@ -248,7 +248,7 @@ Description: What: /sys/devices/.../power/pm_qos_no_power_off Date: September 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../power/pm_qos_no_power_off attribute is used for manipulating the PM QoS "no power off" flag. If @@ -263,7 +263,7 @@ Description: What: /sys/devices/.../power/runtime_status Date: April 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/devices/.../power/runtime_status attribute contains the current runtime PM status of the device, which may be diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 2192478e83cf..4d8e1ad020f0 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -1,6 +1,6 @@ What: /sys/power/ Date: August 2006 -Contact: Rafael J. Wysocki +Contact: Rafael J. 
Wysocki Description: The /sys/power directory will contain files that will provide a unified interface to the power management @@ -8,7 +8,7 @@ Description: What: /sys/power/state Date: November 2016 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/state file controls system sleep states. Reading from this file returns the available sleep state @@ -23,7 +23,7 @@ Description: What: /sys/power/mem_sleep Date: November 2016 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/mem_sleep file controls the operating mode of system suspend. Reading from it returns the available modes @@ -41,7 +41,7 @@ Description: What: /sys/power/disk Date: September 2006 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/disk file controls the operating mode of the suspend-to-disk mechanism. Reading from this file returns @@ -90,7 +90,7 @@ Description: What: /sys/power/image_size Date: August 2006 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/image_size file controls the size of the image created by the suspend-to-disk mechanism. It can be written a @@ -107,7 +107,7 @@ Description: What: /sys/power/pm_trace Date: August 2006 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/pm_trace file controls the code which saves the last PM event point in the RTC across reboots, so that you can @@ -156,7 +156,7 @@ Description: What: /sys/power/pm_async Date: January 2009 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/pm_async file controls the switch allowing the user space to enable or disable asynchronous suspend and resume @@ -169,7 +169,7 @@ Description: What: /sys/power/wakeup_count Date: July 2010 -Contact: Rafael J. Wysocki +Contact: Rafael J. 
Wysocki Description: The /sys/power/wakeup_count file allows user space to put the system into a sleep state while taking into account the @@ -184,7 +184,7 @@ Description: What: /sys/power/reserved_size Date: May 2011 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/reserved_size file allows user space to control the amount of memory reserved for allocations made by device @@ -198,7 +198,7 @@ Description: What: /sys/power/autosleep Date: April 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/autosleep file can be written one of the strings returned by reads from /sys/power/state. If that happens, a @@ -215,7 +215,7 @@ Description: What: /sys/power/wake_lock Date: February 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/wake_lock file allows user space to create wakeup source objects and activate them on demand (if one of @@ -242,7 +242,7 @@ Description: What: /sys/power/wake_unlock Date: February 2012 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/wake_unlock file allows user space to deactivate wakeup sources created with the help of /sys/power/wake_lock. @@ -283,7 +283,7 @@ Description: What: /sys/power/pm_debug_messages Date: July 2017 -Contact: Rafael J. Wysocki +Contact: Rafael J. Wysocki Description: The /sys/power/pm_debug_messages file controls the printing of debug messages from the system suspend/hiberbation diff --git a/Documentation/devicetree/bindings/power/power-domain.yaml b/Documentation/devicetree/bindings/power/power-domain.yaml index 8fdb529d560b..b1147dbf2e73 100644 --- a/Documentation/devicetree/bindings/power/power-domain.yaml +++ b/Documentation/devicetree/bindings/power/power-domain.yaml @@ -7,7 +7,7 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Generic PM domains maintainers: - - Rafael J. Wysocki + - Rafael J. 
Wysocki - Kevin Hilman - Ulf Hansson From 0a1416a49e63c320f6e6c1c8d07e1b58c0d4a3f3 Mon Sep 17 00:00:00 2001 From: Prashant Malani Date: Tue, 22 Jul 2025 05:55:40 +0000 Subject: [PATCH 75/75] cpufreq: CPPC: Mark driver with NEED_UPDATE_LIMITS flag AMU counters on certain CPPC-based platforms tend to yield inaccurate delivered performance measurements on systems that are idle/mostly idle. This results in an inaccurate frequency being stored by cpufreq in its policy structure when the CPU is brought online. [1] Consequently, if the userspace governor tries to set the frequency to a new value, there is a possibility that it would be the erroneous value stored earlier. In such a scenario, cpufreq would assume that the requested frequency has already been set and return early, resulting in the correct/new frequency request never making it to the hardware. Since the operating frequency is liable to this sort of inconsistency, mark the CPPC driver with CPUFREQ_NEED_UPDATE_LIMITS so that it is always invoked when a target frequency update is requested. Link: https://lore.kernel.org/linux-pm/20250619000925.415528-3-pmalani@google.com/ [1] Suggested-by: Rafael J. Wysocki Signed-off-by: Prashant Malani Acked-by: Viresh Kumar Link: https://patch.msgid.link/20250722055611.130574-2-pmalani@google.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cppc_cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index a1fd0ff22bc5..4a17162a392d 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -910,7 +910,7 @@ static struct freq_attr *cppc_cpufreq_attr[] = { }; static struct cpufreq_driver cppc_cpufreq_driver = { - .flags = CPUFREQ_CONST_LOOPS, + .flags = CPUFREQ_CONST_LOOPS | CPUFREQ_NEED_UPDATE_LIMITS, .verify = cppc_verify_policy, .target = cppc_cpufreq_set_target, .get = cppc_cpufreq_get_rate,