From 220abf77e7c2835cc63ea8cd7158cf83952640af Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Thu, 21 Aug 2025 09:56:38 +0530 Subject: [PATCH 1/4] cpufreq/amd-pstate: Fix setting of CPPC.min_perf in active mode for performance governor In the "active" mode of the amd-pstate driver with performance governor, the CPPC.min_perf is expected to be the nominal_perf. However after commit a9b9b4c2a4cd ("cpufreq/amd-pstate: Drop min and max cached frequencies"), this is not the case when the governor is switched from performance to powersave and back to performance, and the CPPC.min_perf will be equal to the scaling_min_freq that was set for the powersave governor. This is because prior to commit a9b9b4c2a4cd ("cpufreq/amd-pstate: Drop min and max cached frequencies"), amd_pstate_epp_update_limit() would unconditionally call amd_pstate_update_min_max_limit() and the latter function would enforce the CPPC.min_perf constraint when the governor is performance. However, after the aforementioned commit, amd_pstate_update_min_max_limit() is called by amd_pstate_epp_update_limit() only when either the scaling_{min/max}_freq is different from the cached value of cpudata->{min/max}_limit_freq, which wouldn't have changed on a governor transition from powersave to performance, thus missing out on enforcing the CPPC.min_perf constraint for the performance governor. Fix this by invoking amd_pstate_epp_update_limit() not only when the {min/max} limits have changed from the cached values, but also when the policy itself has changed. Fixes: a9b9b4c2a4cd ("cpufreq/amd-pstate: Drop min and max cached frequencies") Signed-off-by: Gautham R. 
Shenoy Reviewed-by: Mario Limonciello Link: https://lore.kernel.org/r/20250821042638.356-1-gautham.shenoy@amd.com Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index bbc27ef9edf7..5cd91489fcbe 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1554,13 +1554,15 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) pr_debug("CPU %d exiting\n", policy->cpu); } -static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) +static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy, bool policy_change) { struct amd_cpudata *cpudata = policy->driver_data; union perf_cached perf; u8 epp; - if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) + if (policy_change || + policy->min != cpudata->min_limit_freq || + policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) @@ -1584,7 +1586,7 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) cpudata->policy = policy->policy; - ret = amd_pstate_epp_update_limit(policy); + ret = amd_pstate_epp_update_limit(policy, true); if (ret) return ret; @@ -1658,7 +1660,7 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) int ret; /* enable amd pstate from suspend state*/ - ret = amd_pstate_epp_update_limit(policy); + ret = amd_pstate_epp_update_limit(policy, false); if (ret) return ret; From ba3319e5905710abe495b11a1aaf03ebb51d62e2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Tue, 26 Aug 2025 00:27:47 -0500 Subject: [PATCH 2/4] cpufreq/amd-pstate: Fix a regression leading to EPP 0 after resume During the suspend sequence the cached CPPC request is destroyed with the expectation that it's restored during resume. 
This assumption broke when the separate cache EPP variable was removed, and then it was broken again by commit 608a76b65288 ("cpufreq/amd-pstate: Add support for the "Requested CPU Min frequency" BIOS option") which explicitly set it to zero during suspend. Remove the invalidation and set the value during the suspend call to update limits so that the cached variable can be used to restore on resume. Fixes: 608a76b65288 ("cpufreq/amd-pstate: Add support for the "Requested CPU Min frequency" BIOS option") Fixes: b7a41156588a ("cpufreq/amd-pstate: Invalidate cppc_req_cached during suspend") Reported-by: goldens Closes: https://community.frame.work/t/increased-power-usage-after-resuming-from-suspend-on-ryzen-7040-kernel-6-15-regression/ Closes: https://bugzilla.redhat.com/show_bug.cgi?id=2391221 Tested-by: goldens Tested-by: Willian Wang Reported-by: Vincent Mauirn Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219981 Tested-by: Alex De Lorenzo Reviewed-by: Gautham R. Shenoy Link: https://lore.kernel.org/r/20250826052747.2240670-1-superm1@kernel.org Signed-off-by: Mario Limonciello (AMD) --- drivers/cpufreq/amd-pstate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c index 5cd91489fcbe..b4c79fde1979 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -1628,13 +1628,14 @@ static int amd_pstate_suspend(struct cpufreq_policy *policy) * min_perf value across kexec reboots. 
If this CPU is just resumed back without kexec, * the limits, epp and desired perf will get reset to the cached values in cpudata struct */ - ret = amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + ret = amd_pstate_update_perf(policy, perf.bios_min_perf, + FIELD_GET(AMD_CPPC_DES_PERF_MASK, cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), + FIELD_GET(AMD_CPPC_EPP_PERF_MASK, cpudata->cppc_req_cached), + false); if (ret) return ret; - /* invalidate to ensure it's rewritten during resume */ - cpudata->cppc_req_cached = 0; - /* set this flag to avoid setting core offline*/ cpudata->suspended = true; From e0423541477dfb684fbc6e6b5386054bc650f264 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Sep 2025 15:44:45 +0200 Subject: [PATCH 3/4] PM: EM: Add function for registering a PD without capacity update The intel_pstate driver manages CPU capacity changes itself and it does not need an update of the capacity of all CPUs in the system to be carried out after registering a PD. Moreover, in some configurations (for instance, an SMT-capable hybrid x86 system booted with nosmt in the kernel command line) the em_check_capacity_update() call at the end of em_dev_register_perf_domain() always fails and reschedules itself to run once again in 1 s, so effectively it runs in vain every 1 s forever. To address this, introduce a new variant of em_dev_register_perf_domain(), called em_dev_register_pd_no_update(), that does not invoke em_check_capacity_update(), and make intel_pstate use it instead of the original. Fixes: 7b010f9b9061 ("cpufreq: intel_pstate: EAS support for hybrid platforms") Closes: https://lore.kernel.org/linux-pm/40212796-734c-4140-8a85-854f72b8144d@panix.com/ Reported-by: Kenneth R. Crudup Tested-by: Kenneth R. Crudup Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/intel_pstate.c | 4 ++-- include/linux/energy_model.h | 10 ++++++++++ kernel/power/energy_model.c | 29 +++++++++++++++++++++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f366d35c5840..0d5d283a5429 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -1034,8 +1034,8 @@ static bool hybrid_register_perf_domain(unsigned int cpu) if (!cpu_dev) return false; - if (em_dev_register_perf_domain(cpu_dev, HYBRID_EM_STATE_COUNT, &cb, - cpumask_of(cpu), false)) + if (em_dev_register_pd_no_update(cpu_dev, HYBRID_EM_STATE_COUNT, &cb, + cpumask_of(cpu), false)) return false; cpudata->pd_registered = true; diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 7fa1eb3cc823..61d50571ad88 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -171,6 +171,9 @@ int em_dev_update_perf_domain(struct device *dev, int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts); +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts); void em_dev_unregister_perf_domain(struct device *dev); struct em_perf_table *em_table_alloc(struct em_perf_domain *pd); void em_table_free(struct em_perf_table *table); @@ -350,6 +353,13 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, { return -EINVAL; } +static inline +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) +{ + return -EINVAL; +} static inline void em_dev_unregister_perf_domain(struct device *dev) { } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index ea7995a25780..8df55397414a 100644 --- 
a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -552,6 +552,30 @@ EXPORT_SYMBOL_GPL(em_cpu_get); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, const struct em_data_callback *cb, const cpumask_t *cpus, bool microwatts) +{ + int ret = em_dev_register_pd_no_update(dev, nr_states, cb, cpus, microwatts); + + if (_is_cpu_device(dev)) + em_check_capacity_update(); + + return ret; +} +EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); + +/** + * em_dev_register_pd_no_update() - Register a perf domain for a device + * @dev : Device to register the PD for + * @nr_states : Number of performance states in the new PD + * @cb : Callback functions for populating the energy model + * @cpus : CPUs to include in the new PD (mandatory if @dev is a CPU device) + * @microwatts : Whether or not the power values in the EM will be in uW + * + * Like em_dev_register_perf_domain(), but does not trigger a CPU capacity + * update after registering the PD, even if @dev is a CPU device. + */ +int em_dev_register_pd_no_update(struct device *dev, unsigned int nr_states, + const struct em_data_callback *cb, + const cpumask_t *cpus, bool microwatts) { struct em_perf_table *em_table; unsigned long cap, prev_cap = 0; @@ -636,12 +660,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, unlock: mutex_unlock(&em_pd_mutex); - if (_is_cpu_device(dev)) - em_check_capacity_update(); - return ret; } -EXPORT_SYMBOL_GPL(em_dev_register_perf_domain); +EXPORT_SYMBOL_GPL(em_dev_register_pd_no_update); /** * em_dev_unregister_perf_domain() - Unregister Energy Model (EM) for a device From 449c9c02537a146ac97ef962327a221e21c9cab3 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 10 Sep 2025 11:41:59 +0200 Subject: [PATCH 4/4] PM: hibernate: Restrict GFP mask in hibernation_snapshot() Commit 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") incorrectly removed a pm_restrict_gfp_mask() call from hibernation_snapshot(), so memory allocations involving swap are not prevented from being carried out in this code path any more which may lead to serious breakage. The symptoms of such breakage have become visible after adding a shrink_shmem_memory() call to hibernation_snapshot() in commit 2640e819474f ("PM: hibernate: shrink shmem pages after dev_pm_ops.prepare()") which caused this problem to be much more likely to manifest itself. However, since commit 2640e819474f was initially present in the DRM tree that did not include commit 12ffc3b1513e, the symptoms of this issue were not visible until merge commit 260f6f4fda93 ("Merge tag 'drm-next-2025-07-30' of https://gitlab.freedesktop.org/drm/kernel") that exposed it through an entirely reasonable merge conflict resolution. Fixes: 12ffc3b1513e ("PM: Restrict swap use to later in the suspend sequence") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220555 Reported-by: Todd Brandt Tested-by: Todd Brandt Cc: 6.16+ # 6.16+ Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 1f1f30cca573..2f66ab453823 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -449,6 +449,7 @@ int hibernation_snapshot(int platform_mode) shrink_shmem_memory(); console_suspend_all(); + pm_restrict_gfp_mask(); error = dpm_suspend(PMSG_FREEZE);