From 07d815701274d156ad8c7c088a52e01642156fb8 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Mon, 6 Oct 2025 07:09:54 +0530 Subject: [PATCH 01/17] cpuidle: menu: Use residency threshold in polling state override decisions On virtualized PowerPC (pseries) systems, where only one polling state (Snooze) and one deep state (CEDE) are available, selecting CEDE when the predicted idle duration is less than the target residency of CEDE state can hurt performance. In such cases, the entry/exit overhead of CEDE outweighs the power savings, leading to unnecessary state transitions and higher latency. Menu governor currently contains a special-case rule that prioritizes the first non-polling state over polling, even when its target residency is much longer than the predicted idle duration. On PowerPC/pseries, where the gap between the polling state (Snooze) and the first non-polling state (CEDE) is large, this behavior causes performance regressions. Refine that special case by adding an extra requirement: the first non-polling state can only be chosen if its target residency is below the defined RESIDENCY_THRESHOLD_NS. If this condition is not satisfied, polling is allowed instead, avoiding suboptimal non-polling state entries. This change is limited to the single special-case rule for the first non-polling state. The general non-polling state selection logic in the menu governor remains unchanged. 
Performance improvement observed with pgbench on PowerPC (pseries) system: +---------------------------+------------+------------+------------+ | Metric | Baseline | Patched | Change (%) | +---------------------------+------------+------------+------------+ | Transactions/sec (TPS) | 495,210 | 536,982 | +8.45% | | Avg latency (ms) | 0.163 | 0.150 | -7.98% | +---------------------------+------------+------------+------------+ CPUIdle state usage: +--------------+--------------+-------------+ | Metric | Baseline | Patched | +--------------+--------------+-------------+ | Total usage | 12,735,820 | 13,918,442 | | Above usage | 11,401,520 | 1,598,210 | | Below usage | 20,145 | 702,395 | +--------------+--------------+-------------+ Above/Total and Below/Total usage percentages: +------------------------+-----------+---------+ | Metric | Baseline | Patched | +------------------------+-----------+---------+ | Above % (Above/Total) | 89.56% | 11.49% | | Below % (Below/Total) | 0.16% | 5.05% | | Total cpuidle miss (%) | 89.72% | 16.54% | +------------------------+-----------+---------+ The results indicate that restricting CEDE selection to cases where its residency matches the predicted idle time reduces mispredictions, lowers unnecessary state transitions, and improves overall throughput. Reviewed-by: Christian Loehle Signed-off-by: Aboorva Devarajan [ rjw: Changelog edits, rebase ] Link: https://patch.msgid.link/20251006013954.17972-1-aboorvad@linux.ibm.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/menu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 23239b0c04f9..64d6f7a1c776 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -317,12 +317,13 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* - * Use a physical idle state, not busy polling, unless a timer - * is going to trigger soon enough or the exit latency of the - * idle state in question is greater than the predicted idle - * duration. + * Use a physical idle state instead of busy polling so long as + * its target residency is below the residency threshold, its + * exit latency is not greater than the predicted idle duration, + * and the next timer doesn't expire soon. */ if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) && + s->target_residency_ns < RESIDENCY_THRESHOLD_NS && s->target_residency_ns <= data->next_timer_ns && s->exit_latency_ns <= predicted_ns) { predicted_ns = s->target_residency_ns; From 39f421f2e301f995c17c35b783e2863155b3f647 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 23 Oct 2025 10:45:32 -0700 Subject: [PATCH 02/17] powercap: intel_rapl: Add support for Wildcat Lake platform Add Wildcat Lake to the list of supported processors for RAPL. Signed-off-by: Srinivas Pandruvada Link: https://patch.msgid.link/20251023174532.1882008-1-srinivas.pandruvada@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 1 + drivers/powercap/intel_rapl_msr.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index c7e7f9bf5313..cdb4363589e9 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1284,6 +1284,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &rapl_defaults_spr_server), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 4ed06c71a3ac..c4d536c2f989 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -151,6 +151,7 @@ static const struct x86_cpu_id pl4_support_ids[] = { X86_MATCH_VFM(INTEL_ARROWLAKE_U, NULL), X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), {} }; From 76934e495cdc31942b53b513cee4290750578a9a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Nov 2025 20:07:28 +0100 Subject: [PATCH 03/17] cpuidle: Add sanity check for exit latency and target residency Make __cpuidle_driver_init() fail if the exit latency of one of the driver's idle states exceeds its target residency, which would break cpuidle assumptions. Signed-off-by: Rafael J. Wysocki Reviewed-by: Artem Bityutskiy Reviewed-by: Christian Loehle [ rjw: Changelog fix ] Link: https://patch.msgid.link/12779486.O9o76ZdvQC@rafael.j.wysocki Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/driver.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 9bbfa594c442..1c295a93d582 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -152,7 +152,7 @@ static void cpuidle_setup_broadcast_timer(void *arg) * __cpuidle_driver_init - initialize the driver's internal data * @drv: a valid pointer to a struct cpuidle_driver */ -static void __cpuidle_driver_init(struct cpuidle_driver *drv) +static int __cpuidle_driver_init(struct cpuidle_driver *drv) { int i; @@ -193,7 +193,17 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) s->exit_latency_ns = 0; else s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); + + /* + * Ensure that the exit latency of a CPU idle state does not + * exceed its target residency which is assumed in cpuidle in + * multiple places. + */ + if (s->exit_latency_ns > s->target_residency_ns) + return -EINVAL; } + + return 0; } /** @@ -223,7 +233,9 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv) if (cpuidle_disabled()) return -ENODEV; - __cpuidle_driver_init(drv); + ret = __cpuidle_driver_init(drv); + if (ret) + return ret; ret = __cpuidle_set_driver(drv); if (ret) From 0796ddf4a7f0d15b0cf1ef6f265671f2e5174c1f Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Mon, 10 Nov 2025 12:08:19 +0000 Subject: [PATCH 04/17] cpuidle: teo: Use this_cpu_ptr() where possible The cpuidle governor callbacks for update, select and reflect are always running on the actual idle entering/exiting CPU, so use the more optimized this_cpu_ptr() to access the internal teo data. This brings down the latency-critical teo_reflect() from static void teo_reflect(struct cpuidle_device *dev, int state) { ffffffc080ffcff0: hint #0x19 ffffffc080ffcff4: stp x29, x30, [sp, #-48]! 
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); ffffffc080ffcff8: adrp x2, ffffffc0848c0000 { ffffffc080ffcffc: add x29, sp, #0x0 ffffffc080ffd000: stp x19, x20, [sp, #16] ffffffc080ffd004: orr x20, xzr, x0 struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); ffffffc080ffd008: add x0, x2, #0xc20 { ffffffc080ffd00c: stp x21, x22, [sp, #32] struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); ffffffc080ffd010: adrp x19, ffffffc083eb5000 ffffffc080ffd014: add x19, x19, #0xbb0 ffffffc080ffd018: ldr w3, [x20, #4] dev->last_state_idx = state; to static void teo_reflect(struct cpuidle_device *dev, int state) { ffffffc080ffd034: hint #0x19 ffffffc080ffd038: stp x29, x30, [sp, #-48]! ffffffc080ffd03c: add x29, sp, #0x0 ffffffc080ffd040: stp x19, x20, [sp, #16] ffffffc080ffd044: orr x20, xzr, x0 struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); ffffffc080ffd048: adrp x19, ffffffc083eb5000 { ffffffc080ffd04c: stp x21, x22, [sp, #32] struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); ffffffc080ffd050: add x19, x19, #0xbb0 dev->last_state_idx = state; This saves us: adrp x2, ffffffc0848c0000 add x0, x2, #0xc20 ldr w3, [x20, #4] Signed-off-by: Christian Loehle [ rjw: Subject tweak ] Link: https://patch.msgid.link/20251110120819.714560-1-christian.loehle@arm.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/teo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bfa55c1eab5b..a3ebc2cda093 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -155,7 +155,7 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); */ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; s64 target_residency_ns; u64 measured_ns; @@ -268,7 +268,7 @@ static int teo_find_shallower_state(struct cpuidle_driver *drv, static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, bool *stop_tick) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ktime_t delta_tick = TICK_NSEC / 2; unsigned int idx_intercept_sum = 0; @@ -504,7 +504,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ static void teo_reflect(struct cpuidle_device *dev, int state) { - struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); + struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); dev->last_state_idx = state; if (dev->poll_time_limit || From a03b2011808ab02ccb7ab6b573b013b77fbb5921 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 14:24:31 +0100 Subject: [PATCH 05/17] cpuidle: governors: teo: Drop misguided target residency check When the target residency of the current candidate idle state is greater than the expected time till the closest timer (the sleep length), it does not matter whether or not the tick has already been stopped or if it is going to be stopped. 
The closest timer will trigger anyway at its due time, so if an idle state with target residency above the sleep length is selected, energy will be wasted and there may be excess latency. Of course, if the closest timer were canceled before it could trigger, a deeper idle state would be more suitable, but this is not expected to happen (generally speaking, hrtimers are not expected to be canceled as a rule). Accordingly, the teo_state_ok() check done in that case causes energy to be wasted more often than it allows any energy to be saved (if it allows any energy to be saved at all), so drop it and let the governor use the teo_find_shallower_state() return value as the new candidate idle state index. Fixes: 21d28cd2fa5f ("cpuidle: teo: Do not call tick_nohz_get_sleep_length() upfront") Cc: All applicable Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/5955081.DvuYhMxLoT@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index a3ebc2cda093..cc74cecbea7f 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -458,11 +458,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * If the closest expected timer is before the target residency of the * candidate state, a shallower one needs to be found. */ - if (drv->states[idx].target_residency_ns > duration_ns) { - i = teo_find_shallower_state(drv, dev, idx, duration_ns, false); - if (teo_state_ok(i, drv)) - idx = i; - } + if (drv->states[idx].target_residency_ns > duration_ns) + idx = teo_find_shallower_state(drv, dev, idx, duration_ns, false); /* * If the selected state's target residency is below the tick length From 17673f64a002fa7bd8f688f45b12ed32b59dba26 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Wed, 12 Nov 2025 17:23:24 +0100 Subject: [PATCH 06/17] cpuidle: governors: teo: Drop redundant function parameter The last no_poll parameter of teo_find_shallower_state() is always false, so drop it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/2253109.irdbgypaU6@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index cc74cecbea7f..ada42e2ca759 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -239,17 +239,15 @@ static bool teo_state_ok(int i, struct cpuidle_driver *drv) * @dev: Target CPU. * @state_idx: Index of the capping idle state. * @duration_ns: Idle duration value to match. - * @no_poll: Don't consider polling states. */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - s64 duration_ns, bool no_poll) + s64 duration_ns) { int i; for (i = state_idx - 1; i >= 0; i--) { - if (dev->states_usage[i].disable || - (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING)) + if (dev->states_usage[i].disable) continue; state_idx = i; @@ -459,7 +457,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * candidate state, a shallower one needs to be found. 
*/ if (drv->states[idx].target_residency_ns > duration_ns) - idx = teo_find_shallower_state(drv, dev, idx, duration_ns, false); + idx = teo_find_shallower_state(drv, dev, idx, duration_ns); /* * If the selected state's target residency is below the tick length @@ -487,7 +485,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, */ if (idx > idx0 && drv->states[idx].target_residency_ns > delta_tick) - idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false); + idx = teo_find_shallower_state(drv, dev, idx, delta_tick); out_tick: *stop_tick = false; From 8f3f01082d7ab334706c7d96c9271cd99e68aabc Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2025 17:24:40 +0100 Subject: [PATCH 07/17] cpuidle: governors: teo: Use s64 consistently in teo_update() Two local variables in teo_update() are defined as u64, but their values are then compared with s64 values, so it is more consistent to use s64 as their data type. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/3026616.e9J7NaK4W3@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index ada42e2ca759..88ed47e868b9 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -157,8 +157,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; - s64 target_residency_ns; - u64 measured_ns; + s64 target_residency_ns, measured_ns; cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; @@ -167,9 +166,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * If one of the safety nets has triggered, assume that this * might have been a long sleep. 
*/ - measured_ns = U64_MAX; + measured_ns = S64_MAX; } else { - u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; + s64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; measured_ns = dev->last_residency_ns; /* From b54df61c7428ff50b21a03a53e3d580c6e84d1bf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2025 19:03:08 +0100 Subject: [PATCH 08/17] cpuidle: governors: teo: Decay metrics below DECAY_SHIFT threshold If a given governor metric falls below a certain value (8 for DECAY_SHIFT equal to 3), it will not decay any more due to the simplistic decay implementation. This may in some cases lead to subtle inconsistencies in the governor behavior, so change the decay implementation to take it into account and set the metric at hand to 0 in that case. Suggested-by: Christian Loehle Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Christian Loehle Link: https://patch.msgid.link/2819353.mvXUDI8C0e@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 88ed47e868b9..8b80d73e518e 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -148,6 +148,16 @@ struct teo_cpu { static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); +static void teo_decay(unsigned int *metric) +{ + unsigned int delta = *metric >> DECAY_SHIFT; + + if (delta) + *metric -= delta; + else + *metric = 0; +} + /** * teo_update - Update CPU metrics after wakeup. * @drv: cpuidle driver containing state data. 
@@ -158,8 +168,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); int i, idx_timer = 0, idx_duration = 0; s64 target_residency_ns, measured_ns; + unsigned int total = 0; - cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; + teo_decay(&cpu_data->short_idles); if (cpu_data->artificial_wakeup) { /* @@ -195,8 +206,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) for (i = 0; i < drv->state_count; i++) { struct teo_bin *bin = &cpu_data->state_bins[i]; - bin->hits -= bin->hits >> DECAY_SHIFT; - bin->intercepts -= bin->intercepts >> DECAY_SHIFT; + teo_decay(&bin->hits); + total += bin->hits; + teo_decay(&bin->intercepts); + total += bin->intercepts; target_residency_ns = drv->states[i].target_residency_ns; @@ -207,7 +220,9 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } - cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT; + cpu_data->total = total + PULSE; + + teo_decay(&cpu_data->tick_intercepts); /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. @@ -221,9 +236,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (TICK_NSEC <= measured_ns) cpu_data->tick_intercepts += PULSE; } - - cpu_data->total -= cpu_data->total >> DECAY_SHIFT; - cpu_data->total += PULSE; } static bool teo_state_ok(int i, struct cpuidle_driver *drv) From 58075aec92a8141fd7f42e1c36d1bc54552c015e Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Tue, 28 Oct 2025 15:48:14 +0530 Subject: [PATCH 09/17] powercap: intel_rapl: Add support for Nova Lake processors Add RAPL support for Intel Nova Lake and Nova Lake L processors using the core defaults configuration. 
Signed-off-by: Kaushlendra Kumar [ rjw: Subject and changelog edits, rebase ] Link: https://patch.msgid.link/20251028101814.3482508-1-kaushlendra.kumar@intel.com Signed-off-by: Rafael J. Wysocki --- drivers/powercap/intel_rapl_common.c | 2 ++ drivers/powercap/intel_rapl_msr.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index cdb4363589e9..57bebd07c7d0 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -1285,6 +1285,8 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { X86_MATCH_VFM(INTEL_LUNARLAKE_M, &rapl_defaults_core), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE, &rapl_defaults_core), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE, &rapl_defaults_core), X86_MATCH_VFM(INTEL_ARROWLAKE_U, &rapl_defaults_core), diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index c4d536c2f989..c6b9a7debc35 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -152,6 +152,8 @@ static const struct x86_cpu_id pl4_support_ids[] = { X86_MATCH_VFM(INTEL_ARROWLAKE_H, NULL), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE, NULL), + X86_MATCH_VFM(INTEL_NOVALAKE_L, NULL), {} }; From 083654ded547238c70e0d4f57115cd1c91245b6e Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 13 Nov 2025 17:56:27 +0100 Subject: [PATCH 10/17] cpuidle: governors: teo: Rework the handling of tick wakeups If the wakeup pattern is clearly dominated by tick wakeups, count those wakeups as hits on the deepest available idle state to increase the likelihood of stopping the tick, especially on systems where there are only 2 usable idle states and the tick can only be stopped when the deeper state is selected. This change is expected to reduce power on some systems where state 0 is selected relatively often even though they are almost idle. Without it, the governor may end up selecting the shallowest idle state all the time even if the system is almost completely idle due to all tick wakeups being counted as hits on that state and preventing the tick from being stopped at all. Fixes: 4b20b07ce72f ("cpuidle: teo: Don't count non-existent intercepts") Reported-by: Reka Norman Closes: https://lore.kernel.org/linux-pm/CAEmPcwsNMNnNXuxgvHTQ93Mx-q3Oz9U57THQsU_qdcCx1m4w5g@mail.gmail.com/ Tested-by: Reka Norman Tested-by: Christian Loehle Cc: 6.11+ # 6.11+: 92ce5c07b7a1: cpuidle: teo: Reorder candidate state index checks Cc: 6.11+ # 6.11+: ea185406d1ed: cpuidle: teo: Combine candidate state index checks against 0 Cc: 6.11+ # 6.11+: b9a6af26bd83: cpuidle: teo: Drop local variable prev_intercept_idx Cc: 6.11+ # 6.11+: e24f8a55de50: cpuidle: teo: Clarify two code comments Cc: 6.11+ # 6.11+: d619b5cc6780: cpuidle: teo: Simplify counting events used for tick management Cc: 6.11+ # 6.11+: 13ed5c4a6d9c: cpuidle: teo: Skip getting the sleep length if wakeups are very frequent Cc: 6.11+ # 6.11+: ddcfa7964677: cpuidle: teo: Simplify handling of total events count Cc: 6.11+ # 6.11+: 65e18e654475: cpuidle: teo: Replace time_span_ns with a flag Cc: 6.11+ # 6.11+: 0796ddf4a7f0: cpuidle: teo: Use this_cpu_ptr() where possible Cc: 6.11+ # 6.11+: 8f3f01082d7a: cpuidle: governors: teo: Use s64 consistently in teo_update() Cc: 6.11+ # 6.11+: b54df61c7428: cpuidle: 
governors: teo: Decay metrics below DECAY_SHIFT threshold Cc: 6.11+ # 6.11+ Signed-off-by: Rafael J. Wysocki [ rjw: Rebase on commit 0796ddf4a7f0, changelog update ] Link: https://patch.msgid.link/6228387.lOV4Wx5bFT@rafael.j.wysocki Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 39 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 8b80d73e518e..94ba00b7617d 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -133,17 +133,19 @@ struct teo_bin { * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. + * @total_tick: Wakeups by the scheduler tick. * @tick_intercepts: "Intercepts" before TICK_NSEC. * @short_idles: Wakeups after short idle periods. - * @artificial_wakeup: Set if the wakeup has been triggered by a safety net. + * @tick_wakeup: Set if the last wakeup was by the scheduler tick. */ struct teo_cpu { s64 sleep_length_ns; struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; + unsigned int total_tick; unsigned int tick_intercepts; unsigned int short_idles; - bool artificial_wakeup; + bool tick_wakeup; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -172,9 +174,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) teo_decay(&cpu_data->short_idles); - if (cpu_data->artificial_wakeup) { + if (dev->poll_time_limit) { + dev->poll_time_limit = false; /* - * If one of the safety nets has triggered, assume that this + * Polling state timeout has triggered, so assume that this * might have been a long sleep. 
*/ measured_ns = S64_MAX; @@ -223,6 +226,21 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->total = total + PULSE; teo_decay(&cpu_data->tick_intercepts); + + teo_decay(&cpu_data->total_tick); + if (cpu_data->tick_wakeup) { + cpu_data->total_tick += PULSE; + /* + * If tick wakeups dominate the wakeup pattern, count this one + * as a hit on the deepest available idle state to increase the + * likelihood of stopping the tick. + */ + if (3 * cpu_data->total_tick > 2 * cpu_data->total) { + cpu_data->state_bins[drv->state_count-1].hits += PULSE; + return; + } + } + /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. @@ -512,18 +530,9 @@ static void teo_reflect(struct cpuidle_device *dev, int state) { struct teo_cpu *cpu_data = this_cpu_ptr(&teo_cpus); + cpu_data->tick_wakeup = tick_nohz_idle_got_tick(); + dev->last_state_idx = state; - if (dev->poll_time_limit || - (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { - /* - * The wakeup was not "genuine", but triggered by one of the - * safety nets. - */ - dev->poll_time_limit = false; - cpu_data->artificial_wakeup = true; - } else { - cpu_data->artificial_wakeup = false; - } } /** From 50db438231dcf7ceac187a6a9c68a1d757b8d883 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 16 Nov 2025 13:34:29 +0100 Subject: [PATCH 11/17] cpuidle: governors: teo: Fix tick_intercepts handling in teo_update() The condition deciding whether or not to increase cpu_data->tick_intercepts in teo_update() is reversed, so fix it. 
Fixes: d619b5cc6780 ("cpuidle: teo: Simplify counting events used for tick management") Cc: 6.14+ # 6.14+: 0796ddf4a7f0: cpuidle: teo: Use this_cpu_ptr() where possible Cc: 6.14+ # 6.14+: 8f3f01082d7a: cpuidle: governors: teo: Use s64 consistently in teo_update() Cc: 6.14+ # 6.14+: b54df61c7428: cpuidle: governors: teo: Decay metrics below DECAY_SHIFT threshold Cc: 6.14+ 6.14+: 083654ded547: cpuidle: governors: teo: Rework the handling of tick wakeups Cc: 6.14+ Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/5085160.31r3eYUQgx@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 94ba00b7617d..85b5517067d1 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -251,7 +251,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->state_bins[idx_timer].hits += PULSE; } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; - if (TICK_NSEC <= measured_ns) + if (measured_ns <= TICK_NSEC) cpu_data->tick_intercepts += PULSE; } } From d834e68a0e8b4a3c673eb96d4d53e48f3c19a81e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 16 Nov 2025 13:35:14 +0100 Subject: [PATCH 12/17] cpuidle: governors: teo: Simplify intercepts-based state lookup Simplify the loop looking up a candidate idle state in the case when an intercept is likely to occur by adding a search for the state index limit if the tick is stopped before it. First, call tick_nohz_tick_stopped() just once and if it returns true, look for the shallowest state index below the current candidate one with target residency at least equal to the tick period length. 
Next, simply look for a state that is not shallower than the one found in the previous step and satisfies the intercepts majority condition (if there are no such states, the shallowest state that is not shallower than the one found in the previous step becomes the new candidate). Since teo_state_ok() has no callers any more after the above changes, drop it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle [ rjw: Changelog clarification and code comment edit ] Link: https://patch.msgid.link/2418792.ElGaqSPkdT@rafael.j.wysocki Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 62 +++++++++------------------------ 1 file changed, 16 insertions(+), 46 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 85b5517067d1..bab186336bf4 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -256,12 +256,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } -static bool teo_state_ok(int i, struct cpuidle_driver *drv) -{ - return !tick_nohz_tick_stopped() || - drv->states[i].target_residency_ns >= TICK_NSEC; -} - /** * teo_find_shallower_state - Find shallower idle state matching given duration. * @drv: cpuidle driver containing state data. @@ -383,7 +377,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * better choice. */ if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { - int first_suitable_idx = idx; + int min_idx = idx0; + + if (tick_nohz_tick_stopped()) { + /* + * Look for the shallowest idle state below the current + * candidate one whose target residency is at least + * equal to the tick period length. 
+ */ + while (min_idx < idx && + drv->states[min_idx].target_residency_ns < TICK_NSEC) + min_idx++; + } /* * Look for the deepest idle state whose target residency had @@ -393,49 +398,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * Take the possible duration limitation present if the tick * has been stopped already into account. */ - intercept_sum = 0; - - for (i = idx - 1; i >= 0; i--) { - struct teo_bin *bin = &cpu_data->state_bins[i]; - - intercept_sum += bin->intercepts; - - if (2 * intercept_sum > idx_intercept_sum) { - /* - * Use the current state unless it is too - * shallow or disabled, in which case take the - * first enabled state that is deep enough. - */ - if (teo_state_ok(i, drv) && - !dev->states_usage[i].disable) { - idx = i; - break; - } - idx = first_suitable_idx; - break; - } + for (i = idx - 1, intercept_sum = 0; i >= min_idx; i--) { + intercept_sum += cpu_data->state_bins[i].intercepts; if (dev->states_usage[i].disable) continue; - if (teo_state_ok(i, drv)) { - /* - * The current state is deep enough, but still - * there may be a better one. - */ - first_suitable_idx = i; - continue; - } - - /* - * The current state is too shallow, so if no suitable - * states other than the initial candidate have been - * found, give up (the remaining states to check are - * shallower still), but otherwise the first suitable - * state other than the initial candidate may turn out - * to be preferable. - */ - if (first_suitable_idx == idx) + idx = i; + if (2 * intercept_sum > idx_intercept_sum) break; } } From 1d6c915819f5b805c35487b6ce5923e31a28266b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 20 Nov 2025 16:05:38 -0800 Subject: [PATCH 13/17] powercap: intel_rapl: Prepare read_raw() interface for atomic-context callers The current read_raw() implementation of the TPMI, MMIO and MSR interfaces does not distinguish between atomic and non-atomic callers. 
rapl_msr_read_raw() uses rdmsrq_safe_on_cpu(), which can sleep and issue cross CPU calls. When MSR-based RAPL PMU support is enabled, PMU event handlers can invoke this function from atomic context where sleeping or rescheduling is not allowed. In atomic context, the caller is already executing on the target CPU, so a direct rdmsrq() is sufficient. To support such usage, introduce an atomic flag to the read_raw() interface to allow callers pass the context information. Modify the common RAPL code to propagate this flag, and set the flag to reflect the calling contexts. Utilize the atomic flag in rapl_msr_read_raw() to perform direct MSR read with rdmsrq() when running in atomic context, and a sanity check to ensure target CPU matches the current CPU for such use cases. The TPMI and MMIO implementations do not require special atomic handling, so the flag is ignored in those paths. This is a preparatory patch for adding MSR-based RAPL PMU support. Signed-off-by: Kuppuswamy Sathyanarayanan Reviewed-by: Srinivas Pandruvada [ rjw: Subject tweak ] Link: https://patch.msgid.link/20251121000539.386069-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/powercap/intel_rapl_common.c | 24 ++++++++++--------- drivers/powercap/intel_rapl_msr.c | 16 ++++++++++++- drivers/powercap/intel_rapl_tpmi.c | 2 +- .../int340x_thermal/processor_thermal_rapl.c | 2 +- include/linux/intel_rapl.h | 2 +- 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/drivers/powercap/intel_rapl_common.c b/drivers/powercap/intel_rapl_common.c index 57bebd07c7d0..47ec34d4c099 100644 --- a/drivers/powercap/intel_rapl_common.c +++ b/drivers/powercap/intel_rapl_common.c @@ -253,7 +253,8 @@ struct rapl_primitive_info { static void rapl_init_domains(struct rapl_package *rp); static int rapl_read_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, - bool xlate, u64 *data); + bool xlate, u64 *data, + bool atomic); static int rapl_write_data_raw(struct rapl_domain *rd, enum rapl_primitives prim, unsigned long long value); @@ -289,7 +290,7 @@ static int get_energy_counter(struct powercap_zone *power_zone, cpus_read_lock(); rd = power_zone_to_rapl_domain(power_zone); - if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { + if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now, false)) { *energy_raw = energy_now; cpus_read_unlock(); @@ -830,7 +831,8 @@ prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim) * 63-------------------------- 31--------------------------- 0 */ static int rapl_read_data_raw(struct rapl_domain *rd, - enum rapl_primitives prim, bool xlate, u64 *data) + enum rapl_primitives prim, bool xlate, u64 *data, + bool atomic) { u64 value; enum rapl_primitives prim_fixed = prim_fixups(rd, prim); @@ -852,7 +854,7 @@ static int rapl_read_data_raw(struct rapl_domain *rd, ra.mask = rpi->mask; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, atomic)) { pr_debug("failed to read reg 0x%llx for %s:%s\n", ra.reg.val, rd->rp->name, rd->name); return -EIO; } @@ -904,7 +906,7 @@ static int rapl_read_pl_data(struct rapl_domain *rd, int 
pl, if (!is_pl_valid(rd, pl)) return -EINVAL; - return rapl_read_data_raw(rd, prim, xlate, data); + return rapl_read_data_raw(rd, prim, xlate, data, false); } static int rapl_write_pl_data(struct rapl_domain *rd, int pl, @@ -941,7 +943,7 @@ static int rapl_check_unit_core(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -969,7 +971,7 @@ static int rapl_check_unit_atom(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1156,7 +1158,7 @@ static int rapl_check_unit_tpmi(struct rapl_domain *rd) ra.reg = rd->regs[RAPL_DOMAIN_REG_UNIT]; ra.mask = ~0; - if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra)) { + if (rd->rp->priv->read_raw(get_rid(rd->rp), &ra, false)) { pr_err("Failed to read power unit REG 0x%llx on %s:%s, exit.\n", ra.reg.val, rd->rp->name, rd->name); return -ENODEV; @@ -1328,7 +1330,7 @@ static void rapl_update_domain_data(struct rapl_package *rp) struct rapl_primitive_info *rpi = get_rpi(rp, prim); if (!rapl_read_data_raw(&rp->domains[dmn], prim, - rpi->unit, &val)) + rpi->unit, &val, false)) rp->domains[dmn].rdd.primitives[prim] = val; } } @@ -1428,7 +1430,7 @@ static int rapl_check_domain(int domain, struct rapl_package *rp) */ ra.mask = ENERGY_STATUS_MASK; - if (rp->priv->read_raw(get_rid(rp), &ra) || !ra.value) + if (rp->priv->read_raw(get_rid(rp), &ra, false) || !ra.value) return -ENODEV; return 0; @@ -1639,7 +1641,7 @@ static u64 event_read_counter(struct perf_event *event) if (event->hw.idx < 0) return 0; - ret = 
rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val); + ret = rapl_read_data_raw(&rp->domains[event->hw.idx], ENERGY_COUNTER, false, &val, true); /* Return 0 for failed read */ if (ret) diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index c6b9a7debc35..6e3c50af0912 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -102,12 +102,26 @@ static int rapl_cpu_down_prep(unsigned int cpu) return 0; } -static int rapl_msr_read_raw(int cpu, struct reg_action *ra) +static int rapl_msr_read_raw(int cpu, struct reg_action *ra, bool atomic) { + /* + * When called from atomic-context (eg PMU event handler) + * perform MSR read directly using rdmsrq(). + */ + if (atomic) { + if (unlikely(smp_processor_id() != cpu)) + return -EIO; + + rdmsrq(ra->reg.msr, ra->value); + goto out; + } + if (rdmsrq_safe_on_cpu(cpu, ra->reg.msr, &ra->value)) { pr_debug("failed to read msr 0x%x on cpu %d\n", ra->reg.msr, cpu); return -EIO; } + +out: ra->value &= ra->mask; return 0; } diff --git a/drivers/powercap/intel_rapl_tpmi.c b/drivers/powercap/intel_rapl_tpmi.c index 82201bf4685d..0a0b85f4528b 100644 --- a/drivers/powercap/intel_rapl_tpmi.c +++ b/drivers/powercap/intel_rapl_tpmi.c @@ -60,7 +60,7 @@ static DEFINE_MUTEX(tpmi_rapl_lock); static struct powercap_control_type *tpmi_control_type; -static int tpmi_rapl_read_raw(int id, struct reg_action *ra) +static int tpmi_rapl_read_raw(int id, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) return -EINVAL; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c index bde2cc386afd..bf51a17c5be6 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rapl.c @@ -19,7 +19,7 @@ static const struct rapl_mmio_regs rapl_mmio_default = { .limits[RAPL_DOMAIN_DRAM] = BIT(POWER_LIMIT2), }; 
-static int rapl_mmio_read_raw(int cpu, struct reg_action *ra) +static int rapl_mmio_read_raw(int cpu, struct reg_action *ra, bool atomic) { if (!ra->reg.mmio) return -EINVAL; diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index c0397423d3a8..e9ade2ff4af6 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -152,7 +152,7 @@ struct rapl_if_priv { union rapl_reg reg_unit; union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; - int (*read_raw)(int id, struct reg_action *ra); + int (*read_raw)(int id, struct reg_action *ra, bool atomic); int (*write_raw)(int id, struct reg_action *ra); void *defaults; void *rpi; From 748d6ba43afde7e9ac27443233203995cc15d235 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 20 Nov 2025 16:05:39 -0800 Subject: [PATCH 14/17] powercap: intel_rapl: Enable MSR-based RAPL PMU support Currently, RAPL PMU support requires adding CPU model entries to arch/x86/events/rapl.c for each new generation. However, RAPL MSRs are not architectural and require platform-specific customization, making arch/x86 an inappropriate location for this functionality. The powercap subsystem already handles RAPL functionality and is the natural place to consolidate all RAPL features. The powercap RAPL driver already includes PMU support for TPMI-based RAPL interfaces, making it straightforward to extend this support to MSR-based RAPL interfaces as well. This consolidation eliminates the need to maintain RAPL support in multiple subsystems and provides a unified approach for both TPMI and MSR-based RAPL implementations. The MSR-based PMU support includes the following updates: 1. Register MSR-based PMU support for the supported platforms and unregister it when no online CPUs remain in the package. 2. Remove existing checks that restrict RAPL PMU support to TPMI-based interfaces and extend the logic to allow MSR-based RAPL interfaces. 3. 
Define a CPU model list to determine which processors should register the RAPL PMU interface through the powercap driver for MSR-based RAPL, excluding those that support the TPMI interface.
} diff --git a/drivers/powercap/intel_rapl_msr.c b/drivers/powercap/intel_rapl_msr.c index 6e3c50af0912..0ce1096b6314 100644 --- a/drivers/powercap/intel_rapl_msr.c +++ b/drivers/powercap/intel_rapl_msr.c @@ -33,6 +33,8 @@ /* private data for RAPL MSR Interface */ static struct rapl_if_priv *rapl_msr_priv; +static bool rapl_msr_pmu __ro_after_init; + static struct rapl_if_priv rapl_msr_priv_intel = { .type = RAPL_IF_MSR, .reg_unit.msr = MSR_RAPL_POWER_UNIT, @@ -79,6 +81,8 @@ static int rapl_cpu_online(unsigned int cpu) rp = rapl_add_package_cpuslocked(cpu, rapl_msr_priv, true); if (IS_ERR(rp)) return PTR_ERR(rp); + if (rapl_msr_pmu) + rapl_package_add_pmu(rp); } cpumask_set_cpu(cpu, &rp->cpumask); return 0; @@ -95,10 +99,14 @@ static int rapl_cpu_down_prep(unsigned int cpu) cpumask_clear_cpu(cpu, &rp->cpumask); lead_cpu = cpumask_first(&rp->cpumask); - if (lead_cpu >= nr_cpu_ids) + if (lead_cpu >= nr_cpu_ids) { + if (rapl_msr_pmu) + rapl_package_remove_pmu(rp); rapl_remove_package_cpuslocked(rp); - else if (rp->lead_cpu == cpu) + } else if (rp->lead_cpu == cpu) { rp->lead_cpu = lead_cpu; + } + return 0; } @@ -171,6 +179,13 @@ static const struct x86_cpu_id pl4_support_ids[] = { {} }; +/* List of MSR-based RAPL PMU support CPUs */ +static const struct x86_cpu_id pmu_support_ids[] = { + X86_MATCH_VFM(INTEL_PANTHERLAKE_L, NULL), + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, NULL), + {} +}; + static int rapl_msr_probe(struct platform_device *pdev) { const struct x86_cpu_id *id = x86_match_cpu(pl4_support_ids); @@ -198,6 +213,11 @@ static int rapl_msr_probe(struct platform_device *pdev) pr_info("PL4 support detected.\n"); } + if (x86_match_cpu(pmu_support_ids)) { + rapl_msr_pmu = true; + pr_info("MSR-based RAPL PMU support enabled\n"); + } + rapl_msr_priv->control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); if (IS_ERR(rapl_msr_priv->control_type)) { pr_debug("failed to register powercap control_type.\n"); From 15bfdadd617ec5363802f7cb6a0385b6569f374e Mon Sep 
17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 21 Nov 2025 21:11:16 +0100 Subject: [PATCH 15/17] cpuidle: governors: teo: Add missing space to the description There is a missing space in the governor description comment, so add it. No functional impact. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/5059034.31r3eYUQgx@rafael.j.wysocki --- drivers/cpuidle/governors/teo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bab186336bf4..81ac5fd58a1c 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -76,7 +76,7 @@ * likely woken up by a non-timer wakeup source). * * 2. If the second sum computed in step 1 is greater than a half of the sum of - * both metrics for the candidate state bin and all subsequent bins(if any), + * both metrics for the candidate state bin and all subsequent bins (if any), * a shallower idle state is likely to be more suitable, so look for it. * * - Traverse the enabled idle states shallower than the candidate one in the From 6d96ceff9aeb7e7a1713faaccf472f363cc6d48f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Nov 2025 21:57:52 +0100 Subject: [PATCH 16/17] cpuidle: Update header inclusion While cleaning up some headers, I got a build error on this file: drivers/cpuidle/poll_state.c:52:2: error: call to undeclared library function 'snprintf' with type 'int (char *restrict, unsigned long, const char *restrict, ...)'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] Update header inclusions to follow IWYU (Include What You Use) principle. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20251124205752.1328701-1-andriy.shevchenko@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/poll_state.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c index 9b6d90a72601..c7524e4c522a 100644 --- a/drivers/cpuidle/poll_state.c +++ b/drivers/cpuidle/poll_state.c @@ -4,9 +4,13 @@ */ #include +#include +#include #include #include #include +#include +#include #define POLL_IDLE_RELAX_COUNT 200 From 4bf944f3fcb6c192af1ea73e3d183b6364458b25 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Nov 2025 17:23:12 +0100 Subject: [PATCH 17/17] cpuidle: Warn instead of bailing out if target residency check fails It turns out that the change in commit 76934e495cdc ("cpuidle: Add sanity check for exit latency and target residency") goes too far because there are systems in the field on which the check introduced by that commit does not pass. For this reason, change __cpuidle_driver_init() return type back to void and make it print a warning when the check mentioned above does not pass. Fixes: 76934e495cdc ("cpuidle: Add sanity check for exit latency and target residency") Reported-by: Val Packett Closes: https://lore.kernel.org/linux-pm/20251121010756.6687-1-val@packett.cool/ Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/2808566.mvXUDI8C0e@rafael.j.wysocki --- drivers/cpuidle/driver.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 1c295a93d582..370664c47e65 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -8,6 +8,8 @@ * This code is licenced under the GPL. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -152,7 +154,7 @@ static void cpuidle_setup_broadcast_timer(void *arg) * __cpuidle_driver_init - initialize the driver's internal data * @drv: a valid pointer to a struct cpuidle_driver */ -static int __cpuidle_driver_init(struct cpuidle_driver *drv) +static void __cpuidle_driver_init(struct cpuidle_driver *drv) { int i; @@ -195,15 +197,13 @@ static int __cpuidle_driver_init(struct cpuidle_driver *drv) s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC); /* - * Ensure that the exit latency of a CPU idle state does not - * exceed its target residency which is assumed in cpuidle in - * multiple places. + * Warn if the exit latency of a CPU idle state exceeds its + * target residency which is assumed to never happen in cpuidle + * in multiple places. */ if (s->exit_latency_ns > s->target_residency_ns) - return -EINVAL; + pr_warn("Idle state %d target residency too low\n", i); } - - return 0; } /** @@ -233,9 +233,7 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv) if (cpuidle_disabled()) return -ENODEV; - ret = __cpuidle_driver_init(drv); - if (ret) - return ret; + __cpuidle_driver_init(drv); ret = __cpuidle_set_driver(drv); if (ret)