From 6db0f533d320fab54154b0207e9df108427dd939 Mon Sep 17 00:00:00 2001 From: Zihuan Zhang Date: Sat, 11 Oct 2025 15:24:20 +0800 Subject: [PATCH 1/9] cpufreq: preserve freq_table_sorted across suspend/hibernate During S3/S4 suspend and resume, cpufreq policies are not freed or recreated; the freq_table and policy structure remain intact. However, set_freq_table_sorted() currently resets policy->freq_table_sorted to UNSORTED unconditionally, which is unnecessary since the table order does not change across suspend/resume. This patch adds a check to skip validation if policy->freq_table_sorted is already ASCENDING or DESCENDING. This avoids unnecessary traversal of the frequency table on S3/S4 resume or repeated online events, reducing overhead while preserving correctness. Signed-off-by: Zihuan Zhang Link: https://patch.msgid.link/20251011072420.11495-1-zhangzihuan@kylinos.cn Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 852e024facc3..4a27f6cb07d3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1421,9 +1421,12 @@ static int cpufreq_policy_online(struct cpufreq_policy *policy, * If there is a problem with its frequency table, take it * offline and drop it. */ - ret = cpufreq_table_validate_and_sort(policy); - if (ret) - goto out_offline_policy; + if (policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_ASCENDING && + policy->freq_table_sorted != CPUFREQ_TABLE_SORTED_DESCENDING) { + ret = cpufreq_table_validate_and_sort(policy); + if (ret) + goto out_offline_policy; + } /* related_cpus should at least include policy->cpus. */ cpumask_copy(policy->related_cpus, policy->cpus); From 528dde6619677ac6dc26d9dda1e3c9014b4a08c8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Oct 2025 15:46:40 +0200 Subject: [PATCH 2/9] cpufreq: intel_pstate: Add and use hybrid_get_cpu_type() Introduce a function for identifying the type of a given CPU in a hybrid system, called hybrid_get_cpu_type(), and use if for hybrid scaling factor determination in hwp_get_cpu_scaling(). Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/1954386.tdWV9SEqCh@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 38897bb14a2c..22316c930864 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -912,6 +912,11 @@ static struct freq_attr *hwp_cpufreq_attrs[] = { [HWP_CPUFREQ_ATTR_COUNT] = NULL, }; +static u8 hybrid_get_cpu_type(unsigned int cpu) +{ + return cpu_data(cpu).topo.intel_type; +} + static bool no_cas __ro_after_init; static struct cpudata *hybrid_max_perf_cpu __read_mostly; @@ -2298,18 +2303,14 @@ static int knl_get_turbo_pstate(int cpu) static int hwp_get_cpu_scaling(int cpu) { if (hybrid_scaling_factor) { - struct cpuinfo_x86 *c = &cpu_data(cpu); - u8 cpu_type = c->topo.intel_type; - /* * Return the hybrid scaling factor for P-cores and use the * default core scaling for E-cores. */ - if (cpu_type == INTEL_CPU_TYPE_CORE) + if (hybrid_get_cpu_type(cpu) == INTEL_CPU_TYPE_CORE) return hybrid_scaling_factor; - if (cpu_type == INTEL_CPU_TYPE_ATOM) - return core_get_scaling(); + return core_get_scaling(); } /* Use core scaling on non-hybrid systems. */ From c17add73498245bd94cb8a05345c73366606e671 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Oct 2025 15:47:45 +0200 Subject: [PATCH 3/9] cpufreq: intel_pstate: Add and use hybrid_has_l3() Introduce a function for checking whether or not a given CPU has L3 cache, called hybrid_has_l3(), and use it in hybrid_get_cost() for computing cost coefficients associated with a given perf domain. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/13884343.uLZWGnKmhe@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 22316c930864..f85056ee6e61 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -951,11 +951,26 @@ static int hybrid_active_power(struct device *dev, unsigned long *power, return 0; } +static bool hybrid_has_l3(unsigned int cpu) +{ + struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(cpu); + unsigned int i; + + if (!cacheinfo) + return false; + + for (i = 0; i < cacheinfo->num_leaves; i++) { + if (cacheinfo->info_list[i].level == 3) + return true; + } + + return false; +} + static int hybrid_get_cost(struct device *dev, unsigned long freq, unsigned long *cost) { struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; - struct cpu_cacheinfo *cacheinfo = get_cpu_cacheinfo(dev->id); /* * The smaller the perf-to-frequency scaling factor, the larger the IPC @@ -973,17 +988,8 @@ static int hybrid_get_cost(struct device *dev, unsigned long freq, * touching it in case some other CPUs of the same type can do the work * without it. */ - if (cacheinfo) { - unsigned int i; - - /* Check if L3 cache is there. */ - for (i = 0; i < cacheinfo->num_leaves; i++) { - if (cacheinfo->info_list[i].level == 3) { - *cost += 2; - break; - } - } - } + if (hybrid_has_l3(dev->id)) + *cost += 2; return 0; } From d852b6f67b71dd22cd2af8ee29306eccbd6c06bf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 16 Oct 2025 18:22:13 +0200 Subject: [PATCH 4/9] cpufreq: intel_pstate: hybrid: Adjust energy model rules Instead of using HWP-to-frequency scaling factors for computing cost coefficients in the energy model used on hybrid systems, which is fragile, rely on CPU type information that is easily accessible now and the information on whether or not L3 cache is present for this purpose. This also allows the cost coefficients for P-cores to be adjusted so that they start to be populated somewhat earlier (that is, before E-cores are loaded up to their full capacity). In addition to the above, replace an inaccurate comment regarding the reason why the freq value is added to the cost in hybrid_get_cost(). Signed-off-by: Rafael J. Wysocki Reviewed-by: Dietmar Eggemann Reviewed-by: Yaxiong Tian Link: https://patch.msgid.link/5932894.DvuYhMxLoT@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 35 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index f85056ee6e61..3ed8a0001b2f 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -933,11 +933,8 @@ static int hybrid_active_power(struct device *dev, unsigned long *power, unsigned long *freq) { /* - * Create "utilization bins" of 0-40%, 40%-60%, 60%-80%, and 80%-100% - * of the maximum capacity such that two CPUs of the same type will be - * regarded as equally attractive if the utilization of each of them - * falls into the same bin, which should prevent tasks from being - * migrated between them too often. + * Create four "states" corresponding to 40%, 60%, 80%, and 100% of the + * full capacity. * * For this purpose, return the "frequency" of 2 for the first * performance level and otherwise leave the value set by the caller. @@ -970,26 +967,22 @@ static bool hybrid_has_l3(unsigned int cpu) static int hybrid_get_cost(struct device *dev, unsigned long freq, unsigned long *cost) { - struct pstate_data *pstate = &all_cpu_data[dev->id]->pstate; - + /* Facilitate load balancing between CPUs of the same type. */ + *cost = freq; /* - * The smaller the perf-to-frequency scaling factor, the larger the IPC - * ratio between the given CPU and the least capable CPU in the system. - * Regard that IPC ratio as the primary cost component and assume that - * the scaling factors for different CPU types will differ by at least - * 5% and they will not be above INTEL_PSTATE_CORE_SCALING. + * Adjust the cost depending on CPU type. * - * Add the freq value to the cost, so that the cost of running on CPUs - * of the same type in different "utilization bins" is different. + * The idea is to start loading up LPE-cores before E-cores and start + * to populate E-cores when LPE-cores are utilized above 60% of the + * capacity. Similarly, P-cores start to be populated when E-cores are + * utilized above 60% of the capacity. */ - *cost = div_u64(100ULL * INTEL_PSTATE_CORE_SCALING, pstate->scaling) + freq; - /* - * Increase the cost slightly for CPUs able to access L3 to avoid - * touching it in case some other CPUs of the same type can do the work - * without it. - */ - if (hybrid_has_l3(dev->id)) + if (hybrid_get_cpu_type(dev->id) == INTEL_CPU_TYPE_ATOM) { + if (hybrid_has_l3(dev->id)) /* E-core */ + *cost += 1; + } else { /* P-core */ *cost += 2; + } return 0; } From 5313ec4a215a0c4af8fd927b103d31c2c93e961f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 15 Oct 2025 15:50:56 +0200 Subject: [PATCH 5/9] cpufreq: intel_pstate: Improve printing of debug messages Some debug messages generated by intel_pstate on a given hybrid system are only printed for some CPUs which is confusing, so modify the driver to print them for all CPUs. Also change those messages to avoid printing local variable names in them. Moreover, some debug messages printed by intel_pstate are quite hard to understand without looking at the code printing them, so make them somewhat clearer while at it. Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/8609836.T7Z3S40VBb@rafael.j.wysocki --- drivers/cpufreq/intel_pstate.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 3ed8a0001b2f..7d2a1aec3a61 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -575,13 +575,18 @@ static void intel_pstate_hybrid_hwp_adjust(struct cpudata *cpu) int scaling = cpu->pstate.scaling; int freq; - pr_debug("CPU%d: perf_ctl_max_phys = %d\n", cpu->cpu, perf_ctl_max_phys); - pr_debug("CPU%d: perf_ctl_turbo = %d\n", cpu->cpu, perf_ctl_turbo); - pr_debug("CPU%d: perf_ctl_scaling = %d\n", cpu->cpu, perf_ctl_scaling); + pr_debug("CPU%d: PERF_CTL max_phys = %d\n", cpu->cpu, perf_ctl_max_phys); + pr_debug("CPU%d: PERF_CTL turbo = %d\n", cpu->cpu, perf_ctl_turbo); + pr_debug("CPU%d: PERF_CTL scaling = %d\n", cpu->cpu, perf_ctl_scaling); pr_debug("CPU%d: HWP_CAP guaranteed = %d\n", cpu->cpu, cpu->pstate.max_pstate); pr_debug("CPU%d: HWP_CAP highest = %d\n", cpu->cpu, cpu->pstate.turbo_pstate); pr_debug("CPU%d: HWP-to-frequency scaling factor: %d\n", cpu->cpu, scaling); + if (scaling == perf_ctl_scaling) + return; + + hwp_is_hybrid = true; + cpu->pstate.turbo_freq = rounddown(cpu->pstate.turbo_pstate * scaling, perf_ctl_scaling); cpu->pstate.max_freq = rounddown(cpu->pstate.max_pstate * scaling, @@ -1044,9 +1049,9 @@ static void hybrid_set_cpu_capacity(struct cpudata *cpu) topology_set_cpu_scale(cpu->cpu, arch_scale_cpu_capacity(cpu->cpu)); - pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu, - cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf, - cpu->pstate.max_pstate_physical); + pr_debug("CPU%d: capacity perf = %u, base perf = %u, sys max perf = %u\n", + cpu->cpu, cpu->capacity_perf, cpu->pstate.max_pstate_physical, + hybrid_max_perf_cpu->capacity_perf); } static void hybrid_clear_cpu_capacity(unsigned int cpunum) @@ -2344,11 +2349,10 @@ static void intel_pstate_set_min_pstate(struct cpudata *cpu) static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) { - int perf_ctl_max_phys = pstate_funcs.get_max_physical(cpu->cpu); int perf_ctl_scaling = pstate_funcs.get_scaling(); + cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(cpu->cpu); cpu->pstate.min_pstate = pstate_funcs.get_min(cpu->cpu); - cpu->pstate.max_pstate_physical = perf_ctl_max_phys; cpu->pstate.perf_ctl_scaling = perf_ctl_scaling; if (hwp_active && !hwp_mode_bdw) { @@ -2356,10 +2360,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (pstate_funcs.get_cpu_scaling) { cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu); - if (cpu->pstate.scaling != perf_ctl_scaling) { - intel_pstate_hybrid_hwp_adjust(cpu); - hwp_is_hybrid = true; - } + intel_pstate_hybrid_hwp_adjust(cpu); } else { cpu->pstate.scaling = perf_ctl_scaling; } From ace04717749d20e34dac9f78c5ac772168232b67 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 17 Oct 2025 17:33:54 +0200 Subject: [PATCH 6/9] cpufreq: Replace deprecated strcpy() in cpufreq_unregister_governor() strcpy() is deprecated; assign the NUL terminator directly instead. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum [ rjw: Subject tweaks ] Link: https://patch.msgid.link/20251017153354.82009-2-thorsten.blum@linux.dev Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 4a27f6cb07d3..4472bb1ec83c 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2553,7 +2553,7 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor) for_each_inactive_policy(policy) { if (!strcmp(policy->last_governor, governor->name)) { policy->governor = NULL; - strcpy(policy->last_governor, "\0"); + policy->last_governor[0] = '\0'; } } read_unlock_irqrestore(&cpufreq_driver_lock, flags); From cb908f8b0acc7e28b93e653c2a521dd090d8b99e Mon Sep 17 00:00:00 2001 From: Swaraj Gaikwad Date: Wed, 29 Oct 2025 13:47:37 +0000 Subject: [PATCH 7/9] Documentation: intel_pstate: fix duplicate hyperlink target errors Fix reST warnings in Documentation/admin-guide/pm/intel_pstate.rst caused by missing explicit hyperlink labels for section titles. Before this change, the following errors were printed during `make htmldocs`: Documentation/admin-guide/pm/intel_pstate.rst:401: ERROR: Indirect hyperlink target (id="id6") refers to target "passive mode", which is a duplicate, and cannot be used as a unique reference. Documentation/admin-guide/pm/intel_pstate.rst:517: ERROR: Indirect hyperlink target (id="id9") refers to target "active mode", which is a duplicate, and cannot be used as a unique reference. Documentation/admin-guide/pm/intel_pstate.rst:611: ERROR: Indirect hyperlink target (id="id15") refers to target "global attributes", which is a duplicate, and cannot be used as a unique reference. ERROR: Duplicate target name, cannot be used as a unique reference: "passive mode", "active mode", "global attributes". These errors occurred because the sections "Active Mode", "Active Mode With HWP", "Passive Mode", and "Global Attributes" did not define explicit hyperlink labels. As a result, Sphinx auto-generated duplicate anchors when the same titles appeared multiple times within the document. Because of this, the generated HTML documentation contained broken references such as: `active mode `_ `passive mode `_ `global attributes `_ This patch adds explicit hyperlink labels for the affected sections, ensuring all references are unique and correctly resolved. After applying this patch, `make htmldocs` completes without any warnings, and all hyperlinks in intel_pstate.html render properly. Signed-off-by: Swaraj Gaikwad Reviewed-by: Bagas Sanjaya Acked-by: Randy Dunlap Tested-by: Randy Dunlap [ rjw: Subject adjustment ] Link: https://patch.msgid.link/20251029134737.42229-1-swarajgaikwad1925@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_pstate.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 26e702c7016e..9cdd9dad6516 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -62,6 +62,8 @@ a certain performance scaling algorithm. Which of them will be in effect depends on what kernel command line options are used and on the capabilities of the processor. +.. _Active Mode: + Active Mode ----------- @@ -94,6 +96,8 @@ Which of the P-state selection algorithms is used by default depends on the Namely, if that option is set, the ``performance`` algorithm will be used by default, and the other one will be used by default if it is not set. +.. _Active Mode With HWP: + Active Mode With HWP ~~~~~~~~~~~~~~~~~~~~ @@ -192,6 +196,8 @@ This is the default P-state selection algorithm if the :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option is not set. +.. _Passive Mode: + Passive Mode ------------ @@ -432,6 +438,8 @@ the ``energy_model`` directory in ``debugfs`` (typlically mounted on User Space Interface in ``sysfs`` ================================= +.. _Global Attributes: + Global Attributes ----------------- From 790e826be8994d4510146458ebf5eee6f3267a3a Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 22 Oct 2025 14:54:25 -0700 Subject: [PATCH 8/9] cpufreq: intel_pstate: Add Diamond Rapids OOB mode support Prevent intel_pstate from loading when Out-of-Band (OOB) P-states mode is enabled. The OOB identification mechanism for Diamond Rapids servers is the same as for prior generation CPUs such as Granite Rapids. Add the Diamond Rapids CPU model to intel_pstate_cpu_oob_ids[] to ensure correct OOB handling. Signed-off-by: Kuppuswamy Sathyanarayanan Link: https://patch.msgid.link/20251022215425.3566218-1-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 7d2a1aec3a61..56c28411e130 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -2762,6 +2762,7 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = { X86_MATCH(INTEL_ATOM_CRESTMONT, core_funcs), X86_MATCH(INTEL_ATOM_CRESTMONT_X, core_funcs), X86_MATCH(INTEL_ATOM_DARKMONT_X, core_funcs), + X86_MATCH(INTEL_DIAMONDRAPIDS_X, core_funcs), {} }; #endif From 4ab25c92147663a7ce3187bd9075eeb2709a415b Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Sat, 1 Nov 2025 12:56:14 +0700 Subject: [PATCH 9/9] Documentation: intel-pstate: Use :ref: directive for internal linking intel_pstate docs uses standard reST construct (`Section title`_) for cross-referencing sections (internal linking), rather than for external links. Incorrect cross-references are not caught when these are written in that syntax, however (fortunately docutils 0.22 raise duplicate target warnings that get fixed in cb908f8b0acc7e ("Documentation: intel_pstate: fix duplicate hyperlink target errors")). Convert the cross-references to use :ref: directive, which doesn't exhibit this problem. Signed-off-by: Bagas Sanjaya Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap [ rjw: Changelog tweak ] Link: https://patch.msgid.link/20251101055614.32270-1-bagasdotme@gmail.com Signed-off-by: Rafael J. Wysocki --- Documentation/admin-guide/pm/intel_pstate.rst | 133 +++++++++--------- 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/Documentation/admin-guide/pm/intel_pstate.rst b/Documentation/admin-guide/pm/intel_pstate.rst index 9cdd9dad6516..fde967b0c2e0 100644 --- a/Documentation/admin-guide/pm/intel_pstate.rst +++ b/Documentation/admin-guide/pm/intel_pstate.rst @@ -48,8 +48,9 @@ only way to pass early-configuration-time parameters to it is via the kernel command line. However, its configuration can be adjusted via ``sysfs`` to a great extent. In some configurations it even is possible to unregister it via ``sysfs`` which allows another ``CPUFreq`` scaling driver to be loaded and -registered (see `below `_). +registered (see :ref:`below `). +.. _operation_modes: Operation Modes =============== @@ -62,7 +63,7 @@ a certain performance scaling algorithm. Which of them will be in effect depends on what kernel command line options are used and on the capabilities of the processor. -.. _Active Mode: +.. _active_mode: Active Mode ----------- @@ -96,7 +97,7 @@ Which of the P-state selection algorithms is used by default depends on the Namely, if that option is set, the ``performance`` algorithm will be used by default, and the other one will be used by default if it is not set. -.. _Active Mode With HWP: +.. _active_mode_hwp: Active Mode With HWP ~~~~~~~~~~~~~~~~~~~~ @@ -127,7 +128,7 @@ Energy-Performance Bias (EPB) knob (otherwise), which means that the processor's internal P-state selection logic is expected to focus entirely on performance. This will override the EPP/EPB setting coming from the ``sysfs`` interface -(see `Energy vs Performance Hints`_ below). Moreover, any attempts to change +(see :ref:`energy_performance_hints` below). Moreover, any attempts to change the EPP/EPB to a value different from 0 ("performance") via ``sysfs`` in this configuration will be rejected. @@ -196,7 +197,7 @@ This is the default P-state selection algorithm if the :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option is not set. -.. _Passive Mode: +.. _passive_mode: Passive Mode ------------ @@ -295,12 +296,12 @@ Unlike ``_PSS`` objects in the ACPI tables, ``intel_pstate`` always exposes the entire range of available P-states, including the whole turbo range, to the ``CPUFreq`` core and (in the passive mode) to generic scaling governors. This generally causes turbo P-states to be set more often when ``intel_pstate`` is -used relative to ACPI-based CPU performance scaling (see `below `_ -for more information). +used relative to ACPI-based CPU performance scaling (see +:ref:`below ` for more information). Moreover, since ``intel_pstate`` always knows what the real turbo threshold is (even if the Configurable TDP feature is enabled in the processor), its -``no_turbo`` attribute in ``sysfs`` (described `below `_) should +``no_turbo`` attribute in ``sysfs`` (described :ref:`below `) should work as expected in all cases (that is, if set to disable turbo P-states, it always should prevent ``intel_pstate`` from using them). @@ -313,12 +314,12 @@ pieces of information on it to be known, including: * The minimum supported P-state. - * The maximum supported `non-turbo P-state `_. + * The maximum supported :ref:`non-turbo P-state `. * Whether or not turbo P-states are supported at all. - * The maximum supported `one-core turbo P-state `_ (if turbo P-states - are supported). + * The maximum supported :ref:`one-core turbo P-state ` (if turbo + P-states are supported). * The scaling formula to translate the driver's internal representation of P-states into frequencies and the other way around. @@ -406,10 +407,10 @@ Energy-Aware Scheduling Support If ``CONFIG_ENERGY_MODEL`` has been set during kernel configuration and ``intel_pstate`` runs on a hybrid processor without SMT, in addition to enabling -`CAS `_ it registers an Energy Model for the processor. This allows the +:ref:`CAS` it registers an Energy Model for the processor. This allows the Energy-Aware Scheduling (EAS) support to be enabled in the CPU scheduler if ``schedutil`` is used as the ``CPUFreq`` governor which requires ``intel_pstate`` -to operate in the `passive mode `_. +to operate in the :ref:`passive mode `. The Energy Model registered by ``intel_pstate`` is artificial (that is, it is based on abstract cost values and it does not include any real power numbers) @@ -438,7 +439,7 @@ the ``energy_model`` directory in ``debugfs`` (typlically mounted on User Space Interface in ``sysfs`` ================================= -.. _Global Attributes: +.. _global_attributes: Global Attributes ----------------- @@ -452,8 +453,8 @@ argument is passed to the kernel in the command line. ``max_perf_pct`` Maximum P-state the driver is allowed to set in percent of the - maximum supported performance level (the highest supported `turbo - P-state `_). + maximum supported performance level (the highest supported :ref:`turbo + P-state `). This attribute will not be exposed if the ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel @@ -461,8 +462,8 @@ argument is passed to the kernel in the command line. ``min_perf_pct`` Minimum P-state the driver is allowed to set in percent of the - maximum supported performance level (the highest supported `turbo - P-state `_). + maximum supported performance level (the highest supported :ref:`turbo + P-state `). This attribute will not be exposed if the ``intel_pstate=per_cpu_perf_limits`` argument is present in the kernel @@ -471,18 +472,18 @@ argument is passed to the kernel in the command line. ``num_pstates`` Number of P-states supported by the processor (between 0 and 255 inclusive) including both turbo and non-turbo P-states (see - `Turbo P-states Support`_). + :ref:`turbo`). This attribute is present only if the value exposed by it is the same for all of the CPUs in the system. The value of this attribute is not affected by the ``no_turbo`` - setting described `below `_. + setting described :ref:`below `. This attribute is read-only. ``turbo_pct`` - Ratio of the `turbo range `_ size to the size of the entire + Ratio of the :ref:`turbo range ` size to the size of the entire range of supported P-states, in percent. This attribute is present only if the value exposed by it is the same @@ -494,7 +495,7 @@ argument is passed to the kernel in the command line. ``no_turbo`` If set (equal to 1), the driver is not allowed to set any turbo P-states - (see `Turbo P-states Support`_). If unset (equal to 0, which is the + (see :ref:`turbo`). If unset (equal to 0, which is the default), turbo P-states can be set by the driver. [Note that ``intel_pstate`` does not support the general ``boost`` attribute (supported by some other scaling drivers) which is replaced @@ -503,11 +504,11 @@ argument is passed to the kernel in the command line. This attribute does not affect the maximum supported frequency value supplied to the ``CPUFreq`` core and exposed via the policy interface, but it affects the maximum possible value of per-policy P-state limits - (see `Interpretation of Policy Attributes`_ below for details). + (see :ref:`policy_attributes_interpretation` below for details). ``hwp_dynamic_boost`` This attribute is only present if ``intel_pstate`` works in the - `active mode with the HWP feature enabled `_ in + :ref:`active mode with the HWP feature enabled ` in the processor. If set (equal to 1), it causes the minimum P-state limit to be increased dynamically for a short time whenever a task previously waiting on I/O is selected to run on a given logical CPU (the purpose @@ -522,12 +523,12 @@ argument is passed to the kernel in the command line. Operation mode of the driver: "active", "passive" or "off". "active" - The driver is functional and in the `active mode - `_. + The driver is functional and in the :ref:`active mode + `. "passive" - The driver is functional and in the `passive mode - `_. + The driver is functional and in the :ref:`passive mode + `. "off" The driver is not functional (it is not registered as a scaling @@ -555,13 +556,15 @@ argument is passed to the kernel in the command line. attribute to "1" enables the energy-efficiency optimizations and setting to "0" disables them. +.. _policy_attributes_interpretation: + Interpretation of Policy Attributes ----------------------------------- The interpretation of some ``CPUFreq`` policy attributes described in Documentation/admin-guide/pm/cpufreq.rst is special with ``intel_pstate`` as the current scaling driver and it generally depends on the driver's -`operation mode `_. +:ref:`operation mode `. First of all, the values of the ``cpuinfo_max_freq``, ``cpuinfo_min_freq`` and ``scaling_cur_freq`` attributes are produced by applying a processor-specific @@ -570,9 +573,10 @@ Also, the values of the ``scaling_max_freq`` and ``scaling_min_freq`` attributes are capped by the frequency corresponding to the maximum P-state that the driver is allowed to set. -If the ``no_turbo`` `global attribute `_ is set, the driver is -not allowed to use turbo P-states, so the maximum value of ``scaling_max_freq`` -and ``scaling_min_freq`` is limited to the maximum non-turbo P-state frequency. +If the ``no_turbo`` :ref:`global attribute ` is set, the driver +is not allowed to use turbo P-states, so the maximum value of +``scaling_max_freq`` and ``scaling_min_freq`` is limited to the maximum +non-turbo P-state frequency. Accordingly, setting ``no_turbo`` causes ``scaling_max_freq`` and ``scaling_min_freq`` to go down to that value if they were above it before. However, the old values of ``scaling_max_freq`` and ``scaling_min_freq`` will be @@ -584,7 +588,7 @@ and ``scaling_min_freq`` corresponds to the maximum supported turbo P-state, which also is the value of ``cpuinfo_max_freq`` in either case. Next, the following policy attributes have special meaning if -``intel_pstate`` works in the `active mode `_: +``intel_pstate`` works in the :ref:`active mode `: ``scaling_available_governors`` List of P-state selection algorithms provided by ``intel_pstate``. @@ -605,20 +609,22 @@ processor: Shows the base frequency of the CPU. Any frequency above this will be in the turbo frequency range. -The meaning of these attributes in the `passive mode `_ is the +The meaning of these attributes in the :ref:`passive mode ` is the same as for other scaling drivers. Additionally, the value of the ``scaling_driver`` attribute for ``intel_pstate`` depends on the operation mode of the driver. Namely, it is either -"intel_pstate" (in the `active mode `_) or "intel_cpufreq" (in the -`passive mode `_). +"intel_pstate" (in the :ref:`active mode `) or "intel_cpufreq" +(in the :ref:`passive mode `). + +.. _pstate_limits_coordination: Coordination of P-State Limits ------------------------------ ``intel_pstate`` allows P-state limits to be set in two ways: with the help of -the ``max_perf_pct`` and ``min_perf_pct`` `global attributes -`_ or via the ``scaling_max_freq`` and ``scaling_min_freq`` +the ``max_perf_pct`` and ``min_perf_pct`` :ref:`global attributes +` or via the ``scaling_max_freq`` and ``scaling_min_freq`` ``CPUFreq`` policy attributes. The coordination between those limits is based on the following rules, regardless of the current operation mode of the driver: @@ -640,17 +646,18 @@ on the following rules, regardless of the current operation mode of the driver: 3. The global and per-policy limits can be set independently. -In the `active mode with the HWP feature enabled `_, the +In the :ref:`active mode with the HWP feature enabled `, the resulting effective values are written into hardware registers whenever the limits change in order to request its internal P-state selection logic to always set P-states within these limits. Otherwise, the limits are taken into account -by scaling governors (in the `passive mode `_) and by the driver -every time before setting a new P-state for a CPU. +by scaling governors (in the :ref:`passive mode `) and by the +driver every time before setting a new P-state for a CPU. Additionally, if the ``intel_pstate=per_cpu_perf_limits`` command line argument is passed to the kernel, ``max_perf_pct`` and ``min_perf_pct`` are not exposed at all and the only way to set the limits is by using the policy attributes. +.. _energy_performance_hints: Energy vs Performance Hints --------------------------- @@ -710,9 +717,9 @@ output. On those systems each ``_PSS`` object returns a list of P-states supported by the corresponding CPU which basically is a subset of the P-states range that can be used by ``intel_pstate`` on the same system, with one exception: the whole -`turbo range `_ is represented by one item in it (the topmost one). By -convention, the frequency returned by ``_PSS`` for that item is greater by 1 MHz -than the frequency of the highest non-turbo P-state listed by it, but the +:ref:`turbo range ` is represented by one item in it (the topmost one). +By convention, the frequency returned by ``_PSS`` for that item is greater by +1 MHz than the frequency of the highest non-turbo P-state listed by it, but the corresponding P-state representation (following the hardware specification) returned for it matches the maximum supported turbo P-state (or is the special value 255 meaning essentially "go as high as you can get"). @@ -738,18 +745,18 @@ benefit from running at turbo frequencies will be given non-turbo P-states instead. One more issue related to that may appear on systems supporting the -`Configurable TDP feature `_ allowing the platform firmware to set the -turbo threshold. Namely, if that is not coordinated with the lists of P-states -returned by ``_PSS`` properly, there may be more than one item corresponding to -a turbo P-state in those lists and there may be a problem with avoiding the -turbo range (if desirable or necessary). Usually, to avoid using turbo -P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state listed -by ``_PSS``, but that is not sufficient when there are other turbo P-states in -the list returned by it. +:ref:`Configurable TDP feature ` allowing the platform firmware to set +the turbo threshold. Namely, if that is not coordinated with the lists of +P-states returned by ``_PSS`` properly, there may be more than one item +corresponding to a turbo P-state in those lists and there may be a problem with +avoiding the turbo range (if desirable or necessary). Usually, to avoid using +turbo P-states overall, ``acpi-cpufreq`` simply avoids using the topmost state +listed by ``_PSS``, but that is not sufficient when there are other turbo +P-states in the list returned by it. Apart from the above, ``acpi-cpufreq`` works like ``intel_pstate`` in the -`passive mode `_, except that the number of P-states it can set -is limited to the ones listed by the ACPI ``_PSS`` objects. +:ref:`passive mode `, except that the number of P-states it can +set is limited to the ones listed by the ACPI ``_PSS`` objects. Kernel Command Line Options for ``intel_pstate`` @@ -764,11 +771,11 @@ of them have to be prepended with the ``intel_pstate=`` prefix. processor is supported by it. ``active`` - Register ``intel_pstate`` in the `active mode `_ to start - with. + Register ``intel_pstate`` in the :ref:`active mode ` to + start with. ``passive`` - Register ``intel_pstate`` in the `passive mode `_ to + Register ``intel_pstate`` in the :ref:`passive mode ` to start with. ``force`` @@ -801,12 +808,12 @@ of them have to be prepended with the ``intel_pstate=`` prefix. and this option has no effect. ``per_cpu_perf_limits`` - Use per-logical-CPU P-State limits (see `Coordination of P-state - Limits`_ for details). + Use per-logical-CPU P-State limits (see + :ref:`pstate_limits_coordination` for details). ``no_cas`` - Do not enable `capacity-aware scheduling `_ which is enabled by - default on hybrid systems without SMT. + Do not enable :ref:`capacity-aware scheduling ` which is enabled + by default on hybrid systems without SMT. Diagnostics and Tuning ====================== @@ -818,7 +825,7 @@ There are two static trace events that can be used for ``intel_pstate`` diagnostics. One of them is the ``cpu_frequency`` trace event generally used by ``CPUFreq``, and the other one is the ``pstate_sample`` trace event specific to ``intel_pstate``. Both of them are triggered by ``intel_pstate`` only if -it works in the `active mode `_. +it works in the :ref:`active mode `. The following sequence of shell commands can be used to enable them and see their output (if the kernel is generally configured to support event tracing):: @@ -830,7 +837,7 @@ their output (if the kernel is generally configured to support event tracing):: gnome-terminal--4510 [001] ..s. 1177.680733: pstate_sample: core_busy=107 scaled=94 from=26 to=26 mperf=1143818 aperf=1230607 tsc=29838618 freq=2474476 cat-5235 [002] ..s. 1177.681723: cpu_frequency: state=2900000 cpu_id=2 -If ``intel_pstate`` works in the `passive mode `_, the +If ``intel_pstate`` works in the :ref:`passive mode `, the ``cpu_frequency`` trace event will be triggered either by the ``schedutil`` scaling governor (for the policies it is attached to), or by the ``CPUFreq`` core (for the policies with other scaling governors).