From 0f127178892ec54553da2e3975e23979709b5ff9 Mon Sep 17 00:00:00 2001
From: Jeff Johnson <quic_jjohnson@quicinc.com>
Date: Mon, 22 Jul 2024 10:14:32 -0700
Subject: [PATCH 01/35] cpufreq: powerpc: add missing MODULE_DESCRIPTION()
 macros

With ARCH=powerpc, make allmodconfig && make W=1 C=1 reports:
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cpufreq/ppc-cbe-cpufreq.o
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/cpufreq/powernv-cpufreq.o

Add the missing invocation of the MODULE_DESCRIPTION() macro to all
files which have a MODULE_LICENSE().

This includes three additional files which, although they did not
produce a warning with the powerpc allmodconfig configuration, may
cause this warning with specific options enabled in the kernel
configuration.

Signed-off-by: Jeff Johnson <quic_jjohnson@quicinc.com>
Acked-by: Michael Ellerman <mpe@ellerman.id.au> (powerpc)
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Link: https://patch.msgid.link/20240722-md-powerpc-drivers-cpufreq-v2-1-bb84d715eb3d@quicinc.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/maple-cpufreq.c   | 1 +
 drivers/cpufreq/pasemi-cpufreq.c  | 1 +
 drivers/cpufreq/pmac64-cpufreq.c  | 1 +
 drivers/cpufreq/powernv-cpufreq.c | 1 +
 drivers/cpufreq/ppc_cbe_cpufreq.c | 1 +
 5 files changed, 5 insertions(+)
diff --git a/drivers/cpufreq/maple-cpufreq.c b/drivers/cpufreq/maple-cpufreq.c
index f9306410a07f..690da85c4865 100644
--- a/drivers/cpufreq/maple-cpufreq.c
+++ b/drivers/cpufreq/maple-cpufreq.c
@@ -238,4 +238,5 @@ static int __init maple_cpufreq_init(void)
 module_init(maple_cpufreq_init);
 
 
+MODULE_DESCRIPTION("cpufreq driver for Maple 970FX/970MP boards");
 MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/pasemi-cpufreq.c b/drivers/cpufreq/pasemi-cpufreq.c
index ee925b53b6b9..5fc9cb480516 100644
--- a/drivers/cpufreq/pasemi-cpufreq.c
+++ b/drivers/cpufreq/pasemi-cpufreq.c
@@ -269,5 +269,6 @@ static void __exit pas_cpufreq_exit(void)
 module_init(pas_cpufreq_init);
 module_exit(pas_cpufreq_exit);
 
+MODULE_DESCRIPTION("cpufreq driver for PA Semi PWRficient");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Egor Martovetsky <egor@pasemi.com>, Olof Johansson <olof@lixom.net>");
diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c
index 2cd2b06849a2..9d3fe36075f8 100644
--- a/drivers/cpufreq/pmac64-cpufreq.c
+++ b/drivers/cpufreq/pmac64-cpufreq.c
@@ -671,4 +671,5 @@ static int __init g5_cpufreq_init(void)
 module_init(g5_cpufreq_init);
 
 
+MODULE_DESCRIPTION("cpufreq driver for SMU & 970FX based G5 Macs");
 MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 50c62929f7ca..bc55723b4d87 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -1160,5 +1160,6 @@ static void __exit powernv_cpufreq_exit(void)
 }
 module_exit(powernv_cpufreq_exit);
 
+MODULE_DESCRIPTION("cpufreq driver for IBM/OpenPOWER powernv systems");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>");
diff --git a/drivers/cpufreq/ppc_cbe_cpufreq.c b/drivers/cpufreq/ppc_cbe_cpufreq.c
index 5ee4c7bfdcc5..98595b3ea13f 100644
--- a/drivers/cpufreq/ppc_cbe_cpufreq.c
+++ b/drivers/cpufreq/ppc_cbe_cpufreq.c
@@ -168,5 +168,6 @@ static void __exit cbe_cpufreq_exit(void)
 module_init(cbe_cpufreq_init);
 module_exit(cbe_cpufreq_exit);
 
+MODULE_DESCRIPTION("cpufreq driver for Cell BE processors");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Christian Krafft <krafft@de.ibm.com>");

From 37c6dccd683797a5a4ec85d2e94939bf829d3499 Mon Sep 17 00:00:00 2001
From: Qais Yousef <qyousef@layalina.io>
Date: Sun, 28 Jul 2024 20:26:59 +0100
Subject: [PATCH 02/35] cpufreq: Remove LATENCY_MULTIPLIER

The current LATENCY_MULTIPLIER which has been around for nearly 20 years
causes rate_limit_us to be always in ms range.

On M1 mac mini I get 50 and 56us transition latency, but due to the 1000
multiplier we end up setting rate_limit_us to 50 and 56ms, which gets
capped into 2ms and was 10ms before e13aa799c2a6 ("cpufreq: Change
default transition delay to 2ms")

On Intel I5 system transition latency is 20us but due to the multiplier
we end up with 20ms that again is capped to 2ms.

Given how good modern hardware and how modern workloads require systems
to be more responsive to cater for sudden changes in workload (tasks
sleeping/wakeup/migrating, uclamp causing a sudden boost or cap) and
that 2ms is quarter of the time of 120Hz refresh rate system, drop the
old logic in favour of providing 50% headroom.

	rate_limit_us = 1.5 * latency.

I considered not adding any headroom which could mean that we can end up
with infinite back-to-back requests.

I also considered providing a constant headroom (e.g: 100us) assuming
that any h/w or f/w dealing with the request shouldn't require a large
headroom when transition_latency is actually high.

But for both cases I wasn't sure if h/w or f/w can end up being
overwhelmed dealing with the freq requests in a potentially busy system.
So I opted for providing 50% breathing room.

This is expected to impact schedutil only as the other user,
dbs_governor, takes the max(2*tick, transition_delay_us) and the former
was at least 2ms on 1ms TICK, which is equivalent to the max_delay_us
before applying this patch. For systems with TICK of 4ms, this value
would have almost always ended up with 8ms sampling rate.

For systems that report 0 transition latency, we still default to
returning 1ms as transition delay.

This helps in eliminating a source of latency for applying requests as
mentioned in [1]. For example if we have a 1ms tick, most systems will
miss sending an update at tick when updating the util_avg for a task/CPU
(rate_limit_us will be 2ms for most systems).

Link: https://lore.kernel.org/lkml/20240724212255.mfr2ybiv2j2uqek7@airbuntu/ # [1]
Link: https://lore.kernel.org/lkml/20240205022500.2232124-1-qyousef@layalina.io/
Signed-off-by: Qais Yousef <qyousef@layalina.io>
Link: https://patch.msgid.link/20240728192659.58115-1-qyousef@layalina.io
[ rjw: Subject edits ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 27 ++++-----------------------
 include/linux/cpufreq.h   |  6 ------
 2 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 04fc786dd2c0..f98c9438760c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -575,30 +575,11 @@ unsigned int cpufreq_policy_transition_delay_us(struct cpufreq_policy *policy)
 		return policy->transition_delay_us;
 
 	latency = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
-	if (latency) {
-		unsigned int max_delay_us = 2 * MSEC_PER_SEC;
+	if (latency)
+		/* Give a 50% breathing room between updates */
+		return latency + (latency >> 1);
 
-		/*
-		 * If the platform already has high transition_latency, use it
-		 * as-is.
-		 */
-		if (latency > max_delay_us)
-			return latency;
-
-		/*
-		 * For platforms that can change the frequency very fast (< 2
-		 * us), the above formula gives a decent transition delay. But
-		 * for platforms where transition_latency is in milliseconds, it
-		 * ends up giving unrealistic values.
-		 *
-		 * Cap the default transition delay to 2 ms, which seems to be
-		 * a reasonable amount of time after which we should reevaluate
-		 * the frequency.
-		 */
-		return min(latency * LATENCY_MULTIPLIER, max_delay_us);
-	}
-
-	return LATENCY_MULTIPLIER;
+	return USEC_PER_MSEC;
 }
 EXPORT_SYMBOL_GPL(cpufreq_policy_transition_delay_us);
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d4d2f4d1d7cb..e0e19d9c1323 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -577,12 +577,6 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
 #define CPUFREQ_POLICY_POWERSAVE	(1)
 #define CPUFREQ_POLICY_PERFORMANCE	(2)
 
-/*
- * The polling frequency depends on the capability of the processor. Default
- * polling frequency is 1000 times the transition latency of the processor.
- */
-#define LATENCY_MULTIPLIER		(1000)
-
 struct cpufreq_governor {
 	char	name[CPUFREQ_NAME_LEN];
 	int	(*init)(struct cpufreq_policy *policy);

From 872cc94c7e3d874373f56b035f998631c6b26d22 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 31 Jul 2024 13:12:40 -0600
Subject: [PATCH 03/35] cpufreq: Use of_property_present()

Use of_property_present() to test for property presence rather than
of_(find|get)_property(). This is part of a larger effort to remove
callers of of_find_property() and similar functions. of_find_property()
leaks the DT struct property and data pointers which is a problem for
dynamically allocated nodes which may be freed.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Acked-by: Chen-Yu Tsai <wens@csie.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt.c           | 11 +++--------
 drivers/cpufreq/pmac64-cpufreq.c       |  2 +-
 drivers/cpufreq/sti-cpufreq.c          |  2 +-
 drivers/cpufreq/sun50i-cpufreq-nvmem.c |  2 +-
 4 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 6532c4d71338..983443396f8f 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -69,7 +69,6 @@ static int set_target(struct cpufreq_policy *policy, unsigned int index)
 static const char *find_supply_name(struct device *dev)
 {
 	struct device_node *np __free(device_node) = of_node_get(dev->of_node);
-	struct property *pp;
 	int cpu = dev->id;
 
 	/* This must be valid for sure */
@@ -77,14 +76,10 @@ static const char *find_supply_name(struct device *dev)
 		return NULL;
 
 	/* Try "cpu0" for older DTs */
-	if (!cpu) {
-		pp = of_find_property(np, "cpu0-supply", NULL);
-		if (pp)
-			return "cpu0";
-	}
+	if (!cpu && of_property_present(np, "cpu0-supply"))
+		return "cpu0";
 
-	pp = of_find_property(np, "cpu-supply", NULL);
-	if (pp)
+	if (of_property_present(np, "cpu-supply"))
 		return "cpu";
 
 	dev_dbg(dev, "no regulator for cpu%d\n", cpu);
diff --git a/drivers/cpufreq/pmac64-cpufreq.c b/drivers/cpufreq/pmac64-cpufreq.c
index 2cd2b06849a2..c87cd6e0b638 100644
--- a/drivers/cpufreq/pmac64-cpufreq.c
+++ b/drivers/cpufreq/pmac64-cpufreq.c
@@ -505,7 +505,7 @@ static int __init g5_pm72_cpufreq_init(struct device_node *cpunode)
 			continue;
 		if (strcmp(loc, "CPU CLOCK"))
 			continue;
-		if (!of_get_property(hwclock, "platform-get-frequency", NULL))
+		if (!of_property_present(hwclock, "platform-get-frequency"))
 			continue;
 		break;
 	}
diff --git a/drivers/cpufreq/sti-cpufreq.c b/drivers/cpufreq/sti-cpufreq.c
index 8e2e703c3865..b15b3142b5fe 100644
--- a/drivers/cpufreq/sti-cpufreq.c
+++ b/drivers/cpufreq/sti-cpufreq.c
@@ -267,7 +267,7 @@ static int __init sti_cpufreq_init(void)
 		goto skip_voltage_scaling;
 	}
 
-	if (!of_get_property(ddata.cpu->of_node, "operating-points-v2", NULL)) {
+	if (!of_property_present(ddata.cpu->of_node, "operating-points-v2")) {
 		dev_err(ddata.cpu, "OPP-v2 not supported\n");
 		goto skip_voltage_scaling;
 	}
diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
index 95ac8d46c156..293921acec93 100644
--- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c
+++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c
@@ -146,7 +146,7 @@ static bool dt_has_supported_hw(void)
 		return false;
 
 	for_each_child_of_node_scoped(np, opp) {
-		if (of_find_property(opp, "opp-supported-hw", NULL)) {
+		if (of_property_present(opp, "opp-supported-hw")) {
 			has_opp_supported_hw = true;
 			break;
 		}

From 9b3cc56c24d80671763b4b6b7ea4aed213059d32 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Wed, 31 Jul 2024 14:16:15 -0600
Subject: [PATCH 04/35] cpufreq: spear: Use of_property_for_each_u32() instead
 of open coding

Use of_property_count_u32_elems() and of_property_for_each_u32() instead
of of_find_property() and open coding the property parsing.

This is part of a larger effort to remove callers of of_find_property()
and similar functions. of_find_property() leaks the DT struct property
and data pointers which is a problem for dynamically allocated nodes
which may be freed.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
[Viresh: Fixed build failure and initialize / increment 'i']
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/spear-cpufreq.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c
index 78b875db6b66..d8ab5b01d46d 100644
--- a/drivers/cpufreq/spear-cpufreq.c
+++ b/drivers/cpufreq/spear-cpufreq.c
@@ -171,10 +171,9 @@ static struct cpufreq_driver spear_cpufreq_driver = {
 static int spear_cpufreq_probe(struct platform_device *pdev)
 {
 	struct device_node *np;
-	const struct property *prop;
 	struct cpufreq_frequency_table *freq_tbl;
-	const __be32 *val;
-	int cnt, i, ret;
+	u32 val;
+	int cnt, ret, i = 0;
 
 	np = of_cpu_device_node_get(0);
 	if (!np) {
@@ -186,26 +185,23 @@ static int spear_cpufreq_probe(struct platform_device *pdev)
 				&spear_cpufreq.transition_latency))
 		spear_cpufreq.transition_latency = CPUFREQ_ETERNAL;
 
-	prop = of_find_property(np, "cpufreq_tbl", NULL);
-	if (!prop || !prop->value) {
+	cnt = of_property_count_u32_elems(np, "cpufreq_tbl");
+	if (cnt <= 0) {
 		pr_err("Invalid cpufreq_tbl\n");
 		ret = -ENODEV;
 		goto out_put_node;
 	}
 
-	cnt = prop->length / sizeof(u32);
-	val = prop->value;
-
 	freq_tbl = kcalloc(cnt + 1, sizeof(*freq_tbl), GFP_KERNEL);
 	if (!freq_tbl) {
 		ret = -ENOMEM;
 		goto out_put_node;
 	}
 
-	for (i = 0; i < cnt; i++)
-		freq_tbl[i].frequency = be32_to_cpup(val++);
+	of_property_for_each_u32(np, "cpufreq_tbl", val)
+		freq_tbl[i++].frequency = val;
 
-	freq_tbl[i].frequency = CPUFREQ_TABLE_END;
+	freq_tbl[cnt].frequency = CPUFREQ_TABLE_END;
 
 	spear_cpufreq.freq_tbl = freq_tbl;
 

From 5c326d24c1b31a275afc05b4fbb7292eabd4d70a Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Tue, 6 Aug 2024 07:58:22 -0600
Subject: [PATCH 05/35] cpufreq: qcom: Add explicit io.h include for
 readl/writel_relaxed

The qcom-cpufreq-hw driver is relying on an implicit include of io.h
which doesn't get included on some architectures.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/qcom-cpufreq-hw.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 370fe6a0104b..900d6844c43d 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -9,6 +9,7 @@
 #include <linux/init.h>
 #include <linux/interconnect.h>
 #include <linux/interrupt.h>
+#include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/of.h>

From b81f97031a0ea2823cf37bb37eac51dbb8350e02 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Tue, 6 Aug 2024 07:58:23 -0600
Subject: [PATCH 06/35] cpufreq: omap: Drop asm includes

The omap driver doesn't actually need asm/smp_plat.h, so drop it.
asm/cpu.h is not needed either as linux/cpu.h is already included.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Acked-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/omap-cpufreq.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c
index 3458d5cc9b7f..de8be0a8932d 100644
--- a/drivers/cpufreq/omap-cpufreq.c
+++ b/drivers/cpufreq/omap-cpufreq.c
@@ -28,9 +28,6 @@
 #include <linux/platform_device.h>
 #include <linux/regulator/consumer.h>
 
-#include <asm/smp_plat.h>
-#include <asm/cpu.h>
-
 /* OPP tolerance in percentage */
 #define	OPP_TOLERANCE	4
 

From 1ffec650d07f63a584d2a789a20bb64e9b13d34b Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Tue, 6 Aug 2024 07:58:24 -0600
Subject: [PATCH 07/35] cpufreq: armada-8k: Avoid excessive stack usage

In some build configurations (e.g. x86 allmodconfig), 2 cpu_mask
variables exceeds the max stack frame size:

drivers/cpufreq/armada-8k-cpufreq.c:203:1: error: the frame size of 2128 bytes is larger than 2048 bytes

Fix it by making "cpus" static which is fine given that module init is
only called once.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/armada-8k-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/armada-8k-cpufreq.c b/drivers/cpufreq/armada-8k-cpufreq.c
index ce5a5641b6dd..7a979db81f09 100644
--- a/drivers/cpufreq/armada-8k-cpufreq.c
+++ b/drivers/cpufreq/armada-8k-cpufreq.c
@@ -132,7 +132,7 @@ static int __init armada_8k_cpufreq_init(void)
 	int ret = 0, opps_index = 0, cpu, nb_cpus;
 	struct freq_table *freq_tables;
 	struct device_node *node;
-	struct cpumask cpus;
+	static struct cpumask cpus;
 
 	node = of_find_matching_node_and_match(NULL, armada_8k_cpufreq_of_match,
 					       NULL);

From 3ca2a3d1e7271549985dc57c8106bb07b3134ecb Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 2 Aug 2024 11:48:39 -0700
Subject: [PATCH 08/35] cpufreq: intel_pstate: Support Granite Rapids and
 Sierra Forest OOB mode

Prevent intel_pstate from loading when OOB (Out Of Band) P-states mode is
enabled.

The OOB identifying bits are same as for the prior generation CPUs like
Emerald Rapids servers. Add Granite Rapids and Sierra Forest CPU models to
intel_pstate_cpu_oob_ids[].

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Link: https://patch.msgid.link/20240802184839.1909091-1-srinivas.pandruvada@linux.intel.com
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c0278d023cfc..c69b420d0bc9 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2425,6 +2425,10 @@ static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] __initconst = {
 	X86_MATCH(INTEL_ICELAKE_X,		core_funcs),
 	X86_MATCH(INTEL_SAPPHIRERAPIDS_X,	core_funcs),
 	X86_MATCH(INTEL_EMERALDRAPIDS_X,	core_funcs),
+	X86_MATCH(INTEL_GRANITERAPIDS_D,	core_funcs),
+	X86_MATCH(INTEL_GRANITERAPIDS_X,	core_funcs),
+	X86_MATCH(INTEL_ATOM_CRESTMONT,		core_funcs),
+	X86_MATCH(INTEL_ATOM_CRESTMONT_X,	core_funcs),
 	{}
 };
 #endif

From 5a9d10145a54f7a3fb6297c0082bf030e04db3bc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 28 Aug 2024 13:47:25 +0200
Subject: [PATCH 09/35] x86/sched: Add basic support for CPU capacity scaling

In order be able to compute the sizes of tasks consistently across all
CPUs in a hybrid system, it is necessary to provide CPU capacity scaling
information to the scheduler via arch_scale_cpu_capacity().  Moreover,
the value returned by arch_scale_freq_capacity() for the given CPU must
correspond to the arch_scale_cpu_capacity() return value for it, or
utilization computations will be inaccurate.

Add support for it through per-CPU variables holding the capacity and
maximum-to-base frequency ratio (times SCHED_CAPACITY_SCALE) that will
be returned by arch_scale_cpu_capacity() and used by scale_freq_tick()
to compute arch_freq_scale for the current CPU, respectively.

In order to avoid adding measurable overhead for non-hybrid x86 systems,
which are the vast majority in the field, whether or not the new hybrid
CPU capacity scaling will be in effect is controlled by a static key.
This static key is set by calling arch_enable_hybrid_capacity_scale()
which also allocates memory for the per-CPU data and initializes it.
Next, arch_set_cpu_capacity() is used to set the per-CPU variables
mentioned above for each CPU and arch_rebuild_sched_domains() needs
to be called for the scheduler to realize that capacity-aware
scheduling can be used going forward.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> # scale invariance
Link: https://patch.msgid.link/10523497.nUPlyArG6x@rjwysocki.net
[ rjw: Added parens to function kerneldoc comments ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/topology.h  | 13 +++++
 arch/x86/kernel/cpu/aperfmperf.c | 89 +++++++++++++++++++++++++++++++-
 2 files changed, 100 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index abe3a8f22cbd..aef70336d624 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -282,9 +282,22 @@ static inline long arch_scale_freq_capacity(int cpu)
 }
 #define arch_scale_freq_capacity arch_scale_freq_capacity
 
+bool arch_enable_hybrid_capacity_scale(void);
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+			   unsigned long cap_freq, unsigned long base_freq);
+
+unsigned long arch_scale_cpu_capacity(int cpu);
+#define arch_scale_cpu_capacity arch_scale_cpu_capacity
+
 extern void arch_set_max_freq_ratio(bool turbo_disabled);
 extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled);
 #else
+static inline bool arch_enable_hybrid_capacity_scale(void) { return false; }
+static inline void arch_set_cpu_capacity(int cpu, unsigned long cap,
+					 unsigned long max_cap,
+					 unsigned long cap_freq,
+					 unsigned long base_freq) { }
+
 static inline void arch_set_max_freq_ratio(bool turbo_disabled) { }
 static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { }
 #endif
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 0b69bfbf345d..f642de2ebdac 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -349,9 +349,89 @@ static DECLARE_WORK(disable_freq_invariance_work,
 DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
 
+static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);
+
+struct arch_hybrid_cpu_scale {
+	unsigned long capacity;
+	unsigned long freq_ratio;
+};
+
+static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;
+
+/**
+ * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
+ *
+ * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
+ * initialize it and set the static key controlling its code paths.
+ *
+ * Must be called before arch_set_cpu_capacity().
+ */
+bool arch_enable_hybrid_capacity_scale(void)
+{
+	int cpu;
+
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key)) {
+		WARN_ONCE(1, "Hybrid CPU capacity scaling already enabled");
+		return true;
+	}
+
+	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
+	if (!arch_cpu_scale)
+		return false;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
+		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
+	}
+
+	static_branch_enable(&arch_hybrid_cap_scale_key);
+
+	pr_info("Hybrid CPU capacity scaling enabled\n");
+
+	return true;
+}
+
+/**
+ * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
+ * @cpu: Target CPU.
+ * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
+ * @max_cap: System-wide maximum CPU capacity.
+ * @cap_freq: Frequency of @cpu corresponding to @cap.
+ * @base_freq: Frequency of @cpu at which MPERF counts.
+ *
+ * The units in which @cap and @max_cap are expressed do not matter, so long
+ * as they are consistent, because the former is effectively divided by the
+ * latter.  Analogously for @cap_freq and @base_freq.
+ *
+ * After calling this function for all CPUs, call arch_rebuild_sched_domains()
+ * to let the scheduler know that capacity-aware scheduling can be used going
+ * forward.
+ */
+void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
+			   unsigned long cap_freq, unsigned long base_freq)
+{
+	if (static_branch_likely(&arch_hybrid_cap_scale_key)) {
+		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
+			   div_u64(cap << SCHED_CAPACITY_SHIFT, max_cap));
+		WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
+			   div_u64(cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
+	} else {
+		WARN_ONCE(1, "Hybrid CPU capacity scaling not enabled");
+	}
+}
+
+unsigned long arch_scale_cpu_capacity(int cpu)
+{
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		return READ_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity);
+
+	return SCHED_CAPACITY_SCALE;
+}
+EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);
+
 static void scale_freq_tick(u64 acnt, u64 mcnt)
 {
-	u64 freq_scale;
+	u64 freq_scale, freq_ratio;
 
 	if (!arch_scale_freq_invariant())
 		return;
@@ -359,7 +439,12 @@ static void scale_freq_tick(u64 acnt, u64 mcnt)
 	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
 		goto error;
 
-	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
+		freq_ratio = READ_ONCE(this_cpu_ptr(arch_cpu_scale)->freq_ratio);
+	else
+		freq_ratio = arch_max_freq_ratio;
+
+	if (check_mul_overflow(mcnt, freq_ratio, &mcnt) || !mcnt)
 		goto error;
 
 	freq_scale = div64_u64(acnt, mcnt);

From 929ebc93ccaa8d183a2ba9f1cf769d1bfb847cca Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 4 Sep 2024 13:44:54 +0200
Subject: [PATCH 10/35] cpufreq: intel_pstate: Set asymmetric CPU capacity on
 hybrid systems

Make intel_pstate use the HWP_HIGHEST_PERF values from
MSR_HWP_CAPABILITIES to set asymmetric CPU capacity information
via the previously introduced arch_set_cpu_capacity() on hybrid
systems without SMT.

Setting asymmetric CPU capacity is generally necessary to allow the
scheduler to compute task sizes in a consistent way across all CPUs
in a system where they differ by capacity.  That, in turn, should help
to improve scheduling decisions.  It is also necessary for the schedutil
cpufreq governor to operate as expected on hybrid systems where tasks
migrate between CPUs of different capacities.

The underlying observation is that intel_pstate already uses
MSR_HWP_CAPABILITIES to get CPU performance information which is
exposed by it via sysfs and CPU performance scaling is based on it.
Thus using this information for setting asymmetric CPU capacity is
consistent with what the driver has been doing already.  Moreover,
HWP_HIGHEST_PERF reflects the maximum capacity of a given CPU including
both the instructions-per-cycle (IPC) factor and the maximum turbo
frequency and the units in which that value is expressed are the same
for all CPUs in the system, so the maximum capacity ratio between two
CPUs can be obtained by computing the ratio of their HWP_HIGHEST_PERF
values.  Of course, in principle that capacity ratio need not be
directly applicable at lower frequencies, so using it for providing the
asymmetric CPU capacity information to the scheduler is a rough
approximation, but it is as good as it gets.  Also, measurements
indicate that this approximation is not too bad in practice.

If the given system is hybrid and non-SMT, the new code disables ITMT
support in the scheduler (because it may get in the way of asymmetric CPU
capacity code in the scheduler that automatically gets enabled by setting
asymmetric CPU capacity) after initializing all online CPUs and finds
the one with the maximum HWP_HIGHEST_PERF value.  Next, it computes the
capacity number for each (online) CPU by dividing the product of its
HWP_HIGHEST_PERF and SCHED_CAPACITY_SCALE by the maximum HWP_HIGHEST_PERF.

When a CPU goes offline, its capacity is reset to SCHED_CAPACITY_SCALE
and if it is the one with the maximum HWP_HIGHEST_PERF value, the
capacity numbers for all of the other online CPUs are recomputed.  This
also takes care of a cleanup during driver operation mode changes.

Analogously, when a new CPU goes online, its capacity number is updated
and if its HWP_HIGHEST_PERF value is greater than the current maximum
one, the capacity numbers for all of the other online CPUs are
recomputed.

The case when the driver is notified of a CPU capacity change, either
through the HWP interrupt or through an ACPI notification, is handled
similarly to the CPU online case above, except that if the target CPU
is the current highest-capacity one and its capacity is reduced, the
capacity numbers for all of the other online CPUs need to be recomputed
either.

If the driver's "no_trubo" sysfs attribute is updated, all of the CPU
capacity information is computed from scratch to reflect the new turbo
status.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
Tested-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com> # scale invariance
Link: https://patch.msgid.link/1979653.PYKUYFuaPT@rjwysocki.net
[ rjw: Fixed a typo in the changelog ]
[ rjw: Renamed 3 new functions and added a comment ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 236 ++++++++++++++++++++++++++++++++-
 1 file changed, 232 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c69b420d0bc9..aaea9a39eced 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -16,6 +16,7 @@
 #include <linux/tick.h>
 #include <linux/slab.h>
 #include <linux/sched/cpufreq.h>
+#include <linux/sched/smt.h>
 #include <linux/list.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
@@ -215,6 +216,7 @@ struct global_params {
  * @hwp_req_cached:	Cached value of the last HWP Request MSR
  * @hwp_cap_cached:	Cached value of the last HWP Capabilities MSR
  * @last_io_update:	Last time when IO wake flag was set
+ * @capacity_perf:	Highest perf used for scale invariance
  * @sched_flags:	Store scheduler flags for possible cross CPU update
  * @hwp_boost_min:	Last HWP boosted min performance
  * @suspended:		Whether or not the driver has been suspended.
@@ -253,6 +255,7 @@ struct cpudata {
 	u64 hwp_req_cached;
 	u64 hwp_cap_cached;
 	u64 last_io_update;
+	unsigned int capacity_perf;
 	unsigned int sched_flags;
 	u32 hwp_boost_min;
 	bool suspended;
@@ -295,6 +298,7 @@ static int hwp_mode_bdw __ro_after_init;
 static bool per_cpu_limits __ro_after_init;
 static bool hwp_forced __ro_after_init;
 static bool hwp_boost __read_mostly;
+static bool hwp_is_hybrid;
 
 static struct cpufreq_driver *intel_pstate_driver __read_mostly;
 
@@ -934,6 +938,139 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
 	NULL,
 };
 
+static struct cpudata *hybrid_max_perf_cpu __read_mostly;
+/*
+ * Protects hybrid_max_perf_cpu, the capacity_perf fields in struct cpudata,
+ * and the x86 arch scale-invariance information from concurrent updates.
+ */
+static DEFINE_MUTEX(hybrid_capacity_lock);
+
+static void hybrid_set_cpu_capacity(struct cpudata *cpu)
+{
+	arch_set_cpu_capacity(cpu->cpu, cpu->capacity_perf,
+			      hybrid_max_perf_cpu->capacity_perf,
+			      cpu->capacity_perf,
+			      cpu->pstate.max_pstate_physical);
+
+	pr_debug("CPU%d: perf = %u, max. perf = %u, base perf = %d\n", cpu->cpu,
+		 cpu->capacity_perf, hybrid_max_perf_cpu->capacity_perf,
+		 cpu->pstate.max_pstate_physical);
+}
+
+static void hybrid_clear_cpu_capacity(unsigned int cpunum)
+{
+	arch_set_cpu_capacity(cpunum, 1, 1, 1, 1);
+}
+
+static void hybrid_get_capacity_perf(struct cpudata *cpu)
+{
+	if (READ_ONCE(global.no_turbo)) {
+		cpu->capacity_perf = cpu->pstate.max_pstate_physical;
+		return;
+	}
+
+	cpu->capacity_perf = HWP_HIGHEST_PERF(READ_ONCE(cpu->hwp_cap_cached));
+}
+
+static void hybrid_set_capacity_of_cpus(void)
+{
+	int cpunum;
+
+	for_each_online_cpu(cpunum) {
+		struct cpudata *cpu = all_cpu_data[cpunum];
+
+		if (cpu)
+			hybrid_set_cpu_capacity(cpu);
+	}
+}
+
+static void hybrid_update_cpu_capacity_scaling(void)
+{
+	struct cpudata *max_perf_cpu = NULL;
+	unsigned int max_cap_perf = 0;
+	int cpunum;
+
+	for_each_online_cpu(cpunum) {
+		struct cpudata *cpu = all_cpu_data[cpunum];
+
+		if (!cpu)
+			continue;
+
+		/*
+		 * During initialization, CPU performance at full capacity needs
+		 * to be determined.
+		 */
+		if (!hybrid_max_perf_cpu)
+			hybrid_get_capacity_perf(cpu);
+
+		/*
+		 * If hybrid_max_perf_cpu is not NULL at this point, it is
+		 * being replaced, so don't take it into account when looking
+		 * for the new one.
+		 */
+		if (cpu == hybrid_max_perf_cpu)
+			continue;
+
+		if (cpu->capacity_perf > max_cap_perf) {
+			max_cap_perf = cpu->capacity_perf;
+			max_perf_cpu = cpu;
+		}
+	}
+
+	if (max_perf_cpu) {
+		hybrid_max_perf_cpu = max_perf_cpu;
+		hybrid_set_capacity_of_cpus();
+	} else {
+		pr_info("Found no CPUs with nonzero maximum performance\n");
+		/* Revert to the flat CPU capacity structure. */
+		for_each_online_cpu(cpunum)
+			hybrid_clear_cpu_capacity(cpunum);
+	}
+}
+
+static void __hybrid_init_cpu_capacity_scaling(void)
+{
+	hybrid_max_perf_cpu = NULL;
+	hybrid_update_cpu_capacity_scaling();
+}
+
+static void hybrid_init_cpu_capacity_scaling(void)
+{
+	bool disable_itmt = false;
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	/*
+	 * If hybrid_max_perf_cpu is set at this point, the hybrid CPU capacity
+	 * scaling has been enabled already and the driver is just changing the
+	 * operation mode.
+	 */
+	if (hybrid_max_perf_cpu) {
+		__hybrid_init_cpu_capacity_scaling();
+		goto unlock;
+	}
+
+	/*
+	 * On hybrid systems, use asym capacity instead of ITMT, but because
+	 * the capacity of SMT threads is not deterministic even approximately,
+	 * do not do that when SMT is in use.
+	 */
+	if (hwp_is_hybrid && !sched_smt_active() && arch_enable_hybrid_capacity_scale()) {
+		__hybrid_init_cpu_capacity_scaling();
+		disable_itmt = true;
+	}
+
+unlock:
+	mutex_unlock(&hybrid_capacity_lock);
+
+	/*
+	 * Disabling ITMT causes sched domains to be rebuilt to disable asym
+	 * packing and enable asym capacity.
+	 */
+	if (disable_itmt)
+		sched_clear_itmt_support();
+}
+
 static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
 {
 	u64 cap;
@@ -962,6 +1099,43 @@ static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
 	}
 }
 
+static void hybrid_update_capacity(struct cpudata *cpu)
+{
+	unsigned int max_cap_perf;
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	if (!hybrid_max_perf_cpu)
+		goto unlock;
+
+	/*
+	 * The maximum performance of the CPU may have changed, but assume
+	 * that the performance of the other CPUs has not changed.
+	 */
+	max_cap_perf = hybrid_max_perf_cpu->capacity_perf;
+
+	intel_pstate_get_hwp_cap(cpu);
+
+	hybrid_get_capacity_perf(cpu);
+	/* Should hybrid_max_perf_cpu be replaced by this CPU? */
+	if (cpu->capacity_perf > max_cap_perf) {
+		hybrid_max_perf_cpu = cpu;
+		hybrid_set_capacity_of_cpus();
+		goto unlock;
+	}
+
+	/* If this CPU is hybrid_max_perf_cpu, should it be replaced? */
+	if (cpu == hybrid_max_perf_cpu && cpu->capacity_perf < max_cap_perf) {
+		hybrid_update_cpu_capacity_scaling();
+		goto unlock;
+	}
+
+	hybrid_set_cpu_capacity(cpu);
+
+unlock:
+	mutex_unlock(&hybrid_capacity_lock);
+}
+
 static void intel_pstate_hwp_set(unsigned int cpu)
 {
 	struct cpudata *cpu_data = all_cpu_data[cpu];
@@ -1070,6 +1244,22 @@ static void intel_pstate_hwp_offline(struct cpudata *cpu)
 		value |= HWP_ENERGY_PERF_PREFERENCE(HWP_EPP_POWERSAVE);
 
 	wrmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, value);
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	if (!hybrid_max_perf_cpu) {
+		mutex_unlock(&hybrid_capacity_lock);
+
+		return;
+	}
+
+	if (hybrid_max_perf_cpu == cpu)
+		hybrid_update_cpu_capacity_scaling();
+
+	mutex_unlock(&hybrid_capacity_lock);
+
+	/* Reset the capacity of the CPU going offline to the initial value. */
+	hybrid_clear_cpu_capacity(cpu->cpu);
 }
 
 #define POWER_CTL_EE_ENABLE	1
@@ -1165,21 +1355,46 @@ static void __intel_pstate_update_max_freq(struct cpudata *cpudata,
 static void intel_pstate_update_limits(unsigned int cpu)
 {
 	struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
+	struct cpudata *cpudata;
 
 	if (!policy)
 		return;
 
-	__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+	cpudata = all_cpu_data[cpu];
+
+	__intel_pstate_update_max_freq(cpudata, policy);
+
+	/* Prevent the driver from being unregistered now. */
+	mutex_lock(&intel_pstate_driver_lock);
 
 	cpufreq_cpu_release(policy);
+
+	hybrid_update_capacity(cpudata);
+
+	mutex_unlock(&intel_pstate_driver_lock);
 }
 
 static void intel_pstate_update_limits_for_all(void)
 {
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		intel_pstate_update_limits(cpu);
+	for_each_possible_cpu(cpu) {
+		struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpu);
+
+		if (!policy)
+			continue;
+
+		__intel_pstate_update_max_freq(all_cpu_data[cpu], policy);
+
+		cpufreq_cpu_release(policy);
+	}
+
+	mutex_lock(&hybrid_capacity_lock);
+
+	if (hybrid_max_perf_cpu)
+		__hybrid_init_cpu_capacity_scaling();
+
+	mutex_unlock(&hybrid_capacity_lock);
 }
 
 /************************** sysfs begin ************************/
@@ -1618,6 +1833,13 @@ static void intel_pstate_notify_work(struct work_struct *work)
 		__intel_pstate_update_max_freq(cpudata, policy);
 
 		cpufreq_cpu_release(policy);
+
+		/*
+		 * The driver will not be unregistered while this function is
+		 * running, so update the capacity without acquiring the driver
+		 * lock.
+		 */
+		hybrid_update_capacity(cpudata);
 	}
 
 	wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_STATUS, 0);
@@ -2034,8 +2256,10 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
 
 		if (pstate_funcs.get_cpu_scaling) {
 			cpu->pstate.scaling = pstate_funcs.get_cpu_scaling(cpu->cpu);
-			if (cpu->pstate.scaling != perf_ctl_scaling)
+			if (cpu->pstate.scaling != perf_ctl_scaling) {
 				intel_pstate_hybrid_hwp_adjust(cpu);
+				hwp_is_hybrid = true;
+			}
 		} else {
 			cpu->pstate.scaling = perf_ctl_scaling;
 		}
@@ -2707,6 +2931,8 @@ static int intel_pstate_cpu_online(struct cpufreq_policy *policy)
 		 */
 		intel_pstate_hwp_reenable(cpu);
 		cpu->suspended = false;
+
+		hybrid_update_capacity(cpu);
 	}
 
 	return 0;
@@ -3147,6 +3373,8 @@ static int intel_pstate_register_driver(struct cpufreq_driver *driver)
 
 	global.min_perf_pct = min_perf_pct_min();
 
+	hybrid_init_cpu_capacity_scaling();
+
 	return 0;
 }
 

From 3f66425a4fc8663ad074cf70f432c681953bfcb7 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Tue, 6 Aug 2024 07:58:26 -0600
Subject: [PATCH 11/35] cpufreq: Enable COMPILE_TEST on Arm drivers

COMPILE_TEST is useful for build testing without requiring a specific
architecture's compiler. Enable it for most of the Arm CPUFreq drivers.

As Kconfig.arm is only included on ARM and ARM64, COMPILE_TEST is only
enabled for those architectures until that is dropped.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/Kconfig.arm | 50 ++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 20 deletions(-)

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 96b404ce829f..5f7e13e60c80 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -5,7 +5,7 @@
 
 config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
 	tristate "Allwinner nvmem based SUN50I CPUFreq driver"
-	depends on ARCH_SUNXI
+	depends on ARCH_SUNXI || COMPILE_TEST
 	depends on NVMEM_SUNXI_SID
 	select PM_OPP
 	help
@@ -26,15 +26,17 @@ config ARM_APPLE_SOC_CPUFREQ
 
 config ARM_ARMADA_37XX_CPUFREQ
 	tristate "Armada 37xx CPUFreq support"
-	depends on ARCH_MVEBU && CPUFREQ_DT
+	depends on ARCH_MVEBU || COMPILE_TEST
+	depends on CPUFREQ_DT
 	help
 	  This adds the CPUFreq driver support for Marvell Armada 37xx SoCs.
 	  The Armada 37xx PMU supports 4 frequency and VDD levels.
 
 config ARM_ARMADA_8K_CPUFREQ
 	tristate "Armada 8K CPUFreq driver"
-	depends on ARCH_MVEBU && CPUFREQ_DT
-	select ARMADA_AP_CPU_CLK
+	depends on ARCH_MVEBU || COMPILE_TEST
+	depends on CPUFREQ_DT
+	select ARMADA_AP_CPU_CLK if COMMON_CLK
 	help
 	  This enables the CPUFreq driver support for Marvell
 	  Armada8k SOCs.
@@ -56,7 +58,7 @@ config ARM_SCPI_CPUFREQ
 config ARM_VEXPRESS_SPC_CPUFREQ
 	tristate "Versatile Express SPC based CPUfreq driver"
 	depends on ARM_CPU_TOPOLOGY && HAVE_CLK
-	depends on ARCH_VEXPRESS_SPC
+	depends on ARCH_VEXPRESS_SPC || COMPILE_TEST
 	select PM_OPP
 	help
 	  This add the CPUfreq driver support for Versatile Express
@@ -75,7 +77,8 @@ config ARM_BRCMSTB_AVS_CPUFREQ
 
 config ARM_HIGHBANK_CPUFREQ
 	tristate "Calxeda Highbank-based"
-	depends on ARCH_HIGHBANK && CPUFREQ_DT && REGULATOR
+	depends on ARCH_HIGHBANK || COMPILE_TEST
+	depends on CPUFREQ_DT && REGULATOR && PL320_MBOX
 	default m
 	help
 	  This adds the CPUFreq driver for Calxeda Highbank SoC
@@ -96,7 +99,8 @@ config ARM_IMX6Q_CPUFREQ
 
 config ARM_IMX_CPUFREQ_DT
 	tristate "Freescale i.MX8M cpufreq support"
-	depends on ARCH_MXC && CPUFREQ_DT
+	depends on CPUFREQ_DT
+	depends on ARCH_MXC || COMPILE_TEST
 	help
 	  This adds cpufreq driver support for Freescale i.MX7/i.MX8M
 	  series SoCs, based on cpufreq-dt.
@@ -111,7 +115,8 @@ config ARM_KIRKWOOD_CPUFREQ
 
 config ARM_MEDIATEK_CPUFREQ
 	tristate "CPU Frequency scaling support for MediaTek SoCs"
-	depends on ARCH_MEDIATEK && REGULATOR
+	depends on ARCH_MEDIATEK || COMPILE_TEST
+	depends on REGULATOR
 	select PM_OPP
 	help
 	  This adds the CPUFreq driver support for MediaTek SoCs.
@@ -130,12 +135,12 @@ config ARM_MEDIATEK_CPUFREQ_HW
 
 config ARM_OMAP2PLUS_CPUFREQ
 	bool "TI OMAP2+"
-	depends on ARCH_OMAP2PLUS
+	depends on ARCH_OMAP2PLUS || COMPILE_TEST
 	default ARCH_OMAP2PLUS
 
 config ARM_QCOM_CPUFREQ_NVMEM
 	tristate "Qualcomm nvmem based CPUFreq"
-	depends on ARCH_QCOM
+	depends on ARCH_QCOM || COMPILE_TEST
 	depends on NVMEM_QCOM_QFPROM
 	depends on QCOM_SMEM
 	select PM_OPP
@@ -166,7 +171,7 @@ config ARM_RASPBERRYPI_CPUFREQ
 
 config ARM_S3C64XX_CPUFREQ
 	bool "Samsung S3C64XX"
-	depends on CPU_S3C6410
+	depends on CPU_S3C6410 || COMPILE_TEST
 	default y
 	help
 	  This adds the CPUFreq driver for Samsung S3C6410 SoC.
@@ -175,7 +180,7 @@ config ARM_S3C64XX_CPUFREQ
 
 config ARM_S5PV210_CPUFREQ
 	bool "Samsung S5PV210 and S5PC110"
-	depends on CPU_S5PV210
+	depends on CPU_S5PV210 || COMPILE_TEST
 	default y
 	help
 	  This adds the CPUFreq driver for Samsung S5PV210 and
@@ -199,14 +204,15 @@ config ARM_SCMI_CPUFREQ
 
 config ARM_SPEAR_CPUFREQ
 	bool "SPEAr CPUFreq support"
-	depends on PLAT_SPEAR
+	depends on PLAT_SPEAR || COMPILE_TEST
 	default y
 	help
 	  This adds the CPUFreq driver support for SPEAr SOCs.
 
 config ARM_STI_CPUFREQ
 	tristate "STi CPUFreq support"
-	depends on CPUFREQ_DT && SOC_STIH407
+	depends on CPUFREQ_DT
+	depends on SOC_STIH407 || COMPILE_TEST
 	help
 	  This driver uses the generic OPP framework to match the running
 	  platform with a predefined set of suitable values.  If not provided
@@ -216,34 +222,38 @@ config ARM_STI_CPUFREQ
 
 config ARM_TEGRA20_CPUFREQ
 	tristate "Tegra20/30 CPUFreq support"
-	depends on ARCH_TEGRA && CPUFREQ_DT
+	depends on ARCH_TEGRA || COMPILE_TEST
+	depends on CPUFREQ_DT
 	default y
 	help
 	  This adds the CPUFreq driver support for Tegra20/30 SOCs.
 
 config ARM_TEGRA124_CPUFREQ
 	bool "Tegra124 CPUFreq support"
-	depends on ARCH_TEGRA && CPUFREQ_DT
+	depends on ARCH_TEGRA || COMPILE_TEST
+	depends on CPUFREQ_DT
 	default y
 	help
 	  This adds the CPUFreq driver support for Tegra124 SOCs.
 
 config ARM_TEGRA186_CPUFREQ
 	tristate "Tegra186 CPUFreq support"
-	depends on ARCH_TEGRA && TEGRA_BPMP
+	depends on ARCH_TEGRA || COMPILE_TEST
+	depends on TEGRA_BPMP
 	help
 	  This adds the CPUFreq driver support for Tegra186 SOCs.
 
 config ARM_TEGRA194_CPUFREQ
 	tristate "Tegra194 CPUFreq support"
-	depends on ARCH_TEGRA_194_SOC && TEGRA_BPMP
+	depends on ARCH_TEGRA_194_SOC || (64BIT && COMPILE_TEST)
+	depends on TEGRA_BPMP
 	default y
 	help
 	  This adds CPU frequency driver support for Tegra194 SOCs.
 
 config ARM_TI_CPUFREQ
 	bool "Texas Instruments CPUFreq support"
-	depends on ARCH_OMAP2PLUS || ARCH_K3
+	depends on ARCH_OMAP2PLUS || ARCH_K3 || COMPILE_TEST
 	default y
 	help
 	  This driver enables valid OPPs on the running platform based on
@@ -255,7 +265,7 @@ config ARM_TI_CPUFREQ
 
 config ARM_PXA2xx_CPUFREQ
 	tristate "Intel PXA2xx CPUfreq driver"
-	depends on PXA27x || PXA25x
+	depends on PXA27x || PXA25x || COMPILE_TEST
 	help
 	  This add the CPUFreq driver support for Intel PXA2xx SOCs.
 

From 81746019b9fbb9fbf7c522dcbeefb572ac0f9458 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Tue, 6 Aug 2024 07:58:27 -0600
Subject: [PATCH 12/35] cpufreq: Drop CONFIG_ARM and CONFIG_ARM64 dependency on
 Arm drivers

The CONFIG_ARM and CONFIG_ARM64 dependency is redundant as all the
drivers have necessary sub-arch dependency and don't depend on the
architecture support.

Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/Kconfig | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 10cda6f2fe1d..2561b215432a 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -231,9 +231,7 @@ if X86
 source "drivers/cpufreq/Kconfig.x86"
 endif
 
-if ARM || ARM64
 source "drivers/cpufreq/Kconfig.arm"
-endif
 
 if PPC32 || PPC64
 source "drivers/cpufreq/Kconfig.powerpc"

From 49243adc715e6ae34d6cc003827e63bcf5b3a21d Mon Sep 17 00:00:00 2001
From: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Date: Fri, 9 Aug 2024 06:08:16 +0000
Subject: [PATCH 13/35] cpufreq/amd-pstate: Add the missing cpufreq_cpu_put()

Fix the reference counting of cpufreq_policy object in amd_pstate_update()
function by adding the missing cpufreq_cpu_put().

Fixes: e8f555daacd3 ("cpufreq/amd-pstate: fix setting policy current frequency value")
Signed-off-by: Dhananjay Ugwekar <Dhananjay.Ugwekar@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/amd-pstate.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 68c616b572f2..eff039ba49ee 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -554,12 +554,15 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf,
 	}
 
 	if (value == prev)
-		return;
+		goto cpufreq_policy_put;
 
 	WRITE_ONCE(cpudata->cppc_req_cached, value);
 
 	amd_pstate_update_perf(cpudata, min_perf, des_perf,
 			       max_perf, fast_switch);
+
+cpufreq_policy_put:
+	cpufreq_cpu_put(policy);
 }
 
 static int amd_pstate_verify(struct cpufreq_policy_data *policy)

From 24f9bfb847b7340b91e35742adf0ab87440e7079 Mon Sep 17 00:00:00 2001
From: "Rob Herring (Arm)" <robh@kernel.org>
Date: Fri, 9 Aug 2024 11:24:38 -0600
Subject: [PATCH 14/35] cpufreq: Fix warning on unused of_device_id tables for
 !CONFIG_OF

!CONFIG_OF builds cause warnings on unused of_device_id tables. This is
due to of_match_node() being a macro rather than static inline function.
Add a __maybe_unused annotation to the of_device_id tables.

Fixes: c7582ec85342 ("cpufreq: Drop CONFIG_ARM and CONFIG_ARM64 dependency on Arm drivers")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202408090714.wcrqU6Pk-lkp@intel.com/
Signed-off-by: Rob Herring (Arm) <robh@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/apple-soc-cpufreq.c  | 2 +-
 drivers/cpufreq/mediatek-cpufreq.c   | 2 +-
 drivers/cpufreq/qcom-cpufreq-nvmem.c | 2 +-
 drivers/cpufreq/ti-cpufreq.c         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/apple-soc-cpufreq.c b/drivers/cpufreq/apple-soc-cpufreq.c
index af34c22fa273..4dcacab9b4bf 100644
--- a/drivers/cpufreq/apple-soc-cpufreq.c
+++ b/drivers/cpufreq/apple-soc-cpufreq.c
@@ -85,7 +85,7 @@ static const struct apple_soc_cpufreq_info soc_default_info = {
 	.cur_pstate_mask = 0, /* fallback */
 };
 
-static const struct of_device_id apple_soc_cpufreq_of_match[] = {
+static const struct of_device_id apple_soc_cpufreq_of_match[] __maybe_unused = {
 	{
 		.compatible = "apple,t8103-cluster-cpufreq",
 		.data = &soc_t8103_info,
diff --git a/drivers/cpufreq/mediatek-cpufreq.c b/drivers/cpufreq/mediatek-cpufreq.c
index 3a1aadaa723c..663f61565cf7 100644
--- a/drivers/cpufreq/mediatek-cpufreq.c
+++ b/drivers/cpufreq/mediatek-cpufreq.c
@@ -738,7 +738,7 @@ static const struct mtk_cpufreq_platform_data mt8516_platform_data = {
 };
 
 /* List of machines supported by this driver */
-static const struct of_device_id mtk_cpufreq_machines[] __initconst = {
+static const struct of_device_id mtk_cpufreq_machines[] __initconst __maybe_unused = {
 	{ .compatible = "mediatek,mt2701", .data = &mt2701_platform_data },
 	{ .compatible = "mediatek,mt2712", .data = &mt2701_platform_data },
 	{ .compatible = "mediatek,mt7622", .data = &mt7622_platform_data },
diff --git a/drivers/cpufreq/qcom-cpufreq-nvmem.c b/drivers/cpufreq/qcom-cpufreq-nvmem.c
index 939702dfa73f..703308fb891a 100644
--- a/drivers/cpufreq/qcom-cpufreq-nvmem.c
+++ b/drivers/cpufreq/qcom-cpufreq-nvmem.c
@@ -611,7 +611,7 @@ static struct platform_driver qcom_cpufreq_driver = {
 	},
 };
 
-static const struct of_device_id qcom_cpufreq_match_list[] __initconst = {
+static const struct of_device_id qcom_cpufreq_match_list[] __initconst __maybe_unused = {
 	{ .compatible = "qcom,apq8096", .data = &match_data_kryo },
 	{ .compatible = "qcom,msm8909", .data = &match_data_msm8909 },
 	{ .compatible = "qcom,msm8996", .data = &match_data_kryo },
diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index 4d3f27958fbd..220fff7a302e 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -419,7 +419,7 @@ static int ti_cpufreq_setup_syscon_register(struct ti_cpufreq_data *opp_data)
 	return 0;
 }
 
-static const struct of_device_id ti_cpufreq_of_match[] = {
+static const struct of_device_id ti_cpufreq_of_match[]  __maybe_unused = {
 	{ .compatible = "ti,am33xx", .data = &am3x_soc_data, },
 	{ .compatible = "ti,am3517", .data = &am3517_soc_data, },
 	{ .compatible = "ti,am43", .data = &am4x_soc_data, },

From b14ceb82c3a1b6f1422af4ea7a2a686dbaf0a094 Mon Sep 17 00:00:00 2001
From: Danila Tikhonov <danila@jiaxyga.com>
Date: Thu, 8 Aug 2024 21:40:17 +0300
Subject: [PATCH 15/35] cpufreq: Add SM7325 to cpufreq-dt-platdev blocklist

The Qualcomm SM7325 platform uses the qcom-cpufreq-hw driver, so add
it to the cpufreq-dt-platdev driver's blocklist.

Signed-off-by: Danila Tikhonov <danila@jiaxyga.com>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index cac379ba006d..18942bfe9c95 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -166,6 +166,7 @@ static const struct of_device_id blocklist[] __initconst = {
 	{ .compatible = "qcom,sm6350", },
 	{ .compatible = "qcom,sm6375", },
 	{ .compatible = "qcom,sm7225", },
+	{ .compatible = "qcom,sm7325", },
 	{ .compatible = "qcom,sm8150", },
 	{ .compatible = "qcom,sm8250", },
 	{ .compatible = "qcom,sm8350", },

From 5493f9714e4cdaf0ee7cec15899a231400cb1a9f Mon Sep 17 00:00:00 2001
From: Anastasia Belova <abelova@astralinux.ru>
Date: Mon, 26 Aug 2024 16:38:41 +0300
Subject: [PATCH 16/35] cpufreq: amd-pstate: add check for cpufreq_cpu_get's
 return value

cpufreq_cpu_get may return NULL. To avoid NULL-dereference check it
and return in case of error.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Signed-off-by: Anastasia Belova <abelova@astralinux.ru>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/amd-pstate.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index eff039ba49ee..f4b9fef3342b 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -659,7 +659,12 @@ static void amd_pstate_adjust_perf(unsigned int cpu,
 	unsigned long max_perf, min_perf, des_perf,
 		      cap_perf, lowest_nonlinear_perf;
 	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
-	struct amd_cpudata *cpudata = policy->driver_data;
+	struct amd_cpudata *cpudata;
+
+	if (!policy)
+		return;
+
+	cpudata = policy->driver_data;
 
 	if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq)
 		amd_pstate_update_min_max_limit(policy);
@@ -873,11 +878,16 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
 static void amd_pstate_update_limits(unsigned int cpu)
 {
 	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
-	struct amd_cpudata *cpudata = policy->driver_data;
+	struct amd_cpudata *cpudata;
 	u32 prev_high = 0, cur_high = 0;
 	int ret;
 	bool highest_perf_changed = false;
 
+	if (!policy)
+		return;
+
+	cpudata = policy->driver_data;
+
 	mutex_lock(&amd_pstate_driver_lock);
 	if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
 		goto free_cpufreq_put;

From 2b7ec33e534f7a10033a5cf07794acf48b182bbe Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Wed, 28 Aug 2024 14:24:59 +0800
Subject: [PATCH 17/35] cpufreq: loongson3: Use raw_smp_processor_id() in
 do_service_request()

Use raw_smp_processor_id() instead of plain smp_processor_id() in
do_service_request(), otherwise we may get some errors with the driver
enabled:

 BUG: using smp_processor_id() in preemptible [00000000] code: (udev-worker)/208
 caller is loongson3_cpufreq_probe+0x5c/0x250 [loongson3_cpufreq]

Reported-by: Xi Ruoyao <xry111@xry111.site>
Tested-by: Binbin Zhou <zhoubinbin@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/loongson3_cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/loongson3_cpufreq.c b/drivers/cpufreq/loongson3_cpufreq.c
index 5f79b6de127c..6b5e6798d9a2 100644
--- a/drivers/cpufreq/loongson3_cpufreq.c
+++ b/drivers/cpufreq/loongson3_cpufreq.c
@@ -176,7 +176,7 @@ static DEFINE_PER_CPU(struct loongson3_freq_data *, freq_data);
 static inline int do_service_request(u32 id, u32 info, u32 cmd, u32 val, u32 extra)
 {
 	int retries;
-	unsigned int cpu = smp_processor_id();
+	unsigned int cpu = raw_smp_processor_id();
 	unsigned int package = cpu_data[cpu].package;
 	union smc_message msg, last;
 

From abc00ffda43bd4ba85896713464c7510c39f8165 Mon Sep 17 00:00:00 2001
From: Nishanth Menon <nm@ti.com>
Date: Wed, 28 Aug 2024 08:19:15 -0500
Subject: [PATCH 18/35] cpufreq: ti-cpufreq: Introduce quirks to handle syscon
 fails appropriately

Commit b4bc9f9e27ed ("cpufreq: ti-cpufreq: add support for omap34xx
and omap36xx") introduced special handling for OMAP3 class devices
where syscon node may not be present. However, this also creates a bug
where the syscon node is present, however the offset used to read
is beyond the syscon defined range.

Fix this by providing a quirk option that is populated when such
special handling is required. This allows proper failure for all other
platforms when the syscon node and efuse offsets are mismatched.

Fixes: b4bc9f9e27ed ("cpufreq: ti-cpufreq: add support for omap34xx and omap36xx")
Signed-off-by: Nishanth Menon <nm@ti.com>
Tested-by: Dhruva Gole <d-gole@ti.com>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/ti-cpufreq.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index 220fff7a302e..804329e81eb8 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -90,6 +90,9 @@ struct ti_cpufreq_soc_data {
 	unsigned long efuse_shift;
 	unsigned long rev_offset;
 	bool multi_regulator;
+/* Backward compatibility hack: Might have missing syscon */
+#define TI_QUIRK_SYSCON_MAY_BE_MISSING	0x1
+	u8 quirks;
 };
 
 struct ti_cpufreq_data {
@@ -254,6 +257,7 @@ static struct ti_cpufreq_soc_data omap34xx_soc_data = {
 	.efuse_mask = BIT(3),
 	.rev_offset = OMAP3_CONTROL_IDCODE - OMAP3_SYSCON_BASE,
 	.multi_regulator = false,
+	.quirks = TI_QUIRK_SYSCON_MAY_BE_MISSING,
 };
 
 /*
@@ -281,6 +285,7 @@ static struct ti_cpufreq_soc_data omap36xx_soc_data = {
 	.efuse_mask = BIT(9),
 	.rev_offset = OMAP3_CONTROL_IDCODE - OMAP3_SYSCON_BASE,
 	.multi_regulator = true,
+	.quirks = TI_QUIRK_SYSCON_MAY_BE_MISSING,
 };
 
 /*
@@ -295,6 +300,7 @@ static struct ti_cpufreq_soc_data am3517_soc_data = {
 	.efuse_mask = 0,
 	.rev_offset = OMAP3_CONTROL_IDCODE - OMAP3_SYSCON_BASE,
 	.multi_regulator = false,
+	.quirks = TI_QUIRK_SYSCON_MAY_BE_MISSING,
 };
 
 static struct ti_cpufreq_soc_data am625_soc_data = {
@@ -340,7 +346,7 @@ static int ti_cpufreq_get_efuse(struct ti_cpufreq_data *opp_data,
 
 	ret = regmap_read(opp_data->syscon, opp_data->soc_data->efuse_offset,
 			  &efuse);
-	if (ret == -EIO) {
+	if (opp_data->soc_data->quirks & TI_QUIRK_SYSCON_MAY_BE_MISSING && ret == -EIO) {
 		/* not a syscon register! */
 		void __iomem *regs = ioremap(OMAP3_SYSCON_BASE +
 				opp_data->soc_data->efuse_offset, 4);
@@ -381,7 +387,7 @@ static int ti_cpufreq_get_rev(struct ti_cpufreq_data *opp_data,
 
 	ret = regmap_read(opp_data->syscon, opp_data->soc_data->rev_offset,
 			  &revision);
-	if (ret == -EIO) {
+	if (opp_data->soc_data->quirks & TI_QUIRK_SYSCON_MAY_BE_MISSING && ret == -EIO) {
 		/* not a syscon register! */
 		void __iomem *regs = ioremap(OMAP3_SYSCON_BASE +
 				opp_data->soc_data->rev_offset, 4);

From 87fa4bd351a51ec1eb2d4854f4112c7811fef519 Mon Sep 17 00:00:00 2001
From: Liu Jing <liujing@cmss.chinamobile.com>
Date: Mon, 2 Sep 2024 16:28:16 +0800
Subject: [PATCH 19/35] cpufreq: Fix the cacography in powernv-cpufreq.c

The word 'swtich' is wrong, so fix it.

Signed-off-by: Liu Jing <liujing@cmss.chinamobile.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/powernv-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 50c62929f7ca..c1b6280de5dc 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -692,7 +692,7 @@ static void gpstate_timer_handler(struct timer_list *t)
 	}
 
 	/*
-	 * If PMCR was last updated was using fast_swtich then
+	 * If PMCR was last updated was using fast_switch then
 	 * We may have wrong in gpstate->last_lpstate_idx
 	 * value. Hence, read from PMCR to get correct data.
 	 */

From 6b612d1bac67b0f483fde7779a45f6310274d4eb Mon Sep 17 00:00:00 2001
From: Dhruva Gole <d-gole@ti.com>
Date: Mon, 2 Sep 2024 14:51:35 +0530
Subject: [PATCH 20/35] cpufreq: ti-cpufreq: Use socinfo to get revision in
 AM62 family

In the AM62x, AM62Ax, and AM62Px devices, we already have the revision
info within the k3-socinfo driver. Hence, re-use this information from
there instead of re using the offset for 2 drivers trying to get the
same information ie. revision.

Signed-off-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/ti-cpufreq.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/cpufreq/ti-cpufreq.c b/drivers/cpufreq/ti-cpufreq.c
index 804329e81eb8..ba621ce1cdda 100644
--- a/drivers/cpufreq/ti-cpufreq.c
+++ b/drivers/cpufreq/ti-cpufreq.c
@@ -16,6 +16,7 @@
 #include <linux/pm_opp.h>
 #include <linux/regmap.h>
 #include <linux/slab.h>
+#include <linux/sys_soc.h>
 
 #define REVISION_MASK				0xF
 #define REVISION_SHIFT				28
@@ -303,6 +304,13 @@ static struct ti_cpufreq_soc_data am3517_soc_data = {
 	.quirks = TI_QUIRK_SYSCON_MAY_BE_MISSING,
 };
 
+static const struct soc_device_attribute k3_cpufreq_soc[] = {
+	{ .family = "AM62X", .revision = "SR1.0" },
+	{ .family = "AM62AX", .revision = "SR1.0" },
+	{ .family = "AM62PX", .revision = "SR1.0" },
+	{ /* sentinel */ }
+};
+
 static struct ti_cpufreq_soc_data am625_soc_data = {
 	.efuse_xlate = am625_efuse_xlate,
 	.efuse_offset = 0x0018,
@@ -384,6 +392,16 @@ static int ti_cpufreq_get_rev(struct ti_cpufreq_data *opp_data,
 	struct device *dev = opp_data->cpu_dev;
 	u32 revision;
 	int ret;
+	if (soc_device_match(k3_cpufreq_soc)) {
+		/*
+		 * Since the SR is 1.0, hard code the revision_value as
+		 * 0x1 here. This way we avoid re using the same register
+		 * that is giving us required information inside socinfo
+		 * anyway.
+		 */
+		*revision_value = 0x1;
+		goto done;
+	}
 
 	ret = regmap_read(opp_data->syscon, opp_data->soc_data->rev_offset,
 			  &revision);
@@ -406,6 +424,7 @@ static int ti_cpufreq_get_rev(struct ti_cpufreq_data *opp_data,
 
 	*revision_value = BIT((revision >> REVISION_SHIFT) & REVISION_MASK);
 
+done:
 	return 0;
 }
 

From c3e093efbc6cac7bf9dc531dcb751b86daaa65b0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sun, 1 Sep 2024 00:00:35 -0500
Subject: [PATCH 21/35] cpufreq/amd-pstate: Catch failures for
 amd_pstate_epp_update_limit()

amd_pstate_set_epp() calls cppc_set_epp_perf() which can fail for
a variety of reasons but this is ignored.  Change the return flow
to allow failures.

Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 89bda7a2bb8d..b5fcde1bed32 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -1555,7 +1555,7 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy)
 	pr_debug("CPU %d exiting\n", policy->cpu);
 }
 
-static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
+static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
 {
 	struct amd_cpudata *cpudata = policy->driver_data;
 	u32 max_perf, min_perf, min_limit_perf, max_limit_perf;
@@ -1605,7 +1605,7 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
 		 * This return value can only be negative for shared_memory
 		 * systems where EPP register read/write not supported.
 		 */
-		return;
+		return epp;
 	}
 
 	if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE)
@@ -1618,12 +1618,13 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy)
 	}
 
 	WRITE_ONCE(cpudata->cppc_req_cached, value);
-	amd_pstate_set_epp(cpudata, epp);
+	return amd_pstate_set_epp(cpudata, epp);
 }
 
 static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
 {
 	struct amd_cpudata *cpudata = policy->driver_data;
+	int ret;
 
 	if (!policy->cpuinfo.max_freq)
 		return -ENODEV;
@@ -1633,7 +1634,9 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy)
 
 	cpudata->policy = policy->policy;
 
-	amd_pstate_epp_update_limit(policy);
+	ret = amd_pstate_epp_update_limit(policy);
+	if (ret)
+		return ret;
 
 	/*
 	 * policy->cur is never updated with the amd_pstate_epp driver, but it

From 2bcec09cc4ae62229e149213499e45b74190a24a Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:51 -0500
Subject: [PATCH 22/35] x86/amd: Move amd_get_highest_perf() from amd.c to
 cppc.c

To prepare to let amd_get_highest_perf() detect preferred cores
it will require CPPC functions. Move amd_get_highest_perf() to
cppc.c to prepare for 'preferred core detection' rework.

No functional changes intended.

Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 arch/x86/kernel/acpi/cppc.c | 16 ++++++++++++++++
 arch/x86/kernel/cpu/amd.c   | 16 ----------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index ff8f25faca3d..7ec8f2ce859c 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -116,3 +116,19 @@ void init_freq_invariance_cppc(void)
 	init_done = true;
 	mutex_unlock(&freq_invariance_lock);
 }
+
+u32 amd_get_highest_perf(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+
+	if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
+			       (c->x86_model >= 0x70 && c->x86_model < 0x80)))
+		return 166;
+
+	if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
+			       (c->x86_model >= 0x40 && c->x86_model < 0x70)))
+		return 166;
+
+	return 255;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index be5889bded49..1be1f913434f 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1190,22 +1190,6 @@ unsigned long amd_get_dr_addr_mask(unsigned int dr)
 }
 EXPORT_SYMBOL_GPL(amd_get_dr_addr_mask);
 
-u32 amd_get_highest_perf(void)
-{
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
-			       (c->x86_model >= 0x70 && c->x86_model < 0x80)))
-		return 166;
-
-	if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
-			       (c->x86_model >= 0x40 && c->x86_model < 0x70)))
-		return 166;
-
-	return 255;
-}
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
-
 static void zenbleed_check_cpu(void *unused)
 {
 	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());

From 01ced022e125f7b328335bb2944a107afcafe351 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 3 Sep 2024 15:32:16 -0500
Subject: [PATCH 23/35] ACPI: CPPC: Adjust return code for inline functions in
 !CONFIG_ACPI_CPPC_LIB

Checkpath emits the following warning:
```
WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP
```

Adjust the code accordingly.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 include/acpi/cppc_acpi.h | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 930b6afba6f4..409a7190a4a8 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -162,31 +162,31 @@ extern int cppc_set_auto_sel(int cpu, bool enable);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_set_enable(int cpu, bool enable)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_get_perf_caps(int cpu, struct cppc_perf_caps *caps)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline bool cppc_perf_ctrs_in_pcc(void)
 {
@@ -210,27 +210,27 @@ static inline bool cpc_ffh_supported(void)
 }
 static inline int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_get_epp_perf(int cpunum, u64 *epp_perf)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_set_auto_sel(int cpu, bool enable)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps)
 {
-	return -ENOTSUPP;
+	return -EOPNOTSUPP;
 }
 #endif /* !CONFIG_ACPI_CPPC_LIB */
 

From 6c09e3b445a1a647a5b57ea6afd23e846225dd8f Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:52 -0500
Subject: [PATCH 24/35] x86/amd: Rename amd_get_highest_perf() to
 amd_get_boost_ratio_numerator()

The function name is ambiguous because it returns an intermediate value
for calculating maximum frequency rather than the CPPC 'Highest Perf'
register.

Rename the function to clarify its use and allow the function to return
errors. Adjust the consumer in acpi-cpufreq to catch errors.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 arch/x86/include/asm/processor.h |  3 ---
 arch/x86/kernel/acpi/cppc.c      | 44 +++++++++++++++++++++++---------
 drivers/cpufreq/acpi-cpufreq.c   | 12 ++++++---
 include/acpi/cppc_acpi.h         |  5 ++++
 4 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a75a07f4931f..775acbdea1a9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -691,8 +691,6 @@ static inline u32 per_cpu_l2c_id(unsigned int cpu)
 }
 
 #ifdef CONFIG_CPU_SUP_AMD
-extern u32 amd_get_highest_perf(void);
-
 /*
  * Issue a DIV 0/1 insn to clear any division data from previous DIV
  * operations.
@@ -705,7 +703,6 @@ static __always_inline void amd_clear_divider(void)
 
 extern void amd_check_microcode(void);
 #else
-static inline u32 amd_get_highest_perf(void)		{ return 0; }
 static inline void amd_clear_divider(void)		{ }
 static inline void amd_check_microcode(void)		{ }
 #endif
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 7ec8f2ce859c..660cfeb6384b 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -69,7 +69,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val)
 static void amd_set_max_freq_ratio(void)
 {
 	struct cppc_perf_caps perf_caps;
-	u64 highest_perf, nominal_perf;
+	u64 numerator, nominal_perf;
 	u64 perf_ratio;
 	int rc;
 
@@ -79,15 +79,19 @@ static void amd_set_max_freq_ratio(void)
 		return;
 	}
 
-	highest_perf = amd_get_highest_perf();
+	rc = amd_get_boost_ratio_numerator(0, &numerator);
+	if (rc) {
+		pr_debug("Could not retrieve highest performance (%d)\n", rc);
+		return;
+	}
 	nominal_perf = perf_caps.nominal_perf;
 
-	if (!highest_perf || !nominal_perf) {
-		pr_debug("Could not retrieve highest or nominal performance\n");
+	if (!nominal_perf) {
+		pr_debug("Could not retrieve nominal performance\n");
 		return;
 	}
 
-	perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf);
+	perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
 	/* midpoint between max_boost and max_P */
 	perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
 	if (!perf_ratio) {
@@ -117,18 +121,34 @@ void init_freq_invariance_cppc(void)
 	mutex_unlock(&freq_invariance_lock);
 }
 
-u32 amd_get_highest_perf(void)
+/**
+ * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
+ * @cpu: CPU to get numerator for.
+ * @numerator: Output variable for numerator.
+ *
+ * Determine the numerator to use for calculating the boost ratio on
+ * a CPU. On systems that support preferred cores, this will be a hardcoded
+ * value. On other systems this will the highest performance register value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
 {
 	struct cpuinfo_x86 *c = &boot_cpu_data;
 
 	if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
-			       (c->x86_model >= 0x70 && c->x86_model < 0x80)))
-		return 166;
+			       (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
+		*numerator = 166;
+		return 0;
+	}
 
 	if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
-			       (c->x86_model >= 0x40 && c->x86_model < 0x70)))
-		return 166;
+			       (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
+		*numerator = 166;
+		return 0;
+	}
+	*numerator = 255;
 
-	return 255;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+EXPORT_SYMBOL_GPL(amd_get_boost_ratio_numerator);
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index a8ca625a98b8..0f04feb6cafa 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -642,10 +642,16 @@ static u64 get_max_boost_ratio(unsigned int cpu)
 		return 0;
 	}
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		highest_perf = amd_get_highest_perf();
-	else
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+		ret = amd_get_boost_ratio_numerator(cpu, &highest_perf);
+		if (ret) {
+			pr_debug("CPU%d: Unable to get boost ratio numerator (%d)\n",
+				 cpu, ret);
+			return 0;
+		}
+	} else {
 		highest_perf = perf_caps.highest_perf;
+	}
 
 	nominal_perf = perf_caps.nominal_perf;
 
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 409a7190a4a8..97861abc5f5b 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -159,6 +159,7 @@ extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf);
 extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
 extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
 extern int cppc_set_auto_sel(int cpu, bool enable);
+extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
@@ -232,6 +233,10 @@ static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf
 {
 	return -EOPNOTSUPP;
 }
+static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* !CONFIG_ACPI_CPPC_LIB */
 
 #endif /* _CPPC_ACPI_H*/

From 3355ac2541052154b6ca0b1263be5bf49dfa0158 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 27 Aug 2024 13:50:45 -0500
Subject: [PATCH 25/35] ACPI: CPPC: Drop check for non zero perf ratio

perf_ratio is a u64 and SCHED_CAPACITY_SCALE is a large number.
Shifting by one will never have a zero value.

Drop the check.

Suggested-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.sheoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 arch/x86/kernel/acpi/cppc.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index 660cfeb6384b..e65c77afab31 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -91,13 +91,8 @@ static void amd_set_max_freq_ratio(void)
 		return;
 	}
 
-	perf_ratio = div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf);
 	/* midpoint between max_boost and max_P */
-	perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1;
-	if (!perf_ratio) {
-		pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n");
-		return;
-	}
+	perf_ratio = (div_u64(numerator * SCHED_CAPACITY_SCALE, nominal_perf) + SCHED_CAPACITY_SCALE) >> 1;
 
 	freq_invariance_set_perf_ratio(perf_ratio, false);
 }

From 21fb59ab4b9767085f4fe1edbdbe3177fbb9ec97 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:53 -0500
Subject: [PATCH 26/35] ACPI: CPPC: Adjust debug messages in
 amd_set_max_freq_ratio() to warn

If the boost ratio isn't calculated properly for the system for any
reason this can cause other problems that are non-obvious.

Raise all messages to warn instead.

Suggested-by: Perry Yuan <Perry.Yuan@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 arch/x86/kernel/acpi/cppc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index e65c77afab31..f0328ce98a8f 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -75,19 +75,19 @@ static void amd_set_max_freq_ratio(void)
 
 	rc = cppc_get_perf_caps(0, &perf_caps);
 	if (rc) {
-		pr_debug("Could not retrieve perf counters (%d)\n", rc);
+		pr_warn("Could not retrieve perf counters (%d)\n", rc);
 		return;
 	}
 
 	rc = amd_get_boost_ratio_numerator(0, &numerator);
 	if (rc) {
-		pr_debug("Could not retrieve highest performance (%d)\n", rc);
+		pr_warn("Could not retrieve highest performance (%d)\n", rc);
 		return;
 	}
 	nominal_perf = perf_caps.nominal_perf;
 
 	if (!nominal_perf) {
-		pr_debug("Could not retrieve nominal performance\n");
+		pr_warn("Could not retrieve nominal performance\n");
 		return;
 	}
 

From 2819bfef6483c66c55064ca678f2630a1a09f3f9 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:54 -0500
Subject: [PATCH 27/35] x86/amd: Move amd_get_highest_perf() out of amd-pstate

amd_pstate_get_highest_perf() is a helper used to get the highest perf
value on AMD systems.  It's used in amd-pstate as part of preferred
core handling, but applicable for acpi-cpufreq as well.

Move it out to cppc handling code as amd_get_highest_perf().

Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 arch/x86/kernel/acpi/cppc.c  | 30 ++++++++++++++++++++++++++++++
 drivers/cpufreq/amd-pstate.c | 34 ++--------------------------------
 include/acpi/cppc_acpi.h     |  5 +++++
 3 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index f0328ce98a8f..a75dcb382c78 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -116,6 +116,36 @@ void init_freq_invariance_cppc(void)
 	mutex_unlock(&freq_invariance_lock);
 }
 
+/*
+ * Get the highest performance register value.
+ * @cpu: CPU from which to get highest performance.
+ * @highest_perf: Return address for highest performance value.
+ *
+ * Return: 0 for success, negative error code otherwise.
+ */
+int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+	u64 val;
+	int ret;
+
+	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
+		ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val);
+		if (ret)
+			goto out;
+
+		val = AMD_CPPC_HIGHEST_PERF(val);
+	} else {
+		ret = cppc_get_highest_perf(cpu, &val);
+		if (ret)
+			goto out;
+	}
+
+	WRITE_ONCE(*highest_perf, (u32)val);
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(amd_get_highest_perf);
+
 /**
  * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
  * @cpu: CPU to get numerator for.
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index f4b9fef3342b..8da567913f63 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -811,36 +811,6 @@ static void amd_pstste_sched_prefcore_workfn(struct work_struct *work)
 }
 static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);
 
-/*
- * Get the highest performance register value.
- * @cpu: CPU from which to get highest performance.
- * @highest_perf: Return address.
- *
- * Return: 0 for success, -EIO otherwise.
- */
-static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf)
-{
-	int ret;
-
-	if (cpu_feature_enabled(X86_FEATURE_CPPC)) {
-		u64 cap1;
-
-		ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1);
-		if (ret)
-			return ret;
-		WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
-	} else {
-		u64 cppc_highest_perf;
-
-		ret = cppc_get_highest_perf(cpu, &cppc_highest_perf);
-		if (ret)
-			return ret;
-		WRITE_ONCE(*highest_perf, cppc_highest_perf);
-	}
-
-	return (ret);
-}
-
 #define CPPC_MAX_PERF	U8_MAX
 
 static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
@@ -848,7 +818,7 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
 	int ret, prio;
 	u32 highest_perf;
 
-	ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf);
+	ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
 	if (ret)
 		return;
 
@@ -892,7 +862,7 @@ static void amd_pstate_update_limits(unsigned int cpu)
 	if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
 		goto free_cpufreq_put;
 
-	ret = amd_pstate_get_highest_perf(cpu, &cur_high);
+	ret = amd_get_highest_perf(cpu, &cur_high);
 	if (ret)
 		goto free_cpufreq_put;
 
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index 97861abc5f5b..f7c7abf2a95e 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -159,6 +159,7 @@ extern int cppc_get_epp_perf(int cpunum, u64 *epp_perf);
 extern int cppc_set_epp_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls, bool enable);
 extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
 extern int cppc_set_auto_sel(int cpu, bool enable);
+extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
 extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
@@ -233,6 +234,10 @@ static inline int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf
 {
 	return -EOPNOTSUPP;
 }
+static inline int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
+{
+	return -ENODEV;
+}
 static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
 {
 	return -EOPNOTSUPP;

From 279f838a61f96cbfeb1f9ba060e4a452e6e041d0 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:55 -0500
Subject: [PATCH 28/35] x86/amd: Detect preferred cores in
 amd_get_boost_ratio_numerator()

AMD systems that support preferred cores will use "166" as their
numerator for max frequency calculations instead of "255".

Add a function for detecting preferred cores by looking at the
highest perf value on all cores.

If preferred cores are enabled return 166 and if disabled the
value in the highest perf register. As the function will be called
multiple times, cache the values for the boost numerator and if
preferred cores will be enabled in global variables.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 arch/x86/kernel/acpi/cppc.c  | 95 +++++++++++++++++++++++++++++++-----
 drivers/cpufreq/amd-pstate.c | 34 +++++--------
 include/acpi/cppc_acpi.h     |  5 ++
 3 files changed, 102 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index a75dcb382c78..df367bc35930 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,16 @@
 #include <asm/processor.h>
 #include <asm/topology.h>
 
+#define CPPC_HIGHEST_PERF_PREFCORE	166
+
+enum amd_pref_core {
+	AMD_PREF_CORE_UNKNOWN = 0,
+	AMD_PREF_CORE_SUPPORTED,
+	AMD_PREF_CORE_UNSUPPORTED,
+};
+static enum amd_pref_core amd_pref_core_detected;
+static u64 boost_numerator;
+
 /* Refer to drivers/acpi/cppc_acpi.c for the description of functions */
 
 bool cpc_supported_by_cpu(void)
@@ -146,6 +156,66 @@ int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf)
 }
 EXPORT_SYMBOL_GPL(amd_get_highest_perf);
 
+/**
+ * amd_detect_prefcore: Detect if CPUs in the system support preferred cores
+ * @detected: Output variable for the result of the detection.
+ *
+ * Determine whether CPUs in the system support preferred cores. On systems
+ * that support preferred cores, different highest perf values will be found
+ * on different cores. On other systems, the highest perf value will be the
+ * same on all cores.
+ *
+ * The result of the detection will be stored in the 'detected' parameter.
+ *
+ * Return: 0 for success, negative error code otherwise
+ */
+int amd_detect_prefcore(bool *detected)
+{
+	int cpu, count = 0;
+	u64 highest_perf[2] = {0};
+
+	if (WARN_ON(!detected))
+		return -EINVAL;
+
+	switch (amd_pref_core_detected) {
+	case AMD_PREF_CORE_SUPPORTED:
+		*detected = true;
+		return 0;
+	case AMD_PREF_CORE_UNSUPPORTED:
+		*detected = false;
+		return 0;
+	default:
+		break;
+	}
+
+	for_each_present_cpu(cpu) {
+		u32 tmp;
+		int ret;
+
+		ret = amd_get_highest_perf(cpu, &tmp);
+		if (ret)
+			return ret;
+
+		if (!count || (count == 1 && tmp != highest_perf[0]))
+			highest_perf[count++] = tmp;
+
+		if (count == 2)
+			break;
+	}
+
+	*detected = (count == 2);
+	boost_numerator = highest_perf[0];
+
+	amd_pref_core_detected = *detected ? AMD_PREF_CORE_SUPPORTED :
+					     AMD_PREF_CORE_UNSUPPORTED;
+
+	pr_debug("AMD CPPC preferred core is %ssupported (highest perf: 0x%llx)\n",
+		 *detected ? "" : "un", highest_perf[0]);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(amd_detect_prefcore);
+
 /**
  * amd_get_boost_ratio_numerator: Get the numerator to use for boost ratio calculation
  * @cpu: CPU to get numerator for.
@@ -155,24 +225,27 @@ EXPORT_SYMBOL_GPL(amd_get_highest_perf);
  * a CPU. On systems that support preferred cores, this will be a hardcoded
  * value. On other systems this will the highest performance register value.
  *
+ * If booting the system with amd-pstate enabled but preferred cores disabled then
+ * the correct boost numerator will be returned to match hardware capabilities
+ * even if the preferred cores scheduling hints are not enabled.
+ *
  * Return: 0 for success, negative error code otherwise.
  */
 int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
 {
-	struct cpuinfo_x86 *c = &boot_cpu_data;
+	bool prefcore;
+	int ret;
 
-	if (c->x86 == 0x17 && ((c->x86_model >= 0x30 && c->x86_model < 0x40) ||
-			       (c->x86_model >= 0x70 && c->x86_model < 0x80))) {
-		*numerator = 166;
+	ret = amd_detect_prefcore(&prefcore);
+	if (ret)
+		return ret;
+
+	/* without preferred cores, return the highest perf register value */
+	if (!prefcore) {
+		*numerator = boost_numerator;
 		return 0;
 	}
-
-	if (c->x86 == 0x19 && ((c->x86_model >= 0x20 && c->x86_model < 0x30) ||
-			       (c->x86_model >= 0x40 && c->x86_model < 0x70))) {
-		*numerator = 166;
-		return 0;
-	}
-	*numerator = 255;
+	*numerator = CPPC_HIGHEST_PERF_PREFCORE;
 
 	return 0;
 }
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 8da567913f63..937a3a817743 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -815,32 +815,18 @@ static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn);
 
 static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata)
 {
-	int ret, prio;
-	u32 highest_perf;
-
-	ret = amd_get_highest_perf(cpudata->cpu, &highest_perf);
-	if (ret)
+	/* user disabled or not detected */
+	if (!amd_pstate_prefcore)
 		return;
 
 	cpudata->hw_prefcore = true;
-	/* check if CPPC preferred core feature is enabled*/
-	if (highest_perf < CPPC_MAX_PERF)
-		prio = (int)highest_perf;
-	else {
-		pr_debug("AMD CPPC preferred core is unsupported!\n");
-		cpudata->hw_prefcore = false;
-		return;
-	}
-
-	if (!amd_pstate_prefcore)
-		return;
 
 	/*
 	 * The priorities can be set regardless of whether or not
 	 * sched_set_itmt_support(true) has been called and it is valid to
 	 * update them at any time after it has been called.
 	 */
-	sched_set_itmt_core_prio(prio, cpudata->cpu);
+	sched_set_itmt_core_prio((int)READ_ONCE(cpudata->highest_perf), cpudata->cpu);
 
 	schedule_work(&sched_prefcore_work);
 }
@@ -1011,12 +997,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy)
 
 	cpudata->cpu = policy->cpu;
 
-	amd_pstate_init_prefcore(cpudata);
-
 	ret = amd_pstate_init_perf(cpudata);
 	if (ret)
 		goto free_cpudata1;
 
+	amd_pstate_init_prefcore(cpudata);
+
 	ret = amd_pstate_init_freq(cpudata);
 	if (ret)
 		goto free_cpudata1;
@@ -1466,12 +1452,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
 	cpudata->cpu = policy->cpu;
 	cpudata->epp_policy = 0;
 
-	amd_pstate_init_prefcore(cpudata);
-
 	ret = amd_pstate_init_perf(cpudata);
 	if (ret)
 		goto free_cpudata1;
 
+	amd_pstate_init_prefcore(cpudata);
+
 	ret = amd_pstate_init_freq(cpudata);
 	if (ret)
 		goto free_cpudata1;
@@ -1916,6 +1902,12 @@ static int __init amd_pstate_init(void)
 		static_call_update(amd_pstate_update_perf, cppc_update_perf);
 	}
 
+	if (amd_pstate_prefcore) {
+		ret = amd_detect_prefcore(&amd_pstate_prefcore);
+		if (ret)
+			return ret;
+	}
+
 	/* enable amd pstate feature */
 	ret = amd_pstate_enable(true);
 	if (ret) {
diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h
index f7c7abf2a95e..482e0587a041 100644
--- a/include/acpi/cppc_acpi.h
+++ b/include/acpi/cppc_acpi.h
@@ -161,6 +161,7 @@ extern int cppc_get_auto_sel_caps(int cpunum, struct cppc_perf_caps *perf_caps);
 extern int cppc_set_auto_sel(int cpu, bool enable);
 extern int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf);
 extern int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator);
+extern int amd_detect_prefcore(bool *detected);
 #else /* !CONFIG_ACPI_CPPC_LIB */
 static inline int cppc_get_desired_perf(int cpunum, u64 *desired_perf)
 {
@@ -242,6 +243,10 @@ static inline int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator
 {
 	return -EOPNOTSUPP;
 }
+static inline int amd_detect_prefcore(bool *detected)
+{
+	return -ENODEV;
+}
 #endif /* !CONFIG_ACPI_CPPC_LIB */
 
 #endif /* _CPPC_ACPI_H*/

From ad4caad58d91d3293880f8074f7ad125490ce636 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:56 -0500
Subject: [PATCH 29/35] cpufreq: amd-pstate: Merge
 amd_pstate_highest_perf_set() into amd_get_boost_ratio_numerator()

The special case in amd_pstate_highest_perf_set() is the value used
for calculating the boost numerator.  Merge this into
amd_get_boost_ratio_numerator() and then use that to calculate boost
ratio.

This allows dropping more special casing of the highest perf value.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 Documentation/admin-guide/pm/amd-pstate.rst |  3 +-
 arch/x86/kernel/acpi/cppc.c                 | 16 +++++++
 drivers/cpufreq/amd-pstate.c                | 52 ++++-----------------
 3 files changed, 28 insertions(+), 43 deletions(-)

diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index d0324d44f548..e13915c54064 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -251,7 +251,8 @@ performance supported in `AMD CPPC Performance Capability <perf_cap_>`_).
 In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
 table, so we need to expose it to sysfs. If boost is not active, but
 still supported, this maximum frequency will be larger than the one in
-``cpuinfo``.
+``cpuinfo``. On systems that support preferred core, the driver will have
+different values for some cores than others.
 This attribute is read-only.
 
 ``amd_pstate_lowest_nonlinear_freq``
diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c
index df367bc35930..956984054bf3 100644
--- a/arch/x86/kernel/acpi/cppc.c
+++ b/arch/x86/kernel/acpi/cppc.c
@@ -9,6 +9,7 @@
 #include <asm/processor.h>
 #include <asm/topology.h>
 
+#define CPPC_HIGHEST_PERF_PERFORMANCE	196
 #define CPPC_HIGHEST_PERF_PREFCORE	166
 
 enum amd_pref_core {
@@ -245,6 +246,21 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator)
 		*numerator = boost_numerator;
 		return 0;
 	}
+
+	/*
+	 * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
+	 * the highest performance level is set to 196.
+	 * https://bugzilla.kernel.org/show_bug.cgi?id=218759
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_ZEN4)) {
+		switch (boot_cpu_data.x86_model) {
+		case 0x70 ... 0x7f:
+			*numerator = CPPC_HIGHEST_PERF_PERFORMANCE;
+			return 0;
+		default:
+			break;
+		}
+	}
 	*numerator = CPPC_HIGHEST_PERF_PREFCORE;
 
 	return 0;
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 937a3a817743..89623203f918 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -52,8 +52,6 @@
 #define AMD_PSTATE_TRANSITION_LATENCY	20000
 #define AMD_PSTATE_TRANSITION_DELAY	1000
 #define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600
-#define CPPC_HIGHEST_PERF_PERFORMANCE	196
-#define CPPC_HIGHEST_PERF_DEFAULT	166
 
 #define AMD_CPPC_EPP_PERFORMANCE		0x00
 #define AMD_CPPC_EPP_BALANCE_PERFORMANCE	0x80
@@ -372,43 +370,17 @@ static inline int amd_pstate_enable(bool enable)
 	return static_call(amd_pstate_enable)(enable);
 }
 
-static u32 amd_pstate_highest_perf_set(struct amd_cpudata *cpudata)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	/*
-	 * For AMD CPUs with Family ID 19H and Model ID range 0x70 to 0x7f,
-	 * the highest performance level is set to 196.
-	 * https://bugzilla.kernel.org/show_bug.cgi?id=218759
-	 */
-	if (c->x86 == 0x19 && (c->x86_model >= 0x70 && c->x86_model <= 0x7f))
-		return CPPC_HIGHEST_PERF_PERFORMANCE;
-
-	return CPPC_HIGHEST_PERF_DEFAULT;
-}
-
 static int pstate_init_perf(struct amd_cpudata *cpudata)
 {
 	u64 cap1;
-	u32 highest_perf;
 
 	int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1,
 				     &cap1);
 	if (ret)
 		return ret;
 
-	/* For platforms that do not support the preferred core feature, the
-	 * highest_pef may be configured with 166 or 255, to avoid max frequency
-	 * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as
-	 * the default max perf.
-	 */
-	if (cpudata->hw_prefcore)
-		highest_perf = amd_pstate_highest_perf_set(cpudata);
-	else
-		highest_perf = AMD_CPPC_HIGHEST_PERF(cap1);
-
-	WRITE_ONCE(cpudata->highest_perf, highest_perf);
-	WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
+	WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1));
+	WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1));
 	WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1));
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1));
 	WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1));
@@ -420,19 +392,13 @@ static int pstate_init_perf(struct amd_cpudata *cpudata)
 static int cppc_init_perf(struct amd_cpudata *cpudata)
 {
 	struct cppc_perf_caps cppc_perf;
-	u32 highest_perf;
 
 	int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf);
 	if (ret)
 		return ret;
 
-	if (cpudata->hw_prefcore)
-		highest_perf = amd_pstate_highest_perf_set(cpudata);
-	else
-		highest_perf = cppc_perf.highest_perf;
-
-	WRITE_ONCE(cpudata->highest_perf, highest_perf);
-	WRITE_ONCE(cpudata->max_limit_perf, highest_perf);
+	WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf);
+	WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf);
 	WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf);
 	WRITE_ONCE(cpudata->lowest_nonlinear_perf,
 		   cppc_perf.lowest_nonlinear_perf);
@@ -918,8 +884,8 @@ static u32 amd_pstate_get_transition_latency(unsigned int cpu)
 static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
 {
 	int ret;
-	u32 min_freq;
-	u32 highest_perf, max_freq;
+	u32 min_freq, max_freq;
+	u64 numerator;
 	u32 nominal_perf, nominal_freq;
 	u32 lowest_nonlinear_perf, lowest_nonlinear_freq;
 	u32 boost_ratio, lowest_nonlinear_ratio;
@@ -941,8 +907,10 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata)
 
 	nominal_perf = READ_ONCE(cpudata->nominal_perf);
 
-	highest_perf = READ_ONCE(cpudata->highest_perf);
-	boost_ratio = div_u64(highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf);
+	ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator);
+	if (ret)
+		return ret;
+	boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf);
 	max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000;
 
 	lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf);

From 45722e777fd99ea863fe653c1838d39f678506e2 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:57 -0500
Subject: [PATCH 30/35] cpufreq: amd-pstate: Optimize
 amd_pstate_update_limits()

Don't take and release the mutex when prefcore isn't present and
avoid initialization of variables that will be initially set
in the function.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 89623203f918..f550d31f80eb 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -810,17 +810,17 @@ static void amd_pstate_update_limits(unsigned int cpu)
 
 	cpudata = policy->driver_data;
 
-	mutex_lock(&amd_pstate_driver_lock);
-	if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore))
-		goto free_cpufreq_put;
+	if (!amd_pstate_prefcore)
+		return;
 
+	mutex_lock(&amd_pstate_driver_lock);
 	ret = amd_get_highest_perf(cpu, &cur_high);
 	if (ret)
 		goto free_cpufreq_put;
 
 	prev_high = READ_ONCE(cpudata->prefcore_ranking);
-	if (prev_high != cur_high) {
-		highest_perf_changed = true;
+	highest_perf_changed = (prev_high != cur_high);
+	if (highest_perf_changed) {
 		WRITE_ONCE(cpudata->prefcore_ranking, cur_high);
 
 		if (cur_high < CPPC_MAX_PERF)

From b96b82d1af7fbf35e3c7d50368275005bd6b6a03 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Mon, 26 Aug 2024 16:13:58 -0500
Subject: [PATCH 31/35] cpufreq: amd-pstate: Add documentation for
 `amd_pstate_hw_prefcore`

Explain that the sysfs file represents both preferred core being
enabled by the user and supported by the hardware.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 Documentation/admin-guide/pm/amd-pstate.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index e13915c54064..d5c050ea390d 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -263,6 +263,11 @@ lowest non-linear performance in `AMD CPPC Performance Capability
 <perf_cap_>`_.)
 This attribute is read-only.
 
+``amd_pstate_hw_prefcore``
+
+Whether the platform supports the preferred core feature and it has been
+enabled. This attribute is read-only.
+
 ``energy_performance_available_preferences``
 
 A list of all the supported EPP preferences that could be used for

From 15a2b764ea7c16dd2b1ecfd86ba27809f5bd8580 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Tue, 3 Sep 2024 14:43:10 -0500
Subject: [PATCH 32/35] amd-pstate: Add missing documentation for
 `amd_pstate_prefcore_ranking`

`amd_pstate_prefcore_ranking` reflects the dynamic rankings of a CPU
core based on platform conditions.  Explicitly include it in the
documentation.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 Documentation/admin-guide/pm/amd-pstate.rst | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
index d5c050ea390d..210a808b74ec 100644
--- a/Documentation/admin-guide/pm/amd-pstate.rst
+++ b/Documentation/admin-guide/pm/amd-pstate.rst
@@ -252,7 +252,8 @@ In some ASICs, the highest CPPC performance is not the one in the ``_CPC``
 table, so we need to expose it to sysfs. If boost is not active, but
 still supported, this maximum frequency will be larger than the one in
 ``cpuinfo``. On systems that support preferred core, the driver will have
-different values for some cores than others.
+different values for some cores than others and this will reflect the values
+advertised by the platform at bootup.
 This attribute is read-only.
 
 ``amd_pstate_lowest_nonlinear_freq``
@@ -268,6 +269,12 @@ This attribute is read-only.
 Whether the platform supports the preferred core feature and it has been
 enabled. This attribute is read-only.
 
+``amd_pstate_prefcore_ranking``
+
+The performance ranking of the core. This number doesn't have any unit, but
+larger numbers are preferred at the time of reading. This can change at
+runtime based on platform conditions. This attribute is read-only.
+
 ``energy_performance_available_preferences``
 
 A list of all the supported EPP preferences that could be used for

From 8d916815b0afad2a12a9f1b945c79fffc144dba8 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 31 Aug 2024 21:49:11 -0500
Subject: [PATCH 33/35] cpufreq/amd-pstate: Export symbols for changing modes

In order to effectively test all mode switch combinations export
everything necessarily for amd-pstate-ut to trigger a mode switch.

Reviewed-by: Perry Yuan <Perry.Yuan@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate.c | 23 ++++++++++-------------
 drivers/cpufreq/amd-pstate.h | 14 ++++++++++++++
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index f550d31f80eb..05b39a890bfa 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -58,18 +58,6 @@
 #define AMD_CPPC_EPP_BALANCE_POWERSAVE		0xBF
 #define AMD_CPPC_EPP_POWERSAVE			0xFF
 
-/*
- * enum amd_pstate_mode - driver working mode of amd pstate
- */
-enum amd_pstate_mode {
-	AMD_PSTATE_UNDEFINED = 0,
-	AMD_PSTATE_DISABLE,
-	AMD_PSTATE_PASSIVE,
-	AMD_PSTATE_ACTIVE,
-	AMD_PSTATE_GUIDED,
-	AMD_PSTATE_MAX,
-};
-
 static const char * const amd_pstate_mode_string[] = {
 	[AMD_PSTATE_UNDEFINED]   = "undefined",
 	[AMD_PSTATE_DISABLE]     = "disable",
@@ -79,6 +67,14 @@ static const char * const amd_pstate_mode_string[] = {
 	NULL,
 };
 
+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode)
+{
+	if (mode < 0 || mode >= AMD_PSTATE_MAX)
+		return NULL;
+	return amd_pstate_mode_string[mode];
+}
+EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string);
+
 struct quirk_entry {
 	u32 nominal_freq;
 	u32 lowest_freq;
@@ -1286,7 +1282,7 @@ static ssize_t amd_pstate_show_status(char *buf)
 	return sysfs_emit(buf, "%s\n", amd_pstate_mode_string[cppc_state]);
 }
 
-static int amd_pstate_update_status(const char *buf, size_t size)
+int amd_pstate_update_status(const char *buf, size_t size)
 {
 	int mode_idx;
 
@@ -1303,6 +1299,7 @@ static int amd_pstate_update_status(const char *buf, size_t size)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(amd_pstate_update_status);
 
 static ssize_t status_show(struct device *dev,
 			   struct device_attribute *attr, char *buf)
diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h
index cc8bb2bc325a..cd573bc6b6db 100644
--- a/drivers/cpufreq/amd-pstate.h
+++ b/drivers/cpufreq/amd-pstate.h
@@ -103,4 +103,18 @@ struct amd_cpudata {
 	bool	boost_state;
 };
 
+/*
+ * enum amd_pstate_mode - driver working mode of amd pstate
+ */
+enum amd_pstate_mode {
+	AMD_PSTATE_UNDEFINED = 0,
+	AMD_PSTATE_DISABLE,
+	AMD_PSTATE_PASSIVE,
+	AMD_PSTATE_ACTIVE,
+	AMD_PSTATE_GUIDED,
+	AMD_PSTATE_MAX,
+};
+const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode);
+int amd_pstate_update_status(const char *buf, size_t size);
+
 #endif /* _LINUX_AMD_PSTATE_H */

From 3e39e68dfbb6d4fe8c1943003d2f2a6f3255c902 Mon Sep 17 00:00:00 2001
From: Mario Limonciello <mario.limonciello@amd.com>
Date: Sat, 31 Aug 2024 21:49:12 -0500
Subject: [PATCH 34/35] cpufreq/amd-pstate-ut: Add test case for mode switches

There is a state machine in the amd-pstate driver utilized for
switches for all modes. To make sure that cleanup and setup works
properly for each mode add a unit test case that tries all
combinations.

Reviewed-by: Perry Yuan <perry.yuan@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate-ut.c | 41 ++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c
index 66b73c308ce6..24432bd78a5a 100644
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -54,12 +54,14 @@ static void amd_pstate_ut_acpi_cpc_valid(u32 index);
 static void amd_pstate_ut_check_enabled(u32 index);
 static void amd_pstate_ut_check_perf(u32 index);
 static void amd_pstate_ut_check_freq(u32 index);
+static void amd_pstate_ut_check_driver(u32 index);
 
 static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = {
 	{"amd_pstate_ut_acpi_cpc_valid",   amd_pstate_ut_acpi_cpc_valid   },
 	{"amd_pstate_ut_check_enabled",    amd_pstate_ut_check_enabled    },
 	{"amd_pstate_ut_check_perf",       amd_pstate_ut_check_perf       },
-	{"amd_pstate_ut_check_freq",       amd_pstate_ut_check_freq       }
+	{"amd_pstate_ut_check_freq",       amd_pstate_ut_check_freq       },
+	{"amd_pstate_ut_check_driver",	   amd_pstate_ut_check_driver     }
 };
 
 static bool get_shared_mem(void)
@@ -254,6 +256,43 @@ static void amd_pstate_ut_check_freq(u32 index)
 	cpufreq_cpu_put(policy);
 }
 
+static int amd_pstate_set_mode(enum amd_pstate_mode mode)
+{
+	const char *mode_str = amd_pstate_get_mode_string(mode);
+
+	pr_debug("->setting mode to %s\n", mode_str);
+
+	return amd_pstate_update_status(mode_str, strlen(mode_str));
+}
+
+static void amd_pstate_ut_check_driver(u32 index)
+{
+	enum amd_pstate_mode mode1, mode2;
+	int ret;
+
+	for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {
+		ret = amd_pstate_set_mode(mode1);
+		if (ret)
+			goto out;
+		for (mode2 = AMD_PSTATE_DISABLE; mode2 < AMD_PSTATE_MAX; mode2++) {
+			if (mode1 == mode2)
+				continue;
+			ret = amd_pstate_set_mode(mode2);
+			if (ret)
+				goto out;
+		}
+	}
+out:
+	if (ret)
+		pr_warn("%s: failed to update status for %s->%s: %d\n", __func__,
+			amd_pstate_get_mode_string(mode1),
+			amd_pstate_get_mode_string(mode2), ret);
+
+	amd_pstate_ut_cases[index].result = ret ?
+					    AMD_PSTATE_UT_RESULT_FAIL :
+					    AMD_PSTATE_UT_RESULT_PASS;
+}
+
 static int __init amd_pstate_ut_init(void)
 {
 	u32 i = 0, arr_size = ARRAY_SIZE(amd_pstate_ut_cases);

From 93497752dfed196b41d2804503e80b9a04318adb Mon Sep 17 00:00:00 2001
From: Qianqiang Liu <qianqiang.liu@163.com>
Date: Wed, 11 Sep 2024 07:39:24 +0800
Subject: [PATCH 35/35] cpufreq/amd-pstate-ut: Fix an "Uninitialized variables"
 issue

Using uninitialized value "mode2" when calling "amd_pstate_get_mode_string".
Set "mode2" to "AMD_PSTATE_DISABLE" by default.

Signed-off-by: Qianqiang Liu <qianqiang.liu@163.com>
Link: https://lore.kernel.org/r/20240910233923.46470-1-qianqiang.liu@163.com
Acked-by: Mario Limonciello <mario.limonciello@amd.com>
Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
---
 drivers/cpufreq/amd-pstate-ut.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c
index 24432bd78a5a..d52f26d1ffba 100644
--- a/drivers/cpufreq/amd-pstate-ut.c
+++ b/drivers/cpufreq/amd-pstate-ut.c
@@ -267,7 +267,7 @@ static int amd_pstate_set_mode(enum amd_pstate_mode mode)
 
 static void amd_pstate_ut_check_driver(u32 index)
 {
-	enum amd_pstate_mode mode1, mode2;
+	enum amd_pstate_mode mode1, mode2 = AMD_PSTATE_DISABLE;
 	int ret;
 
 	for (mode1 = AMD_PSTATE_DISABLE; mode1 < AMD_PSTATE_MAX; mode1++) {