From 9fe89f022c05d99c052d6bc088b82d4ff83bf463 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 27 Jan 2026 16:17:48 +0100
Subject: [PATCH 01/46] sched/fair: More complex proportional newidle balance

It turns out that a few workloads (easyWave, fio) have a fairly low
success rate on newidle balance, but still benefit greatly from having
it anyway.

Luckliky these workloads have a faily low newidle rate, so the cost if
doing the newidle is relatively low, even if unsuccessfull.

Add a simple rate based part to the newidle ratio compute, such that
low rate newidle will still have a high newidle ratio.

This cures the easyWave and fio workloads while not affecting the
schbench numbers either (which have a very high newidle rate).

Reported-by: Mario Roy <marioeroy@gmail.com>
Reported-by: "Mohamed Abuelfotoh, Hazem" <abuehaze@amazon.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Mario Roy <marioeroy@gmail.com>
Tested-by: "Mohamed Abuelfotoh, Hazem" <abuehaze@amazon.com>
Link: https://patch.msgid.link/20260127151748.GA1079264@noisy.programming.kicks-ass.net
---
 include/linux/sched/topology.h |  1 +
 kernel/sched/fair.c            | 27 +++++++++++++++++++++++++--
 kernel/sched/features.h        |  1 +
 kernel/sched/topology.c        |  3 +++
 4 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 45c0022b91ce..a1e1032426dc 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -95,6 +95,7 @@ struct sched_domain {
 	unsigned int newidle_call;
 	unsigned int newidle_success;
 	unsigned int newidle_ratio;
+	u64 newidle_stamp;
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf948db905ed..66afa0ac7396 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12289,7 +12289,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su
 	sd->newidle_success += success;
 
 	if (sd->newidle_call >= 1024) {
-		sd->newidle_ratio = sd->newidle_success;
+		u64 now = sched_clock();
+		s64 delta = now - sd->newidle_stamp;
+		sd->newidle_stamp = now;
+		int ratio = 0;
+
+		if (delta < 0)
+			delta = 0;
+
+		if (sched_feat(NI_RATE)) {
+			/*
+			 * ratio  delta   freq
+			 *
+			 * 1024 -  4  s -  128 Hz
+			 *  512 -  2  s -  256 Hz
+			 *  256 -  1  s -  512 Hz
+			 *  128 - .5  s - 1024 Hz
+			 *   64 - .25 s - 2048 Hz
+			 */
+			ratio = delta >> 22;
+		}
+
+		ratio += sd->newidle_success;
+
+		sd->newidle_ratio = min(1024, ratio);
 		sd->newidle_call /= 2;
 		sd->newidle_success /= 2;
 	}
@@ -12996,7 +13019,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			unsigned int weight = 1;
 
-			if (sched_feat(NI_RANDOM)) {
+			if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) {
 				/*
 				 * Throw a 1k sided dice; and only run
 				 * newidle_balance according to the success
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 136a6584be79..37d5928fa6dd 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false)
  * Do newidle balancing proportional to its success rate using randomization.
  */
 SCHED_FEAT(NI_RANDOM, true)
+SCHED_FEAT(NI_RATE, true)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 32dcddaead82..061f8c85f555 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched/isolation.h>
+#include <linux/sched/clock.h>
 #include <linux/bsearch.h>
 #include "sched.h"
 
@@ -1642,6 +1643,7 @@ sd_init(struct sched_domain_topology_level *tl,
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	int sd_id, sd_weight, sd_flags = 0;
 	struct cpumask *sd_span;
+	u64 now = sched_clock();
 
 	sd_weight = cpumask_weight(tl->mask(tl, cpu));
 
@@ -1679,6 +1681,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		.newidle_call		= 512,
 		.newidle_success	= 256,
 		.newidle_ratio		= 512,
+		.newidle_stamp		= now,
 
 		.max_newidle_lb_cost	= 0,
 		.last_decay_max_lb_cost	= jiffies,

From 4823725d9d1d9cc5b36647e0cb8ff616cad6536f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 26 Sep 2025 21:43:56 +0200
Subject: [PATCH 02/46] sched/fair: Increase weight bits for avg_vruntime

Due to the zero_vruntime patch, the deltas are now a lot smaller and
measurement with kernel-build and hackbench runs show about 45 bits
used.

This ensures avg_vruntime() tracks the full weight range, reducing
numerical artifacts in reweight and the like.

Also, lets keep the paranoid debug code around fow now.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
Link: https://patch.msgid.link/20260219080624.942813440%40infradead.org
---
 kernel/sched/debug.c    | 14 +++++-
 kernel/sched/fair.c     | 96 +++++++++++++++++++++++++++++++++--------
 kernel/sched/features.h |  2 +
 kernel/sched/sched.h    |  3 +-
 4 files changed, 94 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b24f40f05019..6246008c431e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,6 +8,7 @@
  */
 #include <linux/debugfs.h>
 #include <linux/nmi.h>
+#include <linux/log2.h>
 #include "sched.h"
 
 /*
@@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
-	s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread;
+	s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread;
+	s64 zero_vruntime = -1, sum_w_vruntime = -1;
 	struct sched_entity *last, *first, *root;
 	struct rq *rq = cpu_rq(cpu);
+	unsigned int sum_shift;
 	unsigned long flags;
+	u64 sum_weight;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "\n");
@@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	if (last)
 		right_vruntime = last->vruntime;
 	zero_vruntime = cfs_rq->zero_vruntime;
+	sum_w_vruntime = cfs_rq->sum_w_vruntime;
+	sum_weight = cfs_rq->sum_weight;
+	sum_shift = cfs_rq->sum_shift;
 	raw_spin_rq_unlock_irqrestore(rq, flags);
 
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "left_deadline",
@@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			SPLIT_NS(left_vruntime));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "zero_vruntime",
 			SPLIT_NS(zero_vruntime));
+	SEQ_printf(m, "  .%-30s: %Ld (%d bits)\n", "sum_w_vruntime",
+		   sum_w_vruntime, ilog2(abs(sum_w_vruntime)));
+	SEQ_printf(m, "  .%-30s: %Lu\n", "sum_weight",
+		   sum_weight);
+	SEQ_printf(m, "  .%-30s: %u\n", "sum_shift", sum_shift);
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "avg_vruntime",
 			SPLIT_NS(avg_vruntime(cfs_rq)));
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "right_vruntime",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 66afa0ac7396..fdb98d2ea131 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -665,25 +665,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Since zero_vruntime closely tracks the per-task service, these
  * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag
  * induced in the system due to quantisation.
- *
- * Also, we use scale_load_down() to reduce the size.
- *
- * As measured, the max (key * weight) value was ~44 bits for a kernel build.
  */
+static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w)
+{
+#ifdef CONFIG_64BIT
+	if (cfs_rq->sum_shift)
+		w = max(2UL, w >> cfs_rq->sum_shift);
+#endif
+	return w;
+}
+
+static inline void
+__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+	s64 w_vruntime, key = entity_key(cfs_rq, se);
+
+	w_vruntime = key * weight;
+	WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62));
+
+	cfs_rq->sum_w_vruntime += w_vruntime;
+	cfs_rq->sum_weight += weight;
+}
+
+static void
+sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	unsigned long weight;
+	s64 key, tmp;
+
+again:
+	weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+	key = entity_key(cfs_rq, se);
+
+	if (check_mul_overflow(key, weight, &key))
+		goto overflow;
+
+	if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp))
+		goto overflow;
+
+	cfs_rq->sum_w_vruntime = tmp;
+	cfs_rq->sum_weight += weight;
+	return;
+
+overflow:
+	/*
+	 * There's gotta be a limit -- if we're still failing at this point
+	 * there's really nothing much to be done about things.
+	 */
+	BUG_ON(cfs_rq->sum_shift >= 10);
+	cfs_rq->sum_shift++;
+
+	/*
+	 * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1
+	 */
+	cfs_rq->sum_w_vruntime = 0;
+	cfs_rq->sum_weight = 0;
+
+	for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost;
+	     node; node = rb_next(node))
+		__sum_w_vruntime_add(cfs_rq, __node_2_se(node));
+
+	goto again;
+}
+
 static void
 sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long weight = scale_load_down(se->load.weight);
-	s64 key = entity_key(cfs_rq, se);
+	if (sched_feat(PARANOID_AVG))
+		return sum_w_vruntime_add_paranoid(cfs_rq, se);
 
-	cfs_rq->sum_w_vruntime += key * weight;
-	cfs_rq->sum_weight += weight;
+	__sum_w_vruntime_add(cfs_rq, se);
 }
 
 static void
 sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	unsigned long weight = scale_load_down(se->load.weight);
+	unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight);
 	s64 key = entity_key(cfs_rq, se);
 
 	cfs_rq->sum_w_vruntime -= key * weight;
@@ -725,7 +783,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		s64 runtime = cfs_rq->sum_w_vruntime;
 
 		if (curr) {
-			unsigned long w = scale_load_down(curr->load.weight);
+			unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight);
 
 			runtime += entity_key(cfs_rq, curr) * w;
 			weight += w;
@@ -735,7 +793,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
 		if (runtime < 0)
 			runtime -= (weight - 1);
 
-		delta = div_s64(runtime, weight);
+		delta = div64_long(runtime, weight);
 	} else if (curr) {
 		/*
 		 * When there is but one element, it is the average.
@@ -801,7 +859,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime)
 	long load = cfs_rq->sum_weight;
 
 	if (curr && curr->on_rq) {
-		unsigned long weight = scale_load_down(curr->load.weight);
+		unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight);
 
 		avg += entity_key(cfs_rq, curr) * weight;
 		load += weight;
@@ -3871,12 +3929,12 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
 	 * we need to scale se->vlag when w_i changes.
 	 */
-	se->vlag = div_s64(se->vlag * se->load.weight, weight);
+	se->vlag = div64_long(se->vlag * se->load.weight, weight);
 	if (se->rel_deadline)
-		se->deadline = div_s64(se->deadline * se->load.weight, weight);
+		se->deadline = div64_long(se->deadline * se->load.weight, weight);
 
 	if (rel_vprot)
-		vprot = div_s64(vprot * se->load.weight, weight);
+		vprot = div64_long(vprot * se->load.weight, weight);
 
 	update_load_set(&se->load, weight);
 
@@ -5180,7 +5238,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
 		struct sched_entity *curr = cfs_rq->curr;
-		unsigned long load;
+		long load;
 
 		lag = se->vlag;
 
@@ -5238,12 +5296,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		 */
 		load = cfs_rq->sum_weight;
 		if (curr && curr->on_rq)
-			load += scale_load_down(curr->load.weight);
+			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
 
-		lag *= load + scale_load_down(se->load.weight);
+		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
 		if (WARN_ON_ONCE(!load))
 			load = 1;
-		lag = div_s64(lag, load);
+		lag = div64_long(lag, load);
 	}
 
 	se->vruntime = vruntime - lag;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 37d5928fa6dd..a25f97201ab9 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
 SCHED_FEAT(DELAY_DEQUEUE, true)
 SCHED_FEAT(DELAY_ZERO, true)
 
+SCHED_FEAT(PARANOID_AVG, false)
+
 /*
  * Allow wakeup-time preemption of the current task:
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..8bf2f7d524cd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -684,8 +684,9 @@ struct cfs_rq {
 
 	s64			sum_w_vruntime;
 	u64			sum_weight;
-
 	u64			zero_vruntime;
+	unsigned int		sum_shift;
+
 #ifdef CONFIG_SCHED_CORE
 	unsigned int		forceidle_seq;
 	u64			zero_vruntime_fi;

From 101f3498b4bdfef97152a444847948de1543f692 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 26 Jan 2026 20:56:23 +0100
Subject: [PATCH 03/46] sched/fair: Revert 6d71a9c61604 ("sched/fair: Fix EEVDF
 entity placement bug causing scheduling lag")

Zicheng Qu reported that, because avg_vruntime() always includes
cfs_rq->curr, when ->on_rq, place_entity() doesn't work right.

Specifically, the lag scaling in place_entity() relies on
avg_vruntime() being the state *before* placement of the new entity.
However in this case avg_vruntime() will actually already include the
entity, which breaks things.

Also, Zicheng Qu argues that avg_vruntime should be invariant under
reweight. IOW commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") was wrong!

The issue reported in 6d71a9c61604 could possibly be explained by
rounding artifacts -- notably the extreme weight '2' is outside of the
range of avg_vruntime/sum_w_vruntime, since that uses
scale_load_down(). By scaling vruntime by the real weight, but
accounting it in vruntime with a factor 1024 more, the average moves
significantly. However, that is now cured.

Tested by reverting 66951e4860d3 ("sched/fair: Fix update_cfs_group()
vs DELAY_DEQUEUE") and tracing vruntime and vlag figures again.

Reported-by: Zicheng Qu <quzicheng@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
Link: https://patch.msgid.link/20260219080625.066102672%40infradead.org
---
 kernel/sched/fair.c | 148 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 124 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdb98d2ea131..2b98054cd754 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -822,17 +822,22 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq);
  *
  *   -r_max < lag < max(r_max, q)
  */
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime)
 {
 	u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC;
 	s64 vlag, limit;
 
-	WARN_ON_ONCE(!se->on_rq);
-
-	vlag = avg_vruntime(cfs_rq) - se->vruntime;
+	vlag = avruntime - se->vruntime;
 	limit = calc_delta_fair(max_slice, se);
 
-	se->vlag = clamp(vlag, -limit, limit);
+	return clamp(vlag, -limit, limit);
+}
+
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	WARN_ON_ONCE(!se->on_rq);
+
+	se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
 }
 
 /*
@@ -3898,23 +3903,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		    se_weight(se) * -se->avg.load_sum);
 }
 
-static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
+static void
+rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot)
+{
+	unsigned long old_weight = se->load.weight;
+
+	/*
+	 * VRUNTIME
+	 * --------
+	 *
+	 * COROLLARY #1: The virtual runtime of the entity needs to be
+	 * adjusted if re-weight at !0-lag point.
+	 *
+	 * Proof: For contradiction assume this is not true, so we can
+	 * re-weight without changing vruntime at !0-lag point.
+	 *
+	 *             Weight	VRuntime   Avg-VRuntime
+	 *     before    w          v            V
+	 *      after    w'         v'           V'
+	 *
+	 * Since lag needs to be preserved through re-weight:
+	 *
+	 *	lag = (V - v)*w = (V'- v')*w', where v = v'
+	 *	==>	V' = (V - v)*w/w' + v		(1)
+	 *
+	 * Let W be the total weight of the entities before reweight,
+	 * since V' is the new weighted average of entities:
+	 *
+	 *	V' = (WV + w'v - wv) / (W + w' - w)	(2)
+	 *
+	 * by using (1) & (2) we obtain:
+	 *
+	 *	(WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+	 *	==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+	 *	==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+	 *	==>	(V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+	 *
+	 * Since we are doing at !0-lag point which means V != v, we
+	 * can simplify (3):
+	 *
+	 *	==>	W / (W + w' - w) = w / w'
+	 *	==>	Ww' = Ww + ww' - ww
+	 *	==>	W * (w' - w) = w * (w' - w)
+	 *	==>	W = w	(re-weight indicates w' != w)
+	 *
+	 * So the cfs_rq contains only one entity, hence vruntime of
+	 * the entity @v should always equal to the cfs_rq's weighted
+	 * average vruntime @V, which means we will always re-weight
+	 * at 0-lag point, thus breach assumption. Proof completed.
+	 *
+	 *
+	 * COROLLARY #2: Re-weight does NOT affect weighted average
+	 * vruntime of all the entities.
+	 *
+	 * Proof: According to corollary #1, Eq. (1) should be:
+	 *
+	 *	(V - v)*w = (V' - v')*w'
+	 *	==>    v' = V' - (V - v)*w/w'		(4)
+	 *
+	 * According to the weighted average formula, we have:
+	 *
+	 *	V' = (WV - wv + w'v') / (W - w + w')
+	 *	   = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+	 *	   = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+	 *	   = (WV + w'V' - Vw) / (W - w + w')
+	 *
+	 *	==>  V'*(W - w + w') = WV + w'V' - Vw
+	 *	==>	V' * (W - w) = (W - w) * V	(5)
+	 *
+	 * If the entity is the only one in the cfs_rq, then reweight
+	 * always occurs at 0-lag point, so V won't change. Or else
+	 * there are other entities, hence W != w, then Eq. (5) turns
+	 * into V' = V. So V won't change in either case, proof done.
+	 *
+	 *
+	 * So according to corollary #1 & #2, the effect of re-weight
+	 * on vruntime should be:
+	 *
+	 *	v' = V' - (V - v) * w / w'		(4)
+	 *	   = V  - (V - v) * w / w'
+	 *	   = V  - vl * w / w'
+	 *	   = V  - vl'
+	 */
+	se->vlag = div64_long(se->vlag * old_weight, weight);
+
+	/*
+	 * DEADLINE
+	 * --------
+	 *
+	 * When the weight changes, the virtual time slope changes and
+	 * we should adjust the relative virtual deadline accordingly.
+	 *
+	 *	d' = v' + (d - v)*w/w'
+	 *	   = V' - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  - (V - v)*w/w' + (d - v)*w/w'
+	 *	   = V  + (d - V)*w/w'
+	 */
+	if (se->rel_deadline)
+		se->deadline = div64_long(se->deadline * old_weight, weight);
+
+	if (rel_vprot)
+		se->vprot = div64_long(se->vprot * old_weight, weight);
+}
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
 	bool curr = cfs_rq->curr == se;
 	bool rel_vprot = false;
-	u64 vprot;
+	u64 avruntime = 0;
 
 	if (se->on_rq) {
 		/* commit outstanding execution time */
 		update_curr(cfs_rq);
-		update_entity_lag(cfs_rq, se);
-		se->deadline -= se->vruntime;
+		avruntime = avg_vruntime(cfs_rq);
+		se->vlag = entity_lag(cfs_rq, se, avruntime);
+		se->deadline -= avruntime;
 		se->rel_deadline = 1;
 		if (curr && protect_slice(se)) {
-			vprot = se->vprot - se->vruntime;
+			se->vprot -= avruntime;
 			rel_vprot = true;
 		}
 
@@ -3925,30 +4032,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	}
 	dequeue_load_avg(cfs_rq, se);
 
-	/*
-	 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
-	 * we need to scale se->vlag when w_i changes.
-	 */
-	se->vlag = div64_long(se->vlag * se->load.weight, weight);
-	if (se->rel_deadline)
-		se->deadline = div64_long(se->deadline * se->load.weight, weight);
-
-	if (rel_vprot)
-		vprot = div64_long(vprot * se->load.weight, weight);
+	rescale_entity(se, weight, rel_vprot);
 
 	update_load_set(&se->load, weight);
 
 	do {
 		u32 divider = get_pelt_divider(&se->avg);
-
 		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
 	} while (0);
 
 	enqueue_load_avg(cfs_rq, se);
 	if (se->on_rq) {
-		place_entity(cfs_rq, se, 0);
 		if (rel_vprot)
-			se->vprot = se->vruntime + vprot;
+			se->vprot += avruntime;
+		se->deadline += avruntime;
+		se->rel_deadline = 0;
+		se->vruntime = avruntime - se->vlag;
+
 		update_load_add(&cfs_rq->load, se->load.weight);
 		if (!curr)
 			__enqueue_entity(cfs_rq, se);
@@ -5306,7 +5406,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	se->vruntime = vruntime - lag;
 
-	if (se->rel_deadline) {
+	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
 		se->deadline += se->vruntime;
 		se->rel_deadline = 0;
 		return;

From db4551e2ba346663b7b16f0b5d36d308b615c50e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Feb 2026 17:07:58 +0100
Subject: [PATCH 04/46] sched/fair: Use full weight to __calc_delta()

Since we now use the full weight for avg_vruntime(), also make
__calc_delta() use the full value.

Since weight is effectively NICE_0_LOAD, this is 20 bits on 64bit.
This leaves 44 bits for delta_exec, which is ~16k seconds, way longer
than any one tick would ever be, so no worry about overflow.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Shubhang Kaushik <shubhang@os.amperecomputing.com>
Link: https://patch.msgid.link/20260219080625.183283814%40infradead.org
---
 kernel/sched/fair.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2b98054cd754..23315c294da1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -225,6 +225,7 @@ void __init sched_init_granularity(void)
 	update_sysctl();
 }
 
+#ifndef CONFIG_64BIT
 #define WMULT_CONST	(~0U)
 #define WMULT_SHIFT	32
 
@@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight
 
 	return mul_u64_u32_shr(delta_exec, fact, shift);
 }
+#else
+static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+{
+	return (delta_exec * weight) / lw->weight;
+}
+#endif
 
 /*
  * delta /= w

From 9264758066061e660c86e48cff1bac4a58a7324a Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 13 Feb 2026 11:17:51 +0100
Subject: [PATCH 05/46] sched/fair: Update overutilized detection

Checking uclamp_min is useless and counterproductive for overutilized state
as misfit can now happen without being in overutilized state.

Since commit e5ed0550c04c ("sched/fair: unlink misfit task from cpu overutilized")
util_fits_cpu returns -1 when uclamp_min is above capacity which is not
considered as cpu overutilized.

Remove the useless rq_util_min parameter.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Qais Yousef <qyousef@layalina.io>
Reviewed-by: Christian Loehle <christian.loehle@arm.com>
Link: https://patch.msgid.link/20260213101751.3121899-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23315c294da1..b8b052b2149b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7018,16 +7018,15 @@ static inline void hrtick_update(struct rq *rq)
 
 static inline bool cpu_overutilized(int cpu)
 {
-	unsigned long  rq_util_min, rq_util_max;
+	unsigned long rq_util_max;
 
 	if (!sched_energy_enabled())
 		return false;
 
-	rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
 	rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
 
 	/* Return true only if the utilization doesn't fit CPU's capacity */
-	return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
+	return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu);
 }
 
 /*

From d3d663faa1d4e86491b77ab72eabc3ea2f58b197 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 6 Feb 2026 10:54:54 +0100
Subject: [PATCH 06/46] sched/fair: Filter false overloaded_group case for EAS

With EAS, a group should be set overloaded if at least 1 CPU in the group
is overutilized but it can happen that a CPU is fully utilized by tasks
because of clamping the compute capacity of the CPU. In such case, the CPU
is not overutilized and as a result should not be set overloaded as well.

group_overloaded being a higher priority than group_misfit, such group can
be selected as the busiest group instead of a group with a mistfit task
and prevents load_balance to select the CPU with the misfit task to pull
the latter on a fitting CPU.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Pierre Gondois <pierre.gondois@arm.com>
Link: https://patch.msgid.link/20260206095454.1520619-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8b052b2149b..966e25282215 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10211,6 +10211,7 @@ struct sg_lb_stats {
 	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
 	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
 	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
+	unsigned int group_overutilized;	/* At least one CPU is overutilized in the group */
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
@@ -10443,6 +10444,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 static inline bool
 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
 {
+	/*
+	 * With EAS and uclamp, 1 CPU in the group must be overutilized to
+	 * consider the group overloaded.
+	 */
+	if (sched_energy_enabled() && !sgs->group_overutilized)
+		return false;
+
 	if (sgs->sum_nr_running <= sgs->group_weight)
 		return false;
 
@@ -10626,14 +10634,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
  * @group: sched_group whose statistics are to be updated.
  * @sgs: variable to hold the statistics for this group.
  * @sg_overloaded: sched_group is overloaded
- * @sg_overutilized: sched_group is overutilized
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
 				      struct sd_lb_stats *sds,
 				      struct sched_group *group,
 				      struct sg_lb_stats *sgs,
-				      bool *sg_overloaded,
-				      bool *sg_overutilized)
+				      bool *sg_overloaded)
 {
 	int i, nr_running, local_group, sd_flags = env->sd->flags;
 	bool balancing_at_rd = !env->sd->parent;
@@ -10655,7 +10661,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->sum_nr_running += nr_running;
 
 		if (cpu_overutilized(i))
-			*sg_overutilized = 1;
+			sgs->group_overutilized = 1;
 
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
@@ -11326,13 +11332,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 				update_group_capacity(env->sd, env->dst_cpu);
 		}
 
-		update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized);
+		update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded);
 
 		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
 			sds->busiest_stat = *sgs;
 		}
 
+		sg_overutilized |= sgs->group_overutilized;
+
 		/* Now, start updating sd_lb_stats */
 		sds->total_load += sgs->group_load;
 		sds->total_capacity += sgs->group_capacity;

From c0e1832ba6dad7057acf3f485a87e0adccc23141 Mon Sep 17 00:00:00 2001
From: Dengjun Su <dengjun.su@mediatek.com>
Date: Wed, 4 Feb 2026 19:59:29 +0800
Subject: [PATCH 07/46] sched: Fix incorrect schedstats for rt and dl thread

For RT and DL thread, only 'set_next_task_(rt/dl)' will call
'update_stats_wait_end_(rt/dl)' to update schedstats information.
However, during the migration process,
'update_stats_wait_start_(rt/dl)' will be called twice, which
will cause the values of wait_max and wait_sum to be incorrect.
The specific output as follows:
$ cat /proc/6046/task/6046/sched | grep wait
wait_start                                   :             0.000000
wait_max                                     :        496717.080029
wait_sum                                     :       7921540.776553

A complete schedstats information update flow of migrate should be
__update_stats_wait_start() [enter queue A, stage 1] ->
__update_stats_wait_end()   [leave queue A, stage 2] ->
__update_stats_wait_start() [enter queue B, stage 3] ->
__update_stats_wait_end()   [start running on queue B, stage 4]

    Stage 1: prev_wait_start is 0, and in the end, wait_start records the
    time of entering the queue.
    Stage 2: task_on_rq_migrating(p) is true, and wait_start is updated to
    the waiting time on queue A.
    Stage 3: prev_wait_start is the waiting time on queue A, wait_start is
    the time of entering queue B, and wait_start is expected to be greater
    than prev_wait_start. Under this condition, wait_start is updated to
    (the moment of entering queue B) - (the waiting time on queue A).
    Stage 4: the final wait time = (time when starting to run on queue B)
    - (time of entering queue B) + (waiting time on queue A) = waiting
    time on queue B + waiting time on queue A.

The current problem is that stage 2 does not call __update_stats_wait_end
to update wait_start, which causes the final computed wait time = waiting
time on queue B + the moment of entering queue A, leading to incorrect
wait_max and wait_sum.

Add 'update_stats_wait_end_(rt/dl)' in 'update_stats_dequeue_(rt/dl)' to
update schedstats information when dequeue_task.

Signed-off-by: Dengjun Su <dengjun.su@mediatek.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260204115959.3183567-1-dengjun.su@mediatek.com
---
 kernel/sched/deadline.c | 4 ++++
 kernel/sched/rt.c       | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d08b00429323..2de5727b94b4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2142,10 +2142,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se,
 			int flags)
 {
 	struct task_struct *p = dl_task_of(dl_se);
+	struct rq *rq = rq_of_dl_rq(dl_rq);
 
 	if (!schedstat_enabled())
 		return;
 
+	if (p != rq->curr)
+		update_stats_wait_end_dl(dl_rq, dl_se);
+
 	if ((flags & DEQUEUE_SLEEP)) {
 		unsigned int state;
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f69e1f16d923..3d823f5ffe2c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
 			int flags)
 {
 	struct task_struct *p = NULL;
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 
 	if (!schedstat_enabled())
 		return;
 
-	if (rt_entity_is_task(rt_se))
+	if (rt_entity_is_task(rt_se)) {
 		p = rt_task_of(rt_se);
 
+		if (p != rq->curr)
+			update_stats_wait_end_rt(rt_rq, rt_se);
+	}
+
 	if ((flags & DEQUEUE_SLEEP) && p) {
 		unsigned int state;
 

From c2a57380df9dd5df6fae11c6ba9f624b9cad3e6a Mon Sep 17 00:00:00 2001
From: Marco Crivellari <marco.crivellari@suse.com>
Date: Fri, 7 Nov 2025 10:24:52 +0100
Subject: [PATCH 08/46] sched: Replace use of system_unbound_wq with
 system_dfl_wq

Currently if a user enqueues a work item using schedule_delayed_work() the
used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use
WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to
schedule_work() that is using system_wq and queue_work(), that makes use
again of WORK_CPU_UNBOUND.

This lack of consistency cannot be addressed without refactoring the API.
For more details see the Link tag below.

This continues the effort to refactor workqueue APIs, which began with
the introduction of new workqueues and a new alloc_workqueue flag in:

commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq")
commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag")

Switch to using system_dfl_wq because system_unbound_wq is going away as part of
a workqueue restructuring.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Marco Crivellari <marco.crivellari@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/
Link: https://patch.msgid.link/20251107092452.43399-1-marco.crivellari@suse.com
---
 kernel/sched/core.c | 4 ++--
 kernel/sched/ext.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7f77c165a6e..bfd280ec0f97 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5678,7 +5678,7 @@ static void sched_tick_remote(struct work_struct *work)
 	os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
 	WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
 	if (os == TICK_SCHED_REMOTE_RUNNING)
-		queue_delayed_work(system_unbound_wq, dwork, HZ);
+		queue_delayed_work(system_dfl_wq, dwork, HZ);
 }
 
 static void sched_tick_start(int cpu)
@@ -5697,7 +5697,7 @@ static void sched_tick_start(int cpu)
 	if (os == TICK_SCHED_REMOTE_OFFLINE) {
 		twork->cpu = cpu;
 		INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
-		queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+		queue_delayed_work(system_dfl_wq, &twork->work, HZ);
 	}
 }
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 06cc0a4aec66..a448a8407d8e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2762,7 +2762,7 @@ static void scx_watchdog_workfn(struct work_struct *work)
 
 		cond_resched();
 	}
-	queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+	queue_delayed_work(system_dfl_wq, to_delayed_work(work),
 			   scx_watchdog_timeout / 2);
 }
 
@@ -5059,7 +5059,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 	WRITE_ONCE(scx_watchdog_timeout, timeout);
 	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
-	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+	queue_delayed_work(system_dfl_wq, &scx_watchdog_work,
 			   scx_watchdog_timeout / 2);
 
 	/*

From fd54d81c2c0e6cffd5470c2c27fbb04d0ebe7da0 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Tue, 3 Feb 2026 18:49:39 +0000
Subject: [PATCH 09/46] sched/fair: Skip SCHED_IDLE rq for SCHED_IDLE task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CPUs whose rq only have SCHED_IDLE tasks running are considered to be
equivalent to truly idle CPUs during wakeup path. For fork and exec
SCHED_IDLE is even preferred.
This is based on the assumption that the SCHED_IDLE CPU is not in an
idle state and might be in a higher P-state, allowing the task/wakee
to run immediately without sharing the rq.

However this assumption doesn't hold if the wakee has SCHED_IDLE policy
itself, as it will share the rq with existing SCHED_IDLE tasks. In this
case, we are better off continuing to look for a truly idle CPU.

On a Intel Xeon 2-socket with 64 logical cores in total this yields
for kernel compilation using SCHED_IDLE:

+---------+----------------------+----------------------+--------+
| workers | mainline (seconds)   | patch (seconds)      | delta% |
+=========+======================+======================+========+
|       1 | 4384.728 ± 21.085    | 3843.250 ± 16.235    | -12.35 |
|       2 | 2242.513 ± 2.099     | 1971.696 ± 2.842     | -12.08 |
|       4 | 1199.324 ± 1.823     | 1033.744 ± 1.803     | -13.81 |
|       8 |  649.083 ± 1.959     |  559.123 ± 4.301     | -13.86 |
|      16 |  370.425 ± 0.915     |  325.906 ± 4.623     | -12.02 |
|      32 |  234.651 ± 2.255     |  217.266 ± 0.253     |  -7.41 |
|      64 |  202.286 ± 1.452     |  197.977 ± 2.275     |  -2.13 |
|     128 |  217.092 ± 1.687     |  212.164 ± 1.138     |  -2.27 |
+---------+----------------------+----------------------+--------+

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260203184939.2138022-1-christian.loehle@arm.com
---
 kernel/sched/fair.c | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966e25282215..d57c02e82f3a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7064,9 +7064,15 @@ static int sched_idle_rq(struct rq *rq)
 			rq->nr_running);
 }
 
-static int sched_idle_cpu(int cpu)
+static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p)
 {
-	return sched_idle_rq(cpu_rq(cpu));
+	return sched_idle_rq(rq) && !task_has_idle_policy(p);
+}
+
+static int choose_idle_cpu(int cpu, struct task_struct *p)
+{
+	return available_idle_cpu(cpu) ||
+	       choose_sched_idle_rq(cpu_rq(cpu), p);
 }
 
 static void
@@ -7631,7 +7637,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *
 		if (!sched_core_cookie_match(rq, p))
 			continue;
 
-		if (sched_idle_cpu(i))
+		if (choose_sched_idle_rq(rq, p))
 			return i;
 
 		if (available_idle_cpu(i)) {
@@ -7722,8 +7728,7 @@ static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas
 
 static inline int __select_idle_cpu(int cpu, struct task_struct *p)
 {
-	if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
-	    sched_cpu_cookie_match(cpu_rq(cpu), p))
+	if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p))
 		return cpu;
 
 	return -1;
@@ -7796,7 +7801,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
 		if (!available_idle_cpu(cpu)) {
 			idle = false;
 			if (*idle_cpu == -1) {
-				if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) {
+				if (choose_sched_idle_rq(cpu_rq(cpu), p) &&
+				    cpumask_test_cpu(cpu, cpus)) {
 					*idle_cpu = cpu;
 					break;
 				}
@@ -7831,7 +7837,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
 		 */
 		if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
 			continue;
-		if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+		if (choose_idle_cpu(cpu, p))
 			return cpu;
 	}
 
@@ -7953,7 +7959,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	for_each_cpu_wrap(cpu, cpus, target) {
 		unsigned long cpu_cap = capacity_of(cpu);
 
-		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
+		if (!choose_idle_cpu(cpu, p))
 			continue;
 
 		fits = util_fits_cpu(task_util, util_min, util_max, cpu);
@@ -8024,7 +8030,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	lockdep_assert_irqs_disabled();
 
-	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	if (choose_idle_cpu(target, p) &&
 	    asym_fits_cpu(task_util, util_min, util_max, target))
 		return target;
 
@@ -8032,7 +8038,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+	    choose_idle_cpu(prev, p) &&
 	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
 
 		if (!static_branch_unlikely(&sched_cluster_active) ||
@@ -8064,7 +8070,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if (recent_used_cpu != prev &&
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
-	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
+	    choose_idle_cpu(recent_used_cpu, p) &&
 	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
 
@@ -12531,7 +12537,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 {
 	int continue_balancing = 1;
 	int cpu = rq->cpu;
-	int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
+	int busy = idle != CPU_IDLE && !sched_idle_rq(rq);
 	unsigned long interval;
 	struct sched_domain *sd;
 	/* Earliest time when we have to do rebalance again */
@@ -12569,7 +12575,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
 				 * state even if we migrated tasks. Update it.
 				 */
 				idle = idle_cpu(cpu);
-				busy = !idle && !sched_idle_cpu(cpu);
+				busy = !idle && !sched_idle_rq(rq);
 			}
 			sd->last_balance = jiffies;
 			interval = get_sd_balance_interval(sd, busy);

From 2e7af192697ef2a71c76fd57860b0fcd02754e14 Mon Sep 17 00:00:00 2001
From: Tommaso Cucinotta <tommaso.cucinotta@gmail.com>
Date: Fri, 12 Sep 2025 07:38:29 +0200
Subject: [PATCH 10/46] sched/deadline: Add reporting of runtime left & abs
 deadline to sched_getattr() for DEADLINE tasks

The SCHED_DEADLINE scheduler allows reading the statically configured
run-time, deadline, and period parameters through the sched_getattr()
system call. However, there is no immediate way to access, from user space,
the current parameters used within the scheduler: the instantaneous runtime
left in the current cycle, as well as the current absolute deadline.

The `flags' sched_getattr() parameter, so far mandated to contain zero,
now supports the SCHED_GETATTR_FLAG_DL_DYNAMIC=1 flag, to request
retrieval of the leftover runtime and absolute deadline, converted to a
CLOCK_MONOTONIC reference, instead of the statically configured parameters.

This feature is useful for adaptive SCHED_DEADLINE tasks that need to
modify their behavior depending on whether or not there is enough runtime
left in the current period, and/or what is the current absolute deadline.

Notes:
- before returning the instantaneous parameters, the runtime is updated;
- the abs deadline is returned shifted from rq_clock() to ktime_get_ns(),
  in CLOCK_MONOTONIC reference; this causes multiple invocations from the
  same period to return values that may differ for a few ns (showing some
  small drift), albeit the deadline doesn't move, in rq_clock() reference;
- the abs deadline value returned to user-space, as unsigned 64-bit value,
  can represent nearly 585 years since boot time;
- setting flags=0 provides the old behavior (retrieve static parameters).

See also the notes from discussion held at OSPM 2025 on the topic
"Making user space aware of current deadline-scheduler parameters".

Signed-off-by: Tommaso Cucinotta <tommaso.cucinotta@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk>
Link: https://patch.msgid.link/20250912053937.31636-2-tommaso.cucinotta@santannapisa.it
---
 include/uapi/linux/sched.h |  3 +++
 kernel/sched/deadline.c    | 19 ++++++++++++++++---
 kernel/sched/sched.h       |  2 +-
 kernel/sched/syscalls.c    | 16 +++++++++++-----
 4 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 359a14cc76a4..52b69ce89368 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -146,4 +146,7 @@ struct clone_args {
 			 SCHED_FLAG_KEEP_ALL		| \
 			 SCHED_FLAG_UTIL_CLAMP)
 
+/* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */
+#define SCHED_GETATTR_FLAG_DL_DYNAMIC	0x01
+
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2de5727b94b4..9e253a825f39 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -3617,13 +3617,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
 }
 
-void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
+void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
+	struct rq *rq = task_rq(p);
+	u64 adj_deadline;
 
 	attr->sched_priority = p->rt_priority;
-	attr->sched_runtime = dl_se->dl_runtime;
-	attr->sched_deadline = dl_se->dl_deadline;
+	if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) {
+		guard(raw_spinlock_irq)(&rq->__lock);
+		update_rq_clock(rq);
+		if (task_current(rq, p))
+			update_curr_dl(rq);
+
+		attr->sched_runtime = dl_se->runtime;
+		adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns();
+		attr->sched_deadline = adj_deadline;
+	} else {
+		attr->sched_runtime = dl_se->dl_runtime;
+		attr->sched_deadline = dl_se->dl_deadline;
+	}
 	attr->sched_period = dl_se->dl_period;
 	attr->sched_flags &= ~SCHED_DL_FLAGS;
 	attr->sched_flags |= dl_se->flags;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8bf2f7d524cd..fa2237e89bee 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -356,7 +356,7 @@ extern int  sched_dl_global_validate(void);
 extern void sched_dl_do_global(void);
 extern int  sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
-extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
+extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags);
 extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 6f10db3646e7..a288ac0a633d 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -881,10 +881,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
 	return -E2BIG;
 }
 
-static void get_params(struct task_struct *p, struct sched_attr *attr)
+static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags)
 {
 	if (task_has_dl_policy(p)) {
-		__getparam_dl(p, attr);
+		__getparam_dl(p, attr, flags);
 	} else if (task_has_rt_policy(p)) {
 		attr->sched_priority = p->rt_priority;
 	} else {
@@ -950,7 +950,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 		return -ESRCH;
 
 	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
-		get_params(p, &attr);
+		get_params(p, &attr, 0);
 
 	return sched_setattr(p, &attr);
 }
@@ -1035,7 +1035,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	int retval;
 
 	if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
-		      usize < SCHED_ATTR_SIZE_VER0 || flags))
+		     usize < SCHED_ATTR_SIZE_VER0))
 		return -EINVAL;
 
 	scoped_guard (rcu) {
@@ -1043,6 +1043,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		if (!p)
 			return -ESRCH;
 
+		if (flags) {
+			if (!task_has_dl_policy(p) ||
+			    flags != SCHED_GETATTR_FLAG_DL_DYNAMIC)
+				return -EINVAL;
+		}
+
 		retval = security_task_getscheduler(p);
 		if (retval)
 			return retval;
@@ -1050,7 +1056,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		kattr.sched_policy = p->policy;
 		if (p->sched_reset_on_fork)
 			kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-		get_params(p, &kattr);
+		get_params(p, &kattr, flags);
 		kattr.sched_flags &= SCHED_FLAG_ALL;
 
 #ifdef CONFIG_UCLAMP_TASK

From 4b9ef32c57a68eb98c45835c2beaa77f8e51c5c4 Mon Sep 17 00:00:00 2001
From: Xie Yuanbin <qq570070308@gmail.com>
Date: Tue, 17 Feb 2026 00:49:48 +0800
Subject: [PATCH 11/46] x86/mm/tlb: Make enter_lazy_tlb() always inline on x86

enter_lazy_tlb() on x86 is short enough, and is called in context
switching, which is the hot code path.

Make enter_lazy_tlb() always inline on x86 to optimize performance.

Suggested-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Xie Yuanbin <qq570070308@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20260216164950.147617-2-qq570070308@gmail.com
---
 arch/x86/include/asm/mmu_context.h |  3 ---
 arch/x86/include/asm/tlbflush.h    | 26 ++++++++++++++++++++++++++
 arch/x86/mm/tlb.c                  | 21 ---------------------
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 1acafb1c6a93..ef5b507de34e 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -136,9 +136,6 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm)
 }
 #endif
 
-#define enter_lazy_tlb enter_lazy_tlb
-extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
-
 extern void mm_init_global_asid(struct mm_struct *mm);
 extern void mm_free_global_asid(struct mm_struct *mm);
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 5a3cdc439e38..0545fe75c3fa 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,28 @@ struct tlb_state_shared {
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
 
+/*
+ * Please ignore the name of this function.  It should be called
+ * switch_to_kernel_thread().
+ *
+ * enter_lazy_tlb() is a hint from the scheduler that we are entering a
+ * kernel thread or other context without an mm.  Acceptable implementations
+ * include doing nothing whatsoever, switching to init_mm, or various clever
+ * lazy tricks to try to minimize TLB flushes.
+ *
+ * The scheduler reserves the right to call enter_lazy_tlb() several times
+ * in a row.  It will notify us that we're going back to a real mm by
+ * calling switch_mm_irqs_off().
+ */
+#define enter_lazy_tlb enter_lazy_tlb
+static __always_inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+{
+	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
+		return;
+
+	this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
+}
+
 bool nmi_uaccess_okay(void);
 #define nmi_uaccess_okay nmi_uaccess_okay
 
@@ -480,6 +502,10 @@ static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
 {
 }
 #endif
+#else /* !MODULE */
+#define enter_lazy_tlb enter_lazy_tlb
+extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
+	__compiletime_error("enter_lazy_tlb() should not be used in modules");
 #endif /* !MODULE */
 
 static inline void __native_tlb_flush_global(unsigned long cr4)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 621e09d049cb..af43d177087e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -971,27 +971,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next,
 	}
 }
 
-/*
- * Please ignore the name of this function.  It should be called
- * switch_to_kernel_thread().
- *
- * enter_lazy_tlb() is a hint from the scheduler that we are entering a
- * kernel thread or other context without an mm.  Acceptable implementations
- * include doing nothing whatsoever, switching to init_mm, or various clever
- * lazy tricks to try to minimize TLB flushes.
- *
- * The scheduler reserves the right to call enter_lazy_tlb() several times
- * in a row.  It will notify us that we're going back to a real mm by
- * calling switch_mm_irqs_off().
- */
-void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
-		return;
-
-	this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
-}
-
 /*
  * Using a temporary mm allows to set temporary mappings that are not accessible
  * by other CPUs. Such mappings are needed to perform sensitive memory writes

From 54a66e431eeacf23e1dc47cb3507f2d0c068aaf0 Mon Sep 17 00:00:00 2001
From: Xie Yuanbin <qq570070308@gmail.com>
Date: Tue, 17 Feb 2026 00:49:49 +0800
Subject: [PATCH 12/46] sched/headers: Inline raw_spin_rq_unlock()

raw_spin_rq_unlock() is short, and is called in some hot code paths
such as finish_lock_switch().

Inline raw_spin_rq_unlock() to micro-optimize performance a bit.

Signed-off-by: Xie Yuanbin <qq570070308@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20260216164950.147617-3-qq570070308@gmail.com
---
 kernel/sched/core.c  | 5 -----
 kernel/sched/sched.h | 9 ++++++---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfd280ec0f97..b59bab255e57 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -687,11 +687,6 @@ bool raw_spin_rq_trylock(struct rq *rq)
 	}
 }
 
-void raw_spin_rq_unlock(struct rq *rq)
-{
-	raw_spin_unlock(rq_lockp(rq));
-}
-
 /*
  * double_rq_lock - safely lock two runqueues
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fa2237e89bee..953d89d71804 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1607,15 +1607,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
 extern bool raw_spin_rq_trylock(struct rq *rq)
 	__cond_acquires(true, __rq_lockp(rq));
 
-extern void raw_spin_rq_unlock(struct rq *rq)
-	__releases(__rq_lockp(rq));
-
 static inline void raw_spin_rq_lock(struct rq *rq)
 	__acquires(__rq_lockp(rq))
 {
 	raw_spin_rq_lock_nested(rq, 0);
 }
 
+static inline void raw_spin_rq_unlock(struct rq *rq)
+	__releases(__rq_lockp(rq))
+{
+	raw_spin_unlock(rq_lockp(rq));
+}
+
 static inline void raw_spin_rq_lock_irq(struct rq *rq)
 	__acquires(__rq_lockp(rq))
 {

From 49b76317592ecbaefd0969d51d02019966cc994b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 1 Mar 2026 16:52:37 -0800
Subject: [PATCH 13/46] sched/wait: correct kernel-doc descriptions

Use the correct function name and function parameter name to avoid
these kernel-doc warnings:

Warning: include/linux/wait_bit.h:424 expecting prototype for
 wait_var_event_killable(). Prototype was for
 wait_var_event_interruptible() instead
Warning: include/linux/wait_bit.h:508 function parameter 'lock' not
 described in 'wait_var_event_mutex'

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260302005237.3473095-1-rdunlap@infradead.org
---
 include/linux/wait_bit.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 9e29d79fc790..ace7379d627d 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -406,7 +406,7 @@ do {									\
 			  schedule())
 
 /**
- * wait_var_event_killable - wait for a variable to be updated and notified
+ * wait_var_event_interruptible - wait for a variable to be updated and notified
  * @var: the address of variable being waited on
  * @condition: the condition to wait for
  *
@@ -492,7 +492,7 @@ do {									\
  * wait_var_event_mutex - wait for a variable to be updated under a mutex
  * @var: the address of the variable being waited on
  * @condition: condition to wait for
- * @mutex: the mutex which protects updates to the variable
+ * @lock: the mutex which protects updates to the variable
  *
  * Wait for a condition which can only be reliably tested while holding
  * a mutex.  The variables assessed in the condition will normal be

From 8d16e3c6f844823812f872df5ef1d3d2ed11b956 Mon Sep 17 00:00:00 2001
From: Zhan Xusheng <zhanxusheng1024@gmail.com>
Date: Mon, 9 Mar 2026 10:42:47 +0800
Subject: [PATCH 14/46] sched/fair: Fix comma operator misuse in NUMA fault
 accounting

Replace the comma operator with separate statements when assigning
NUMA fault statistics. This improves readability and follows kernel
coding style.

Signed-off-by: Zhan Xusheng <zhanxusheng@xiaomi.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260309024247.10908-1-zhanxusheng@xiaomi.com
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d57c02e82f3a..c1e5c8298fd1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -14226,7 +14226,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
 		}
 		if (ng) {
-			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
+			gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
 			gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
 		}
 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);

From 8e8e23dea43e64ddafbd1246644c3219209be113 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:26 +0000
Subject: [PATCH 15/46] sched/topology: Compute sd_weight considering cpuset
 partitions

The "sd_weight" used for calculating the load balancing interval, and
its limits, considers the span weight of the entire topology level
without accounting for cpuset partitions.

For example, consider a large system of 128CPUs divided into 8 * 16CPUs
partition which is typical when deploying virtual machines:

  [                      PKG Domain: 128CPUs                      ]

  [Partition0: 16CPUs][Partition1: 16CPUs] ... [Partition7: 16CPUs]

Although each partition only contains 16CPUs, the load balancing
interval is set to a minimum of 128 jiffies considering the span of the
entire domain with 128CPUs which can lead to longer imbalances within
the partition although balancing within is cheaper with 16CPUs.

Compute the "sd_weight" after computing the "sd_span" considering the
cpu_map covered by the partition, and set the load balancing interval,
and its limits accordingly.

For the above example, the balancing intervals for the partitions PKG
domain changes as follows:

                  before   after
balance_interval   128      16
min_interval       128      16
max_interval       256      32

Intervals are now proportional to the CPUs in the partitioned domain as
was intended by the original formula.

Fixes: cb83b629bae03 ("sched/numa: Rewrite the CONFIG_NUMA sched domain support")
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-2-kprateek.nayak@amd.com
---
 kernel/sched/topology.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 061f8c85f555..79bab80af8f2 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1645,13 +1645,17 @@ sd_init(struct sched_domain_topology_level *tl,
 	struct cpumask *sd_span;
 	u64 now = sched_clock();
 
-	sd_weight = cpumask_weight(tl->mask(tl, cpu));
+	sd_span = sched_domain_span(sd);
+	cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
+	sd_weight = cpumask_weight(sd_span);
+	sd_id = cpumask_first(sd_span);
 
 	if (tl->sd_flags)
 		sd_flags = (*tl->sd_flags)();
 	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
-			"wrong sd_flags in topology description\n"))
+		      "wrong sd_flags in topology description\n"))
 		sd_flags &= TOPOLOGY_SD_FLAGS;
+	sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
 
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
@@ -1689,12 +1693,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		.name			= tl->name,
 	};
 
-	sd_span = sched_domain_span(sd);
-	cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu));
-	sd_id = cpumask_first(sd_span);
-
-	sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map);
-
 	WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) ==
 		  (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY),
 		  "CPU capacity asymmetry not supported on SMT\n");

From 5a7b576b3ec1acc2694c5b58f80cd1d44a11b2c1 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:27 +0000
Subject: [PATCH 16/46] sched/topology: Extract "imb_numa_nr" calculation into
 a separate helper

Subsequent changes to assign "sd->shared" from "s_data" would
necessitate finding the topmost SD_SHARE_LLC to assign shared object to.

This is very similar to the "imb_numa_nr" computation loop except that
"imb_numa_nr" cares about the first domain without the SD_SHARE_LLC flag
(immediate parent of sd_llc) whereas the "sd->shared" assignment would
require sd_llc itself.

Extract the "imb_numa_nr" calculation into a helper
adjust_numa_imbalance() and use the current loop in the
build_sched_domains() to find the sd_llc.

While at it, guard the call behind CONFIG_NUMA's status since
"imb_numa_nr" only makes sense on NUMA enabled configs with SD_NUMA
domains.

No functional changes intended.

Suggested-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-3-kprateek.nayak@amd.com
---
 kernel/sched/topology.c | 133 ++++++++++++++++++++++++----------------
 1 file changed, 80 insertions(+), 53 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79bab80af8f2..6303790a4143 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2549,6 +2549,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	return true;
 }
 
+/*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+static void adjust_numa_imbalance(struct sched_domain *sd_llc)
+{
+	struct sched_domain *parent;
+	unsigned int imb_span = 1;
+	unsigned int imb = 0;
+	unsigned int nr_llcs;
+
+	WARN_ON(!(sd_llc->flags & SD_SHARE_LLC));
+	WARN_ON(!sd_llc->parent);
+
+	/*
+	 * For a single LLC per node, allow an
+	 * imbalance up to 12.5% of the node. This is
+	 * arbitrary cutoff based two factors -- SMT and
+	 * memory channels. For SMT-2, the intent is to
+	 * avoid premature sharing of HT resources but
+	 * SMT-4 or SMT-8 *may* benefit from a different
+	 * cutoff. For memory channels, this is a very
+	 * rough estimate of how many channels may be
+	 * active and is based on recent CPUs with
+	 * many cores.
+	 *
+	 * For multiple LLCs, allow an imbalance
+	 * until multiple tasks would share an LLC
+	 * on one node while LLCs on another node
+	 * remain idle. This assumes that there are
+	 * enough logical CPUs per LLC to avoid SMT
+	 * factors and that there is a correlation
+	 * between LLCs and memory channels.
+	 */
+	nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight;
+	if (nr_llcs == 1)
+		imb = sd_llc->parent->span_weight >> 3;
+	else
+		imb = nr_llcs;
+
+	imb = max(1U, imb);
+	sd_llc->parent->imb_numa_nr = imb;
+
+	/*
+	 * Set span based on the first NUMA domain.
+	 *
+	 * NUMA systems always add a NODE domain before
+	 * iterating the NUMA domains. Since this is before
+	 * degeneration, start from sd_llc's parent's
+	 * parent which is the lowest an SD_NUMA domain can
+	 * be relative to sd_llc.
+	 */
+	parent = sd_llc->parent->parent;
+	while (parent && !(parent->flags & SD_NUMA))
+		parent = parent->parent;
+
+	imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight;
+
+	/* Update the upper remainder of the topology */
+	parent = sd_llc->parent;
+	while (parent) {
+		int factor = max(1U, (parent->span_weight / imb_span));
+
+		parent->imb_numa_nr = imb * factor;
+		parent = parent->parent;
+	}
+}
+
 /*
  * Build sched domains for a given set of CPUs and attach the sched domains
  * to the individual CPUs
@@ -2606,62 +2674,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		}
 	}
 
-	/*
-	 * Calculate an allowed NUMA imbalance such that LLCs do not get
-	 * imbalanced.
-	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned int imb = 0;
-		unsigned int imb_span = 1;
+		sd = *per_cpu_ptr(d.sd, i);
+		if (!sd)
+			continue;
 
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-			struct sched_domain *child = sd->child;
+		/* First, find the topmost SD_SHARE_LLC domain */
+		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
+			sd = sd->parent;
 
-			if (!(sd->flags & SD_SHARE_LLC) && child &&
-			    (child->flags & SD_SHARE_LLC)) {
-				struct sched_domain __rcu *top_p;
-				unsigned int nr_llcs;
-
-				/*
-				 * For a single LLC per node, allow an
-				 * imbalance up to 12.5% of the node. This is
-				 * arbitrary cutoff based two factors -- SMT and
-				 * memory channels. For SMT-2, the intent is to
-				 * avoid premature sharing of HT resources but
-				 * SMT-4 or SMT-8 *may* benefit from a different
-				 * cutoff. For memory channels, this is a very
-				 * rough estimate of how many channels may be
-				 * active and is based on recent CPUs with
-				 * many cores.
-				 *
-				 * For multiple LLCs, allow an imbalance
-				 * until multiple tasks would share an LLC
-				 * on one node while LLCs on another node
-				 * remain idle. This assumes that there are
-				 * enough logical CPUs per LLC to avoid SMT
-				 * factors and that there is a correlation
-				 * between LLCs and memory channels.
-				 */
-				nr_llcs = sd->span_weight / child->span_weight;
-				if (nr_llcs == 1)
-					imb = sd->span_weight >> 3;
-				else
-					imb = nr_llcs;
-				imb = max(1U, imb);
-				sd->imb_numa_nr = imb;
-
-				/* Set span based on the first NUMA domain. */
-				top_p = sd->parent;
-				while (top_p && !(top_p->flags & SD_NUMA)) {
-					top_p = top_p->parent;
-				}
-				imb_span = top_p ? top_p->span_weight : sd->span_weight;
-			} else {
-				int factor = max(1U, (sd->span_weight / imb_span));
-
-				sd->imb_numa_nr = imb * factor;
-			}
-		}
+		/*
+		 * In presence of higher domains, adjust the
+		 * NUMA imbalance stats for the hierarchy.
+		 */
+		if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent)
+			adjust_numa_imbalance(sd);
 	}
 
 	/* Calculate CPU capacity for physical packages and nodes */

From 1cc8a33ca7e8d38f962b64ece2a42c411a67bc76 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:28 +0000
Subject: [PATCH 17/46] sched/topology: Allocate per-CPU sched_domain_shared in
 s_data

The "sched_domain_shared" object is allocated for every topology level
in __sdt_alloc() and is freed post sched domain rebuild if they aren't
assigned during sd_init().

"sd->shared" is only assigned for SD_SHARE_LLC domains and out of all
the assigned objects, only "sd_llc_shared" is ever used by the
scheduler.

Since only "sd_llc_shared" is ever used, and since SD_SHARE_LLC domains
never overlap, allocate only a single range of per-CPU
"sched_domain_shared" object with s_data instead of doing it per
topology level.

The subsequent commit uses the degeneration path to correctly assign the
"sd->shared" to the topmost SD_SHARE_LLC domain.

No functional changes are expected at this point.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-4-kprateek.nayak@amd.com
---
 kernel/sched/topology.c | 48 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6303790a4143..9006586720bf 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -782,6 +782,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 }
 
 struct s_data {
+	struct sched_domain_shared * __percpu *sds;
 	struct sched_domain * __percpu *sd;
 	struct root_domain	*rd;
 };
@@ -789,6 +790,7 @@ struct s_data {
 enum s_alloc {
 	sa_rootdomain,
 	sa_sd,
+	sa_sd_shared,
 	sa_sd_storage,
 	sa_none,
 };
@@ -1535,6 +1537,9 @@ static void set_domain_attribute(struct sched_domain *sd,
 static void __sdt_free(const struct cpumask *cpu_map);
 static int __sdt_alloc(const struct cpumask *cpu_map);
 
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map);
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map);
+
 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 				 const struct cpumask *cpu_map)
 {
@@ -1546,6 +1551,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 	case sa_sd:
 		free_percpu(d->sd);
 		fallthrough;
+	case sa_sd_shared:
+		__sds_free(d, cpu_map);
+		fallthrough;
 	case sa_sd_storage:
 		__sdt_free(cpu_map);
 		fallthrough;
@@ -1561,9 +1569,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
 
 	if (__sdt_alloc(cpu_map))
 		return sa_sd_storage;
+	if (__sds_alloc(d, cpu_map))
+		return sa_sd_shared;
 	d->sd = alloc_percpu(struct sched_domain *);
 	if (!d->sd)
-		return sa_sd_storage;
+		return sa_sd_shared;
 	d->rd = alloc_rootdomain();
 	if (!d->rd)
 		return sa_sd;
@@ -2464,6 +2474,42 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
+{
+	int j;
+
+	d->sds = alloc_percpu(struct sched_domain_shared *);
+	if (!d->sds)
+		return -ENOMEM;
+
+	for_each_cpu(j, cpu_map) {
+		struct sched_domain_shared *sds;
+
+		sds = kzalloc_node(sizeof(struct sched_domain_shared),
+				GFP_KERNEL, cpu_to_node(j));
+		if (!sds)
+			return -ENOMEM;
+
+		*per_cpu_ptr(d->sds, j) = sds;
+	}
+
+	return 0;
+}
+
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map)
+{
+	int j;
+
+	if (!d->sds)
+		return;
+
+	for_each_cpu(j, cpu_map)
+		kfree(*per_cpu_ptr(d->sds, j));
+
+	free_percpu(d->sds);
+	d->sds = NULL;
+}
+
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)

From bb7a5e44fc6f3d5a252d95c48d057d5beccb8b35 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:29 +0000
Subject: [PATCH 18/46] sched/topology: Switch to assigning "sd->shared" from
 s_data

Use the "sched_domain_shared" object allocated in s_data for
"sd->shared" assignments. Assign "sd->shared" for the topmost
SD_SHARE_LLC domain before degeneration and rely on the degeneration
path to correctly pass down the shared object to "sd_llc".

sd_degenerate_parent() ensures degenerating domains must have the same
sched_domain_span() which ensures 1:1 passing down of the shared object.
If the topmost SD_SHARE_LLC domain degenerates, the shared object is
freed from destroy_sched_domain() when the last reference is dropped.

claim_allocations() NULLs out the objects that have been assigned as
"sd->shared" and the unassigned ones are freed from the __sds_free()
path.

To keep all the claim_allocations() bits in one place,
claim_allocations() has been extended to accept "s_data" and iterate the
domains internally to free both "sched_domain_shared" and the
per-topology-level data for the particular CPU in one place.

Post cpu_attach_domain(), all reclaims of "sd->shared" are handled via
call_rcu() on the sched_domain object via destroy_sched_domains_rcu().

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-5-kprateek.nayak@amd.com
---
 kernel/sched/topology.c | 73 +++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9006586720bf..b19d84f44669 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -685,6 +685,9 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
+
+		/* If sd_llc exists, sd_llc_shared should exist too. */
+		WARN_ON_ONCE(!sd->shared);
 		sds = sd->shared;
 	}
 
@@ -733,6 +736,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 		if (sd_parent_degenerate(tmp, parent)) {
 			tmp->parent = parent->parent;
 
+			/* Pick reference to parent->shared. */
+			if (parent->shared) {
+				WARN_ON_ONCE(tmp->shared);
+				tmp->shared = parent->shared;
+				parent->shared = NULL;
+			}
+
 			if (parent->parent) {
 				parent->parent->child = tmp;
 				parent->parent->groups->flags = tmp->flags;
@@ -1586,21 +1596,28 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
  * sched_group structure so that the subsequent __free_domain_allocs()
  * will not free the data we're using.
  */
-static void claim_allocations(int cpu, struct sched_domain *sd)
+static void claim_allocations(int cpu, struct s_data *d)
 {
-	struct sd_data *sdd = sd->private;
+	struct sched_domain *sd;
 
-	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
-	*per_cpu_ptr(sdd->sd, cpu) = NULL;
+	if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref))
+		*per_cpu_ptr(d->sds, cpu) = NULL;
 
-	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
-		*per_cpu_ptr(sdd->sds, cpu) = NULL;
+	for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) {
+		struct sd_data *sdd = sd->private;
 
-	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
-		*per_cpu_ptr(sdd->sg, cpu) = NULL;
+		WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+		*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
-		*per_cpu_ptr(sdd->sgc, cpu) = NULL;
+		if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
+			*per_cpu_ptr(sdd->sds, cpu) = NULL;
+
+		if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
+			*per_cpu_ptr(sdd->sg, cpu) = NULL;
+
+		if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+			*per_cpu_ptr(sdd->sgc, cpu) = NULL;
+	}
 }
 
 #ifdef CONFIG_NUMA
@@ -1738,16 +1755,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->cache_nice_tries = 1;
 	}
 
-	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
-	 */
-	if (sd->flags & SD_SHARE_LLC) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
-		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}
-
 	sd->private = sdd;
 
 	return sd;
@@ -2729,12 +2736,20 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		while (sd->parent && (sd->parent->flags & SD_SHARE_LLC))
 			sd = sd->parent;
 
-		/*
-		 * In presence of higher domains, adjust the
-		 * NUMA imbalance stats for the hierarchy.
-		 */
-		if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent)
-			adjust_numa_imbalance(sd);
+		if (sd->flags & SD_SHARE_LLC) {
+			int sd_id = cpumask_first(sched_domain_span(sd));
+
+			sd->shared = *per_cpu_ptr(d.sds, sd_id);
+			atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+			atomic_inc(&sd->shared->ref);
+
+			/*
+			 * In presence of higher domains, adjust the
+			 * NUMA imbalance stats for the hierarchy.
+			 */
+			if (IS_ENABLED(CONFIG_NUMA) && sd->parent)
+				adjust_numa_imbalance(sd);
+		}
 	}
 
 	/* Calculate CPU capacity for physical packages and nodes */
@@ -2742,10 +2757,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		if (!cpumask_test_cpu(i, cpu_map))
 			continue;
 
-		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
-			claim_allocations(i, sd);
+		claim_allocations(i, &d);
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent)
 			init_sched_groups_capacity(i, sd);
-		}
 	}
 
 	/* Attach the domains */

From 10febd397591d93f42adb743c2c664041e7f1bcb Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:30 +0000
Subject: [PATCH 19/46] sched/topology: Remove sched_domain_shared allocation
 with sd_data

Now that "sd->shared" assignments are using the sched_domain_shared
objects allocated with s_data, remove the sd_data based allocations.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Valentin Schneider <vschneid@redhat.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-6-kprateek.nayak@amd.com
---
 include/linux/sched/topology.h |  1 -
 kernel/sched/topology.c        | 19 -------------------
 2 files changed, 20 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a1e1032426dc..51c29581f15e 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -172,7 +172,6 @@ typedef int (*sched_domain_flags_f)(void);
 
 struct sd_data {
 	struct sched_domain *__percpu *sd;
-	struct sched_domain_shared *__percpu *sds;
 	struct sched_group *__percpu *sg;
 	struct sched_group_capacity *__percpu *sgc;
 };
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b19d84f44669..43150591914b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1609,9 +1609,6 @@ static void claim_allocations(int cpu, struct s_data *d)
 		WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 		*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-		if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
-			*per_cpu_ptr(sdd->sds, cpu) = NULL;
-
 		if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 			*per_cpu_ptr(sdd->sg, cpu) = NULL;
 
@@ -2390,10 +2387,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sd)
 			return -ENOMEM;
 
-		sdd->sds = alloc_percpu(struct sched_domain_shared *);
-		if (!sdd->sds)
-			return -ENOMEM;
-
 		sdd->sg = alloc_percpu(struct sched_group *);
 		if (!sdd->sg)
 			return -ENOMEM;
@@ -2404,7 +2397,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
-			struct sched_domain_shared *sds;
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
@@ -2415,13 +2407,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 			*per_cpu_ptr(sdd->sd, j) = sd;
 
-			sds = kzalloc_node(sizeof(struct sched_domain_shared),
-					GFP_KERNEL, cpu_to_node(j));
-			if (!sds)
-				return -ENOMEM;
-
-			*per_cpu_ptr(sdd->sds, j) = sds;
-
 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sg)
@@ -2463,8 +2448,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}
 
-			if (sdd->sds)
-				kfree(*per_cpu_ptr(sdd->sds, j));
 			if (sdd->sg)
 				kfree(*per_cpu_ptr(sdd->sg, j));
 			if (sdd->sgc)
@@ -2472,8 +2455,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		}
 		free_percpu(sdd->sd);
 		sdd->sd = NULL;
-		free_percpu(sdd->sds);
-		sdd->sds = NULL;
 		free_percpu(sdd->sg);
 		sdd->sg = NULL;
 		free_percpu(sdd->sgc);

From f494bfb04615119f31dbd3222c9d39fea3817d40 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:31 +0000
Subject: [PATCH 20/46] sched/core: Check for rcu_read_lock_any_held() in
 idle_get_state()

Similar to commit 71fedc41c23b ("sched/fair: Switch to
rcu_dereference_all()"), switch to checking for rcu_read_lock_any_held()
in idle_get_state() to allow removing superfluous rcu_read_lock()
regions in the fair task's wakeup path where the pi_lock is held and
IRQs are disabled.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-7-kprateek.nayak@amd.com
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 953d89d71804..b863bbda6de8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2853,7 +2853,7 @@ static inline void idle_set_state(struct rq *rq,
 
 static inline struct cpuidle_state *idle_get_state(struct rq *rq)
 {
-	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert(rcu_read_lock_any_held());
 
 	return rq->idle_state;
 }

From 8ca12326f592f7554acf2788ecb1c5c954dcf31c Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <dietmar.eggemann@arm.com>
Date: Mon, 16 Mar 2026 00:36:22 +0100
Subject: [PATCH 21/46] PM: EM: Switch to rcu_dereference_all() in wakeup path

em_cpu_energy() is part of the EAS (Fair) task wakeup path. Now that
rcu_read_{,un}lock() have been removed from find_energy_efficient_cpu()
switch to rcu_dereference_all() and check for rcu_read_lock_any_held()
in em_cpu_energy() as well.
In EAS (Fair) task wakeup path is a preempt/IRQ disabled region, so
rcu_read_{,un}lock() can be removed.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/5b1228b7-5949-4a45-9f62-e8ce936de694@arm.com
---
 include/linux/energy_model.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index e7497f804644..c909a8ba22e8 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -248,7 +248,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	struct em_perf_state *ps;
 	int i;
 
-	WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n");
+	lockdep_assert(rcu_read_lock_any_held());
 
 	if (!sum_util)
 		return 0;
@@ -267,7 +267,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
 	 * Find the lowest performance state of the Energy Model above the
 	 * requested performance.
 	 */
-	em_table = rcu_dereference(pd->em_table);
+	em_table = rcu_dereference_all(pd->em_table);
 	i = em_pd_get_efficient_state(em_table->state, pd, max_util);
 	ps = &em_table->state[i];
 

From fa6874dfeee06352ce7c4c271be6a25d84a38b54 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:32 +0000
Subject: [PATCH 22/46] sched/fair: Remove superfluous rcu_read_lock() in the
 wakeup path

select_task_rq_fair() is always called with p->pi_lock held and IRQs
disabled which makes it equivalent of an RCU read-side.

Since commit 71fedc41c23b ("sched/fair: Switch to
rcu_dereference_all()") switched to using rcu_dereference_all() in the
wakeup path, drop the explicit rcu_read_{lock,unlock}() in the fair
task's wakeup path.

Future plans to reuse select_task_rq_fair() /
find_energy_efficient_cpu() in the fair class' balance callback will do
so with IRQs disabled and will comply with the requirements of
rcu_dereference_all() which makes this safe keeping in mind future
development plans too.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-8-kprateek.nayak@amd.com
---
 kernel/sched/fair.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c1e5c8298fd1..3e24d3e16522 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8570,10 +8570,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	struct perf_domain *pd;
 	struct energy_env eenv;
 
-	rcu_read_lock();
 	pd = rcu_dereference_all(rd->pd);
 	if (!pd)
-		goto unlock;
+		return target;
 
 	/*
 	 * Energy-aware wake-up happens on the lowest sched_domain starting
@@ -8583,13 +8582,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
 		sd = sd->parent;
 	if (!sd)
-		goto unlock;
+		return target;
 
 	target = prev_cpu;
 
 	sync_entity_load_avg(&p->se);
 	if (!task_util_est(p) && p_util_min == 0)
-		goto unlock;
+		return target;
 
 	eenv_task_busy_time(&eenv, p, prev_cpu);
 
@@ -8684,7 +8683,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 						    prev_cpu);
 			/* CPU utilization has changed */
 			if (prev_delta < base_energy)
-				goto unlock;
+				return target;
 			prev_delta -= base_energy;
 			prev_actual_cap = cpu_actual_cap;
 			best_delta = min(best_delta, prev_delta);
@@ -8708,7 +8707,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 						   max_spare_cap_cpu);
 			/* CPU utilization has changed */
 			if (cur_delta < base_energy)
-				goto unlock;
+				return target;
 			cur_delta -= base_energy;
 
 			/*
@@ -8725,7 +8724,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 			best_actual_cap = cpu_actual_cap;
 		}
 	}
-	rcu_read_unlock();
 
 	if ((best_fits > prev_fits) ||
 	    ((best_fits > 0) && (best_delta < prev_delta)) ||
@@ -8733,11 +8731,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 		target = best_energy_cpu;
 
 	return target;
-
-unlock:
-	rcu_read_unlock();
-
-	return target;
 }
 
 /*
@@ -8782,7 +8775,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
 	}
 
-	rcu_read_lock();
 	for_each_domain(cpu, tmp) {
 		/*
 		 * If both 'cpu' and 'prev_cpu' are part of this domain,
@@ -8808,14 +8800,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 			break;
 	}
 
-	if (unlikely(sd)) {
-		/* Slow path */
-		new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
-	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
-		/* Fast path */
-		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-	}
-	rcu_read_unlock();
+	/* Slow path */
+	if (unlikely(sd))
+		return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag);
+
+	/* Fast path */
+	if (wake_flags & WF_TTWU)
+		return select_idle_sibling(p, prev_cpu, new_cpu);
 
 	return new_cpu;
 }

From f1320a8dd8ba6518ddb53ea4e3efcb49dc41d257 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:33 +0000
Subject: [PATCH 23/46] sched/fair: Simplify the entry condition for
 update_idle_cpu_scan()

Only the topmost SD_SHARE_LLC domain has the "sd->shared" assigned.
Simply use "sd->shared" as an indicator for load balancing at the highest
SD_SHARE_LLC domain in update_idle_cpu_scan() instead of relying on
llc_size.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-9-kprateek.nayak@amd.com
---
 kernel/sched/fair.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e24d3e16522..85c22f0f8de8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11234,6 +11234,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
 				 unsigned long sum_util)
 {
 	struct sched_domain_shared *sd_share;
+	struct sched_domain *sd = env->sd;
 	int llc_weight, pct;
 	u64 x, y, tmp;
 	/*
@@ -11247,11 +11248,7 @@ static void update_idle_cpu_scan(struct lb_env *env,
 	if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE)
 		return;
 
-	llc_weight = per_cpu(sd_llc_size, env->dst_cpu);
-	if (env->sd->span_weight != llc_weight)
-		return;
-
-	sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu));
+	sd_share = sd->shared;
 	if (!sd_share)
 		return;
 
@@ -11285,10 +11282,11 @@ static void update_idle_cpu_scan(struct lb_env *env,
 	 */
 	/* equation [3] */
 	x = sum_util;
+	llc_weight = sd->span_weight;
 	do_div(x, llc_weight);
 
 	/* equation [4] */
-	pct = env->sd->imbalance_pct;
+	pct = sd->imbalance_pct;
 	tmp = x * x * pct * pct;
 	do_div(tmp, 10000 * SCHED_CAPACITY_SCALE);
 	tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE);

From fe7171d0d5dfbe189e41db99580ebacafc3c09ce Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Thu, 12 Mar 2026 04:44:34 +0000
Subject: [PATCH 24/46] sched/fair: Simplify SIS_UTIL handling in
 select_idle_cpu()

Use the "sd_llc" passed to select_idle_cpu() to obtain the
"sd_llc_shared" instead of dereferencing the per-CPU variable.

Since "sd->shared" is always reclaimed at the same time as "sd" via
call_rcu() and update_top_cache_domain() always ensures a valid
"sd->shared" assignment when "sd_llc" is present, "sd_llc->shared" can
always be dereferenced without needing an additional check.

While at it move the cpumask_and() operation after the SIS_UTIL bailout
check to avoid unnecessarily computing the cpumask.

Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260312044434.1974-10-kprateek.nayak@amd.com
---
 kernel/sched/fair.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 85c22f0f8de8..0a35a82e4792 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7876,21 +7876,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 {
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
-	struct sched_domain_shared *sd_share;
-
-	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
 	if (sched_feat(SIS_UTIL)) {
-		sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target));
-		if (sd_share) {
-			/* because !--nr is the condition to stop scan */
-			nr = READ_ONCE(sd_share->nr_idle_scan) + 1;
-			/* overloaded LLC is unlikely to have idle cpu/core */
-			if (nr == 1)
-				return -1;
-		}
+		/*
+		 * Increment because !--nr is the condition to stop scan.
+		 *
+		 * Since "sd" is "sd_llc" for target CPU dereferenced in the
+		 * caller, it is safe to directly dereference "sd->shared".
+		 * Topology bits always ensure it assigned for "sd_llc" abd it
+		 * cannot disappear as long as we have a RCU protected
+		 * reference to one the associated "sd" here.
+		 */
+		nr = READ_ONCE(sd->shared->nr_idle_scan) + 1;
+		/* overloaded LLC is unlikely to have idle cpu/core */
+		if (nr == 1)
+			return -1;
 	}
 
+	if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr))
+		return -1;
+
 	if (static_branch_unlikely(&sched_cluster_active)) {
 		struct sched_group *sg = sd->groups;
 

From e379dce8af11d8d6040b4348316a499bfd174bfb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 23 Mar 2026 10:36:27 +0100
Subject: [PATCH 25/46] sched/topology: Fix sched_domain_span()

Commit 8e8e23dea43e ("sched/topology: Compute sd_weight considering
cpuset partitions") ends up relying on the fact that structure
initialization should not touch the flexible array.

However, the official GCC specification for "Arrays of Length Zero"
[*] says:

  Although the size of a zero-length array is zero, an array member of
  this kind may increase the size of the enclosing type as a result of
  tail padding.

Additionally, structure initialization will zero tail padding. With
the end result that since offsetof(*type, member) < sizeof(*type),
array initialization will clobber the flex array.

Luckily, the way flexible array sizes are calculated is:

  sizeof(*type) + count * sizeof(*type->member)

This means we have the complete size of the flex array *outside* of
sizeof(*type), so use that instead of relying on the broken flex array
definition.

[*] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html

Fixes: 8e8e23dea43e ("sched/topology: Compute sd_weight considering cpuset partitions")
Reported-by: Nathan Chancellor <nathan@kernel.org>
Debugged-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Link: https://patch.msgid.link/20260323093627.GY3738010@noisy.programming.kicks-ass.net
---
 include/linux/sched/topology.h | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 51c29581f15e..36553e14866d 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -142,18 +142,30 @@ struct sched_domain {
 
 	unsigned int span_weight;
 	/*
-	 * Span of all CPUs in this domain.
+	 * See sched_domain_span(), on why flex arrays are broken.
 	 *
-	 * NOTE: this field is variable length. (Allocated dynamically
-	 * by attaching extra space to the end of the structure,
-	 * depending on how many CPUs the kernel has booted up with)
-	 */
 	unsigned long span[];
+	 */
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
 {
-	return to_cpumask(sd->span);
+	/*
+	 * Turns out that C flexible arrays are fundamentally broken since it
+	 * is allowed for offsetof(*sd, span) < sizeof(*sd), this means that
+	 * structure initialzation *sd = { ... }; which writes every byte
+	 * inside sizeof(*type), will over-write the start of the flexible
+	 * array.
+	 *
+	 * Luckily, the way we allocate sched_domain is by:
+	 *
+	 *   sizeof(*sd) + cpumask_size()
+	 *
+	 * this means that we have sufficient space for the whole flex array
+	 * *outside* of sizeof(*sd). So use that, and avoid using sd->span.
+	 */
+	unsigned long *bitmap = (void *)sd + sizeof(*sd);
+	return to_cpumask(bitmap);
 }
 
 extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],

From 76504bce4ee6b8757647e07bc1710dcac9acdc2e Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.ibm.com>
Date: Tue, 24 Mar 2026 01:06:27 +0530
Subject: [PATCH 26/46] sched/fair: Get this cpu once in find_new_ilb()

Calling smp_processor_id() on:
 - In CONFIG_DEBUG_PREEMPT=y, if preemption/irq is disabled, then it does
   not print any warning.
 - In CONFIG_DEBUG_PREEMPT=n, it doesn't do anything apart from getting
   __smp_processor_id

So with both CONFIG_DEBUG_PREEMPT=y/n, in preemption disabled section
it is better to cache the value. It could save a few cycles. Though
tiny, repeated in loop could add up to a small value.

find_new_ilb is called in interrupt context. So preemption is disabled.
So Hoist the this_cpu out of loop

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mukesh Kumar Chaurasiya (IBM) <mkchauras@gmail.com>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260323193630.640311-2-sshegde@linux.ibm.com
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0a35a82e4792..226509231e67 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -12614,14 +12614,14 @@ static inline int on_null_domain(struct rq *rq)
  */
 static inline int find_new_ilb(void)
 {
+	int this_cpu = smp_processor_id();
 	const struct cpumask *hk_mask;
 	int ilb_cpu;
 
 	hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
 
 	for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
-
-		if (ilb_cpu == smp_processor_id())
+		if (ilb_cpu == this_cpu)
 			continue;
 
 		if (idle_cpu(ilb_cpu))

From 0e81fe79fec5a639700f09f39c8ab680c3312ba2 Mon Sep 17 00:00:00 2001
From: Shrikanth Hegde <sshegde@linux.ibm.com>
Date: Tue, 24 Mar 2026 01:06:28 +0530
Subject: [PATCH 27/46] sched/core: Get this cpu once in ttwu_queue_cond()

Calling smp_processor_id() on:
 - In CONFIG_DEBUG_PREEMPT=y, if preemption/irq is disabled, then it does
   not print any warning.
 - In CONFIG_DEBUG_PREEMPT=n, it doesn't do anything apart from getting
   __smp_processor_id

So with both CONFIG_DEBUG_PREEMPT=y/n, in preemption disabled section
it is better to cache the value. It could save a few cycles. Though
tiny, repeated could add up to a small value.

ttwu_queue_cond is called with interrupt disabled. So preemption is
disabled. Hence cache the value once instead.

Signed-off-by: Shrikanth Hegde <sshegde@linux.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Mukesh Kumar Chaurasiya (IBM) <mkchauras@gmail.com>
Link: https://patch.msgid.link/20260323193630.640311-3-sshegde@linux.ibm.com
---
 kernel/sched/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 64b467c1d5b6..7c7d4bf686d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3842,6 +3842,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
 
 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
+	int this_cpu = smp_processor_id();
+
 	/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
 	if (!scx_allow_ttwu_queue(p))
 		return false;
@@ -3866,10 +3868,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 	 * If the CPU does not share cache, then queue the task on the
 	 * remote rqs wakelist to avoid accessing remote data.
 	 */
-	if (!cpus_share_cache(smp_processor_id(), cpu))
+	if (!cpus_share_cache(this_cpu, cpu))
 		return true;
 
-	if (cpu == smp_processor_id())
+	if (cpu == this_cpu)
 		return false;
 
 	/*

From 265439eb88fda0bf77821e10aafed22cdd450f9d Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Wed, 25 Mar 2026 17:26:04 +0000
Subject: [PATCH 28/46] MAINTAINERS: Add K Prateek Nayak to scheduler reviewers

I've been fortunate to have K Prateek take an active interest in
my Proxy Execution patches. He's provided great review insights
and productive feedback, sometimes remembering the subtleties of
the patch series better than I do myself! And he has done all
this with great kindness and humility.

I've really appreciated his reviews, and I think everyone should
be so lucky, so I wanted to propose adding him to the scheduler
reviewers list.

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Acked-by: K Prateek Nayak <kprateek.nayak@amd.com>
Acked-by: Juri Lelli <juri.lelli@redhat.com>
Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Link: https://patch.msgid.link/20260325172615.2895622-1-jstultz@google.com
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 96ea84948d76..82ed752f93da 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23661,6 +23661,7 @@ R:	Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
 R:	Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
 R:	Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
 R:	Valentin Schneider <vschneid@redhat.com> (TOPOLOGY)
+R:	K Prateek Nayak <kprateek.nayak@amd.com>
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core

From e0ca8991b2de6c9dfe6fcd8a0364951b2bd56797 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:16 +0000
Subject: [PATCH 29/46] sched: Make class_schedulers avoid pushing current, and
 get rid of proxy_tag_curr()

With proxy-execution, the scheduler selects the donor, but for
blocked donors, we end up running the lock owner.

This caused some complexity, because the class schedulers make
sure to remove the task they pick from their pushable task
lists, which prevents the donor from being migrated, but there
wasn't then anything to prevent rq->curr from being migrated
if rq->curr != rq->donor.

This was sort of hacked around by calling proxy_tag_curr() on
the rq->curr task if we were running something other then the
donor. proxy_tag_curr() did a dequeue/enqueue pair on the
rq->curr task, allowing the class schedulers to remove it from
their pushable list.

The dequeue/enqueue pair was wasteful, and additonally K Prateek
highlighted that we didn't properly undo things when we stopped
proxying, leaving the lock owner off the pushable list.

After some alternative approaches were considered, Peter
suggested just having the RT/DL classes just avoid migrating
when task_on_cpu().

So rework pick_next_pushable_dl_task() and the rt
pick_next_pushable_task() functions so that they skip over the
first pushable task if it is on_cpu.

Then just drop all of the proxy_tag_curr() logic.

Fixes: be39617e38e0 ("sched: Fix proxy/current (push,pull)ability")
Closes: https://lore.kernel.org/lkml/e735cae0-2cc9-4bae-b761-fcb082ed3e94@amd.com/
Reported-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260324191337.1841376-2-jstultz@google.com
---
 kernel/sched/core.c     | 24 ------------------------
 kernel/sched/deadline.c | 18 ++++++++++++++++--
 kernel/sched/rt.c       | 15 ++++++++++++---
 3 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7c7d4bf686d7..29741685762d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6702,23 +6702,6 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 }
 #endif /* SCHED_PROXY_EXEC */
 
-static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner)
-{
-	if (!sched_proxy_exec())
-		return;
-	/*
-	 * pick_next_task() calls set_next_task() on the chosen task
-	 * at some point, which ensures it is not push/pullable.
-	 * However, the chosen/donor task *and* the mutex owner form an
-	 * atomic pair wrt push/pull.
-	 *
-	 * Make sure owner we run is not pushable. Unfortunately we can
-	 * only deal with that by means of a dequeue/enqueue cycle. :-/
-	 */
-	dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE);
-	enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE);
-}
-
 /*
  * __schedule() is the main scheduler function.
  *
@@ -6871,9 +6854,6 @@ static void __sched notrace __schedule(int sched_mode)
 		 */
 		RCU_INIT_POINTER(rq->curr, next);
 
-		if (!task_current_donor(rq, next))
-			proxy_tag_curr(rq, next);
-
 		/*
 		 * The membarrier system call requires each architecture
 		 * to have a full memory barrier after updating
@@ -6907,10 +6887,6 @@ static void __sched notrace __schedule(int sched_mode)
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
 	} else {
-		/* In case next was already curr but just got blocked_donor */
-		if (!task_current_donor(rq, next))
-			proxy_tag_curr(rq, next);
-
 		rq_unpin_lock(rq, &rf);
 		__balance_callbacks(rq, NULL);
 		raw_spin_rq_unlock_irq(rq);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9e253a825f39..27359a1e995f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2805,12 +2805,26 @@ static int find_later_rq(struct task_struct *task)
 
 static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
 {
-	struct task_struct *p;
+	struct task_struct *i, *p = NULL;
+	struct rb_node *next_node;
 
 	if (!has_pushable_dl_tasks(rq))
 		return NULL;
 
-	p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
+	next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+	while (next_node) {
+		i = __node_2_pdl(next_node);
+		/* make sure task isn't on_cpu (possible with proxy-exec) */
+		if (!task_on_cpu(rq, i)) {
+			p = i;
+			break;
+		}
+
+		next_node = rb_next(next_node);
+	}
+
+	if (!p)
+		return NULL;
 
 	WARN_ON_ONCE(rq->cpu != task_cpu(p));
 	WARN_ON_ONCE(task_current(rq, p));
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3d823f5ffe2c..4e5f1957b91b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1858,13 +1858,22 @@ static int find_lowest_rq(struct task_struct *task)
 
 static struct task_struct *pick_next_pushable_task(struct rq *rq)
 {
-	struct task_struct *p;
+	struct plist_head *head = &rq->rt.pushable_tasks;
+	struct task_struct *i, *p = NULL;
 
 	if (!has_pushable_tasks(rq))
 		return NULL;
 
-	p = plist_first_entry(&rq->rt.pushable_tasks,
-			      struct task_struct, pushable_tasks);
+	plist_for_each_entry(i, head, pushable_tasks) {
+		/* make sure task isn't on_cpu (possible with proxy-exec) */
+		if (!task_on_cpu(rq, i)) {
+			p = i;
+			break;
+		}
+	}
+
+	if (!p)
+		return NULL;
 
 	BUG_ON(rq->cpu != task_cpu(p));
 	BUG_ON(task_current(rq, p));

From 37341ec573da7c16fdd45222b1bfb7b421dbdbcb Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:17 +0000
Subject: [PATCH 30/46] sched: Minimise repeated sched_proxy_exec() checking

Peter noted: Compilers are really bad (as in they utterly refuse)
optimizing (even when marked with __pure) the static branch
things, and will happily emit multiple identical in a row.

So pull out the one obvious sched_proxy_exec() branch in
__schedule() and remove some of the 'implicit' ones in that
path.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260324191337.1841376-3-jstultz@google.com
---
 kernel/sched/core.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 29741685762d..f3306d3f2aa1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6597,11 +6597,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 	struct mutex *mutex;
 
 	/* Follow blocked_on chain. */
-	for (p = donor; task_is_blocked(p); p = owner) {
-		mutex = p->blocked_on;
-		/* Something changed in the chain, so pick again */
-		if (!mutex)
-			return NULL;
+	for (p = donor; (mutex = p->blocked_on); p = owner) {
 		/*
 		 * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
 		 * and ensure @owner sticks around.
@@ -6832,12 +6828,14 @@ static void __sched notrace __schedule(int sched_mode)
 	next = pick_next_task(rq, rq->donor, &rf);
 	rq_set_donor(rq, next);
 	rq->next_class = next->sched_class;
-	if (unlikely(task_is_blocked(next))) {
-		next = find_proxy_task(rq, next, &rf);
-		if (!next)
-			goto pick_again;
-		if (next == rq->idle)
-			goto keep_resched;
+	if (sched_proxy_exec()) {
+		if (unlikely(next->blocked_on)) {
+			next = find_proxy_task(rq, next, &rf);
+			if (!next)
+				goto pick_again;
+			if (next == rq->idle)
+				goto keep_resched;
+		}
 	}
 picked:
 	clear_tsk_need_resched(prev);

From f4fe6be82e6d27349de66a42d6d1b2b11dc97a14 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:18 +0000
Subject: [PATCH 31/46] sched: Fix potentially missing balancing with Proxy
 Exec

K Prateek pointed out that with Proxy Exec, we may have cases
where we context switch in __schedule(), while the donor remains
the same. This could cause balancing issues, since the
put_prev_set_next() logic short-cuts if (prev == next). With
proxy-exec prev is the previous donor, and next is the next
donor. Should the donor remain the same, but different tasks are
picked to actually run, the shortcut will have avoided enqueuing
the sched class balance callback.

So, if we are context switching, add logic to catch the
same-donor case, and trigger the put_prev/set_next calls to
ensure the balance callbacks get enqueued.

Closes: https://lore.kernel.org/lkml/20ea3670-c30a-433b-a07f-c4ff98ae2379@amd.com/
Reported-by: K Prateek Nayak <kprateek.nayak@amd.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260324191337.1841376-4-jstultz@google.com
---
 kernel/sched/core.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f3306d3f2aa1..5b7f378af042 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6826,9 +6826,11 @@ static void __sched notrace __schedule(int sched_mode)
 
 pick_again:
 	next = pick_next_task(rq, rq->donor, &rf);
-	rq_set_donor(rq, next);
 	rq->next_class = next->sched_class;
 	if (sched_proxy_exec()) {
+		struct task_struct *prev_donor = rq->donor;
+
+		rq_set_donor(rq, next);
 		if (unlikely(next->blocked_on)) {
 			next = find_proxy_task(rq, next, &rf);
 			if (!next)
@@ -6836,7 +6838,27 @@ static void __sched notrace __schedule(int sched_mode)
 			if (next == rq->idle)
 				goto keep_resched;
 		}
+		if (rq->donor == prev_donor && prev != next) {
+			struct task_struct *donor = rq->donor;
+			/*
+			 * When transitioning like:
+			 *
+			 *         prev         next
+			 * donor:    B            B
+			 * curr:     A          B or C
+			 *
+			 * then put_prev_set_next_task() will not have done
+			 * anything, since B == B. However, A might have
+			 * missed a RT/DL balance opportunity due to being
+			 * on_cpu.
+			 */
+			donor->sched_class->put_prev_task(rq, donor, donor);
+			donor->sched_class->set_next_task(rq, donor, true);
+		}
+	} else {
+		rq_set_donor(rq, next);
 	}
+
 picked:
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();

From fa4a1ff8ab235a308d8c983827657a69649185fd Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:19 +0000
Subject: [PATCH 32/46] locking: Add task::blocked_lock to serialize blocked_on
 state

So far, we have been able to utilize the mutex::wait_lock
for serializing the blocked_on state, but when we move to
proxying across runqueues, we will need to add more state
and a way to serialize changes to this state in contexts
where we don't hold the mutex::wait_lock.

So introduce the task::blocked_lock, which nests under the
mutex::wait_lock in the locking order, and rework the locking
to use it.

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260324191337.1841376-5-jstultz@google.com
---
 include/linux/sched.h        | 48 +++++++++++++-----------------------
 init/init_task.c             |  1 +
 kernel/fork.c                |  1 +
 kernel/locking/mutex-debug.c |  4 +--
 kernel/locking/mutex.c       | 40 +++++++++++++++++++-----------
 kernel/locking/mutex.h       |  6 +++++
 kernel/locking/ww_mutex.h    |  4 +--
 kernel/sched/core.c          |  4 ++-
 8 files changed, 58 insertions(+), 50 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a5d3dbc9cdf..2eef9bc6daaa 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1238,6 +1238,7 @@ struct task_struct {
 #endif
 
 	struct mutex			*blocked_on;	/* lock we're blocked on */
+	raw_spinlock_t			blocked_lock;
 
 #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
 	/*
@@ -2181,57 +2182,42 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
 #ifndef CONFIG_PREEMPT_RT
 static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
 {
-	struct mutex *m = p->blocked_on;
-
-	if (m)
-		lockdep_assert_held_once(&m->wait_lock);
-	return m;
+	lockdep_assert_held_once(&p->blocked_lock);
+	return p->blocked_on;
 }
 
 static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
-	struct mutex *blocked_on = READ_ONCE(p->blocked_on);
-
 	WARN_ON_ONCE(!m);
 	/* The task should only be setting itself as blocked */
 	WARN_ON_ONCE(p != current);
-	/* Currently we serialize blocked_on under the mutex::wait_lock */
-	lockdep_assert_held_once(&m->wait_lock);
+	/* Currently we serialize blocked_on under the task::blocked_lock */
+	lockdep_assert_held_once(&p->blocked_lock);
 	/*
 	 * Check ensure we don't overwrite existing mutex value
 	 * with a different mutex. Note, setting it to the same
 	 * lock repeatedly is ok.
 	 */
-	WARN_ON_ONCE(blocked_on && blocked_on != m);
-	WRITE_ONCE(p->blocked_on, m);
-}
-
-static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m)
-{
-	guard(raw_spinlock_irqsave)(&m->wait_lock);
-	__set_task_blocked_on(p, m);
+	WARN_ON_ONCE(p->blocked_on && p->blocked_on != m);
+	p->blocked_on = m;
 }
 
 static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
-	if (m) {
-		struct mutex *blocked_on = READ_ONCE(p->blocked_on);
-
-		/* Currently we serialize blocked_on under the mutex::wait_lock */
-		lockdep_assert_held_once(&m->wait_lock);
-		/*
-		 * There may be cases where we re-clear already cleared
-		 * blocked_on relationships, but make sure we are not
-		 * clearing the relationship with a different lock.
-		 */
-		WARN_ON_ONCE(blocked_on && blocked_on != m);
-	}
-	WRITE_ONCE(p->blocked_on, NULL);
+	/* Currently we serialize blocked_on under the task::blocked_lock */
+	lockdep_assert_held_once(&p->blocked_lock);
+	/*
+	 * There may be cases where we re-clear already cleared
+	 * blocked_on relationships, but make sure we are not
+	 * clearing the relationship with a different lock.
+	 */
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
+	p->blocked_on = NULL;
 }
 
 static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 {
-	guard(raw_spinlock_irqsave)(&m->wait_lock);
+	guard(raw_spinlock_irqsave)(&p->blocked_lock);
 	__clear_task_blocked_on(p, m);
 }
 #else
diff --git a/init/init_task.c b/init/init_task.c
index 5c838757fc10..b5f48ebdc2b6 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -169,6 +169,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 	.journal_info	= NULL,
 	INIT_CPU_TIMERS(init_task)
 	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
+	.blocked_lock	= __RAW_SPIN_LOCK_UNLOCKED(init_task.blocked_lock),
 	.timer_slack_ns = 50000, /* 50 usec default slack */
 	.thread_pid	= &init_struct_pid,
 	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),
diff --git a/kernel/fork.c b/kernel/fork.c
index bc2bf58b93b6..079802cb6100 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2076,6 +2076,7 @@ __latent_entropy struct task_struct *copy_process(
 	ftrace_graph_init_task(p);
 
 	rt_mutex_init_task(p);
+	raw_spin_lock_init(&p->blocked_lock);
 
 	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index 2c6b02d4699b..cc6aa9c6e981 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -54,13 +54,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 	lockdep_assert_held(&lock->wait_lock);
 
 	/* Current thread can't be already blocked (since it's executing!) */
-	DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task));
+	DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task));
 }
 
 void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
 			 struct task_struct *task)
 {
-	struct mutex *blocked_on = __get_task_blocked_on(task);
+	struct mutex *blocked_on = get_task_blocked_on(task);
 
 	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
 	DEBUG_LOCKS_WARN_ON(waiter->task != task);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 2a1d165b3167..4aa79bcab08c 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -656,6 +656,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 			goto err_early_kill;
 	}
 
+	raw_spin_lock(&current->blocked_lock);
 	__set_task_blocked_on(current, lock);
 	set_current_state(state);
 	trace_contention_begin(lock, LCB_F_MUTEX);
@@ -669,8 +670,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 		 * the handoff.
 		 */
 		if (__mutex_trylock(lock))
-			goto acquired;
+			break;
 
+		raw_spin_unlock(&current->blocked_lock);
 		/*
 		 * Check for signals and kill conditions while holding
 		 * wait_lock. This ensures the lock cancellation is ordered
@@ -693,12 +695,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 
 		first = __mutex_waiter_is_first(lock, &waiter);
 
+		raw_spin_lock_irqsave(&lock->wait_lock, flags);
+		raw_spin_lock(&current->blocked_lock);
 		/*
 		 * As we likely have been woken up by task
 		 * that has cleared our blocked_on state, re-set
 		 * it to the lock we are trying to acquire.
 		 */
-		set_task_blocked_on(current, lock);
+		__set_task_blocked_on(current, lock);
 		set_current_state(state);
 		/*
 		 * Here we order against unlock; we must either see it change
@@ -709,25 +713,33 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 			break;
 
 		if (first) {
-			trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+			bool opt_acquired;
+
 			/*
 			 * mutex_optimistic_spin() can call schedule(), so
-			 * clear blocked on so we don't become unselectable
+			 * we need to release these locks before calling it,
+			 * and clear blocked on so we don't become unselectable
 			 * to run.
 			 */
-			clear_task_blocked_on(current, lock);
-			if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+			__clear_task_blocked_on(current, lock);
+			raw_spin_unlock(&current->blocked_lock);
+			raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+
+			trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+			opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter);
+
+			raw_spin_lock_irqsave(&lock->wait_lock, flags);
+			raw_spin_lock(&current->blocked_lock);
+			__set_task_blocked_on(current, lock);
+
+			if (opt_acquired)
 				break;
-			set_task_blocked_on(current, lock);
 			trace_contention_begin(lock, LCB_F_MUTEX);
 		}
-
-		raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	}
-	raw_spin_lock_irqsave(&lock->wait_lock, flags);
-acquired:
 	__clear_task_blocked_on(current, lock);
 	__set_current_state(TASK_RUNNING);
+	raw_spin_unlock(&current->blocked_lock);
 
 	if (ww_ctx) {
 		/*
@@ -756,11 +768,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
 	return 0;
 
 err:
-	__clear_task_blocked_on(current, lock);
+	clear_task_blocked_on(current, lock);
 	__set_current_state(TASK_RUNNING);
 	__mutex_remove_waiter(lock, &waiter);
 err_early_kill:
-	WARN_ON(__get_task_blocked_on(current));
+	WARN_ON(get_task_blocked_on(current));
 	trace_contention_end(lock, ret);
 	raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
 	debug_mutex_free_waiter(&waiter);
@@ -971,7 +983,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		next = waiter->task;
 
 		debug_mutex_wake_waiter(lock, waiter);
-		__clear_task_blocked_on(next, lock);
+		clear_task_blocked_on(next, lock);
 		wake_q_add(&wake_q, next);
 	}
 
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 9ad4da8cea00..7a8ba13fee94 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -47,6 +47,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock)
 	return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS);
 }
 
+static inline struct mutex *get_task_blocked_on(struct task_struct *p)
+{
+	guard(raw_spinlock_irqsave)(&p->blocked_lock);
+	return __get_task_blocked_on(p);
+}
+
 #ifdef CONFIG_DEBUG_MUTEXES
 extern void debug_mutex_lock_common(struct mutex *lock,
 				    struct mutex_waiter *waiter);
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index 31a785afee6c..e4a81790ea7d 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -289,7 +289,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 		 * blocked_on pointer. Otherwise we can see circular
 		 * blocked_on relationships that can't resolve.
 		 */
-		__clear_task_blocked_on(waiter->task, lock);
+		clear_task_blocked_on(waiter->task, lock);
 		wake_q_add(wake_q, waiter->task);
 	}
 
@@ -347,7 +347,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 			 * are waking the mutex owner, who may be currently
 			 * blocked on a different mutex.
 			 */
-			__clear_task_blocked_on(owner, NULL);
+			clear_task_blocked_on(owner, NULL);
 			wake_q_add(wake_q, owner);
 		}
 		return true;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b7f378af042..1913dbc68eb9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6584,6 +6584,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
  *   p->pi_lock
  *     rq->lock
  *       mutex->wait_lock
+ *         p->blocked_lock
  *
  * Returns the task that is going to be used as execution context (the one
  * that is actually going to be run on cpu_of(rq)).
@@ -6603,8 +6604,9 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 		 * and ensure @owner sticks around.
 		 */
 		guard(raw_spinlock)(&mutex->wait_lock);
+		guard(raw_spinlock)(&p->blocked_lock);
 
-		/* Check again that p is blocked with wait_lock held */
+		/* Check again that p is blocked with blocked_lock held */
 		if (mutex != __get_task_blocked_on(p)) {
 			/*
 			 * Something changed in the blocked_on chain and

From 56f4b24267a643b0b9ab73f09feaaabfee5a37ae Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:20 +0000
Subject: [PATCH 33/46] sched: Fix modifying donor->blocked on without proper
 locking

Introduce an action enum in find_proxy_task() which allows
us to handle work needed to be done outside the mutex.wait_lock
and task.blocked_lock guard scopes.

This ensures proper locking when we clear the donor's blocked_on
pointer in proxy_deactivate(), and the switch statement will be
useful as we add more cases to handle later in this series.

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260324191337.1841376-6-jstultz@google.com
---
 kernel/sched/core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1913dbc68eb9..bf4338f71667 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6568,7 +6568,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
 		 * as unblocked, as we aren't doing proxy-migrations
 		 * yet (more logic will be needed then).
 		 */
-		donor->blocked_on = NULL;
+		clear_task_blocked_on(donor, NULL);
 	}
 	return NULL;
 }
@@ -6592,6 +6592,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
 static struct task_struct *
 find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 {
+	enum { FOUND, DEACTIVATE_DONOR } action = FOUND;
 	struct task_struct *owner = NULL;
 	int this_cpu = cpu_of(rq);
 	struct task_struct *p;
@@ -6625,12 +6626,14 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 
 		if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
 			/* XXX Don't handle blocked owners/delayed dequeue yet */
-			return proxy_deactivate(rq, donor);
+			action = DEACTIVATE_DONOR;
+			break;
 		}
 
 		if (task_cpu(owner) != this_cpu) {
 			/* XXX Don't handle migrations yet */
-			return proxy_deactivate(rq, donor);
+			action = DEACTIVATE_DONOR;
+			break;
 		}
 
 		if (task_on_rq_migrating(owner)) {
@@ -6688,6 +6691,13 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 		 */
 	}
 
+	/* Handle actions we need to do outside of the guard() scope */
+	switch (action) {
+	case DEACTIVATE_DONOR:
+		return proxy_deactivate(rq, donor);
+	case FOUND:
+		/* fallthrough */;
+	}
 	WARN_ON_ONCE(owner && !owner->on_rq);
 	return owner;
 }

From 2d7622669836dcbbb449741b4e6c503ffe005c25 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:21 +0000
Subject: [PATCH 34/46] sched/locking: Add special p->blocked_on==PROXY_WAKING
 value for proxy return-migration

As we add functionality to proxy execution, we may migrate a
donor task to a runqueue where it can't run due to cpu affinity.
Thus, we must be careful to ensure we return-migrate the task
back to a cpu in its cpumask when it becomes unblocked.

Peter helpfully provided the following example with pictures:
"Suppose we have a ww_mutex cycle:

                  ,-+-* Mutex-1 <-.
        Task-A ---' |             | ,-- Task-B
                    `-> Mutex-2 *-+-'

Where Task-A holds Mutex-1 and tries to acquire Mutex-2, and
where Task-B holds Mutex-2 and tries to acquire Mutex-1.

Then the blocked_on->owner chain will go in circles.

        Task-A  -> Mutex-2
          ^          |
          |          v
        Mutex-1 <- Task-B

We need two things:

 - find_proxy_task() to stop iterating the circle;

 - the woken task to 'unblock' and run, such that it can
   back-off and re-try the transaction.

Now, the current code [without this patch] does:
        __clear_task_blocked_on();
        wake_q_add();

And surely clearing ->blocked_on is sufficient to break the
cycle.

Suppose it is Task-B that is made to back-off, then we have:

  Task-A -> Mutex-2 -> Task-B (no further blocked_on)

and it would attempt to run Task-B. Or worse, it could directly
pick Task-B and run it, without ever getting into
find_proxy_task().

Now, here is a problem because Task-B might not be runnable on
the CPU it is currently on; and because !task_is_blocked() we
don't get into the proxy paths, so nobody is going to fix this
up.

Ideally we would have dequeued Task-B alongside of clearing
->blocked_on, but alas, [the lock ordering prevents us from
getting the task_rq_lock() and] spoils things."

Thus we need more than just a binary concept of the task being
blocked on a mutex or not.

So allow setting blocked_on to PROXY_WAKING as a special value
which specifies the task is no longer blocked, but needs to
be evaluated for return migration *before* it can be run.

This will then be used in a later patch to handle proxy
return-migration.

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260324191337.1841376-7-jstultz@google.com
---
 include/linux/sched.h     | 51 +++++++++++++++++++++++++++++++++++++--
 kernel/locking/mutex.c    |  2 +-
 kernel/locking/ww_mutex.h | 16 ++++++------
 kernel/sched/core.c       | 16 ++++++++++++
 4 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2eef9bc6daaa..8ec3b6d7d718 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2180,10 +2180,20 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock);
 })
 
 #ifndef CONFIG_PREEMPT_RT
+
+/*
+ * With proxy exec, if a task has been proxy-migrated, it may be a donor
+ * on a cpu that it can't actually run on. Thus we need a special state
+ * to denote that the task is being woken, but that it needs to be
+ * evaluated for return-migration before it is run. So if the task is
+ * blocked_on PROXY_WAKING, return migrate it before running it.
+ */
+#define PROXY_WAKING ((struct mutex *)(-1L))
+
 static inline struct mutex *__get_task_blocked_on(struct task_struct *p)
 {
 	lockdep_assert_held_once(&p->blocked_lock);
-	return p->blocked_on;
+	return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on;
 }
 
 static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m)
@@ -2211,7 +2221,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *
 	 * blocked_on relationships, but make sure we are not
 	 * clearing the relationship with a different lock.
 	 */
-	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m);
+	WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
 	p->blocked_on = NULL;
 }
 
@@ -2220,6 +2230,35 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m)
 	guard(raw_spinlock_irqsave)(&p->blocked_lock);
 	__clear_task_blocked_on(p, m);
 }
+
+static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
+{
+	/* Currently we serialize blocked_on under the task::blocked_lock */
+	lockdep_assert_held_once(&p->blocked_lock);
+
+	if (!sched_proxy_exec()) {
+		__clear_task_blocked_on(p, m);
+		return;
+	}
+
+	/* Don't set PROXY_WAKING if blocked_on was already cleared */
+	if (!p->blocked_on)
+		return;
+	/*
+	 * There may be cases where we set PROXY_WAKING on tasks that were
+	 * already set to waking, but make sure we are not changing
+	 * the relationship with a different lock.
+	 */
+	WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING);
+	p->blocked_on = PROXY_WAKING;
+}
+
+static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m)
+{
+	guard(raw_spinlock_irqsave)(&p->blocked_lock);
+	__set_task_blocked_on_waking(p, m);
+}
+
 #else
 static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
@@ -2228,6 +2267,14 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute
 static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m)
 {
 }
+
+static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
+{
+}
+
+static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m)
+{
+}
 #endif /* !CONFIG_PREEMPT_RT */
 
 static __always_inline bool need_resched(void)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4aa79bcab08c..7d359647156d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -983,7 +983,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
 		next = waiter->task;
 
 		debug_mutex_wake_waiter(lock, waiter);
-		clear_task_blocked_on(next, lock);
+		set_task_blocked_on_waking(next, lock);
 		wake_q_add(&wake_q, next);
 	}
 
diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h
index e4a81790ea7d..5cd9dfa4b31e 100644
--- a/kernel/locking/ww_mutex.h
+++ b/kernel/locking/ww_mutex.h
@@ -285,11 +285,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter,
 		debug_mutex_wake_waiter(lock, waiter);
 #endif
 		/*
-		 * When waking up the task to die, be sure to clear the
-		 * blocked_on pointer. Otherwise we can see circular
-		 * blocked_on relationships that can't resolve.
+		 * When waking up the task to die, be sure to set the
+		 * blocked_on to PROXY_WAKING. Otherwise we can see
+		 * circular blocked_on relationships that can't resolve.
 		 */
-		clear_task_blocked_on(waiter->task, lock);
+		set_task_blocked_on_waking(waiter->task, lock);
 		wake_q_add(wake_q, waiter->task);
 	}
 
@@ -339,15 +339,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock,
 		 */
 		if (owner != current) {
 			/*
-			 * When waking up the task to wound, be sure to clear the
-			 * blocked_on pointer. Otherwise we can see circular
-			 * blocked_on relationships that can't resolve.
+			 * When waking up the task to wound, be sure to set the
+			 * blocked_on to PROXY_WAKING. Otherwise we can see
+			 * circular blocked_on relationships that can't resolve.
 			 *
 			 * NOTE: We pass NULL here instead of lock, because we
 			 * are waking the mutex owner, who may be currently
 			 * blocked on a different mutex.
 			 */
-			clear_task_blocked_on(owner, NULL);
+			set_task_blocked_on_waking(owner, NULL);
 			wake_q_add(wake_q, owner);
 		}
 		return true;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bf4338f71667..c997d516441d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4239,6 +4239,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		ttwu_queue(p, cpu, wake_flags);
 	}
 out:
+	/*
+	 * For now, if we've been woken up, clear the task->blocked_on
+	 * regardless if it was set to a mutex or PROXY_WAKING so the
+	 * task can run. We will need to be more careful later when
+	 * properly handling proxy migration
+	 */
+	clear_task_blocked_on(p, NULL);
 	if (success)
 		ttwu_stat(p, task_cpu(p), wake_flags);
 
@@ -6600,6 +6607,10 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 
 	/* Follow blocked_on chain. */
 	for (p = donor; (mutex = p->blocked_on); p = owner) {
+		/* if its PROXY_WAKING, resched_idle so ttwu can complete */
+		if (mutex == PROXY_WAKING)
+			return proxy_resched_idle(rq);
+
 		/*
 		 * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
 		 * and ensure @owner sticks around.
@@ -6620,6 +6631,11 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 
 		owner = __mutex_owner(mutex);
 		if (!owner) {
+			/*
+			 * If there is no owner, clear blocked_on
+			 * and return p so it can run and try to
+			 * acquire the lock
+			 */
 			__clear_task_blocked_on(p, mutex);
 			return p;
 		}

From f9530b3183358bbf945f7c20d4a6e2048061ec50 Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:22 +0000
Subject: [PATCH 35/46] sched: Add assert_balance_callbacks_empty helper

With proxy-exec utilizing pick-again logic, we can end up having
balance callbacks set by the preivous pick_next_task() call left
on the list.

So pull the warning out into a helper function, and make sure we
check it when we pick again.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260324191337.1841376-8-jstultz@google.com
---
 kernel/sched/core.c  | 1 +
 kernel/sched/sched.h | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c997d516441d..acb5894e7714 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6853,6 +6853,7 @@ static void __sched notrace __schedule(int sched_mode)
 	}
 
 pick_again:
+	assert_balance_callbacks_empty(rq);
 	next = pick_next_task(rq, rq->donor, &rf);
 	rq->next_class = next->sched_class;
 	if (sched_proxy_exec()) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b863bbda6de8..a2629d025c90 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1857,6 +1857,13 @@ static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
 static inline void scx_rq_clock_invalidate(struct rq *rq) {}
 #endif /* !CONFIG_SCHED_CLASS_EXT */
 
+static inline void assert_balance_callbacks_empty(struct rq *rq)
+{
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+		     rq->balance_callback &&
+		     rq->balance_callback != &balance_push_callback);
+}
+
 /*
  * Lockdep annotation that avoids accidental unlocks; it's like a
  * sticky/continuous lockdep_assert_held().
@@ -1873,7 +1880,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
 
 	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
 	rf->clock_update_flags = 0;
-	WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+	assert_balance_callbacks_empty(rq);
 }
 
 static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)

From 48fda62de67a1e88fc8bada12caf0fc9b45116df Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:23 +0000
Subject: [PATCH 36/46] sched: Add logic to zap balance callbacks if we pick
 again

With proxy-exec, a task is selected to run via pick_next_task(),
and then if it is a mutex blocked task, we call find_proxy_task()
to find a runnable owner. If the runnable owner is on another
cpu, we will need to migrate the selected donor task away, after
which we will pick_again can call pick_next_task() to choose
something else.

However, in the first call to pick_next_task(), we may have
had a balance_callback setup by the class scheduler. After we
pick again, its possible pick_next_task_fair() will be called
which calls sched_balance_newidle() and sched_balance_rq().

This will throw a warning:
[    8.796467] rq->balance_callback && rq->balance_callback != &balance_push_callback
[    8.796467] WARNING: CPU: 32 PID: 458 at kernel/sched/sched.h:1750 sched_balance_rq+0xe92/0x1250
...
[    8.796467] Call Trace:
[    8.796467]  <TASK>
[    8.796467]  ? __warn.cold+0xb2/0x14e
[    8.796467]  ? sched_balance_rq+0xe92/0x1250
[    8.796467]  ? report_bug+0x107/0x1a0
[    8.796467]  ? handle_bug+0x54/0x90
[    8.796467]  ? exc_invalid_op+0x17/0x70
[    8.796467]  ? asm_exc_invalid_op+0x1a/0x20
[    8.796467]  ? sched_balance_rq+0xe92/0x1250
[    8.796467]  sched_balance_newidle+0x295/0x820
[    8.796467]  pick_next_task_fair+0x51/0x3f0
[    8.796467]  __schedule+0x23a/0x14b0
[    8.796467]  ? lock_release+0x16d/0x2e0
[    8.796467]  schedule+0x3d/0x150
[    8.796467]  worker_thread+0xb5/0x350
[    8.796467]  ? __pfx_worker_thread+0x10/0x10
[    8.796467]  kthread+0xee/0x120
[    8.796467]  ? __pfx_kthread+0x10/0x10
[    8.796467]  ret_from_fork+0x31/0x50
[    8.796467]  ? __pfx_kthread+0x10/0x10
[    8.796467]  ret_from_fork_asm+0x1a/0x30
[    8.796467]  </TASK>

This is because if a RT task was originally picked, it will
setup the rq->balance_callback with push_rt_tasks() via
set_next_task_rt().

Once the task is migrated away and we pick again, we haven't
processed any balance callbacks, so rq->balance_callback is not
in the same state as it was the first time pick_next_task was
called.

To handle this, add a zap_balance_callbacks() helper function
which cleans up the balance callbacks without running them. This
should be ok, as we are effectively undoing the state set in
the first call to pick_next_task(), and when we pick again,
the new callback can be configured for the donor task actually
selected.

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260324191337.1841376-9-jstultz@google.com
---
 kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index acb5894e7714..162b24c76077 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4917,6 +4917,34 @@ static inline void finish_task(struct task_struct *prev)
 	smp_store_release(&prev->on_cpu, 0);
 }
 
+/*
+ * Only called from __schedule context
+ *
+ * There are some cases where we are going to re-do the action
+ * that added the balance callbacks. We may not be in a state
+ * where we can run them, so just zap them so they can be
+ * properly re-added on the next time around. This is similar
+ * handling to running the callbacks, except we just don't call
+ * them.
+ */
+static void zap_balance_callbacks(struct rq *rq)
+{
+	struct balance_callback *next, *head;
+	bool found = false;
+
+	lockdep_assert_rq_held(rq);
+
+	head = rq->balance_callback;
+	while (head) {
+		if (head == &balance_push_callback)
+			found = true;
+		next = head->next;
+		head->next = NULL;
+		head = next;
+	}
+	rq->balance_callback = found ? &balance_push_callback : NULL;
+}
+
 static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
 {
 	void (*func)(struct rq *rq);
@@ -6862,10 +6890,14 @@ static void __sched notrace __schedule(int sched_mode)
 		rq_set_donor(rq, next);
 		if (unlikely(next->blocked_on)) {
 			next = find_proxy_task(rq, next, &rf);
-			if (!next)
+			if (!next) {
+				zap_balance_callbacks(rq);
 				goto pick_again;
-			if (next == rq->idle)
+			}
+			if (next == rq->idle) {
+				zap_balance_callbacks(rq);
 				goto keep_resched;
+			}
 		}
 		if (rq->donor == prev_donor && prev != next) {
 			struct task_struct *donor = rq->donor;

From dec9554dc036183c715d02e9cfe48986d453427a Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:24 +0000
Subject: [PATCH 37/46] sched: Move attach_one_task and attach_task helpers to
 sched.h

The fair scheduler locally introduced attach_one_task() and
attach_task() helpers, but these could be generically useful so
move this code to sched.h so we can use them elsewhere.

One minor tweak made to utilize guard(rq_lock)(rq) to simplifiy
the function.

Suggested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://patch.msgid.link/20260324191337.1841376-10-jstultz@google.com
---
 kernel/sched/fair.c  | 26 --------------------------
 kernel/sched/sched.h | 23 +++++++++++++++++++++++
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7f35dd40d7ef..41293d5d9b75 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9946,32 +9946,6 @@ static int detach_tasks(struct lb_env *env)
 	return detached;
 }
 
-/*
- * attach_task() -- attach the task detached by detach_task() to its new rq.
- */
-static void attach_task(struct rq *rq, struct task_struct *p)
-{
-	lockdep_assert_rq_held(rq);
-
-	WARN_ON_ONCE(task_rq(p) != rq);
-	activate_task(rq, p, ENQUEUE_NOCLOCK);
-	wakeup_preempt(rq, p, 0);
-}
-
-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
-	struct rq_flags rf;
-
-	rq_lock(rq, &rf);
-	update_rq_clock(rq);
-	attach_task(rq, p);
-	rq_unlock(rq, &rf);
-}
-
 /*
  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
  * new rq.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a2629d025c90..9594355a3681 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3012,6 +3012,29 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
 
 extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
 
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static inline void attach_task(struct rq *rq, struct task_struct *p)
+{
+	lockdep_assert_rq_held(rq);
+
+	WARN_ON_ONCE(task_rq(p) != rq);
+	activate_task(rq, p, ENQUEUE_NOCLOCK);
+	wakeup_preempt(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static inline void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+	guard(rq_lock)(rq);
+	update_rq_clock(rq);
+	attach_task(rq, p);
+}
+
 #ifdef CONFIG_PREEMPT_RT
 # define SCHED_NR_MIGRATE_BREAK 8
 #else

From b049b81bdff6fc6794200a4c7d7d910e2008d57f Mon Sep 17 00:00:00 2001
From: John Stultz <jstultz@google.com>
Date: Tue, 24 Mar 2026 19:13:25 +0000
Subject: [PATCH 38/46] sched: Handle blocked-waiter migration (and return
 migration)

Add logic to handle migrating a blocked waiter to a remote
cpu where the lock owner is runnable.

Additionally, as the blocked task may not be able to run
on the remote cpu, add logic to handle return migration once
the waiting task is given the mutex.

Because tasks may get migrated to where they cannot run, also
modify the scheduling classes to avoid sched class migrations on
mutex blocked tasks, leaving find_proxy_task() and related logic
to do the migrations and return migrations.

This was split out from the larger proxy patch, and
significantly reworked.

Credits for the original patch go to:
  Peter Zijlstra (Intel) <peterz@infradead.org>
  Juri Lelli <juri.lelli@redhat.com>
  Valentin Schneider <valentin.schneider@arm.com>
  Connor O'Brien <connoro@google.com>

Signed-off-by: John Stultz <jstultz@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260324191337.1841376-11-jstultz@google.com
---
 kernel/sched/core.c | 232 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 194 insertions(+), 38 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 162b24c76077..c15c9865299e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4239,13 +4239,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		ttwu_queue(p, cpu, wake_flags);
 	}
 out:
-	/*
-	 * For now, if we've been woken up, clear the task->blocked_on
-	 * regardless if it was set to a mutex or PROXY_WAKING so the
-	 * task can run. We will need to be more careful later when
-	 * properly handling proxy migration
-	 */
-	clear_task_blocked_on(p, NULL);
 	if (success)
 		ttwu_stat(p, task_cpu(p), wake_flags);
 
@@ -6530,6 +6523,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
 	if (signal_pending_state(task_state, p)) {
 		WRITE_ONCE(p->__state, TASK_RUNNING);
 		*task_state_p = TASK_RUNNING;
+		set_task_blocked_on_waking(p, NULL);
+
 		return false;
 	}
 
@@ -6567,6 +6562,21 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p,
 }
 
 #ifdef CONFIG_SCHED_PROXY_EXEC
+static inline void proxy_set_task_cpu(struct task_struct *p, int cpu)
+{
+	unsigned int wake_cpu;
+
+	/*
+	 * Since we are enqueuing a blocked task on a cpu it may
+	 * not be able to run on, preserve wake_cpu when we
+	 * __set_task_cpu so we can return the task to where it
+	 * was previously runnable.
+	 */
+	wake_cpu = p->wake_cpu;
+	__set_task_cpu(p, cpu);
+	p->wake_cpu = wake_cpu;
+}
+
 static inline struct task_struct *proxy_resched_idle(struct rq *rq)
 {
 	put_prev_set_next_task(rq, rq->donor, rq->idle);
@@ -6575,7 +6585,7 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq)
 	return rq->idle;
 }
 
-static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static bool proxy_deactivate(struct rq *rq, struct task_struct *donor)
 {
 	unsigned long state = READ_ONCE(donor->__state);
 
@@ -6595,17 +6605,140 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor)
 	return try_to_block_task(rq, donor, &state, true);
 }
 
-static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor)
+static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf)
+	__releases(__rq_lockp(rq))
 {
-	if (!__proxy_deactivate(rq, donor)) {
+	/*
+	 * The class scheduler may have queued a balance callback
+	 * from pick_next_task() called earlier.
+	 *
+	 * So here we have to zap callbacks before unlocking the rq
+	 * as another CPU may jump in and call sched_balance_rq
+	 * which can trip the warning in rq_pin_lock() if we
+	 * leave callbacks set.
+	 *
+	 * After we later reaquire the rq lock, we will force __schedule()
+	 * to pick_again, so the callbacks will get re-established.
+	 */
+	zap_balance_callbacks(rq);
+	rq_unpin_lock(rq, rf);
+	raw_spin_rq_unlock(rq);
+}
+
+static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf)
+	__acquires(__rq_lockp(rq))
+{
+	raw_spin_rq_lock(rq);
+	rq_repin_lock(rq, rf);
+	update_rq_clock(rq);
+}
+
+/*
+ * If the blocked-on relationship crosses CPUs, migrate @p to the
+ * owner's CPU.
+ *
+ * This is because we must respect the CPU affinity of execution
+ * contexts (owner) but we can ignore affinity for scheduling
+ * contexts (@p). So we have to move scheduling contexts towards
+ * potential execution contexts.
+ *
+ * Note: The owner can disappear, but simply migrate to @target_cpu
+ * and leave that CPU to sort things out.
+ */
+static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf,
+			       struct task_struct *p, int target_cpu)
+	__must_hold(__rq_lockp(rq))
+{
+	struct rq *target_rq = cpu_rq(target_cpu);
+
+	lockdep_assert_rq_held(rq);
+	WARN_ON(p == rq->curr);
+	/*
+	 * Since we are migrating a blocked donor, it could be rq->donor,
+	 * and we want to make sure there aren't any references from this
+	 * rq to it before we drop the lock. This avoids another cpu
+	 * jumping in and grabbing the rq lock and referencing rq->donor
+	 * or cfs_rq->curr, etc after we have migrated it to another cpu,
+	 * and before we pick_again in __schedule.
+	 *
+	 * So call proxy_resched_idle() to drop the rq->donor references
+	 * before we release the lock.
+	 */
+	proxy_resched_idle(rq);
+
+	deactivate_task(rq, p, DEQUEUE_NOCLOCK);
+	proxy_set_task_cpu(p, target_cpu);
+
+	proxy_release_rq_lock(rq, rf);
+
+	attach_one_task(target_rq, p);
+
+	proxy_reacquire_rq_lock(rq, rf);
+}
+
+static void proxy_force_return(struct rq *rq, struct rq_flags *rf,
+			       struct task_struct *p)
+	__must_hold(__rq_lockp(rq))
+{
+	struct rq *task_rq, *target_rq = NULL;
+	int cpu, wake_flag = WF_TTWU;
+
+	lockdep_assert_rq_held(rq);
+	WARN_ON(p == rq->curr);
+
+	if (p == rq->donor)
+		proxy_resched_idle(rq);
+
+	proxy_release_rq_lock(rq, rf);
+	/*
+	 * We drop the rq lock, and re-grab task_rq_lock to get
+	 * the pi_lock (needed for select_task_rq) as well.
+	 */
+	scoped_guard (task_rq_lock, p) {
+		task_rq = scope.rq;
+
 		/*
-		 * XXX: For now, if deactivation failed, set donor
-		 * as unblocked, as we aren't doing proxy-migrations
-		 * yet (more logic will be needed then).
+		 * Since we let go of the rq lock, the task may have been
+		 * woken or migrated to another rq before we  got the
+		 * task_rq_lock. So re-check we're on the same RQ. If
+		 * not, the task has already been migrated and that CPU
+		 * will handle any futher migrations.
 		 */
-		clear_task_blocked_on(donor, NULL);
+		if (task_rq != rq)
+			break;
+
+		/*
+		 * Similarly, if we've been dequeued, someone else will
+		 * wake us
+		 */
+		if (!task_on_rq_queued(p))
+			break;
+
+		/*
+		 * Since we should only be calling here from __schedule()
+		 * -> find_proxy_task(), no one else should have
+		 * assigned current out from under us. But check and warn
+		 * if we see this, then bail.
+		 */
+		if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) {
+			WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d  on_cpu: %i\n",
+				  __func__, cpu_of(task_rq),
+				  p->comm, p->pid, p->on_cpu);
+			break;
+		}
+
+		update_rq_clock(task_rq);
+		deactivate_task(task_rq, p, DEQUEUE_NOCLOCK);
+		cpu = select_task_rq(p, p->wake_cpu, &wake_flag);
+		set_task_cpu(p, cpu);
+		target_rq = cpu_rq(cpu);
+		clear_task_blocked_on(p, NULL);
 	}
-	return NULL;
+
+	if (target_rq)
+		attach_one_task(target_rq, p);
+
+	proxy_reacquire_rq_lock(rq, rf);
 }
 
 /*
@@ -6626,18 +6759,25 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d
  */
 static struct task_struct *
 find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
+	__must_hold(__rq_lockp(rq))
 {
-	enum { FOUND, DEACTIVATE_DONOR } action = FOUND;
 	struct task_struct *owner = NULL;
+	bool curr_in_chain = false;
 	int this_cpu = cpu_of(rq);
 	struct task_struct *p;
 	struct mutex *mutex;
+	int owner_cpu;
 
 	/* Follow blocked_on chain. */
 	for (p = donor; (mutex = p->blocked_on); p = owner) {
-		/* if its PROXY_WAKING, resched_idle so ttwu can complete */
-		if (mutex == PROXY_WAKING)
-			return proxy_resched_idle(rq);
+		/* if its PROXY_WAKING, do return migration or run if current */
+		if (mutex == PROXY_WAKING) {
+			if (task_current(rq, p)) {
+				clear_task_blocked_on(p, PROXY_WAKING);
+				return p;
+			}
+			goto force_return;
+		}
 
 		/*
 		 * By taking mutex->wait_lock we hold off concurrent mutex_unlock()
@@ -6657,27 +6797,39 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 			return NULL;
 		}
 
+		if (task_current(rq, p))
+			curr_in_chain = true;
+
 		owner = __mutex_owner(mutex);
 		if (!owner) {
 			/*
-			 * If there is no owner, clear blocked_on
-			 * and return p so it can run and try to
-			 * acquire the lock
+			 * If there is no owner, either clear blocked_on
+			 * and return p (if it is current and safe to
+			 * just run on this rq), or return-migrate the task.
 			 */
-			__clear_task_blocked_on(p, mutex);
-			return p;
+			if (task_current(rq, p)) {
+				__clear_task_blocked_on(p, NULL);
+				return p;
+			}
+			goto force_return;
 		}
 
 		if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) {
 			/* XXX Don't handle blocked owners/delayed dequeue yet */
-			action = DEACTIVATE_DONOR;
-			break;
+			if (curr_in_chain)
+				return proxy_resched_idle(rq);
+			goto deactivate;
 		}
 
-		if (task_cpu(owner) != this_cpu) {
-			/* XXX Don't handle migrations yet */
-			action = DEACTIVATE_DONOR;
-			break;
+		owner_cpu = task_cpu(owner);
+		if (owner_cpu != this_cpu) {
+			/*
+			 * @owner can disappear, simply migrate to @owner_cpu
+			 * and leave that CPU to sort things out.
+			 */
+			if (curr_in_chain)
+				return proxy_resched_idle(rq);
+			goto migrate_task;
 		}
 
 		if (task_on_rq_migrating(owner)) {
@@ -6734,16 +6886,20 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf)
 		 * guarantee its existence, as per ttwu_remote().
 		 */
 	}
-
-	/* Handle actions we need to do outside of the guard() scope */
-	switch (action) {
-	case DEACTIVATE_DONOR:
-		return proxy_deactivate(rq, donor);
-	case FOUND:
-		/* fallthrough */;
-	}
 	WARN_ON_ONCE(owner && !owner->on_rq);
 	return owner;
+
+deactivate:
+	if (proxy_deactivate(rq, donor))
+		return NULL;
+	/* If deactivate fails, force return */
+	p = donor;
+force_return:
+	proxy_force_return(rq, rf, p);
+	return NULL;
+migrate_task:
+	proxy_migrate_task(rq, rf, p, owner_cpu);
+	return NULL;
 }
 #else /* SCHED_PROXY_EXEC */
 static struct task_struct *

From 2d4cc371baa5881da45120a65d264a59b486f486 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 27 Mar 2026 14:20:13 +0100
Subject: [PATCH 39/46] sched/fair: Use sched_energy_enabled()

Use helper sched_energy_enabled() everywhere we want to test if EAS is
enabled instead of mixing sched_energy_enabled() and direct call to
static_branch_unlikely().

No functional change

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260327132013.2800517-1-vincent.guittot@linaro.org
---
 kernel/sched/topology.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 43150591914b..5847b83d9d55 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -273,7 +273,7 @@ void rebuild_sched_domains_energy(void)
 static int sched_energy_aware_handler(const struct ctl_table *table, int write,
 		void *buffer, size_t *lenp, loff_t *ppos)
 {
-	int ret, state;
+	int ret;
 
 	if (write && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -289,8 +289,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write,
 
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
-		state = static_branch_unlikely(&sched_energy_present);
-		if (state != sysctl_sched_energy_aware)
+		if (sysctl_sched_energy_aware != sched_energy_enabled())
 			rebuild_sched_domains_energy();
 	}
 
@@ -388,11 +387,11 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp)
 
 static void sched_energy_set(bool has_eas)
 {
-	if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
+	if (!has_eas && sched_energy_enabled()) {
 		if (sched_debug())
 			pr_info("%s: stopping EAS\n", __func__);
 		static_branch_disable_cpuslocked(&sched_energy_present);
-	} else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
+	} else if (has_eas && !sched_energy_enabled()) {
 		if (sched_debug())
 			pr_info("%s: starting EAS\n", __func__);
 		static_branch_enable_cpuslocked(&sched_energy_present);

From 059258b0d424510202b6f2796279dbdbf0c6a83d Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 31 Mar 2026 18:23:52 +0200
Subject: [PATCH 40/46] sched/fair: Prevent negative lag increase during
 delayed dequeue

Delayed dequeue feature aims to reduce the negative lag of a dequeued
task while sleeping but it can happens that newly enqueued tasks will
move backward the avg vruntime and increase its negative lag.
When the delayed dequeued task wakes up, it has more neg lag compared
to being dequeued immediately or to other tasks that have been
dequeued just before theses new enqueues.

Ensure that the negative lag of a delayed dequeued task doesn't
increase during its delayed dequeued phase while waiting for its neg
lag to diseappear. Similarly, we remove any positive lag that the
delayed dequeued task could have gain during thsi period.

Short slice tasks are particularly impacted in overloaded system.

Test on snapdragon rb5:

hackbench -T -p -l 16000000 -g 2 1> /dev/null &
cyclictest -t 1 -i 2777 -D 333 --policy=fair --mlock  -h 20000 -q

The scheduling latency of cyclictest is:

                       tip/sched/core  tip/sched/core    +this patch
cyclictest slice  (ms) (default)2.8             8               8
hackbench slice   (ms) (default)2.8            20              20
Total Samples          |   115632          119733          119806
Average           (us) |      364              64(-82%)        61(- 5%)
Median (P50)      (us) |       60              56(- 7%)        56(  0%)
90th Percentile   (us) |     1166              62(-95%)        62(  0%)
99th Percentile   (us) |     4192              73(-98%)        72(- 1%)
99.9th Percentile (us) |     8528            2707(-68%)      1300(-52%)
Maximum           (us) |    17735           14273(-20%)     13525(- 5%)

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260331162352.551501-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 56 +++++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 41293d5d9b75..597ce5b718d2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -840,11 +840,33 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt
 	return clamp(vlag, -limit, limit);
 }
 
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
+/*
+ * Delayed dequeue aims to reduce the negative lag of a dequeued task. While
+ * updating the lag of an entity, check that negative lag didn't increase
+ * during the delayed dequeue period which would be unfair.
+ * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO
+ * is set.
+ *
+ * Return true if the lag has been adjusted.
+ */
+static __always_inline
+bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
+	bool ret;
+
 	WARN_ON_ONCE(!se->on_rq);
 
-	se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
+	if (se->sched_delayed) {
+		/* previous vlag < 0 otherwise se would not be delayed */
+		vlag = max(vlag, se->vlag);
+		if (sched_feat(DELAY_ZERO))
+			vlag = min(vlag, 0);
+	}
+	ret = (vlag == se->vlag);
+	se->vlag = vlag;
+
+	return ret;
 }
 
 /*
@@ -5564,13 +5586,6 @@ static void clear_delayed(struct sched_entity *se)
 	}
 }
 
-static inline void finish_delayed_dequeue_entity(struct sched_entity *se)
-{
-	clear_delayed(se);
-	if (sched_feat(DELAY_ZERO) && se->vlag > 0)
-		se->vlag = 0;
-}
-
 static bool
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -5596,6 +5611,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		if (sched_feat(DELAY_DEQUEUE) && delay &&
 		    !entity_eligible(cfs_rq, se)) {
 			update_load_avg(cfs_rq, se, 0);
+			update_entity_lag(cfs_rq, se);
 			set_delayed(se);
 			return false;
 		}
@@ -5635,7 +5651,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_group(se);
 
 	if (flags & DEQUEUE_DELAYED)
-		finish_delayed_dequeue_entity(se);
+		clear_delayed(se);
 
 	if (cfs_rq->nr_queued == 0) {
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
@@ -7084,18 +7100,14 @@ requeue_delayed_entity(struct sched_entity *se)
 	WARN_ON_ONCE(!se->sched_delayed);
 	WARN_ON_ONCE(!se->on_rq);
 
-	if (sched_feat(DELAY_ZERO)) {
-		update_entity_lag(cfs_rq, se);
-		if (se->vlag > 0) {
-			cfs_rq->nr_queued--;
-			if (se != cfs_rq->curr)
-				__dequeue_entity(cfs_rq, se);
-			se->vlag = 0;
-			place_entity(cfs_rq, se, 0);
-			if (se != cfs_rq->curr)
-				__enqueue_entity(cfs_rq, se);
-			cfs_rq->nr_queued++;
-		}
+	if (update_entity_lag(cfs_rq, se)) {
+		cfs_rq->nr_queued--;
+		if (se != cfs_rq->curr)
+			__dequeue_entity(cfs_rq, se);
+		place_entity(cfs_rq, se, 0);
+		if (se != cfs_rq->curr)
+			__enqueue_entity(cfs_rq, se);
+		cfs_rq->nr_queued++;
 	}
 
 	update_load_avg(cfs_rq, se, 0);

From c6e80201e057dfb7253385e60bf541121bf5dc33 Mon Sep 17 00:00:00 2001
From: Joseph Salisbury <joseph.salisbury@oracle.com>
Date: Fri, 3 Apr 2026 17:00:14 -0400
Subject: [PATCH 41/46] sched: Use u64 for bandwidth ratio calculations

to_ratio() computes BW_SHIFT-scaled bandwidth ratios from u64 period and
runtime values, but it returns unsigned long.  tg_rt_schedulable() also
stores the current group limit and the accumulated child sum in unsigned
long.

On 32-bit builds, large bandwidth ratios can be truncated and the RT
group sum can wrap when enough siblings are present.  That can let an
overcommitted RT hierarchy pass the schedulability check, and it also
narrows the helper result for other callers.

Return u64 from to_ratio() and use u64 for the RT group totals so
bandwidth ratios are preserved and compared at full width on both 32-bit
and 64-bit builds.

Fixes: b40b2e8eb521 ("sched: rt: multi level group constraints")
Assisted-by: Codex:GPT-5
Signed-off-by: Joseph Salisbury <joseph.salisbury@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20260403210014.2713404-1-joseph.salisbury@oracle.com
---
 kernel/sched/core.c  | 2 +-
 kernel/sched/rt.c    | 2 +-
 kernel/sched/sched.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c15c9865299e..49cd5d217161 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4735,7 +4735,7 @@ void sched_post_fork(struct task_struct *p)
 	scx_post_fork(p);
 }
 
-unsigned long to_ratio(u64 period, u64 runtime)
+u64 to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
 		return BW_UNIT;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4e5f1957b91b..a48e86794913 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2666,7 +2666,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 {
 	struct rt_schedulable_data *d = data;
 	struct task_group *child;
-	unsigned long total, sum = 0;
+	u64 total, sum = 0;
 	u64 period, runtime;
 
 	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9594355a3681..c95584191d58 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2907,7 +2907,7 @@ extern void init_cfs_throttle_work(struct task_struct *p);
 #define MAX_BW_BITS		(64 - BW_SHIFT)
 #define MAX_BW			((1ULL << MAX_BW_BITS) - 1)
 
-extern unsigned long to_ratio(u64 period, u64 runtime);
+extern u64 to_ratio(u64 period, u64 runtime);
 
 extern void init_entity_runnable_average(struct sched_entity *se);
 extern void post_init_entity_util_avg(struct task_struct *p);

From 556146ce5e9476db234134c46ddf0e154ca17028 Mon Sep 17 00:00:00 2001
From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: Tue, 7 Apr 2026 13:36:17 +0200
Subject: [PATCH 42/46] sched/fair: Avoid overflow in enqueue_entity()

Here is one scenario which was triggered when running:

    stress-ng --yield=32 -t 10000000s&
    while true; do perf bench sched messaging -p -t -l 100000 -g 16; done

on a 256CPUs machine after about an hour into the run:

    __enqeue_entity: entity_key(-141245081754) weight(90891264) overflow_mul(5608800059305154560) vlag(57498) delayed?(0)
    cfs_rq: zero_vruntime(3809707759657809) sum_w_vruntime(0) sum_weight(0) nr_queued(1)
    cfs_rq->curr: entity_key(0) vruntime(3809707759657809) deadline(3809723966988476) weight(37)

The above comes from __enqueue_entity() after a place_entity(). Breaking
this down:

    vlag_initial = 57498
    vlag = (57498 * (37 + 90891264)) / 37 = 141,245,081,754

    vruntime = 3809707759657809 - 141245081754 = 3,809,566,514,576,055
    entity_key(se, cfs_rq) = -141,245,081,754

Now, multiplying the entity_key with its own weight results to
5,608,800,059,305,154,560 (same as what overflow_mul() suggests) but
in Python, without overflow, this would be: -1,2837,944,014,404,397,056

Avoid the overflow (without doing the division for avg_vruntime()), by moving
zero_vruntime to the new entity when it is heavier.

Fixes: 4823725d9d1d ("sched/fair: Increase weight bits for avg_vruntime")
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
[peterz: suggested 'weight > load' condition]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260407120052.GG3738010@noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 597ce5b718d2..12890ef16603 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5352,6 +5352,7 @@ static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
 	u64 vslice, vruntime = avg_vruntime(cfs_rq);
+	bool update_zero = false;
 	s64 lag = 0;
 
 	if (!se->custom_slice)
@@ -5368,7 +5369,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
 		struct sched_entity *curr = cfs_rq->curr;
-		long load;
+		long load, weight;
 
 		lag = se->vlag;
 
@@ -5428,14 +5429,41 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		if (curr && curr->on_rq)
 			load += avg_vruntime_weight(cfs_rq, curr->load.weight);
 
-		lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight);
+		weight = avg_vruntime_weight(cfs_rq, se->load.weight);
+		lag *= load + weight;
 		if (WARN_ON_ONCE(!load))
 			load = 1;
 		lag = div64_long(lag, load);
+
+		/*
+		 * A heavy entity (relative to the tree) will pull the
+		 * avg_vruntime close to its vruntime position on enqueue. But
+		 * the zero_vruntime point is only updated at the next
+		 * update_deadline()/place_entity()/update_entity_lag().
+		 *
+		 * Specifically (see the comment near avg_vruntime_weight()):
+		 *
+		 *   sum_w_vruntime = \Sum (v_i - v0) * w_i
+		 *
+		 * Note that if v0 is near a light entity, both terms will be
+		 * small for the light entity, while in that case both terms
+		 * are large for the heavy entity, leading to risk of
+		 * overflow.
+		 *
+		 * OTOH if v0 is near the heavy entity, then the difference is
+		 * larger for the light entity, but the factor is small, while
+		 * for the heavy entity the difference is small but the factor
+		 * is large. Avoiding the multiplication overflow.
+		 */
+		if (weight > load)
+			update_zero = true;
 	}
 
 	se->vruntime = vruntime - lag;
 
+	if (update_zero)
+		update_zero_vruntime(cfs_rq, -lag);
+
 	if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
 		se->deadline += se->vruntime;
 		se->rel_deadline = 0;

From 8b016dcec9365675be81d26be88f2c09cf983bd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?= <mkoutny@suse.com>
Date: Mon, 23 Mar 2026 13:39:37 +0100
Subject: [PATCH 43/46] sched/rt: Skip group schedulable check with
 rt_group_sched=0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The warning from the commit 87f1fb77d87a6 ("sched: Add RT_GROUP WARN
checks for non-root task_groups") is wrong -- it assumes that only
task_groups with rt_rq are traversed, however, the schedulability check
would iterate all task_groups even when rt_group_sched=0 is disabled at
boot time but some non-root task_groups exist.

The schedulability check is supposed to validate:
  a) that children don't overcommit its parent,
  b) no RT task group overcommits global RT limit.
but with rt_group_sched=0 there is no (non-trivial) hierarchy of RT groups,
therefore skip the validation altogether. Otherwise, writes to the
global sched_rt_runtime_us knob will be rejected with incorrect
validation error.

This fix is immaterial with CONFIG_RT_GROUP_SCHED=n.

Fixes: 87f1fb77d87a6 ("sched: Add RT_GROUP WARN checks for non-root task_groups")
Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260323-sched-rert_groups-v3-1-1e7d5ed6b249@suse.com
---
 kernel/sched/rt.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a48e86794913..893e54dab13e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2690,9 +2690,6 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
-	if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group))
-		return -EBUSY;
-
 	total = to_ratio(period, runtime);
 
 	/*
@@ -2836,6 +2833,8 @@ long sched_group_rt_period(struct task_group *tg)
 static int sched_rt_global_constraints(void)
 {
 	int ret = 0;
+	if (!rt_group_sched_enabled())
+		return ret;
 
 	mutex_lock(&rt_constraints_mutex);
 	ret = __rt_schedulable(NULL, 0, 0);

From 4f70a0456d090303d5a6c915dd7d9db9da56cb16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?= <mkoutny@suse.com>
Date: Mon, 23 Mar 2026 13:39:38 +0100
Subject: [PATCH 44/46] sched/rt: Move group schedulability check to
 sched_rt_global_validate()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sched_rt_global_constraints() function is a remnant that used to set
up global RT throttling but that is no more since commit 5f6bd380c7bdb
("sched/rt: Remove default bandwidth control") and the function ended up
only doing schedulability check.
Move the check into the validation function where it fits better.
(The order of validations sched_dl_global_validate() and
sched_rt_global_validate() shouldn't matter.)

Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260323-sched-rert_groups-v3-2-1e7d5ed6b249@suse.com
---
 kernel/sched/rt.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 893e54dab13e..e16d8f1b4aa2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2832,15 +2832,7 @@ long sched_group_rt_period(struct task_group *tg)
 #ifdef CONFIG_SYSCTL
 static int sched_rt_global_constraints(void)
 {
-	int ret = 0;
-	if (!rt_group_sched_enabled())
-		return ret;
-
-	mutex_lock(&rt_constraints_mutex);
-	ret = __rt_schedulable(NULL, 0, 0);
-	mutex_unlock(&rt_constraints_mutex);
-
-	return ret;
+	return 0;
 }
 #endif /* CONFIG_SYSCTL */
 
@@ -2872,6 +2864,13 @@ static int sched_rt_global_validate(void)
 			NSEC_PER_USEC > max_rt_runtime)))
 		return -EINVAL;
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	if (!rt_group_sched_enabled())
+		return 0;
+
+	scoped_guard(mutex, &rt_constraints_mutex)
+		return __rt_schedulable(NULL, 0, 0);
+#endif
 	return 0;
 }
 

From 985215804dcbf02ab675977e770708e3f084e9fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?= <mkoutny@suse.com>
Date: Mon, 23 Mar 2026 13:39:39 +0100
Subject: [PATCH 45/46] sched/rt: Cleanup global RT bandwidth functions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commit 5f6bd380c7bdb ("sched/rt: Remove default bandwidth control")
and followup changes made a few of the functions unnecessary, drop them
for simplicity.

Signed-off-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260323-sched-rert_groups-v3-3-1e7d5ed6b249@suse.com
---
 kernel/sched/rt.c | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e16d8f1b4aa2..4ee8faf01441 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2829,13 +2829,6 @@ long sched_group_rt_period(struct task_group *tg)
 	return rt_period_us;
 }
 
-#ifdef CONFIG_SYSCTL
-static int sched_rt_global_constraints(void)
-{
-	return 0;
-}
-#endif /* CONFIG_SYSCTL */
-
 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
 	/* Don't accept real-time tasks when there is no way for them to run */
@@ -2845,14 +2838,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 	return 1;
 }
 
-#else /* !CONFIG_RT_GROUP_SCHED: */
-
-#ifdef CONFIG_SYSCTL
-static int sched_rt_global_constraints(void)
-{
-	return 0;
-}
-#endif /* CONFIG_SYSCTL */
 #endif /* !CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_SYSCTL
@@ -2874,10 +2859,6 @@ static int sched_rt_global_validate(void)
 	return 0;
 }
 
-static void sched_rt_do_global(void)
-{
-}
-
 static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
 		size_t *lenp, loff_t *ppos)
 {
@@ -2901,11 +2882,6 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
 		if (ret)
 			goto undo;
 
-		ret = sched_rt_global_constraints();
-		if (ret)
-			goto undo;
-
-		sched_rt_do_global();
 		sched_dl_do_global();
 	}
 	if (0) {

From 78cde54ea5f03398f1cf6656de2472068f6da966 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Fri, 10 Apr 2026 15:23:21 +0200
Subject: [PATCH 46/46] sched/eevdf: Clear buddies for preempt_short

next buddy should not prevent shorter slice preemption. Don't take buddy
into account when checking if shorter slice entity can preempt and clear it
if the entity with a shorter slice can preempt current.

Test on snapdragon rb5:
hackbench -T -p -l 16000000 -g 2 1> /dev/null &
hackbench runs in cgroup /test-A
cyclictest -t 1 -i 2777 -D 63 --policy=fair --mlock  -h 20000 -q
cyclictest runs in cgroup /test-B

                     tip/sched/core  tip/sched/core    +this patch
cyclictest slice  (ms) (default)2.8             8               8
hackbench slice   (ms) (default)2.8            20              20
Total Samples          |    22679           22595           22686
Average           (us) |       84              94(-12%)        59( 37%)
Median (P50)      (us) |       56              56(  0%)        56(  0%)
90th Percentile   (us) |       64              65(- 2%)        63(  3%)
99th Percentile   (us) |     1047            1273(-22%)        74( 94%)
99.9th Percentile (us) |     2431            4751(-95%)       663( 86%)
Maximum           (us) |     4694            8655(-84%)      3934( 55%)

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260410132321.2897789-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 12890ef16603..f179faf7a6a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1116,7 +1116,7 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
 	/*
 	 * Picking the ->next buddy will affect latency but not fairness.
 	 */
-	if (sched_feat(PICK_BUDDY) &&
+	if (sched_feat(PICK_BUDDY) && protect &&
 	    cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
 		/* ->next will never be delayed */
 		WARN_ON_ONCE(cfs_rq->next->sched_delayed);
@@ -9138,8 +9138,10 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
 	return;
 
 preempt:
-	if (preempt_action == PREEMPT_WAKEUP_SHORT)
+	if (preempt_action == PREEMPT_WAKEUP_SHORT) {
 		cancel_protect_slice(se);
+		clear_buddies(cfs_rq, se);
+	}
 
 	resched_curr_lazy(rq);
 }