From 9fe89f022c05d99c052d6bc088b82d4ff83bf463 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Jan 2026 16:17:48 +0100 Subject: [PATCH 01/46] sched/fair: More complex proportional newidle balance It turns out that a few workloads (easyWave, fio) have a fairly low success rate on newidle balance, but still benefit greatly from having it anyway. Luckily these workloads have a fairly low newidle rate, so the cost of doing the newidle is relatively low, even if unsuccessful. Add a simple rate based part to the newidle ratio compute, such that low rate newidle will still have a high newidle ratio. This cures the easyWave and fio workloads while not affecting the schbench numbers either (which have a very high newidle rate). Reported-by: Mario Roy Reported-by: "Mohamed Abuelfotoh, Hazem" Signed-off-by: Peter Zijlstra (Intel) Tested-by: Mario Roy Tested-by: "Mohamed Abuelfotoh, Hazem" Link: https://patch.msgid.link/20260127151748.GA1079264@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 1 + kernel/sched/fair.c | 27 +++++++++++++++++++++++++-- kernel/sched/features.h | 1 + kernel/sched/topology.c | 3 +++ 4 files changed, 30 insertions(+), 2 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 45c0022b91ce..a1e1032426dc 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -95,6 +95,7 @@ struct sched_domain { unsigned int newidle_call; unsigned int newidle_success; unsigned int newidle_ratio; + u64 newidle_stamp; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf948db905ed..66afa0ac7396 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12289,7 +12289,30 @@ static inline void update_newidle_stats(struct sched_domain *sd, unsigned int su sd->newidle_success += success; if (sd->newidle_call >= 1024) { - sd->newidle_ratio = sd->newidle_success; + u64 now = sched_clock(); + s64 delta = 
now - sd->newidle_stamp; + sd->newidle_stamp = now; + int ratio = 0; + + if (delta < 0) + delta = 0; + + if (sched_feat(NI_RATE)) { + /* + * ratio delta freq + * + * 1024 - 4 s - 128 Hz + * 512 - 2 s - 256 Hz + * 256 - 1 s - 512 Hz + * 128 - .5 s - 1024 Hz + * 64 - .25 s - 2048 Hz + */ + ratio = delta >> 22; + } + + ratio += sd->newidle_success; + + sd->newidle_ratio = min(1024, ratio); sd->newidle_call /= 2; sd->newidle_success /= 2; } @@ -12996,7 +13019,7 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (sd->flags & SD_BALANCE_NEWIDLE) { unsigned int weight = 1; - if (sched_feat(NI_RANDOM)) { + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) { /* * Throw a 1k sided dice; and only run * newidle_balance according to the success diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 136a6584be79..37d5928fa6dd 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -126,3 +126,4 @@ SCHED_FEAT(LATENCY_WARN, false) * Do newidle balancing proportional to its success rate using randomization. 
*/ SCHED_FEAT(NI_RANDOM, true) +SCHED_FEAT(NI_RATE, true) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 32dcddaead82..061f8c85f555 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -4,6 +4,7 @@ */ #include +#include #include #include "sched.h" @@ -1642,6 +1643,7 @@ sd_init(struct sched_domain_topology_level *tl, struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); int sd_id, sd_weight, sd_flags = 0; struct cpumask *sd_span; + u64 now = sched_clock(); sd_weight = cpumask_weight(tl->mask(tl, cpu)); @@ -1679,6 +1681,7 @@ sd_init(struct sched_domain_topology_level *tl, .newidle_call = 512, .newidle_success = 256, .newidle_ratio = 512, + .newidle_stamp = now, .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, From 4823725d9d1d9cc5b36647e0cb8ff616cad6536f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2025 21:43:56 +0200 Subject: [PATCH 02/46] sched/fair: Increase weight bits for avg_vruntime Due to the zero_vruntime patch, the deltas are now a lot smaller and measurements with kernel-build and hackbench runs show about 45 bits used. This ensures avg_vruntime() tracks the full weight range, reducing numerical artifacts in reweight and the like. Also, let's keep the paranoid debug code around for now. 
Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20260219080624.942813440%40infradead.org --- kernel/sched/debug.c | 14 +++++- kernel/sched/fair.c | 96 +++++++++++++++++++++++++++++++++-------- kernel/sched/features.h | 2 + kernel/sched/sched.h | 3 +- 4 files changed, 94 insertions(+), 21 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b24f40f05019..6246008c431e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,6 +8,7 @@ */ #include #include +#include #include "sched.h" /* @@ -901,10 +902,13 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread; + s64 zero_vruntime = -1, sum_w_vruntime = -1; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); + unsigned int sum_shift; unsigned long flags; + u64 sum_weight; #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, "\n"); @@ -925,6 +929,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) if (last) right_vruntime = last->vruntime; zero_vruntime = cfs_rq->zero_vruntime; + sum_w_vruntime = cfs_rq->sum_w_vruntime; + sum_weight = cfs_rq->sum_weight; + sum_shift = cfs_rq->sum_shift; raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", @@ -933,6 +940,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", SPLIT_NS(zero_vruntime)); + SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime", + sum_w_vruntime, ilog2(abs(sum_w_vruntime))); + SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight", + sum_weight); + SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift); SEQ_printf(m, " .%-30s: 
%Ld.%06ld\n", "avg_vruntime", SPLIT_NS(avg_vruntime(cfs_rq))); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 66afa0ac7396..fdb98d2ea131 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -665,25 +665,83 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * Since zero_vruntime closely tracks the per-task service, these * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag * induced in the system due to quantisation. - * - * Also, we use scale_load_down() to reduce the size. - * - * As measured, the max (key * weight) value was ~44 bits for a kernel build. */ +static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w) +{ +#ifdef CONFIG_64BIT + if (cfs_rq->sum_shift) + w = max(2UL, w >> cfs_rq->sum_shift); +#endif + return w; +} + +static inline void +__sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); + s64 w_vruntime, key = entity_key(cfs_rq, se); + + w_vruntime = key * weight; + WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62)); + + cfs_rq->sum_w_vruntime += w_vruntime; + cfs_rq->sum_weight += weight; +} + +static void +sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + unsigned long weight; + s64 key, tmp; + +again: + weight = avg_vruntime_weight(cfs_rq, se->load.weight); + key = entity_key(cfs_rq, se); + + if (check_mul_overflow(key, weight, &key)) + goto overflow; + + if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp)) + goto overflow; + + cfs_rq->sum_w_vruntime = tmp; + cfs_rq->sum_weight += weight; + return; + +overflow: + /* + * There's gotta be a limit -- if we're still failing at this point + * there's really nothing much to be done about things. 
+ */ + BUG_ON(cfs_rq->sum_shift >= 10); + cfs_rq->sum_shift++; + + /* + * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1 + */ + cfs_rq->sum_w_vruntime = 0; + cfs_rq->sum_weight = 0; + + for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost; + node; node = rb_next(node)) + __sum_w_vruntime_add(cfs_rq, __node_2_se(node)); + + goto again; +} + static void sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight = scale_load_down(se->load.weight); - s64 key = entity_key(cfs_rq, se); + if (sched_feat(PARANOID_AVG)) + return sum_w_vruntime_add_paranoid(cfs_rq, se); - cfs_rq->sum_w_vruntime += key * weight; - cfs_rq->sum_weight += weight; + __sum_w_vruntime_add(cfs_rq, se); } static void sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { - unsigned long weight = scale_load_down(se->load.weight); + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); s64 key = entity_key(cfs_rq, se); cfs_rq->sum_w_vruntime -= key * weight; @@ -725,7 +783,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) s64 runtime = cfs_rq->sum_w_vruntime; if (curr) { - unsigned long w = scale_load_down(curr->load.weight); + unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight); runtime += entity_key(cfs_rq, curr) * w; weight += w; @@ -735,7 +793,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) if (runtime < 0) runtime -= (weight - 1); - delta = div_s64(runtime, weight); + delta = div64_long(runtime, weight); } else if (curr) { /* * When there is but one element, it is the average. 
@@ -801,7 +859,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) long load = cfs_rq->sum_weight; if (curr && curr->on_rq) { - unsigned long weight = scale_load_down(curr->load.weight); + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight); avg += entity_key(cfs_rq, curr) * weight; load += weight; @@ -3871,12 +3929,12 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes. */ - se->vlag = div_s64(se->vlag * se->load.weight, weight); + se->vlag = div64_long(se->vlag * se->load.weight, weight); if (se->rel_deadline) - se->deadline = div_s64(se->deadline * se->load.weight, weight); + se->deadline = div64_long(se->deadline * se->load.weight, weight); if (rel_vprot) - vprot = div_s64(vprot * se->load.weight, weight); + vprot = div64_long(vprot * se->load.weight, weight); update_load_set(&se->load, weight); @@ -5180,7 +5238,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; - unsigned long load; + long load; lag = se->vlag; @@ -5238,12 +5296,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ load = cfs_rq->sum_weight; if (curr && curr->on_rq) - load += scale_load_down(curr->load.weight); + load += avg_vruntime_weight(cfs_rq, curr->load.weight); - lag *= load + scale_load_down(se->load.weight); + lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight); if (WARN_ON_ONCE(!load)) load = 1; - lag = div_s64(lag, load); + lag = div64_long(lag, load); } se->vruntime = vruntime - lag; diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 37d5928fa6dd..a25f97201ab9 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -58,6 +58,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(DELAY_DEQUEUE, true) 
SCHED_FEAT(DELAY_ZERO, true) +SCHED_FEAT(PARANOID_AVG, false) + /* * Allow wakeup-time preemption of the current task: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 43bbf0693cca..8bf2f7d524cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -684,8 +684,9 @@ struct cfs_rq { s64 sum_w_vruntime; u64 sum_weight; - u64 zero_vruntime; + unsigned int sum_shift; + #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; u64 zero_vruntime_fi; From 101f3498b4bdfef97152a444847948de1543f692 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Jan 2026 20:56:23 +0100 Subject: [PATCH 03/46] sched/fair: Revert 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag") Zicheng Qu reported that, because avg_vruntime() always includes cfs_rq->curr, when ->on_rq, place_entity() doesn't work right. Specifically, the lag scaling in place_entity() relies on avg_vruntime() being the state *before* placement of the new entity. However in this case avg_vruntime() will actually already include the entity, which breaks things. Also, Zicheng Qu argues that avg_vruntime should be invariant under reweight. IOW commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag") was wrong! The issue reported in 6d71a9c61604 could possibly be explained by rounding artifacts -- notably the extreme weight '2' is outside of the range of avg_vruntime/sum_w_vruntime, since that uses scale_load_down(). By scaling vruntime by the real weight, but accounting it in vruntime with a factor 1024 more, the average moves significantly. However, that is now cured. Tested by reverting 66951e4860d3 ("sched/fair: Fix update_cfs_group() vs DELAY_DEQUEUE") and tracing vruntime and vlag figures again. 
Reported-by: Zicheng Qu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20260219080625.066102672%40infradead.org --- kernel/sched/fair.c | 148 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 124 insertions(+), 24 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fdb98d2ea131..2b98054cd754 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -822,17 +822,22 @@ static inline u64 cfs_rq_max_slice(struct cfs_rq *cfs_rq); * * -r_max < lag < max(r_max, q) */ -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime) { u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; s64 vlag, limit; - WARN_ON_ONCE(!se->on_rq); - - vlag = avg_vruntime(cfs_rq) - se->vruntime; + vlag = avruntime - se->vruntime; limit = calc_delta_fair(max_slice, se); - se->vlag = clamp(vlag, -limit, limit); + return clamp(vlag, -limit, limit); +} + +static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + WARN_ON_ONCE(!se->on_rq); + + se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); } /* @@ -3898,23 +3903,125 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) se_weight(se) * -se->avg.load_sum); } -static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); +static void +rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot) +{ + unsigned long old_weight = se->load.weight; + + /* + * VRUNTIME + * -------- + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. 
+ * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. 
+ * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + se->vlag = div64_long(se->vlag * old_weight, weight); + + /* + * DEADLINE + * -------- + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + if (se->rel_deadline) + se->deadline = div64_long(se->deadline * old_weight, weight); + + if (rel_vprot) + se->vprot = div64_long(se->vprot * old_weight, weight); +} static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { bool curr = cfs_rq->curr == se; bool rel_vprot = false; - u64 vprot; + u64 avruntime = 0; if (se->on_rq) { /* commit outstanding execution time */ update_curr(cfs_rq); - update_entity_lag(cfs_rq, se); - se->deadline -= se->vruntime; + avruntime = avg_vruntime(cfs_rq); + se->vlag = entity_lag(cfs_rq, se, avruntime); + se->deadline -= avruntime; se->rel_deadline = 1; if (curr && protect_slice(se)) { - vprot = se->vprot - se->vruntime; + se->vprot -= avruntime; rel_vprot = true; } @@ -3925,30 +4032,23 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, } dequeue_load_avg(cfs_rq, se); - /* - * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), - * we need to scale se->vlag when w_i changes. 
- */ - se->vlag = div64_long(se->vlag * se->load.weight, weight); - if (se->rel_deadline) - se->deadline = div64_long(se->deadline * se->load.weight, weight); - - if (rel_vprot) - vprot = div64_long(vprot * se->load.weight, weight); + rescale_entity(se, weight, rel_vprot); update_load_set(&se->load, weight); do { u32 divider = get_pelt_divider(&se->avg); - se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); enqueue_load_avg(cfs_rq, se); if (se->on_rq) { - place_entity(cfs_rq, se, 0); if (rel_vprot) - se->vprot = se->vruntime + vprot; + se->vprot += avruntime; + se->deadline += avruntime; + se->rel_deadline = 0; + se->vruntime = avruntime - se->vlag; + update_load_add(&cfs_rq->load, se->load.weight); if (!curr) __enqueue_entity(cfs_rq, se); @@ -5306,7 +5406,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->vruntime = vruntime - lag; - if (se->rel_deadline) { + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { se->deadline += se->vruntime; se->rel_deadline = 0; return; From db4551e2ba346663b7b16f0b5d36d308b615c50e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Feb 2026 17:07:58 +0100 Subject: [PATCH 04/46] sched/fair: Use full weight to __calc_delta() Since we now use the full weight for avg_vruntime(), also make __calc_delta() use the full value. Since weight is effectively NICE_0_LOAD, this is 20 bits on 64bit. This leaves 44 bits for delta_exec, which is ~16k seconds, way longer than any one tick would ever be, so no worry about overflow. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: K Prateek Nayak Tested-by: Shubhang Kaushik Link: https://patch.msgid.link/20260219080625.183283814%40infradead.org --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b98054cd754..23315c294da1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -225,6 +225,7 @@ void __init sched_init_granularity(void) update_sysctl(); } +#ifndef CONFIG_64BIT #define WMULT_CONST (~0U) #define WMULT_SHIFT 32 @@ -283,6 +284,12 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight return mul_u64_u32_shr(delta_exec, fact, shift); } +#else +static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) +{ + return (delta_exec * weight) / lw->weight; +} +#endif /* * delta /= w From 9264758066061e660c86e48cff1bac4a58a7324a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Feb 2026 11:17:51 +0100 Subject: [PATCH 05/46] sched/fair: Update overutilized detection Checking uclamp_min is useless and counterproductive for overutilized state as misfit can now happen without being in overutilized state. Since commit e5ed0550c04c ("sched/fair: unlink misfit task from cpu overutilized") util_fits_cpu returns -1 when uclamp_min is above capacity which is not considered as cpu overutilized. Remove the useless rq_util_min parameter. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Qais Yousef Reviewed-by: Christian Loehle Link: https://patch.msgid.link/20260213101751.3121899-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 23315c294da1..b8b052b2149b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7018,16 +7018,15 @@ static inline void hrtick_update(struct rq *rq) static inline bool cpu_overutilized(int cpu) { - unsigned long rq_util_min, rq_util_max; + unsigned long rq_util_max; if (!sched_energy_enabled()) return false; - rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); /* Return true only if the utilization doesn't fit CPU's capacity */ - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); + return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu); } /* From d3d663faa1d4e86491b77ab72eabc3ea2f58b197 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 6 Feb 2026 10:54:54 +0100 Subject: [PATCH 06/46] sched/fair: Filter false overloaded_group case for EAS With EAS, a group should be set overloaded if at least 1 CPU in the group is overutilized but it can happen that a CPU is fully utilized by tasks because of clamping the compute capacity of the CPU. In such case, the CPU is not overutilized and as a result should not be set overloaded as well. group_overloaded being a higher priority than group_misfit, such group can be selected as the busiest group instead of a group with a misfit task and prevents load_balance from selecting the CPU with the misfit task to pull the latter on a fitting CPU. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Pierre Gondois Link: https://patch.msgid.link/20260206095454.1520619-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b8b052b2149b..966e25282215 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10211,6 +10211,7 @@ struct sg_lb_stats { unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ unsigned int group_smt_balance; /* Task on busy SMT be moved */ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -10443,6 +10444,13 @@ group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) static inline bool group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { + /* + * With EAS and uclamp, 1 CPU in the group must be overutilized to + * consider the group overloaded. + */ + if (sched_energy_enabled() && !sgs->group_overutilized) + return false; + if (sgs->sum_nr_running <= sgs->group_weight) return false; @@ -10626,14 +10634,12 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. 
* @sg_overloaded: sched_group is overloaded - * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, - bool *sg_overloaded, - bool *sg_overutilized) + bool *sg_overloaded) { int i, nr_running, local_group, sd_flags = env->sd->flags; bool balancing_at_rd = !env->sd->parent; @@ -10655,7 +10661,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->sum_nr_running += nr_running; if (cpu_overutilized(i)) - *sg_overutilized = 1; + sgs->group_overutilized = 1; /* * No need to call idle_cpu() if nr_running is not 0 @@ -11326,13 +11332,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd update_group_capacity(env->sd, env->dst_cpu); } - update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded); if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; sds->busiest_stat = *sgs; } + sg_overutilized |= sgs->group_overutilized; + /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; sds->total_capacity += sgs->group_capacity; From c0e1832ba6dad7057acf3f485a87e0adccc23141 Mon Sep 17 00:00:00 2001 From: Dengjun Su Date: Wed, 4 Feb 2026 19:59:29 +0800 Subject: [PATCH 07/46] sched: Fix incorrect schedstats for rt and dl thread For RT and DL thread, only 'set_next_task_(rt/dl)' will call 'update_stats_wait_end_(rt/dl)' to update schedstats information. However, during the migration process, 'update_stats_wait_start_(rt/dl)' will be called twice, which will cause the values of wait_max and wait_sum to be incorrect. 
The specific output as follows: $ cat /proc/6046/task/6046/sched | grep wait wait_start : 0.000000 wait_max : 496717.080029 wait_sum : 7921540.776553 A complete schedstats information update flow of migrate should be __update_stats_wait_start() [enter queue A, stage 1] -> __update_stats_wait_end() [leave queue A, stage 2] -> __update_stats_wait_start() [enter queue B, stage 3] -> __update_stats_wait_end() [start running on queue B, stage 4] Stage 1: prev_wait_start is 0, and in the end, wait_start records the time of entering the queue. Stage 2: task_on_rq_migrating(p) is true, and wait_start is updated to the waiting time on queue A. Stage 3: prev_wait_start is the waiting time on queue A, wait_start is the time of entering queue B, and wait_start is expected to be greater than prev_wait_start. Under this condition, wait_start is updated to (the moment of entering queue B) - (the waiting time on queue A). Stage 4: the final wait time = (time when starting to run on queue B) - (time of entering queue B) + (waiting time on queue A) = waiting time on queue B + waiting time on queue A. The current problem is that stage 2 does not call __update_stats_wait_end to update wait_start, which causes the final computed wait time = waiting time on queue B + the moment of entering queue A, leading to incorrect wait_max and wait_sum. Add 'update_stats_wait_end_(rt/dl)' in 'update_stats_dequeue_(rt/dl)' to update schedstats information when dequeue_task. 
Signed-off-by: Dengjun Su Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260204115959.3183567-1-dengjun.su@mediatek.com --- kernel/sched/deadline.c | 4 ++++ kernel/sched/rt.c | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d08b00429323..2de5727b94b4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2142,10 +2142,14 @@ update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags) { struct task_struct *p = dl_task_of(dl_se); + struct rq *rq = rq_of_dl_rq(dl_rq); if (!schedstat_enabled()) return; + if (p != rq->curr) + update_stats_wait_end_dl(dl_rq, dl_se); + if ((flags & DEQUEUE_SLEEP)) { unsigned int state; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f69e1f16d923..3d823f5ffe2c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1302,13 +1302,18 @@ update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int flags) { struct task_struct *p = NULL; + struct rq *rq = rq_of_rt_rq(rt_rq); if (!schedstat_enabled()) return; - if (rt_entity_is_task(rt_se)) + if (rt_entity_is_task(rt_se)) { p = rt_task_of(rt_se); + if (p != rq->curr) + update_stats_wait_end_rt(rt_rq, rt_se); + } + if ((flags & DEQUEUE_SLEEP) && p) { unsigned int state; From c2a57380df9dd5df6fae11c6ba9f624b9cad3e6a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 7 Nov 2025 10:24:52 +0100 Subject: [PATCH 08/46] sched: Replace use of system_unbound_wq with system_dfl_wq Currently if a user enqueues a work item using schedule_delayed_work() the used wq is "system_wq" (per-cpu wq) while queue_delayed_work() use WORK_CPU_UNBOUND (used when a cpu is not specified). The same applies to schedule_work() that is using system_wq and queue_work(), that makes use again of WORK_CPU_UNBOUND. This lack of consistency cannot be addressed without refactoring the API. For more details see the Link tag below. 
This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") Switch to using system_dfl_wq because system_unbound_wq is going away as part of a workqueue restructuring. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Link: https://patch.msgid.link/20251107092452.43399-1-marco.crivellari@suse.com --- kernel/sched/core.c | 4 ++-- kernel/sched/ext.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7f77c165a6e..bfd280ec0f97 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5678,7 +5678,7 @@ static void sched_tick_remote(struct work_struct *work) os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); if (os == TICK_SCHED_REMOTE_RUNNING) - queue_delayed_work(system_unbound_wq, dwork, HZ); + queue_delayed_work(system_dfl_wq, dwork, HZ); } static void sched_tick_start(int cpu) @@ -5697,7 +5697,7 @@ static void sched_tick_start(int cpu) if (os == TICK_SCHED_REMOTE_OFFLINE) { twork->cpu = cpu; INIT_DELAYED_WORK(&twork->work, sched_tick_remote); - queue_delayed_work(system_unbound_wq, &twork->work, HZ); + queue_delayed_work(system_dfl_wq, &twork->work, HZ); } } diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 06cc0a4aec66..a448a8407d8e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2762,7 +2762,7 @@ static void scx_watchdog_workfn(struct work_struct *work) cond_resched(); } - queue_delayed_work(system_unbound_wq, to_delayed_work(work), + queue_delayed_work(system_dfl_wq, to_delayed_work(work), scx_watchdog_timeout / 2); } @@ -5059,7 +5059,7 @@ static int scx_enable(struct 
sched_ext_ops *ops, struct bpf_link *link) WRITE_ONCE(scx_watchdog_timeout, timeout); WRITE_ONCE(scx_watchdog_timestamp, jiffies); - queue_delayed_work(system_unbound_wq, &scx_watchdog_work, + queue_delayed_work(system_dfl_wq, &scx_watchdog_work, scx_watchdog_timeout / 2); /* From fd54d81c2c0e6cffd5470c2c27fbb04d0ebe7da0 Mon Sep 17 00:00:00 2001 From: Christian Loehle Date: Tue, 3 Feb 2026 18:49:39 +0000 Subject: [PATCH 09/46] sched/fair: Skip SCHED_IDLE rq for SCHED_IDLE task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPUs whose rq only have SCHED_IDLE tasks running are considered to be equivalent to truly idle CPUs during wakeup path. For fork and exec SCHED_IDLE is even preferred. This is based on the assumption that the SCHED_IDLE CPU is not in an idle state and might be in a higher P-state, allowing the task/wakee to run immediately without sharing the rq. However this assumption doesn't hold if the wakee has SCHED_IDLE policy itself, as it will share the rq with existing SCHED_IDLE tasks. In this case, we are better off continuing to look for a truly idle CPU. 
On an Intel Xeon 2-socket with 64 logical cores in total this yields for kernel compilation using SCHED_IDLE: +---------+----------------------+----------------------+--------+ | workers | mainline (seconds) | patch (seconds) | delta% | +=========+======================+======================+========+ | 1 | 4384.728 ± 21.085 | 3843.250 ± 16.235 | -12.35 | | 2 | 2242.513 ± 2.099 | 1971.696 ± 2.842 | -12.08 | | 4 | 1199.324 ± 1.823 | 1033.744 ± 1.803 | -13.81 | | 8 | 649.083 ± 1.959 | 559.123 ± 4.301 | -13.86 | | 16 | 370.425 ± 0.915 | 325.906 ± 4.623 | -12.02 | | 32 | 234.651 ± 2.255 | 217.266 ± 0.253 | -7.41 | | 64 | 202.286 ± 1.452 | 197.977 ± 2.275 | -2.13 | | 128 | 217.092 ± 1.687 | 212.164 ± 1.138 | -2.27 | +---------+----------------------+----------------------+--------+ Signed-off-by: Christian Loehle Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Tested-by: K Prateek Nayak Link: https://patch.msgid.link/20260203184939.2138022-1-christian.loehle@arm.com --- kernel/sched/fair.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 966e25282215..d57c02e82f3a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7064,9 +7064,15 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -static int sched_idle_cpu(int cpu) +static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p) { - return sched_idle_rq(cpu_rq(cpu)); + return sched_idle_rq(rq) && !task_has_idle_policy(p); +} + +static int choose_idle_cpu(int cpu, struct task_struct *p) +{ + return available_idle_cpu(cpu) || + choose_sched_idle_rq(cpu_rq(cpu), p); } static void @@ -7631,7 +7637,7 @@ sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct * if (!sched_core_cookie_match(rq, p)) continue; - if (sched_idle_cpu(i)) + if (choose_sched_idle_rq(rq, p)) return i; if (available_idle_cpu(i)) { @@ -7722,8 +7728,7 @@ static inline int
sched_balance_find_dst_cpu(struct sched_domain *sd, struct tas static inline int __select_idle_cpu(int cpu, struct task_struct *p) { - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && - sched_cpu_cookie_match(cpu_rq(cpu), p)) + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p)) return cpu; return -1; @@ -7796,7 +7801,8 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu if (!available_idle_cpu(cpu)) { idle = false; if (*idle_cpu == -1) { - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { + if (choose_sched_idle_rq(cpu_rq(cpu), p) && + cpumask_test_cpu(cpu, cpus)) { *idle_cpu = cpu; break; } @@ -7831,7 +7837,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t */ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + if (choose_idle_cpu(cpu, p)) return cpu; } @@ -7953,7 +7959,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) + if (!choose_idle_cpu(cpu, p)) continue; fits = util_fits_cpu(task_util, util_min, util_max, cpu); @@ -8024,7 +8030,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ lockdep_assert_irqs_disabled(); - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + if (choose_idle_cpu(target, p) && asym_fits_cpu(task_util, util_min, util_max, target)) return target; @@ -8032,7 +8038,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + choose_idle_cpu(prev, p) && asym_fits_cpu(task_util, util_min, util_max, prev)) { if (!static_branch_unlikely(&sched_cluster_active) || @@ 
-8064,7 +8070,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && + choose_idle_cpu(recent_used_cpu, p) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { @@ -12531,7 +12537,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { int continue_balancing = 1; int cpu = rq->cpu; - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); + int busy = idle != CPU_IDLE && !sched_idle_rq(rq); unsigned long interval; struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ @@ -12569,7 +12575,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * state even if we migrated tasks. Update it. */ idle = idle_cpu(cpu); - busy = !idle && !sched_idle_cpu(cpu); + busy = !idle && !sched_idle_rq(rq); } sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); From 2e7af192697ef2a71c76fd57860b0fcd02754e14 Mon Sep 17 00:00:00 2001 From: Tommaso Cucinotta Date: Fri, 12 Sep 2025 07:38:29 +0200 Subject: [PATCH 10/46] sched/deadline: Add reporting of runtime left & abs deadline to sched_getattr() for DEADLINE tasks The SCHED_DEADLINE scheduler allows reading the statically configured run-time, deadline, and period parameters through the sched_getattr() system call. However, there is no immediate way to access, from user space, the current parameters used within the scheduler: the instantaneous runtime left in the current cycle, as well as the current absolute deadline. 
The `flags' sched_getattr() parameter, so far mandated to contain zero, now supports the SCHED_GETATTR_FLAG_DL_DYNAMIC=1 flag, to request retrieval of the leftover runtime and absolute deadline, converted to a CLOCK_MONOTONIC reference, instead of the statically configured parameters. This feature is useful for adaptive SCHED_DEADLINE tasks that need to modify their behavior depending on whether or not there is enough runtime left in the current period, and/or what is the current absolute deadline. Notes: - before returning the instantaneous parameters, the runtime is updated; - the abs deadline is returned shifted from rq_clock() to ktime_get_ns(), in CLOCK_MONOTONIC reference; this causes multiple invocations from the same period to return values that may differ for a few ns (showing some small drift), albeit the deadline doesn't move, in rq_clock() reference; - the abs deadline value returned to user-space, as unsigned 64-bit value, can represent nearly 585 years since boot time; - setting flags=0 provides the old behavior (retrieve static parameters). See also the notes from discussion held at OSPM 2025 on the topic "Making user space aware of current deadline-scheduler parameters". 
Signed-off-by: Tommaso Cucinotta Signed-off-by: Peter Zijlstra (Intel) Tested-by: Matteo Martelli Link: https://patch.msgid.link/20250912053937.31636-2-tommaso.cucinotta@santannapisa.it --- include/uapi/linux/sched.h | 3 +++ kernel/sched/deadline.c | 19 ++++++++++++++++--- kernel/sched/sched.h | 2 +- kernel/sched/syscalls.c | 16 +++++++++++----- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..52b69ce89368 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -146,4 +146,7 @@ struct clone_args { SCHED_FLAG_KEEP_ALL | \ SCHED_FLAG_UTIL_CLAMP) +/* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */ +#define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01 + #endif /* _UAPI_LINUX_SCHED_H */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2de5727b94b4..9e253a825f39 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3617,13 +3617,26 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); } -void __getparam_dl(struct task_struct *p, struct sched_attr *attr) +void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { struct sched_dl_entity *dl_se = &p->dl; + struct rq *rq = task_rq(p); + u64 adj_deadline; attr->sched_priority = p->rt_priority; - attr->sched_runtime = dl_se->dl_runtime; - attr->sched_deadline = dl_se->dl_deadline; + if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) { + guard(raw_spinlock_irq)(&rq->__lock); + update_rq_clock(rq); + if (task_current(rq, p)) + update_curr_dl(rq); + + attr->sched_runtime = dl_se->runtime; + adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns(); + attr->sched_deadline = adj_deadline; + } else { + attr->sched_runtime = dl_se->dl_runtime; + attr->sched_deadline = dl_se->dl_deadline; + } attr->sched_period = dl_se->dl_period; attr->sched_flags &= 
~SCHED_DL_FLAGS; attr->sched_flags |= dl_se->flags; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8bf2f7d524cd..fa2237e89bee 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -356,7 +356,7 @@ extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); -extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); +extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags); extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 6f10db3646e7..a288ac0a633d 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -881,10 +881,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a return -E2BIG; } -static void get_params(struct task_struct *p, struct sched_attr *attr) +static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags) { if (task_has_dl_policy(p)) { - __getparam_dl(p, attr); + __getparam_dl(p, attr, flags); } else if (task_has_rt_policy(p)) { attr->sched_priority = p->rt_priority; } else { @@ -950,7 +950,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, return -ESRCH; if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); + get_params(p, &attr, 0); return sched_setattr(p, &attr); } @@ -1035,7 +1035,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || - usize < SCHED_ATTR_SIZE_VER0 || flags)) + usize < 
SCHED_ATTR_SIZE_VER0)) return -EINVAL; scoped_guard (rcu) { @@ -1043,6 +1043,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, if (!p) return -ESRCH; + if (flags) { + if (!task_has_dl_policy(p) || + flags != SCHED_GETATTR_FLAG_DL_DYNAMIC) + return -EINVAL; + } + retval = security_task_getscheduler(p); if (retval) return retval; @@ -1050,7 +1056,7 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); + get_params(p, &kattr, flags); kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK From 4b9ef32c57a68eb98c45835c2beaa77f8e51c5c4 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Tue, 17 Feb 2026 00:49:48 +0800 Subject: [PATCH 11/46] x86/mm/tlb: Make enter_lazy_tlb() always inline on x86 enter_lazy_tlb() on x86 is short enough, and is called in context switching, which is the hot code path. Make enter_lazy_tlb() always inline on x86 to optimize performance. 
Suggested-by: Dave Hansen Signed-off-by: Xie Yuanbin Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20260216164950.147617-2-qq570070308@gmail.com --- arch/x86/include/asm/mmu_context.h | 3 --- arch/x86/include/asm/tlbflush.h | 26 ++++++++++++++++++++++++++ arch/x86/mm/tlb.c | 21 --------------------- 3 files changed, 26 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 1acafb1c6a93..ef5b507de34e 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -136,9 +136,6 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm) } #endif -#define enter_lazy_tlb enter_lazy_tlb -extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); - extern void mm_init_global_asid(struct mm_struct *mm); extern void mm_free_global_asid(struct mm_struct *mm); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 5a3cdc439e38..0545fe75c3fa 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -172,6 +172,28 @@ struct tlb_state_shared { }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); +/* + * Please ignore the name of this function. It should be called + * switch_to_kernel_thread(). + * + * enter_lazy_tlb() is a hint from the scheduler that we are entering a + * kernel thread or other context without an mm. Acceptable implementations + * include doing nothing whatsoever, switching to init_mm, or various clever + * lazy tricks to try to minimize TLB flushes. + * + * The scheduler reserves the right to call enter_lazy_tlb() several times + * in a row. It will notify us that we're going back to a real mm by + * calling switch_mm_irqs_off(). 
+ */ +#define enter_lazy_tlb enter_lazy_tlb +static __always_inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) + return; + + this_cpu_write(cpu_tlbstate_shared.is_lazy, true); +} + bool nmi_uaccess_okay(void); #define nmi_uaccess_okay nmi_uaccess_okay @@ -480,6 +502,10 @@ static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask) { } #endif +#else /* !MODULE */ +#define enter_lazy_tlb enter_lazy_tlb +extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) + __compiletime_error("enter_lazy_tlb() should not be used in modules"); #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 621e09d049cb..af43d177087e 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -971,27 +971,6 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, } } -/* - * Please ignore the name of this function. It should be called - * switch_to_kernel_thread(). - * - * enter_lazy_tlb() is a hint from the scheduler that we are entering a - * kernel thread or other context without an mm. Acceptable implementations - * include doing nothing whatsoever, switching to init_mm, or various clever - * lazy tricks to try to minimize TLB flushes. - * - * The scheduler reserves the right to call enter_lazy_tlb() several times - * in a row. It will notify us that we're going back to a real mm by - * calling switch_mm_irqs_off(). - */ -void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ - if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) - return; - - this_cpu_write(cpu_tlbstate_shared.is_lazy, true); -} - /* * Using a temporary mm allows to set temporary mappings that are not accessible * by other CPUs. 
Such mappings are needed to perform sensitive memory writes From 54a66e431eeacf23e1dc47cb3507f2d0c068aaf0 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Tue, 17 Feb 2026 00:49:49 +0800 Subject: [PATCH 12/46] sched/headers: Inline raw_spin_rq_unlock() raw_spin_rq_unlock() is short, and is called in some hot code paths such as finish_lock_switch(). Inline raw_spin_rq_unlock() to micro-optimize performance a bit. Signed-off-by: Xie Yuanbin Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20260216164950.147617-3-qq570070308@gmail.com --- kernel/sched/core.c | 5 ----- kernel/sched/sched.h | 9 ++++++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bfd280ec0f97..b59bab255e57 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -687,11 +687,6 @@ bool raw_spin_rq_trylock(struct rq *rq) } } -void raw_spin_rq_unlock(struct rq *rq) -{ - raw_spin_unlock(rq_lockp(rq)); -} - /* * double_rq_lock - safely lock two runqueues */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fa2237e89bee..953d89d71804 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1607,15 +1607,18 @@ extern void raw_spin_rq_lock_nested(struct rq *rq, int subclass) extern bool raw_spin_rq_trylock(struct rq *rq) __cond_acquires(true, __rq_lockp(rq)); -extern void raw_spin_rq_unlock(struct rq *rq) - __releases(__rq_lockp(rq)); - static inline void raw_spin_rq_lock(struct rq *rq) __acquires(__rq_lockp(rq)) { raw_spin_rq_lock_nested(rq, 0); } +static inline void raw_spin_rq_unlock(struct rq *rq) + __releases(__rq_lockp(rq)) +{ + raw_spin_unlock(rq_lockp(rq)); +} + static inline void raw_spin_rq_lock_irq(struct rq *rq) __acquires(__rq_lockp(rq)) { From 49b76317592ecbaefd0969d51d02019966cc994b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 1 Mar 2026 16:52:37 -0800 Subject: [PATCH 13/46] sched/wait: correct kernel-doc descriptions Use the correct function name 
and function parameter name to avoid these kernel-doc warnings: Warning: include/linux/wait_bit.h:424 expecting prototype for wait_var_event_killable(). Prototype was for wait_var_event_interruptible() instead Warning: include/linux/wait_bit.h:508 function parameter 'lock' not described in 'wait_var_event_mutex' Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260302005237.3473095-1-rdunlap@infradead.org --- include/linux/wait_bit.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 9e29d79fc790..ace7379d627d 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h @@ -406,7 +406,7 @@ do { \ schedule()) /** - * wait_var_event_killable - wait for a variable to be updated and notified + * wait_var_event_interruptible - wait for a variable to be updated and notified * @var: the address of variable being waited on * @condition: the condition to wait for * @@ -492,7 +492,7 @@ do { \ * wait_var_event_mutex - wait for a variable to be updated under a mutex * @var: the address of the variable being waited on * @condition: condition to wait for - * @mutex: the mutex which protects updates to the variable + * @lock: the mutex which protects updates to the variable * * Wait for a condition which can only be reliably tested while holding * a mutex. The variables assessed in the condition will normal be From 8d16e3c6f844823812f872df5ef1d3d2ed11b956 Mon Sep 17 00:00:00 2001 From: Zhan Xusheng Date: Mon, 9 Mar 2026 10:42:47 +0800 Subject: [PATCH 14/46] sched/fair: Fix comma operator misuse in NUMA fault accounting Replace the comma operator with separate statements when assigning NUMA fault statistics. This improves readability and follows kernel coding style. 
Signed-off-by: Zhan Xusheng Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260309024247.10908-1-zhanxusheng@xiaomi.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d57c02e82f3a..c1e5c8298fd1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -14226,7 +14226,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)]; } if (ng) { - gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)], + gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)]; gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)]; } print_numa_stats(m, node, tsf, tpf, gsf, gpf); From 8e8e23dea43e64ddafbd1246644c3219209be113 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:26 +0000 Subject: [PATCH 15/46] sched/topology: Compute sd_weight considering cpuset partitions The "sd_weight" used for calculating the load balancing interval, and its limits, considers the span weight of the entire topology level without accounting for cpuset partitions. For example, consider a large system of 128CPUs divided into 8 * 16CPUs partition which is typical when deploying virtual machines: [ PKG Domain: 128CPUs ] [Partition0: 16CPUs][Partition1: 16CPUs] ... [Partition7: 16CPUs] Although each partition only contains 16CPUs, the load balancing interval is set to a minimum of 128 jiffies considering the span of the entire domain with 128CPUs which can lead to longer imbalances within the partition although balancing within is cheaper with 16CPUs. Compute the "sd_weight" after computing the "sd_span" considering the cpu_map covered by the partition, and set the load balancing interval, and its limits accordingly. 
For the above example, the balancing intervals for the partition's PKG domain change as follows: before after balance_interval 128 16 min_interval 128 16 max_interval 256 32 Intervals are now proportional to the CPUs in the partitioned domain as was intended by the original formula. Fixes: cb83b629bae03 ("sched/numa: Rewrite the CONFIG_NUMA sched domain support") Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Shrikanth Hegde Reviewed-by: Chen Yu Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-2-kprateek.nayak@amd.com --- kernel/sched/topology.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 061f8c85f555..79bab80af8f2 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1645,13 +1645,17 @@ sd_init(struct sched_domain_topology_level *tl, struct cpumask *sd_span; u64 now = sched_clock(); - sd_weight = cpumask_weight(tl->mask(tl, cpu)); + sd_span = sched_domain_span(sd); + cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); + sd_weight = cpumask_weight(sd_span); + sd_id = cpumask_first(sd_span); if (tl->sd_flags) sd_flags = (*tl->sd_flags)(); if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, - "wrong sd_flags in topology description\n")) + "wrong sd_flags in topology description\n")) sd_flags &= TOPOLOGY_SD_FLAGS; + sd_flags |= asym_cpu_capacity_classify(sd_span, cpu_map); *sd = (struct sched_domain){ .min_interval = sd_weight, @@ -1689,12 +1693,6 @@ sd_init(struct sched_domain_topology_level *tl, .name = tl->name, }; - sd_span = sched_domain_span(sd); - cpumask_and(sd_span, cpu_map, tl->mask(tl, cpu)); - sd_id = cpumask_first(sd_span); - - sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); - WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), "CPU capacity asymmetry not
supported on SMT\n"); From 5a7b576b3ec1acc2694c5b58f80cd1d44a11b2c1 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:27 +0000 Subject: [PATCH 16/46] sched/topology: Extract "imb_numa_nr" calculation into a separate helper Subsequent changes to assign "sd->shared" from "s_data" would necessitate finding the topmost SD_SHARE_LLC to assign shared object to. This is very similar to the "imb_numa_nr" computation loop except that "imb_numa_nr" cares about the first domain without the SD_SHARE_LLC flag (immediate parent of sd_llc) whereas the "sd->shared" assignment would require sd_llc itself. Extract the "imb_numa_nr" calculation into a helper adjust_numa_imbalance() and use the current loop in the build_sched_domains() to find the sd_llc. While at it, guard the call behind CONFIG_NUMA's status since "imb_numa_nr" only makes sense on NUMA enabled configs with SD_NUMA domains. No functional changes intended. Suggested-by: Valentin Schneider Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-3-kprateek.nayak@amd.com --- kernel/sched/topology.c | 133 ++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 79bab80af8f2..6303790a4143 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2549,6 +2549,74 @@ static bool topology_span_sane(const struct cpumask *cpu_map) return true; } +/* + * Calculate an allowed NUMA imbalance such that LLCs do not get + * imbalanced. + */ +static void adjust_numa_imbalance(struct sched_domain *sd_llc) +{ + struct sched_domain *parent; + unsigned int imb_span = 1; + unsigned int imb = 0; + unsigned int nr_llcs; + + WARN_ON(!(sd_llc->flags & SD_SHARE_LLC)); + WARN_ON(!sd_llc->parent); + + /* + * For a single LLC per node, allow an + * imbalance up to 12.5% of the node. 
This is + * arbitrary cutoff based two factors -- SMT and + * memory channels. For SMT-2, the intent is to + * avoid premature sharing of HT resources but + * SMT-4 or SMT-8 *may* benefit from a different + * cutoff. For memory channels, this is a very + * rough estimate of how many channels may be + * active and is based on recent CPUs with + * many cores. + * + * For multiple LLCs, allow an imbalance + * until multiple tasks would share an LLC + * on one node while LLCs on another node + * remain idle. This assumes that there are + * enough logical CPUs per LLC to avoid SMT + * factors and that there is a correlation + * between LLCs and memory channels. + */ + nr_llcs = sd_llc->parent->span_weight / sd_llc->span_weight; + if (nr_llcs == 1) + imb = sd_llc->parent->span_weight >> 3; + else + imb = nr_llcs; + + imb = max(1U, imb); + sd_llc->parent->imb_numa_nr = imb; + + /* + * Set span based on the first NUMA domain. + * + * NUMA systems always add a NODE domain before + * iterating the NUMA domains. Since this is before + * degeneration, start from sd_llc's parent's + * parent which is the lowest an SD_NUMA domain can + * be relative to sd_llc. + */ + parent = sd_llc->parent->parent; + while (parent && !(parent->flags & SD_NUMA)) + parent = parent->parent; + + imb_span = parent ? parent->span_weight : sd_llc->parent->span_weight; + + /* Update the upper remainder of the topology */ + parent = sd_llc->parent; + while (parent) { + int factor = max(1U, (parent->span_weight / imb_span)); + + parent->imb_numa_nr = imb * factor; + parent = parent->parent; + } +} + /* * Build sched domains for a given set of CPUs and attach the sched domains * to the individual CPUs @@ -2606,62 +2674,21 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att } } - /* - * Calculate an allowed NUMA imbalance such that LLCs do not get - * imbalanced. 
- */ for_each_cpu(i, cpu_map) { - unsigned int imb = 0; - unsigned int imb_span = 1; + sd = *per_cpu_ptr(d.sd, i); + if (!sd) + continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - struct sched_domain *child = sd->child; + /* First, find the topmost SD_SHARE_LLC domain */ + while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) + sd = sd->parent; - if (!(sd->flags & SD_SHARE_LLC) && child && - (child->flags & SD_SHARE_LLC)) { - struct sched_domain __rcu *top_p; - unsigned int nr_llcs; - - /* - * For a single LLC per node, allow an - * imbalance up to 12.5% of the node. This is - * arbitrary cutoff based two factors -- SMT and - * memory channels. For SMT-2, the intent is to - * avoid premature sharing of HT resources but - * SMT-4 or SMT-8 *may* benefit from a different - * cutoff. For memory channels, this is a very - * rough estimate of how many channels may be - * active and is based on recent CPUs with - * many cores. - * - * For multiple LLCs, allow an imbalance - * until multiple tasks would share an LLC - * on one node while LLCs on another node - * remain idle. This assumes that there are - * enough logical CPUs per LLC to avoid SMT - * factors and that there is a correlation - * between LLCs and memory channels. - */ - nr_llcs = sd->span_weight / child->span_weight; - if (nr_llcs == 1) - imb = sd->span_weight >> 3; - else - imb = nr_llcs; - imb = max(1U, imb); - sd->imb_numa_nr = imb; - - /* Set span based on the first NUMA domain. */ - top_p = sd->parent; - while (top_p && !(top_p->flags & SD_NUMA)) { - top_p = top_p->parent; - } - imb_span = top_p ? top_p->span_weight : sd->span_weight; - } else { - int factor = max(1U, (sd->span_weight / imb_span)); - - sd->imb_numa_nr = imb * factor; - } - } + /* + * In presence of higher domains, adjust the + * NUMA imbalance stats for the hierarchy. 
+ */ + if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent) + adjust_numa_imbalance(sd); } /* Calculate CPU capacity for physical packages and nodes */ From 1cc8a33ca7e8d38f962b64ece2a42c411a67bc76 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:28 +0000 Subject: [PATCH 17/46] sched/topology: Allocate per-CPU sched_domain_shared in s_data The "sched_domain_shared" object is allocated for every topology level in __sdt_alloc() and is freed post sched domain rebuild if they aren't assigned during sd_init(). "sd->shared" is only assigned for SD_SHARE_LLC domains and out of all the assigned objects, only "sd_llc_shared" is ever used by the scheduler. Since only "sd_llc_shared" is ever used, and since SD_SHARE_LLC domains never overlap, allocate only a single range of per-CPU "sched_domain_shared" object with s_data instead of doing it per topology level. The subsequent commit uses the degeneration path to correctly assign the "sd->shared" to the topmost SD_SHARE_LLC domain. No functional changes are expected at this point. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Chen Yu Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-4-kprateek.nayak@amd.com --- kernel/sched/topology.c | 48 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 6303790a4143..9006586720bf 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -782,6 +782,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) } struct s_data { + struct sched_domain_shared * __percpu *sds; struct sched_domain * __percpu *sd; struct root_domain *rd; }; @@ -789,6 +790,7 @@ struct s_data { enum s_alloc { sa_rootdomain, sa_sd, + sa_sd_shared, sa_sd_storage, sa_none, }; @@ -1535,6 +1537,9 @@ static void set_domain_attribute(struct sched_domain *sd, static void __sdt_free(const struct cpumask *cpu_map); static int __sdt_alloc(const struct cpumask *cpu_map); +static void __sds_free(struct s_data *d, const struct cpumask *cpu_map); +static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map); + static void __free_domain_allocs(struct s_data *d, enum s_alloc what, const struct cpumask *cpu_map) { @@ -1546,6 +1551,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, case sa_sd: free_percpu(d->sd); fallthrough; + case sa_sd_shared: + __sds_free(d, cpu_map); + fallthrough; case sa_sd_storage: __sdt_free(cpu_map); fallthrough; @@ -1561,9 +1569,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) if (__sdt_alloc(cpu_map)) return sa_sd_storage; + if (__sds_alloc(d, cpu_map)) + return sa_sd_shared; d->sd = alloc_percpu(struct sched_domain *); if (!d->sd) - return sa_sd_storage; + return sa_sd_shared; d->rd = alloc_rootdomain(); if (!d->rd) return sa_sd; @@ -2464,6 +2474,42 @@ static void __sdt_free(const struct 
cpumask *cpu_map) } } +static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map) +{ + int j; + + d->sds = alloc_percpu(struct sched_domain_shared *); + if (!d->sds) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain_shared *sds; + + sds = kzalloc_node(sizeof(struct sched_domain_shared), + GFP_KERNEL, cpu_to_node(j)); + if (!sds) + return -ENOMEM; + + *per_cpu_ptr(d->sds, j) = sds; + } + + return 0; +} + +static void __sds_free(struct s_data *d, const struct cpumask *cpu_map) +{ + int j; + + if (!d->sds) + return; + + for_each_cpu(j, cpu_map) + kfree(*per_cpu_ptr(d->sds, j)); + + free_percpu(d->sds); + d->sds = NULL; +} + static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *child, int cpu) From bb7a5e44fc6f3d5a252d95c48d057d5beccb8b35 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:29 +0000 Subject: [PATCH 18/46] sched/topology: Switch to assigning "sd->shared" from s_data Use the "sched_domain_shared" object allocated in s_data for "sd->shared" assignments. Assign "sd->shared" for the topmost SD_SHARE_LLC domain before degeneration and rely on the degeneration path to correctly pass down the shared object to "sd_llc". sd_degenerate_parent() ensures degenerating domains must have the same sched_domain_span() which ensures 1:1 passing down of the shared object. If the topmost SD_SHARE_LLC domain degenerates, the shared object is freed from destroy_sched_domain() when the last reference is dropped. claim_allocations() NULLs out the objects that have been assigned as "sd->shared" and the unassigned ones are freed from the __sds_free() path. To keep all the claim_allocations() bits in one place, claim_allocations() has been extended to accept "s_data" and iterate the domains internally to free both "sched_domain_shared" and the per-topology-level data for the particular CPU in one place. 
Post cpu_attach_domain(), all reclaims of "sd->shared" are handled via call_rcu() on the sched_domain object via destroy_sched_domains_rcu(). Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-5-kprateek.nayak@amd.com --- kernel/sched/topology.c | 73 +++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 29 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9006586720bf..b19d84f44669 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -685,6 +685,9 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); + + /* If sd_llc exists, sd_llc_shared should exist too. */ + WARN_ON_ONCE(!sd->shared); sds = sd->shared; } @@ -733,6 +736,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) if (sd_parent_degenerate(tmp, parent)) { tmp->parent = parent->parent; + /* Pick reference to parent->shared. */ + if (parent->shared) { + WARN_ON_ONCE(tmp->shared); + tmp->shared = parent->shared; + parent->shared = NULL; + } + if (parent->parent) { parent->parent->child = tmp; parent->parent->groups->flags = tmp->flags; @@ -1586,21 +1596,28 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) * sched_group structure so that the subsequent __free_domain_allocs() * will not free the data we're using. 
*/ -static void claim_allocations(int cpu, struct sched_domain *sd) +static void claim_allocations(int cpu, struct s_data *d) { - struct sd_data *sdd = sd->private; + struct sched_domain *sd; - WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); - *per_cpu_ptr(sdd->sd, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(d->sds, cpu))->ref)) + *per_cpu_ptr(d->sds, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; + for (sd = *per_cpu_ptr(d->sd, cpu); sd; sd = sd->parent) { + struct sd_data *sdd = sd->private; - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) - *per_cpu_ptr(sdd->sg, cpu) = NULL; + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) - *per_cpu_ptr(sdd->sgc, cpu) = NULL; + if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) + *per_cpu_ptr(sdd->sds, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) + *per_cpu_ptr(sdd->sg, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) + *per_cpu_ptr(sdd->sgc, cpu) = NULL; + } } #ifdef CONFIG_NUMA @@ -1738,16 +1755,6 @@ sd_init(struct sched_domain_topology_level *tl, sd->cache_nice_tries = 1; } - /* - * For all levels sharing cache; connect a sched_domain_shared - * instance. - */ - if (sd->flags & SD_SHARE_LLC) { - sd->shared = *per_cpu_ptr(sdd->sds, sd_id); - atomic_inc(&sd->shared->ref); - atomic_set(&sd->shared->nr_busy_cpus, sd_weight); - } - sd->private = sdd; return sd; @@ -2729,12 +2736,20 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att while (sd->parent && (sd->parent->flags & SD_SHARE_LLC)) sd = sd->parent; - /* - * In presence of higher domains, adjust the - * NUMA imbalance stats for the hierarchy. 
- */ - if (IS_ENABLED(CONFIG_NUMA) && (sd->flags & SD_SHARE_LLC) && sd->parent) - adjust_numa_imbalance(sd); + if (sd->flags & SD_SHARE_LLC) { + int sd_id = cpumask_first(sched_domain_span(sd)); + + sd->shared = *per_cpu_ptr(d.sds, sd_id); + atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight); + atomic_inc(&sd->shared->ref); + + /* + * In presence of higher domains, adjust the + * NUMA imbalance stats for the hierarchy. + */ + if (IS_ENABLED(CONFIG_NUMA) && sd->parent) + adjust_numa_imbalance(sd); + } } /* Calculate CPU capacity for physical packages and nodes */ @@ -2742,10 +2757,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (!cpumask_test_cpu(i, cpu_map)) continue; - for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { - claim_allocations(i, sd); + claim_allocations(i, &d); + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) init_sched_groups_capacity(i, sd); - } } /* Attach the domains */ From 10febd397591d93f42adb743c2c664041e7f1bcb Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:30 +0000 Subject: [PATCH 19/46] sched/topology: Remove sched_domain_shared allocation with sd_data Now that "sd->shared" assignments are using the sched_domain_shared objects allocated with s_data, remove the sd_data based allocations. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-6-kprateek.nayak@amd.com --- include/linux/sched/topology.h | 1 - kernel/sched/topology.c | 19 ------------------- 2 files changed, 20 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a1e1032426dc..51c29581f15e 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -172,7 +172,6 @@ typedef int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain *__percpu *sd; - struct sched_domain_shared *__percpu *sds; struct sched_group *__percpu *sg; struct sched_group_capacity *__percpu *sgc; }; diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b19d84f44669..43150591914b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1609,9 +1609,6 @@ static void claim_allocations(int cpu, struct s_data *d) WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); *per_cpu_ptr(sdd->sd, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref)) - *per_cpu_ptr(sdd->sds, cpu) = NULL; - if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) *per_cpu_ptr(sdd->sg, cpu) = NULL; @@ -2390,10 +2387,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) if (!sdd->sd) return -ENOMEM; - sdd->sds = alloc_percpu(struct sched_domain_shared *); - if (!sdd->sds) - return -ENOMEM; - sdd->sg = alloc_percpu(struct sched_group *); if (!sdd->sg) return -ENOMEM; @@ -2404,7 +2397,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) for_each_cpu(j, cpu_map) { struct sched_domain *sd; - struct sched_domain_shared *sds; struct sched_group *sg; struct sched_group_capacity *sgc; @@ -2415,13 +2407,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sd, j) = sd; - sds = kzalloc_node(sizeof(struct sched_domain_shared), - GFP_KERNEL, cpu_to_node(j)); - if 
(!sds) - return -ENOMEM; - - *per_cpu_ptr(sdd->sds, j) = sds; - sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sg) @@ -2463,8 +2448,6 @@ static void __sdt_free(const struct cpumask *cpu_map) kfree(*per_cpu_ptr(sdd->sd, j)); } - if (sdd->sds) - kfree(*per_cpu_ptr(sdd->sds, j)); if (sdd->sg) kfree(*per_cpu_ptr(sdd->sg, j)); if (sdd->sgc) @@ -2472,8 +2455,6 @@ static void __sdt_free(const struct cpumask *cpu_map) } free_percpu(sdd->sd); sdd->sd = NULL; - free_percpu(sdd->sds); - sdd->sds = NULL; free_percpu(sdd->sg); sdd->sg = NULL; free_percpu(sdd->sgc); From f494bfb04615119f31dbd3222c9d39fea3817d40 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:31 +0000 Subject: [PATCH 20/46] sched/core: Check for rcu_read_lock_any_held() in idle_get_state() Similar to commit 71fedc41c23b ("sched/fair: Switch to rcu_dereference_all()"), switch to checking for rcu_read_lock_any_held() in idle_get_state() to allow removing superfluous rcu_read_lock() regions in the fair task's wakeup path where the pi_lock is held and IRQs are disabled. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-7-kprateek.nayak@amd.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 953d89d71804..b863bbda6de8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2853,7 +2853,7 @@ static inline void idle_set_state(struct rq *rq, static inline struct cpuidle_state *idle_get_state(struct rq *rq) { - WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert(rcu_read_lock_any_held()); return rq->idle_state; } From 8ca12326f592f7554acf2788ecb1c5c954dcf31c Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 16 Mar 2026 00:36:22 +0100 Subject: [PATCH 21/46] PM: EM: Switch to rcu_dereference_all() in wakeup path em_cpu_energy() is part of the EAS (Fair) task wakeup path. Now that rcu_read_{,un}lock() have been removed from find_energy_efficient_cpu() switch to rcu_dereference_all() and check for rcu_read_lock_any_held() in em_cpu_energy() as well. In EAS (Fair) task wakeup path is a preempt/IRQ disabled region, so rcu_read_{,un}lock() can be removed. 
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/5b1228b7-5949-4a45-9f62-e8ce936de694@arm.com --- include/linux/energy_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index e7497f804644..c909a8ba22e8 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -248,7 +248,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, struct em_perf_state *ps; int i; - WARN_ONCE(!rcu_read_lock_held(), "EM: rcu read lock needed\n"); + lockdep_assert(rcu_read_lock_any_held()); if (!sum_util) return 0; @@ -267,7 +267,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, * Find the lowest performance state of the Energy Model above the * requested performance. */ - em_table = rcu_dereference(pd->em_table); + em_table = rcu_dereference_all(pd->em_table); i = em_pd_get_efficient_state(em_table->state, pd, max_util); ps = &em_table->state[i]; From fa6874dfeee06352ce7c4c271be6a25d84a38b54 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:32 +0000 Subject: [PATCH 22/46] sched/fair: Remove superfluous rcu_read_lock() in the wakeup path select_task_rq_fair() is always called with p->pi_lock held and IRQs disabled which makes it equivalent of an RCU read-side. Since commit 71fedc41c23b ("sched/fair: Switch to rcu_dereference_all()") switched to using rcu_dereference_all() in the wakeup path, drop the explicit rcu_read_{lock,unlock}() in the fair task's wakeup path. Future plans to reuse select_task_rq_fair() / find_energy_efficient_cpu() in the fair class' balance callback will do so with IRQs disabled and will comply with the requirements of rcu_dereference_all() which makes this safe keeping in mind future development plans too. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-8-kprateek.nayak@amd.com --- kernel/sched/fair.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c1e5c8298fd1..3e24d3e16522 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8570,10 +8570,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) struct perf_domain *pd; struct energy_env eenv; - rcu_read_lock(); pd = rcu_dereference_all(rd->pd); if (!pd) - goto unlock; + return target; /* * Energy-aware wake-up happens on the lowest sched_domain starting @@ -8583,13 +8582,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) sd = sd->parent; if (!sd) - goto unlock; + return target; target = prev_cpu; sync_entity_load_avg(&p->se); if (!task_util_est(p) && p_util_min == 0) - goto unlock; + return target; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -8684,7 +8683,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) prev_cpu); /* CPU utilization has changed */ if (prev_delta < base_energy) - goto unlock; + return target; prev_delta -= base_energy; prev_actual_cap = cpu_actual_cap; best_delta = min(best_delta, prev_delta); @@ -8708,7 +8707,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) - goto unlock; + return target; cur_delta -= base_energy; /* @@ -8725,7 +8724,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) best_actual_cap = cpu_actual_cap; } } - rcu_read_unlock(); if ((best_fits > prev_fits) || ((best_fits > 0) && (best_delta < prev_delta)) || @@ -8733,11 +8731,6 @@ static int 
find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = best_energy_cpu; return target; - -unlock: - rcu_read_unlock(); - - return target; } /* @@ -8782,7 +8775,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); } - rcu_read_lock(); for_each_domain(cpu, tmp) { /* * If both 'cpu' and 'prev_cpu' are part of this domain, @@ -8808,14 +8800,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) break; } - if (unlikely(sd)) { - /* Slow path */ - new_cpu = sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (wake_flags & WF_TTWU) { /* XXX always ? */ - /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - } - rcu_read_unlock(); + /* Slow path */ + if (unlikely(sd)) + return sched_balance_find_dst_cpu(sd, p, cpu, prev_cpu, sd_flag); + + /* Fast path */ + if (wake_flags & WF_TTWU) + return select_idle_sibling(p, prev_cpu, new_cpu); return new_cpu; } From f1320a8dd8ba6518ddb53ea4e3efcb49dc41d257 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:33 +0000 Subject: [PATCH 23/46] sched/fair: Simplify the entry condition for update_idle_cpu_scan() Only the topmost SD_SHARE_LLC domain has the "sd->shared" assigned. Simply use "sd->shared" as an indicator for load balancing at the highest SD_SHARE_LLC domain in update_idle_cpu_scan() instead of relying on llc_size. 
Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-9-kprateek.nayak@amd.com --- kernel/sched/fair.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e24d3e16522..85c22f0f8de8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11234,6 +11234,7 @@ static void update_idle_cpu_scan(struct lb_env *env, unsigned long sum_util) { struct sched_domain_shared *sd_share; + struct sched_domain *sd = env->sd; int llc_weight, pct; u64 x, y, tmp; /* @@ -11247,11 +11248,7 @@ static void update_idle_cpu_scan(struct lb_env *env, if (!sched_feat(SIS_UTIL) || env->idle == CPU_NEWLY_IDLE) return; - llc_weight = per_cpu(sd_llc_size, env->dst_cpu); - if (env->sd->span_weight != llc_weight) - return; - - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, env->dst_cpu)); + sd_share = sd->shared; if (!sd_share) return; @@ -11285,10 +11282,11 @@ static void update_idle_cpu_scan(struct lb_env *env, */ /* equation [3] */ x = sum_util; + llc_weight = sd->span_weight; do_div(x, llc_weight); /* equation [4] */ - pct = env->sd->imbalance_pct; + pct = sd->imbalance_pct; tmp = x * x * pct * pct; do_div(tmp, 10000 * SCHED_CAPACITY_SCALE); tmp = min_t(long, tmp, SCHED_CAPACITY_SCALE); From fe7171d0d5dfbe189e41db99580ebacafc3c09ce Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Thu, 12 Mar 2026 04:44:34 +0000 Subject: [PATCH 24/46] sched/fair: Simplify SIS_UTIL handling in select_idle_cpu() Use the "sd_llc" passed to select_idle_cpu() to obtain the "sd_llc_shared" instead of dereferencing the per-CPU variable. 
Since "sd->shared" is always reclaimed at the same time as "sd" via call_rcu() and update_top_cache_domain() always ensures a valid "sd->shared" assignment when "sd_llc" is present, "sd_llc->shared" can always be dereferenced without needing an additional check. While at it move the cpumask_and() operation after the SIS_UTIL bailout check to avoid unnecessarily computing the cpumask. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Reviewed-by: Shrikanth Hegde Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://patch.msgid.link/20260312044434.1974-10-kprateek.nayak@amd.com --- kernel/sched/fair.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 85c22f0f8de8..0a35a82e4792 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7876,21 +7876,26 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; - struct sched_domain_shared *sd_share; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_UTIL)) { - sd_share = rcu_dereference_all(per_cpu(sd_llc_shared, target)); - if (sd_share) { - /* because !--nr is the condition to stop scan */ - nr = READ_ONCE(sd_share->nr_idle_scan) + 1; - /* overloaded LLC is unlikely to have idle cpu/core */ - if (nr == 1) - return -1; - } + /* + * Increment because !--nr is the condition to stop scan. + * + * Since "sd" is "sd_llc" for target CPU dereferenced in the + * caller, it is safe to directly dereference "sd->shared". + * Topology bits always ensure it is assigned for "sd_llc" and it + * cannot disappear as long as we have a RCU protected + * reference to one of the associated "sd" here.
+ */ + nr = READ_ONCE(sd->shared->nr_idle_scan) + 1; + /* overloaded LLC is unlikely to have idle cpu/core */ + if (nr == 1) + return -1; } + if (!cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr)) + return -1; + if (static_branch_unlikely(&sched_cluster_active)) { struct sched_group *sg = sd->groups; From e379dce8af11d8d6040b4348316a499bfd174bfb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2026 10:36:27 +0100 Subject: [PATCH 25/46] sched/topology: Fix sched_domain_span() Commit 8e8e23dea43e ("sched/topology: Compute sd_weight considering cpuset partitions") ends up relying on the fact that structure initialization should not touch the flexible array. However, the official GCC specification for "Arrays of Length Zero" [*] says: Although the size of a zero-length array is zero, an array member of this kind may increase the size of the enclosing type as a result of tail padding. Additionally, structure initialization will zero tail padding. With the end result that since offsetof(*type, member) < sizeof(*type), array initialization will clobber the flex array. Luckily, the way flexible array sizes are calculated is: sizeof(*type) + count * sizeof(*type->member) This means we have the complete size of the flex array *outside* of sizeof(*type), so use that instead of relying on the broken flex array definition. 
[*] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html Fixes: 8e8e23dea43e ("sched/topology: Compute sd_weight considering cpuset partitions") Reported-by: Nathan Chancellor Debugged-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Tested-by: Jon Hunter Tested-by: Chen Yu Tested-by: K Prateek Nayak Tested-by: Nathan Chancellor Link: https://patch.msgid.link/20260323093627.GY3738010@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 51c29581f15e..36553e14866d 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -142,18 +142,30 @@ struct sched_domain { unsigned int span_weight; /* - * Span of all CPUs in this domain. + * See sched_domain_span(), on why flex arrays are broken. * - * NOTE: this field is variable length. (Allocated dynamically - * by attaching extra space to the end of the structure, - * depending on how many CPUs the kernel has booted up with) - */ unsigned long span[]; + */ }; static inline struct cpumask *sched_domain_span(struct sched_domain *sd) { - return to_cpumask(sd->span); + /* + * Turns out that C flexible arrays are fundamentally broken since it + * is allowed for offsetof(*sd, span) < sizeof(*sd), this means that + * structure initialization *sd = { ... }; which writes every byte + * inside sizeof(*type), will over-write the start of the flexible + * array. + * + * Luckily, the way we allocate sched_domain is by: + * + * sizeof(*sd) + cpumask_size() + * + * this means that we have sufficient space for the whole flex array + * *outside* of sizeof(*sd). So use that, and avoid using sd->span.
+ */ + unsigned long *bitmap = (void *)sd + sizeof(*sd); + return to_cpumask(bitmap); } extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], From 76504bce4ee6b8757647e07bc1710dcac9acdc2e Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Tue, 24 Mar 2026 01:06:27 +0530 Subject: [PATCH 26/46] sched/fair: Get this cpu once in find_new_ilb() Calling smp_processor_id() on: - In CONFIG_DEBUG_PREEMPT=y, if preemption/irq is disabled, then it does not print any warning. - In CONFIG_DEBUG_PREEMPT=n, it doesn't do anything apart from getting __smp_processor_id So with both CONFIG_DEBUG_PREEMPT=y/n, in preemption disabled section it is better to cache the value. It could save a few cycles. Though tiny, repeated in loop could add up to a small value. find_new_ilb is called in interrupt context. So preemption is disabled. So Hoist the this_cpu out of loop Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mukesh Kumar Chaurasiya (IBM) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260323193630.640311-2-sshegde@linux.ibm.com --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0a35a82e4792..226509231e67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12614,14 +12614,14 @@ static inline int on_null_domain(struct rq *rq) */ static inline int find_new_ilb(void) { + int this_cpu = smp_processor_id(); const struct cpumask *hk_mask; int ilb_cpu; hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE); for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - - if (ilb_cpu == smp_processor_id()) + if (ilb_cpu == this_cpu) continue; if (idle_cpu(ilb_cpu)) From 0e81fe79fec5a639700f09f39c8ab680c3312ba2 Mon Sep 17 00:00:00 2001 From: Shrikanth Hegde Date: Tue, 24 Mar 2026 01:06:28 +0530 Subject: [PATCH 27/46] sched/core: Get this cpu once in ttwu_queue_cond() Calling smp_processor_id() on: - In 
CONFIG_DEBUG_PREEMPT=y, if preemption/irq is disabled, then it does not print any warning. - In CONFIG_DEBUG_PREEMPT=n, it doesn't do anything apart from getting __smp_processor_id So with both CONFIG_DEBUG_PREEMPT=y/n, in preemption disabled section it is better to cache the value. It could save a few cycles. Though tiny, repeated could add up to a small value. ttwu_queue_cond is called with interrupt disabled. So preemption is disabled. Hence cache the value once instead. Signed-off-by: Shrikanth Hegde Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mukesh Kumar Chaurasiya (IBM) Link: https://patch.msgid.link/20260323193630.640311-3-sshegde@linux.ibm.com --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 64b467c1d5b6..7c7d4bf686d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3842,6 +3842,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { + int this_cpu = smp_processor_id(); + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */ if (!scx_allow_ttwu_queue(p)) return false; @@ -3866,10 +3868,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. */ - if (!cpus_share_cache(smp_processor_id(), cpu)) + if (!cpus_share_cache(this_cpu, cpu)) return true; - if (cpu == smp_processor_id()) + if (cpu == this_cpu) return false; /* From 265439eb88fda0bf77821e10aafed22cdd450f9d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 25 Mar 2026 17:26:04 +0000 Subject: [PATCH 28/46] MAINTAINERS: Add K Prateek Nayak to scheduler reviewers I've been fortunate to have K Prateek take an active interest in my Proxy Execution patches. He's provided great review insights and productive feedback, sometimes remembering the subtleties of the patch series better than I do myself! 
And he has done all this with great kindness and humility. I've really appreciated his reviews, and I think everyone should be so lucky, so I wanted to propose adding him to the scheduler reviewers list. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt (Google) Acked-by: K Prateek Nayak Acked-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Dietmar Eggemann Link: https://patch.msgid.link/20260325172615.2895622-1-jstultz@google.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 96ea84948d76..82ed752f93da 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23661,6 +23661,7 @@ R: Steven Rostedt (SCHED_FIFO/SCHED_RR) R: Ben Segall (CONFIG_CFS_BANDWIDTH) R: Mel Gorman (CONFIG_NUMA_BALANCING) R: Valentin Schneider (TOPOLOGY) +R: K Prateek Nayak L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core From e0ca8991b2de6c9dfe6fcd8a0364951b2bd56797 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:16 +0000 Subject: [PATCH 29/46] sched: Make class_schedulers avoid pushing current, and get rid of proxy_tag_curr() With proxy-execution, the scheduler selects the donor, but for blocked donors, we end up running the lock owner. This caused some complexity, because the class schedulers make sure to remove the task they pick from their pushable task lists, which prevents the donor from being migrated, but there wasn't then anything to prevent rq->curr from being migrated if rq->curr != rq->donor. This was sort of hacked around by calling proxy_tag_curr() on the rq->curr task if we were running something other than the donor. proxy_tag_curr() did a dequeue/enqueue pair on the rq->curr task, allowing the class schedulers to remove it from their pushable list.
The dequeue/enqueue pair was wasteful, and additionally K Prateek highlighted that we didn't properly undo things when we stopped proxying, leaving the lock owner off the pushable list. After some alternative approaches were considered, Peter suggested just having the RT/DL classes just avoid migrating when task_on_cpu(). So rework pick_next_pushable_dl_task() and the rt pick_next_pushable_task() functions so that they skip over the first pushable task if it is on_cpu. Then just drop all of the proxy_tag_curr() logic. Fixes: be39617e38e0 ("sched: Fix proxy/current (push,pull)ability") Closes: https://lore.kernel.org/lkml/e735cae0-2cc9-4bae-b761-fcb082ed3e94@amd.com/ Reported-by: K Prateek Nayak Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260324191337.1841376-2-jstultz@google.com --- kernel/sched/core.c | 24 ------------------------ kernel/sched/deadline.c | 18 ++++++++++++++++-- kernel/sched/rt.c | 15 ++++++++++++--- 3 files changed, 28 insertions(+), 29 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7c7d4bf686d7..29741685762d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6702,23 +6702,6 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) } #endif /* SCHED_PROXY_EXEC */ -static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) -{ - if (!sched_proxy_exec()) - return; - /* - * pick_next_task() calls set_next_task() on the chosen task - * at some point, which ensures it is not push/pullable. - * However, the chosen/donor task *and* the mutex owner form an - * atomic pair wrt push/pull. - * - * Make sure owner we run is not pushable. Unfortunately we can - * only deal with that by means of a dequeue/enqueue cycle. :-/ - */ - dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); - enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); -} - /* * __schedule() is the main scheduler function.
* @@ -6871,9 +6854,6 @@ static void __sched notrace __schedule(int sched_mode) */ RCU_INIT_POINTER(rq->curr, next); - if (!task_current_donor(rq, next)) - proxy_tag_curr(rq, next); - /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6907,10 +6887,6 @@ static void __sched notrace __schedule(int sched_mode) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - /* In case next was already curr but just got blocked_donor */ - if (!task_current_donor(rq, next)) - proxy_tag_curr(rq, next); - rq_unpin_lock(rq, &rf); __balance_callbacks(rq, NULL); raw_spin_rq_unlock_irq(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9e253a825f39..27359a1e995f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2805,12 +2805,26 @@ static int find_later_rq(struct task_struct *task) static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) { - struct task_struct *p; + struct task_struct *i, *p = NULL; + struct rb_node *next_node; if (!has_pushable_dl_tasks(rq)) return NULL; - p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); + next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root); + while (next_node) { + i = __node_2_pdl(next_node); + /* make sure task isn't on_cpu (possible with proxy-exec) */ + if (!task_on_cpu(rq, i)) { + p = i; + break; + } + + next_node = rb_next(next_node); + } + + if (!p) + return NULL; WARN_ON_ONCE(rq->cpu != task_cpu(p)); WARN_ON_ONCE(task_current(rq, p)); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3d823f5ffe2c..4e5f1957b91b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1858,13 +1858,22 @@ static int find_lowest_rq(struct task_struct *task) static struct task_struct *pick_next_pushable_task(struct rq *rq) { - struct task_struct *p; + struct plist_head *head = &rq->rt.pushable_tasks; + struct task_struct *i, *p = NULL; if (!has_pushable_tasks(rq)) return NULL; - p = 
plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); + plist_for_each_entry(i, head, pushable_tasks) { + /* make sure task isn't on_cpu (possible with proxy-exec) */ + if (!task_on_cpu(rq, i)) { + p = i; + break; + } + } + + if (!p) + return NULL; BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); From 37341ec573da7c16fdd45222b1bfb7b421dbdbcb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:17 +0000 Subject: [PATCH 30/46] sched: Minimise repeated sched_proxy_exec() checking Peter noted: Compilers are really bad (as in they utterly refuse) optimizing (even when marked with __pure) the static branch things, and will happily emit multiple identical in a row. So pull out the one obvious sched_proxy_exec() branch in __schedule() and remove some of the 'implicit' ones in that path. Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-3-jstultz@google.com --- kernel/sched/core.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 29741685762d..f3306d3f2aa1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6597,11 +6597,7 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) struct mutex *mutex; /* Follow blocked_on chain. */ - for (p = donor; task_is_blocked(p); p = owner) { - mutex = p->blocked_on; - /* Something changed in the chain, so pick again */ - if (!mutex) - return NULL; + for (p = donor; (mutex = p->blocked_on); p = owner) { /* * By taking mutex->wait_lock we hold off concurrent mutex_unlock() * and ensure @owner sticks around. 
@@ -6832,12 +6828,14 @@ static void __sched notrace __schedule(int sched_mode) next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); rq->next_class = next->sched_class; - if (unlikely(task_is_blocked(next))) { - next = find_proxy_task(rq, next, &rf); - if (!next) - goto pick_again; - if (next == rq->idle) - goto keep_resched; + if (sched_proxy_exec()) { + if (unlikely(next->blocked_on)) { + next = find_proxy_task(rq, next, &rf); + if (!next) + goto pick_again; + if (next == rq->idle) + goto keep_resched; + } } picked: clear_tsk_need_resched(prev); From f4fe6be82e6d27349de66a42d6d1b2b11dc97a14 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:18 +0000 Subject: [PATCH 31/46] sched: Fix potentially missing balancing with Proxy Exec K Prateek pointed out that with Proxy Exec, we may have cases where we context switch in __schedule(), while the donor remains the same. This could cause balancing issues, since the put_prev_set_next() logic short-cuts if (prev == next). With proxy-exec prev is the previous donor, and next is the next donor. Should the donor remain the same, but different tasks are picked to actually run, the shortcut will have avoided enqueuing the sched class balance callback. So, if we are context switching, add logic to catch the same-donor case, and trigger the put_prev/set_next calls to ensure the balance callbacks get enqueued. 
Closes: https://lore.kernel.org/lkml/20ea3670-c30a-433b-a07f-c4ff98ae2379@amd.com/ Reported-by: K Prateek Nayak Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260324191337.1841376-4-jstultz@google.com --- kernel/sched/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3306d3f2aa1..5b7f378af042 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6826,9 +6826,11 @@ static void __sched notrace __schedule(int sched_mode) pick_again: next = pick_next_task(rq, rq->donor, &rf); - rq_set_donor(rq, next); rq->next_class = next->sched_class; if (sched_proxy_exec()) { + struct task_struct *prev_donor = rq->donor; + + rq_set_donor(rq, next); if (unlikely(next->blocked_on)) { next = find_proxy_task(rq, next, &rf); if (!next) @@ -6836,7 +6838,27 @@ static void __sched notrace __schedule(int sched_mode) if (next == rq->idle) goto keep_resched; } + if (rq->donor == prev_donor && prev != next) { + struct task_struct *donor = rq->donor; + /* + * When transitioning like: + * + * prev next + * donor: B B + * curr: A B or C + * + * then put_prev_set_next_task() will not have done + * anything, since B == B. However, A might have + * missed a RT/DL balance opportunity due to being + * on_cpu. 
+ */ + donor->sched_class->put_prev_task(rq, donor, donor); + donor->sched_class->set_next_task(rq, donor, true); + } + } else { + rq_set_donor(rq, next); } + picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); From fa4a1ff8ab235a308d8c983827657a69649185fd Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:19 +0000 Subject: [PATCH 32/46] locking: Add task::blocked_lock to serialize blocked_on state So far, we have been able to utilize the mutex::wait_lock for serializing the blocked_on state, but when we move to proxying across runqueues, we will need to add more state and a way to serialize changes to this state in contexts where we don't hold the mutex::wait_lock. So introduce the task::blocked_lock, which nests under the mutex::wait_lock in the locking order, and rework the locking to use it. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-5-jstultz@google.com --- include/linux/sched.h | 48 +++++++++++++----------------------- init/init_task.c | 1 + kernel/fork.c | 1 + kernel/locking/mutex-debug.c | 4 +-- kernel/locking/mutex.c | 40 +++++++++++++++++++----------- kernel/locking/mutex.h | 6 +++++ kernel/locking/ww_mutex.h | 4 +-- kernel/sched/core.c | 4 ++- 8 files changed, 58 insertions(+), 50 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a5d3dbc9cdf..2eef9bc6daaa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1238,6 +1238,7 @@ struct task_struct { #endif struct mutex *blocked_on; /* lock we're blocked on */ + raw_spinlock_t blocked_lock; #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* @@ -2181,57 +2182,42 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { - struct mutex *m = p->blocked_on; - - if (m) - lockdep_assert_held_once(&m->wait_lock); - return m; + 
lockdep_assert_held_once(&p->blocked_lock); + return p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { - struct mutex *blocked_on = READ_ONCE(p->blocked_on); - WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); - /* Currently we serialize blocked_on under the mutex::wait_lock */ - lockdep_assert_held_once(&m->wait_lock); + /* Currently we serialize blocked_on under the task::blocked_lock */ + lockdep_assert_held_once(&p->blocked_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ - WARN_ON_ONCE(blocked_on && blocked_on != m); - WRITE_ONCE(p->blocked_on, m); -} - -static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) -{ - guard(raw_spinlock_irqsave)(&m->wait_lock); - __set_task_blocked_on(p, m); + WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); + p->blocked_on = m; } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - if (m) { - struct mutex *blocked_on = READ_ONCE(p->blocked_on); - - /* Currently we serialize blocked_on under the mutex::wait_lock */ - lockdep_assert_held_once(&m->wait_lock); - /* - * There may be cases where we re-clear already cleared - * blocked_on relationships, but make sure we are not - * clearing the relationship with a different lock. - */ - WARN_ON_ONCE(blocked_on && blocked_on != m); - } - WRITE_ONCE(p->blocked_on, NULL); + /* Currently we serialize blocked_on under the task::blocked_lock */ + lockdep_assert_held_once(&p->blocked_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. 
+ */ + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); + p->blocked_on = NULL; } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { - guard(raw_spinlock_irqsave)(&m->wait_lock); + guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } #else diff --git a/init/init_task.c b/init/init_task.c index 5c838757fc10..b5f48ebdc2b6 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -169,6 +169,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { .journal_info = NULL, INIT_CPU_TIMERS(init_task) .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), + .blocked_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.blocked_lock), .timer_slack_ns = 50000, /* 50 usec default slack */ .thread_pid = &init_struct_pid, .thread_node = LIST_HEAD_INIT(init_signals.thread_head), diff --git a/kernel/fork.c b/kernel/fork.c index bc2bf58b93b6..079802cb6100 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2076,6 +2076,7 @@ __latent_entropy struct task_struct *copy_process( ftrace_graph_init_task(p); rt_mutex_init_task(p); + raw_spin_lock_init(&p->blocked_lock); lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 2c6b02d4699b..cc6aa9c6e981 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -54,13 +54,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, lockdep_assert_held(&lock->wait_lock); /* Current thread can't be already blocked (since it's executing!) 
*/ - DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); + DEBUG_LOCKS_WARN_ON(get_task_blocked_on(task)); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - struct mutex *blocked_on = __get_task_blocked_on(task); + struct mutex *blocked_on = get_task_blocked_on(task); DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2a1d165b3167..4aa79bcab08c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -656,6 +656,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + raw_spin_lock(&current->blocked_lock); __set_task_blocked_on(current, lock); set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); @@ -669,8 +670,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas * the handoff. */ if (__mutex_trylock(lock)) - goto acquired; + break; + raw_spin_unlock(&current->blocked_lock); /* * Check for signals and kill conditions while holding * wait_lock. This ensures the lock cancellation is ordered @@ -693,12 +695,14 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas first = __mutex_waiter_is_first(lock, &waiter); + raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(&current->blocked_lock); /* * As we likely have been woken up by task * that has cleared our blocked_on state, re-set * it to the lock we are trying to acquire.
*/ - set_task_blocked_on(current, lock); + __set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -709,25 +713,33 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas break; if (first) { - trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + bool opt_acquired; + /* * mutex_optimistic_spin() can call schedule(), so - * clear blocked on so we don't become unselectable + * we need to release these locks before calling it, + * and clear blocked on so we don't become unselectable * to run. */ - clear_task_blocked_on(current, lock); - if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) + __clear_task_blocked_on(current, lock); + raw_spin_unlock(&current->blocked_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + opt_acquired = mutex_optimistic_spin(lock, ww_ctx, &waiter); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + raw_spin_lock(&current->blocked_lock); + __set_task_blocked_on(current, lock); + + if (opt_acquired) break; - set_task_blocked_on(current, lock); trace_contention_begin(lock, LCB_F_MUTEX); } - - raw_spin_lock_irqsave(&lock->wait_lock, flags); } - raw_spin_lock_irqsave(&lock->wait_lock, flags); -acquired: __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); + raw_spin_unlock(&current->blocked_lock); if (ww_ctx) { /* @@ -756,11 +768,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas return 0; err: - __clear_task_blocked_on(current, lock); + clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: - WARN_ON(__get_task_blocked_on(current)); + WARN_ON(get_task_blocked_on(current)); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -971,7 +983,7 @@ static noinline void __sched
__mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - __clear_task_blocked_on(next, lock); + clear_task_blocked_on(next, lock); wake_q_add(&wake_q, next); } diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 9ad4da8cea00..7a8ba13fee94 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -47,6 +47,12 @@ static inline struct task_struct *__mutex_owner(struct mutex *lock) return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); } +static inline struct mutex *get_task_blocked_on(struct task_struct *p) +{ + guard(raw_spinlock_irqsave)(&p->blocked_lock); + return __get_task_blocked_on(p); +} + #ifdef CONFIG_DEBUG_MUTEXES extern void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter); diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 31a785afee6c..e4a81790ea7d 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -289,7 +289,7 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. */ - __clear_task_blocked_on(waiter->task, lock); + clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -347,7 +347,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * are waking the mutex owner, who may be currently * blocked on a different mutex. 
*/ - __clear_task_blocked_on(owner, NULL); + clear_task_blocked_on(owner, NULL); wake_q_add(wake_q, owner); } return true; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b7f378af042..1913dbc68eb9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6584,6 +6584,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d * p->pi_lock * rq->lock * mutex->wait_lock + * p->blocked_lock * * Returns the task that is going to be used as execution context (the one * that is actually going to be run on cpu_of(rq)). @@ -6603,8 +6604,9 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * and ensure @owner sticks around. */ guard(raw_spinlock)(&mutex->wait_lock); + guard(raw_spinlock)(&p->blocked_lock); - /* Check again that p is blocked with wait_lock held */ + /* Check again that p is blocked with blocked_lock held */ if (mutex != __get_task_blocked_on(p)) { /* * Something changed in the blocked_on chain and From 56f4b24267a643b0b9ab73f09feaaabfee5a37ae Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:20 +0000 Subject: [PATCH 33/46] sched: Fix modifying donor->blocked on without proper locking Introduce an action enum in find_proxy_task() which allows us to handle work needed to be done outside the mutex.wait_lock and task.blocked_lock guard scopes. This ensures proper locking when we clear the donor's blocked_on pointer in proxy_deactivate(), and the switch statement will be useful as we add more cases to handle later in this series. 
Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-6-jstultz@google.com --- kernel/sched/core.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1913dbc68eb9..bf4338f71667 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6568,7 +6568,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d * as unblocked, as we aren't doing proxy-migrations * yet (more logic will be needed then). */ - donor->blocked_on = NULL; + clear_task_blocked_on(donor, NULL); } return NULL; } @@ -6592,6 +6592,7 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d static struct task_struct * find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) { + enum { FOUND, DEACTIVATE_DONOR } action = FOUND; struct task_struct *owner = NULL; int this_cpu = cpu_of(rq); struct task_struct *p; @@ -6625,12 +6626,14 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { /* XXX Don't handle blocked owners/delayed dequeue yet */ - return proxy_deactivate(rq, donor); + action = DEACTIVATE_DONOR; + break; } if (task_cpu(owner) != this_cpu) { /* XXX Don't handle migrations yet */ - return proxy_deactivate(rq, donor); + action = DEACTIVATE_DONOR; + break; } if (task_on_rq_migrating(owner)) { @@ -6688,6 +6691,13 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) */ } + /* Handle actions we need to do outside of the guard() scope */ + switch (action) { + case DEACTIVATE_DONOR: + return proxy_deactivate(rq, donor); + case FOUND: + /* fallthrough */; + } WARN_ON_ONCE(owner && !owner->on_rq); return owner; } From 2d7622669836dcbbb449741b4e6c503ffe005c25 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:21 +0000 
Subject: [PATCH 34/46] sched/locking: Add special p->blocked_on==PROXY_WAKING value for proxy return-migration As we add functionality to proxy execution, we may migrate a donor task to a runqueue where it can't run due to cpu affinity. Thus, we must be careful to ensure we return-migrate the task back to a cpu in its cpumask when it becomes unblocked. Peter helpfully provided the following example with pictures: "Suppose we have a ww_mutex cycle: ,-+-* Mutex-1 <-. Task-A ---' | | ,-- Task-B `-> Mutex-2 *-+-' Where Task-A holds Mutex-1 and tries to acquire Mutex-2, and where Task-B holds Mutex-2 and tries to acquire Mutex-1. Then the blocked_on->owner chain will go in circles. Task-A -> Mutex-2 ^ | | v Mutex-1 <- Task-B We need two things: - find_proxy_task() to stop iterating the circle; - the woken task to 'unblock' and run, such that it can back-off and re-try the transaction. Now, the current code [without this patch] does: __clear_task_blocked_on(); wake_q_add(); And surely clearing ->blocked_on is sufficient to break the cycle. Suppose it is Task-B that is made to back-off, then we have: Task-A -> Mutex-2 -> Task-B (no further blocked_on) and it would attempt to run Task-B. Or worse, it could directly pick Task-B and run it, without ever getting into find_proxy_task(). Now, here is a problem because Task-B might not be runnable on the CPU it is currently on; and because !task_is_blocked() we don't get into the proxy paths, so nobody is going to fix this up. Ideally we would have dequeued Task-B alongside of clearing ->blocked_on, but alas, [the lock ordering prevents us from getting the task_rq_lock() and] spoils things." Thus we need more than just a binary concept of the task being blocked on a mutex or not. So allow setting blocked_on to PROXY_WAKING as a special value which specifies the task is no longer blocked, but needs to be evaluated for return migration *before* it can be run. 
This will then be used in a later patch to handle proxy return-migration. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-7-jstultz@google.com --- include/linux/sched.h | 51 +++++++++++++++++++++++++++++++++++++-- kernel/locking/mutex.c | 2 +- kernel/locking/ww_mutex.h | 16 ++++++------ kernel/sched/core.c | 16 ++++++++++++ 4 files changed, 74 insertions(+), 11 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2eef9bc6daaa..8ec3b6d7d718 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2180,10 +2180,20 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock) __must_hold(lock); }) #ifndef CONFIG_PREEMPT_RT + +/* + * With proxy exec, if a task has been proxy-migrated, it may be a donor + * on a cpu that it can't actually run on. Thus we need a special state + * to denote that the task is being woken, but that it needs to be + * evaluated for return-migration before it is run. So if the task is + * blocked_on PROXY_WAKING, return migrate it before running it. + */ +#define PROXY_WAKING ((struct mutex *)(-1L)) + static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { lockdep_assert_held_once(&p->blocked_lock); - return p->blocked_on; + return p->blocked_on == PROXY_WAKING ? NULL : p->blocked_on; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) @@ -2211,7 +2221,7 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex * * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. 
*/ - WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m && p->blocked_on != PROXY_WAKING); p->blocked_on = NULL; } @@ -2220,6 +2230,35 @@ static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) guard(raw_spinlock_irqsave)(&p->blocked_lock); __clear_task_blocked_on(p, m); } + +static inline void __set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) +{ + /* Currently we serialize blocked_on under the task::blocked_lock */ + lockdep_assert_held_once(&p->blocked_lock); + + if (!sched_proxy_exec()) { + __clear_task_blocked_on(p, m); + return; + } + + /* Don't set PROXY_WAKING if blocked_on was already cleared */ + if (!p->blocked_on) + return; + /* + * There may be cases where we set PROXY_WAKING on tasks that were + * already set to waking, but make sure we are not changing + * the relationship with a different lock. + */ + WARN_ON_ONCE(m && p->blocked_on != m && p->blocked_on != PROXY_WAKING); + p->blocked_on = PROXY_WAKING; +} + +static inline void set_task_blocked_on_waking(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&p->blocked_lock); + __set_task_blocked_on_waking(p, m); +} + #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { @@ -2228,6 +2267,14 @@ static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mute static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } + +static inline void __set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) +{ +} + +static inline void set_task_blocked_on_waking(struct task_struct *p, struct rt_mutex *m) +{ +} #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4aa79bcab08c..7d359647156d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -983,7 +983,7 @@ static noinline void __sched 
__mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - clear_task_blocked_on(next, lock); + set_task_blocked_on_waking(next, lock); wake_q_add(&wake_q, next); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index e4a81790ea7d..5cd9dfa4b31e 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -285,11 +285,11 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, debug_mutex_wake_waiter(lock, waiter); #endif /* - * When waking up the task to die, be sure to clear the - * blocked_on pointer. Otherwise we can see circular - * blocked_on relationships that can't resolve. + * When waking up the task to die, be sure to set the + * blocked_on to PROXY_WAKING. Otherwise we can see + * circular blocked_on relationships that can't resolve. */ - clear_task_blocked_on(waiter->task, lock); + set_task_blocked_on_waking(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -339,15 +339,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock, */ if (owner != current) { /* - * When waking up the task to wound, be sure to clear the - * blocked_on pointer. Otherwise we can see circular - * blocked_on relationships that can't resolve. + * When waking up the task to wound, be sure to set the + * blocked_on to PROXY_WAKING. Otherwise we can see + * circular blocked_on relationships that can't resolve. * * NOTE: We pass NULL here instead of lock, because we * are waking the mutex owner, who may be currently * blocked on a different mutex. 
*/ - clear_task_blocked_on(owner, NULL); + set_task_blocked_on_waking(owner, NULL); wake_q_add(wake_q, owner); } return true; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf4338f71667..c997d516441d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4239,6 +4239,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); } out: + /* + * For now, if we've been woken up, clear the task->blocked_on + * regardless if it was set to a mutex or PROXY_WAKING so the + * task can run. We will need to be more careful later when + * properly handling proxy migration + */ + clear_task_blocked_on(p, NULL); if (success) ttwu_stat(p, task_cpu(p), wake_flags); @@ -6600,6 +6607,10 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) /* Follow blocked_on chain. */ for (p = donor; (mutex = p->blocked_on); p = owner) { + /* if its PROXY_WAKING, resched_idle so ttwu can complete */ + if (mutex == PROXY_WAKING) + return proxy_resched_idle(rq); + /* * By taking mutex->wait_lock we hold off concurrent mutex_unlock() * and ensure @owner sticks around. @@ -6620,6 +6631,11 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) owner = __mutex_owner(mutex); if (!owner) { + /* + * If there is no owner, clear blocked_on + * and return p so it can run and try to + * acquire the lock + */ __clear_task_blocked_on(p, mutex); return p; } From f9530b3183358bbf945f7c20d4a6e2048061ec50 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:22 +0000 Subject: [PATCH 35/46] sched: Add assert_balance_callbacks_empty helper With proxy-exec utilizing pick-again logic, we can end up having balance callbacks set by the previous pick_next_task() call left on the list. So pull the warning out into a helper function, and make sure we check it when we pick again.
Suggested-by: Peter Zijlstra Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-8-jstultz@google.com --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c997d516441d..acb5894e7714 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6853,6 +6853,7 @@ static void __sched notrace __schedule(int sched_mode) } pick_again: + assert_balance_callbacks_empty(rq); next = pick_next_task(rq, rq->donor, &rf); rq->next_class = next->sched_class; if (sched_proxy_exec()) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b863bbda6de8..a2629d025c90 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1857,6 +1857,13 @@ static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {} static inline void scx_rq_clock_invalidate(struct rq *rq) {} #endif /* !CONFIG_SCHED_CLASS_EXT */ +static inline void assert_balance_callbacks_empty(struct rq *rq) +{ + WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_LOCKING) && + rq->balance_callback && + rq->balance_callback != &balance_push_callback); +} + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). 
@@ -1873,7 +1880,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; - WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); + assert_balance_callbacks_empty(rq); } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) From 48fda62de67a1e88fc8bada12caf0fc9b45116df Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:23 +0000 Subject: [PATCH 36/46] sched: Add logic to zap balance callbacks if we pick again With proxy-exec, a task is selected to run via pick_next_task(), and then if it is a mutex blocked task, we call find_proxy_task() to find a runnable owner. If the runnable owner is on another cpu, we will need to migrate the selected donor task away, after which we will pick_again and call pick_next_task() to choose something else. However, in the first call to pick_next_task(), we may have had a balance_callback set up by the class scheduler. After we pick again, it's possible pick_next_task_fair() will be called which calls sched_balance_newidle() and sched_balance_rq(). This will throw a warning: [ 8.796467] rq->balance_callback && rq->balance_callback != &balance_push_callback [ 8.796467] WARNING: CPU: 32 PID: 458 at kernel/sched/sched.h:1750 sched_balance_rq+0xe92/0x1250 ... [ 8.796467] Call Trace: [ 8.796467] [ 8.796467] ? __warn.cold+0xb2/0x14e [ 8.796467] ? sched_balance_rq+0xe92/0x1250 [ 8.796467] ? report_bug+0x107/0x1a0 [ 8.796467] ? handle_bug+0x54/0x90 [ 8.796467] ? exc_invalid_op+0x17/0x70 [ 8.796467] ? asm_exc_invalid_op+0x1a/0x20 [ 8.796467] ? sched_balance_rq+0xe92/0x1250 [ 8.796467] sched_balance_newidle+0x295/0x820 [ 8.796467] pick_next_task_fair+0x51/0x3f0 [ 8.796467] __schedule+0x23a/0x14b0 [ 8.796467] ? lock_release+0x16d/0x2e0 [ 8.796467] schedule+0x3d/0x150 [ 8.796467] worker_thread+0xb5/0x350 [ 8.796467] ?
__pfx_worker_thread+0x10/0x10 [ 8.796467] kthread+0xee/0x120 [ 8.796467] ? __pfx_kthread+0x10/0x10 [ 8.796467] ret_from_fork+0x31/0x50 [ 8.796467] ? __pfx_kthread+0x10/0x10 [ 8.796467] ret_from_fork_asm+0x1a/0x30 [ 8.796467] This is because if a RT task was originally picked, it will setup the rq->balance_callback with push_rt_tasks() via set_next_task_rt(). Once the task is migrated away and we pick again, we haven't processed any balance callbacks, so rq->balance_callback is not in the same state as it was the first time pick_next_task was called. To handle this, add a zap_balance_callbacks() helper function which cleans up the balance callbacks without running them. This should be ok, as we are effectively undoing the state set in the first call to pick_next_task(), and when we pick again, the new callback can be configured for the donor task actually selected. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-9-jstultz@google.com --- kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index acb5894e7714..162b24c76077 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4917,6 +4917,34 @@ static inline void finish_task(struct task_struct *prev) smp_store_release(&prev->on_cpu, 0); } +/* + * Only called from __schedule context + * + * There are some cases where we are going to re-do the action + * that added the balance callbacks. We may not be in a state + * where we can run them, so just zap them so they can be + * properly re-added on the next time around. This is similar + * handling to running the callbacks, except we just don't call + * them. 
+ */ +static void zap_balance_callbacks(struct rq *rq) +{ + struct balance_callback *next, *head; + bool found = false; + + lockdep_assert_rq_held(rq); + + head = rq->balance_callback; + while (head) { + if (head == &balance_push_callback) + found = true; + next = head->next; + head->next = NULL; + head = next; + } + rq->balance_callback = found ? &balance_push_callback : NULL; +} + static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -6862,10 +6890,14 @@ static void __sched notrace __schedule(int sched_mode) rq_set_donor(rq, next); if (unlikely(next->blocked_on)) { next = find_proxy_task(rq, next, &rf); - if (!next) + if (!next) { + zap_balance_callbacks(rq); goto pick_again; - if (next == rq->idle) + } + if (next == rq->idle) { + zap_balance_callbacks(rq); goto keep_resched; + } } if (rq->donor == prev_donor && prev != next) { struct task_struct *donor = rq->donor; From dec9554dc036183c715d02e9cfe48986d453427a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:24 +0000 Subject: [PATCH 37/46] sched: Move attach_one_task and attach_task helpers to sched.h The fair scheduler locally introduced attach_one_task() and attach_task() helpers, but these could be generically useful so move this code to sched.h so we can use them elsewhere. One minor tweak made to utilize guard(rq_lock)(rq) to simplify the function.
Suggested-by: K Prateek Nayak Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Link: https://patch.msgid.link/20260324191337.1841376-10-jstultz@google.com --- kernel/sched/fair.c | 26 -------------------------- kernel/sched/sched.h | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7f35dd40d7ef..41293d5d9b75 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9946,32 +9946,6 @@ static int detach_tasks(struct lb_env *env) return detached; } -/* - * attach_task() -- attach the task detached by detach_task() to its new rq. - */ -static void attach_task(struct rq *rq, struct task_struct *p) -{ - lockdep_assert_rq_held(rq); - - WARN_ON_ONCE(task_rq(p) != rq); - activate_task(rq, p, ENQUEUE_NOCLOCK); - wakeup_preempt(rq, p, 0); -} - -/* - * attach_one_task() -- attaches the task returned from detach_one_task() to - * its new rq. - */ -static void attach_one_task(struct rq *rq, struct task_struct *p) -{ - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - attach_task(rq, p); - rq_unlock(rq, &rf); -} - /* * attach_tasks() -- attaches all tasks detached by detach_tasks() to their * new rq. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a2629d025c90..9594355a3681 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3012,6 +3012,29 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); +/* + * attach_task() -- attach the task detached by detach_task() to its new rq. + */ +static inline void attach_task(struct rq *rq, struct task_struct *p) +{ + lockdep_assert_rq_held(rq); + + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); + wakeup_preempt(rq, p, 0); +} + +/* + * attach_one_task() -- attaches the task returned from detach_one_task() to + * its new rq. 
+ */ +static inline void attach_one_task(struct rq *rq, struct task_struct *p) +{ + guard(rq_lock)(rq); + update_rq_clock(rq); + attach_task(rq, p); +} + #ifdef CONFIG_PREEMPT_RT # define SCHED_NR_MIGRATE_BREAK 8 #else From b049b81bdff6fc6794200a4c7d7d910e2008d57f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 24 Mar 2026 19:13:25 +0000 Subject: [PATCH 38/46] sched: Handle blocked-waiter migration (and return migration) Add logic to handle migrating a blocked waiter to a remote cpu where the lock owner is runnable. Additionally, as the blocked task may not be able to run on the remote cpu, add logic to handle return migration once the waiting task is given the mutex. Because tasks may get migrated to where they cannot run, also modify the scheduling classes to avoid sched class migrations on mutex blocked tasks, leaving find_proxy_task() and related logic to do the migrations and return migrations. This was split out from the larger proxy patch, and significantly reworked. Credits for the original patch go to: Peter Zijlstra (Intel) Juri Lelli Valentin Schneider Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260324191337.1841376-11-jstultz@google.com --- kernel/sched/core.c | 232 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 194 insertions(+), 38 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 162b24c76077..c15c9865299e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4239,13 +4239,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); } out: - /* - * For now, if we've been woken up, clear the task->blocked_on - * regardless if it was set to a mutex or PROXY_WAKING so the - * task can run. 
We will need to be more careful later when - * properly handling proxy migration - */ - clear_task_blocked_on(p, NULL); if (success) ttwu_stat(p, task_cpu(p), wake_flags); @@ -6530,6 +6523,8 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, if (signal_pending_state(task_state, p)) { WRITE_ONCE(p->__state, TASK_RUNNING); *task_state_p = TASK_RUNNING; + set_task_blocked_on_waking(p, NULL); + return false; } @@ -6567,6 +6562,21 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_PROXY_EXEC +static inline void proxy_set_task_cpu(struct task_struct *p, int cpu) +{ + unsigned int wake_cpu; + + /* + * Since we are enqueuing a blocked task on a cpu it may + * not be able to run on, preserve wake_cpu when we + * __set_task_cpu so we can return the task to where it + * was previously runnable. + */ + wake_cpu = p->wake_cpu; + __set_task_cpu(p, cpu); + p->wake_cpu = wake_cpu; +} + static inline struct task_struct *proxy_resched_idle(struct rq *rq) { put_prev_set_next_task(rq, rq->donor, rq->idle); @@ -6575,7 +6585,7 @@ static inline struct task_struct *proxy_resched_idle(struct rq *rq) return rq->idle; } -static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +static bool proxy_deactivate(struct rq *rq, struct task_struct *donor) { unsigned long state = READ_ONCE(donor->__state); @@ -6595,17 +6605,140 @@ static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) return try_to_block_task(rq, donor, &state, true); } -static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +static inline void proxy_release_rq_lock(struct rq *rq, struct rq_flags *rf) + __releases(__rq_lockp(rq)) { - if (!__proxy_deactivate(rq, donor)) { + /* + * The class scheduler may have queued a balance callback + * from pick_next_task() called earlier. 
+ * + * So here we have to zap callbacks before unlocking the rq + * as another CPU may jump in and call sched_balance_rq + * which can trip the warning in rq_pin_lock() if we + * leave callbacks set. + * + * After we later reaquire the rq lock, we will force __schedule() + * to pick_again, so the callbacks will get re-established. + */ + zap_balance_callbacks(rq); + rq_unpin_lock(rq, rf); + raw_spin_rq_unlock(rq); +} + +static inline void proxy_reacquire_rq_lock(struct rq *rq, struct rq_flags *rf) + __acquires(__rq_lockp(rq)) +{ + raw_spin_rq_lock(rq); + rq_repin_lock(rq, rf); + update_rq_clock(rq); +} + +/* + * If the blocked-on relationship crosses CPUs, migrate @p to the + * owner's CPU. + * + * This is because we must respect the CPU affinity of execution + * contexts (owner) but we can ignore affinity for scheduling + * contexts (@p). So we have to move scheduling contexts towards + * potential execution contexts. + * + * Note: The owner can disappear, but simply migrate to @target_cpu + * and leave that CPU to sort things out. + */ +static void proxy_migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int target_cpu) + __must_hold(__rq_lockp(rq)) +{ + struct rq *target_rq = cpu_rq(target_cpu); + + lockdep_assert_rq_held(rq); + WARN_ON(p == rq->curr); + /* + * Since we are migrating a blocked donor, it could be rq->donor, + * and we want to make sure there aren't any references from this + * rq to it before we drop the lock. This avoids another cpu + * jumping in and grabbing the rq lock and referencing rq->donor + * or cfs_rq->curr, etc after we have migrated it to another cpu, + * and before we pick_again in __schedule. + * + * So call proxy_resched_idle() to drop the rq->donor references + * before we release the lock. 
+ */ + proxy_resched_idle(rq); + + deactivate_task(rq, p, DEQUEUE_NOCLOCK); + proxy_set_task_cpu(p, target_cpu); + + proxy_release_rq_lock(rq, rf); + + attach_one_task(target_rq, p); + + proxy_reacquire_rq_lock(rq, rf); +} + +static void proxy_force_return(struct rq *rq, struct rq_flags *rf, + struct task_struct *p) + __must_hold(__rq_lockp(rq)) +{ + struct rq *task_rq, *target_rq = NULL; + int cpu, wake_flag = WF_TTWU; + + lockdep_assert_rq_held(rq); + WARN_ON(p == rq->curr); + + if (p == rq->donor) + proxy_resched_idle(rq); + + proxy_release_rq_lock(rq, rf); + /* + * We drop the rq lock, and re-grab task_rq_lock to get + * the pi_lock (needed for select_task_rq) as well. + */ + scoped_guard (task_rq_lock, p) { + task_rq = scope.rq; + /* - * XXX: For now, if deactivation failed, set donor - * as unblocked, as we aren't doing proxy-migrations - * yet (more logic will be needed then). + * Since we let go of the rq lock, the task may have been + * woken or migrated to another rq before we got the + * task_rq_lock. So re-check we're on the same RQ. If + * not, the task has already been migrated and that CPU + * will handle any futher migrations. */ - clear_task_blocked_on(donor, NULL); + if (task_rq != rq) + break; + + /* + * Similarly, if we've been dequeued, someone else will + * wake us + */ + if (!task_on_rq_queued(p)) + break; + + /* + * Since we should only be calling here from __schedule() + * -> find_proxy_task(), no one else should have + * assigned current out from under us. But check and warn + * if we see this, then bail. 
+ */ + if (task_current(task_rq, p) || task_on_cpu(task_rq, p)) { + WARN_ONCE(1, "%s rq: %i current/on_cpu task %s %d on_cpu: %i\n", + __func__, cpu_of(task_rq), + p->comm, p->pid, p->on_cpu); + break; + } + + update_rq_clock(task_rq); + deactivate_task(task_rq, p, DEQUEUE_NOCLOCK); + cpu = select_task_rq(p, p->wake_cpu, &wake_flag); + set_task_cpu(p, cpu); + target_rq = cpu_rq(cpu); + clear_task_blocked_on(p, NULL); } - return NULL; + + if (target_rq) + attach_one_task(target_rq, p); + + proxy_reacquire_rq_lock(rq, rf); } /* @@ -6626,18 +6759,25 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d */ static struct task_struct * find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) + __must_hold(__rq_lockp(rq)) { - enum { FOUND, DEACTIVATE_DONOR } action = FOUND; struct task_struct *owner = NULL; + bool curr_in_chain = false; int this_cpu = cpu_of(rq); struct task_struct *p; struct mutex *mutex; + int owner_cpu; /* Follow blocked_on chain. */ for (p = donor; (mutex = p->blocked_on); p = owner) { - /* if its PROXY_WAKING, resched_idle so ttwu can complete */ - if (mutex == PROXY_WAKING) - return proxy_resched_idle(rq); + /* if its PROXY_WAKING, do return migration or run if current */ + if (mutex == PROXY_WAKING) { + if (task_current(rq, p)) { + clear_task_blocked_on(p, PROXY_WAKING); + return p; + } + goto force_return; + } /* * By taking mutex->wait_lock we hold off concurrent mutex_unlock() @@ -6657,27 +6797,39 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) return NULL; } + if (task_current(rq, p)) + curr_in_chain = true; + owner = __mutex_owner(mutex); if (!owner) { /* - * If there is no owner, clear blocked_on - * and return p so it can run and try to - * acquire the lock + * If there is no owner, either clear blocked_on + * and return p (if it is current and safe to + * just run on this rq), or return-migrate the task. 
*/ - __clear_task_blocked_on(p, mutex); - return p; + if (task_current(rq, p)) { + __clear_task_blocked_on(p, NULL); + return p; + } + goto force_return; } if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { /* XXX Don't handle blocked owners/delayed dequeue yet */ - action = DEACTIVATE_DONOR; - break; + if (curr_in_chain) + return proxy_resched_idle(rq); + goto deactivate; } - if (task_cpu(owner) != this_cpu) { - /* XXX Don't handle migrations yet */ - action = DEACTIVATE_DONOR; - break; + owner_cpu = task_cpu(owner); + if (owner_cpu != this_cpu) { + /* + * @owner can disappear, simply migrate to @owner_cpu + * and leave that CPU to sort things out. + */ + if (curr_in_chain) + return proxy_resched_idle(rq); + goto migrate_task; } if (task_on_rq_migrating(owner)) { @@ -6734,16 +6886,20 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) * guarantee its existence, as per ttwu_remote(). */ } - - /* Handle actions we need to do outside of the guard() scope */ - switch (action) { - case DEACTIVATE_DONOR: - return proxy_deactivate(rq, donor); - case FOUND: - /* fallthrough */; - } WARN_ON_ONCE(owner && !owner->on_rq); return owner; + +deactivate: + if (proxy_deactivate(rq, donor)) + return NULL; + /* If deactivate fails, force return */ + p = donor; +force_return: + proxy_force_return(rq, rf, p); + return NULL; +migrate_task: + proxy_migrate_task(rq, rf, p, owner_cpu); + return NULL; } #else /* SCHED_PROXY_EXEC */ static struct task_struct * From 2d4cc371baa5881da45120a65d264a59b486f486 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Mar 2026 14:20:13 +0100 Subject: [PATCH 39/46] sched/fair: Use sched_energy_enabled() Use helper sched_energy_enabled() everywhere we want to test if EAS is enabled instead of mixing sched_energy_enabled() and direct call to static_branch_unlikely(). 
No functional change Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260327132013.2800517-1-vincent.guittot@linaro.org --- kernel/sched/topology.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 43150591914b..5847b83d9d55 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -273,7 +273,7 @@ void rebuild_sched_domains_energy(void) static int sched_energy_aware_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int ret, state; + int ret; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -289,8 +289,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { - state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) + if (sysctl_sched_energy_aware != sched_energy_enabled()) rebuild_sched_domains_energy(); } @@ -388,11 +387,11 @@ static void destroy_perf_domain_rcu(struct rcu_head *rp) static void sched_energy_set(bool has_eas) { - if (!has_eas && static_branch_unlikely(&sched_energy_present)) { + if (!has_eas && sched_energy_enabled()) { if (sched_debug()) pr_info("%s: stopping EAS\n", __func__); static_branch_disable_cpuslocked(&sched_energy_present); - } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { + } else if (has_eas && !sched_energy_enabled()) { if (sched_debug()) pr_info("%s: starting EAS\n", __func__); static_branch_enable_cpuslocked(&sched_energy_present); From 059258b0d424510202b6f2796279dbdbf0c6a83d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 31 Mar 2026 18:23:52 +0200 Subject: [PATCH 40/46] sched/fair: Prevent negative lag increase during delayed dequeue Delayed dequeue feature aims to reduce the negative lag of a dequeued task while sleeping but it can happen that newly
enqueued tasks will move backward the avg vruntime and increase its negative lag. When the delayed dequeued task wakes up, it has more neg lag compared to being dequeued immediately or to other tasks that have been dequeued just before these new enqueues. Ensure that the negative lag of a delayed dequeued task doesn't increase during its delayed dequeued phase while waiting for its neg lag to disappear. Similarly, we remove any positive lag that the delayed dequeued task could have gained during this period. Short slice tasks are particularly impacted in an overloaded system. Test on snapdragon rb5: hackbench -T -p -l 16000000 -g 2 1> /dev/null & cyclictest -t 1 -i 2777 -D 333 --policy=fair --mlock -h 20000 -q The scheduling latency of cyclictest is: tip/sched/core tip/sched/core +this patch cyclictest slice (ms) (default)2.8 8 8 hackbench slice (ms) (default)2.8 20 20 Total Samples | 115632 119733 119806 Average (us) | 364 64(-82%) 61(- 5%) Median (P50) (us) | 60 56(- 7%) 56( 0%) 90th Percentile (us) | 1166 62(-95%) 62( 0%) 99th Percentile (us) | 4192 73(-98%) 72(- 1%) 99.9th Percentile (us) | 8528 2707(-68%) 1300(-52%) Maximum (us) | 17735 14273(-20%) 13525(- 5%) Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260331162352.551501-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 56 +++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 41293d5d9b75..597ce5b718d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -840,11 +840,33 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt return clamp(vlag, -limit, limit); } -static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) +/* + * Delayed dequeue aims to reduce the negative lag of a dequeued task.
While + * updating the lag of an entity, check that negative lag didn't increase + * during the delayed dequeue period which would be unfair. + * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO + * is set. + * + * Return true if the lag has been adjusted. + */ +static __always_inline +bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { + s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); + bool ret; + WARN_ON_ONCE(!se->on_rq); - se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); + if (se->sched_delayed) { + /* previous vlag < 0 otherwise se would not be delayed */ + vlag = max(vlag, se->vlag); + if (sched_feat(DELAY_ZERO)) + vlag = min(vlag, 0); + } + ret = (vlag == se->vlag); + se->vlag = vlag; + + return ret; } /* @@ -5564,13 +5586,6 @@ static void clear_delayed(struct sched_entity *se) } } -static inline void finish_delayed_dequeue_entity(struct sched_entity *se) -{ - clear_delayed(se); - if (sched_feat(DELAY_ZERO) && se->vlag > 0) - se->vlag = 0; -} - static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -5596,6 +5611,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { update_load_avg(cfs_rq, se, 0); + update_entity_lag(cfs_rq, se); set_delayed(se); return false; } @@ -5635,7 +5651,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); if (flags & DEQUEUE_DELAYED) - finish_delayed_dequeue_entity(se); + clear_delayed(se); if (cfs_rq->nr_queued == 0) { update_idle_cfs_rq_clock_pelt(cfs_rq); @@ -7084,18 +7100,14 @@ requeue_delayed_entity(struct sched_entity *se) WARN_ON_ONCE(!se->sched_delayed); WARN_ON_ONCE(!se->on_rq); - if (sched_feat(DELAY_ZERO)) { - update_entity_lag(cfs_rq, se); - if (se->vlag > 0) { - cfs_rq->nr_queued--; - if (se != cfs_rq->curr) - __dequeue_entity(cfs_rq, se); - se->vlag = 0; - place_entity(cfs_rq, 
se, 0); - if (se != cfs_rq->curr) - __enqueue_entity(cfs_rq, se); - cfs_rq->nr_queued++; - } + if (update_entity_lag(cfs_rq, se)) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + place_entity(cfs_rq, se, 0); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; } update_load_avg(cfs_rq, se, 0); From c6e80201e057dfb7253385e60bf541121bf5dc33 Mon Sep 17 00:00:00 2001 From: Joseph Salisbury Date: Fri, 3 Apr 2026 17:00:14 -0400 Subject: [PATCH 41/46] sched: Use u64 for bandwidth ratio calculations to_ratio() computes BW_SHIFT-scaled bandwidth ratios from u64 period and runtime values, but it returns unsigned long. tg_rt_schedulable() also stores the current group limit and the accumulated child sum in unsigned long. On 32-bit builds, large bandwidth ratios can be truncated and the RT group sum can wrap when enough siblings are present. That can let an overcommitted RT hierarchy pass the schedulability check, and it also narrows the helper result for other callers. Return u64 from to_ratio() and use u64 for the RT group totals so bandwidth ratios are preserved and compared at full width on both 32-bit and 64-bit builds. 
Fixes: b40b2e8eb521 ("sched: rt: multi level group constraints") Assisted-by: Codex:GPT-5 Signed-off-by: Joseph Salisbury Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20260403210014.2713404-1-joseph.salisbury@oracle.com --- kernel/sched/core.c | 2 +- kernel/sched/rt.c | 2 +- kernel/sched/sched.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c15c9865299e..49cd5d217161 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4735,7 +4735,7 @@ void sched_post_fork(struct task_struct *p) scx_post_fork(p); } -unsigned long to_ratio(u64 period, u64 runtime) +u64 to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) return BW_UNIT; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4e5f1957b91b..a48e86794913 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2666,7 +2666,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) { struct rt_schedulable_data *d = data; struct task_group *child; - unsigned long total, sum = 0; + u64 total, sum = 0; u64 period, runtime; period = ktime_to_ns(tg->rt_bandwidth.rt_period); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9594355a3681..c95584191d58 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2907,7 +2907,7 @@ extern void init_cfs_throttle_work(struct task_struct *p); #define MAX_BW_BITS (64 - BW_SHIFT) #define MAX_BW ((1ULL << MAX_BW_BITS) - 1) -extern unsigned long to_ratio(u64 period, u64 runtime); +extern u64 to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); extern void post_init_entity_util_avg(struct task_struct *p); From 556146ce5e9476db234134c46ddf0e154ca17028 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 7 Apr 2026 13:36:17 +0200 Subject: [PATCH 42/46] sched/fair: Avoid overflow in enqueue_entity() Here is one scenario which was triggered when running: stress-ng 
--yield=32 -t 10000000s& while true; do perf bench sched messaging -p -t -l 100000 -g 16; done on a 256CPUs machine after about an hour into the run: __enqeue_entity: entity_key(-141245081754) weight(90891264) overflow_mul(5608800059305154560) vlag(57498) delayed?(0) cfs_rq: zero_vruntime(3809707759657809) sum_w_vruntime(0) sum_weight(0) nr_queued(1) cfs_rq->curr: entity_key(0) vruntime(3809707759657809) deadline(3809723966988476) weight(37) The above comes from __enqueue_entity() after a place_entity(). Breaking this down: vlag_initial = 57498 vlag = (57498 * (37 + 90891264)) / 37 = 141,245,081,754 vruntime = 3809707759657809 - 141245081754 = 3,809,566,514,576,055 entity_key(se, cfs_rq) = -141,245,081,754 Now, multiplying the entity_key with its own weight results in 5,608,800,059,305,154,560 (same as what overflow_mul() suggests) but in Python, without overflow, this would be: -12,837,944,014,404,397,056 Avoid the overflow (without doing the division for avg_vruntime()), by moving zero_vruntime to the new entity when it is heavier.
Fixes: 4823725d9d1d ("sched/fair: Increase weight bits for avg_vruntime") Signed-off-by: K Prateek Nayak [peterz: suggested 'weight > load' condition] Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260407120052.GG3738010@noisy.programming.kicks-ass.net --- kernel/sched/fair.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 597ce5b718d2..12890ef16603 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5352,6 +5352,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { u64 vslice, vruntime = avg_vruntime(cfs_rq); + bool update_zero = false; s64 lag = 0; if (!se->custom_slice) @@ -5368,7 +5369,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { struct sched_entity *curr = cfs_rq->curr; - long load; + long load, weight; lag = se->vlag; @@ -5428,14 +5429,41 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (curr && curr->on_rq) load += avg_vruntime_weight(cfs_rq, curr->load.weight); - lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight); + weight = avg_vruntime_weight(cfs_rq, se->load.weight); + lag *= load + weight; if (WARN_ON_ONCE(!load)) load = 1; lag = div64_long(lag, load); + + /* + * A heavy entity (relative to the tree) will pull the + * avg_vruntime close to its vruntime position on enqueue. But + * the zero_vruntime point is only updated at the next + * update_deadline()/place_entity()/update_entity_lag(). + * + * Specifically (see the comment near avg_vruntime_weight()): + * + * sum_w_vruntime = \Sum (v_i - v0) * w_i + * + * Note that if v0 is near a light entity, both terms will be + * small for the light entity, while in that case both terms + * are large for the heavy entity, leading to risk of + * overflow. 
+ * + * OTOH if v0 is near the heavy entity, then the difference is + * larger for the light entity, but the factor is small, while + * for the heavy entity the difference is small but the factor + * is large. Avoiding the multiplication overflow. + */ + if (weight > load) + update_zero = true; } se->vruntime = vruntime - lag; + if (update_zero) + update_zero_vruntime(cfs_rq, -lag); + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { se->deadline += se->vruntime; se->rel_deadline = 0; From 8b016dcec9365675be81d26be88f2c09cf983bd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 23 Mar 2026 13:39:37 +0100 Subject: [PATCH 43/46] sched/rt: Skip group schedulable check with rt_group_sched=0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The warning from the commit 87f1fb77d87a6 ("sched: Add RT_GROUP WARN checks for non-root task_groups") is wrong -- it assumes that only task_groups with rt_rq are traversed, however, the schedulability check would iterate all task_groups even when rt_group_sched=0 is disabled at boot time but some non-root task_groups exist. The schedulability check is supposed to validate: a) that children don't overcommit its parent, b) no RT task group overcommits global RT limit. but with rt_group_sched=0 there is no (non-trivial) hierarchy of RT groups, therefore skip the validation altogether. Otherwise, writes to the global sched_rt_runtime_us knob will be rejected with incorrect validation error. This fix is immaterial with CONFIG_RT_GROUP_SCHED=n. 
Fixes: 87f1fb77d87a6 ("sched: Add RT_GROUP WARN checks for non-root task_groups") Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260323-sched-rert_groups-v3-1-1e7d5ed6b249@suse.com --- kernel/sched/rt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a48e86794913..893e54dab13e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2690,9 +2690,6 @@ static int tg_rt_schedulable(struct task_group *tg, void *data) tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg)) return -EBUSY; - if (WARN_ON(!rt_group_sched_enabled() && tg != &root_task_group)) - return -EBUSY; - total = to_ratio(period, runtime); /* @@ -2836,6 +2833,8 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { int ret = 0; + if (!rt_group_sched_enabled()) + return ret; mutex_lock(&rt_constraints_mutex); ret = __rt_schedulable(NULL, 0, 0); From 4f70a0456d090303d5a6c915dd7d9db9da56cb16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 23 Mar 2026 13:39:38 +0100 Subject: [PATCH 44/46] sched/rt: Move group schedulability check to sched_rt_global_validate() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sched_rt_global_constraints() function is a remnant that used to set up global RT throttling but that is no more since commit 5f6bd380c7bdb ("sched/rt: Remove default bandwidth control") and the function ended up only doing schedulability check. Move the check into the validation function where it fits better. (The order of validations sched_dl_global_validate() and sched_rt_global_validate() shouldn't matter.) 
Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260323-sched-rert_groups-v3-2-1e7d5ed6b249@suse.com --- kernel/sched/rt.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 893e54dab13e..e16d8f1b4aa2 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2832,15 +2832,7 @@ long sched_group_rt_period(struct task_group *tg) #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { - int ret = 0; - if (!rt_group_sched_enabled()) - return ret; - - mutex_lock(&rt_constraints_mutex); - ret = __rt_schedulable(NULL, 0, 0); - mutex_unlock(&rt_constraints_mutex); - - return ret; + return 0; } #endif /* CONFIG_SYSCTL */ @@ -2872,6 +2864,13 @@ static int sched_rt_global_validate(void) NSEC_PER_USEC > max_rt_runtime))) return -EINVAL; +#ifdef CONFIG_RT_GROUP_SCHED + if (!rt_group_sched_enabled()) + return 0; + + scoped_guard(mutex, &rt_constraints_mutex) + return __rt_schedulable(NULL, 0, 0); +#endif return 0; } From 985215804dcbf02ab675977e770708e3f084e9fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Mon, 23 Mar 2026 13:39:39 +0100 Subject: [PATCH 45/46] sched/rt: Cleanup global RT bandwidth functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 5f6bd380c7bdb ("sched/rt: Remove default bandwidth control") and follow-up changes made a few of the functions unnecessary; drop them for simplicity.
Signed-off-by: Michal Koutný Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260323-sched-rert_groups-v3-3-1e7d5ed6b249@suse.com --- kernel/sched/rt.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e16d8f1b4aa2..4ee8faf01441 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2829,13 +2829,6 @@ long sched_group_rt_period(struct task_group *tg) return rt_period_us; } -#ifdef CONFIG_SYSCTL -static int sched_rt_global_constraints(void) -{ - return 0; -} -#endif /* CONFIG_SYSCTL */ - int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept real-time tasks when there is no way for them to run */ @@ -2845,14 +2838,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED: */ - -#ifdef CONFIG_SYSCTL -static int sched_rt_global_constraints(void) -{ - return 0; -} -#endif /* CONFIG_SYSCTL */ #endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL @@ -2874,10 +2859,6 @@ static int sched_rt_global_validate(void) return 0; } -static void sched_rt_do_global(void) -{ -} - static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -2901,11 +2882,6 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff if (ret) goto undo; - ret = sched_rt_global_constraints(); - if (ret) - goto undo; - - sched_rt_do_global(); sched_dl_do_global(); } if (0) { From 78cde54ea5f03398f1cf6656de2472068f6da966 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 10 Apr 2026 15:23:21 +0200 Subject: [PATCH 46/46] sched/eevdf: Clear buddies for preempt_short The next buddy should not prevent shorter-slice preemption. Don't take the buddy into account when checking whether a shorter-slice entity can preempt, and clear it if the entity with a shorter slice can preempt current.
Test on snapdragon rb5: hackbench -T -p -l 16000000 -g 2 1> /dev/null & hackbench runs in cgroup /test-A cyclictest -t 1 -i 2777 -D 63 --policy=fair --mlock -h 20000 -q cyclictest runs in cgroup /test-B tip/sched/core tip/sched/core +this patch cyclictest slice (ms) (default)2.8 8 8 hackbench slice (ms) (default)2.8 20 20 Total Samples | 22679 22595 22686 Average (us) | 84 94(-12%) 59( 37%) Median (P50) (us) | 56 56( 0%) 56( 0%) 90th Percentile (us) | 64 65(- 2%) 63( 3%) 99th Percentile (us) | 1047 1273(-22%) 74( 94%) 99.9th Percentile (us) | 2431 4751(-95%) 663( 86%) Maximum (us) | 4694 8655(-84%) 3934( 55%) Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260410132321.2897789-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 12890ef16603..f179faf7a6a1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1116,7 +1116,7 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) /* * Picking the ->next buddy will affect latency but not fairness. */ - if (sched_feat(PICK_BUDDY) && + if (sched_feat(PICK_BUDDY) && protect && cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { /* ->next will never be delayed */ WARN_ON_ONCE(cfs_rq->next->sched_delayed); @@ -9138,8 +9138,10 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f return; preempt: - if (preempt_action == PREEMPT_WAKEUP_SHORT) + if (preempt_action == PREEMPT_WAKEUP_SHORT) { cancel_protect_slice(se); + clear_buddies(cfs_rq, se); + } resched_curr_lazy(rq); }