From c5cd6fd75b6a55761337c9e965dd5ad02485d00d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 23 Apr 2026 13:22:22 +0200
Subject: [PATCH 1/3] sched/fair: Fix the negative lag increase fix

Vincent reported that my rework of his original patch lost a little
something.

Specifically it got the return value wrong; it should not compare
against the old se->vlag, but rather against the current value. Since
the thing that matters is if the effective vruntime of an entity is
affected and the thing needs repositioning or not.

Fixes: 059258b0d424 ("sched/fair: Prevent negative lag increase during delayed dequeue")
Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://patch.msgid.link/20260423094107.GT3102624%40noisy.programming.kicks-ass.net
---
 kernel/sched/fair.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69361c63353a..615861d96357 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -847,13 +847,19 @@ static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avrunt
  * Similarly, check that the entity didn't gain positive lag when DELAY_ZERO
  * is set.
  *
- * Return true if the lag has been adjusted.
+ * Return true if the vlag has been modified. Specifically:
+ *
+ *   se->vlag != avg_vruntime() - se->vruntime
+ *
+ * This can be due to clamping in entity_lag() or clamping due to
+ * sched_delayed. Either way, when vlag is modified and the entity is
+ * retained, the tree needs to be adjusted.
  */
 static __always_inline
 bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	s64 vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq));
-	bool ret;
+	u64 avruntime = avg_vruntime(cfs_rq);
+	s64 vlag = entity_lag(cfs_rq, se, avruntime);
 
 	WARN_ON_ONCE(!se->on_rq);
 
@@ -863,10 +869,9 @@ bool update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		if (sched_feat(DELAY_ZERO))
 			vlag = min(vlag, 0);
 	}
-	ret = (vlag == se->vlag);
 	se->vlag = vlag;
 
-	return ret;
+	return avruntime - vlag != se->vruntime;
 }
 
 /*

From ac8e69e693631689d74d8f1ebee6f84f737f797f Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 22 Apr 2026 11:34:00 +0200
Subject: [PATCH 2/3] sched/fair: Fix wakeup_preempt_fair() vs delayed dequeue

Similar to how pick_next_entity() must dequeue delayed entities, so too must
wakeup_preempt_fair(). Any delayed task being found means it is eligible and
hence past the 0-lag point, ready for removal.

Worse, by not removing delayed entities from consideration, it can skew the
preemption decision, with the end result that a short slice wakeup will not
result in a preemption.

                     tip/sched/core  tip/sched/core    +this patch
cyclictest slice  (ms) (default)2.8             8               8
hackbench slice   (ms) (default)2.8            20              20
Total Samples          |    22559           22595           22683
Average           (us) |      157              64( 59%)        59(  8%)
Median (P50)      (us) |       57              57(  0%)        58(- 2%)
90th Percentile   (us) |       64              60(  6%)        60(  0%)
99th Percentile   (us) |     2407              67( 97%)        67(  0%)
99.9th Percentile (us) |     3400            2288( 33%)       727( 68%)
Maximum           (us) |     5037            9252(-84%)      7461( 19%)

Fixes: f12e148892ed ("sched/fair: Prepare pick_next_task() for delayed dequeue")
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260422093400.319251-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 615861d96357..728965851842 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1104,7 +1104,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
  *
  * Which allows tree pruning through eligibility.
  */
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
 {
 	struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
 	struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1175,11 +1175,6 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
 	return best;
 }
 
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
-	return __pick_eevdf(cfs_rq, true);
-}
-
 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -5754,11 +5749,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
 {
 	struct sched_entity *se;
 
-	se = pick_eevdf(cfs_rq);
+	se = pick_eevdf(cfs_rq, protect);
 	if (se->sched_delayed) {
 		dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
 		/*
@@ -9032,7 +9027,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
 {
 	enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
 	struct task_struct *donor = rq->donor;
-	struct sched_entity *se = &donor->se, *pse = &p->se;
+	struct sched_entity *nse, *se = &donor->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(donor);
 	int cse_is_idle, pse_is_idle;
 
@@ -9143,12 +9138,18 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
 	}
 
 pick:
-	/*
-	 * If @p has become the most eligible task, force preemption.
-	 */
-	if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
+	nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
+	/* If @p has become the most eligible task, force preemption */
+	if (nse == pse)
 		goto preempt;
 
+	/*
+	 * Because p is enqueued, nse being null can only mean that we
+	 * dequeued a delayed task.
+	 */
+	if (!nse)
+		goto pick;
+
 	if (sched_feat(RUN_TO_PARITY))
 		update_protect_slice(cfs_rq, se);
 
@@ -9184,7 +9185,7 @@ static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
 
 		throttled |= check_cfs_rq_runtime(cfs_rq);
 
-		se = pick_next_entity(rq, cfs_rq);
+		se = pick_next_entity(rq, cfs_rq, true);
 		if (!se)
 			goto again;
 		cfs_rq = group_cfs_rq(se);

From 3da56dc063cd77b9c0b40add930767fab4e389f3 Mon Sep 17 00:00:00 2001
From: Zicheng Qu <quzicheng@huawei.com>
Date: Fri, 24 Apr 2026 07:11:13 +0000
Subject: [PATCH 3/3] sched/fair: Clear rel_deadline when initializing forked
 entities

A yield-triggered crash can happen when a newly forked sched_entity
enters the fair class with se->rel_deadline unexpectedly set.

The failing sequence is:

  1. A task is forked while se->rel_deadline is still set.
  2. __sched_fork() initializes vruntime, vlag and other sched_entity
     state, but does not clear rel_deadline.
  3. On the first enqueue, enqueue_entity() calls place_entity().
  4. Because se->rel_deadline is set, place_entity() treats se->deadline
     as a relative deadline and converts it to an absolute deadline by
     adding the current vruntime.
  5. However, the forked entity's deadline is not a valid inherited
     relative deadline for this new scheduling instance, so the conversion
     produces an abnormally large deadline.
  6. If the task later calls sched_yield(), yield_task_fair() advances
     se->vruntime to se->deadline.
  7. The inflated vruntime is then used by the following enqueue path,
     where the vruntime-derived key can overflow when multiplied by the
     entity weight.
  8. This corrupts cfs_rq->sum_w_vruntime, breaks EEVDF eligibility
     calculation, and can eventually make all entities appear ineligible.
     pick_next_entity() may then return NULL unexpectedly, leading to a
     later NULL dereference.

A captured trace shows the effect clearly. Before yield, the entity's
vruntime was around:

  9834017729983308

After yield_task_fair() executed:

  se->vruntime = se->deadline

the vruntime jumped to:

  19668035460670230

and the deadline was later advanced further to:

  19668035463470230

This shows that the deadline had already become abnormally large before
yield_task_fair() copied it into vruntime.

rel_deadline is only meaningful when se->deadline really carries a
relative deadline that still needs to be placed against vruntime. A
freshly forked sched_entity should not inherit or retain this state.
Clear se->rel_deadline in __sched_fork(), together with the other
sched_entity runtime state, so that the first enqueue does not interpret
the new entity's deadline as a stale relative deadline.

Fixes: 82e9d0456e06 ("sched/fair: Avoid re-setting virtual deadline on 'migrations'")
Analyzed-by: Hui Tang <tanghui20@huawei.com>
Analyzed-by: Zhang Qiao <zhangqiao22@huawei.com>
Signed-off-by: Zicheng Qu <quzicheng@huawei.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260424071113.1199600-1-quzicheng@huawei.com
---
 kernel/sched/core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da20fb6ea25a..b8871449d3c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4458,6 +4458,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
 	p->se.nr_migrations		= 0;
 	p->se.vruntime			= 0;
 	p->se.vlag			= 0;
+	p->se.rel_deadline		= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
 	/* A delayed task cannot be in clone(). */