sched/fair: Fix wakeup_preempt_fair() vs delayed dequeue

Just as pick_next_entity() must dequeue delayed entities, so must
wakeup_preempt_fair(). Any delayed task that is found is eligible and hence
past the 0-lag point, ready for removal.
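
In EEVDF terms "eligible" simply means non-negative lag: the entity's
vruntime is not ahead of the queue's average. A rough, unweighted sketch of
that 0-lag test (illustrative only; the kernel's entity_eligible() uses a
load-weighted average) looks like:

  /* Illustrative only: the real check is load-weighted. */
  static bool toy_eligible(long long vruntime, long long avg_vruntime)
  {
          /* lag = avg_vruntime - vruntime; eligible once lag >= 0 */
          return vruntime <= avg_vruntime;
  }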

Worse, by not removing delayed entities from consideration,
wakeup_preempt_fair() can skew the preemption decision, with the end result
that a short-slice wakeup will not result in a preemption.
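
To make the skew concrete, here is a stand-alone userspace toy model (not
kernel code; struct toy_se, toy_pick() and the numbers are invented for
illustration): a stale delayed entity holds the earliest virtual deadline,
so the freshly woken task never equals the pick and preemption is skipped,
whereas dropping delayed entities from consideration restores it:

  #include <stdbool.h>
  #include <stdio.h>

  struct toy_se {
          const char *name;
          long long deadline;     /* earlier virtual deadline wins the pick */
          bool eligible;          /* lag >= 0, i.e. past the 0-lag point */
          bool delayed;           /* enqueued only to burn off negative lag */
  };

  /* Earliest eligible virtual deadline; optionally skip delayed entities. */
  static struct toy_se *toy_pick(struct toy_se *se, int nr, bool skip_delayed)
  {
          struct toy_se *best = NULL;

          for (int i = 0; i < nr; i++) {
                  if (!se[i].eligible)
                          continue;
                  if (skip_delayed && se[i].delayed)
                          continue;
                  if (!best || se[i].deadline < best->deadline)
                          best = &se[i];
          }
          return best;
  }

  int main(void)
  {
          struct toy_se rq[] = {
                  { "current",      100, true, false },
                  { "delayed-task",  40, true, true  },  /* should already be gone */
                  { "short-waker",   60, true, false },  /* the task that just woke */
          };
          struct toy_se *pse = &rq[2];

          /* Old check: the stale delayed entity wins, pick != pse, no preemption. */
          printf("delayed entity considered: preempt = %d\n",
                 toy_pick(rq, 3, false) == pse);

          /* Fixed: delayed entities are removed from consideration first. */
          printf("delayed entity removed:    preempt = %d\n",
                 toy_pick(rq, 3, true) == pse);
          return 0;
  }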

                         tip/sched/core   tip/sched/core   +this patch
cyclictest slice  (ms) |  2.8 (default)               8              8
hackbench slice   (ms) |  2.8 (default)              20             20
Total Samples          |          22559           22595          22683
Average           (us) |            157        64 ( 59%)      59 (  8%)
Median (P50)      (us) |             57        57 (  0%)      58 ( -2%)
90th Percentile   (us) |             64        60 (  6%)      60 (  0%)
99th Percentile   (us) |           2407        67 ( 97%)      67 (  0%)
99.9th Percentile (us) |           3400      2288 ( 33%)     727 ( 68%)
Maximum           (us) |           5037      9252 (-84%)    7461 ( 19%)

Fixes: f12e148892 ("sched/fair: Prepare pick_next_task() for delayed dequeue")
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20260422093400.319251-1-vincent.guittot@linaro.org

@@ -1104,7 +1104,7 @@ static inline void cancel_protect_slice(struct sched_entity *se)
  *
  * Which allows tree pruning through eligibility.
  */
-static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
+static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
 {
         struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node;
         struct sched_entity *se = __pick_first_entity(cfs_rq);
@@ -1175,11 +1175,6 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect)
         return best;
 }
 
-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
-{
-        return __pick_eevdf(cfs_rq, true);
-}
-
 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -5754,11 +5749,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
  * 4) do not run the "skip" process, if something else is available
  */
 static struct sched_entity *
-pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
+pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq, bool protect)
 {
         struct sched_entity *se;
 
-        se = pick_eevdf(cfs_rq);
+        se = pick_eevdf(cfs_rq, protect);
         if (se->sched_delayed) {
                 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
                 /*
@@ -9032,7 +9027,7 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
 {
         enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK;
         struct task_struct *donor = rq->donor;
-        struct sched_entity *se = &donor->se, *pse = &p->se;
+        struct sched_entity *nse, *se = &donor->se, *pse = &p->se;
         struct cfs_rq *cfs_rq = task_cfs_rq(donor);
         int cse_is_idle, pse_is_idle;
@@ -9143,12 +9138,18 @@ static void wakeup_preempt_fair(struct rq *rq, struct task_struct *p, int wake_f
         }
 
 pick:
-        /*
-         * If @p has become the most eligible task, force preemption.
-         */
-        if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse)
+        nse = pick_next_entity(rq, cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT);
+        /* If @p has become the most eligible task, force preemption */
+        if (nse == pse)
                 goto preempt;
 
+        /*
+         * Because p is enqueued, nse being null can only mean that we
+         * dequeued a delayed task.
+         */
+        if (!nse)
+                goto pick;
+
         if (sched_feat(RUN_TO_PARITY))
                 update_protect_slice(cfs_rq, se);
@@ -9184,7 +9185,7 @@ static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf)
                 throttled |= check_cfs_rq_runtime(cfs_rq);
 
-                se = pick_next_entity(rq, cfs_rq);
+                se = pick_next_entity(rq, cfs_rq, true);
                 if (!se)
                         goto again;
 
                 cfs_rq = group_cfs_rq(se);