mirror of
https://github.com/torvalds/linux.git
synced 2026-05-28 00:53:34 +02:00
sched_ext: Close root-enable vs sched_ext_dead() race with SCX_TASK_INIT_BEGIN
scx_root_enable_workfn() drops the iter rq lock for ops.init_task() and a TASK_DEAD @p can fall through sched_ext_dead() in that window. The race hits when sched_ext_dead() observes SCX_TASK_INIT (the intermediate state before @p->scx.sched is published) and dereferences NULL via SCX_HAS_OP(NULL, exit_task), or observes SCX_TASK_NONE during the unlocked init window and skips cleanup so exit_task() never runs. Add SCX_TASK_INIT_BEGIN. The enable path writes NONE -> INIT_BEGIN under the iter rq lock, then takes the rq lock again after init to walk INIT_BEGIN -> INIT -> READY. sched_ext_dead() that wins the rq-lock race observes INIT_BEGIN and sets DEAD without calling into ops; the post-init recheck unwinds via scx_sub_init_cancel_task(). scx_fork() runs single-threaded against sched_ext_dead() (the task is not on scx_tasks until scx_post_fork() adds it) so its INIT_BEGIN -> INIT walk needs no rq-lock pairing; it rolls back to NONE on ops.init_task() failure. The validation matrix grows the INIT_BEGIN row and the INIT_BEGIN -> DEAD edge; INIT now requires INIT_BEGIN as the predecessor. scx_sub_disable()'s migration writes INIT_BEGIN as a synthetic predecessor to satisfy the tightened verification. The sub-sched paths still race with sched_ext_dead() during the unlocked init window. This will be fixed by the next patch. Reported-by: zhidao su <suzhidao@xiaomi.com> Link: https://lore.kernel.org/all/20260429133155.3825247-1-suzhidao@xiaomi.com/ Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Andrea Righi <arighi@nvidia.com>
This commit is contained in:
parent
cceb8fa9cb
commit
c941d7391f
|
|
@ -106,6 +106,7 @@ enum scx_ent_flags {
|
|||
* Bits 8 to 10 are used to carry task state:
|
||||
*
|
||||
* NONE ops.init_task() not called yet
|
||||
* INIT_BEGIN ops.init_task() in flight; see sched_ext_dead()
|
||||
* INIT ops.init_task() succeeded, but task can be cancelled
|
||||
* READY fully initialized, but not in sched_ext
|
||||
* ENABLED fully initialized and in sched_ext
|
||||
|
|
@ -116,10 +117,11 @@ enum scx_ent_flags {
|
|||
SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
|
||||
|
||||
SCX_TASK_NONE = 0 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_INIT = 1 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_READY = 2 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_ENABLED = 3 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_DEAD = 4 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_INIT_BEGIN = 1 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_INIT = 2 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_READY = 3 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_ENABLED = 4 << SCX_TASK_STATE_SHIFT,
|
||||
SCX_TASK_DEAD = 5 << SCX_TASK_STATE_SHIFT,
|
||||
|
||||
/*
|
||||
* Bits 12 and 13 are used to carry reenqueue reason. In addition to
|
||||
|
|
|
|||
|
|
@ -725,8 +725,11 @@ static void scx_set_task_state(struct task_struct *p, u32 state)
|
|||
case SCX_TASK_NONE:
|
||||
warn = prev_state == SCX_TASK_DEAD;
|
||||
break;
|
||||
case SCX_TASK_INIT:
|
||||
case SCX_TASK_INIT_BEGIN:
|
||||
warn = prev_state != SCX_TASK_NONE;
|
||||
break;
|
||||
case SCX_TASK_INIT:
|
||||
warn = prev_state != SCX_TASK_INIT_BEGIN;
|
||||
p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
|
||||
break;
|
||||
case SCX_TASK_READY:
|
||||
|
|
@ -737,7 +740,8 @@ static void scx_set_task_state(struct task_struct *p, u32 state)
|
|||
warn = prev_state != SCX_TASK_READY;
|
||||
break;
|
||||
case SCX_TASK_DEAD:
|
||||
warn = prev_state != SCX_TASK_NONE;
|
||||
warn = !(prev_state == SCX_TASK_NONE ||
|
||||
prev_state == SCX_TASK_INIT_BEGIN);
|
||||
break;
|
||||
default:
|
||||
WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
|
||||
|
|
@ -3753,9 +3757,12 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
|
|||
#else
|
||||
struct scx_sched *sch = scx_root;
|
||||
#endif
|
||||
scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
|
||||
ret = __scx_init_task(sch, p, true);
|
||||
if (unlikely(ret))
|
||||
if (unlikely(ret)) {
|
||||
scx_set_task_state(p, SCX_TASK_NONE);
|
||||
return ret;
|
||||
}
|
||||
scx_set_task_state(p, SCX_TASK_INIT);
|
||||
scx_set_task_sched(p, sch);
|
||||
}
|
||||
|
|
@ -3856,13 +3863,18 @@ void sched_ext_dead(struct task_struct *p)
|
|||
* scx_task_iter_next_locked(). NONE tasks need no marking: cgroup
|
||||
* iteration is only used from sub-sched paths, which require root
|
||||
* enabled. Root enable transitions every live task to at least READY.
|
||||
*
|
||||
* %INIT_BEGIN means ops.init_task() is running for @p. Don't call
|
||||
* into ops; transition to %DEAD so the post-init recheck unwinds
|
||||
* via scx_sub_init_cancel_task().
|
||||
*/
|
||||
if (scx_get_task_state(p) != SCX_TASK_NONE) {
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
|
||||
rq = task_rq_lock(p, &rf);
|
||||
scx_disable_and_exit_task(scx_task_sched(p), p);
|
||||
if (scx_get_task_state(p) != SCX_TASK_INIT_BEGIN)
|
||||
scx_disable_and_exit_task(scx_task_sched(p), p);
|
||||
scx_set_task_state(p, SCX_TASK_DEAD);
|
||||
task_rq_unlock(rq, p, &rf);
|
||||
}
|
||||
|
|
@ -5773,6 +5785,7 @@ static void scx_sub_disable(struct scx_sched *sch)
|
|||
* $p having already been initialized, and then enable.
|
||||
*/
|
||||
scx_disable_and_exit_task(sch, p);
|
||||
scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
|
||||
scx_set_task_state(p, SCX_TASK_INIT);
|
||||
scx_set_task_sched(p, parent);
|
||||
scx_set_task_state(p, SCX_TASK_READY);
|
||||
|
|
@ -6878,6 +6891,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
|
|||
|
||||
scx_task_iter_start(&sti, NULL);
|
||||
while ((p = scx_task_iter_next_locked(&sti))) {
|
||||
struct rq_flags rf;
|
||||
struct rq *rq;
|
||||
|
||||
/*
|
||||
* @p may already be dead, have lost all its usages counts and
|
||||
* be waiting for RCU grace period before being freed. @p can't
|
||||
|
|
@ -6886,10 +6902,26 @@ static void scx_root_enable_workfn(struct kthread_work *work)
|
|||
if (!tryget_task_struct(p))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Set %INIT_BEGIN under the iter's rq lock so that a concurrent
|
||||
* sched_ext_dead() does not call ops.exit_task() on @p while
|
||||
* ops.init_task() is running. If sched_ext_dead() runs before
|
||||
* this store, it has already removed @p from scx_tasks and the
|
||||
* iter won't visit @p; if it runs after, it observes
|
||||
* %INIT_BEGIN and transitions to %DEAD without calling ops,
|
||||
* leaving the post-init recheck below to unwind.
|
||||
*/
|
||||
scx_set_task_state(p, SCX_TASK_INIT_BEGIN);
|
||||
scx_task_iter_unlock(&sti);
|
||||
|
||||
ret = __scx_init_task(sch, p, false);
|
||||
|
||||
rq = task_rq_lock(p, &rf);
|
||||
|
||||
if (unlikely(ret)) {
|
||||
if (scx_get_task_state(p) != SCX_TASK_DEAD)
|
||||
scx_set_task_state(p, SCX_TASK_NONE);
|
||||
task_rq_unlock(rq, p, &rf);
|
||||
put_task_struct(p);
|
||||
scx_task_iter_stop(&sti);
|
||||
scx_error(sch, "ops.init_task() failed (%d) for %s[%d]",
|
||||
|
|
@ -6897,10 +6929,20 @@ static void scx_root_enable_workfn(struct kthread_work *work)
|
|||
goto err_disable_unlock_all;
|
||||
}
|
||||
|
||||
scx_set_task_state(p, SCX_TASK_INIT);
|
||||
scx_set_task_sched(p, sch);
|
||||
scx_set_task_state(p, SCX_TASK_READY);
|
||||
if (scx_get_task_state(p) == SCX_TASK_DEAD) {
|
||||
/*
|
||||
* sched_ext_dead() observed %INIT_BEGIN and set %DEAD.
|
||||
* ops.exit_task() is owed to the sched __scx_init_task()
|
||||
* ran against; call it now.
|
||||
*/
|
||||
scx_sub_init_cancel_task(sch, p);
|
||||
} else {
|
||||
scx_set_task_state(p, SCX_TASK_INIT);
|
||||
scx_set_task_sched(p, sch);
|
||||
scx_set_task_state(p, SCX_TASK_READY);
|
||||
}
|
||||
|
||||
task_rq_unlock(rq, p, &rf);
|
||||
put_task_struct(p);
|
||||
}
|
||||
scx_task_iter_stop(&sti);
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user