From 7fb39e4eb4c3db52e4707a6a1cd45362f7e803f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 24 Apr 2026 14:31:36 -1000
Subject: [PATCH] sched_ext: Save and restore scx_locked_rq across SCX_CALL_OP

SCX_CALL_OP{,_RET}() unconditionally clear scx_locked_rq_state to NULL on
exit. That is correct at the top level, but ops can recurse via
scx_bpf_sub_dispatch(): a parent's ops.dispatch() calls the helper, which
invokes the child's ops.dispatch() under another SCX_CALL_OP(). When the
inner call returns, the NULL write clobbers the outer call's state. If the
parent's BPF code then calls a kfunc such as scx_bpf_cpuperf_set(), it
reads scx_locked_rq() == NULL and tries to re-acquire the already-held rq.

Fix it by snapshotting scx_locked_rq_state on entry and restoring it on
exit. Rename the rq parameter to locked_rq across all SCX_CALL_OP*()
macros so that the snapshot local can be declared as 'struct rq *' without
colliding with the parameter token in the macro expansion.
SCX_CALL_OP_TASK{,_RET}() and SCX_CALL_OP_2TASKS_RET() funnel through the
two base macros and inherit the fix.

Fixes: 4f8b122848db ("sched_ext: Add basic building blocks for nested sub-scheduler dispatching")
Reported-by: Chris Mason
Signed-off-by: Tejun Heo
Reviewed-by: Andrea Righi
---
 kernel/sched/ext.c | 49 ++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8a2a90659c65..26968d0a6752 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -470,24 +470,35 @@ static inline void update_locked_rq(struct rq *rq)
 	__this_cpu_write(scx_locked_rq_state, rq);
 }
 
-#define SCX_CALL_OP(sch, op, rq, args...)					\
+/*
+ * SCX ops can recurse via scx_bpf_sub_dispatch() - the inner call must not
+ * clobber the outer's scx_locked_rq_state. Save it on entry, restore on exit.
+ */
+#define SCX_CALL_OP(sch, op, locked_rq, args...)				\
 do {										\
-	if (rq)									\
-		update_locked_rq(rq);						\
+	struct rq *__prev_locked_rq;						\
+										\
+	if (locked_rq) {							\
+		__prev_locked_rq = scx_locked_rq();				\
+		update_locked_rq(locked_rq);					\
+	}									\
 	(sch)->ops.op(args);							\
-	if (rq)									\
-		update_locked_rq(NULL);						\
+	if (locked_rq)								\
+		update_locked_rq(__prev_locked_rq);				\
 } while (0)
 
-#define SCX_CALL_OP_RET(sch, op, rq, args...)					\
+#define SCX_CALL_OP_RET(sch, op, locked_rq, args...)				\
 ({										\
+	struct rq *__prev_locked_rq;						\
 	__typeof__((sch)->ops.op(args)) __ret;					\
 										\
-	if (rq)									\
-		update_locked_rq(rq);						\
+	if (locked_rq) {							\
+		__prev_locked_rq = scx_locked_rq();				\
+		update_locked_rq(locked_rq);					\
+	}									\
 	__ret = (sch)->ops.op(args);						\
-	if (rq)									\
-		update_locked_rq(NULL);						\
+	if (locked_rq)								\
+		update_locked_rq(__prev_locked_rq);				\
 	__ret;									\
 })
 
@@ -499,39 +510,39 @@ do {										\
  * those subject tasks.
  *
  * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
- * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
- * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
- * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ * either via the @locked_rq argument here, or (for ops.select_cpu()) via @p's
+ * pi_lock held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu.
+ * So if kf_tasks[] is set, @p's scheduler-protected fields are stable.
  *
  * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
  * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
  * while a previous one is still in progress.
  */
-#define SCX_CALL_OP_TASK(sch, op, rq, task, args...)				\
+#define SCX_CALL_OP_TASK(sch, op, locked_rq, task, args...)			\
 do {										\
 	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
 	current->scx.kf_tasks[0] = task;					\
-	SCX_CALL_OP((sch), op, rq, task, ##args);				\
+	SCX_CALL_OP((sch), op, locked_rq, task, ##args);			\
 	current->scx.kf_tasks[0] = NULL;					\
 } while (0)
 
-#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...)			\
+#define SCX_CALL_OP_TASK_RET(sch, op, locked_rq, task, args...)		\
 ({										\
 	__typeof__((sch)->ops.op(task, ##args)) __ret;				\
 	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
 	current->scx.kf_tasks[0] = task;					\
-	__ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args);			\
+	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task, ##args);		\
 	current->scx.kf_tasks[0] = NULL;					\
 	__ret;									\
 })
 
-#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...)		\
+#define SCX_CALL_OP_2TASKS_RET(sch, op, locked_rq, task0, task1, args...)	\
 ({										\
 	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;			\
 	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
 	current->scx.kf_tasks[0] = task0;					\
 	current->scx.kf_tasks[1] = task1;					\
-	__ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args);		\
+	__ret = SCX_CALL_OP_RET((sch), op, locked_rq, task0, task1, ##args);	\
 	current->scx.kf_tasks[0] = NULL;					\
 	current->scx.kf_tasks[1] = NULL;					\
 	__ret;									\
 })
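
To see the failure mode outside the kernel, the following minimal standalone
userspace sketch mimics the per-CPU scx_locked_rq_state with a plain global.
It is not part of the patch: the CALL_OP_BUGGY/CALL_OP_FIXED macro names and
the toy dispatch functions are illustrative stand-ins, and the fixed variant
merely mirrors the snapshot/restore pattern the patch introduces. It also
shows why the parameter rename to locked_rq matters for the 'struct rq *'
local.

/* Illustrative sketch only - not kernel code. Build: cc -o demo demo.c */
#include <stdio.h>

struct rq { int cpu; };

/* stand-in for the per-CPU scx_locked_rq_state */
static struct rq *locked_rq_state;

static struct rq *scx_locked_rq(void) { return locked_rq_state; }
static void update_locked_rq(struct rq *rq) { locked_rq_state = rq; }

/* buggy variant: unconditionally clears the state on exit */
#define CALL_OP_BUGGY(fn, rq)						\
do {									\
	if (rq)								\
		update_locked_rq(rq);					\
	fn(rq);								\
	if (rq)								\
		update_locked_rq(NULL);					\
} while (0)

/*
 * Fixed variant: snapshot on entry, restore on exit. Note the parameter
 * must not be named "rq" - otherwise the "rq" token in "struct rq *"
 * below would be replaced by the macro argument during expansion.
 */
#define CALL_OP_FIXED(fn, locked_rq)					\
do {									\
	struct rq *__prev_locked_rq;					\
									\
	if (locked_rq) {						\
		__prev_locked_rq = scx_locked_rq();			\
		update_locked_rq(locked_rq);				\
	}								\
	fn(locked_rq);							\
	if (locked_rq)							\
		update_locked_rq(__prev_locked_rq);			\
} while (0)

static void child_dispatch(struct rq *rq) { (void)rq; }

static int use_fixed;

/* parent's ops.dispatch: nests another op call, then consults the state,
 * the way a kfunc like scx_bpf_cpuperf_set() would */
static void parent_dispatch(struct rq *rq)
{
	if (use_fixed)
		CALL_OP_FIXED(child_dispatch, rq);
	else
		CALL_OP_BUGGY(child_dispatch, rq);

	printf("after nested call, scx_locked_rq() = %p (rq = %p)\n",
	       (void *)scx_locked_rq(), (void *)rq);
}

int main(void)
{
	struct rq rq = { .cpu = 0 };
	struct rq *rqp = &rq;

	use_fixed = 0;
	CALL_OP_BUGGY(parent_dispatch, rqp);	/* prints NULL: inner exit
						 * clobbered the outer state */

	use_fixed = 1;
	CALL_OP_FIXED(parent_dispatch, rqp);	/* prints rq's address: inner
						 * exit restored the snapshot */
	return 0;
}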