From f4fa7c25f632cd925352b4d46f245653a23b1d1a Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 29 Oct 2025 14:08:43 +0100 Subject: [PATCH 1/5] sched_ext: Fix use of uninitialized variable in scx_bpf_cpuperf_set() scx_bpf_cpuperf_set() has a typo where it dereferences the local variable @sch, instead of the global @scx_root pointer. Fix by dereferencing the correct variable. Fixes: 956f2b11a8a4f ("sched_ext: Drop kf_cpu_valid()") Signed-off-by: Andrea Righi Reviewed-by: Christian Loehle Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ecb251e883ea..1a019a7728fb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6401,7 +6401,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) guard(rcu)(); - sch = rcu_dereference(sch); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) return; From 5f02151c411dda46efcc5dc57b0845efcdcfc26d Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 12 Nov 2025 15:33:28 +0800 Subject: [PATCH 2/5] sched_ext: Fix unsafe locking in the scx_dump_state() For built with CONFIG_PREEMPT_RT=y kernels, the dump_lock will be converted sleepable spinlock and not disable-irq, so the following scenarios occur: inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. irq_work/0/27 [HC0[0]:SC0[0]:HE1:SE1] takes: (&rq->__lock){?...}-{2:2}, at: raw_spin_rq_lock_nested+0x2b/0x40 {IN-HARDIRQ-W} state was registered at: lock_acquire+0x1e1/0x510 _raw_spin_lock_nested+0x42/0x80 raw_spin_rq_lock_nested+0x2b/0x40 sched_tick+0xae/0x7b0 update_process_times+0x14c/0x1b0 tick_periodic+0x62/0x1f0 tick_handle_periodic+0x48/0xf0 timer_interrupt+0x55/0x80 __handle_irq_event_percpu+0x20a/0x5c0 handle_irq_event_percpu+0x18/0xc0 handle_irq_event+0xb5/0x150 handle_level_irq+0x220/0x460 __common_interrupt+0xa2/0x1e0 common_interrupt+0xb0/0xd0 asm_common_interrupt+0x2b/0x40 _raw_spin_unlock_irqrestore+0x45/0x80 __setup_irq+0xc34/0x1a30 request_threaded_irq+0x214/0x2f0 hpet_time_init+0x3e/0x60 x86_late_time_init+0x5b/0xb0 start_kernel+0x308/0x410 x86_64_start_reservations+0x1c/0x30 x86_64_start_kernel+0x96/0xa0 common_startup_64+0x13e/0x148 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&rq->__lock); lock(&rq->__lock); *** DEADLOCK *** stack backtrace: CPU: 0 UID: 0 PID: 27 Comm: irq_work/0 Call Trace: dump_stack_lvl+0x8c/0xd0 dump_stack+0x14/0x20 print_usage_bug+0x42e/0x690 mark_lock.part.44+0x867/0xa70 ? __pfx_mark_lock.part.44+0x10/0x10 ? string_nocheck+0x19c/0x310 ? number+0x739/0x9f0 ? __pfx_string_nocheck+0x10/0x10 ? __pfx_check_pointer+0x10/0x10 ? kvm_sched_clock_read+0x15/0x30 ? sched_clock_noinstr+0xd/0x20 ? local_clock_noinstr+0x1c/0xe0 __lock_acquire+0xc4b/0x62b0 ? __pfx_format_decode+0x10/0x10 ? __pfx_string+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 ? __pfx_vsnprintf+0x10/0x10 lock_acquire+0x1e1/0x510 ? raw_spin_rq_lock_nested+0x2b/0x40 ? __pfx_lock_acquire+0x10/0x10 ? dump_line+0x12e/0x270 ? raw_spin_rq_lock_nested+0x20/0x40 _raw_spin_lock_nested+0x42/0x80 ? raw_spin_rq_lock_nested+0x2b/0x40 raw_spin_rq_lock_nested+0x2b/0x40 scx_dump_state+0x3b3/0x1270 ? finish_task_switch+0x27e/0x840 scx_ops_error_irq_workfn+0x67/0x80 irq_work_single+0x113/0x260 irq_work_run_list.part.3+0x44/0x70 run_irq_workd+0x6b/0x90 ? __pfx_run_irq_workd+0x10/0x10 smpboot_thread_fn+0x529/0x870 ? __pfx_smpboot_thread_fn+0x10/0x10 kthread+0x305/0x3f0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x40/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 This commit therefore use rq_lock_irqsave/irqrestore() to replace rq_lock/unlock() in the scx_dump_state(). Fixes: 07814a9439a3 ("sched_ext: Print debug dump after an error exit") Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1a019a7728fb..184d562bda5e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4276,7 +4276,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) size_t avail, used; bool idle; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; @@ -4345,7 +4345,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) scx_dump_task(&s, &dctx, p, ' '); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); } dump_newline(&s); From c87488a12393a23f8a1b9850b989b386c58cac3f Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Wed, 12 Nov 2025 08:42:02 -1000 Subject: [PATCH 3/5] sched/ext: convert scx_tasks_lock to raw spinlock Update scx_task_locks so that it's safe to lock/unlock in a non-sleepable context in PREEMPT_RT kernels. scx_task_locks is (non-raw) spinlock used to protect the list of tasks under SCX. This list is updated during from finish_task_switch(), which cannot sleep. Regular spinlocks can be locked in such a context in non-RT kernels, but are sleepable under when CONFIG_PREEMPT_RT=y. Convert scx_task_locks into a raw spinlock, which is not sleepable even on RT kernels. Sample backtrace: dump_stack_lvl+0x83/0xa0 __might_resched+0x14a/0x200 rt_spin_lock+0x61/0x1c0 ? sched_ext_dead+0x2d/0xf0 ? lock_release+0xc6/0x280 sched_ext_dead+0x2d/0xf0 ? srso_alias_return_thunk+0x5/0xfbef5 finish_task_switch.isra.0+0x254/0x360 __schedule+0x584/0x11d0 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? tick_nohz_idle_exit+0x7e/0x120 schedule_idle+0x23/0x40 cpu_startup_entry+0x29/0x30 start_secondary+0xf8/0x100 common_startup_64+0x13e/0x148 Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 184d562bda5e..03d05ea20006 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root; * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free. */ -static DEFINE_SPINLOCK(scx_tasks_lock); +static DEFINE_RAW_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ @@ -476,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter) BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); @@ -507,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter) __scx_task_iter_rq_unlock(iter); if (iter->list_locked) { iter->list_locked = false; - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); } } static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { if (!iter->list_locked) { - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->list_locked = true; } } @@ -2940,9 +2940,9 @@ void scx_post_fork(struct task_struct *p) } } - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); list_add_tail(&p->scx.tasks_node, &scx_tasks); - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); percpu_up_read(&scx_fork_rwsem); } @@ -2966,9 +2966,9 @@ void sched_ext_free(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&scx_tasks_lock, flags); + raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); - spin_unlock_irqrestore(&scx_tasks_lock, flags); + raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED From a257e974210320ede524f340ffe16bf4bf0dda1e Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 13 Nov 2025 19:43:55 +0800 Subject: [PATCH 4/5] sched_ext: Fix possible deadlock in the deferred_irq_workfn() For PREEMPT_RT=y kernels, the deferred_irq_workfn() is executed in the per-cpu irq_work/* task context and not disable-irq, if the rq returned by container_of() is current CPU's rq, the following scenarios may occur: lock(&rq->__lock); lock(&rq->__lock); This commit use IRQ_WORK_INIT_HARD() to replace init_irq_work() to initialize rq->scx.deferred_irq_work, make the deferred_irq_workfn() is always invoked in hard-irq context. Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 03d05ea20006..07399210ac2d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5321,7 +5321,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); - init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); + rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); if (cpu_online(cpu)) From 36c6f3c03d104faf1aa90922f2310549c175420f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 17 Nov 2025 20:53:10 +0800 Subject: [PATCH 5/5] sched_ext: Use IRQ_WORK_INIT_HARD() to initialize rq->scx.kick_cpus_irq_work For PREEMPT_RT kernels, the kick_cpus_irq_workfn() be invoked in the per-cpu irq_work/* task context and there is no rcu-read critical section to protect. this commit therefore use IRQ_WORK_INIT_HARD() to initialize the per-cpu rq->scx.kick_cpus_irq_work in the init_sched_ext_class(). Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 07399210ac2d..7aae1d0ce37e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5322,7 +5322,7 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); - init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); if (cpu_online(cpu)) cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE;