sched_ext: Make watchdog sub-sched aware

Currently, the watchdog checks all tasks as if they are all on scx_root.
Move scx_watchdog_timeout inside scx_sched and make check_rq_for_timeouts()
use the timeout from the scx_sched associated with each task.
Add refresh_watchdog(), which sets the timer interval to half of the
shortest watchdog timeout among all scheds and arms or disarms the timer
as necessary. Every scx_sched instance gets equivalent or better detection
latency while sharing the same timer.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
This commit is contained in:
Tejun Heo 2026-03-06 07:58:04 -10:00
parent 34ecfb3551
commit cde94c032b
2 changed files with 56 additions and 25 deletions

View File

@ -59,11 +59,10 @@ static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0); static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/* /*
* The maximum amount of time in jiffies that a task may be runnable without * Watchdog interval. All scx_sched's share a single watchdog timer and the
* being scheduled on a CPU. If this timeout is exceeded, it will trigger * interval is half of the shortest sch->watchdog_timeout.
* scx_error().
*/ */
static unsigned long scx_watchdog_timeout; static unsigned long scx_watchdog_interval;
/* /*
* The last time the delayed work was run. This delayed work relies on * The last time the delayed work was run. This delayed work relies on
@ -3038,10 +3037,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
goto out_unlock; goto out_unlock;
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
struct scx_sched *sch = scx_task_sched(p);
unsigned long last_runnable = p->scx.runnable_at; unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies, if (unlikely(time_after(jiffies,
last_runnable + READ_ONCE(scx_watchdog_timeout)))) { last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@ -3058,6 +3058,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
static void scx_watchdog_workfn(struct work_struct *work) static void scx_watchdog_workfn(struct work_struct *work)
{ {
unsigned long intv;
int cpu; int cpu;
WRITE_ONCE(scx_watchdog_timestamp, jiffies); WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@ -3068,28 +3069,31 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched(); cond_resched();
} }
intv = READ_ONCE(scx_watchdog_interval);
if (intv < ULONG_MAX)
queue_delayed_work(system_unbound_wq, to_delayed_work(work), queue_delayed_work(system_unbound_wq, to_delayed_work(work),
READ_ONCE(scx_watchdog_timeout) / 2); intv);
} }
void scx_tick(struct rq *rq) void scx_tick(struct rq *rq)
{ {
struct scx_sched *sch; struct scx_sched *root;
unsigned long last_check; unsigned long last_check;
if (!scx_enabled()) if (!scx_enabled())
return; return;
sch = rcu_dereference_bh(scx_root); root = rcu_dereference_bh(scx_root);
if (unlikely(!sch)) if (unlikely(!root))
return; return;
last_check = READ_ONCE(scx_watchdog_timestamp); last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies, if (unlikely(time_after(jiffies,
last_check + READ_ONCE(scx_watchdog_timeout)))) { last_check + READ_ONCE(root->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check); u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
"watchdog failed to check in for %u.%03us", "watchdog failed to check in for %u.%03us",
dur_ms / 1000, dur_ms % 1000); dur_ms / 1000, dur_ms % 1000);
} }
@ -4760,6 +4764,26 @@ static void free_kick_syncs(void)
} }
} }
/*
 * Recompute the shared watchdog interval from every scheduler currently on
 * scx_sched_all, publish it in scx_watchdog_interval, and arm or disarm
 * scx_watchdog_work to match.
 *
 * The visible callers invoke this right after adding a scheduler to, or
 * removing one from, scx_sched_all.
 */
static void refresh_watchdog(void)
{
struct scx_sched *sch;
/* ULONG_MAX is the "no scheduler registered" sentinel; it survives an empty list */
unsigned long intv = ULONG_MAX;
/*
 * Take the shortest timeout and use its half for watchdog interval. The
 * result is clamped to at least 1 jiffy so that a timeout of 0 or 1, which
 * halves to 0, still yields a non-zero delay.
 */
rcu_read_lock();
list_for_each_entry_rcu(sch, &scx_sched_all, all)
intv = max(min(intv, sch->watchdog_timeout / 2), 1);
rcu_read_unlock();
/*
 * Reset the check-in timestamp before publishing the new interval.
 * scx_tick() compares this timestamp against the root scheduler's
 * timeout. NOTE(review): presumably the reset keeps a freshly (re)armed
 * watchdog from being reported as stalled - confirm.
 */
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
WRITE_ONCE(scx_watchdog_interval, intv);
/*
 * With at least one scheduler present, (re)arm the shared work with the new
 * delay. Otherwise cancel it; scx_watchdog_workfn() also declines to
 * re-queue itself once it reads an interval of ULONG_MAX.
 */
if (intv < ULONG_MAX)
mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
else
cancel_delayed_work_sync(&scx_watchdog_work);
}
#ifdef CONFIG_EXT_SUB_SCHED #ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq); static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
@ -4798,6 +4822,8 @@ static void scx_sub_disable(struct scx_sched *sch)
list_del_rcu(&sch->all); list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock); raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
mutex_unlock(&scx_enable_mutex); mutex_unlock(&scx_enable_mutex);
/* /*
@ -4932,12 +4958,12 @@ static void scx_root_disable(struct scx_sched *sch)
if (sch->ops.exit) if (sch->ops.exit)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei); SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
raw_spin_lock_irq(&scx_sched_lock); raw_spin_lock_irq(&scx_sched_lock);
list_del_rcu(&sch->all); list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock); raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
/* /*
* scx_root clearing must be inside cpus_read_lock(). See * scx_root clearing must be inside cpus_read_lock(). See
* handle_hotplug(). * handle_hotplug().
@ -5473,6 +5499,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
sch->ancestors[level] = sch; sch->ancestors[level] = sch;
sch->level = level; sch->level = level;
if (ops->timeout_ms)
sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
else
sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
sch->slice_dfl = SCX_SLICE_DFL; sch->slice_dfl = SCX_SLICE_DFL;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE); atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
init_irq_work(&sch->error_irq_work, scx_error_irq_workfn); init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
@ -5615,7 +5646,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
struct scx_sched *sch; struct scx_sched *sch;
struct scx_task_iter sti; struct scx_task_iter sti;
struct task_struct *p; struct task_struct *p;
unsigned long timeout;
int i, cpu, ret; int i, cpu, ret;
mutex_lock(&scx_enable_mutex); mutex_lock(&scx_enable_mutex);
@ -5667,6 +5697,8 @@ static void scx_root_enable_workfn(struct kthread_work *work)
list_add_tail_rcu(&sch->all, &scx_sched_all); list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock); raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
scx_idle_enable(ops); scx_idle_enable(ops);
if (sch->ops.init) { if (sch->ops.init) {
@ -5697,16 +5729,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (ret) if (ret)
goto err_disable; goto err_disable;
if (ops->timeout_ms)
timeout = msecs_to_jiffies(ops->timeout_ms);
else
timeout = SCX_WATCHDOG_MAX_TIMEOUT;
WRITE_ONCE(scx_watchdog_timeout, timeout);
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
READ_ONCE(scx_watchdog_timeout) / 2);
/* /*
* Once __scx_enabled is set, %current can be switched to SCX anytime. * Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace * This can lead to stalls as some BPF schedulers (e.g. userspace
@ -5928,6 +5950,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
list_add_tail_rcu(&sch->all, &scx_sched_all); list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock); raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
if (sch->level >= SCX_SUB_MAX_DEPTH) { if (sch->level >= SCX_SUB_MAX_DEPTH) {
scx_error(sch, "max nesting depth %d violated", scx_error(sch, "max nesting depth %d violated",
SCX_SUB_MAX_DEPTH); SCX_SUB_MAX_DEPTH);

View File

@ -1019,6 +1019,13 @@ struct scx_sched {
bool sub_attached; bool sub_attached;
#endif /* CONFIG_EXT_SUB_SCHED */ #endif /* CONFIG_EXT_SUB_SCHED */
/*
* The maximum amount of time in jiffies that a task may be runnable
* without being scheduled on a CPU. If this timeout is exceeded, it
* will trigger scx_error().
*/
unsigned long watchdog_timeout;
atomic_t exit_kind; atomic_t exit_kind;
struct scx_exit_info *exit_info; struct scx_exit_info *exit_info;