sched_ext: Make watchdog sub-sched aware

Currently, the watchdog checks all tasks as if they are all on scx_root.
Move scx_watchdog_timeout inside scx_sched and make check_rq_for_timeouts()
use the timeout from the scx_sched associated with each task.
Add refresh_watchdog(), which sets the timer interval to half of the
shortest watchdog timeout across all scheds and arms or disarms the shared
timer as necessary; e.g. with timeouts of 30s and 5s, the timer fires every
2.5s. Every scx_sched instance gets equivalent or better detection latency
while sharing a single timer.
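
The per-sched timeout itself comes from sched_ext_ops.timeout_ms, falling
back to SCX_WATCHDOG_MAX_TIMEOUT when left at zero. A minimal BPF-side
sketch of a scheduler opting into a 5s timeout; the "hypo" ops and name are
hypothetical, and this assumes the usual scx BPF headers and the
SCX_OPS_DEFINE compat macro:

    #include <scx/common.bpf.h>

    char _license[] SEC("license") = "GPL";

    /*
     * Hypothetical minimal scheduler: all callbacks are left at their
     * defaults (global FIFO behavior). Tasks owned by this sched are
     * checked against a 5s watchdog timeout; leaving timeout_ms at 0
     * would select SCX_WATCHDOG_MAX_TIMEOUT instead.
     */
    SCX_OPS_DEFINE(hypo_ops,
                   .timeout_ms = 5000U,
                   .name       = "hypo");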

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
commit cde94c032b (parent 34ecfb3551)
Author: Tejun Heo
Date:   2026-03-06 07:58:04 -10:00
2 files changed, 56 insertions(+), 25 deletions(-)

@@ -59,11 +59,10 @@ static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
/*
* The maximum amount of time in jiffies that a task may be runnable without
* being scheduled on a CPU. If this timeout is exceeded, it will trigger
* scx_error().
* Watchdog interval. All scx_sched's share a single watchdog timer and the
* interval is half of the shortest sch->watchdog_timeout.
*/
static unsigned long scx_watchdog_timeout;
static unsigned long scx_watchdog_interval;
/*
* The last time the delayed work was run. This delayed work relies on
@@ -3038,10 +3037,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
goto out_unlock;
list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
struct scx_sched *sch = scx_task_sched(p);
unsigned long last_runnable = p->scx.runnable_at;
if (unlikely(time_after(jiffies,
last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -3058,6 +3058,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
static void scx_watchdog_workfn(struct work_struct *work)
{
unsigned long intv;
int cpu;
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@@ -3068,28 +3069,31 @@ static void scx_watchdog_workfn(struct work_struct *work)
cond_resched();
}
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
READ_ONCE(scx_watchdog_timeout) / 2);
intv = READ_ONCE(scx_watchdog_interval);
if (intv < ULONG_MAX)
queue_delayed_work(system_unbound_wq, to_delayed_work(work),
intv);
}
void scx_tick(struct rq *rq)
{
struct scx_sched *sch;
struct scx_sched *root;
unsigned long last_check;
if (!scx_enabled())
return;
sch = rcu_dereference_bh(scx_root);
if (unlikely(!sch))
root = rcu_dereference_bh(scx_root);
if (unlikely(!root))
return;
last_check = READ_ONCE(scx_watchdog_timestamp);
if (unlikely(time_after(jiffies,
last_check + READ_ONCE(scx_watchdog_timeout)))) {
last_check + READ_ONCE(root->watchdog_timeout)))) {
u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
"watchdog failed to check in for %u.%03us",
dur_ms / 1000, dur_ms % 1000);
}
@@ -4760,6 +4764,26 @@ static void free_kick_syncs(void)
}
}
static void refresh_watchdog(void)
{
struct scx_sched *sch;
unsigned long intv = ULONG_MAX;
/* take the shortest timeout and use its half for watchdog interval */
rcu_read_lock();
list_for_each_entry_rcu(sch, &scx_sched_all, all)
intv = max(min(intv, sch->watchdog_timeout / 2), 1);
rcu_read_unlock();
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
WRITE_ONCE(scx_watchdog_interval, intv);
if (intv < ULONG_MAX)
mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
else
cancel_delayed_work_sync(&scx_watchdog_work);
}
#ifdef CONFIG_EXT_SUB_SCHED
static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
@@ -4798,6 +4822,8 @@ static void scx_sub_disable(struct scx_sched *sch)
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
mutex_unlock(&scx_enable_mutex);
/*
@@ -4932,12 +4958,12 @@ static void scx_root_disable(struct scx_sched *sch)
if (sch->ops.exit)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work);
raw_spin_lock_irq(&scx_sched_lock);
list_del_rcu(&sch->all);
raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
/*
* scx_root clearing must be inside cpus_read_lock(). See
* handle_hotplug().
@@ -5473,6 +5499,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
sch->ancestors[level] = sch;
sch->level = level;
if (ops->timeout_ms)
sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
else
sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
sch->slice_dfl = SCX_SLICE_DFL;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
@@ -5615,7 +5646,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
struct scx_sched *sch;
struct scx_task_iter sti;
struct task_struct *p;
unsigned long timeout;
int i, cpu, ret;
mutex_lock(&scx_enable_mutex);
@@ -5667,6 +5697,8 @@ static void scx_root_enable_workfn(struct kthread_work *work)
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
scx_idle_enable(ops);
if (sch->ops.init) {
@@ -5697,16 +5729,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
if (ret)
goto err_disable;
if (ops->timeout_ms)
timeout = msecs_to_jiffies(ops->timeout_ms);
else
timeout = SCX_WATCHDOG_MAX_TIMEOUT;
WRITE_ONCE(scx_watchdog_timeout, timeout);
WRITE_ONCE(scx_watchdog_timestamp, jiffies);
queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
READ_ONCE(scx_watchdog_timeout) / 2);
/*
* Once __scx_enabled is set, %current can be switched to SCX anytime.
* This can lead to stalls as some BPF schedulers (e.g. userspace
@@ -5928,6 +5950,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
list_add_tail_rcu(&sch->all, &scx_sched_all);
raw_spin_unlock_irq(&scx_sched_lock);
refresh_watchdog();
if (sch->level >= SCX_SUB_MAX_DEPTH) {
scx_error(sch, "max nesting depth %d violated",
SCX_SUB_MAX_DEPTH);

@@ -1019,6 +1019,13 @@ struct scx_sched {
bool sub_attached;
#endif /* CONFIG_EXT_SUB_SCHED */
/*
* The maximum amount of time in jiffies that a task may be runnable
* without being scheduled on a CPU. If this timeout is exceeded, it
* will trigger scx_error().
*/
unsigned long watchdog_timeout;
atomic_t exit_kind;
struct scx_exit_info *exit_info;