mirror of
https://github.com/torvalds/linux.git
synced 2026-05-30 18:13:41 +02:00
sched_ext: Implement hierarchical bypass mode
When a sub-scheduler enters bypass mode, its tasks must be scheduled by an ancestor to guarantee forward progress. Tasks from bypassing descendants are queued in the bypass DSQs of the nearest non-bypassing ancestor, or the root scheduler if all ancestors are bypassing. This requires coordination between bypassing schedulers and their hosts. Add bypass_enq_target_dsq() to find the correct bypass DSQ by walking up the hierarchy until reaching a non-bypassing ancestor. When a sub-scheduler starts bypassing, all its runnable tasks are re-enqueued after scx_bypassing() is set, ensuring proper migration to ancestor bypass DSQs. Update scx_dispatch_sched() to handle hosting bypassed descendants. When a scheduler is not bypassing but has bypassing descendants, it must schedule both its own tasks and bypassed descendant tasks. A simple policy is implemented where every Nth dispatch attempt (SCX_BYPASS_HOST_NTH=2) consumes from the bypass DSQ. A fallback consumption is also added at the end of dispatch to ensure bypassed tasks make progress even when normal scheduling is idle. Update enable_bypass_dsp() and disable_bypass_dsp() to increment bypass_dsp_enable_depth on both the bypassing scheduler and its parent host, ensuring both can detect that bypass dispatch is active through bypass_dsp_enabled(). Add SCX_EV_SUB_BYPASS_DISPATCH event counter to track scheduling of bypassed descendant tasks. v2: Fix comment typos (Andrea). Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Andrea Righi <arighi@nvidia.com>
This commit is contained in:
parent
aa2a0a1968
commit
025b1bd419
|
|
@ -357,6 +357,27 @@ static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
|
||||||
return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
|
return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu)
|
||||||
|
{
|
||||||
|
#ifdef CONFIG_EXT_SUB_SCHED
|
||||||
|
/*
|
||||||
|
* If @sch is a sub-sched which is bypassing, its tasks should go into
|
||||||
|
* the bypass DSQs of the nearest ancestor which is not bypassing. The
|
||||||
|
* not-bypassing ancestor is responsible for scheduling all tasks from
|
||||||
|
* bypassing sub-trees. If all ancestors including root are bypassing,
|
||||||
|
* all tasks should go to the root's bypass DSQs.
|
||||||
|
*
|
||||||
|
* Whenever a sched starts bypassing, all runnable tasks in its subtree
|
||||||
|
* are re-enqueued after scx_bypassing() is turned on, guaranteeing that
|
||||||
|
* all tasks are transferred to the right DSQs.
|
||||||
|
*/
|
||||||
|
while (scx_parent(sch) && scx_bypassing(sch, cpu))
|
||||||
|
sch = scx_parent(sch);
|
||||||
|
#endif /* CONFIG_EXT_SUB_SCHED */
|
||||||
|
|
||||||
|
return bypass_dsq(sch, cpu);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* bypass_dsp_enabled - Check if bypass dispatch path is enabled
|
* bypass_dsp_enabled - Check if bypass dispatch path is enabled
|
||||||
* @sch: scheduler to check
|
* @sch: scheduler to check
|
||||||
|
|
@ -1650,7 +1671,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
|
||||||
dsq = find_global_dsq(sch, p);
|
dsq = find_global_dsq(sch, p);
|
||||||
goto enqueue;
|
goto enqueue;
|
||||||
bypass:
|
bypass:
|
||||||
dsq = bypass_dsq(sch, task_cpu(p));
|
dsq = bypass_enq_target_dsq(sch, task_cpu(p));
|
||||||
goto enqueue;
|
goto enqueue;
|
||||||
|
|
||||||
enqueue:
|
enqueue:
|
||||||
|
|
@ -2420,8 +2441,33 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
|
||||||
if (consume_global_dsq(sch, rq))
|
if (consume_global_dsq(sch, rq))
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
if (bypass_dsp_enabled(sch) && scx_bypassing(sch, cpu))
|
if (bypass_dsp_enabled(sch)) {
|
||||||
return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
|
/* if @sch is bypassing, only the bypass DSQs are active */
|
||||||
|
if (scx_bypassing(sch, cpu))
|
||||||
|
return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
|
||||||
|
|
||||||
|
#ifdef CONFIG_EXT_SUB_SCHED
|
||||||
|
/*
|
||||||
|
* If @sch isn't bypassing but its children are, @sch is
|
||||||
|
* responsible for making forward progress for both its own
|
||||||
|
* tasks that aren't bypassing and the bypassing descendants'
|
||||||
|
* tasks. The following implements a simple built-in behavior -
|
||||||
|
* let each CPU try to run the bypass DSQ every Nth time.
|
||||||
|
*
|
||||||
|
* Later, if necessary, we can add an ops flag to suppress the
|
||||||
|
* auto-consumption and a kfunc to consume the bypass DSQ and,
|
||||||
|
* so that the BPF scheduler can fully control scheduling of
|
||||||
|
* bypassed tasks.
|
||||||
|
*/
|
||||||
|
struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
|
||||||
|
|
||||||
|
if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
|
||||||
|
consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu))) {
|
||||||
|
__scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
#endif /* CONFIG_EXT_SUB_SCHED */
|
||||||
|
}
|
||||||
|
|
||||||
if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
|
if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
|
||||||
return false;
|
return false;
|
||||||
|
|
@ -2467,6 +2513,14 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
|
||||||
}
|
}
|
||||||
} while (dspc->nr_tasks);
|
} while (dspc->nr_tasks);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Prevent the CPU from going idle while bypassed descendants have tasks
|
||||||
|
* queued. Without this fallback, bypassed tasks could stall if the host
|
||||||
|
* scheduler's ops.dispatch() doesn't yield any tasks.
|
||||||
|
*/
|
||||||
|
if (bypass_dsp_enabled(sch))
|
||||||
|
return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -4085,6 +4139,7 @@ static ssize_t scx_attr_events_show(struct kobject *kobj,
|
||||||
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
|
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
|
||||||
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
|
at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
|
||||||
at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
|
at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
|
||||||
|
at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH);
|
||||||
return at;
|
return at;
|
||||||
}
|
}
|
||||||
SCX_ATTR(events);
|
SCX_ATTR(events);
|
||||||
|
|
@ -4460,6 +4515,7 @@ static bool dec_bypass_depth(struct scx_sched *sch)
|
||||||
|
|
||||||
static void enable_bypass_dsp(struct scx_sched *sch)
|
static void enable_bypass_dsp(struct scx_sched *sch)
|
||||||
{
|
{
|
||||||
|
struct scx_sched *host = scx_parent(sch) ?: sch;
|
||||||
u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
|
u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
|
||||||
s32 ret;
|
s32 ret;
|
||||||
|
|
||||||
|
|
@ -4471,14 +4527,35 @@ static void enable_bypass_dsp(struct scx_sched *sch)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The LB timer will stop running if bypass_arm_depth is 0. Increment
|
* When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
|
||||||
* before starting the LB timer.
|
* the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
|
||||||
|
* called iff @sch is not already bypassed due to an ancestor bypassing,
|
||||||
|
* we can assume that the parent is not bypassing and thus will be the
|
||||||
|
* host of the bypass DSQs.
|
||||||
|
*
|
||||||
|
* While the situation may change in the future, the following
|
||||||
|
* guarantees that the nearest non-bypassing ancestor or root has bypass
|
||||||
|
* dispatch enabled while a descendant is bypassing, which is all that's
|
||||||
|
* required.
|
||||||
|
*
|
||||||
|
* bypass_dsp_enabled() test is used to determine whether to enter the
|
||||||
|
* bypass dispatch handling path from both bypassing and hosting scheds.
|
||||||
|
* Bump enable depth on both @sch and bypass dispatch host.
|
||||||
*/
|
*/
|
||||||
ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
|
ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
|
||||||
WARN_ON_ONCE(ret <= 0);
|
WARN_ON_ONCE(ret <= 0);
|
||||||
|
|
||||||
if (intv_us && !timer_pending(&sch->bypass_lb_timer))
|
if (host != sch) {
|
||||||
mod_timer(&sch->bypass_lb_timer,
|
ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
|
||||||
|
WARN_ON_ONCE(ret <= 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The LB timer will stop running if bypass dispatch is disabled. Start
|
||||||
|
* after enabling bypass dispatch.
|
||||||
|
*/
|
||||||
|
if (intv_us && !timer_pending(&host->bypass_lb_timer))
|
||||||
|
mod_timer(&host->bypass_lb_timer,
|
||||||
jiffies + usecs_to_jiffies(intv_us));
|
jiffies + usecs_to_jiffies(intv_us));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -4492,6 +4569,11 @@ static void disable_bypass_dsp(struct scx_sched *sch)
|
||||||
|
|
||||||
ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
|
ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
|
||||||
WARN_ON_ONCE(ret < 0);
|
WARN_ON_ONCE(ret < 0);
|
||||||
|
|
||||||
|
if (scx_parent(sch)) {
|
||||||
|
ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
|
||||||
|
WARN_ON_ONCE(ret < 0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -5266,6 +5348,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
|
||||||
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
|
scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
|
||||||
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
|
scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
|
||||||
scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
|
scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
|
||||||
|
scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH);
|
||||||
|
|
||||||
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
|
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
|
||||||
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
|
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
|
||||||
|
|
|
||||||
|
|
@ -24,6 +24,8 @@ enum scx_consts {
|
||||||
*/
|
*/
|
||||||
SCX_TASK_ITER_BATCH = 32,
|
SCX_TASK_ITER_BATCH = 32,
|
||||||
|
|
||||||
|
SCX_BYPASS_HOST_NTH = 2,
|
||||||
|
|
||||||
SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
|
SCX_BYPASS_LB_DFL_INTV_US = 500 * USEC_PER_MSEC,
|
||||||
SCX_BYPASS_LB_DONOR_PCT = 125,
|
SCX_BYPASS_LB_DONOR_PCT = 125,
|
||||||
SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
|
SCX_BYPASS_LB_MIN_DELTA_DIV = 4,
|
||||||
|
|
@ -923,6 +925,12 @@ struct scx_event_stats {
|
||||||
* scheduler.
|
* scheduler.
|
||||||
*/
|
*/
|
||||||
s64 SCX_EV_INSERT_NOT_OWNED;
|
s64 SCX_EV_INSERT_NOT_OWNED;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The number of times tasks from bypassing descendants are scheduled
|
||||||
|
* from sub_bypass_dsq's.
|
||||||
|
*/
|
||||||
|
s64 SCX_EV_SUB_BYPASS_DISPATCH;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum scx_sched_pcpu_flags {
|
enum scx_sched_pcpu_flags {
|
||||||
|
|
@ -940,6 +948,9 @@ struct scx_sched_pcpu {
|
||||||
struct scx_event_stats event_stats;
|
struct scx_event_stats event_stats;
|
||||||
|
|
||||||
struct scx_dispatch_q bypass_dsq;
|
struct scx_dispatch_q bypass_dsq;
|
||||||
|
#ifdef CONFIG_EXT_SUB_SCHED
|
||||||
|
u32 bypass_host_seq;
|
||||||
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
struct scx_sched {
|
struct scx_sched {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user