sched_ext: Use dynamic allocation for scx_sched

To prepare for supporting multiple schedulers, allocate scx_sched
dynamically. scx_sched->kobj is now an embedded field and the kobj's
lifetime determines the lifetime of the containing scx_sched.

- Enable path is updated so that kobj init and addition are performed later.

- scx_sched freeing is initiated in scx_kobj_release() and also goes through
  an rcu_work so that scx_root can be accessed from an unsynchronized path -
  scx_disable().

- sched_ext_ops->priv is added and used to point to scx_sched instance
  created for the ops instance. This is used by bpf_scx_unreg() to determine
  the scx_sched instance to disable and put.

No behavior changes intended.

v2: Andrea reported kernel oops due to bpf_scx_unreg() trying to deref NULL
    scx_root after scheduler init failure. sched_ext_ops->priv added so that
    bpf_scx_unreg() can always find the scx_sched instance to unregister
    even if it failed early during init.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Acked-by: Changwoo Min <changwoo@igalia.com>
This commit is contained in:
Tejun Heo 2025-04-29 08:40:10 -10:00
parent a77d10d032
commit 17108735b4

View File

@ -752,6 +752,9 @@ struct sched_ext_ops {
* BPF scheduler is enabled. * BPF scheduler is enabled.
*/ */
char name[SCX_OPS_NAME_LEN]; char name[SCX_OPS_NAME_LEN];
/* internal use only, must be NULL */
void *priv;
}; };
enum scx_opi { enum scx_opi {
@ -772,7 +775,8 @@ struct scx_sched {
atomic_t exit_kind; atomic_t exit_kind;
struct scx_exit_info *exit_info; struct scx_exit_info *exit_info;
struct kobject *kobj; struct kobject kobj;
struct rcu_work rcu_work;
}; };
enum scx_wake_flags { enum scx_wake_flags {
@ -933,11 +937,7 @@ enum scx_ops_state {
#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) #define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) #define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
static struct scx_sched __scx_root = { static struct scx_sched __rcu *scx_root;
.exit_kind = ATOMIC_INIT(SCX_EXIT_DONE),
};
static struct scx_sched *scx_root = &__scx_root;
/* /*
* During exit, a task may schedule after losing its PIDs. When disabling the * During exit, a task may schedule after losing its PIDs. When disabling the
@ -4417,9 +4417,23 @@ static const struct attribute_group scx_global_attr_group = {
.attrs = scx_global_attrs, .attrs = scx_global_attrs,
}; };
static void free_exit_info(struct scx_exit_info *ei);
/*
 * Deferred destructor for a scx_sched, queued by scx_kobj_release() via
 * queue_rcu_work(). Recovers the scx_sched that embeds @work's rcu_work,
 * releases its exit info and then frees the scx_sched itself.
 */
static void scx_sched_free_rcu_work(struct work_struct *work)
{
	struct scx_sched *sch = container_of(to_rcu_work(work),
					     struct scx_sched, rcu_work);

	free_exit_info(sch->exit_info);
	kfree(sch);
}
static void scx_kobj_release(struct kobject *kobj) static void scx_kobj_release(struct kobject *kobj)
{ {
kfree(kobj); struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
queue_rcu_work(system_unbound_wq, &sch->rcu_work);
} }
static ssize_t scx_attr_ops_show(struct kobject *kobj, static ssize_t scx_attr_ops_show(struct kobject *kobj,
@ -4709,14 +4723,15 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
static void scx_disable_workfn(struct kthread_work *work) static void scx_disable_workfn(struct kthread_work *work)
{ {
struct scx_exit_info *ei = scx_root->exit_info; struct scx_sched *sch = scx_root;
struct scx_exit_info *ei = sch->exit_info;
struct scx_task_iter sti; struct scx_task_iter sti;
struct task_struct *p; struct task_struct *p;
struct rhashtable_iter rht_iter; struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq; struct scx_dispatch_q *dsq;
int kind, cpu; int kind, cpu;
kind = atomic_read(&scx_root->exit_kind); kind = atomic_read(&sch->exit_kind);
while (true) { while (true) {
/* /*
* NONE indicates that a new scx_ops has been registered since * NONE indicates that a new scx_ops has been registered since
@ -4725,7 +4740,7 @@ static void scx_disable_workfn(struct kthread_work *work)
*/ */
if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
return; return;
if (atomic_try_cmpxchg(&scx_root->exit_kind, &kind, SCX_EXIT_DONE)) if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
break; break;
} }
ei->kind = kind; ei->kind = kind;
@ -4740,7 +4755,7 @@ static void scx_disable_workfn(struct kthread_work *work)
break; break;
case SCX_DISABLED: case SCX_DISABLED:
pr_warn("sched_ext: ops error detected without ops (%s)\n", pr_warn("sched_ext: ops error detected without ops (%s)\n",
scx_root->exit_info->msg); sch->exit_info->msg);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
goto done; goto done;
default: default:
@ -4807,41 +4822,43 @@ static void scx_disable_workfn(struct kthread_work *work)
/* no task is on scx, turn off all the switches and flush in-progress calls */ /* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_enabled); static_branch_disable(&__scx_enabled);
bitmap_zero(scx_root->has_op, SCX_OPI_END); bitmap_zero(sch->has_op, SCX_OPI_END);
scx_idle_disable(); scx_idle_disable();
synchronize_rcu(); synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) { if (ei->kind >= SCX_EXIT_ERROR) {
pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
scx_root->ops.name, ei->reason); sch->ops.name, ei->reason);
if (ei->msg[0] != '\0') if (ei->msg[0] != '\0')
pr_err("sched_ext: %s: %s\n", pr_err("sched_ext: %s: %s\n", sch->ops.name, ei->msg);
scx_root->ops.name, ei->msg);
#ifdef CONFIG_STACKTRACE #ifdef CONFIG_STACKTRACE
stack_trace_print(ei->bt, ei->bt_len, 2); stack_trace_print(ei->bt, ei->bt_len, 2);
#endif #endif
} else { } else {
pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", pr_info("sched_ext: BPF scheduler \"%s\" disabled (%s)\n",
scx_root->ops.name, ei->reason); sch->ops.name, ei->reason);
} }
if (scx_root->ops.exit) if (sch->ops.exit)
SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei); SCX_CALL_OP(SCX_KF_UNLOCKED, exit, NULL, ei);
cancel_delayed_work_sync(&scx_watchdog_work); cancel_delayed_work_sync(&scx_watchdog_work);
/* /*
* Delete the kobject from the hierarchy eagerly in addition to just * scx_root clearing must be inside cpus_read_lock(). See
* dropping a reference. Otherwise, if the object is deleted * handle_hotplug().
* asynchronously, sysfs could observe an object of the same name still
* in the hierarchy when another scheduler is loaded.
*/ */
kobject_del(scx_root->kobj); cpus_read_lock();
kobject_put(scx_root->kobj); RCU_INIT_POINTER(scx_root, NULL);
scx_root->kobj = NULL; cpus_read_unlock();
memset(&scx_root->ops, 0, sizeof(scx_root->ops)); /*
* Delete the kobject from the hierarchy synchronously. Otherwise, sysfs
* could observe an object of the same name still in the hierarchy when
* the next scheduler is loaded.
*/
kobject_del(&sch->kobj);
rhashtable_walk_enter(&dsq_hash, &rht_iter); rhashtable_walk_enter(&dsq_hash, &rht_iter);
do { do {
@ -4858,9 +4875,6 @@ static void scx_disable_workfn(struct kthread_work *work)
scx_dsp_ctx = NULL; scx_dsp_ctx = NULL;
scx_dsp_max_batch = 0; scx_dsp_max_batch = 0;
free_exit_info(scx_root->exit_info);
scx_root->exit_info = NULL;
mutex_unlock(&scx_enable_mutex); mutex_unlock(&scx_enable_mutex);
WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
@ -4885,13 +4899,18 @@ static void schedule_scx_disable_work(void)
static void scx_disable(enum scx_exit_kind kind) static void scx_disable(enum scx_exit_kind kind)
{ {
int none = SCX_EXIT_NONE; int none = SCX_EXIT_NONE;
struct scx_sched *sch;
if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
kind = SCX_EXIT_ERROR; kind = SCX_EXIT_ERROR;
atomic_try_cmpxchg(&scx_root->exit_kind, &none, kind); rcu_read_lock();
sch = rcu_dereference(scx_root);
if (sch) {
atomic_try_cmpxchg(&sch->exit_kind, &none, kind);
schedule_scx_disable_work(); schedule_scx_disable_work();
}
rcu_read_unlock();
} }
static void dump_newline(struct seq_buf *s) static void dump_newline(struct seq_buf *s)
@ -5288,6 +5307,7 @@ static int validate_ops(const struct sched_ext_ops *ops)
static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
{ {
struct scx_sched *sch;
struct scx_task_iter sti; struct scx_task_iter sti;
struct task_struct *p; struct task_struct *p;
unsigned long timeout; unsigned long timeout;
@ -5351,33 +5371,33 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
goto err_unlock; goto err_unlock;
} }
scx_root->kobj = kzalloc(sizeof(*scx_root->kobj), GFP_KERNEL); sch = kzalloc(sizeof(*sch), GFP_KERNEL);
if (!scx_root->kobj) { if (!sch) {
ret = -ENOMEM; ret = -ENOMEM;
goto err_unlock; goto err_unlock;
} }
scx_root->kobj->kset = scx_kset; sch->exit_info = alloc_exit_info(ops->exit_dump_len);
ret = kobject_init_and_add(scx_root->kobj, &scx_ktype, NULL, "root"); if (!sch->exit_info) {
if (ret < 0)
goto err;
scx_root->exit_info = alloc_exit_info(ops->exit_dump_len);
if (!scx_root->exit_info) {
ret = -ENOMEM; ret = -ENOMEM;
goto err_del; goto err_free;
} }
sch->kobj.kset = scx_kset;
ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
if (ret < 0)
goto err_free;
atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
sch->ops = *ops;
ops->priv = sch;
/* /*
* Set scx_ops, transition to ENABLING and clear exit info to arm the * Transition to ENABLING and clear exit info to arm the disable path.
* disable path. Failure triggers full disabling from here on. * Failure triggers full disabling from here on.
*/ */
scx_root->ops = *ops;
WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED); WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
WARN_ON_ONCE(scx_root);
atomic_set(&scx_root->exit_kind, SCX_EXIT_NONE);
scx_root->warned_zero_slice = false;
atomic_long_set(&scx_nr_rejected, 0); atomic_long_set(&scx_nr_rejected, 0);
@ -5390,9 +5410,15 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
*/ */
cpus_read_lock(); cpus_read_lock();
/*
* Make the scheduler instance visible. Must be inside cpus_read_lock().
* See handle_hotplug().
*/
rcu_assign_pointer(scx_root, sch);
scx_idle_enable(ops); scx_idle_enable(ops);
if (scx_root->ops.init) { if (sch->ops.init) {
ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL); ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init, NULL);
if (ret) { if (ret) {
ret = ops_sanitize_err("init", ret); ret = ops_sanitize_err("init", ret);
@ -5404,7 +5430,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
if (((void (**)(void))ops)[i]) if (((void (**)(void))ops)[i])
set_bit(i, scx_root->has_op); set_bit(i, sch->has_op);
check_hotplug_seq(ops); check_hotplug_seq(ops);
scx_idle_update_selcpu_topology(ops); scx_idle_update_selcpu_topology(ops);
@ -5445,10 +5471,10 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
if (((void (**)(void))ops)[i]) if (((void (**)(void))ops)[i])
set_bit(i, scx_root->has_op); set_bit(i, sch->has_op);
if (scx_root->ops.cpu_acquire || scx_root->ops.cpu_release) if (sch->ops.cpu_acquire || sch->ops.cpu_release)
scx_root->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT; sch->ops.flags |= SCX_OPS_HAS_CPU_PREEMPT;
/* /*
* Lock out forks, cgroup on/offlining and moves before opening the * Lock out forks, cgroup on/offlining and moves before opening the
@ -5547,7 +5573,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
scx_bypass(false); scx_bypass(false);
if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
WARN_ON_ONCE(atomic_read(&scx_root->exit_kind) == SCX_EXIT_NONE); WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
goto err_disable; goto err_disable;
} }
@ -5555,23 +5581,18 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable(&__scx_switched_all); static_branch_enable(&__scx_switched_all);
pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n", pr_info("sched_ext: BPF scheduler \"%s\" enabled%s\n",
scx_root->ops.name, scx_switched_all() ? "" : " (partial)"); sch->ops.name, scx_switched_all() ? "" : " (partial)");
kobject_uevent(scx_root->kobj, KOBJ_ADD); kobject_uevent(&sch->kobj, KOBJ_ADD);
mutex_unlock(&scx_enable_mutex); mutex_unlock(&scx_enable_mutex);
atomic_long_inc(&scx_enable_seq); atomic_long_inc(&scx_enable_seq);
return 0; return 0;
err_del: err_free:
kobject_del(scx_root->kobj); if (sch->exit_info)
err: free_exit_info(sch->exit_info);
kobject_put(scx_root->kobj); kfree(sch);
scx_root->kobj = NULL;
if (scx_root->exit_info) {
free_exit_info(scx_root->exit_info);
scx_root->exit_info = NULL;
}
err_unlock: err_unlock:
mutex_unlock(&scx_enable_mutex); mutex_unlock(&scx_enable_mutex);
return ret; return ret;
@ -5589,7 +5610,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
* is notified through ops.exit() with all the details. * is notified through ops.exit() with all the details.
* *
* Flush scx_disable_work to ensure that error is reported before init * Flush scx_disable_work to ensure that error is reported before init
* completion. * completion. sch's base reference will be put by bpf_scx_unreg().
*/ */
scx_error("scx_enable() failed (%d)", ret); scx_error("scx_enable() failed (%d)", ret);
kthread_flush_work(&scx_disable_work); kthread_flush_work(&scx_disable_work);
@ -5741,8 +5762,12 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
static void bpf_scx_unreg(void *kdata, struct bpf_link *link) static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
{ {
struct sched_ext_ops *ops = kdata;
struct scx_sched *sch = ops->priv;
scx_disable(SCX_EXIT_UNREG); scx_disable(SCX_EXIT_UNREG);
kthread_flush_work(&scx_disable_work); kthread_flush_work(&scx_disable_work);
kobject_put(&sch->kobj);
} }
static int bpf_scx_init(struct btf *btf) static int bpf_scx_init(struct btf *btf)