mirror of
https://github.com/torvalds/linux.git
synced 2026-06-03 03:53:37 +02:00
cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated
A chain of commits going back to v7.0 reworked rmdir to satisfy the controller invariant that a subsystem's ->css_offline() must not run while tasks are still doing kernel-side work in the cgroup. [1]d245698d72("cgroup: Defer task cgroup unlink until after the task is done switching out") [2]a72f73c4dd("cgroup: Don't expose dead tasks in cgroup") [3]1b164b876c("cgroup: Wait for dying tasks to leave on rmdir") [4]4c56a8ac68("cgroup: Fix cgroup_drain_dying() testing the wrong condition") [5]13e786b64b("cgroup: Increment nr_dying_subsys_* from rmdir context") [1] moved task cset unlink from do_exit() to finish_task_switch() so a task's cset link drops only after the task has fully stopped scheduling. That made tasks past exit_signals() linger on cset->tasks until their final context switch, which led to a series of problems as what userspace expected to see after rmdir diverged from what the kernel needs to wait for. [2]-[5] tried to bridge that divergence: [2] filtered the exiting tasks from cgroup.procs; [3] had rmdir(2) sleep in TASK_UNINTERRUPTIBLE for them; [4] fixed the wait's condition; [5] made nr_dying_subsys_* visible synchronously. The cgroup_drain_dying() wait in [3] turned out to be a dead end. When the rmdir caller is also the reaper of a zombie that pins a pidns teardown (e.g. host PID 1 systemd reaping orphan pids that were re-parented to it during the same teardown), rmdir blocks in TASK_UNINTERRUPTIBLE waiting for those pids to free, the pids can't free because PID 1 is the reaper and it's stuck in rmdir, and the system A-A deadlocks. No internal lock ordering breaks this; the wait itself is the bug. The css killing side that drove the original reorder, however, can be made cleanly asynchronous: ->css_offline() is already async, run from css_killed_work_fn() driven by percpu_ref_kill_and_confirm(). The fix is to make that chain start only after all tasks have left the cgroup. rmdir's user-visible side then returns as soon as cgroup.procs and friends are empty, while ->css_offline() still runs only after the cgroup is fully drained. Verified by the original reproducer (pidns teardown + zombie reaper, runs under vng) which hangs vanilla and succeeds here, and by per-commit deterministic repros for [2], [3], [4], [5] with a boot parameter that widens the post-exit_signals() window so each state is reliably reachable. Some stress tests on top of that. cgroup_apply_control_disable() has the same shape of pre-existing race: when a controller is disabled via subtree_control, kill_css() ran synchronously while tasks past exit_signals() could still be linked to the cgroup's csets, and ->css_offline() could fire before they drained. This patch preserves the existing synchronous behavior at that call site (kill_css_sync() + kill_css_finish() back-to-back) and a follow-up patch will defer kill_css_finish() there using a per-css trigger. This seems like the right approach and I don't see problems with it. The changes are somewhat invasive but not excessively so, so backporting to -stable should be okay. If something does turn out to be wrong, the fallback is to revert the entire chain ([1]-[5]) and rework in the development branch instead. v2: Pin cgrp across the deferred destroy work with explicit cgroup_get()/cgroup_put() around queue_work() and the work_fn. v1 wasn't actually broken (ordered cgroup_offline_wq + queue_work order in cgroup_task_dead() saved it) but the explicit ref removes the dependency on those non-obvious invariants. Also note the pre-existing cgroup_apply_control_disable() race in the description; a follow-up will defer kill_css_finish() there. Fixes:1b164b876c("cgroup: Wait for dying tasks to leave on rmdir") Cc: stable@vger.kernel.org # v7.0+ Reported-and-tested-by: Martin Pitt <martin@piware.de> Link: https://lore.kernel.org/all/afHNg2VX2jy9bW7y@piware.de/ Link: https://lore.kernel.org/all/35e0670adb4abeab13da2c321582af9f@kernel.org/ Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
This commit is contained in:
parent
981cd33861
commit
93618edf75
|
|
@ -611,8 +611,8 @@ struct cgroup {
|
|||
/* used to wait for offlining of csses */
|
||||
wait_queue_head_t offline_waitq;
|
||||
|
||||
/* used by cgroup_rmdir() to wait for dying tasks to leave */
|
||||
wait_queue_head_t dying_populated_waitq;
|
||||
/* defers killing csses after removal until cgroup is depopulated */
|
||||
struct work_struct finish_destroy_work;
|
||||
|
||||
/* used to schedule release agent */
|
||||
struct work_struct release_agent_work;
|
||||
|
|
|
|||
|
|
@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
|
|||
static void css_task_iter_skip(struct css_task_iter *it,
|
||||
struct task_struct *task);
|
||||
static int cgroup_destroy_locked(struct cgroup *cgrp);
|
||||
static void cgroup_finish_destroy(struct cgroup *cgrp);
|
||||
static void kill_css_sync(struct cgroup_subsys_state *css);
|
||||
static void kill_css_finish(struct cgroup_subsys_state *css);
|
||||
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
|
||||
struct cgroup_subsys *ss);
|
||||
static void css_release(struct percpu_ref *ref);
|
||||
static void kill_css(struct cgroup_subsys_state *css);
|
||||
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
|
||||
struct cgroup *cgrp, struct cftype cfts[],
|
||||
bool is_add);
|
||||
|
|
@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
|
|||
if (was_populated == cgroup_is_populated(cgrp))
|
||||
break;
|
||||
|
||||
/*
|
||||
* Subtree just emptied below an offlined cgrp. Fire deferred
|
||||
* destroy. The transition is one-shot.
|
||||
*/
|
||||
if (was_populated && !css_is_online(&cgrp->self)) {
|
||||
cgroup_get(cgrp);
|
||||
WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
|
||||
&cgrp->finish_destroy_work));
|
||||
}
|
||||
|
||||
cgroup1_check_for_release(cgrp);
|
||||
TRACE_CGROUP_PATH(notify_populated, cgrp,
|
||||
cgroup_is_populated(cgrp));
|
||||
|
|
@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static void cgroup_finish_destroy_work_fn(struct work_struct *work)
|
||||
{
|
||||
struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
|
||||
|
||||
cgroup_lock();
|
||||
cgroup_finish_destroy(cgrp);
|
||||
cgroup_unlock();
|
||||
cgroup_put(cgrp);
|
||||
}
|
||||
|
||||
static void init_cgroup_housekeeping(struct cgroup *cgrp)
|
||||
{
|
||||
struct cgroup_subsys *ss;
|
||||
|
|
@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
|
|||
#endif
|
||||
|
||||
init_waitqueue_head(&cgrp->offline_waitq);
|
||||
init_waitqueue_head(&cgrp->dying_populated_waitq);
|
||||
INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
|
||||
INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
|
||||
}
|
||||
|
||||
|
|
@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
|
|||
|
||||
if (css->parent &&
|
||||
!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
|
||||
kill_css(css);
|
||||
kill_css_sync(css);
|
||||
kill_css_finish(css);
|
||||
} else if (!css_visible(css)) {
|
||||
css_clear_dir(css);
|
||||
if (ss->css_reset)
|
||||
|
|
@ -5514,7 +5537,7 @@ static struct cftype cgroup_psi_files[] = {
|
|||
* css destruction is four-stage process.
|
||||
*
|
||||
* 1. Destruction starts. Killing of the percpu_ref is initiated.
|
||||
* Implemented in kill_css().
|
||||
* Implemented in kill_css_finish().
|
||||
*
|
||||
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
|
||||
* and thus css_tryget_online() is guaranteed to fail, the css can be
|
||||
|
|
@ -5993,7 +6016,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
|
|||
/*
|
||||
* This is called when the refcnt of a css is confirmed to be killed.
|
||||
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
|
||||
* initiate destruction and put the css ref from kill_css().
|
||||
* initiate destruction and put the css ref from kill_css_finish().
|
||||
*/
|
||||
static void css_killed_work_fn(struct work_struct *work)
|
||||
{
|
||||
|
|
@ -6025,15 +6048,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
|
|||
}
|
||||
|
||||
/**
|
||||
* kill_css - destroy a css
|
||||
* @css: css to destroy
|
||||
* kill_css_sync - synchronous half of css teardown
|
||||
* @css: css being killed
|
||||
*
|
||||
* This function initiates destruction of @css by removing cgroup interface
|
||||
* files and putting its base reference. ->css_offline() will be invoked
|
||||
* asynchronously once css_tryget_online() is guaranteed to fail and when
|
||||
* the reference count reaches zero, @css will be released.
|
||||
* See cgroup_destroy_locked().
|
||||
*/
|
||||
static void kill_css(struct cgroup_subsys_state *css)
|
||||
static void kill_css_sync(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct cgroup_subsys *ss = css->ss;
|
||||
|
||||
|
|
@ -6056,24 +6076,6 @@ static void kill_css(struct cgroup_subsys_state *css)
|
|||
*/
|
||||
css_clear_dir(css);
|
||||
|
||||
/*
|
||||
* Killing would put the base ref, but we need to keep it alive
|
||||
* until after ->css_offline().
|
||||
*/
|
||||
css_get(css);
|
||||
|
||||
/*
|
||||
* cgroup core guarantees that, by the time ->css_offline() is
|
||||
* invoked, no new css reference will be given out via
|
||||
* css_tryget_online(). We can't simply call percpu_ref_kill() and
|
||||
* proceed to offlining css's because percpu_ref_kill() doesn't
|
||||
* guarantee that the ref is seen as killed on all CPUs on return.
|
||||
*
|
||||
* Use percpu_ref_kill_and_confirm() to get notifications as each
|
||||
* css is confirmed to be seen as killed on all CPUs.
|
||||
*/
|
||||
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
|
||||
|
||||
css->cgroup->nr_dying_subsys[ss->id]++;
|
||||
/*
|
||||
* Parent css and cgroup cannot be freed until after the freeing
|
||||
|
|
@ -6086,44 +6088,88 @@ static void kill_css(struct cgroup_subsys_state *css)
|
|||
}
|
||||
|
||||
/**
|
||||
* cgroup_destroy_locked - the first stage of cgroup destruction
|
||||
* kill_css_finish - deferred half of css teardown
|
||||
* @css: css being killed
|
||||
*
|
||||
* See cgroup_destroy_locked().
|
||||
*/
|
||||
static void kill_css_finish(struct cgroup_subsys_state *css)
|
||||
{
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
|
||||
/*
|
||||
* Skip on re-entry: cgroup_apply_control_disable() may have killed @css
|
||||
* earlier. cgroup_destroy_locked() can still walk it because
|
||||
* offline_css() (which NULLs cgrp->subsys[ssid]) runs async.
|
||||
*/
|
||||
if (percpu_ref_is_dying(&css->refcnt))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Killing would put the base ref, but we need to keep it alive until
|
||||
* after ->css_offline().
|
||||
*/
|
||||
css_get(css);
|
||||
|
||||
/*
|
||||
* cgroup core guarantees that, by the time ->css_offline() is invoked,
|
||||
* no new css reference will be given out via css_tryget_online(). We
|
||||
* can't simply call percpu_ref_kill() and proceed to offlining css's
|
||||
* because percpu_ref_kill() doesn't guarantee that the ref is seen as
|
||||
* killed on all CPUs on return.
|
||||
*
|
||||
* Use percpu_ref_kill_and_confirm() to get notifications as each css is
|
||||
* confirmed to be seen as killed on all CPUs.
|
||||
*/
|
||||
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
|
||||
}
|
||||
|
||||
/**
|
||||
* cgroup_destroy_locked - destroy @cgrp (called on rmdir)
|
||||
* @cgrp: cgroup to be destroyed
|
||||
*
|
||||
* css's make use of percpu refcnts whose killing latency shouldn't be
|
||||
* exposed to userland and are RCU protected. Also, cgroup core needs to
|
||||
* guarantee that css_tryget_online() won't succeed by the time
|
||||
* ->css_offline() is invoked. To satisfy all the requirements,
|
||||
* destruction is implemented in the following two steps.
|
||||
* Tear down @cgrp on behalf of rmdir. Constraints:
|
||||
*
|
||||
* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
|
||||
* userland visible parts and start killing the percpu refcnts of
|
||||
* css's. Set up so that the next stage will be kicked off once all
|
||||
* the percpu refcnts are confirmed to be killed.
|
||||
* - Userspace: rmdir must succeed when cgroup.procs and friends are empty.
|
||||
*
|
||||
* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
|
||||
* rest of destruction. Once all cgroup references are gone, the
|
||||
* cgroup is RCU-freed.
|
||||
* - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's
|
||||
* subtree is still doing kernel work. A task hidden from cgroup.procs (past
|
||||
* exit_signals() with signal->live cleared) can still schedule, allocate, and
|
||||
* consume resources until its final context switch. Dying descendants in the
|
||||
* subtree can host such tasks too.
|
||||
*
|
||||
* This function implements s1. After this step, @cgrp is gone as far as
|
||||
* the userland is concerned and a new cgroup with the same name may be
|
||||
* created. As cgroup doesn't care about the names internally, this
|
||||
* doesn't cause any problem.
|
||||
* - Kernel: css_tryget_online() must fail by the time ->css_offline() runs.
|
||||
*
|
||||
* The destruction runs in three parts:
|
||||
*
|
||||
* - This function: synchronous user-visible state teardown plus kill_css_sync()
|
||||
* on each subsystem css.
|
||||
*
|
||||
* - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
|
||||
* each subsystem css. Fires once @cgrp's subtree is fully drained, either
|
||||
* inline here or from cgroup_update_populated().
|
||||
*
|
||||
* - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
|
||||
* ->css_offline() -> release/free.
|
||||
*
|
||||
* Return 0 on success, -EBUSY if a userspace-visible task or an online child
|
||||
* remains.
|
||||
*/
|
||||
static int cgroup_destroy_locked(struct cgroup *cgrp)
|
||||
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
|
||||
{
|
||||
struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
|
||||
struct cgroup_subsys_state *css;
|
||||
struct cgrp_cset_link *link;
|
||||
struct css_task_iter it;
|
||||
struct task_struct *task;
|
||||
int ssid, ret;
|
||||
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
|
||||
/*
|
||||
* Only migration can raise populated from zero and we're already
|
||||
* holding cgroup_mutex.
|
||||
*/
|
||||
if (cgroup_is_populated(cgrp))
|
||||
css_task_iter_start(&cgrp->self, 0, &it);
|
||||
task = css_task_iter_next(&it);
|
||||
css_task_iter_end(&it);
|
||||
if (task)
|
||||
return -EBUSY;
|
||||
|
||||
/*
|
||||
|
|
@ -6147,9 +6193,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
|||
link->cset->dead = true;
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
|
||||
/* initiate massacre of all css's */
|
||||
for_each_css(css, ssid, cgrp)
|
||||
kill_css(css);
|
||||
kill_css_sync(css);
|
||||
|
||||
/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
|
||||
css_clear_dir(&cgrp->self);
|
||||
|
|
@ -6180,79 +6225,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
|
|||
/* put the base reference */
|
||||
percpu_ref_kill(&cgrp->self.refcnt);
|
||||
|
||||
if (!cgroup_is_populated(cgrp))
|
||||
cgroup_finish_destroy(cgrp);
|
||||
|
||||
return 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* cgroup_drain_dying - wait for dying tasks to leave before rmdir
|
||||
* @cgrp: the cgroup being removed
|
||||
* cgroup_finish_destroy - deferred half of @cgrp destruction
|
||||
* @cgrp: cgroup whose subtree just became empty
|
||||
*
|
||||
* cgroup.procs and cgroup.threads use css_task_iter which filters out
|
||||
* PF_EXITING tasks so that userspace doesn't see tasks that have already been
|
||||
* reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
|
||||
* cgroup has non-empty css_sets - is only updated when dying tasks pass through
|
||||
* cgroup_task_dead() in finish_task_switch(). This creates a window where
|
||||
* cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
|
||||
* fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
|
||||
* tasks.
|
||||
*
|
||||
* This function aligns cgroup_has_tasks() with what userspace can observe. If
|
||||
* cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
|
||||
* PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
|
||||
* window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
|
||||
*
|
||||
* This function only concerns itself with this cgroup's own dying tasks.
|
||||
* Whether the cgroup has children is cgroup_destroy_locked()'s problem.
|
||||
*
|
||||
* Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
|
||||
* retry the full check from scratch.
|
||||
*
|
||||
* Must be called with cgroup_mutex held.
|
||||
* See cgroup_destroy_locked() for the rationale.
|
||||
*/
|
||||
static int cgroup_drain_dying(struct cgroup *cgrp)
|
||||
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
|
||||
static void cgroup_finish_destroy(struct cgroup *cgrp)
|
||||
{
|
||||
struct css_task_iter it;
|
||||
struct task_struct *task;
|
||||
DEFINE_WAIT(wait);
|
||||
struct cgroup_subsys_state *css;
|
||||
int ssid;
|
||||
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
retry:
|
||||
if (!cgroup_has_tasks(cgrp))
|
||||
return 0;
|
||||
|
||||
/* Same iterator as cgroup.threads - if any task is visible, it's busy */
|
||||
css_task_iter_start(&cgrp->self, 0, &it);
|
||||
task = css_task_iter_next(&it);
|
||||
css_task_iter_end(&it);
|
||||
|
||||
if (task)
|
||||
return -EBUSY;
|
||||
|
||||
/*
|
||||
* All remaining tasks are PF_EXITING and will pass through
|
||||
* cgroup_task_dead() shortly. Wait for a kick and retry.
|
||||
*
|
||||
* cgroup_has_tasks() can't transition from false to true while we're
|
||||
* holding cgroup_mutex, but the true to false transition happens
|
||||
* under css_set_lock (via cgroup_task_dead()). We must retest and
|
||||
* prepare_to_wait() under css_set_lock. Otherwise, the transition
|
||||
* can happen between our first test and prepare_to_wait(), and we
|
||||
* sleep with no one to wake us.
|
||||
*/
|
||||
spin_lock_irq(&css_set_lock);
|
||||
if (!cgroup_has_tasks(cgrp)) {
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
return 0;
|
||||
}
|
||||
prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
schedule();
|
||||
finish_wait(&cgrp->dying_populated_waitq, &wait);
|
||||
mutex_lock(&cgroup_mutex);
|
||||
goto retry;
|
||||
for_each_css(css, ssid, cgrp)
|
||||
kill_css_finish(css);
|
||||
}
|
||||
|
||||
int cgroup_rmdir(struct kernfs_node *kn)
|
||||
|
|
@ -6264,12 +6257,9 @@ int cgroup_rmdir(struct kernfs_node *kn)
|
|||
if (!cgrp)
|
||||
return 0;
|
||||
|
||||
ret = cgroup_drain_dying(cgrp);
|
||||
if (!ret) {
|
||||
ret = cgroup_destroy_locked(cgrp);
|
||||
if (!ret)
|
||||
TRACE_CGROUP_PATH(rmdir, cgrp);
|
||||
}
|
||||
ret = cgroup_destroy_locked(cgrp);
|
||||
if (!ret)
|
||||
TRACE_CGROUP_PATH(rmdir, cgrp);
|
||||
|
||||
cgroup_kn_unlock(kn);
|
||||
return ret;
|
||||
|
|
@ -7029,7 +7019,6 @@ void cgroup_task_exit(struct task_struct *tsk)
|
|||
|
||||
static void do_cgroup_task_dead(struct task_struct *tsk)
|
||||
{
|
||||
struct cgrp_cset_link *link;
|
||||
struct css_set *cset;
|
||||
unsigned long flags;
|
||||
|
||||
|
|
@ -7043,11 +7032,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
|
|||
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
|
||||
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
|
||||
|
||||
/* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
|
||||
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
|
||||
if (waitqueue_active(&link->cgrp->dying_populated_waitq))
|
||||
wake_up(&link->cgrp->dying_populated_waitq);
|
||||
|
||||
if (dl_task(tsk))
|
||||
dec_dl_tasks_cs(tsk);
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user