cgroup: Fixes for v7.1-rc2

- During v6.19, cgroup task unlink was moved from do_exit() to after the
   final task switch to satisfy a controller invariant. That left the kernel
   seeing tasks past exit_signals() longer than userspace expected, and
   several v7.0 follow-ups tried to bridge the gap by making rmdir wait for
   the kernel side. None held up. The latest is an A-A deadlock when rmdir
   is invoked by the reaper of zombies whose pidns teardown the rmdir itself
   is waiting on, which points at the synchronizing approach being
   fundamentally wrong:
 
   - Take a different approach: drop the wait, leave rmdir's user-visible
     side returning as soon as cgroup.procs is empty, and defer the css
     percpu_ref kill that drives ->css_offline() until the cgroup is fully
     depopulated.
 
   - Tagged for stable. Somewhat invasive but contained. The hope is that
     fixing forward sticks. If not, the fallback is to revert the entire
     chain and rework on the development branch.
 
   - Doesn't plug a pre-existing analogous race in
     cgroup_apply_control_disable() (controller disable via subtree_control).
     Not a regression. The development branch will do the more invasive
     restructuring needed for that.
 
 - Documentation update for cgroup-v1 charge-commit section that still
   referenced functions removed when the memcg hugetlb try-commit-cancel
   protocol was retired.
 -----BEGIN PGP SIGNATURE-----
 
 iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCafphbw4cdGpAa2VybmVs
 Lm9yZwAKCRCxYfJx3gVYGbydAQDxlEIeJPdJlwbU6X4PBW/7DYeDHABG7OdrFf5K
 Fq4ECAD/ZHsFyCNEOcZym6t2/FCZR0xbaPGQibLt3er6AkLRFwM=
 =3Jra
 -----END PGP SIGNATURE-----

Merge tag 'cgroup-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

 - During v6.19, cgroup task unlink was moved from do_exit() to after the
   final task switch to satisfy a controller invariant. That left the kernel
   seeing tasks past exit_signals() longer than userspace expected, and
   several v7.0 follow-ups tried to bridge the gap by making rmdir wait for
   the kernel side. None held up.

   The latest is an A-A deadlock when rmdir is invoked by the reaper of
   zombies whose pidns teardown the rmdir itself is waiting on, which
   points at the synchronizing approach being fundamentally wrong. A
   rough userspace sketch of this ordering follows the list below.

   Take a different approach: drop the wait, leave rmdir's user-visible
   side returning as soon as cgroup.procs is empty, and defer the css
   percpu_ref kill that drives ->css_offline() until the cgroup is fully
   depopulated. The pattern is sketched in code just before the diff.

   Tagged for stable. Somewhat invasive but contained. The hope is that
   fixing forward sticks. If not, the fallback is to revert the entire
   chain and rework on the development branch.

   Note that this doesn't plug a pre-existing analogous race in
   cgroup_apply_control_disable() (controller disable via
   subtree_control). Not a regression. The development branch will do
   the more invasive restructuring needed for that.

 - Documentation update for cgroup-v1 charge-commit section that still
   referenced functions removed when the memcg hugetlb try-commit-cancel
   protocol was retired.
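
To make the deadlock shape concrete, here is a minimal userspace sketch of
the problematic ordering - a reaper calling rmdir on a cgroup that still
holds its unreaped zombies. The cgroup path and the one-second settle are
illustrative assumptions, and the hang only materializes on kernels where
rmdir waits for the kernel-side unlink; treat it as a sketch, not a
confirmed reproducer:

  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <sys/stat.h>
  #include <sys/wait.h>
  #include <unistd.h>

  #define CG "/sys/fs/cgroup/test"	/* assumed cgroup2 mount point */

  int main(void)
  {
  	pid_t pid;
  	int fd;

  	if (mkdir(CG, 0755) && errno != EEXIST)
  		return 1;

  	pid = fork();
  	if (!pid) {
  		/* child: enter the cgroup, then turn into a zombie */
  		fd = open(CG "/cgroup.procs", O_WRONLY);
  		dprintf(fd, "%d\n", getpid());
  		close(fd);
  		_exit(0);
  	}

  	sleep(1);	/* child has exited: invisible in cgroup.procs */

  	/*
  	 * A waiting rmdir can block here until the zombie is fully
  	 * unlinked on the kernel side. If that unlink in turn needs
  	 * this process to reap (the pidns-init case), neither side
  	 * can make progress.
  	 */
  	if (rmdir(CG))
  		perror("rmdir");

  	wait(NULL);	/* the reap that rmdir was implicitly waiting for */
  	return 0;
  }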

* tag 'cgroup-for-7.1-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: Update charge-commit section
  cgroup: Defer css percpu_ref kill on rmdir until cgroup is depopulated
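
The heart of the fix is the deferral pattern visible in the diff below: tear
down the user-visible side immediately, count the hidden stragglers, and run
the final teardown exactly once from whichever side finishes last. A
compressed userspace analogue - the names (struct obj, obj_rmdir(),
obj_task_gone(), finalize()) are made up for illustration, not kernel API:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  struct obj {
  	pthread_mutex_t lock;
  	int nr_hidden;	/* tasks past exit_signals(), still running */
  	bool removed;	/* user-visible side already torn down */
  	bool finalized;
  };

  static void finalize(struct obj *o)	/* plays kill_css_finish() */
  {
  	if (o->finalized)	/* one-shot, cf. the re-entry check */
  		return;
  	o->finalized = true;
  	printf("deferred teardown runs now\n");
  }

  /* rmdir path: return to userspace as soon as the object looks empty */
  static void obj_rmdir(struct obj *o)
  {
  	pthread_mutex_lock(&o->lock);
  	o->removed = true;	/* name is gone; rmdir succeeds here */
  	if (o->nr_hidden == 0)
  		finalize(o);	/* nothing hidden: finish inline */
  	pthread_mutex_unlock(&o->lock);
  }

  /* called as each hidden task leaves, cf. cgroup_update_populated() */
  static void obj_task_gone(struct obj *o)
  {
  	pthread_mutex_lock(&o->lock);
  	if (--o->nr_hidden == 0 && o->removed)
  		finalize(o);	/* drained after removal: fire deferred */
  	pthread_mutex_unlock(&o->lock);
  }

  int main(void)
  {
  	struct obj o = { .lock = PTHREAD_MUTEX_INITIALIZER, .nr_hidden = 1 };

  	obj_rmdir(&o);		/* returns immediately, teardown deferred */
  	obj_task_gone(&o);	/* last straggler gone: finalize() fires */
  	return 0;
  }

The kernel side has to bounce the deferred half through a workqueue because
the last task leaves from the final-context-switch path, where cgroup_mutex
cannot be taken.
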
@@ -47,21 +47,19 @@ Please note that implementation details can be changed.
 	Called when swp_entry's refcnt goes down to 0. A charge against swap
 	disappears.
 
-3. charge-commit-cancel
+3. charge-commit
 =======================
 
 	Memcg pages are charged in two steps:
 
 	- mem_cgroup_try_charge()
-	- mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
+	- commit_charge()
 
 	At try_charge(), there are no flags to say "this page is charged".
 	at this point, usage += PAGE_SIZE.
 
 	At commit(), the page is associated with the memcg.
 
-	At cancel(), simply usage -= PAGE_SIZE.
-
 Under below explanation, we assume CONFIG_SWAP=y.
 
 4. Anonymous

@@ -611,8 +611,8 @@ struct cgroup {
 	/* used to wait for offlining of csses */
 	wait_queue_head_t offline_waitq;
 
-	/* used by cgroup_rmdir() to wait for dying tasks to leave */
-	wait_queue_head_t dying_populated_waitq;
+	/* defers killing csses after removal until cgroup is depopulated */
+	struct work_struct finish_destroy_work;
 
 	/* used to schedule release agent */
 	struct work_struct release_agent_work;

@@ -264,10 +264,12 @@ static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
 			       struct task_struct *task);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
+static void cgroup_finish_destroy(struct cgroup *cgrp);
+static void kill_css_sync(struct cgroup_subsys_state *css);
+static void kill_css_finish(struct cgroup_subsys_state *css);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss);
 static void css_release(struct percpu_ref *ref);
-static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
@@ -797,6 +799,16 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (was_populated == cgroup_is_populated(cgrp))
 			break;
 
+		/*
+		 * Subtree just emptied below an offlined cgrp. Fire deferred
+		 * destroy. The transition is one-shot.
+		 */
+		if (was_populated && !css_is_online(&cgrp->self)) {
+			cgroup_get(cgrp);
+			WARN_ON_ONCE(!queue_work(cgroup_offline_wq,
+						 &cgrp->finish_destroy_work));
+		}
+
 		cgroup1_check_for_release(cgrp);
 		TRACE_CGROUP_PATH(notify_populated, cgrp,
 				  cgroup_is_populated(cgrp));
@@ -2039,6 +2051,16 @@ static int cgroup_reconfigure(struct fs_context *fc)
 	return 0;
 }
 
+static void cgroup_finish_destroy_work_fn(struct work_struct *work)
+{
+	struct cgroup *cgrp = container_of(work, struct cgroup, finish_destroy_work);
+
+	cgroup_lock();
+	cgroup_finish_destroy(cgrp);
+	cgroup_unlock();
+	cgroup_put(cgrp);
+}
+
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
@@ -2065,7 +2087,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 #endif
 
 	init_waitqueue_head(&cgrp->offline_waitq);
-	init_waitqueue_head(&cgrp->dying_populated_waitq);
+	INIT_WORK(&cgrp->finish_destroy_work, cgroup_finish_destroy_work_fn);
 	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
 }
 
@@ -3375,7 +3397,8 @@ static void cgroup_apply_control_disable(struct cgroup *cgrp)
 		if (css->parent &&
 		    !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
-			kill_css(css);
+			kill_css_sync(css);
+			kill_css_finish(css);
 		} else if (!css_visible(css)) {
 			css_clear_dir(css);
 			if (ss->css_reset)
@@ -5516,7 +5539,7 @@ static struct cftype cgroup_psi_files[] = {
  * css destruction is four-stage process.
  *
  * 1. Destruction starts. Killing of the percpu_ref is initiated.
- *    Implemented in kill_css().
+ *    Implemented in kill_css_finish().
  *
  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
  *    and thus css_tryget_online() is guaranteed to fail, the css can be
@@ -5995,7 +6018,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
 /*
  * This is called when the refcnt of a css is confirmed to be killed.
  * css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initiate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css_finish().
  */
 static void css_killed_work_fn(struct work_struct *work)
 {
@@ -6028,15 +6051,12 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 }
 
 /**
- * kill_css - destroy a css
- * @css: css to destroy
+ * kill_css_sync - synchronous half of css teardown
+ * @css: css being killed
  *
- * This function initiates destruction of @css by removing cgroup interface
- * files and putting its base reference. ->css_offline() will be invoked
- * asynchronously once css_tryget_online() is guaranteed to fail and when
- * the reference count reaches zero, @css will be released.
+ * See cgroup_destroy_locked().
  */
-static void kill_css(struct cgroup_subsys_state *css)
+static void kill_css_sync(struct cgroup_subsys_state *css)
 {
 	struct cgroup_subsys *ss = css->ss;
 
@@ -6059,24 +6079,6 @@ static void kill_css(struct cgroup_subsys_state *css)
 	 */
 	css_clear_dir(css);
 
-	/*
-	 * Killing would put the base ref, but we need to keep it alive
-	 * until after ->css_offline().
-	 */
-	css_get(css);
-
-	/*
-	 * cgroup core guarantees that, by the time ->css_offline() is
-	 * invoked, no new css reference will be given out via
-	 * css_tryget_online(). We can't simply call percpu_ref_kill() and
-	 * proceed to offlining css's because percpu_ref_kill() doesn't
-	 * guarantee that the ref is seen as killed on all CPUs on return.
-	 *
-	 * Use percpu_ref_kill_and_confirm() to get notifications as each
-	 * css is confirmed to be seen as killed on all CPUs.
-	 */
-	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
-
 	css->cgroup->nr_dying_subsys[ss->id]++;
 	/*
 	 * Parent css and cgroup cannot be freed until after the freeing
@@ -6089,44 +6091,88 @@ static void kill_css(struct cgroup_subsys_state *css)
 }
 
 /**
- * cgroup_destroy_locked - the first stage of cgroup destruction
+ * kill_css_finish - deferred half of css teardown
+ * @css: css being killed
+ *
+ * See cgroup_destroy_locked().
+ */
+static void kill_css_finish(struct cgroup_subsys_state *css)
+{
+	lockdep_assert_held(&cgroup_mutex);
+
+	/*
+	 * Skip on re-entry: cgroup_apply_control_disable() may have killed @css
+	 * earlier. cgroup_destroy_locked() can still walk it because
+	 * offline_css() (which NULLs cgrp->subsys[ssid]) runs async.
+	 */
+	if (percpu_ref_is_dying(&css->refcnt))
+		return;
+
+	/*
+	 * Killing would put the base ref, but we need to keep it alive until
+	 * after ->css_offline().
+	 */
+	css_get(css);
+
+	/*
+	 * cgroup core guarantees that, by the time ->css_offline() is invoked,
+	 * no new css reference will be given out via css_tryget_online(). We
+	 * can't simply call percpu_ref_kill() and proceed to offlining css's
+	 * because percpu_ref_kill() doesn't guarantee that the ref is seen as
+	 * killed on all CPUs on return.
+	 *
+	 * Use percpu_ref_kill_and_confirm() to get notifications as each css is
+	 * confirmed to be seen as killed on all CPUs.
+	 */
+	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
+}
+
+/**
+ * cgroup_destroy_locked - destroy @cgrp (called on rmdir)
  * @cgrp: cgroup to be destroyed
  *
- * css's make use of percpu refcnts whose killing latency shouldn't be
- * exposed to userland and are RCU protected. Also, cgroup core needs to
- * guarantee that css_tryget_online() won't succeed by the time
- * ->css_offline() is invoked. To satisfy all the requirements,
- * destruction is implemented in the following two steps.
+ * Tear down @cgrp on behalf of rmdir. Constraints:
  *
- * s1. Verify @cgrp can be destroyed and mark it dying. Remove all
- *     userland visible parts and start killing the percpu refcnts of
- *     css's. Set up so that the next stage will be kicked off once all
- *     the percpu refcnts are confirmed to be killed.
+ * - Userspace: rmdir must succeed when cgroup.procs and friends are empty.
  *
- * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
- *     rest of destruction. Once all cgroup references are gone, the
- *     cgroup is RCU-freed.
+ * - Kernel: subsystem ->css_offline() must not run while any task in @cgrp's
+ *   subtree is still doing kernel work. A task hidden from cgroup.procs (past
+ *   exit_signals() with signal->live cleared) can still schedule, allocate, and
+ *   consume resources until its final context switch. Dying descendants in the
+ *   subtree can host such tasks too.
  *
- * This function implements s1. After this step, @cgrp is gone as far as
- * the userland is concerned and a new cgroup with the same name may be
- * created. As cgroup doesn't care about the names internally, this
- * doesn't cause any problem.
+ * - Kernel: css_tryget_online() must fail by the time ->css_offline() runs.
+ *
+ * The destruction runs in three parts:
+ *
+ * - This function: synchronous user-visible state teardown plus kill_css_sync()
+ *   on each subsystem css.
+ *
+ * - cgroup_finish_destroy(): kicks the percpu_ref kill via kill_css_finish() on
+ *   each subsystem css. Fires once @cgrp's subtree is fully drained, either
+ *   inline here or from cgroup_update_populated().
+ *
+ * - The percpu_ref kill chain: css_killed_ref_fn -> css_killed_work_fn ->
+ *   ->css_offline() -> release/free.
+ *
+ * Return 0 on success, -EBUSY if a userspace-visible task or an online child
+ * remains.
  */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
-	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
 	struct cgroup_subsys_state *css;
 	struct cgrp_cset_link *link;
+	struct css_task_iter it;
+	struct task_struct *task;
 	int ssid, ret;
 
 	lockdep_assert_held(&cgroup_mutex);
 
 	/*
 	 * Only migration can raise populated from zero and we're already
 	 * holding cgroup_mutex.
 	 */
-	if (cgroup_is_populated(cgrp))
+	css_task_iter_start(&cgrp->self, 0, &it);
+	task = css_task_iter_next(&it);
+	css_task_iter_end(&it);
+	if (task)
 		return -EBUSY;
 
 	/*
@@ -6150,9 +6196,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 		link->cset->dead = true;
 	spin_unlock_irq(&css_set_lock);
 
-	/* initiate massacre of all css's */
 	for_each_css(css, ssid, cgrp)
-		kill_css(css);
+		kill_css_sync(css);
 
 	/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
 	css_clear_dir(&cgrp->self);
@@ -6183,79 +6228,27 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	/* put the base reference */
 	percpu_ref_kill(&cgrp->self.refcnt);
 
+	if (!cgroup_is_populated(cgrp))
+		cgroup_finish_destroy(cgrp);
+
 	return 0;
 };
 
 /**
- * cgroup_drain_dying - wait for dying tasks to leave before rmdir
- * @cgrp: the cgroup being removed
+ * cgroup_finish_destroy - deferred half of @cgrp destruction
+ * @cgrp: cgroup whose subtree just became empty
  *
- * cgroup.procs and cgroup.threads use css_task_iter which filters out
- * PF_EXITING tasks so that userspace doesn't see tasks that have already been
- * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the
- * cgroup has non-empty css_sets - is only updated when dying tasks pass through
- * cgroup_task_dead() in finish_task_switch(). This creates a window where
- * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir
- * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no
- * tasks.
- *
- * This function aligns cgroup_has_tasks() with what userspace can observe. If
- * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are
- * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the
- * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief.
- *
- * This function only concerns itself with this cgroup's own dying tasks.
- * Whether the cgroup has children is cgroup_destroy_locked()'s problem.
- *
- * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
- * retry the full check from scratch.
- *
- * Must be called with cgroup_mutex held.
+ * See cgroup_destroy_locked() for the rationale.
  */
-static int cgroup_drain_dying(struct cgroup *cgrp)
-	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
+static void cgroup_finish_destroy(struct cgroup *cgrp)
 {
-	struct css_task_iter it;
-	struct task_struct *task;
-	DEFINE_WAIT(wait);
+	struct cgroup_subsys_state *css;
+	int ssid;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-retry:
-	if (!cgroup_has_tasks(cgrp))
-		return 0;
-
-	/* Same iterator as cgroup.threads - if any task is visible, it's busy */
-	css_task_iter_start(&cgrp->self, 0, &it);
-	task = css_task_iter_next(&it);
-	css_task_iter_end(&it);
-	if (task)
-		return -EBUSY;
-
-	/*
-	 * All remaining tasks are PF_EXITING and will pass through
-	 * cgroup_task_dead() shortly. Wait for a kick and retry.
-	 *
-	 * cgroup_has_tasks() can't transition from false to true while we're
-	 * holding cgroup_mutex, but the true to false transition happens
-	 * under css_set_lock (via cgroup_task_dead()). We must retest and
-	 * prepare_to_wait() under css_set_lock. Otherwise, the transition
-	 * can happen between our first test and prepare_to_wait(), and we
-	 * sleep with no one to wake us.
-	 */
-	spin_lock_irq(&css_set_lock);
-	if (!cgroup_has_tasks(cgrp)) {
-		spin_unlock_irq(&css_set_lock);
-		return 0;
-	}
-	prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
-			TASK_UNINTERRUPTIBLE);
-	spin_unlock_irq(&css_set_lock);
-
-	mutex_unlock(&cgroup_mutex);
-	schedule();
-	finish_wait(&cgrp->dying_populated_waitq, &wait);
-	mutex_lock(&cgroup_mutex);
-	goto retry;
+	for_each_css(css, ssid, cgrp)
+		kill_css_finish(css);
 }
 
 int cgroup_rmdir(struct kernfs_node *kn)
@@ -6267,12 +6260,9 @@ int cgroup_rmdir(struct kernfs_node *kn)
 	if (!cgrp)
 		return 0;
 
-	ret = cgroup_drain_dying(cgrp);
-	if (!ret) {
-		ret = cgroup_destroy_locked(cgrp);
-		if (!ret)
-			TRACE_CGROUP_PATH(rmdir, cgrp);
-	}
+	ret = cgroup_destroy_locked(cgrp);
+	if (!ret)
+		TRACE_CGROUP_PATH(rmdir, cgrp);
 
 	cgroup_kn_unlock(kn);
 	return ret;
@@ -7032,7 +7022,6 @@ void cgroup_task_exit(struct task_struct *tsk)
 static void do_cgroup_task_dead(struct task_struct *tsk)
 {
-	struct cgrp_cset_link *link;
 	struct css_set *cset;
 	unsigned long flags;
 
@@ -7046,11 +7035,6 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
 		list_add_tail(&tsk->cg_list, &cset->dying_tasks);
 
-	/* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
-	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
-		if (waitqueue_active(&link->cgrp->dying_populated_waitq))
-			wake_up(&link->cgrp->dying_populated_waitq);
-
 	if (dl_task(tsk))
 		dec_dl_tasks_cs(tsk);