rhashtable: Bounce deferred worker kick through irq_work

Inserts that push the table past 75% load call schedule_work(&ht->run_work)
to kick off an async resize. If a caller holds a raw spinlock (e.g. an
insecure_elasticity user), schedule_work() under that lock records

  caller_lock -> pool->lock -> pi_lock -> rq->__lock

A cycle forms if any of these locks is acquired in the reverse
direction elsewhere. sched_ext, the only current insecure_elasticity
user, hits this: it holds scx_sched_lock across rhashtable inserts of
sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock.
Exercising the resize path produces:

  Chain exists of:
    &pool->lock --> &rq->__lock --> scx_sched_lock
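
For illustration only, a minimal sketch of the caller pattern that records
this chain; all of the example_* names below are made up and are not taken
from the actual sched_ext code:

  #include <linux/rhashtable.h>
  #include <linux/spinlock.h>

  struct example_node {
          u64 id;
          struct rhash_head rnode;
  };

  static const struct rhashtable_params example_params = {
          .key_len = sizeof(u64),
          .key_offset = offsetof(struct example_node, id),
          .head_offset = offsetof(struct example_node, rnode),
  };

  static DEFINE_RAW_SPINLOCK(example_sched_lock);

  static int example_add_node(struct rhashtable *ht, struct example_node *n)
  {
          int ret;

          raw_spin_lock(&example_sched_lock);
          /*
           * Crossing 75% load inside the insert used to call
           * schedule_work() while example_sched_lock was still held,
           * recording the chain above.
           */
          ret = rhashtable_insert_fast(ht, &n->rnode, example_params);
          raw_spin_unlock(&example_sched_lock);
          return ret;
  }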

Bounce the kick from the insert paths through irq_work so
schedule_work() runs from hard IRQ context with the caller's lock no
longer held. rht_deferred_worker()'s self-rearm on error stays on
schedule_work(&ht->run_work) - the worker runs in process context with
no caller lock held, and keeping the self-requeue on @run_work lets
cancel_work_sync() in rhashtable_free_and_destroy() drain it.
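
Spelled out, the kick now takes two hops before the resize worker runs:

  insert path (caller's raw spinlock held)
      irq_work_queue(&ht->run_irq_work)        safe under the caller's lock
  hard IRQ: rht_deferred_irq_work()
      schedule_work(&ht->run_work)             caller's lock no longer held
  kworker: rht_deferred_worker()
      resize; on error, schedule_work(&ht->run_work) to re-arm itself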

v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work).
    Routing it through irq_work in v2 broke cancel_work_sync()'s
    self-requeue handling - an irq_work queued after irq_work_sync()
    returned but while cancel_work_sync() was still waiting could fire
    post-teardown.
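
    For reference, the v2 interleaving this avoids, as a rough timeline:

      destroyer                                   worker (v2 re-arm via irq_work)
      irq_work_sync(&ht->run_irq_work) returns    rht_deferred_worker() running, err
      cancel_work_sync(&ht->run_work) waits       irq_work_queue(&ht->run_irq_work)
                                                  worker returns
      cancel_work_sync() returns, table freed
      stale irq_work fires -> schedule_work() on a freed rhashtable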

v2: Bounce unconditionally instead of gating on insecure_elasticity,
    as suggested by Herbert.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Tejun Heo 2026-04-20 20:03:26 -10:00
parent 5897ca15d2
commit 4fe9852927
3 changed files with 33 additions and 4 deletions

--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h

@@ -12,6 +12,7 @@
 #include <linux/alloc_tag.h>
 #include <linux/atomic.h>
 #include <linux/compiler.h>
+#include <linux/irq_work_types.h>
 #include <linux/mutex.h>
 #include <linux/workqueue_types.h>
@@ -77,6 +78,7 @@ struct rhashtable_params {
  * @p: Configuration parameters
  * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
+ * @run_irq_work: Bounces the @run_work kick through hard IRQ context.
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
  * @nelems: Number of elements in table
@@ -88,6 +90,7 @@ struct rhashtable {
         struct rhashtable_params p;
         bool rhlist;
         struct work_struct run_work;
+        struct irq_work run_irq_work;
         struct mutex mutex;
         spinlock_t lock;
         atomic_t nelems;

--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h

@@ -20,6 +20,7 @@
 #include <linux/err.h>
 #include <linux/errno.h>
+#include <linux/irq_work.h>
 #include <linux/jhash.h>
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
@@ -847,7 +848,7 @@ static __always_inline void *__rhashtable_insert_fast(
         rht_assign_unlock(tbl, bkt, obj, flags);
         if (rht_grow_above_75(ht, tbl))
-                schedule_work(&ht->run_work);
+                irq_work_queue(&ht->run_irq_work);
         data = NULL;
 out:
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c

@@ -441,10 +441,33 @@ static void rht_deferred_worker(struct work_struct *work)
         mutex_unlock(&ht->mutex);
+        /*
+         * Re-arm via @run_work, not @run_irq_work.
+         * rhashtable_free_and_destroy() drains async work as irq_work_sync()
+         * followed by cancel_work_sync(). If this site queued irq_work while
+         * cancel_work_sync() was waiting for us, irq_work_sync() would already
+         * have returned and the stale irq_work could fire post-teardown.
+         * cancel_work_sync() natively handles self-requeue on @run_work.
+         */
         if (err)
                 schedule_work(&ht->run_work);
 }
+
+/*
+ * Insert-path callers can run under a raw spinlock (e.g. an insecure_elasticity
+ * user). Calling schedule_work() under that lock records caller_lock ->
+ * pool->lock -> pi_lock -> rq->__lock, closing a locking cycle if any of
+ * these is acquired in the reverse direction elsewhere. Bounce through
+ * irq_work so the schedule_work() runs with the caller's lock no longer held.
+ */
+static void rht_deferred_irq_work(struct irq_work *irq_work)
+{
+        struct rhashtable *ht = container_of(irq_work, struct rhashtable,
+                                             run_irq_work);
+        schedule_work(&ht->run_work);
+}
+
 static int rhashtable_insert_rehash(struct rhashtable *ht,
                                     struct bucket_table *tbl)
 {
@@ -477,7 +500,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
                 if (err == -EEXIST)
                         err = 0;
         } else
-                schedule_work(&ht->run_work);
+                irq_work_queue(&ht->run_irq_work);
         return err;
@@ -488,7 +511,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
         /* Schedule async rehash to retry allocation in process context. */
         if (err == -ENOMEM)
-                schedule_work(&ht->run_work);
+                irq_work_queue(&ht->run_irq_work);
         return err;
 }
@@ -630,7 +653,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
                         rht_unlock(tbl, bkt, flags);
                         if (inserted && rht_grow_above_75(ht, tbl))
-                                schedule_work(&ht->run_work);
+                                irq_work_queue(&ht->run_irq_work);
                 }
         } while (!IS_ERR_OR_NULL(new_tbl));
@@ -1085,6 +1108,7 @@ int rhashtable_init_noprof(struct rhashtable *ht,
         RCU_INIT_POINTER(ht->tbl, tbl);
         INIT_WORK(&ht->run_work, rht_deferred_worker);
+        init_irq_work(&ht->run_irq_work, rht_deferred_irq_work);
         return 0;
 }
@@ -1150,6 +1174,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
         struct bucket_table *tbl, *next_tbl;
         unsigned int i;
+        irq_work_sync(&ht->run_irq_work);
         cancel_work_sync(&ht->run_work);
         mutex_lock(&ht->mutex);