From afbd961305eb483515650ccfcb7743608e7add78 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:13 +0300 Subject: [PATCH 1/8] ipvs: fixes for the new ip_vs_status info Sashiko reports some problems for the recently added /proc/net/ip_vs_status: * ip_vs_status_show() as a table reader may run long after the conn_tab and svc_table table are released. While ip_vs_conn_flush() properly changes the conn_tab_changes counter when conn_tab is removed, ip_vs_del_service() and ip_vs_flush() were missing such change for the svc_table_changes counter. As result, readers like ip_vs_dst_event() and ip_vs_status_show() may continue to use a freed table after a cond_resched_rcu() call. * While counting the buckets in ip_vs_status_show() make sure we traverse only the needed number of entries in the chain. This also prevents possible overflow of the 'count' variable. * Add check for 'loops' to prevent infinite loops while restarting the traversal on table change. * While IP_VS_CONN_TAB_MAX_BITS is 20 on 32-bit platforms and there is no risk to overflow when multiplying the number of conn_tab buckets to 100, prefer the div_u64() helper to make the following dividing safer. * Use 0440 permissions for ip_vs_status to restrict the info only to root due to the exported information for hash distribution. Link: https://sashiko.dev/#/patchset/20260410112352.23599-1-fw%40strlen.de Fixes: 9a9ccef907a7 ("ipvs: add ip_vs_status info") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 51 ++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6632daa87ded..27e50afe9a54 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2032,6 +2032,9 @@ static int ip_vs_del_service(struct ip_vs_service *svc) cancel_delayed_work_sync(&ipvs->svc_resize_work); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -2078,6 +2081,9 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = rcu_dereference_protected(ipvs->svc_table, 1); if (t) { rcu_assign_pointer(ipvs->svc_table, NULL); + /* Inform readers that table is removed */ + smp_mb__before_atomic(); + atomic_inc(&ipvs->svc_table_changes); while (1) { p = rcu_dereference_protected(t->new_tbl, 1); call_rcu(&t->rcu_head, ip_vs_rht_rcu_free); @@ -3004,7 +3010,8 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) int old_gen, new_gen; u32 counts[8]; u32 bucket; - int count; + u32 count; + int loops; u32 sum1; u32 sum; int i; @@ -3020,6 +3027,7 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) if (!atomic_read(&ipvs->conn_count)) goto after_conns; old_gen = atomic_read(&ipvs->conn_tab_changes); + loops = 0; repeat_conn: smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */ @@ -3032,8 +3040,11 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) resched_score++; ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; - hlist_bl_for_each_entry_rcu(hn, e, head, node) + hlist_bl_for_each_entry_rcu(hn, e, head, node) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { @@ -3042,37 +3053,41 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) new_gen = atomic_read(&ipvs->conn_tab_changes); /* New table installed ? */ if (old_gen != new_gen) { + /* Too many changes? */ + if (++loops >= 5) + goto after_conns; old_gen = new_gen; goto repeat_conn; } } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_conns: t = rcu_dereference(ipvs->svc_table); count = ip_vs_get_num_services(ipvs); - seq_printf(seq, "Services:\t%d\n", count); + seq_printf(seq, "Services:\t%u\n", count); seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n", t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0); if (!count) goto after_svc; old_gen = atomic_read(&ipvs->svc_table_changes); + loops = 0; repeat_svc: smp_rmb(); /* ipvs->svc_table and svc_table_changes */ @@ -3086,8 +3101,11 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) ip_vs_rht_walk_bucket_rcu(t, bucket, head) { count = 0; hlist_bl_for_each_entry_rcu(svc, e, head, - s_list) + s_list) { count++; + if (count >= ARRAY_SIZE(counts) - 1) + break; + } } resched_score += count; if (resched_score >= 100) { @@ -3096,24 +3114,27 @@ static int ip_vs_status_show(struct seq_file *seq, void *v) new_gen = atomic_read(&ipvs->svc_table_changes); /* New table installed ? */ if (old_gen != new_gen) { + /* Too many changes? */ + if (++loops >= 5) + goto after_svc; old_gen = new_gen; goto repeat_svc; } } - counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++; + counts[count]++; } } for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++) sum += counts[i]; sum1 = sum - counts[0]; - seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n", - counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U)); + seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n", + counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U))); for (i = 1; i < ARRAY_SIZE(counts); i++) { if (!counts[i]) continue; - seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n", + seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n", i, counts[i], - (unsigned long)counts[i] * 100 / max(sum1, 1U)); + div_u64((u64)counts[i] * 100U, max(sum1, 1U))); } after_svc: @@ -5039,7 +5060,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->net->proc_net, ip_vs_stats_percpu_show, NULL)) goto err_percpu; - if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net, + if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net, ip_vs_status_show, NULL)) goto err_status; #endif From f2da9a96abb4b7a64626e931cedd85f05d5498ca Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:14 +0300 Subject: [PATCH 2/8] ipvs: fix races around the conn_lfactor and svc_lfactor sysctl vars Sashiko warns that the new sysctls vars can be changed after the hash tables are destroyed and their respective resizing works canceled, leading to mod_delayed_work() being called for canceled works. Solve this in different ways. conn_tab can be present even without services and is destroyed only on netns exit, so use disable_delayed_work_sync() to disable the work instead of adding more synchronization mechanisms. As for the svc_table, it is destroyed when the services are deleted, so we must be sure that netns exit is not called yet (the check for 'enable') and the work is not canceled by checking all under same mutex lock. Also, use WRITE_ONCE when updating the sysctl vars as we already read them with READ_ONCE. Link: https://sashiko.dev/#/patchset/20260410112352.23599-1-fw%40strlen.de Fixes: 8d7de5477e47 ("ipvs: add conn_lfactor and svc_lfactor sysctl vars") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 2 +- net/netfilter/ipvs/ip_vs_ctl.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 2082bfb2d93c..84a4921a7865 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1835,7 +1835,7 @@ static void ip_vs_conn_flush(struct netns_ipvs *ipvs) if (!rcu_dereference_protected(ipvs->conn_tab, 1)) return; - cancel_delayed_work_sync(&ipvs->conn_resize_work); + disable_delayed_work_sync(&ipvs->conn_resize_work); if (!atomic_read(&ipvs->conn_count)) goto unreg; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 27e50afe9a54..caec516856e9 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2469,7 +2469,7 @@ static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; + WRITE_ONCE(*valp, val); if (rcu_access_pointer(ipvs->conn_tab)) mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0); @@ -2496,10 +2496,16 @@ static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write, if (val < -8 || val > 8) { ret = -EINVAL; } else { - *valp = val; - if (rcu_access_pointer(ipvs->svc_table)) + mutex_lock(&ipvs->service_mutex); + WRITE_ONCE(*valp, val); + /* Make sure the services are present */ + if (rcu_access_pointer(ipvs->svc_table) && + READ_ONCE(ipvs->enable) && + !test_bit(IP_VS_WORK_SVC_NORESIZE, + &ipvs->work_flags)) mod_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 0); + mutex_unlock(&ipvs->service_mutex); } } return ret; From d493d9de1c21313cf62be0f6e1a4d48385fa7beb Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:15 +0300 Subject: [PATCH 3/8] ipvs: fix the spin_lock usage for RT build syzbot reports for sleeping function called from invalid context [1]. The recently added code for resizable hash tables uses hlist_bl bit locks in combination with spin_lock for the connection fields (cp->lock). Fix the following problems: * avoid using spin_lock(&cp->lock) under locked bit lock because it sleeps on PREEMPT_RT * as the recent changes call ip_vs_conn_hash() only for newly allocated connection, the spin_lock can be removed there because the connection is still not linked to table and does not need cp->lock protection. * the lock can be removed also from ip_vs_conn_unlink() where we are the last connection user. * the last place that is fixed is ip_vs_conn_fill_cport() where now the cp->lock is locked before the other locks to ensure other packets do not modify the cp->flags in non-atomic way. Here we make sure cport and flags are changed only once if two or more packets race to fill the cport. Also, we fill cport early, so that if we race with resizing there will be valid cport key for the hashing. Add a warning if too many hash table changes occur for our RCU read-side critical section which is error condition but minor because the connection still can expire gracefully. Still, restore the cport to 0 to allow retransmitted packet to properly fill the cport. Problems reported by Sashiko. [1]: BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 16, name: ktimers/0 preempt_count: 2, expected: 0 RCU nest depth: 3, expected: 3 8 locks held by ktimers/0/16: #0: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #1: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: timer_base_lock_expiry kernel/time/timer.c:1502 [inline] #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: __run_timer_base+0x120/0x9f0 kernel/time/timer.c:2384 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __rt_spin_lock kernel/locking/spinlock_rt.c:50 [inline] #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x1e0/0x400 kernel/locking/spinlock_rt.c:57 #4: ffffc90000157a80 ((&cp->timer)){+...}-{0:0}, at: call_timer_fn+0xd4/0x5e0 kernel/time/timer.c:1745 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:315 [inline] #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_expire+0x257/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 #6: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline] #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 Preemption disabled at: [] bit_spin_lock include/linux/bit_spinlock.h:38 [inline] [] hlist_bl_lock+0x18/0x110 include/linux/list_bl.h:149 CPU: 0 UID: 0 PID: 16 Comm: ktimers/0 Tainted: G W L syzkaller #0 PREEMPT_{RT,(full)} Tainted: [W]=WARN, [L]=SOFTLOCKUP Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/18/2026 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 __might_resched+0x329/0x480 kernel/sched/core.c:9162 __rt_spin_lock kernel/locking/spinlock_rt.c:48 [inline] rt_spin_lock+0xc2/0x400 kernel/locking/spinlock_rt.c:57 spin_lock include/linux/spinlock_rt.h:45 [inline] ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline] ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260 call_timer_fn+0x192/0x5e0 kernel/time/timer.c:1748 expire_timers kernel/time/timer.c:1799 [inline] __run_timers kernel/time/timer.c:2374 [inline] __run_timer_base+0x6a3/0x9f0 kernel/time/timer.c:2386 run_timer_base kernel/time/timer.c:2395 [inline] run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2405 handle_softirqs+0x1de/0x6d0 kernel/softirq.c:622 __do_softirq kernel/softirq.c:656 [inline] run_ktimerd+0x69/0x100 kernel/softirq.c:1151 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Reported-by: syzbot+504e778ddaecd36fdd17@syzkaller.appspotmail.com Link: https://sashiko.dev/#/patchset/20260415200216.79699-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260420165539.85174-4-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260422135823.50489-4-ja%40ssi.bg Fixes: 2fa7cc9c7025 ("ipvs: switch to per-net connection table") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 72 ++++++++++++++++++--------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 84a4921a7865..9ea6b4fa78bf 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -267,27 +267,20 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) hash_key2 = hash_key; use2 = false; } + conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */, &head, &head2); - spin_lock(&cp->lock); - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - cp->flags |= IP_VS_CONN_F_HASHED; - WRITE_ONCE(cp->hn0.hash_key, hash_key); - WRITE_ONCE(cp->hn1.hash_key, hash_key2); - refcount_inc(&cp->refcnt); - hlist_bl_add_head_rcu(&cp->hn0.node, head); - if (use2) - hlist_bl_add_head_rcu(&cp->hn1.node, head2); - ret = 1; - } else { - pr_err("%s(): request for already hashed, called from %pS\n", - __func__, __builtin_return_address(0)); - ret = 0; - } + cp->flags |= IP_VS_CONN_F_HASHED; + WRITE_ONCE(cp->hn0.hash_key, hash_key); + WRITE_ONCE(cp->hn1.hash_key, hash_key2); + refcount_inc(&cp->refcnt); + hlist_bl_add_head_rcu(&cp->hn0.node, head); + if (use2) + hlist_bl_add_head_rcu(&cp->hn1.node, head2); - spin_unlock(&cp->lock); conn_tab_unlock(head, head2); + ret = 1; /* Schedule resizing if load increases */ if (atomic_read(&ipvs->conn_count) > t->u_thresh && @@ -321,7 +314,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */, &head, &head2); - spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { /* Decrease refcnt and unlink conn only if we are last user */ @@ -334,7 +326,6 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) } } - spin_unlock(&cp->lock); conn_tab_unlock(head, head2); rcu_read_unlock(); @@ -637,6 +628,7 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) struct ip_vs_conn_hnode *hn; u32 hash_key, hash_key_new; struct ip_vs_conn_param p; + bool by_me = false; int ntbl; int dir; @@ -664,8 +656,16 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) t = rcu_dereference(t->new_tbl); ntbl++; /* We are lost? */ - if (ntbl >= 2) + if (ntbl >= 2) { + spin_lock_bh(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me) + cp->cport = 0; + /* hn1 will be rehashed on next packet */ + spin_unlock_bh(&cp->lock); + IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n", + __func__, dir); return; + } } /* Rehashing during resize? Use the recent table for adds */ @@ -683,10 +683,13 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) if (head > head2 && t == t2) swap(head, head2); + /* Protect the cp->flags modification */ + spin_lock_bh(&cp->lock); + /* Lock seqcount only for the old bucket, even if we are on new table * because it affects the del operation, not the adding. */ - spin_lock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_lock(&t->lock[hash_key & t->lock_mask].l); preempt_disable_nested(); write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]); @@ -704,14 +707,23 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + spin_unlock_bh(&cp->lock); hash_key = hash_key_new; goto retry; } - spin_lock(&cp->lock); - if ((cp->flags & IP_VS_CONN_F_NO_CPORT) && - (cp->flags & IP_VS_CONN_F_HASHED)) { + /* Fill cport once, even if multiple packets try to do it */ + if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) { + /* If we race with resizing make sure cport is set for dir 1 */ + if (!cp->cport) { + cp->cport = cport; + by_me = true; + } + if (!dir) { + atomic_dec(&ipvs->no_cport_conns[af_id]); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + } /* We do not recalc hash_key_r under lock, we assume the * parameters in cp do not change, i.e. cport is * the only possible change. @@ -726,21 +738,17 @@ void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) hlist_bl_del_rcu(&hn->node); hlist_bl_add_head_rcu(&hn->node, head_new); } - if (!dir) { - atomic_dec(&ipvs->no_cport_conns[af_id]); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } } - spin_unlock(&cp->lock); if (head != head2) hlist_bl_unlock(head2); hlist_bl_unlock(head); write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]); preempt_enable_nested(); - spin_unlock_bh(&t->lock[hash_key & t->lock_mask].l); - if (dir--) + spin_unlock(&t->lock[hash_key & t->lock_mask].l); + + spin_unlock_bh(&cp->lock); + if (dir-- && by_me) goto next_dir; } From fbe1e01e818ee6db86ff947599bf0bea96de7e71 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:16 +0300 Subject: [PATCH 4/8] ipvs: do not leak dest after get from dest trash Sashiko warns about leaked dest if ip_vs_start_estimator() fails in ip_vs_add_dest(). Add ip_vs_trash_put_dest() to put back the dest into dest trash. Link: https://sashiko.dev/#/patchset/20260428175725.72050-1-ja%40ssi.bg Fixes: 705dd3444081 ("ipvs: use kthreads for stats estimation") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index caec516856e9..d81077c2457a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1102,6 +1102,24 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, return dest; } +/* Put destination in trash */ +static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs, + struct ip_vs_dest *dest, unsigned long istart, + bool cleanup) +{ + spin_lock_bh(&ipvs->dest_trash_lock); + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), + refcount_read(&dest->refcnt)); + if (list_empty(&ipvs->dest_trash) && !cleanup) + mod_timer(&ipvs->dest_trash_timer, + jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); + /* dest lives in trash with reference */ + list_add(&dest->t_list, &ipvs->dest_trash); + dest->idle_start = istart; + spin_unlock_bh(&ipvs->dest_trash_lock); +} + static void ip_vs_dest_rcu_free(struct rcu_head *head) { struct ip_vs_dest *dest; @@ -1461,9 +1479,12 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) ntohs(dest->vport)); ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + /* On error put back dest into the trash */ if (ret < 0) - return ret; - __ip_vs_update_dest(svc, dest, udest, 1); + ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start, + false); + else + __ip_vs_update_dest(svc, dest, udest, 1); } else { /* * Allocate and initialize the dest structure @@ -1533,17 +1554,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, */ ip_vs_rs_unhash(dest); - spin_lock_bh(&ipvs->dest_trash_lock); - IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", - IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), - refcount_read(&dest->refcnt)); - if (list_empty(&ipvs->dest_trash) && !cleanup) - mod_timer(&ipvs->dest_trash_timer, - jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); - /* dest lives in trash with reference */ - list_add(&dest->t_list, &ipvs->dest_trash); - dest->idle_start = 0; - spin_unlock_bh(&ipvs->dest_trash_lock); + ip_vs_trash_put_dest(ipvs, dest, 0, cleanup); /* Queue up delayed work to expire all no destination connections. * No-op when CONFIG_SYSCTL is disabled. From 2fd109238925d53c44ea409df0558844af7877b8 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:17 +0300 Subject: [PATCH 5/8] ipvs: fix races around est_mutex and est_cpulist Sashiko reports for races and possible crash around the usage of est_cpulist_valid and sysctl_est_cpulist. The problem is that we do not lock est_mutex in some places which can lead to wrong write ordering and as result problems when calling cpumask_weight() and cpumask_empty(). Fix them by moving the est_max_threads read/write under locked est_mutex. Do the same for one ip_vs_est_reload_start() call to protect the cpumask_empty() usage of sysctl_est_cpulist. To remove the chance of deadlock while stopping the estimation kthreads, keep the data structure for kthread 0 even after last estimator is removed and do not hold mutexes while stopping this task. Now we will use a new flag 'needed' to know when kthread 0 should run. The kthreads above 0 do not use mutexes, so stop them under est_mutex because their kthread data still can be destroyed if they do not serve estimators. Now all kthreads will be started by the est_reload_work to properly serialize the stop/start for kthread 0. Reduce the use of service_mutex in ip_vs_est_calc_phase() because under est_mutex we can safely walk est_kt_arr to stop the kthreads above slot 0. As ip_vs_stop_estimator() for tot_stats should be called under service_mutex, do it early in the netns exit path in ip_vs_flush() to avoid locking the mutex again later. It still should be called in ip_vs_control_net_cleanup_sysctl() when we are called during netns init error. Use -2 for ktid as indicator if estimator was already stopped. Finally, fix use-after-free for kd->est_row in ip_vs_est_calc_phase(). est->ktrow should simply switch to a delay value while estimator is linked to est_temp_list. Link: https://sashiko.dev/#/patchset/20260331165015.2777765-1-longman%40redhat.com Link: https://sashiko.dev/#/patchset/20260420171308.87192-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260422125123.40658-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260424175858.54752-1-ja%40ssi.bg Link: https://sashiko.dev/#/patchset/20260425103918.7447-1-ja%40ssi.bg Fixes: f0be83d54217 ("ipvs: add est_cpulist and est_nice sysctl vars") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 11 ++++- net/netfilter/ipvs/ip_vs_ctl.c | 51 +++++++++++++++++---- net/netfilter/ipvs/ip_vs_est.c | 83 ++++++++++++++++++++-------------- 3 files changed, 100 insertions(+), 45 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 72d325c81313..d28ad8a0541f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -491,6 +491,7 @@ struct ip_vs_est_kt_data { DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ unsigned long est_timer; /* estimation timer (jiffies) */ struct ip_vs_stats *calc_stats; /* Used for calculation */ + int needed; /* task is needed */ int tick_len[IPVS_EST_NTICKS]; /* est count */ int id; /* ktid per netns */ int chain_max; /* max ests per tick chain */ @@ -1884,11 +1885,19 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); -void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart); int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_stop_estimator_tot_stats(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + ipvs->tot_stats->s.est.ktid = -2; +#endif +} + static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d81077c2457a..5c9f8e0e238f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -261,12 +261,28 @@ static void est_reload_work_handler(struct work_struct *work) if (!kd) continue; /* New config ? Stop kthread tasks */ - if (genid != genid_done) - ip_vs_est_kthread_stop(kd); + if (genid != genid_done) { + if (!id) { + /* Only we can stop kt 0 but not under mutex */ + mutex_unlock(&ipvs->est_mutex); + ip_vs_est_kthread_stop(kd); + mutex_lock(&ipvs->est_mutex); + if (!READ_ONCE(ipvs->enable)) + goto unlock; + /* kd for kt 0 is never destroyed */ + } else { + ip_vs_est_kthread_stop(kd); + } + } if (!kd->task && !ip_vs_est_stopped(ipvs)) { + bool start; + /* Do not start kthreads above 0 in calc phase */ - if ((!id || !ipvs->est_calc_phase) && - ip_vs_est_kthread_start(ipvs, kd) < 0) + if (id) + start = !ipvs->est_calc_phase; + else + start = kd->needed; + if (start && ip_vs_est_kthread_start(ipvs, kd) < 0) repeat = true; } } @@ -1823,11 +1839,16 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; if (!READ_ONCE(ipvs->enable)) { + mutex_lock(&ipvs->est_mutex); + /* Now there is a service - full throttle */ WRITE_ONCE(ipvs->enable, 1); + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + /* Start estimation for first time */ - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); } return 0; @@ -2103,6 +2124,11 @@ static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup) t = p; } } + /* Stop the tot_stats estimator early under service_mutex + * to avoid locking it again later. + */ + if (cleanup) + ip_vs_stop_estimator_tot_stats(ipvs); return 0; } @@ -2348,7 +2374,7 @@ static int ipvs_proc_est_cpumask_set(const struct ctl_table *table, /* est_max_threads may depend on cpulist size */ ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); ipvs->est_calc_phase = 1; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); unlock: mutex_unlock(&ipvs->est_mutex); @@ -2428,7 +2454,7 @@ static int ipvs_proc_est_nice(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -2455,7 +2481,7 @@ static int ipvs_proc_run_estimation(const struct ctl_table *table, int write, mutex_lock(&ipvs->est_mutex); if (*valp != val) { *valp = val; - ip_vs_est_reload_start(ipvs); + ip_vs_est_reload_start(ipvs, true); } mutex_unlock(&ipvs->est_mutex); } @@ -5005,7 +5031,14 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + if (ipvs->tot_stats->s.est.ktid != -2) { + /* Not stopped yet? This happens only on netns init error and + * we even do not need to lock the service_mutex for this case. + */ + mutex_lock(&ipvs->service_mutex); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + mutex_unlock(&ipvs->service_mutex); + } if (ipvs->est_cpulist_valid) free_cpumask_var(ipvs->sysctl_est_cpulist); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 433ba3cab58c..ab09f5182951 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -68,6 +68,11 @@ and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data + - data protected by service_mutex: est_temp_list, est_add_ktid, + est_kt_count(R/W), est_kt_arr(R/W), est_genid_done, kd->needed(R/W) + - data protected by est_mutex: est_genid, est_max_threads, sysctl_est_cpulist, + est_cpulist_valid, sysctl_est_nice, est_stopped, sysctl_run_estimation, + est_kt_count(R), est_kt_arr(R), kd->needed(R), kd->task (id > 0) */ static struct lock_class_key __ipvs_est_key; @@ -227,14 +232,17 @@ static int ip_vs_estimation_kthread(void *data) } /* Schedule stop/start for kthread tasks */ -void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +void ip_vs_est_reload_start(struct netns_ipvs *ipvs, bool restart) { + lockdep_assert_held(&ipvs->est_mutex); + /* Ignore reloads before first service is added */ if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); - /* Bump the kthread configuration genid */ - atomic_inc(&ipvs->est_genid); + /* Bump the kthread configuration genid if stopping is requested */ + if (restart) + atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } @@ -304,12 +312,17 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) void *arr = NULL; int i; - if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - READ_ONCE(ipvs->enable) && ipvs->est_max_threads) - return -EINVAL; - mutex_lock(&ipvs->est_mutex); + /* Allow kt 0 data to be created before the services are added + * and limit the kthreads when services are present. + */ + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; @@ -333,6 +346,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); + kd->needed = 1; /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { @@ -341,12 +355,8 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) goto out; } - /* Start kthread tasks only when services are present */ - if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { - ret = ip_vs_est_kthread_start(ipvs, kd); - if (ret < 0) - goto out; - } + /* Request kthread to be started */ + ip_vs_est_reload_start(ipvs, false); if (arr) ipvs->est_kt_count++; @@ -482,12 +492,11 @@ static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, /* Start estimation for stats */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_count > 0 ? + ipvs->est_kt_arr[0] : NULL; struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) - ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); - est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ @@ -496,8 +505,15 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) * will not allocate much memory, just for kt 0. */ ret = 0; - if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + if (!kd) { ret = ip_vs_est_add_kthread(ipvs); + } else if (!kd->needed) { + mutex_lock(&ipvs->est_mutex); + /* We have job for the kt 0 task */ + kd->needed = 1; + ip_vs_est_reload_start(ipvs, true); + mutex_unlock(&ipvs->est_mutex); + } if (ret >= 0) hlist_add_head(&est->list, &ipvs->est_temp_list); else @@ -578,16 +594,14 @@ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) } end_kt0: - /* kt 0 is freed after all other kthreads and chains are empty */ + /* kt 0 task is stopped after all other kt slots and chains are empty */ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; - if (!kd || !kd->est_count) { + if (kd && !kd->est_count) { mutex_lock(&ipvs->est_mutex); - if (kd) { - ip_vs_est_kthread_destroy(kd); - ipvs->est_kt_arr[0] = NULL; - } - ipvs->est_kt_count--; + /* Keep the kt0 data but request kthread_stop */ + kd->needed = 0; + ip_vs_est_reload_start(ipvs, true); mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } @@ -647,9 +661,9 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) u64 val; INIT_HLIST_HEAD(&chain); - mutex_lock(&ipvs->service_mutex); + mutex_lock(&ipvs->est_mutex); kd = ipvs->est_kt_arr[0]; - mutex_unlock(&ipvs->service_mutex); + mutex_unlock(&ipvs->est_mutex); s = kd ? kd->calc_stats : NULL; if (!s) goto out; @@ -748,16 +762,16 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; - mutex_lock(&ipvs->service_mutex); - /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!READ_ONCE(ipvs->enable)) - goto unlock2; + if (kthread_should_stop() || !READ_ONCE(ipvs->enable)) { + mutex_unlock(&ipvs->est_mutex); + return; + } kd = ipvs->est_kt_arr[id]; if (!kd) continue; @@ -765,9 +779,11 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) } mutex_unlock(&ipvs->est_mutex); + mutex_lock(&ipvs->service_mutex); + /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while - * we reschedule. Even for kthread 0. + * we reschedule. */ step = 0; @@ -849,9 +865,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; - est->ktrow = row - kd->est_row; - if (est->ktrow < 0) - est->ktrow += IPVS_EST_NTICKS; + est->ktrow = delay; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) @@ -889,7 +903,6 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; -unlock2: mutex_unlock(&ipvs->est_mutex); unlock: From 4ee52b7021a7cb9356f8b9aff5631c68512a9e1b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 30 Apr 2026 10:44:18 +0300 Subject: [PATCH 6/8] ipvs: fix shift-out-of-bounds in ip_vs_rht_desired_size Calling roundup_pow_of_two() with 0 has undefined result: UBSAN: shift-out-of-bounds in ./include/linux/log2.h:57:13 shift exponent 64 is too large for 64-bit type 'unsigned long' CPU: 1 UID: 0 PID: 77 Comm: kworker/u8:4 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/18/2026 Workqueue: events_unbound conn_resize_work_handler Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 ubsan_epilogue+0xa/0x30 lib/ubsan.c:233 __ubsan_handle_shift_out_of_bounds+0x385/0x410 lib/ubsan.c:494 __roundup_pow_of_two include/linux/log2.h:57 [inline] ip_vs_rht_desired_size+0x2cf/0x410 net/netfilter/ipvs/ip_vs_core.c:240 ip_vs_conn_desired_size net/netfilter/ipvs/ip_vs_conn.c:765 [inline] conn_resize_work_handler+0x1b6/0x14c0 net/netfilter/ipvs/ip_vs_conn.c:822 process_one_work kernel/workqueue.c:3302 [inline] process_scheduled_works+0xb5d/0x1860 kernel/workqueue.c:3385 worker_thread+0xa53/0xfc0 kernel/workqueue.c:3466 kthread+0x388/0x470 kernel/kthread.c:436 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Reported-by: syzbot+217f1db9c791e27fe54a@syzkaller.appspotmail.com Fixes: b655388111cf ("ipvs: add resizable hash tables") Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f5b7a2047291..d40b404c1bf6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -237,7 +237,7 @@ int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n, { if (!t) return 1 << min_bits; - n = roundup_pow_of_two(n); + n = n > 0 ? roundup_pow_of_two(n) : 1; if (lfactor < 0) { int factor = min(-lfactor, max_bits); From aa6065206987278291c09d0c6aebed687114c925 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:19 +0300 Subject: [PATCH 7/8] ipvs: Guard access of HK_TYPE_KTHREAD cpumask with RCU The ip_vs_ctl.c file and the associated ip_vs.h file are the only places in the kernel where HK_TYPE_KTHREAD cpumask is being retrieved and used. Now that HK_TYPE_KTHREAD/HK_TYPE_DOMAIN cpumask can be changed at run time. We need to use RCU to guard access to this cpumask to avoid a potential UAF problem as the returned cpumask may be freed before it is being used. We can replace HK_TYPE_KTHREAD by HK_TYPE_DOMAIN as they are aliases of each other, but keeping the HK_TYPE_KTHREAD name can highlight the fact that it is the kthread initiated by ipvs that is being controlled. Fixes: 03ff73510169 ("cpuset: Update HK_TYPE_DOMAIN cpumask from cpuset") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 20 ++++++++++++++++---- net/netfilter/ipvs/ip_vs_ctl.c | 13 ++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d28ad8a0541f..02762ce73a0c 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1412,7 +1412,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { if (ipvs->est_cpulist_valid) return ipvs->sysctl_est_cpulist; @@ -1530,7 +1530,7 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } -static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +static inline const struct cpumask *__sysctl_est_cpulist(struct netns_ipvs *ipvs) { return housekeeping_cpumask(HK_TYPE_KTHREAD); } @@ -1565,6 +1565,18 @@ static inline int sysctl_svc_lfactor(struct netns_ipvs *ipvs) return READ_ONCE(ipvs->sysctl_svc_lfactor); } +static inline bool sysctl_est_cpulist_empty(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_empty(__sysctl_est_cpulist(ipvs)); +} + +static inline unsigned int sysctl_est_cpulist_weight(struct netns_ipvs *ipvs) +{ + guard(rcu)(); + return cpumask_weight(__sysctl_est_cpulist(ipvs)); +} + /* IPVS core functions * (from ip_vs_core.c) */ @@ -1904,7 +1916,7 @@ static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) /* Stop tasks while cpulist is empty or if disabled with flag */ ipvs->est_stopped = !sysctl_run_estimation(ipvs) || (ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs))); + sysctl_est_cpulist_empty(ipvs)); #endif } @@ -1920,7 +1932,7 @@ static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) { unsigned int limit = IPVS_EST_CPU_KTHREADS * - cpumask_weight(sysctl_est_cpulist(ipvs)); + sysctl_est_cpulist_weight(ipvs); return max(1U, limit); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5c9f8e0e238f..c7c7f6a7a9f6 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2394,11 +2394,14 @@ static int ipvs_proc_est_cpumask_get(const struct ctl_table *table, mutex_lock(&ipvs->est_mutex); - if (ipvs->est_cpulist_valid) - mask = *valp; - else - mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); - ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + /* HK_TYPE_KTHREAD cpumask needs RCU protection */ + scoped_guard(rcu) { + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + } mutex_unlock(&ipvs->est_mutex); From 8f78b749f3da0f43990490b4c1193b5ede3eec0a Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 30 Apr 2026 10:44:20 +0300 Subject: [PATCH 8/8] sched/isolation: Make HK_TYPE_KTHREAD an alias of HK_TYPE_DOMAIN Since commit 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management"), kthreads default to use the HK_TYPE_DOMAIN cpumask. IOW, it is no longer affected by the setting of the nohz_full boot kernel parameter. That means HK_TYPE_KTHREAD should now be an alias of HK_TYPE_DOMAIN instead of HK_TYPE_KERNEL_NOISE to correctly reflect the current kthread behavior. Make the change as HK_TYPE_KTHREAD is still being used in some networking code. Fixes: 041ee6f3727a ("kthread: Rely on HK_TYPE_DOMAIN for preferred affinity management") Signed-off-by: Waiman Long Signed-off-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- include/linux/sched/isolation.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h index dc3975ff1b2e..cf0fd03dd7a2 100644 --- a/include/linux/sched/isolation.h +++ b/include/linux/sched/isolation.h @@ -20,6 +20,11 @@ enum hk_type { HK_TYPE_KERNEL_NOISE, HK_TYPE_MAX, + /* + * HK_TYPE_KTHREAD is now an alias of HK_TYPE_DOMAIN + */ + HK_TYPE_KTHREAD = HK_TYPE_DOMAIN, + /* * The following housekeeping types are only set by the nohz_full * boot commandline option. So they can share the same value. @@ -29,7 +34,6 @@ enum hk_type { HK_TYPE_RCU = HK_TYPE_KERNEL_NOISE, HK_TYPE_MISC = HK_TYPE_KERNEL_NOISE, HK_TYPE_WQ = HK_TYPE_KERNEL_NOISE, - HK_TYPE_KTHREAD = HK_TYPE_KERNEL_NOISE }; #ifdef CONFIG_CPU_ISOLATION