ipvs: add resizable hash tables

Add infrastructure for resizable hash tables based on hlist_bl, which
we will use in follow-up patches.

The tables allow RCU lookups to proceed during resizing. Bucket
modifications are protected by a per-bucket bit lock plus additional
custom locking, and a table is resized when its load crosses thresholds
derived from a load factor parameter.

Compared to other implementations, we rely on:
* fast entry removal by using node unlinking without pre-lookup
* entry rehashing when hash key changes
* entries can contain multiple hash nodes
* custom locking depending on different contexts
* adjustable load factor to customize the grow/shrink process
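
A minimal lookup sketch under this scheme (illustrative only: "my_table",
"struct foo" with its "hnode"/"key" members, and the af/addr/port/proto/key
inputs are hypothetical, not part of this patch):

	struct ip_vs_rht *t, *p;
	struct hlist_bl_head *head;
	struct hlist_bl_node *node;
	struct foo *f = NULL;
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();

	rcu_read_lock();
	ip_vs_rht_for_each_table_rcu(my_table, t, p) {
		u32 hash = ip_vs_rht_hash_linfo(t, af, addr, port, proto);

		ip_vs_rht_walk_bucket_rcu(t, hash, head) {
			hlist_bl_for_each_entry_rcu(f, node, head, hnode)
				if (f->key == key)
					goto found;
		}
	}
	f = NULL;
found:
	rcu_read_unlock();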

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Florian Westphal <fw@strlen.de>

@@ -11,6 +11,7 @@
#include <asm/types.h> /* for __uXX types */
#include <linux/list.h> /* for struct list_head */
#include <linux/rculist_bl.h> /* for struct hlist_bl_head */
#include <linux/spinlock.h> /* for struct rwlock_t */
#include <linux/atomic.h> /* for struct atomic_t */
#include <linux/refcount.h> /* for struct refcount_t */
@@ -30,6 +31,7 @@
#endif
#include <net/net_namespace.h> /* Netw namespace */
#include <linux/sched/isolation.h>
#include <linux/siphash.h>
#define IP_VS_HDR_INVERSE 1
#define IP_VS_HDR_ICMP 2
@@ -271,6 +273,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
pr_err(msg, ##__VA_ARGS__); \
} while (0)
struct ip_vs_aligned_lock {
spinlock_t l; /* Protect buckets */
} ____cacheline_aligned_in_smp;
/* For arrays per family */
enum {
IP_VS_AF_INET,
@@ -484,6 +490,198 @@ struct ip_vs_est_kt_data {
int est_row; /* estimated row */
};
/* IPVS resizable hash tables */
struct ip_vs_rht {
struct hlist_bl_head *buckets;
struct ip_vs_rht __rcu *new_tbl; /* New/Same table */
seqcount_t *seqc; /* Protects moves */
struct ip_vs_aligned_lock *lock; /* Protect seqc */
int mask; /* Buckets mask */
int size; /* Buckets */
int seqc_mask; /* seqc mask */
int lock_mask; /* lock mask */
u32 table_id;
int u_thresh; /* upper threshold */
int l_thresh; /* lower threshold */
int lfactor; /* Load Factor (shift)*/
int bits; /* size = 1 << bits */
siphash_key_t hash_key;
struct rcu_head rcu_head;
};
/**
* ip_vs_rht_for_each_table() - Walk the hash tables
* @table: struct ip_vs_rht __rcu *table
* @t: current table, used as cursor, struct ip_vs_rht *var
* @p: previous table, temp struct ip_vs_rht *var
*
* Walk tables assuming others can not change the installed tables
*/
#define ip_vs_rht_for_each_table(table, t, p) \
for (p = NULL, t = rcu_dereference_protected(table, 1); \
t != p; \
p = t, t = rcu_dereference_protected(t->new_tbl, 1))
/**
* ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock
* @table: struct ip_vs_rht __rcu *table
* @t: current table, used as cursor, struct ip_vs_rht *var
* @p: previous table, temp struct ip_vs_rht *var
*
* We usually search in one table, and also in a second table while resizing is in progress
*/
#define ip_vs_rht_for_each_table_rcu(table, t, p) \
for (p = NULL, t = rcu_dereference(table); \
t != p; \
p = t, t = rcu_dereference(t->new_tbl))
/**
* ip_vs_rht_for_each_bucket() - Walk all table buckets
* @t: current table, used as cursor, struct ip_vs_rht *var
* @bucket: bucket index, used as cursor, u32 var
* @head: bucket address, used as cursor, struct hlist_bl_head *var
*/
#define ip_vs_rht_for_each_bucket(t, bucket, head) \
for (bucket = 0, head = (t)->buckets; \
bucket < t->size; bucket++, head++)
/**
* ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved
* @t: current table, used as cursor, struct ip_vs_rht *var
* @bucket: index of current bucket or hash key
* @sc: temp seqcount_t *var
* @seq: temp unsigned int var for sequence count
* @retry: temp int var
*/
#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \
for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \
retry && ({ seq = read_seqcount_begin(sc); 1; }); \
retry = read_seqcount_retry(sc, seq))
/**
* DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables
*
* Variables for ip_vs_rht_walk_buckets_rcu
*/
#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \
struct ip_vs_rht *_t, *_p; \
unsigned int _seq; \
seqcount_t *_sc; \
u32 _bucket; \
int _retry
/**
* ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock
* @table: struct ip_vs_rht __rcu *table
* @head: bucket address, used as cursor, struct hlist_bl_head *var
*
* Can be used while others add/delete/move entries
* Not suitable if duplicates are not desired
* Possible cases for reader that uses cond_resched_rcu() in the loop:
* - new table can not be installed, no need to repeat
* - new table can be installed => check and repeat if new table is
* installed, needed for !PREEMPT_RCU
*/
#define ip_vs_rht_walk_buckets_rcu(table, head) \
ip_vs_rht_for_each_table_rcu(table, _t, _p) \
ip_vs_rht_for_each_bucket(_t, _bucket, head) \
ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \
_seq, _retry)
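/*
* Example (illustrative, not part of the API): count all entries while other
* contexts may add/delete/move them. "my_table", "struct foo" and its "hnode"
* member are hypothetical:
*
*	DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
*	struct hlist_bl_head *head;
*	struct hlist_bl_node *node;
*	struct foo *f;
*	int n = 0;
*
*	rcu_read_lock();
*	ip_vs_rht_walk_buckets_rcu(my_table, head)
*		hlist_bl_for_each_entry_rcu(f, node, head, hnode)
*			n++;
*	rcu_read_unlock();
*
* An entry may be counted more than once if a bucket walk is retried or if
* entries are moved between the tables during the walk.
*/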
/**
* DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables
*
* Variables for ip_vs_rht_walk_bucket_rcu
*/
#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \
unsigned int _seq; \
seqcount_t *_sc; \
int _retry
/**
* ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock
* @t: current table, struct ip_vs_rht *var
* @bucket: index of current bucket or hash key
* @head: bucket address, used as cursor, struct hlist_bl_head *var
*
* Can be used while others add/delete/move entries
* Not suitable if duplicates are not desired
* Possible cases for reader that uses cond_resched_rcu() in the loop:
* - new table can not be installed, no need to repeat
* - new table can be installed => check and repeat if new table is
* installed, needed for !PREEMPT_RCU
*/
#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \
if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \
{} \
else \
ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry)
/**
* DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables
*
* Variables for ip_vs_rht_walk_buckets_safe_rcu
*/
#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \
struct ip_vs_rht *_t, *_p; \
u32 _bucket
/**
* ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock
* @table: struct ip_vs_rht __rcu *table
* @head: bucket address, used as cursor, struct hlist_bl_head *var
*
* Can be used while others add/delete entries but moving is disabled
* Using cond_resched_rcu() should be safe if tables do not change
*/
#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \
ip_vs_rht_for_each_table_rcu(table, _t, _p) \
ip_vs_rht_for_each_bucket(_t, _bucket, head)
/**
* DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables
*
* Variables for ip_vs_rht_walk_buckets
*/
#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \
struct ip_vs_rht *_t, *_p; \
u32 _bucket
/**
* ip_vs_rht_walk_buckets() - Walk all buckets
* @table: struct ip_vs_rht __rcu *table
* @head: bucket address, used as cursor, struct hlist_bl_head *var
*
* Use if others can not add/delete/move entries
*/
#define ip_vs_rht_walk_buckets(table, head) \
ip_vs_rht_for_each_table(table, _t, _p) \
ip_vs_rht_for_each_bucket(_t, _bucket, head)
/* Entries can be in one of two tables, so we flip a bit when a new table is
* created and store it as the highest bit of the hash keys
*/
#define IP_VS_RHT_TABLE_ID_MASK BIT(31)
/* Check if hash key is from this table */
static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key)
{
return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK);
}
/* Build per-table hash key from hash value */
static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash)
{
return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK);
}
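/* Example (illustrative): delete an entry without a pre-lookup, using the
* hash key stored in it by ip_vs_rht_build_hash_key() when it was hashed.
* "my_table", "struct foo" and its "hnode"/"hash_key" members are
* hypothetical; the caller is assumed to hold whatever custom locking
* serializes it against concurrent moves:
*
*	struct ip_vs_rht *t, *p;
*	struct hlist_bl_head *head;
*
*	ip_vs_rht_for_each_table(my_table, t, p) {
*		if (!ip_vs_rht_same_table(t, f->hash_key))
*			continue;
*		head = &t->buckets[f->hash_key & t->mask];
*		hlist_bl_lock(head);
*		hlist_bl_del_rcu(&f->hnode);
*		hlist_bl_unlock(head);
*		break;
*	}
*/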
void ip_vs_rht_free(struct ip_vs_rht *t);
void ip_vs_rht_rcu_free(struct rcu_head *head);
struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks);
int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
int lfactor, int min_bits, int max_bits);
void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
int min_bits, int max_bits);
u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
const union nf_inet_addr *addr, u32 v1, u32 v2);
struct dst_entry;
struct iphdr;
struct ip_vs_conn;


@@ -76,11 +76,6 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
#define IP_VS_ADDRSTRLEN (8+1)
#endif
struct ip_vs_aligned_lock
{
spinlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;


@@ -117,6 +117,185 @@ void ip_vs_init_hash_table(struct list_head *table, int rows)
INIT_LIST_HEAD(&table[rows]);
}
/* IPVS Resizable Hash Tables:
* - list_bl buckets with bit lock
*
* Goals:
* - RCU lookup for entry can run in parallel with add/del/move operations
* - hash keys can be on non-contiguous memory
* - support entries with duplicate keys
* - unlink entries without lookup, use the saved table and bucket id
* - resizing can be triggered by load changes or by the key refresh period
* - customizable load factor to balance between speed and memory usage
* - add/del/move operations should be allowed from any context
*
* Resizing:
* - new table is attached to the current table and all entries are moved
* with new hash key. Finally, the new table is installed as current one and
* the old table is released after RCU grace period.
* - RCU read-side critical sections will walk two tables while resizing is
* in progress
* - new entries are added to the new table
* - entries will be deleted from the old or from the new table, the table_id
* can be saved into entry as part of the hash key to know where the entry is
* hashed
* - move operations may delay readers or cause a retry for the modified
* bucket. As a result, a searched entry will still be found, but walkers
* that operate on multiple entries may see the same entry twice if the
* bucket walk is retried.
* - for the fast path, the number of entries (load) can be compared to
* u_thresh and l_thresh to decide when to trigger table growing/shrinking.
* They are calculated from the load factor (a shift count): a negative
* value keeps the load below 100% to reduce collisions by maintaining a
* larger table, while a positive value tolerates collisions by using a
* smaller table and a load above 100%: u_thresh = size * (2 ^ lfactor)
*
* Locking:
* - lock: protects seqc if a context other than the resizer can move entries
* - seqc: seqcount_t, delay/retry readers while entries are moved to
* new table on resizing
* - bit lock: serialize bucket modifications
* - writers may use other locking mechanisms to serialize operations for
* resizing, moving and installing new tables
*/
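/* Illustrative sketch (not part of this patch): how a resizer might move the
* entries from bucket "from" of the old table "t" into the new table "n"
* using the primitives above. "struct foo", its "hnode"/"hash_key" members
* and foo_hash() are hypothetical:
*
*	struct hlist_bl_head *head = &t->buckets[from], *new_head;
*	seqcount_t *sc = &t->seqc[from & t->seqc_mask];
*	struct hlist_bl_node *node;
*	struct foo *f;
*
*	spin_lock(&t->lock[from & t->lock_mask].l);
*	write_seqcount_begin(sc);	(readers of this bucket will retry)
*	hlist_bl_lock(head);
*	while ((node = hlist_bl_first(head))) {
*		f = hlist_bl_entry(node, struct foo, hnode);
*		hlist_bl_del_rcu(&f->hnode);
*		f->hash_key = ip_vs_rht_build_hash_key(n, foo_hash(n, f));
*		new_head = &n->buckets[f->hash_key & n->mask];
*		hlist_bl_lock(new_head);
*		hlist_bl_add_head_rcu(&f->hnode, new_head);
*		hlist_bl_unlock(new_head);
*	}
*	hlist_bl_unlock(head);
*	write_seqcount_end(sc);
*	spin_unlock(&t->lock[from & t->lock_mask].l);
*/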
void ip_vs_rht_free(struct ip_vs_rht *t)
{
kvfree(t->buckets);
kvfree(t->seqc);
kvfree(t->lock);
kfree(t);
}
void ip_vs_rht_rcu_free(struct rcu_head *head)
{
struct ip_vs_rht *t;
t = container_of(head, struct ip_vs_rht, rcu_head);
ip_vs_rht_free(t);
}
struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks)
{
struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL);
int i;
if (!t)
return NULL;
if (scounts) {
int ml = roundup_pow_of_two(nr_cpu_ids);
scounts = min(scounts, buckets);
scounts = min(scounts, ml);
t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL);
if (!t->seqc)
goto err;
for (i = 0; i < scounts; i++)
seqcount_init(&t->seqc[i]);
if (locks) {
locks = min(locks, scounts);
t->lock = kvmalloc_array(locks, sizeof(*t->lock),
GFP_KERNEL);
if (!t->lock)
goto err;
for (i = 0; i < locks; i++)
spin_lock_init(&t->lock[i].l);
}
}
t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL);
if (!t->buckets)
goto err;
for (i = 0; i < buckets; i++)
INIT_HLIST_BL_HEAD(&t->buckets[i]);
t->mask = buckets - 1;
t->size = buckets;
t->seqc_mask = scounts - 1;
t->lock_mask = locks - 1;
t->u_thresh = buckets;
t->l_thresh = buckets >> 4;
t->bits = order_base_2(buckets);
/* new_tbl points to self if no new table is filled */
RCU_INIT_POINTER(t->new_tbl, t);
get_random_bytes(&t->hash_key, sizeof(t->hash_key));
return t;
err:
ip_vs_rht_free(t);
return NULL;
}
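/* Illustrative only: a user of this infrastructure might create and install
* an initial table like this ("my_table" is a hypothetical
* struct ip_vs_rht __rcu * pointer; 0/8/20 are example lfactor, min_bits and
* max_bits values):
*
*	struct ip_vs_rht *t = ip_vs_rht_alloc(256, 32, 4);
*
*	if (!t)
*		return -ENOMEM;
*	ip_vs_rht_set_thresholds(t, t->size, 0, 8, 20);
*	RCU_INIT_POINTER(my_table, t);
*/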
/* Get the desired table size for n entries, based on the current table size,
* using the formula size = n / (2^lfactor)
* lfactor: shift value for the load factor:
* - >0: u_thresh=size << lfactor, for load factor above 100%
* - <0: u_thresh=size >> -lfactor, for load factor below 100%
* - 0: for load factor of 100%
*/
int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
int lfactor, int min_bits, int max_bits)
{
if (!t)
return 1 << min_bits;
n = roundup_pow_of_two(n);
if (lfactor < 0) {
int factor = min(-lfactor, max_bits);
n = min(n, 1 << (max_bits - factor));
n <<= factor;
} else {
n = min(n >> lfactor, 1 << max_bits);
}
if (lfactor != t->lfactor)
return clamp(n, 1 << min_bits, 1 << max_bits);
if (n > t->size)
return n;
if (n > t->size >> 4)
return t->size;
/* Shrink but keep it n * 2 to prevent frequent resizing */
return clamp(n << 1, 1 << min_bits, 1 << max_bits);
}
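/* Example (illustrative): with lfactor = -2 (load kept below 25%) and
* n = 1000 linked entries, n is rounded up to 1024 and shifted left by 2,
* so a current 1024-bucket table using the same lfactor would get a desired
* size of 4096 buckets (the result always stays within the min_bits/max_bits
* limits).
*/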
/* Set thresholds based on table size and load factor:
* u_thresh = size * (2^lfactor)
* l_thresh = u_thresh / 16
* u_thresh/l_thresh can be used to check if load triggers a table grow/shrink
*/
void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
int min_bits, int max_bits)
{
if (size >= 1 << max_bits)
t->u_thresh = INT_MAX; /* stop growing */
else if (lfactor <= 0)
t->u_thresh = size >> min(-lfactor, max_bits);
else
t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor;
/* l_thresh: shrink when load is 16 times lower, can be 0 */
if (size >= 1 << max_bits)
t->l_thresh = (1 << max_bits) >> 4;
else if (size > 1 << min_bits)
t->l_thresh = t->u_thresh >> 4;
else
t->l_thresh = 0; /* stop shrinking */
}
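/* Example (illustrative): for size = 4096 and lfactor = -2 (with min_bits
* below 12 and max_bits above 12), u_thresh = 4096 >> 2 = 1024 and
* l_thresh = 1024 >> 4 = 64, i.e. a grow can be triggered once more than
* 1024 entries are linked and a shrink considered when they drop below 64.
*/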
/* Return hash value for local info (fast, insecure) */
u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
const union nf_inet_addr *addr, u32 v1, u32 v2)
{
u32 v3;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
v3 = ipv6_addr_hash(&addr->in6);
else
#endif
v3 = addr->all[0];
return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]);
}
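/* Example (illustrative): hash a hypothetical (addr, port, proto) tuple and
* locate its bucket in table t:
*
*	hash = ip_vs_rht_hash_linfo(t, af, addr, port, proto);
*	head = &t->buckets[hash & t->mask];
*/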
static inline void
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{