mirror of
https://github.com/torvalds/linux.git
synced 2026-05-13 00:28:54 +02:00
ipvs: add resizable hash tables
Add infrastructure for resizable hash tables based on hlist_bl which we will use in followup patches. The tables allow RCU lookups during resizing, bucket modifications are protected with per-bucket bit lock and additional custom locking, the tables are resized when load reaches thresholds determined based on load factor parameter. Compared to other implementations we rely on: * fast entry removal by using node unlinking without pre-lookup * entry rehashing when hash key changes * entries can contain multiple hash nodes * custom locking depending on different contexts * adjustable load factor to customize the grow/shrink process Signed-off-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Florian Westphal <fw@strlen.de>
This commit is contained in:
parent
1ac252ad03
commit
b655388111
|
|
@ -11,6 +11,7 @@
|
|||
#include <asm/types.h> /* for __uXX types */
|
||||
|
||||
#include <linux/list.h> /* for struct list_head */
|
||||
#include <linux/rculist_bl.h> /* for struct hlist_bl_head */
|
||||
#include <linux/spinlock.h> /* for struct rwlock_t */
|
||||
#include <linux/atomic.h> /* for struct atomic_t */
|
||||
#include <linux/refcount.h> /* for struct refcount_t */
|
||||
|
|
@ -30,6 +31,7 @@
|
|||
#endif
|
||||
#include <net/net_namespace.h> /* Netw namespace */
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/siphash.h>
|
||||
|
||||
#define IP_VS_HDR_INVERSE 1
|
||||
#define IP_VS_HDR_ICMP 2
|
||||
|
|
@ -271,6 +273,10 @@ static inline const char *ip_vs_dbg_addr(int af, char *buf, size_t buf_len,
|
|||
pr_err(msg, ##__VA_ARGS__); \
|
||||
} while (0)
|
||||
|
||||
struct ip_vs_aligned_lock {
|
||||
spinlock_t l; /* Protect buckets */
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
/* For arrays per family */
|
||||
enum {
|
||||
IP_VS_AF_INET,
|
||||
|
|
@ -484,6 +490,198 @@ struct ip_vs_est_kt_data {
|
|||
int est_row; /* estimated row */
|
||||
};
|
||||
|
||||
/* IPVS resizable hash tables */
|
||||
struct ip_vs_rht {
|
||||
struct hlist_bl_head *buckets;
|
||||
struct ip_vs_rht __rcu *new_tbl; /* New/Same table */
|
||||
seqcount_t *seqc; /* Protects moves */
|
||||
struct ip_vs_aligned_lock *lock; /* Protect seqc */
|
||||
int mask; /* Buckets mask */
|
||||
int size; /* Buckets */
|
||||
int seqc_mask; /* seqc mask */
|
||||
int lock_mask; /* lock mask */
|
||||
u32 table_id;
|
||||
int u_thresh; /* upper threshold */
|
||||
int l_thresh; /* lower threshold */
|
||||
int lfactor; /* Load Factor (shift)*/
|
||||
int bits; /* size = 1 << bits */
|
||||
siphash_key_t hash_key;
|
||||
struct rcu_head rcu_head;
|
||||
};
|
||||
|
||||
/**
|
||||
* ip_vs_rht_for_each_table() - Walk the hash tables
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @t: current table, used as cursor, struct ip_vs_rht *var
|
||||
* @p: previous table, temp struct ip_vs_rht *var
|
||||
*
|
||||
* Walk tables assuming others can not change the installed tables
|
||||
*/
|
||||
#define ip_vs_rht_for_each_table(table, t, p) \
|
||||
for (p = NULL, t = rcu_dereference_protected(table, 1); \
|
||||
t != p; \
|
||||
p = t, t = rcu_dereference_protected(t->new_tbl, 1))
|
||||
|
||||
/**
|
||||
* ip_vs_rht_for_each_table_rcu() - Walk the hash tables under RCU reader lock
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @t: current table, used as cursor, struct ip_vs_rht *var
|
||||
* @p: previous table, temp struct ip_vs_rht *var
|
||||
*
|
||||
* We usually search in one table and also in second table on resizing
|
||||
*/
|
||||
#define ip_vs_rht_for_each_table_rcu(table, t, p) \
|
||||
for (p = NULL, t = rcu_dereference(table); \
|
||||
t != p; \
|
||||
p = t, t = rcu_dereference(t->new_tbl))
|
||||
|
||||
/**
|
||||
* ip_vs_rht_for_each_bucket() - Walk all table buckets
|
||||
* @t: current table, used as cursor, struct ip_vs_rht *var
|
||||
* @bucket: bucket index, used as cursor, u32 var
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*/
|
||||
#define ip_vs_rht_for_each_bucket(t, bucket, head) \
|
||||
for (bucket = 0, head = (t)->buckets; \
|
||||
bucket < t->size; bucket++, head++)
|
||||
|
||||
/**
|
||||
* ip_vs_rht_for_bucket_retry() - Retry bucket if entries are moved
|
||||
* @t: current table, used as cursor, struct ip_vs_rht *var
|
||||
* @bucket: index of current bucket or hash key
|
||||
* @sc: temp seqcount_t *var
|
||||
* @seq: temp unsigned int var for sequence count
|
||||
* @retry: temp int var
|
||||
*/
|
||||
#define ip_vs_rht_for_bucket_retry(t, bucket, sc, seq, retry) \
|
||||
for (retry = 1, sc = &(t)->seqc[(bucket) & (t)->seqc_mask]; \
|
||||
retry && ({ seq = read_seqcount_begin(sc); 1; }); \
|
||||
retry = read_seqcount_retry(sc, seq))
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_buckets_rcu
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU() \
|
||||
struct ip_vs_rht *_t, *_p; \
|
||||
unsigned int _seq; \
|
||||
seqcount_t *_sc; \
|
||||
u32 _bucket; \
|
||||
int _retry
|
||||
/**
|
||||
* ip_vs_rht_walk_buckets_rcu() - Walk all buckets under RCU read lock
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Can be used while others add/delete/move entries
|
||||
* Not suitable if duplicates are not desired
|
||||
* Possible cases for reader that uses cond_resched_rcu() in the loop:
|
||||
* - new table can not be installed, no need to repeat
|
||||
* - new table can be installed => check and repeat if new table is
|
||||
* installed, needed for !PREEMPT_RCU
|
||||
*/
|
||||
#define ip_vs_rht_walk_buckets_rcu(table, head) \
|
||||
ip_vs_rht_for_each_table_rcu(table, _t, _p) \
|
||||
ip_vs_rht_for_each_bucket(_t, _bucket, head) \
|
||||
ip_vs_rht_for_bucket_retry(_t, _bucket, _sc, \
|
||||
_seq, _retry)
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_bucket_rcu
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKET_RCU() \
|
||||
unsigned int _seq; \
|
||||
seqcount_t *_sc; \
|
||||
int _retry
|
||||
/**
|
||||
* ip_vs_rht_walk_bucket_rcu() - Walk bucket under RCU read lock
|
||||
* @t: current table, struct ip_vs_rht *var
|
||||
* @bucket: index of current bucket or hash key
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Can be used while others add/delete/move entries
|
||||
* Not suitable if duplicates are not desired
|
||||
* Possible cases for reader that uses cond_resched_rcu() in the loop:
|
||||
* - new table can not be installed, no need to repeat
|
||||
* - new table can be installed => check and repeat if new table is
|
||||
* installed, needed for !PREEMPT_RCU
|
||||
*/
|
||||
#define ip_vs_rht_walk_bucket_rcu(t, bucket, head) \
|
||||
if (({ head = (t)->buckets + ((bucket) & (t)->mask); 0; })) \
|
||||
{} \
|
||||
else \
|
||||
ip_vs_rht_for_bucket_retry(t, (bucket), _sc, _seq, _retry)
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_buckets_safe_rcu
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU() \
|
||||
struct ip_vs_rht *_t, *_p; \
|
||||
u32 _bucket
|
||||
/**
|
||||
* ip_vs_rht_walk_buckets_safe_rcu() - Walk all buckets under RCU read lock
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Can be used while others add/delete entries but moving is disabled
|
||||
* Using cond_resched_rcu() should be safe if tables do not change
|
||||
*/
|
||||
#define ip_vs_rht_walk_buckets_safe_rcu(table, head) \
|
||||
ip_vs_rht_for_each_table_rcu(table, _t, _p) \
|
||||
ip_vs_rht_for_each_bucket(_t, _bucket, head)
|
||||
|
||||
/**
|
||||
* DECLARE_IP_VS_RHT_WALK_BUCKETS() - Declare variables
|
||||
*
|
||||
* Variables for ip_vs_rht_walk_buckets
|
||||
*/
|
||||
#define DECLARE_IP_VS_RHT_WALK_BUCKETS() \
|
||||
struct ip_vs_rht *_t, *_p; \
|
||||
u32 _bucket
|
||||
|
||||
/**
|
||||
* ip_vs_rht_walk_buckets() - Walk all buckets
|
||||
* @table: struct ip_vs_rht __rcu *table
|
||||
* @head: bucket address, used as cursor, struct hlist_bl_head *var
|
||||
*
|
||||
* Use if others can not add/delete/move entries
|
||||
*/
|
||||
#define ip_vs_rht_walk_buckets(table, head) \
|
||||
ip_vs_rht_for_each_table(table, _t, _p) \
|
||||
ip_vs_rht_for_each_bucket(_t, _bucket, head)
|
||||
|
||||
/* Entries can be in one of two tables, so we flip bit when new table is
|
||||
* created and store it as highest bit in hash keys
|
||||
*/
|
||||
#define IP_VS_RHT_TABLE_ID_MASK BIT(31)
|
||||
|
||||
/* Check if hash key is from this table */
|
||||
static inline bool ip_vs_rht_same_table(struct ip_vs_rht *t, u32 hash_key)
|
||||
{
|
||||
return !((t->table_id ^ hash_key) & IP_VS_RHT_TABLE_ID_MASK);
|
||||
}
|
||||
|
||||
/* Build per-table hash key from hash value */
|
||||
static inline u32 ip_vs_rht_build_hash_key(struct ip_vs_rht *t, u32 hash)
|
||||
{
|
||||
return t->table_id | (hash & ~IP_VS_RHT_TABLE_ID_MASK);
|
||||
}
|
||||
|
||||
void ip_vs_rht_free(struct ip_vs_rht *t);
|
||||
void ip_vs_rht_rcu_free(struct rcu_head *head);
|
||||
struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks);
|
||||
int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
|
||||
int lfactor, int min_bits, int max_bits);
|
||||
void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
|
||||
int min_bits, int max_bits);
|
||||
u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
|
||||
const union nf_inet_addr *addr, u32 v1, u32 v2);
|
||||
|
||||
struct dst_entry;
|
||||
struct iphdr;
|
||||
struct ip_vs_conn;
|
||||
|
|
|
|||
|
|
@ -76,11 +76,6 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
|
|||
#define IP_VS_ADDRSTRLEN (8+1)
|
||||
#endif
|
||||
|
||||
struct ip_vs_aligned_lock
|
||||
{
|
||||
spinlock_t l;
|
||||
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
|
||||
|
||||
/* lock array for conn table */
|
||||
static struct ip_vs_aligned_lock
|
||||
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
|
||||
|
|
|
|||
|
|
@ -117,6 +117,185 @@ void ip_vs_init_hash_table(struct list_head *table, int rows)
|
|||
INIT_LIST_HEAD(&table[rows]);
|
||||
}
|
||||
|
||||
/* IPVS Resizable Hash Tables:
|
||||
* - list_bl buckets with bit lock
|
||||
*
|
||||
* Goals:
|
||||
* - RCU lookup for entry can run in parallel with add/del/move operations
|
||||
* - hash keys can be on non-contiguous memory
|
||||
* - support entries with duplicate keys
|
||||
* - unlink entries without lookup, use the saved table and bucket id
|
||||
* - resizing can trigger on load change or depending on key refresh period
|
||||
* - customizable load factor to balance between speed and memory usage
|
||||
* - add/del/move operations should be allowed for any context
|
||||
*
|
||||
* Resizing:
|
||||
* - new table is attached to the current table and all entries are moved
|
||||
* with new hash key. Finally, the new table is installed as current one and
|
||||
* the old table is released after RCU grace period.
|
||||
* - RCU read-side critical sections will walk two tables while resizing is
|
||||
* in progress
|
||||
* - new entries are added to the new table
|
||||
* - entries will be deleted from the old or from the new table, the table_id
|
||||
* can be saved into entry as part of the hash key to know where the entry is
|
||||
* hashed
|
||||
* - move operations may delay readers or to cause retry for the modified
|
||||
* bucket. As result, searched entry will be found but walkers that operate
|
||||
* on multiple entries may see same entry twice if bucket walking is retried.
|
||||
* - for fast path the number of entries (load) can be compared to u_thresh
|
||||
* and l_thresh to decide when to trigger table growing/shrinking. They
|
||||
* are calculated based on load factor (shift count), negative value allows
|
||||
* load to be below 100% to reduce collisions by maintaining larger table
|
||||
* while positive value tolerates collisions by using smaller table and load
|
||||
* above 100%: u_thresh(load) = size * (2 ^ lfactor)
|
||||
*
|
||||
* Locking:
|
||||
* - lock: protect seqc if other context except resizer can move entries
|
||||
* - seqc: seqcount_t, delay/retry readers while entries are moved to
|
||||
* new table on resizing
|
||||
* - bit lock: serialize bucket modifications
|
||||
* - writers may use other locking mechanisms to serialize operations for
|
||||
* resizing, moving and installing new tables
|
||||
*/
|
||||
|
||||
void ip_vs_rht_free(struct ip_vs_rht *t)
|
||||
{
|
||||
kvfree(t->buckets);
|
||||
kvfree(t->seqc);
|
||||
kvfree(t->lock);
|
||||
kfree(t);
|
||||
}
|
||||
|
||||
void ip_vs_rht_rcu_free(struct rcu_head *head)
|
||||
{
|
||||
struct ip_vs_rht *t;
|
||||
|
||||
t = container_of(head, struct ip_vs_rht, rcu_head);
|
||||
ip_vs_rht_free(t);
|
||||
}
|
||||
|
||||
struct ip_vs_rht *ip_vs_rht_alloc(int buckets, int scounts, int locks)
|
||||
{
|
||||
struct ip_vs_rht *t = kzalloc(sizeof(*t), GFP_KERNEL);
|
||||
int i;
|
||||
|
||||
if (!t)
|
||||
return NULL;
|
||||
if (scounts) {
|
||||
int ml = roundup_pow_of_two(nr_cpu_ids);
|
||||
|
||||
scounts = min(scounts, buckets);
|
||||
scounts = min(scounts, ml);
|
||||
t->seqc = kvmalloc_array(scounts, sizeof(*t->seqc), GFP_KERNEL);
|
||||
if (!t->seqc)
|
||||
goto err;
|
||||
for (i = 0; i < scounts; i++)
|
||||
seqcount_init(&t->seqc[i]);
|
||||
|
||||
if (locks) {
|
||||
locks = min(locks, scounts);
|
||||
t->lock = kvmalloc_array(locks, sizeof(*t->lock),
|
||||
GFP_KERNEL);
|
||||
if (!t->lock)
|
||||
goto err;
|
||||
for (i = 0; i < locks; i++)
|
||||
spin_lock_init(&t->lock[i].l);
|
||||
}
|
||||
}
|
||||
|
||||
t->buckets = kvmalloc_array(buckets, sizeof(*t->buckets), GFP_KERNEL);
|
||||
if (!t->buckets)
|
||||
goto err;
|
||||
for (i = 0; i < buckets; i++)
|
||||
INIT_HLIST_BL_HEAD(&t->buckets[i]);
|
||||
t->mask = buckets - 1;
|
||||
t->size = buckets;
|
||||
t->seqc_mask = scounts - 1;
|
||||
t->lock_mask = locks - 1;
|
||||
t->u_thresh = buckets;
|
||||
t->l_thresh = buckets >> 4;
|
||||
t->bits = order_base_2(buckets);
|
||||
/* new_tbl points to self if no new table is filled */
|
||||
RCU_INIT_POINTER(t->new_tbl, t);
|
||||
get_random_bytes(&t->hash_key, sizeof(t->hash_key));
|
||||
return t;
|
||||
|
||||
err:
|
||||
ip_vs_rht_free(t);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Get the desired table size for n entries based on current table size and
|
||||
* by using the formula size = n / (2^lfactor)
|
||||
* lfactor: shift value for the load factor:
|
||||
* - >0: u_thresh=size << lfactor, for load factor above 100%
|
||||
* - <0: u_thresh=size >> -lfactor, for load factor below 100%
|
||||
* - 0: for load factor of 100%
|
||||
*/
|
||||
int ip_vs_rht_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t, int n,
|
||||
int lfactor, int min_bits, int max_bits)
|
||||
{
|
||||
if (!t)
|
||||
return 1 << min_bits;
|
||||
n = roundup_pow_of_two(n);
|
||||
if (lfactor < 0) {
|
||||
int factor = min(-lfactor, max_bits);
|
||||
|
||||
n = min(n, 1 << (max_bits - factor));
|
||||
n <<= factor;
|
||||
} else {
|
||||
n = min(n >> lfactor, 1 << max_bits);
|
||||
}
|
||||
if (lfactor != t->lfactor)
|
||||
return clamp(n, 1 << min_bits, 1 << max_bits);
|
||||
if (n > t->size)
|
||||
return n;
|
||||
if (n > t->size >> 4)
|
||||
return t->size;
|
||||
/* Shrink but keep it n * 2 to prevent frequent resizing */
|
||||
return clamp(n << 1, 1 << min_bits, 1 << max_bits);
|
||||
}
|
||||
|
||||
/* Set thresholds based on table size and load factor:
|
||||
* u_thresh = size * (2^lfactor)
|
||||
* l_thresh = u_thresh / 16
|
||||
* u_thresh/l_thresh can be used to check if load triggers a table grow/shrink
|
||||
*/
|
||||
void ip_vs_rht_set_thresholds(struct ip_vs_rht *t, int size, int lfactor,
|
||||
int min_bits, int max_bits)
|
||||
{
|
||||
if (size >= 1 << max_bits)
|
||||
t->u_thresh = INT_MAX; /* stop growing */
|
||||
else if (lfactor <= 0)
|
||||
t->u_thresh = size >> min(-lfactor, max_bits);
|
||||
else
|
||||
t->u_thresh = min(size, 1 << (30 - lfactor)) << lfactor;
|
||||
|
||||
/* l_thresh: shrink when load is 16 times lower, can be 0 */
|
||||
if (size >= 1 << max_bits)
|
||||
t->l_thresh = (1 << max_bits) >> 4;
|
||||
else if (size > 1 << min_bits)
|
||||
t->l_thresh = t->u_thresh >> 4;
|
||||
else
|
||||
t->l_thresh = 0; /* stop shrinking */
|
||||
}
|
||||
|
||||
/* Return hash value for local info (fast, insecure) */
|
||||
u32 ip_vs_rht_hash_linfo(struct ip_vs_rht *t, int af,
|
||||
const union nf_inet_addr *addr, u32 v1, u32 v2)
|
||||
{
|
||||
u32 v3;
|
||||
|
||||
#ifdef CONFIG_IP_VS_IPV6
|
||||
if (af == AF_INET6)
|
||||
v3 = ipv6_addr_hash(&addr->in6);
|
||||
else
|
||||
#endif
|
||||
v3 = addr->all[0];
|
||||
|
||||
return jhash_3words(v1, v2, v3, (u32)t->hash_key.key[0]);
|
||||
}
|
||||
|
||||
static inline void
|
||||
ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user