From 8274a97aa4c694ad0d7b31b283a89dcca140e62b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:55:29 -0800 Subject: [PATCH 01/17] fib_trie: Update usage stats to be percpu instead of global variables The trie usage stats were currently being shared by all threads that were calling fib_table_lookup. As a result when multiple threads were performing lookups simultaneously the trie would begin to cache bounce between those threads. In order to prevent this I have updated the usage stats to use a set of percpu variables. By doing this we should be able to avoid the cache bouncing and still make use of these stats. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_trie.c | 68 +++++++++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 23104a3f2924..66890209208f 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -67,7 +67,7 @@ static int __net_init fib4_rules_init(struct net *net) return 0; fail: - kfree(local_table); + fib_free_table(local_table); return -ENOMEM; } #else diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 18bcaf2ff2fd..d3dbb4821ae8 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -153,7 +153,7 @@ struct trie_stat { struct trie { struct rt_trie_node __rcu *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS - struct trie_use_stats stats; + struct trie_use_stats __percpu *stats; #endif }; @@ -631,7 +631,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) if (IS_ERR(tn)) { tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.resize_node_skipped++; + this_cpu_inc(t->stats->resize_node_skipped); #endif break; } @@ -658,7 +658,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) if (IS_ERR(tn)) { tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.resize_node_skipped++; + this_cpu_inc(t->stats->resize_node_skipped); #endif break; } @@ -1357,7 +1357,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, err = fib_props[fa->fa_type].error; if (err) { #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_passed++; + this_cpu_inc(t->stats->semantic_match_passed); #endif return err; } @@ -1372,7 +1372,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, continue; #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_passed++; + this_cpu_inc(t->stats->semantic_match_passed); #endif res->prefixlen = li->plen; res->nh_sel = nhsel; @@ -1388,7 +1388,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, } #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.semantic_match_miss++; + this_cpu_inc(t->stats->semantic_match_miss); #endif } @@ -1399,6 +1399,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; +#ifdef CONFIG_IP_FIB_TRIE_STATS + struct trie_use_stats __percpu *stats = t->stats; +#endif int ret; struct rt_trie_node *n; struct tnode *pn; @@ -1417,7 +1420,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, goto failed; #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.gets++; + this_cpu_inc(stats->gets); #endif /* Just a leaf? */ @@ -1441,7 +1444,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, if (n == NULL) { #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.null_node_hit++; + this_cpu_inc(stats->null_node_hit); #endif goto backtrace; } @@ -1576,7 +1579,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, chopped_off = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS - t->stats.backtrack++; + this_cpu_inc(stats->backtrack); #endif goto backtrace; } @@ -1830,6 +1833,11 @@ int fib_table_flush(struct fib_table *tb) void fib_free_table(struct fib_table *tb) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + struct trie *t = (struct trie *)tb->tb_data; + + free_percpu(t->stats); +#endif /* CONFIG_IP_FIB_TRIE_STATS */ kfree(tb); } @@ -1973,7 +1981,14 @@ struct fib_table *fib_trie_table(u32 id) tb->tb_num_default = 0; t = (struct trie *) tb->tb_data; - memset(t, 0, sizeof(*t)); + RCU_INIT_POINTER(t->trie, NULL); +#ifdef CONFIG_IP_FIB_TRIE_STATS + t->stats = alloc_percpu(struct trie_use_stats); + if (!t->stats) { + kfree(tb); + tb = NULL; + } +#endif return tb; } @@ -2139,18 +2154,31 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) #ifdef CONFIG_IP_FIB_TRIE_STATS static void trie_show_usage(struct seq_file *seq, - const struct trie_use_stats *stats) + const struct trie_use_stats __percpu *stats) { + struct trie_use_stats s = { 0 }; + int cpu; + + /* loop through all of the CPUs and gather up the stats */ + for_each_possible_cpu(cpu) { + const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu); + + s.gets += pcpu->gets; + s.backtrack += pcpu->backtrack; + s.semantic_match_passed += pcpu->semantic_match_passed; + s.semantic_match_miss += pcpu->semantic_match_miss; + s.null_node_hit += pcpu->null_node_hit; + s.resize_node_skipped += pcpu->resize_node_skipped; + } + seq_printf(seq, "\nCounters:\n---------\n"); - seq_printf(seq, "gets = %u\n", stats->gets); - seq_printf(seq, "backtracks = %u\n", stats->backtrack); + seq_printf(seq, "gets = %u\n", s.gets); + seq_printf(seq, "backtracks = %u\n", s.backtrack); seq_printf(seq, "semantic match passed = %u\n", - stats->semantic_match_passed); - seq_printf(seq, "semantic match miss = %u\n", - stats->semantic_match_miss); - seq_printf(seq, "null node hit= %u\n", stats->null_node_hit); - seq_printf(seq, "skipped node resize = %u\n\n", - stats->resize_node_skipped); + s.semantic_match_passed); + seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss); + seq_printf(seq, "null node hit= %u\n", s.null_node_hit); + seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped); } #endif /* CONFIG_IP_FIB_TRIE_STATS */ @@ -2191,7 +2219,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) trie_collect_stats(t, &stat); trie_show_stats(seq, &stat); #ifdef CONFIG_IP_FIB_TRIE_STATS - trie_show_usage(seq, &t->stats); + trie_show_usage(seq, t->stats); #endif } } From 64c9b6fb26ebcc82e36438d4084f2258f29dbadf Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:55:35 -0800 Subject: [PATCH 02/17] fib_trie: Make leaf and tnode more uniform This change makes some fundamental changes to the way leaves and tnodes are constructed. The big differences are: 1. Leaves now populate pos and bits indicating their full key size. 2. Trie nodes now mask out their lower bits to be consistent with the leaf 3. Both structures have been reordered so that rt_trie_node now consisists of a much larger region including the pos, bits, and rcu portions of the tnode structure. On 32b systems this will result in the leaf being 4B larger as the pos and bits values were added to a hole created by the key as it was only 4B in length. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 198 +++++++++++++++++++------------------------- 1 file changed, 85 insertions(+), 113 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d3dbb4821ae8..2b7220728b24 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -87,24 +87,38 @@ typedef unsigned int t_key; -#define T_TNODE 0 -#define T_LEAF 1 -#define NODE_TYPE_MASK 0x1UL -#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) +#define IS_TNODE(n) ((n)->bits) +#define IS_LEAF(n) (!(n)->bits) -#define IS_TNODE(n) (!(n->parent & T_LEAF)) -#define IS_LEAF(n) (n->parent & T_LEAF) +struct tnode { + t_key key; + unsigned char bits; /* 2log(KEYLENGTH) bits needed */ + unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + struct tnode __rcu *parent; + union { + struct rcu_head rcu; + struct tnode *tnode_free; + }; + unsigned int full_children; /* KEYLENGTH bits needed */ + unsigned int empty_children; /* KEYLENGTH bits needed */ + struct rt_trie_node __rcu *child[0]; +}; struct rt_trie_node { - unsigned long parent; t_key key; + unsigned char bits; + unsigned char pos; + struct tnode __rcu *parent; + struct rcu_head rcu; }; struct leaf { - unsigned long parent; t_key key; - struct hlist_head list; + unsigned char bits; + unsigned char pos; + struct tnode __rcu *parent; struct rcu_head rcu; + struct hlist_head list; }; struct leaf_info { @@ -115,20 +129,6 @@ struct leaf_info { struct rcu_head rcu; }; -struct tnode { - unsigned long parent; - t_key key; - unsigned char pos; /* 2log(KEYLENGTH) bits needed */ - unsigned char bits; /* 2log(KEYLENGTH) bits needed */ - unsigned int full_children; /* KEYLENGTH bits needed */ - unsigned int empty_children; /* KEYLENGTH bits needed */ - union { - struct rcu_head rcu; - struct tnode *tnode_free; - }; - struct rt_trie_node __rcu *child[0]; -}; - #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats { unsigned int gets; @@ -176,38 +176,27 @@ static const int sync_pages = 128; static struct kmem_cache *fn_alias_kmem __read_mostly; static struct kmem_cache *trie_leaf_kmem __read_mostly; -/* - * caller must hold RTNL - */ -static inline struct tnode *node_parent(const struct rt_trie_node *node) -{ - unsigned long parent; +/* caller must hold RTNL */ +#define node_parent(n) rtnl_dereference((n)->parent) - parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held()); +/* caller must hold RCU read lock or RTNL */ +#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent) - return (struct tnode *)(parent & ~NODE_TYPE_MASK); -} - -/* - * caller must hold RCU read lock or RTNL - */ -static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node) -{ - unsigned long parent; - - parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() || - lockdep_rtnl_is_held()); - - return (struct tnode *)(parent & ~NODE_TYPE_MASK); -} - -/* Same as rcu_assign_pointer - * but that macro() assumes that value is a pointer. - */ +/* wrapper for rcu_assign_pointer */ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) { - smp_wmb(); - node->parent = (unsigned long)ptr | NODE_TYPE(node); + if (node) + rcu_assign_pointer(node->parent, ptr); +} + +#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p) + +/* This provides us with the number of children in this node, in the case of a + * leaf this will return 0 meaning none of the children are accessible. + */ +static inline int tnode_child_length(const struct tnode *tn) +{ + return (1ul << tn->bits) & ~(1ul); } /* @@ -215,7 +204,7 @@ static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) */ static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) { - BUG_ON(i >= 1U << tn->bits); + BUG_ON(i >= tnode_child_length(tn)); return rtnl_dereference(tn->child[i]); } @@ -225,16 +214,11 @@ static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsig */ static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) { - BUG_ON(i >= 1U << tn->bits); + BUG_ON(i >= tnode_child_length(tn)); return rcu_dereference_rtnl(tn->child[i]); } -static inline int tnode_child_length(const struct tnode *tn) -{ - return 1 << tn->bits; -} - static inline t_key mask_pfx(t_key k, unsigned int l) { return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); @@ -336,11 +320,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b) */ -static inline void check_tnode(const struct tnode *tn) -{ - WARN_ON(tn && tn->pos+tn->bits > 32); -} - static const int halve_threshold = 25; static const int inflate_threshold = 50; static const int halve_threshold_root = 15; @@ -426,11 +405,20 @@ static void tnode_free_flush(void) } } -static struct leaf *leaf_new(void) +static struct leaf *leaf_new(t_key key) { struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (l) { - l->parent = T_LEAF; + l->parent = NULL; + /* set key and pos to reflect full key value + * any trailing zeros in the key should be ignored + * as the nodes are searched + */ + l->key = key; + l->pos = KEYLENGTH; + /* set bits to 0 indicating we are not a tnode */ + l->bits = 0; + INIT_HLIST_HEAD(&l->list); } return l; @@ -451,12 +439,16 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) { size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); struct tnode *tn = tnode_alloc(sz); + unsigned int shift = pos + bits; + + /* verify bits and pos their msb bits clear and values are valid */ + BUG_ON(!bits || (shift > KEYLENGTH)); if (tn) { - tn->parent = T_TNODE; + tn->parent = NULL; tn->pos = pos; tn->bits = bits; - tn->key = key; + tn->key = mask_pfx(key, pos); tn->full_children = 0; tn->empty_children = 1<pos == tn->pos + tn->bits; + return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits)); } static inline void put_child(struct tnode *tn, int i, @@ -514,8 +503,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node * else if (!wasfull && isfull) tn->full_children++; - if (n) - node_set_parent(n, tn); + node_set_parent(n, tn); rcu_assign_pointer(tn->child[i], n); } @@ -523,7 +511,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node * #define MAX_WORK 10 static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) { - int i; + struct rt_trie_node *n = NULL; struct tnode *old_tn; int inflate_threshold_use; int halve_threshold_use; @@ -536,12 +524,11 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) tn, inflate_threshold, halve_threshold); /* No children */ - if (tn->empty_children == tnode_child_length(tn)) { - tnode_free_safe(tn); - return NULL; - } + if (tn->empty_children > (tnode_child_length(tn) - 1)) + goto no_children; + /* One child */ - if (tn->empty_children == tnode_child_length(tn) - 1) + if (tn->empty_children == (tnode_child_length(tn) - 1)) goto one_child; /* * Double as long as the resulting node has a number of @@ -607,11 +594,9 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) * */ - check_tnode(tn); - /* Keep root node larger */ - if (!node_parent((struct rt_trie_node *)tn)) { + if (!node_parent(tn)) { inflate_threshold_use = inflate_threshold_root; halve_threshold_use = halve_threshold_root; } else { @@ -637,8 +622,6 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) } } - check_tnode(tn); - /* Return if at least one inflate is run */ if (max_work != MAX_WORK) return (struct rt_trie_node *) tn; @@ -666,21 +649,16 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) /* Only one child remains */ - if (tn->empty_children == tnode_child_length(tn) - 1) { + if (tn->empty_children == (tnode_child_length(tn) - 1)) { + unsigned long i; one_child: - for (i = 0; i < tnode_child_length(tn); i++) { - struct rt_trie_node *n; - - n = rtnl_dereference(tn->child[i]); - if (!n) - continue; - - /* compress one level */ - - node_set_parent(n, NULL); - tnode_free_safe(tn); - return n; - } + for (i = tnode_child_length(tn); !n && i;) + n = tnode_get_child(tn, --i); +no_children: + /* compress one level */ + node_set_parent(n, NULL); + tnode_free_safe(tn); + return n; } return (struct rt_trie_node *) tn; } @@ -760,8 +738,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) /* A leaf or an internal node with skipped bits */ - if (IS_LEAF(node) || ((struct tnode *) node)->pos > - tn->pos + tn->bits - 1) { + if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) { put_child(tn, tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1), node); @@ -958,11 +935,9 @@ fib_find_node(struct trie *t, u32 key) pos = 0; n = rcu_dereference_rtnl(t->trie); - while (n != NULL && NODE_TYPE(n) == T_TNODE) { + while (n && IS_TNODE(n)) { tn = (struct tnode *) n; - check_tnode(tn); - if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { pos = tn->pos + tn->bits; n = tnode_get_child_rcu(tn, @@ -988,7 +963,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) key = tn->key; - while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) { + while (tn != NULL && (tp = node_parent(tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *)resize(t, tn); @@ -996,7 +971,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) tnode_put_child_reorg(tp, cindex, (struct rt_trie_node *)tn, wasfull); - tp = node_parent((struct rt_trie_node *) tn); + tp = node_parent(tn); if (!tp) rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); @@ -1048,11 +1023,9 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) * If it doesn't, we need to replace it with a T_TNODE. */ - while (n != NULL && NODE_TYPE(n) == T_TNODE) { + while (n && IS_TNODE(n)) { tn = (struct tnode *) n; - check_tnode(tn); - if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; pos = tn->pos + tn->bits; @@ -1087,12 +1060,11 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) insert_leaf_info(&l->list, li); goto done; } - l = leaf_new(); + l = leaf_new(key); if (!l) return NULL; - l->key = key; li = leaf_info_new(plen); if (!li) { @@ -1569,7 +1541,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); } else { - struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn); + struct tnode *parent = node_parent_rcu(pn); if (!parent) goto failed; @@ -1597,7 +1569,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup); */ static void trie_leaf_remove(struct trie *t, struct leaf *l) { - struct tnode *tp = node_parent((struct rt_trie_node *) l); + struct tnode *tp = node_parent(l); pr_debug("entering trie_leaf_remove(%p)\n", l); @@ -2374,7 +2346,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; - __be32 prf = htonl(mask_pfx(tn->key, tn->pos)); + __be32 prf = htonl(tn->key); seq_indent(seq, iter->depth-1); seq_printf(seq, " +-- %pI4/%d %d %d %d\n", From 37fd30f2da573c2625ed29561e5b8eb7b0258860 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:55:41 -0800 Subject: [PATCH 03/17] fib_trie: Merge tnode_free and leaf_free into node_free Both the leaf and the tnode had an rcu_head in them, but they had them in slightly different places. Since we now have them in the same spot and know that any node with bits == 0 is a leaf and the rest are either vmalloc or kmalloc tnodes depending on the value of bits it makes it easy to combine the functions and reduce overhead. In addition I have taken advantage of the rcu_head pointer to go ahead and put together a simple linked list instead of using the tnode pointer as this way we can merge either type of structure for freeing. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 90 ++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 50 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2b7220728b24..d1a9907901cb 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -95,15 +95,17 @@ struct tnode { unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char pos; /* 2log(KEYLENGTH) bits needed */ struct tnode __rcu *parent; - union { - struct rcu_head rcu; - struct tnode *tnode_free; - }; + struct rcu_head rcu; + /* everything above this comment must be the same as rt_trie_node */ unsigned int full_children; /* KEYLENGTH bits needed */ unsigned int empty_children; /* KEYLENGTH bits needed */ struct rt_trie_node __rcu *child[0]; }; +/* This struct represents the shared bits between tnode and leaf. If any + * ordering is changed here is must also be updated in tnode and leaf as + * well. + */ struct rt_trie_node { t_key key; unsigned char bits; @@ -118,6 +120,7 @@ struct leaf { unsigned char pos; struct tnode __rcu *parent; struct rcu_head rcu; + /* everything above this comment must be the same as rt_trie_node */ struct hlist_head list; }; @@ -163,7 +166,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ -static struct tnode *tnode_free_head; +static struct callback_head *tnode_free_head; static size_t tnode_free_size; /* @@ -336,16 +339,22 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) call_rcu(&fa->rcu, __alias_free_mem); } -static void __leaf_free_rcu(struct rcu_head *head) +#define TNODE_KMALLOC_MAX \ + ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *)) + +static void __node_free_rcu(struct rcu_head *head) { - struct leaf *l = container_of(head, struct leaf, rcu); - kmem_cache_free(trie_leaf_kmem, l); + struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu); + + if (IS_LEAF(n)) + kmem_cache_free(trie_leaf_kmem, n); + else if (n->bits <= TNODE_KMALLOC_MAX) + kfree(n); + else + vfree(n); } -static inline void free_leaf(struct leaf *l) -{ - call_rcu(&l->rcu, __leaf_free_rcu); -} +#define node_free(n) call_rcu(&n->rcu, __node_free_rcu) static inline void free_leaf_info(struct leaf_info *leaf) { @@ -360,43 +369,24 @@ static struct tnode *tnode_alloc(size_t size) return vzalloc(size); } -static void __tnode_free_rcu(struct rcu_head *head) -{ - struct tnode *tn = container_of(head, struct tnode, rcu); - size_t size = sizeof(struct tnode) + - (sizeof(struct rt_trie_node *) << tn->bits); - - if (size <= PAGE_SIZE) - kfree(tn); - else - vfree(tn); -} - -static inline void tnode_free(struct tnode *tn) -{ - if (IS_LEAF(tn)) - free_leaf((struct leaf *) tn); - else - call_rcu(&tn->rcu, __tnode_free_rcu); -} - static void tnode_free_safe(struct tnode *tn) { BUG_ON(IS_LEAF(tn)); - tn->tnode_free = tnode_free_head; - tnode_free_head = tn; - tnode_free_size += sizeof(struct tnode) + - (sizeof(struct rt_trie_node *) << tn->bits); + tn->rcu.next = tnode_free_head; + tnode_free_head = &tn->rcu; } static void tnode_free_flush(void) { - struct tnode *tn; + struct callback_head *head; - while ((tn = tnode_free_head)) { - tnode_free_head = tn->tnode_free; - tn->tnode_free = NULL; - tnode_free(tn); + while ((head = tnode_free_head)) { + struct tnode *tn = container_of(head, struct tnode, rcu); + + tnode_free_head = head->next; + tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); + + node_free(tn); } if (tnode_free_size >= PAGE_SIZE * sync_pages) { @@ -437,7 +427,7 @@ static struct leaf_info *leaf_info_new(int plen) static struct tnode *tnode_new(t_key key, int pos, int bits) { - size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits); + size_t sz = offsetof(struct tnode, child[1 << bits]); struct tnode *tn = tnode_alloc(sz); unsigned int shift = pos + bits; @@ -666,15 +656,15 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) static void tnode_clean_free(struct tnode *tn) { + struct rt_trie_node *tofree; int i; - struct tnode *tofree; for (i = 0; i < tnode_child_length(tn); i++) { - tofree = (struct tnode *)rtnl_dereference(tn->child[i]); + tofree = rtnl_dereference(tn->child[i]); if (tofree) - tnode_free(tofree); + node_free(tofree); } - tnode_free(tn); + node_free(tn); } static struct tnode *inflate(struct trie *t, struct tnode *tn) @@ -717,7 +707,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) inode->bits - 1); if (!right) { - tnode_free(left); + node_free(left); goto nomem; } @@ -1068,7 +1058,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) li = leaf_info_new(plen); if (!li) { - free_leaf(l); + node_free(l); return NULL; } @@ -1100,7 +1090,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (!tn) { free_leaf_info(li); - free_leaf(l); + node_free(l); return NULL; } @@ -1580,7 +1570,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) } else RCU_INIT_POINTER(t->trie, NULL); - free_leaf(l); + node_free(l); } /* From adaf981685b7b17d773d17f4ee6a128a3a6b6dc8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:55:47 -0800 Subject: [PATCH 04/17] fib_trie: Merge leaf into tnode This change makes it so that leaf and tnode are the same struct. As a result there is no need for rt_trie_node anymore since everyting can be merged into tnode. On 32b systems this results in the leaf being 4 bytes larger, however I don't know if that is really an issue as this and an eariler patch that added bits & pos have increased the size from 20 to 28. If I am not mistaken slub/slab allocate on power of 2 sizes so 20 was likely being rounded up to 32 anyway. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 322 +++++++++++++++++++------------------------- 1 file changed, 140 insertions(+), 182 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d1a9907901cb..94c929f6c752 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -96,32 +96,16 @@ struct tnode { unsigned char pos; /* 2log(KEYLENGTH) bits needed */ struct tnode __rcu *parent; struct rcu_head rcu; - /* everything above this comment must be the same as rt_trie_node */ - unsigned int full_children; /* KEYLENGTH bits needed */ - unsigned int empty_children; /* KEYLENGTH bits needed */ - struct rt_trie_node __rcu *child[0]; -}; - -/* This struct represents the shared bits between tnode and leaf. If any - * ordering is changed here is must also be updated in tnode and leaf as - * well. - */ -struct rt_trie_node { - t_key key; - unsigned char bits; - unsigned char pos; - struct tnode __rcu *parent; - struct rcu_head rcu; -}; - -struct leaf { - t_key key; - unsigned char bits; - unsigned char pos; - struct tnode __rcu *parent; - struct rcu_head rcu; - /* everything above this comment must be the same as rt_trie_node */ - struct hlist_head list; + union { + /* The fields in this struct are valid if bits > 0 (TNODE) */ + struct { + unsigned int full_children; /* KEYLENGTH bits needed */ + unsigned int empty_children; /* KEYLENGTH bits needed */ + struct tnode __rcu *child[0]; + }; + /* This list pointer if valid if bits == 0 (LEAF) */ + struct hlist_head list; + }; }; struct leaf_info { @@ -154,15 +138,15 @@ struct trie_stat { }; struct trie { - struct rt_trie_node __rcu *trie; + struct tnode __rcu *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats; #endif }; -static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n, int wasfull); -static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); +static struct tnode *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ @@ -186,10 +170,10 @@ static struct kmem_cache *trie_leaf_kmem __read_mostly; #define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent) /* wrapper for rcu_assign_pointer */ -static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr) +static inline void node_set_parent(struct tnode *n, struct tnode *tp) { - if (node) - rcu_assign_pointer(node->parent, ptr); + if (n) + rcu_assign_pointer(n->parent, tp); } #define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p) @@ -205,7 +189,7 @@ static inline int tnode_child_length(const struct tnode *tn) /* * caller must hold RTNL */ -static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i) +static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i) { BUG_ON(i >= tnode_child_length(tn)); @@ -215,7 +199,7 @@ static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsig /* * caller must hold RCU read lock or RTNL */ -static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) +static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) { BUG_ON(i >= tnode_child_length(tn)); @@ -340,11 +324,11 @@ static inline void alias_free_mem_rcu(struct fib_alias *fa) } #define TNODE_KMALLOC_MAX \ - ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *)) + ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *)) static void __node_free_rcu(struct rcu_head *head) { - struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu); + struct tnode *n = container_of(head, struct tnode, rcu); if (IS_LEAF(n)) kmem_cache_free(trie_leaf_kmem, n); @@ -395,9 +379,9 @@ static void tnode_free_flush(void) } } -static struct leaf *leaf_new(t_key key) +static struct tnode *leaf_new(t_key key) { - struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); + struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); if (l) { l->parent = NULL; /* set key and pos to reflect full key value @@ -444,7 +428,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) } pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), - sizeof(struct rt_trie_node *) << bits); + sizeof(struct tnode *) << bits); return tn; } @@ -453,13 +437,13 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n) +static inline int tnode_full(const struct tnode *tn, const struct tnode *n) { return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits)); } static inline void put_child(struct tnode *tn, int i, - struct rt_trie_node *n) + struct tnode *n) { tnode_put_child_reorg(tn, i, n, -1); } @@ -469,10 +453,10 @@ static inline void put_child(struct tnode *tn, int i, * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, +static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n, int wasfull) { - struct rt_trie_node *chi = rtnl_dereference(tn->child[i]); + struct tnode *chi = rtnl_dereference(tn->child[i]); int isfull; BUG_ON(i >= 1<bits); @@ -499,10 +483,9 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node * } #define MAX_WORK 10 -static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) +static struct tnode *resize(struct trie *t, struct tnode *tn) { - struct rt_trie_node *n = NULL; - struct tnode *old_tn; + struct tnode *old_tn, *n = NULL; int inflate_threshold_use; int halve_threshold_use; int max_work; @@ -614,7 +597,7 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return (struct rt_trie_node *) tn; + return tn; /* * Halve as long as the number of empty children in this @@ -650,13 +633,13 @@ static struct rt_trie_node *resize(struct trie *t, struct tnode *tn) tnode_free_safe(tn); return n; } - return (struct rt_trie_node *) tn; + return tn; } static void tnode_clean_free(struct tnode *tn) { - struct rt_trie_node *tofree; + struct tnode *tofree; int i; for (i = 0; i < tnode_child_length(tn); i++) { @@ -667,10 +650,10 @@ static void tnode_clean_free(struct tnode *tn) node_free(tn); } -static struct tnode *inflate(struct trie *t, struct tnode *tn) +static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) { - struct tnode *oldtnode = tn; - int olen = tnode_child_length(tn); + int olen = tnode_child_length(oldtnode); + struct tnode *tn; int i; pr_debug("In inflate\n"); @@ -690,11 +673,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) for (i = 0; i < olen; i++) { struct tnode *inode; - inode = (struct tnode *) tnode_get_child(oldtnode, i); - if (inode && - IS_TNODE(inode) && - inode->pos == oldtnode->pos + oldtnode->bits && - inode->bits > 1) { + inode = tnode_get_child(oldtnode, i); + if (tnode_full(oldtnode, inode) && inode->bits > 1) { struct tnode *left, *right; t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos; @@ -711,33 +691,29 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) goto nomem; } - put_child(tn, 2*i, (struct rt_trie_node *) left); - put_child(tn, 2*i+1, (struct rt_trie_node *) right); + put_child(tn, 2*i, left); + put_child(tn, 2*i+1, right); } } for (i = 0; i < olen; i++) { - struct tnode *inode; - struct rt_trie_node *node = tnode_get_child(oldtnode, i); + struct tnode *inode = tnode_get_child(oldtnode, i); struct tnode *left, *right; int size, j; /* An empty child */ - if (node == NULL) + if (inode == NULL) continue; /* A leaf or an internal node with skipped bits */ - - if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) { + if (!tnode_full(oldtnode, inode)) { put_child(tn, - tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1), - node); + tkey_extract_bits(inode->key, tn->pos, tn->bits), + inode); continue; } /* An internal node with two children */ - inode = (struct tnode *) node; - if (inode->bits == 1) { put_child(tn, 2*i, rtnl_dereference(inode->child[0])); put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); @@ -769,12 +745,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) * bit to zero. */ - left = (struct tnode *) tnode_get_child(tn, 2*i); + left = tnode_get_child(tn, 2*i); put_child(tn, 2*i, NULL); BUG_ON(!left); - right = (struct tnode *) tnode_get_child(tn, 2*i+1); + right = tnode_get_child(tn, 2*i+1); put_child(tn, 2*i+1, NULL); BUG_ON(!right); @@ -796,12 +772,11 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) return ERR_PTR(-ENOMEM); } -static struct tnode *halve(struct trie *t, struct tnode *tn) +static struct tnode *halve(struct trie *t, struct tnode *oldtnode) { - struct tnode *oldtnode = tn; - struct rt_trie_node *left, *right; + int olen = tnode_child_length(oldtnode); + struct tnode *tn, *left, *right; int i; - int olen = tnode_child_length(tn); pr_debug("In halve\n"); @@ -830,7 +805,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) if (!newn) goto nomem; - put_child(tn, i/2, (struct rt_trie_node *)newn); + put_child(tn, i/2, newn); } } @@ -855,7 +830,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) } /* Two nonempty children */ - newBinNode = (struct tnode *) tnode_get_child(tn, i/2); + newBinNode = tnode_get_child(tn, i/2); put_child(tn, i/2, NULL); put_child(newBinNode, 0, left); put_child(newBinNode, 1, right); @@ -871,7 +846,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) /* readside must use rcu_read_lock currently dump routines via get_fa_head and dump */ -static struct leaf_info *find_leaf_info(struct leaf *l, int plen) +static struct leaf_info *find_leaf_info(struct tnode *l, int plen) { struct hlist_head *head = &l->list; struct leaf_info *li; @@ -883,7 +858,7 @@ static struct leaf_info *find_leaf_info(struct leaf *l, int plen) return NULL; } -static inline struct list_head *get_fa_head(struct leaf *l, int plen) +static inline struct list_head *get_fa_head(struct tnode *l, int plen) { struct leaf_info *li = find_leaf_info(l, plen); @@ -915,32 +890,25 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) /* rcu_read_lock needs to be hold by caller from readside */ -static struct leaf * -fib_find_node(struct trie *t, u32 key) +static struct tnode *fib_find_node(struct trie *t, u32 key) { - int pos; - struct tnode *tn; - struct rt_trie_node *n; - - pos = 0; - n = rcu_dereference_rtnl(t->trie); + struct tnode *n = rcu_dereference_rtnl(t->trie); + int pos = 0; while (n && IS_TNODE(n)) { - tn = (struct tnode *) n; - - if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { - pos = tn->pos + tn->bits; - n = tnode_get_child_rcu(tn, + if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) { + pos = n->pos + n->bits; + n = tnode_get_child_rcu(n, tkey_extract_bits(key, - tn->pos, - tn->bits)); + n->pos, + n->bits)); } else break; } /* Case we have found a leaf. Compare prefixes */ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) - return (struct leaf *)n; + return n; return NULL; } @@ -956,14 +924,13 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) while (tn != NULL && (tp = node_parent(tn)) != NULL) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); - tn = (struct tnode *)resize(t, tn); + tn = resize(t, tn); - tnode_put_child_reorg(tp, cindex, - (struct rt_trie_node *)tn, wasfull); + tnode_put_child_reorg(tp, cindex, tn, wasfull); tp = node_parent(tn); if (!tp) - rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + rcu_assign_pointer(t->trie, tn); tnode_free_flush(); if (!tp) @@ -973,9 +940,9 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) /* Handle last (top) tnode */ if (IS_TNODE(tn)) - tn = (struct tnode *)resize(t, tn); + tn = resize(t, tn); - rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + rcu_assign_pointer(t->trie, tn); tnode_free_flush(); } @@ -985,8 +952,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { int pos, newpos; struct tnode *tp = NULL, *tn = NULL; - struct rt_trie_node *n; - struct leaf *l; + struct tnode *n; + struct tnode *l; int missbit; struct list_head *fa_head = NULL; struct leaf_info *li; @@ -1014,17 +981,15 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) */ while (n && IS_TNODE(n)) { - tn = (struct tnode *) n; - - if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { - tp = tn; - pos = tn->pos + tn->bits; - n = tnode_get_child(tn, + if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) { + tp = n; + pos = n->pos + n->bits; + n = tnode_get_child(n, tkey_extract_bits(key, - tn->pos, - tn->bits)); + n->pos, + n->bits)); - BUG_ON(n && node_parent(n) != tn); + BUG_ON(n && node_parent(n) != tp); } else break; } @@ -1040,14 +1005,13 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) /* Case 1: n is a leaf. Compare prefixes */ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - l = (struct leaf *) n; li = leaf_info_new(plen); if (!li) return NULL; fa_head = &li->falh; - insert_leaf_info(&l->list, li); + insert_leaf_info(&n->list, li); goto done; } l = leaf_new(key); @@ -1068,10 +1032,10 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) if (t->trie && n == NULL) { /* Case 2: n is NULL, and will just insert a new leaf */ - node_set_parent((struct rt_trie_node *)l, tp); + node_set_parent(l, tp); cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(tp, cindex, (struct rt_trie_node *)l); + put_child(tp, cindex, l); } else { /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ /* @@ -1094,17 +1058,17 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) return NULL; } - node_set_parent((struct rt_trie_node *)tn, tp); + node_set_parent(tn, tp); missbit = tkey_extract_bits(key, newpos, 1); - put_child(tn, missbit, (struct rt_trie_node *)l); + put_child(tn, missbit, l); put_child(tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(tp, cindex, (struct rt_trie_node *)tn); + put_child(tp, cindex, tn); } else { - rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); + rcu_assign_pointer(t->trie, tn); } tp = tn; @@ -1134,7 +1098,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) u8 tos = cfg->fc_tos; u32 key, mask; int err; - struct leaf *l; + struct tnode *l; if (plen > 32) return -EINVAL; @@ -1292,7 +1256,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } /* should be called with rcu_read_lock */ -static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l, +static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l, t_key key, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { @@ -1365,7 +1329,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct trie_use_stats __percpu *stats = t->stats; #endif int ret; - struct rt_trie_node *n; + struct tnode *n; struct tnode *pn; unsigned int pos, bits; t_key key = ntohl(flp->daddr); @@ -1387,11 +1351,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* Just a leaf? */ if (IS_LEAF(n)) { - ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, n, key, flp, res, fib_flags); goto found; } - pn = (struct tnode *) n; + pn = n; chopped_off = 0; while (pn) { @@ -1412,13 +1376,13 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, } if (IS_LEAF(n)) { - ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags); + ret = check_leaf(tb, t, n, key, flp, res, fib_flags); if (ret > 0) goto backtrace; goto found; } - cn = (struct tnode *)n; + cn = n; /* * It's a tnode, and we can do some extra checks here if we @@ -1506,7 +1470,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, current_prefix_length = mp; } - pn = (struct tnode *)n; /* Descend */ + pn = n; /* Descend */ chopped_off = 0; continue; @@ -1557,7 +1521,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup); /* * Remove the leaf and return parent. */ -static void trie_leaf_remove(struct trie *t, struct leaf *l) +static void trie_leaf_remove(struct trie *t, struct tnode *l) { struct tnode *tp = node_parent(l); @@ -1584,7 +1548,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) u8 tos = cfg->fc_tos; struct fib_alias *fa, *fa_to_delete; struct list_head *fa_head; - struct leaf *l; + struct tnode *l; struct leaf_info *li; if (plen > 32) @@ -1682,7 +1646,7 @@ static int trie_flush_list(struct list_head *head) return found; } -static int trie_flush_leaf(struct leaf *l) +static int trie_flush_leaf(struct tnode *l) { int found = 0; struct hlist_head *lih = &l->list; @@ -1704,7 +1668,7 @@ static int trie_flush_leaf(struct leaf *l) * Scan for the next right leaf starting at node p->child[idx] * Since we have back pointer, no recursion necessary. */ -static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) +static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) { do { t_key idx; @@ -1720,47 +1684,46 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) continue; if (IS_LEAF(c)) - return (struct leaf *) c; + return c; /* Rescan start scanning in new node */ - p = (struct tnode *) c; + p = c; idx = 0; } /* Node empty, walk back up to parent */ - c = (struct rt_trie_node *) p; + c = p; } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ } -static struct leaf *trie_firstleaf(struct trie *t) +static struct tnode *trie_firstleaf(struct trie *t) { - struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); + struct tnode *n = rcu_dereference_rtnl(t->trie); if (!n) return NULL; if (IS_LEAF(n)) /* trie is just a leaf */ - return (struct leaf *) n; + return n; return leaf_walk_rcu(n, NULL); } -static struct leaf *trie_nextleaf(struct leaf *l) +static struct tnode *trie_nextleaf(struct tnode *l) { - struct rt_trie_node *c = (struct rt_trie_node *) l; - struct tnode *p = node_parent_rcu(c); + struct tnode *p = node_parent_rcu(l); if (!p) return NULL; /* trie with just one leaf */ - return leaf_walk_rcu(p, c); + return leaf_walk_rcu(p, l); } -static struct leaf *trie_leafindex(struct trie *t, int index) +static struct tnode *trie_leafindex(struct trie *t, int index) { - struct leaf *l = trie_firstleaf(t); + struct tnode *l = trie_firstleaf(t); while (l && index-- > 0) l = trie_nextleaf(l); @@ -1775,7 +1738,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index) int fib_table_flush(struct fib_table *tb) { struct trie *t = (struct trie *) tb->tb_data; - struct leaf *l, *ll = NULL; + struct tnode *l, *ll = NULL; int found = 0; for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) { @@ -1840,7 +1803,7 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, return skb->len; } -static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, +static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { struct leaf_info *li; @@ -1876,7 +1839,7 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - struct leaf *l; + struct tnode *l; struct trie *t = (struct trie *) tb->tb_data; t_key key = cb->args[2]; int count = cb->args[3]; @@ -1922,7 +1885,7 @@ void __init fib_trie_init(void) 0, SLAB_PANIC, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", - max(sizeof(struct leaf), + max(sizeof(struct tnode), sizeof(struct leaf_info)), 0, SLAB_PANIC, NULL); } @@ -1965,7 +1928,7 @@ struct fib_trie_iter { unsigned int depth; }; -static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) +static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; unsigned int cindex = iter->index; @@ -1979,7 +1942,7 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) iter->tnode, iter->index, iter->depth); rescan: while (cindex < (1<bits)) { - struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex); + struct tnode *n = tnode_get_child_rcu(tn, cindex); if (n) { if (IS_LEAF(n)) { @@ -1987,7 +1950,7 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) iter->index = cindex + 1; } else { /* push down one level */ - iter->tnode = (struct tnode *) n; + iter->tnode = n; iter->index = 0; ++iter->depth; } @@ -1998,7 +1961,7 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) } /* Current node exhausted, pop back up */ - p = node_parent_rcu((struct rt_trie_node *)tn); + p = node_parent_rcu(tn); if (p) { cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; tn = p; @@ -2010,10 +1973,10 @@ static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter) return NULL; } -static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, +static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter, struct trie *t) { - struct rt_trie_node *n; + struct tnode *n; if (!t) return NULL; @@ -2023,7 +1986,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, return NULL; if (IS_TNODE(n)) { - iter->tnode = (struct tnode *) n; + iter->tnode = n; iter->index = 0; iter->depth = 1; } else { @@ -2037,7 +2000,7 @@ static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter, static void trie_collect_stats(struct trie *t, struct trie_stat *s) { - struct rt_trie_node *n; + struct tnode *n; struct fib_trie_iter iter; memset(s, 0, sizeof(*s)); @@ -2045,7 +2008,6 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) rcu_read_lock(); for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) { if (IS_LEAF(n)) { - struct leaf *l = (struct leaf *)n; struct leaf_info *li; s->leaves++; @@ -2053,18 +2015,17 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) if (iter.depth > s->maxdepth) s->maxdepth = iter.depth; - hlist_for_each_entry_rcu(li, &l->list, hlist) + hlist_for_each_entry_rcu(li, &n->list, hlist) ++s->prefixes; } else { - const struct tnode *tn = (const struct tnode *) n; int i; s->tnodes++; - if (tn->bits < MAX_STAT_DEPTH) - s->nodesizes[tn->bits]++; + if (n->bits < MAX_STAT_DEPTH) + s->nodesizes[n->bits]++; - for (i = 0; i < (1<bits); i++) - if (!tn->child[i]) + for (i = 0; i < tnode_child_length(n); i++) + if (!rcu_access_pointer(n->child[i])) s->nullpointers++; } } @@ -2088,7 +2049,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth); seq_printf(seq, "\tLeaves: %u\n", stat->leaves); - bytes = sizeof(struct leaf) * stat->leaves; + bytes = sizeof(struct tnode) * stat->leaves; seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes); bytes += sizeof(struct leaf_info) * stat->prefixes; @@ -2109,7 +2070,7 @@ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) seq_putc(seq, '\n'); seq_printf(seq, "\tPointers: %u\n", pointers); - bytes += sizeof(struct rt_trie_node *) * pointers; + bytes += sizeof(struct tnode *) * pointers; seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers); seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024); } @@ -2163,7 +2124,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "Basic info: size of leaf:" " %Zd bytes, size of tnode: %Zd bytes.\n", - sizeof(struct leaf), sizeof(struct tnode)); + sizeof(struct tnode), sizeof(struct tnode)); for (h = 0; h < FIB_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv4.fib_table_hash[h]; @@ -2202,7 +2163,7 @@ static const struct file_operations fib_triestat_fops = { .release = single_release_net, }; -static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) +static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos) { struct fib_trie_iter *iter = seq->private; struct net *net = seq_file_net(seq); @@ -2214,7 +2175,7 @@ static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos) struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) { - struct rt_trie_node *n; + struct tnode *n; for (n = fib_trie_get_first(iter, (struct trie *) tb->tb_data); @@ -2243,7 +2204,7 @@ static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct fib_table *tb = iter->tb; struct hlist_node *tb_node; unsigned int h; - struct rt_trie_node *n; + struct tnode *n; ++*pos; /* next node in same table */ @@ -2329,29 +2290,26 @@ static inline const char *rtn_type(char *buf, size_t len, unsigned int t) static int fib_trie_seq_show(struct seq_file *seq, void *v) { const struct fib_trie_iter *iter = seq->private; - struct rt_trie_node *n = v; + struct tnode *n = v; if (!node_parent_rcu(n)) fib_table_print(seq, iter->tb); if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *) n; - __be32 prf = htonl(tn->key); + __be32 prf = htonl(n->key); - seq_indent(seq, iter->depth-1); + seq_indent(seq, iter->depth - 1); seq_printf(seq, " +-- %pI4/%d %d %d %d\n", - &prf, tn->pos, tn->bits, tn->full_children, - tn->empty_children); - + &prf, n->pos, n->bits, n->full_children, + n->empty_children); } else { - struct leaf *l = (struct leaf *) n; struct leaf_info *li; - __be32 val = htonl(l->key); + __be32 val = htonl(n->key); seq_indent(seq, iter->depth); seq_printf(seq, " |-- %pI4\n", &val); - hlist_for_each_entry_rcu(li, &l->list, hlist) { + hlist_for_each_entry_rcu(li, &n->list, hlist) { struct fib_alias *fa; list_for_each_entry_rcu(fa, &li->falh, fa_list) { @@ -2401,9 +2359,9 @@ struct fib_route_iter { t_key key; }; -static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) +static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos) { - struct leaf *l = NULL; + struct tnode *l = NULL; struct trie *t = iter->main_trie; /* use cache location of last found key */ @@ -2448,7 +2406,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; - struct leaf *l = v; + struct tnode *l = v; ++*pos; if (v == SEQ_START_TOKEN) { @@ -2494,7 +2452,7 @@ static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info */ static int fib_route_seq_show(struct seq_file *seq, void *v) { - struct leaf *l = v; + struct tnode *l = v; struct leaf_info *li; if (v == SEQ_START_TOKEN) { From 9f9e636d4f89f788c5cf9c6a5357501c0d405fcb Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:55:54 -0800 Subject: [PATCH 05/17] fib_trie: Optimize fib_table_lookup to avoid wasting time on loops/variables This patch is meant to reduce the complexity of fib_table_lookup by reducing the number of variables to the bare minimum while still keeping the same if not improved functionality versus the original. Most of this change was started off by the desire to rid the function of chopped_off and current_prefix_length as they actually added very little to the function since they only applied when computing the cindex. I was able to replace them mostly with just a check for the prefix match. As long as the prefix between the key and the node being tested was the same we know we can search the tnode fully versus just testing cindex 0. The second portion of the change ended up being a massive reordering. Originally the calls to check_leaf were up near the start of the loop, and the backtracing and descending into lower levels of tnodes was later. This didn't make much sense as the structure of the tree means the leaves are always the last thing to be tested. As such I reordered things so that we instead have a loop that will delve into the tree and only exit when we have either found a leaf or we have exhausted the tree. The advantage of rearranging things like this is that we can fully inline check_leaf since there is now only one reference to it in the function. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 268 +++++++++++++++++--------------------------- 1 file changed, 102 insertions(+), 166 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 94c929f6c752..3fe4dd917ce1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -90,6 +90,9 @@ typedef unsigned int t_key; #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) +#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits) +#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv)) + struct tnode { t_key key; unsigned char bits; /* 2log(KEYLENGTH) bits needed */ @@ -1281,7 +1284,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l, continue; fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; - if (err) { + if (unlikely(err < 0)) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(t->stats->semantic_match_passed); #endif @@ -1303,7 +1306,7 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l, res->prefixlen = li->plen; res->nh_sel = nhsel; res->type = fa->fa_type; - res->scope = fa->fa_info->fib_scope; + res->scope = fi->fib_scope; res->fi = fi; res->table = tb; res->fa_head = &li->falh; @@ -1321,23 +1324,24 @@ static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l, return 1; } +static inline t_key prefix_mismatch(t_key key, struct tnode *n) +{ + t_key prefix = n->key; + + return (key ^ prefix) & (prefix | -prefix); +} + int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { - struct trie *t = (struct trie *) tb->tb_data; + struct trie *t = (struct trie *)tb->tb_data; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats __percpu *stats = t->stats; #endif - int ret; - struct tnode *n; - struct tnode *pn; - unsigned int pos, bits; - t_key key = ntohl(flp->daddr); - unsigned int chopped_off; - t_key cindex = 0; - unsigned int current_prefix_length = KEYLENGTH; - struct tnode *cn; - t_key pref_mismatch; + const t_key key = ntohl(flp->daddr); + struct tnode *n, *pn; + t_key cindex; + int ret = 1; rcu_read_lock(); @@ -1349,170 +1353,102 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, this_cpu_inc(stats->gets); #endif - /* Just a leaf? */ - if (IS_LEAF(n)) { - ret = check_leaf(tb, t, n, key, flp, res, fib_flags); - goto found; - } - pn = n; - chopped_off = 0; + cindex = 0; - while (pn) { - pos = pn->pos; - bits = pn->bits; + /* Step 1: Travel to the longest prefix match in the trie */ + for (;;) { + unsigned long index = get_index(key, n); - if (!chopped_off) - cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length), - pos, bits); + /* This bit of code is a bit tricky but it combines multiple + * checks into a single check. The prefix consists of the + * prefix plus zeros for the "bits" in the prefix. The index + * is the difference between the key and this value. From + * this we can actually derive several pieces of data. + * if !(index >> bits) + * we know the value is child index + * else + * we have a mismatch in skip bits and failed + */ + if (index >> n->bits) + break; - n = tnode_get_child_rcu(pn, cindex); - - if (n == NULL) { -#ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->null_node_hit); -#endif - goto backtrace; - } - - if (IS_LEAF(n)) { - ret = check_leaf(tb, t, n, key, flp, res, fib_flags); - if (ret > 0) - goto backtrace; + /* we have found a leaf. Prefixes have already been compared */ + if (IS_LEAF(n)) goto found; + + /* only record pn and cindex if we are going to be chopping + * bits later. Otherwise we are just wasting cycles. + */ + if (index) { + pn = n; + cindex = index; } - cn = n; - - /* - * It's a tnode, and we can do some extra checks here if we - * like, to avoid descending into a dead-end branch. - * This tnode is in the parent's child array at index - * key[p_pos..p_pos+p_bits] but potentially with some bits - * chopped off, so in reality the index may be just a - * subprefix, padded with zero at the end. - * We can also take a look at any skipped bits in this - * tnode - everything up to p_pos is supposed to be ok, - * and the non-chopped bits of the index (se previous - * paragraph) are also guaranteed ok, but the rest is - * considered unknown. - * - * The skipped bits are key[pos+bits..cn->pos]. - */ - - /* If current_prefix_length < pos+bits, we are already doing - * actual prefix matching, which means everything from - * pos+(bits-chopped_off) onward must be zero along some - * branch of this subtree - otherwise there is *no* valid - * prefix present. Here we can only check the skipped - * bits. Remember, since we have already indexed into the - * parent's child array, we know that the bits we chopped of - * *are* zero. - */ - - /* NOTA BENE: Checking only skipped bits - for the new node here */ - - if (current_prefix_length < pos+bits) { - if (tkey_extract_bits(cn->key, current_prefix_length, - cn->pos - current_prefix_length) - || !(cn->child[0])) - goto backtrace; - } - - /* - * If chopped_off=0, the index is fully validated and we - * only need to look at the skipped bits for this, the new, - * tnode. What we actually want to do is to find out if - * these skipped bits match our key perfectly, or if we will - * have to count on finding a matching prefix further down, - * because if we do, we would like to have some way of - * verifying the existence of such a prefix at this point. - */ - - /* The only thing we can do at this point is to verify that - * any such matching prefix can indeed be a prefix to our - * key, and if the bits in the node we are inspecting that - * do not match our key are not ZERO, this cannot be true. - * Thus, find out where there is a mismatch (before cn->pos) - * and verify that all the mismatching bits are zero in the - * new tnode's key. - */ - - /* - * Note: We aren't very concerned about the piece of - * the key that precede pn->pos+pn->bits, since these - * have already been checked. The bits after cn->pos - * aren't checked since these are by definition - * "unknown" at this point. Thus, what we want to see - * is if we are about to enter the "prefix matching" - * state, and in that case verify that the skipped - * bits that will prevail throughout this subtree are - * zero, as they have to be if we are to find a - * matching prefix. - */ - - pref_mismatch = mask_pfx(cn->key ^ key, cn->pos); - - /* - * In short: If skipped bits in this node do not match - * the search key, enter the "prefix matching" - * state.directly. - */ - if (pref_mismatch) { - /* fls(x) = __fls(x) + 1 */ - int mp = KEYLENGTH - __fls(pref_mismatch) - 1; - - if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) - goto backtrace; - - if (current_prefix_length >= cn->pos) - current_prefix_length = mp; - } - - pn = n; /* Descend */ - chopped_off = 0; - continue; - -backtrace: - chopped_off++; - - /* As zero don't change the child key (cindex) */ - while ((chopped_off <= pn->bits) - && !(cindex & (1<<(chopped_off-1)))) - chopped_off++; - - /* Decrease current_... with bits chopped off */ - if (current_prefix_length > pn->pos + pn->bits - chopped_off) - current_prefix_length = pn->pos + pn->bits - - chopped_off; - - /* - * Either we do the actual chop off according or if we have - * chopped off all bits in this tnode walk up to our parent. - */ - - if (chopped_off <= pn->bits) { - cindex &= ~(1 << (chopped_off-1)); - } else { - struct tnode *parent = node_parent_rcu(pn); - if (!parent) - goto failed; - - /* Get Child's index */ - cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits); - pn = parent; - chopped_off = 0; - -#ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(stats->backtrack); -#endif + n = rcu_dereference(n->child[index]); + if (unlikely(!n)) goto backtrace; + } + + /* Step 2: Sort out leaves and begin backtracing for longest prefix */ + for (;;) { + /* record the pointer where our next node pointer is stored */ + struct tnode __rcu **cptr = n->child; + + /* This test verifies that none of the bits that differ + * between the key and the prefix exist in the region of + * the lsb and higher in the prefix. + */ + if (unlikely(prefix_mismatch(key, n))) + goto backtrace; + + /* exit out and process leaf */ + if (unlikely(IS_LEAF(n))) + break; + + /* Don't bother recording parent info. Since we are in + * prefix match mode we will have to come back to wherever + * we started this traversal anyway + */ + + while ((n = rcu_dereference(*cptr)) == NULL) { +backtrace: +#ifdef CONFIG_IP_FIB_TRIE_STATS + if (!n) + this_cpu_inc(stats->null_node_hit); +#endif + /* If we are at cindex 0 there are no more bits for + * us to strip at this level so we must ascend back + * up one level to see if there are any more bits to + * be stripped there. + */ + while (!cindex) { + t_key pkey = pn->key; + + pn = node_parent_rcu(pn); + if (unlikely(!pn)) + goto failed; +#ifdef CONFIG_IP_FIB_TRIE_STATS + this_cpu_inc(stats->backtrack); +#endif + /* Get Child's index */ + cindex = get_index(pkey, pn); + } + + /* strip the least significant bit from the cindex */ + cindex &= cindex - 1; + + /* grab pointer for next child node */ + cptr = &pn->child[cindex]; } } -failed: - ret = 1; + found: + /* Step 3: Process the leaf, if that fails fall back to backtracing */ + ret = check_leaf(tb, t, n, key, flp, res, fib_flags); + if (unlikely(ret > 0)) + goto backtrace; +failed: rcu_read_unlock(); return ret; } From 939afb0657dd8c8f9486d172d6bb62fc902e2f23 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:00 -0800 Subject: [PATCH 06/17] fib_trie: Optimize fib_find_node This patch makes use of the same features I made use of for fib_table_lookup to streamline fib_find_node. The resultant code should be smaller and run faster than the original. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3fe4dd917ce1..ac04f31a632e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -892,28 +892,34 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) } /* rcu_read_lock needs to be hold by caller from readside */ - static struct tnode *fib_find_node(struct trie *t, u32 key) { struct tnode *n = rcu_dereference_rtnl(t->trie); - int pos = 0; - while (n && IS_TNODE(n)) { - if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) { - pos = n->pos + n->bits; - n = tnode_get_child_rcu(n, - tkey_extract_bits(key, - n->pos, - n->bits)); - } else + while (n) { + unsigned long index = get_index(key, n); + + /* This bit of code is a bit tricky but it combines multiple + * checks into a single check. The prefix consists of the + * prefix plus zeros for the bits in the cindex. The index + * is the difference between the key and this value. From + * this we can actually derive several pieces of data. + * if !(index >> bits) + * we know the value is cindex + * else + * we have a mismatch in skip bits and failed + */ + if (index >> n->bits) + return NULL; + + /* we have found a leaf. Prefixes have already been compared */ + if (IS_LEAF(n)) break; + + n = rcu_dereference_rtnl(n->child[index]); } - /* Case we have found a leaf. Compare prefixes */ - if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) - return n; - - return NULL; + return n; } static void trie_rebalance(struct trie *t, struct tnode *tn) From 836a0123c98c91df71678ce63c0d42770996582a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:06 -0800 Subject: [PATCH 07/17] fib_trie: Optimize fib_table_insert This patch updates the fib_table_insert function to take advantage of the changes made to improve the performance of fib_table_lookup. As a result the code should be smaller and run faster then the original. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 212 +++++++++++++++++--------------------------- 1 file changed, 79 insertions(+), 133 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ac04f31a632e..8a147c3baaba 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -222,31 +222,6 @@ static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int return 0; } -static inline int tkey_equals(t_key a, t_key b) -{ - return a == b; -} - -static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) -{ - if (bits == 0 || offset >= KEYLENGTH) - return 1; - bits = bits > KEYLENGTH ? KEYLENGTH : bits; - return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; -} - -static inline int tkey_mismatch(t_key a, int offset, t_key b) -{ - t_key diff = a ^ b; - int i = offset; - - if (!diff) - return 0; - while ((diff << i) >> (KEYLENGTH-1) == 0) - i++; - return i; -} - /* To understand this stuff, an understanding of keys and all their bits is necessary. Every node in the trie has a key associated with it, but not @@ -485,6 +460,15 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n, rcu_assign_pointer(tn->child[i], n); } +static void put_child_root(struct tnode *tp, struct trie *t, + t_key key, struct tnode *n) +{ + if (tp) + put_child(tp, get_index(key, tp), n); + else + rcu_assign_pointer(t->trie, n); +} + #define MAX_WORK 10 static struct tnode *resize(struct trie *t, struct tnode *tn) { @@ -959,138 +943,100 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) { - int pos, newpos; - struct tnode *tp = NULL, *tn = NULL; - struct tnode *n; - struct tnode *l; - int missbit; struct list_head *fa_head = NULL; + struct tnode *l, *n, *tp = NULL; struct leaf_info *li; - t_key cindex; - pos = 0; + li = leaf_info_new(plen); + if (!li) + return NULL; + fa_head = &li->falh; + n = rtnl_dereference(t->trie); /* If we point to NULL, stop. Either the tree is empty and we should * just put a new leaf in if, or we have reached an empty child slot, * and we should just put our new leaf in that. - * If we point to a T_TNODE, check if it matches our key. Note that - * a T_TNODE might be skipping any number of bits - its 'pos' need - * not be the parent's 'pos'+'bits'! * - * If it does match the current key, get pos/bits from it, extract - * the index from our key, push the T_TNODE and walk the tree. - * - * If it doesn't, we have to replace it with a new T_TNODE. - * - * If we point to a T_LEAF, it might or might not have the same key - * as we do. If it does, just change the value, update the T_LEAF's - * value, and return it. - * If it doesn't, we need to replace it with a T_TNODE. + * If we hit a node with a key that does't match then we should stop + * and create a new tnode to replace that node and insert ourselves + * and the other node into the new tnode. */ + while (n) { + unsigned long index = get_index(key, n); - while (n && IS_TNODE(n)) { - if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) { - tp = n; - pos = n->pos + n->bits; - n = tnode_get_child(n, - tkey_extract_bits(key, - n->pos, - n->bits)); - - BUG_ON(n && node_parent(n) != tp); - } else - break; - } - - /* - * n ----> NULL, LEAF or TNODE - * - * tp is n's (parent) ----> NULL or TNODE - */ - - BUG_ON(tp && IS_LEAF(tp)); - - /* Case 1: n is a leaf. Compare prefixes */ - - if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - li = leaf_info_new(plen); - - if (!li) - return NULL; - - fa_head = &li->falh; - insert_leaf_info(&n->list, li); - goto done; - } - l = leaf_new(key); - - if (!l) - return NULL; - - li = leaf_info_new(plen); - - if (!li) { - node_free(l); - return NULL; - } - - fa_head = &li->falh; - insert_leaf_info(&l->list, li); - - if (t->trie && n == NULL) { - /* Case 2: n is NULL, and will just insert a new leaf */ - - node_set_parent(l, tp); - - cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(tp, cindex, l); - } else { - /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ - /* - * Add a new tnode here - * first tnode need some special handling + /* This bit of code is a bit tricky but it combines multiple + * checks into a single check. The prefix consists of the + * prefix plus zeros for the "bits" in the prefix. The index + * is the difference between the key and this value. From + * this we can actually derive several pieces of data. + * if !(index >> bits) + * we know the value is child index + * else + * we have a mismatch in skip bits and failed */ + if (index >> n->bits) + break; - if (n) { - pos = tp ? tp->pos+tp->bits : 0; - newpos = tkey_mismatch(key, pos, n->key); - tn = tnode_new(n->key, newpos, 1); - } else { - newpos = 0; - tn = tnode_new(key, newpos, 1); /* First tnode */ + /* we have found a leaf. Prefixes have already been compared */ + if (IS_LEAF(n)) { + /* Case 1: n is a leaf, and prefixes match*/ + insert_leaf_info(&n->list, li); + return fa_head; } + tp = n; + n = rcu_dereference_rtnl(n->child[index]); + } + + l = leaf_new(key); + if (!l) { + free_leaf_info(li); + return NULL; + } + + insert_leaf_info(&l->list, li); + + /* Case 2: n is a LEAF or a TNODE and the key doesn't match. + * + * Add a new tnode here + * first tnode need some special handling + * leaves us in position for handling as case 3 + */ + if (n) { + struct tnode *tn; + int newpos; + + newpos = KEYLENGTH - __fls(n->key ^ key) - 1; + + tn = tnode_new(key, newpos, 1); if (!tn) { free_leaf_info(li); node_free(l); return NULL; } - node_set_parent(tn, tp); + /* initialize routes out of node */ + NODE_INIT_PARENT(tn, tp); + put_child(tn, get_index(key, tn) ^ 1, n); - missbit = tkey_extract_bits(key, newpos, 1); - put_child(tn, missbit, l); - put_child(tn, 1-missbit, n); - - if (tp) { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(tp, cindex, tn); - } else { - rcu_assign_pointer(t->trie, tn); - } + /* start adding routes into the node */ + put_child_root(tp, t, key, tn); + node_set_parent(n, tn); + /* parent now has a NULL spot where the leaf can go */ tp = tn; } - if (tp && tp->pos + tp->bits > 32) - pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", - tp, tp->pos, tp->bits, key, plen); + /* Case 3: n is NULL, and will just insert a new leaf */ + if (tp) { + NODE_INIT_PARENT(l, tp); + put_child(tp, get_index(key, tp), l); + trie_rebalance(t, tp); + } else { + rcu_assign_pointer(t->trie, l); + } - /* Rebalance the trie */ - - trie_rebalance(t, tp); -done: return fa_head; } @@ -1470,11 +1416,11 @@ static void trie_leaf_remove(struct trie *t, struct tnode *l) pr_debug("entering trie_leaf_remove(%p)\n", l); if (tp) { - t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); - put_child(tp, cindex, NULL); + put_child(tp, get_index(l->key, tp), NULL); trie_rebalance(t, tp); - } else + } else { RCU_INIT_POINTER(t->trie, NULL); + } node_free(l); } From e9b44019d41a5cf41de43e895a5ed38b19a59b5b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:12 -0800 Subject: [PATCH 08/17] fib_trie: Update meaning of pos to represent unchecked bits This change moves the pos value to the other side of the "bits" field. By doing this it actually simplifies a significant amount of code in the trie. For example when halving a tree we know that the bit lost exists at oldnode->pos, and if we inflate the tree the new bit being add is at tn->pos. Previously to find those bits you would have to subtract pos and bits from the keylength or start with a value of (1 << 31) and then shift that. There are a number of spots throughout the code that benefit from this. In the case of the hot-path searches the main advantage is that we can drop 2 or more operations from the search path as we no longer need to compute the value for the index to be shifted by and can instead just use the raw pos value. In addition the tkey_extract_bits is now defunct and can be replaced by get_index since the two operations were doing the same thing, but now get_index does it much more quickly as it is only an xor and shift versus a pair of shifts and a subtraction. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 194 ++++++++++++++++++-------------------------- 1 file changed, 81 insertions(+), 113 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 8a147c3baaba..9d675486b38c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -90,8 +90,7 @@ typedef unsigned int t_key; #define IS_TNODE(n) ((n)->bits) #define IS_LEAF(n) (!(n)->bits) -#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits) -#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv)) +#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos) struct tnode { t_key key; @@ -209,81 +208,64 @@ static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned return rcu_dereference_rtnl(tn->child[i]); } -static inline t_key mask_pfx(t_key k, unsigned int l) -{ - return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l); -} - -static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits) -{ - if (offset < KEYLENGTH) - return ((t_key)(a << offset)) >> (KEYLENGTH - bits); - else - return 0; -} - -/* - To understand this stuff, an understanding of keys and all their bits is - necessary. Every node in the trie has a key associated with it, but not - all of the bits in that key are significant. - - Consider a node 'n' and its parent 'tp'. - - If n is a leaf, every bit in its key is significant. Its presence is - necessitated by path compression, since during a tree traversal (when - searching for a leaf - unless we are doing an insertion) we will completely - ignore all skipped bits we encounter. Thus we need to verify, at the end of - a potentially successful search, that we have indeed been walking the - correct key path. - - Note that we can never "miss" the correct key in the tree if present by - following the wrong path. Path compression ensures that segments of the key - that are the same for all keys with a given prefix are skipped, but the - skipped part *is* identical for each node in the subtrie below the skipped - bit! trie_insert() in this implementation takes care of that - note the - call to tkey_sub_equals() in trie_insert(). - - if n is an internal node - a 'tnode' here, the various parts of its key - have many different meanings. - - Example: - _________________________________________________________________ - | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | - ----------------------------------------------------------------- - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - - _________________________________________________________________ - | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | - ----------------------------------------------------------------- - 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - - tp->pos = 7 - tp->bits = 3 - n->pos = 15 - n->bits = 4 - - First, let's just ignore the bits that come before the parent tp, that is - the bits from 0 to (tp->pos-1). They are *known* but at this point we do - not use them for anything. - - The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the - index into the parent's child array. That is, they will be used to find - 'n' among tp's children. - - The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits - for the node n. - - All the bits we have seen so far are significant to the node n. The rest - of the bits are really not needed or indeed known in n->key. - - The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into - n's child array, and will of course be different for each child. - - - The rest of the bits, from (n->pos + n->bits) onward, are completely unknown - at this point. - -*/ +/* To understand this stuff, an understanding of keys and all their bits is + * necessary. Every node in the trie has a key associated with it, but not + * all of the bits in that key are significant. + * + * Consider a node 'n' and its parent 'tp'. + * + * If n is a leaf, every bit in its key is significant. Its presence is + * necessitated by path compression, since during a tree traversal (when + * searching for a leaf - unless we are doing an insertion) we will completely + * ignore all skipped bits we encounter. Thus we need to verify, at the end of + * a potentially successful search, that we have indeed been walking the + * correct key path. + * + * Note that we can never "miss" the correct key in the tree if present by + * following the wrong path. Path compression ensures that segments of the key + * that are the same for all keys with a given prefix are skipped, but the + * skipped part *is* identical for each node in the subtrie below the skipped + * bit! trie_insert() in this implementation takes care of that. + * + * if n is an internal node - a 'tnode' here, the various parts of its key + * have many different meanings. + * + * Example: + * _________________________________________________________________ + * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | + * ----------------------------------------------------------------- + * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 + * + * _________________________________________________________________ + * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | + * ----------------------------------------------------------------- + * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 + * + * tp->pos = 22 + * tp->bits = 3 + * n->pos = 13 + * n->bits = 4 + * + * First, let's just ignore the bits that come before the parent tp, that is + * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this + * point we do not use them for anything. + * + * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the + * index into the parent's child array. That is, they will be used to find + * 'n' among tp's children. + * + * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits + * for the node n. + * + * All the bits we have seen so far are significant to the node n. The rest + * of the bits are really not needed or indeed known in n->key. + * + * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into + * n's child array, and will of course be different for each child. + * + * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown + * at this point. + */ static const int halve_threshold = 25; static const int inflate_threshold = 50; @@ -367,7 +349,7 @@ static struct tnode *leaf_new(t_key key) * as the nodes are searched */ l->key = key; - l->pos = KEYLENGTH; + l->pos = 0; /* set bits to 0 indicating we are not a tnode */ l->bits = 0; @@ -400,7 +382,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) tn->parent = NULL; tn->pos = pos; tn->bits = bits; - tn->key = mask_pfx(key, pos); + tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; tn->full_children = 0; tn->empty_children = 1<pos == (tn->pos + tn->bits)); + return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } static inline void put_child(struct tnode *tn, int i, @@ -641,11 +621,12 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) { int olen = tnode_child_length(oldtnode); struct tnode *tn; + t_key m; int i; pr_debug("In inflate\n"); - tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); + tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); if (!tn) return ERR_PTR(-ENOMEM); @@ -656,21 +637,18 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) * fails. In case of failure we return the oldnode and inflate * of tnode is ignored. */ + for (i = 0, m = 1u << tn->pos; i < olen; i++) { + struct tnode *inode = tnode_get_child(oldtnode, i); - for (i = 0; i < olen; i++) { - struct tnode *inode; - - inode = tnode_get_child(oldtnode, i); - if (tnode_full(oldtnode, inode) && inode->bits > 1) { + if (tnode_full(oldtnode, inode) && (inode->bits > 1)) { struct tnode *left, *right; - t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos; - left = tnode_new(inode->key&(~m), inode->pos + 1, + left = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1); if (!left) goto nomem; - right = tnode_new(inode->key|m, inode->pos + 1, + right = tnode_new(inode->key | m, inode->pos, inode->bits - 1); if (!right) { @@ -694,9 +672,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) /* A leaf or an internal node with skipped bits */ if (!tnode_full(oldtnode, inode)) { - put_child(tn, - tkey_extract_bits(inode->key, tn->pos, tn->bits), - inode); + put_child(tn, get_index(inode->key, tn), inode); continue; } @@ -767,7 +743,7 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode) pr_debug("In halve\n"); - tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); + tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); if (!tn) return ERR_PTR(-ENOMEM); @@ -787,7 +763,7 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode) if (left && right) { struct tnode *newn; - newn = tnode_new(left->key, tn->pos + tn->bits, 1); + newn = tnode_new(left->key, oldtnode->pos, 1); if (!newn) goto nomem; @@ -915,7 +891,7 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) key = tn->key; while (tn != NULL && (tp = node_parent(tn)) != NULL) { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); + cindex = get_index(key, tp); wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = resize(t, tn); @@ -1005,11 +981,8 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) */ if (n) { struct tnode *tn; - int newpos; - newpos = KEYLENGTH - __fls(n->key ^ key) - 1; - - tn = tnode_new(key, newpos, 1); + tn = tnode_new(key, __fls(key ^ n->key), 1); if (!tn) { free_leaf_info(li); node_free(l); @@ -1559,12 +1532,7 @@ static int trie_flush_leaf(struct tnode *l) static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) { do { - t_key idx; - - if (c) - idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1; - else - idx = 0; + t_key idx = c ? idx = get_index(c->key, p) + 1 : 0; while (idx < 1u << p->bits) { c = tnode_get_child_rcu(p, idx++); @@ -1851,7 +1819,7 @@ static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) /* Current node exhausted, pop back up */ p = node_parent_rcu(tn); if (p) { - cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1; + cindex = get_index(tn->key, p) + 1; tn = p; --iter->depth; goto rescan; @@ -2186,10 +2154,10 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (IS_TNODE(n)) { __be32 prf = htonl(n->key); - seq_indent(seq, iter->depth - 1); - seq_printf(seq, " +-- %pI4/%d %d %d %d\n", - &prf, n->pos, n->bits, n->full_children, - n->empty_children); + seq_indent(seq, iter->depth-1); + seq_printf(seq, " +-- %pI4/%zu %u %u %u\n", + &prf, KEYLENGTH - n->pos - n->bits, n->bits, + n->full_children, n->empty_children); } else { struct leaf_info *li; __be32 val = htonl(n->key); From 98293e8d2f51976d5f790400859a517c0f7cb598 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:18 -0800 Subject: [PATCH 09/17] fib_trie: Use unsigned long for anything dealing with a shift by bits This change makes it so that anything that can be shifted by, or compared to a value shifted by bits is updated to be an unsigned long. This is mostly a precaution against an insanely huge address space that somehow starts coming close to the 2^32 root node size which would require something like 1.5 billion addresses. I chose unsigned long instead of unsigned long long since I do not believe it is possible to allocate a 32 bit tnode on a 32 bit system as the memory consumed would be 16GB + 28B which exceeds the addressible space for any one process. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 53 ++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9d675486b38c..28a3065470bc 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -146,8 +146,8 @@ struct trie { #endif }; -static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n, - int wasfull); +static void tnode_put_child_reorg(struct tnode *tn, unsigned long i, + struct tnode *n, int wasfull); static struct tnode *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); @@ -183,25 +183,23 @@ static inline void node_set_parent(struct tnode *n, struct tnode *tp) /* This provides us with the number of children in this node, in the case of a * leaf this will return 0 meaning none of the children are accessible. */ -static inline int tnode_child_length(const struct tnode *tn) +static inline unsigned long tnode_child_length(const struct tnode *tn) { return (1ul << tn->bits) & ~(1ul); } -/* - * caller must hold RTNL - */ -static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i) +/* caller must hold RTNL */ +static inline struct tnode *tnode_get_child(const struct tnode *tn, + unsigned long i) { BUG_ON(i >= tnode_child_length(tn)); return rtnl_dereference(tn->child[i]); } -/* - * caller must hold RCU read lock or RTNL - */ -static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i) +/* caller must hold RCU read lock or RTNL */ +static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, + unsigned long i) { BUG_ON(i >= tnode_child_length(tn)); @@ -400,7 +398,7 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n) return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } -static inline void put_child(struct tnode *tn, int i, +static inline void put_child(struct tnode *tn, unsigned long i, struct tnode *n) { tnode_put_child_reorg(tn, i, n, -1); @@ -411,13 +409,13 @@ static inline void put_child(struct tnode *tn, int i, * Update the value of full_children and empty_children. */ -static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n, - int wasfull) +static void tnode_put_child_reorg(struct tnode *tn, unsigned long i, + struct tnode *n, int wasfull) { struct tnode *chi = rtnl_dereference(tn->child[i]); int isfull; - BUG_ON(i >= 1<bits); + BUG_ON(i >= tnode_child_length(tn)); /* update emptyChildren */ if (n == NULL && chi != NULL) @@ -607,10 +605,10 @@ static struct tnode *resize(struct trie *t, struct tnode *tn) static void tnode_clean_free(struct tnode *tn) { struct tnode *tofree; - int i; + unsigned long i; for (i = 0; i < tnode_child_length(tn); i++) { - tofree = rtnl_dereference(tn->child[i]); + tofree = tnode_get_child(tn, i); if (tofree) node_free(tofree); } @@ -619,10 +617,10 @@ static void tnode_clean_free(struct tnode *tn) static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) { - int olen = tnode_child_length(oldtnode); + unsigned long olen = tnode_child_length(oldtnode); struct tnode *tn; + unsigned long i; t_key m; - int i; pr_debug("In inflate\n"); @@ -664,7 +662,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) for (i = 0; i < olen; i++) { struct tnode *inode = tnode_get_child(oldtnode, i); struct tnode *left, *right; - int size, j; + unsigned long size, j; /* An empty child */ if (inode == NULL) @@ -737,7 +735,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) static struct tnode *halve(struct trie *t, struct tnode *oldtnode) { - int olen = tnode_child_length(oldtnode); + unsigned long olen = tnode_child_length(oldtnode); struct tnode *tn, *left, *right; int i; @@ -1532,9 +1530,9 @@ static int trie_flush_leaf(struct tnode *l) static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c) { do { - t_key idx = c ? idx = get_index(c->key, p) + 1 : 0; + unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0; - while (idx < 1u << p->bits) { + while (idx < tnode_child_length(p)) { c = tnode_get_child_rcu(p, idx++); if (!c) continue; @@ -1786,8 +1784,8 @@ struct fib_trie_iter { static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) { + unsigned long cindex = iter->index; struct tnode *tn = iter->tnode; - unsigned int cindex = iter->index; struct tnode *p; /* A single entry routing table */ @@ -1797,7 +1795,7 @@ static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter) pr_debug("get_next iter={node=%p index=%d depth=%d}\n", iter->tnode, iter->index, iter->depth); rescan: - while (cindex < (1<bits)) { + while (cindex < tnode_child_length(tn)) { struct tnode *n = tnode_get_child_rcu(tn, cindex); if (n) { @@ -1874,15 +1872,16 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) hlist_for_each_entry_rcu(li, &n->list, hlist) ++s->prefixes; } else { - int i; + unsigned long i; s->tnodes++; if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; - for (i = 0; i < tnode_child_length(n); i++) + for (i = 0; i < tnode_child_length(n); i++) { if (!rcu_access_pointer(n->child[i])) s->nullpointers++; + } } } rcu_read_unlock(); From 345e9b54268ae065520a7252c182d22ef4591718 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:24 -0800 Subject: [PATCH 10/17] fib_trie: Push rcu_read_lock/unlock to callers This change is to start cleaning up some of the rcu_read_lock/unlock handling. I realized while reviewing the code there are several spots that I don't believe are being handled correctly or are masking warnings by locally calling rcu_read_lock/unlock instead of calling them at the correct level. A common example is a call to fib_get_table followed by fib_table_lookup. The rcu_read_lock/unlock ought to wrap both but there are several spots where they were not wrapped. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/net/ip_fib.h | 50 +++++++++------ net/ipv4/fib_frontend.c | 27 ++++---- net/ipv4/fib_rules.c | 24 ++++--- net/ipv4/fib_trie.c | 137 ++++++++++++++++++---------------------- 4 files changed, 115 insertions(+), 123 deletions(-) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 09a819ee2151..5bd120e4bc0a 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -222,16 +222,19 @@ static inline struct fib_table *fib_new_table(struct net *net, u32 id) static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res) { - struct fib_table *table; + int err = -ENETUNREACH; - table = fib_get_table(net, RT_TABLE_LOCAL); - if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF)) - return 0; + rcu_read_lock(); - table = fib_get_table(net, RT_TABLE_MAIN); - if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF)) - return 0; - return -ENETUNREACH; + if (!fib_table_lookup(fib_get_table(net, RT_TABLE_LOCAL), flp, res, + FIB_LOOKUP_NOREF) || + !fib_table_lookup(fib_get_table(net, RT_TABLE_MAIN), flp, res, + FIB_LOOKUP_NOREF)) + err = 0; + + rcu_read_unlock(); + + return err; } #else /* CONFIG_IP_MULTIPLE_TABLES */ @@ -247,20 +250,25 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res) { if (!net->ipv4.fib_has_custom_rules) { + int err = -ENETUNREACH; + + rcu_read_lock(); + res->tclassid = 0; - if (net->ipv4.fib_local && - !fib_table_lookup(net->ipv4.fib_local, flp, res, - FIB_LOOKUP_NOREF)) - return 0; - if (net->ipv4.fib_main && - !fib_table_lookup(net->ipv4.fib_main, flp, res, - FIB_LOOKUP_NOREF)) - return 0; - if (net->ipv4.fib_default && - !fib_table_lookup(net->ipv4.fib_default, flp, res, - FIB_LOOKUP_NOREF)) - return 0; - return -ENETUNREACH; + if ((net->ipv4.fib_local && + !fib_table_lookup(net->ipv4.fib_local, flp, res, + FIB_LOOKUP_NOREF)) || + (net->ipv4.fib_main && + !fib_table_lookup(net->ipv4.fib_main, flp, res, + FIB_LOOKUP_NOREF)) || + (net->ipv4.fib_default && + !fib_table_lookup(net->ipv4.fib_default, flp, res, + FIB_LOOKUP_NOREF))) + err = 0; + + rcu_read_unlock(); + + return err; } return __fib_lookup(net, flp, res); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 66890209208f..57be71dd6a9e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -109,6 +109,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) return tb; } +/* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) { struct fib_table *tb; @@ -119,15 +120,11 @@ struct fib_table *fib_get_table(struct net *net, u32 id) id = RT_TABLE_MAIN; h = id & (FIB_TABLE_HASHSZ - 1); - rcu_read_lock(); head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb_hlist) { - if (tb->tb_id == id) { - rcu_read_unlock(); + if (tb->tb_id == id) return tb; - } } - rcu_read_unlock(); return NULL; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ @@ -167,16 +164,18 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, if (ipv4_is_multicast(addr)) return RTN_MULTICAST; + rcu_read_lock(); + local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; - rcu_read_lock(); if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; } - rcu_read_unlock(); } + + rcu_read_unlock(); return ret; } @@ -919,7 +918,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) #undef BRD1_OK } -static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) +static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) { struct fib_result res; @@ -929,6 +928,11 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) .flowi4_tos = frn->fl_tos, .flowi4_scope = frn->fl_scope, }; + struct fib_table *tb; + + rcu_read_lock(); + + tb = fib_get_table(net, frn->tb_id_in); frn->err = -ENOENT; if (tb) { @@ -945,6 +949,8 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) } local_bh_enable(); } + + rcu_read_unlock(); } static void nl_fib_input(struct sk_buff *skb) @@ -952,7 +958,6 @@ static void nl_fib_input(struct sk_buff *skb) struct net *net; struct fib_result_nl *frn; struct nlmsghdr *nlh; - struct fib_table *tb; u32 portid; net = sock_net(skb->sk); @@ -967,9 +972,7 @@ static void nl_fib_input(struct sk_buff *skb) nlh = nlmsg_hdr(skb); frn = (struct fib_result_nl *) nlmsg_data(nlh); - tb = fib_get_table(net, frn->tb_id_in); - - nl_fib_lookup(frn, tb); + nl_fib_lookup(net, frn); portid = NETLINK_CB(skb).portid; /* netlink portid */ NETLINK_CB(skb).portid = 0; /* from kernel */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 8f7bd56955b0..d3db718be51d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -81,27 +81,25 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, break; case FR_ACT_UNREACHABLE: - err = -ENETUNREACH; - goto errout; + return -ENETUNREACH; case FR_ACT_PROHIBIT: - err = -EACCES; - goto errout; + return -EACCES; case FR_ACT_BLACKHOLE: default: - err = -EINVAL; - goto errout; + return -EINVAL; } - tbl = fib_get_table(rule->fr_net, rule->table); - if (!tbl) - goto errout; + rcu_read_lock(); - err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags); - if (err > 0) - err = -EAGAIN; -errout: + tbl = fib_get_table(rule->fr_net, rule->table); + if (tbl) + err = fib_table_lookup(tbl, &flp->u.ip4, + (struct fib_result *)arg->result, + arg->flags); + + rcu_read_unlock(); return err; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 28a3065470bc..987b06d1effe 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1181,72 +1181,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) return err; } -/* should be called with rcu_read_lock */ -static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l, - t_key key, const struct flowi4 *flp, - struct fib_result *res, int fib_flags) -{ - struct leaf_info *li; - struct hlist_head *hhead = &l->list; - - hlist_for_each_entry_rcu(li, hhead, hlist) { - struct fib_alias *fa; - - if (l->key != (key & li->mask_plen)) - continue; - - list_for_each_entry_rcu(fa, &li->falh, fa_list) { - struct fib_info *fi = fa->fa_info; - int nhsel, err; - - if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) - continue; - if (fi->fib_dead) - continue; - if (fa->fa_info->fib_scope < flp->flowi4_scope) - continue; - fib_alias_accessed(fa); - err = fib_props[fa->fa_type].error; - if (unlikely(err < 0)) { -#ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->semantic_match_passed); -#endif - return err; - } - if (fi->fib_flags & RTNH_F_DEAD) - continue; - for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { - const struct fib_nh *nh = &fi->fib_nh[nhsel]; - - if (nh->nh_flags & RTNH_F_DEAD) - continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) - continue; - -#ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->semantic_match_passed); -#endif - res->prefixlen = li->plen; - res->nh_sel = nhsel; - res->type = fa->fa_type; - res->scope = fi->fib_scope; - res->fi = fi; - res->table = tb; - res->fa_head = &li->falh; - if (!(fib_flags & FIB_LOOKUP_NOREF)) - atomic_inc(&fi->fib_clntref); - return 0; - } - } - -#ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->semantic_match_miss); -#endif - } - - return 1; -} - static inline t_key prefix_mismatch(t_key key, struct tnode *n) { t_key prefix = n->key; @@ -1254,6 +1188,7 @@ static inline t_key prefix_mismatch(t_key key, struct tnode *n) return (key ^ prefix) & (prefix | -prefix); } +/* should be called with rcu_read_lock */ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags) { @@ -1263,14 +1198,12 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, #endif const t_key key = ntohl(flp->daddr); struct tnode *n, *pn; + struct leaf_info *li; t_key cindex; - int ret = 1; - - rcu_read_lock(); n = rcu_dereference(t->trie); if (!n) - goto failed; + return -EAGAIN; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->gets); @@ -1350,7 +1283,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, pn = node_parent_rcu(pn); if (unlikely(!pn)) - goto failed; + return -EAGAIN; #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->backtrack); #endif @@ -1368,12 +1301,62 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, found: /* Step 3: Process the leaf, if that fails fall back to backtracing */ - ret = check_leaf(tb, t, n, key, flp, res, fib_flags); - if (unlikely(ret > 0)) - goto backtrace; -failed: - rcu_read_unlock(); - return ret; + hlist_for_each_entry_rcu(li, &n->list, hlist) { + struct fib_alias *fa; + + if ((key ^ n->key) & li->mask_plen) + continue; + + list_for_each_entry_rcu(fa, &li->falh, fa_list) { + struct fib_info *fi = fa->fa_info; + int nhsel, err; + + if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos) + continue; + if (fi->fib_dead) + continue; + if (fa->fa_info->fib_scope < flp->flowi4_scope) + continue; + fib_alias_accessed(fa); + err = fib_props[fa->fa_type].error; + if (unlikely(err < 0)) { +#ifdef CONFIG_IP_FIB_TRIE_STATS + this_cpu_inc(stats->semantic_match_passed); +#endif + return err; + } + if (fi->fib_flags & RTNH_F_DEAD) + continue; + for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) { + const struct fib_nh *nh = &fi->fib_nh[nhsel]; + + if (nh->nh_flags & RTNH_F_DEAD) + continue; + if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) + continue; + + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&fi->fib_clntref); + + res->prefixlen = li->plen; + res->nh_sel = nhsel; + res->type = fa->fa_type; + res->scope = fi->fib_scope; + res->fi = fi; + res->table = tb; + res->fa_head = &li->falh; +#ifdef CONFIG_IP_FIB_TRIE_STATS + this_cpu_inc(stats->semantic_match_passed); +#endif + return err; + } + } + +#ifdef CONFIG_IP_FIB_TRIE_STATS + this_cpu_inc(stats->semantic_match_miss); +#endif + } + goto backtrace; } EXPORT_SYMBOL_GPL(fib_table_lookup); From cf3637bb8f07fb3b6791708f1d5118afb5446061 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:31 -0800 Subject: [PATCH 11/17] fib_trie: Move resize to after inflate/halve This change consists of a cut/paste of resize to behind inflate and halve so that I could remove the two function prototypes. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 311 ++++++++++++++++++++++---------------------- 1 file changed, 154 insertions(+), 157 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 987b06d1effe..d044fa355f69 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -149,8 +149,6 @@ struct trie { static void tnode_put_child_reorg(struct tnode *tn, unsigned long i, struct tnode *n, int wasfull); static struct tnode *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn); -static struct tnode *halve(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ static struct callback_head *tnode_free_head; static size_t tnode_free_size; @@ -447,161 +445,6 @@ static void put_child_root(struct tnode *tp, struct trie *t, rcu_assign_pointer(t->trie, n); } -#define MAX_WORK 10 -static struct tnode *resize(struct trie *t, struct tnode *tn) -{ - struct tnode *old_tn, *n = NULL; - int inflate_threshold_use; - int halve_threshold_use; - int max_work; - - if (!tn) - return NULL; - - pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", - tn, inflate_threshold, halve_threshold); - - /* No children */ - if (tn->empty_children > (tnode_child_length(tn) - 1)) - goto no_children; - - /* One child */ - if (tn->empty_children == (tnode_child_length(tn) - 1)) - goto one_child; - /* - * Double as long as the resulting node has a number of - * nonempty nodes that are above the threshold. - */ - - /* - * From "Implementing a dynamic compressed trie" by Stefan Nilsson of - * the Helsinki University of Technology and Matti Tikkanen of Nokia - * Telecommunications, page 6: - * "A node is doubled if the ratio of non-empty children to all - * children in the *doubled* node is at least 'high'." - * - * 'high' in this instance is the variable 'inflate_threshold'. It - * is expressed as a percentage, so we multiply it with - * tnode_child_length() and instead of multiplying by 2 (since the - * child array will be doubled by inflate()) and multiplying - * the left-hand side by 100 (to handle the percentage thing) we - * multiply the left-hand side by 50. - * - * The left-hand side may look a bit weird: tnode_child_length(tn) - * - tn->empty_children is of course the number of non-null children - * in the current node. tn->full_children is the number of "full" - * children, that is non-null tnodes with a skip value of 0. - * All of those will be doubled in the resulting inflated tnode, so - * we just count them one extra time here. - * - * A clearer way to write this would be: - * - * to_be_doubled = tn->full_children; - * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - - * tn->full_children; - * - * new_child_length = tnode_child_length(tn) * 2; - * - * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / - * new_child_length; - * if (new_fill_factor >= inflate_threshold) - * - * ...and so on, tho it would mess up the while () loop. - * - * anyway, - * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= - * inflate_threshold - * - * avoid a division: - * 100 * (not_to_be_doubled + 2*to_be_doubled) >= - * inflate_threshold * new_child_length - * - * expand not_to_be_doubled and to_be_doubled, and shorten: - * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children) >= inflate_threshold * new_child_length - * - * expand new_child_length: - * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children) >= - * inflate_threshold * tnode_child_length(tn) * 2 - * - * shorten again: - * 50 * (tn->full_children + tnode_child_length(tn) - - * tn->empty_children) >= inflate_threshold * - * tnode_child_length(tn) - * - */ - - /* Keep root node larger */ - - if (!node_parent(tn)) { - inflate_threshold_use = inflate_threshold_root; - halve_threshold_use = halve_threshold_root; - } else { - inflate_threshold_use = inflate_threshold; - halve_threshold_use = halve_threshold; - } - - max_work = MAX_WORK; - while ((tn->full_children > 0 && max_work-- && - 50 * (tn->full_children + tnode_child_length(tn) - - tn->empty_children) - >= inflate_threshold_use * tnode_child_length(tn))) { - - old_tn = tn; - tn = inflate(t, tn); - - if (IS_ERR(tn)) { - tn = old_tn; -#ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); -#endif - break; - } - } - - /* Return if at least one inflate is run */ - if (max_work != MAX_WORK) - return tn; - - /* - * Halve as long as the number of empty children in this - * node is above threshold. - */ - - max_work = MAX_WORK; - while (tn->bits > 1 && max_work-- && - 100 * (tnode_child_length(tn) - tn->empty_children) < - halve_threshold_use * tnode_child_length(tn)) { - - old_tn = tn; - tn = halve(t, tn); - if (IS_ERR(tn)) { - tn = old_tn; -#ifdef CONFIG_IP_FIB_TRIE_STATS - this_cpu_inc(t->stats->resize_node_skipped); -#endif - break; - } - } - - - /* Only one child remains */ - if (tn->empty_children == (tnode_child_length(tn) - 1)) { - unsigned long i; -one_child: - for (i = tnode_child_length(tn); !n && i;) - n = tnode_get_child(tn, --i); -no_children: - /* compress one level */ - node_set_parent(n, NULL); - tnode_free_safe(tn); - return n; - } - return tn; -} - - static void tnode_clean_free(struct tnode *tn) { struct tnode *tofree; @@ -804,6 +647,160 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode) return ERR_PTR(-ENOMEM); } +#define MAX_WORK 10 +static struct tnode *resize(struct trie *t, struct tnode *tn) +{ + struct tnode *old_tn, *n = NULL; + int inflate_threshold_use; + int halve_threshold_use; + int max_work; + + if (!tn) + return NULL; + + pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); + + /* No children */ + if (tn->empty_children > (tnode_child_length(tn) - 1)) + goto no_children; + + /* One child */ + if (tn->empty_children == (tnode_child_length(tn) - 1)) + goto one_child; + /* + * Double as long as the resulting node has a number of + * nonempty nodes that are above the threshold. + */ + + /* + * From "Implementing a dynamic compressed trie" by Stefan Nilsson of + * the Helsinki University of Technology and Matti Tikkanen of Nokia + * Telecommunications, page 6: + * "A node is doubled if the ratio of non-empty children to all + * children in the *doubled* node is at least 'high'." + * + * 'high' in this instance is the variable 'inflate_threshold'. It + * is expressed as a percentage, so we multiply it with + * tnode_child_length() and instead of multiplying by 2 (since the + * child array will be doubled by inflate()) and multiplying + * the left-hand side by 100 (to handle the percentage thing) we + * multiply the left-hand side by 50. + * + * The left-hand side may look a bit weird: tnode_child_length(tn) + * - tn->empty_children is of course the number of non-null children + * in the current node. tn->full_children is the number of "full" + * children, that is non-null tnodes with a skip value of 0. + * All of those will be doubled in the resulting inflated tnode, so + * we just count them one extra time here. + * + * A clearer way to write this would be: + * + * to_be_doubled = tn->full_children; + * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * tn->full_children; + * + * new_child_length = tnode_child_length(tn) * 2; + * + * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / + * new_child_length; + * if (new_fill_factor >= inflate_threshold) + * + * ...and so on, tho it would mess up the while () loop. + * + * anyway, + * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= + * inflate_threshold + * + * avoid a division: + * 100 * (not_to_be_doubled + 2*to_be_doubled) >= + * inflate_threshold * new_child_length + * + * expand not_to_be_doubled and to_be_doubled, and shorten: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= inflate_threshold * new_child_length + * + * expand new_child_length: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= + * inflate_threshold * tnode_child_length(tn) * 2 + * + * shorten again: + * 50 * (tn->full_children + tnode_child_length(tn) - + * tn->empty_children) >= inflate_threshold * + * tnode_child_length(tn) + * + */ + + /* Keep root node larger */ + + if (!node_parent(tn)) { + inflate_threshold_use = inflate_threshold_root; + halve_threshold_use = halve_threshold_root; + } else { + inflate_threshold_use = inflate_threshold; + halve_threshold_use = halve_threshold; + } + + max_work = MAX_WORK; + while ((tn->full_children > 0 && max_work-- && + 50 * (tn->full_children + tnode_child_length(tn) + - tn->empty_children) + >= inflate_threshold_use * tnode_child_length(tn))) { + + old_tn = tn; + tn = inflate(t, tn); + + if (IS_ERR(tn)) { + tn = old_tn; +#ifdef CONFIG_IP_FIB_TRIE_STATS + this_cpu_inc(t->stats->resize_node_skipped); +#endif + break; + } + } + + /* Return if at least one inflate is run */ + if (max_work != MAX_WORK) + return tn; + + /* + * Halve as long as the number of empty children in this + * node is above threshold. + */ + + max_work = MAX_WORK; + while (tn->bits > 1 && max_work-- && + 100 * (tnode_child_length(tn) - tn->empty_children) < + halve_threshold_use * tnode_child_length(tn)) { + + old_tn = tn; + tn = halve(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; +#ifdef CONFIG_IP_FIB_TRIE_STATS + this_cpu_inc(t->stats->resize_node_skipped); +#endif + break; + } + } + + + /* Only one child remains */ + if (tn->empty_children == (tnode_child_length(tn) - 1)) { + unsigned long i; +one_child: + for (i = tnode_child_length(tn); !n && i;) + n = tnode_get_child(tn, --i); +no_children: + /* compress one level */ + node_set_parent(n, NULL); + tnode_free_safe(tn); + return n; + } + return tn; +} + /* readside must use rcu_read_lock currently dump routines via get_fa_head and dump */ From f05a48198bf742efd9d36553c51e98a648dbe807 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:37 -0800 Subject: [PATCH 12/17] fib_trie: Add functions should_inflate and should_halve This change pulls the logic for if we should inflate/halve the nodes out into separate functions. It also addresses what I believe is a bug where 1 full node is all that is needed to keep a node from ever being halved. Simple script to reproduce the issue: modprobe dummy; ifconfig dummy0 up for i in `seq 0 255`; do ifconfig dummy0:$i 10.0.${i}.1/24 up; done ifconfig dummy0:256 10.0.255.33/16 up for i in `seq 0 254`; do ifconfig dummy0:$i down; done Results from /proc/net/fib_triestat Before: Local: Aver depth: 3.00 Max depth: 4 Leaves: 17 Prefixes: 18 Internal nodes: 11 1: 8 2: 2 10: 1 Pointers: 1048 Null ptrs: 1021 Total size: 11 kB After: Local: Aver depth: 3.41 Max depth: 5 Leaves: 17 Prefixes: 18 Internal nodes: 12 1: 8 2: 3 3: 1 Pointers: 36 Null ptrs: 8 Total size: 3 kB Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 175 ++++++++++++++++++++++---------------------- 1 file changed, 89 insertions(+), 86 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d044fa355f69..58e1224786fa 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -647,12 +647,94 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode) return ERR_PTR(-ENOMEM); } +/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of + * the Helsinki University of Technology and Matti Tikkanen of Nokia + * Telecommunications, page 6: + * "A node is doubled if the ratio of non-empty children to all + * children in the *doubled* node is at least 'high'." + * + * 'high' in this instance is the variable 'inflate_threshold'. It + * is expressed as a percentage, so we multiply it with + * tnode_child_length() and instead of multiplying by 2 (since the + * child array will be doubled by inflate()) and multiplying + * the left-hand side by 100 (to handle the percentage thing) we + * multiply the left-hand side by 50. + * + * The left-hand side may look a bit weird: tnode_child_length(tn) + * - tn->empty_children is of course the number of non-null children + * in the current node. tn->full_children is the number of "full" + * children, that is non-null tnodes with a skip value of 0. + * All of those will be doubled in the resulting inflated tnode, so + * we just count them one extra time here. + * + * A clearer way to write this would be: + * + * to_be_doubled = tn->full_children; + * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - + * tn->full_children; + * + * new_child_length = tnode_child_length(tn) * 2; + * + * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / + * new_child_length; + * if (new_fill_factor >= inflate_threshold) + * + * ...and so on, tho it would mess up the while () loop. + * + * anyway, + * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= + * inflate_threshold + * + * avoid a division: + * 100 * (not_to_be_doubled + 2*to_be_doubled) >= + * inflate_threshold * new_child_length + * + * expand not_to_be_doubled and to_be_doubled, and shorten: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= inflate_threshold * new_child_length + * + * expand new_child_length: + * 100 * (tnode_child_length(tn) - tn->empty_children + + * tn->full_children) >= + * inflate_threshold * tnode_child_length(tn) * 2 + * + * shorten again: + * 50 * (tn->full_children + tnode_child_length(tn) - + * tn->empty_children) >= inflate_threshold * + * tnode_child_length(tn) + * + */ +static bool should_inflate(const struct tnode *tn) +{ + unsigned long used = tnode_child_length(tn); + unsigned long threshold = used; + + /* Keep root node larger */ + threshold *= node_parent(tn) ? inflate_threshold : + inflate_threshold_root; + used += tn->full_children; + used -= tn->empty_children; + + return tn->pos && ((50 * used) >= threshold); +} + +static bool should_halve(const struct tnode *tn) +{ + unsigned long used = tnode_child_length(tn); + unsigned long threshold = used; + + /* Keep root node larger */ + threshold *= node_parent(tn) ? halve_threshold : + halve_threshold_root; + used -= tn->empty_children; + + return (tn->bits > 1) && ((100 * used) < threshold); +} + #define MAX_WORK 10 static struct tnode *resize(struct trie *t, struct tnode *tn) { struct tnode *old_tn, *n = NULL; - int inflate_threshold_use; - int halve_threshold_use; int max_work; if (!tn) @@ -668,86 +750,12 @@ static struct tnode *resize(struct trie *t, struct tnode *tn) /* One child */ if (tn->empty_children == (tnode_child_length(tn) - 1)) goto one_child; - /* - * Double as long as the resulting node has a number of + + /* Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ - - /* - * From "Implementing a dynamic compressed trie" by Stefan Nilsson of - * the Helsinki University of Technology and Matti Tikkanen of Nokia - * Telecommunications, page 6: - * "A node is doubled if the ratio of non-empty children to all - * children in the *doubled* node is at least 'high'." - * - * 'high' in this instance is the variable 'inflate_threshold'. It - * is expressed as a percentage, so we multiply it with - * tnode_child_length() and instead of multiplying by 2 (since the - * child array will be doubled by inflate()) and multiplying - * the left-hand side by 100 (to handle the percentage thing) we - * multiply the left-hand side by 50. - * - * The left-hand side may look a bit weird: tnode_child_length(tn) - * - tn->empty_children is of course the number of non-null children - * in the current node. tn->full_children is the number of "full" - * children, that is non-null tnodes with a skip value of 0. - * All of those will be doubled in the resulting inflated tnode, so - * we just count them one extra time here. - * - * A clearer way to write this would be: - * - * to_be_doubled = tn->full_children; - * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - - * tn->full_children; - * - * new_child_length = tnode_child_length(tn) * 2; - * - * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / - * new_child_length; - * if (new_fill_factor >= inflate_threshold) - * - * ...and so on, tho it would mess up the while () loop. - * - * anyway, - * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= - * inflate_threshold - * - * avoid a division: - * 100 * (not_to_be_doubled + 2*to_be_doubled) >= - * inflate_threshold * new_child_length - * - * expand not_to_be_doubled and to_be_doubled, and shorten: - * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children) >= inflate_threshold * new_child_length - * - * expand new_child_length: - * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children) >= - * inflate_threshold * tnode_child_length(tn) * 2 - * - * shorten again: - * 50 * (tn->full_children + tnode_child_length(tn) - - * tn->empty_children) >= inflate_threshold * - * tnode_child_length(tn) - * - */ - - /* Keep root node larger */ - - if (!node_parent(tn)) { - inflate_threshold_use = inflate_threshold_root; - halve_threshold_use = halve_threshold_root; - } else { - inflate_threshold_use = inflate_threshold; - halve_threshold_use = halve_threshold; - } - max_work = MAX_WORK; - while ((tn->full_children > 0 && max_work-- && - 50 * (tn->full_children + tnode_child_length(tn) - - tn->empty_children) - >= inflate_threshold_use * tnode_child_length(tn))) { - + while (should_inflate(tn) && max_work--) { old_tn = tn; tn = inflate(t, tn); @@ -764,16 +772,11 @@ static struct tnode *resize(struct trie *t, struct tnode *tn) if (max_work != MAX_WORK) return tn; - /* - * Halve as long as the number of empty children in this + /* Halve as long as the number of empty children in this * node is above threshold. */ - max_work = MAX_WORK; - while (tn->bits > 1 && max_work-- && - 100 * (tnode_child_length(tn) - tn->empty_children) < - halve_threshold_use * tnode_child_length(tn)) { - + while (should_halve(tn) && max_work--) { old_tn = tn; tn = halve(t, tn); if (IS_ERR(tn)) { From ff181ed8768fa57fbda40fa97914f27fe275129b Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:43 -0800 Subject: [PATCH 13/17] fib_trie: Push assignment of child to parent down into inflate/halve This change makes it so that the assignment of the tnode to the parent is handled directly within whatever function is currently handling the node be it inflate, halve, or resize. By doing this we can avoid some of the need to set NULL pointers in the tree while we are resizing the subnodes. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 149 ++++++++++++++++++++------------------------ 1 file changed, 66 insertions(+), 83 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 58e1224786fa..7fbd2c5a76b1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -146,9 +146,7 @@ struct trie { #endif }; -static void tnode_put_child_reorg(struct tnode *tn, unsigned long i, - struct tnode *n, int wasfull); -static struct tnode *resize(struct trie *t, struct tnode *tn); +static void resize(struct trie *t, struct tnode *tn); /* tnodes to free after resize(); protected by RTNL */ static struct callback_head *tnode_free_head; static size_t tnode_free_size; @@ -396,22 +394,13 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n) return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n); } -static inline void put_child(struct tnode *tn, unsigned long i, - struct tnode *n) -{ - tnode_put_child_reorg(tn, i, n, -1); -} - - /* - * Add a child at position i overwriting the old value. - * Update the value of full_children and empty_children. - */ - -static void tnode_put_child_reorg(struct tnode *tn, unsigned long i, - struct tnode *n, int wasfull) +/* Add a child at position i overwriting the old value. + * Update the value of full_children and empty_children. + */ +static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) { struct tnode *chi = rtnl_dereference(tn->child[i]); - int isfull; + int isfull, wasfull; BUG_ON(i >= tnode_child_length(tn)); @@ -422,10 +411,9 @@ static void tnode_put_child_reorg(struct tnode *tn, unsigned long i, tn->empty_children--; /* update fullChildren */ - if (wasfull == -1) - wasfull = tnode_full(tn, chi); - + wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); + if (wasfull && !isfull) tn->full_children--; else if (!wasfull && isfull) @@ -458,9 +446,10 @@ static void tnode_clean_free(struct tnode *tn) node_free(tn); } -static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) +static int inflate(struct trie *t, struct tnode *oldtnode) { unsigned long olen = tnode_child_length(oldtnode); + struct tnode *tp = node_parent(oldtnode); struct tnode *tn; unsigned long i; t_key m; @@ -468,9 +457,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1); - if (!tn) - return ERR_PTR(-ENOMEM); + return -ENOMEM; /* * Preallocate and store tnodes before the actual work so we @@ -564,30 +552,36 @@ static struct tnode *inflate(struct trie *t, struct tnode *oldtnode) put_child(left, j, rtnl_dereference(inode->child[j])); put_child(right, j, rtnl_dereference(inode->child[j + size])); } - put_child(tn, 2*i, resize(t, left)); - put_child(tn, 2*i+1, resize(t, right)); + + put_child(tn, 2 * i, left); + put_child(tn, 2 * i + 1, right); tnode_free_safe(inode); + + resize(t, left); + resize(t, right); } + + put_child_root(tp, t, tn->key, tn); tnode_free_safe(oldtnode); - return tn; + return 0; nomem: tnode_clean_free(tn); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } -static struct tnode *halve(struct trie *t, struct tnode *oldtnode) +static int halve(struct trie *t, struct tnode *oldtnode) { unsigned long olen = tnode_child_length(oldtnode); + struct tnode *tp = node_parent(oldtnode); struct tnode *tn, *left, *right; int i; pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1); - if (!tn) - return ERR_PTR(-ENOMEM); + return -ENOMEM; /* * Preallocate and store tnodes before the actual work so we @@ -606,8 +600,10 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode) newn = tnode_new(left->key, oldtnode->pos, 1); - if (!newn) - goto nomem; + if (!newn) { + tnode_clean_free(tn); + return -ENOMEM; + } put_child(tn, i/2, newn); } @@ -635,16 +631,18 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode) /* Two nonempty children */ newBinNode = tnode_get_child(tn, i/2); - put_child(tn, i/2, NULL); put_child(newBinNode, 0, left); put_child(newBinNode, 1, right); - put_child(tn, i/2, resize(t, newBinNode)); + + put_child(tn, i / 2, newBinNode); + + resize(t, newBinNode); } + + put_child_root(tp, t, tn->key, tn); tnode_free_safe(oldtnode); - return tn; -nomem: - tnode_clean_free(tn); - return ERR_PTR(-ENOMEM); + + return 0; } /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of @@ -704,45 +702,48 @@ static struct tnode *halve(struct trie *t, struct tnode *oldtnode) * tnode_child_length(tn) * */ -static bool should_inflate(const struct tnode *tn) +static bool should_inflate(const struct tnode *tp, const struct tnode *tn) { unsigned long used = tnode_child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= node_parent(tn) ? inflate_threshold : - inflate_threshold_root; + threshold *= tp ? inflate_threshold : inflate_threshold_root; used += tn->full_children; used -= tn->empty_children; return tn->pos && ((50 * used) >= threshold); } -static bool should_halve(const struct tnode *tn) +static bool should_halve(const struct tnode *tp, const struct tnode *tn) { unsigned long used = tnode_child_length(tn); unsigned long threshold = used; /* Keep root node larger */ - threshold *= node_parent(tn) ? halve_threshold : - halve_threshold_root; + threshold *= tp ? halve_threshold : halve_threshold_root; used -= tn->empty_children; return (tn->bits > 1) && ((100 * used) < threshold); } #define MAX_WORK 10 -static struct tnode *resize(struct trie *t, struct tnode *tn) +static void resize(struct trie *t, struct tnode *tn) { - struct tnode *old_tn, *n = NULL; + struct tnode *tp = node_parent(tn), *n = NULL; + struct tnode __rcu **cptr; int max_work; - if (!tn) - return NULL; - pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); + /* track the tnode via the pointer from the parent instead of + * doing it ourselves. This way we can let RCU fully do its + * thing without us interfering + */ + cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie; + BUG_ON(tn != rtnl_dereference(*cptr)); + /* No children */ if (tn->empty_children > (tnode_child_length(tn) - 1)) goto no_children; @@ -755,39 +756,35 @@ static struct tnode *resize(struct trie *t, struct tnode *tn) * nonempty nodes that are above the threshold. */ max_work = MAX_WORK; - while (should_inflate(tn) && max_work--) { - old_tn = tn; - tn = inflate(t, tn); - - if (IS_ERR(tn)) { - tn = old_tn; + while (should_inflate(tp, tn) && max_work--) { + if (inflate(t, tn)) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(t->stats->resize_node_skipped); #endif break; } + + tn = rtnl_dereference(*cptr); } /* Return if at least one inflate is run */ if (max_work != MAX_WORK) - return tn; + return; /* Halve as long as the number of empty children in this * node is above threshold. */ max_work = MAX_WORK; - while (should_halve(tn) && max_work--) { - old_tn = tn; - tn = halve(t, tn); - if (IS_ERR(tn)) { - tn = old_tn; + while (should_halve(tp, tn) && max_work--) { + if (halve(t, tn)) { #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(t->stats->resize_node_skipped); #endif break; } - } + tn = rtnl_dereference(*cptr); + } /* Only one child remains */ if (tn->empty_children == (tnode_child_length(tn) - 1)) { @@ -797,11 +794,12 @@ static struct tnode *resize(struct trie *t, struct tnode *tn) n = tnode_get_child(tn, --i); no_children: /* compress one level */ - node_set_parent(n, NULL); + put_child_root(tp, t, tn->key, n); + node_set_parent(n, tp); + + /* drop dead node */ tnode_free_safe(tn); - return n; } - return tn; } /* readside must use rcu_read_lock currently dump routines @@ -882,34 +880,19 @@ static struct tnode *fib_find_node(struct trie *t, u32 key) static void trie_rebalance(struct trie *t, struct tnode *tn) { - int wasfull; - t_key cindex, key; struct tnode *tp; - key = tn->key; - - while (tn != NULL && (tp = node_parent(tn)) != NULL) { - cindex = get_index(key, tp); - wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); - tn = resize(t, tn); - - tnode_put_child_reorg(tp, cindex, tn, wasfull); - - tp = node_parent(tn); - if (!tp) - rcu_assign_pointer(t->trie, tn); + while ((tp = node_parent(tn)) != NULL) { + resize(t, tn); tnode_free_flush(); - if (!tp) - break; tn = tp; } /* Handle last (top) tnode */ if (IS_TNODE(tn)) - tn = resize(t, tn); + resize(t, tn); - rcu_assign_pointer(t->trie, tn); tnode_free_flush(); } From fc86a93b46d707c6e245b15127409340eaf6ae47 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:49 -0800 Subject: [PATCH 14/17] fib_trie: Push tnode flushing down to inflate/halve This change pushes the tnode freeing down into the inflate and halve functions. It makes more sense here as we have a better grasp of what is going on and when a given cluster of nodes is ready to be freed. I believe this may address a bug in the freeing logic as well. For some reason if the freelist got to a certain size we would call synchronize_rcu(). I'm assuming that what they meant to do is call synchronize_rcu() after they had handed off that much memory via call_rcu(). As such that is what I have updated the behavior to be. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 103 +++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 53 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 7fbd2c5a76b1..485983ef0444 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -147,8 +147,6 @@ struct trie { }; static void resize(struct trie *t, struct tnode *tn); -/* tnodes to free after resize(); protected by RTNL */ -static struct callback_head *tnode_free_head; static size_t tnode_free_size; /* @@ -307,32 +305,6 @@ static struct tnode *tnode_alloc(size_t size) return vzalloc(size); } -static void tnode_free_safe(struct tnode *tn) -{ - BUG_ON(IS_LEAF(tn)); - tn->rcu.next = tnode_free_head; - tnode_free_head = &tn->rcu; -} - -static void tnode_free_flush(void) -{ - struct callback_head *head; - - while ((head = tnode_free_head)) { - struct tnode *tn = container_of(head, struct tnode, rcu); - - tnode_free_head = head->next; - tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); - - node_free(tn); - } - - if (tnode_free_size >= PAGE_SIZE * sync_pages) { - tnode_free_size = 0; - synchronize_rcu(); - } -} - static struct tnode *leaf_new(t_key key) { struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL); @@ -433,17 +405,33 @@ static void put_child_root(struct tnode *tp, struct trie *t, rcu_assign_pointer(t->trie, n); } -static void tnode_clean_free(struct tnode *tn) +static inline void tnode_free_init(struct tnode *tn) { - struct tnode *tofree; - unsigned long i; + tn->rcu.next = NULL; +} - for (i = 0; i < tnode_child_length(tn); i++) { - tofree = tnode_get_child(tn, i); - if (tofree) - node_free(tofree); +static inline void tnode_free_append(struct tnode *tn, struct tnode *n) +{ + n->rcu.next = tn->rcu.next; + tn->rcu.next = &n->rcu; +} + +static void tnode_free(struct tnode *tn) +{ + struct callback_head *head = &tn->rcu; + + while (head) { + head = head->next; + tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]); + node_free(tn); + + tn = container_of(head, struct tnode, rcu); + } + + if (tnode_free_size >= PAGE_SIZE * sync_pages) { + tnode_free_size = 0; + synchronize_rcu(); } - node_free(tn); } static int inflate(struct trie *t, struct tnode *oldtnode) @@ -476,20 +464,23 @@ static int inflate(struct trie *t, struct tnode *oldtnode) inode->bits - 1); if (!left) goto nomem; + tnode_free_append(tn, left); right = tnode_new(inode->key | m, inode->pos, inode->bits - 1); - if (!right) { - node_free(left); + if (!right) goto nomem; - } + tnode_free_append(tn, right); put_child(tn, 2*i, left); put_child(tn, 2*i+1, right); } } + /* prepare oldtnode to be freed */ + tnode_free_init(oldtnode); + for (i = 0; i < olen; i++) { struct tnode *inode = tnode_get_child(oldtnode, i); struct tnode *left, *right; @@ -505,12 +496,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode) continue; } + /* drop the node in the old tnode free list */ + tnode_free_append(oldtnode, inode); + /* An internal node with two children */ if (inode->bits == 1) { put_child(tn, 2*i, rtnl_dereference(inode->child[0])); put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); - - tnode_free_safe(inode); continue; } @@ -556,17 +548,19 @@ static int inflate(struct trie *t, struct tnode *oldtnode) put_child(tn, 2 * i, left); put_child(tn, 2 * i + 1, right); - tnode_free_safe(inode); - + /* resize child nodes */ resize(t, left); resize(t, right); } put_child_root(tp, t, tn->key, tn); - tnode_free_safe(oldtnode); + + /* we completed without error, prepare to free old node */ + tnode_free(oldtnode); return 0; nomem: - tnode_clean_free(tn); + /* all pointers should be clean so we are done */ + tnode_free(tn); return -ENOMEM; } @@ -599,17 +593,20 @@ static int halve(struct trie *t, struct tnode *oldtnode) struct tnode *newn; newn = tnode_new(left->key, oldtnode->pos, 1); - if (!newn) { - tnode_clean_free(tn); + tnode_free(tn); return -ENOMEM; } + tnode_free_append(tn, newn); put_child(tn, i/2, newn); } } + /* prepare oldtnode to be freed */ + tnode_free_init(oldtnode); + for (i = 0; i < olen; i += 2) { struct tnode *newBinNode; @@ -636,11 +633,14 @@ static int halve(struct trie *t, struct tnode *oldtnode) put_child(tn, i / 2, newBinNode); + /* resize child node */ resize(t, newBinNode); } put_child_root(tp, t, tn->key, tn); - tnode_free_safe(oldtnode); + + /* all pointers should be clean so we are done */ + tnode_free(oldtnode); return 0; } @@ -798,7 +798,8 @@ static void resize(struct trie *t, struct tnode *tn) node_set_parent(n, tp); /* drop dead node */ - tnode_free_safe(tn); + tnode_free_init(tn); + tnode_free(tn); } } @@ -884,16 +885,12 @@ static void trie_rebalance(struct trie *t, struct tnode *tn) while ((tp = node_parent(tn)) != NULL) { resize(t, tn); - - tnode_free_flush(); tn = tp; } /* Handle last (top) tnode */ if (IS_TNODE(tn)) resize(t, tn); - - tnode_free_flush(); } /* only used from updater-side */ From 12c081a5c82ef64a90906247cecfa5422962e387 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:56:55 -0800 Subject: [PATCH 15/17] fib_trie: inflate/halve nodes in a more RCU friendly way This change pulls the node_set_parent functionality out of put_child_reorg and instead leaves that to the function to take care of as well. By doing this we can fully construct the new cluster of tnodes and all of the pointers out of it before we start routing pointers into it. I am suspecting this will likely fix some concurency issues though I don't have a good test to show as such. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 246 +++++++++++++++++++++----------------------- 1 file changed, 120 insertions(+), 126 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 485983ef0444..0c88df09d25d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -391,8 +391,6 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) else if (!wasfull && isfull) tn->full_children++; - node_set_parent(n, tn); - rcu_assign_pointer(tn->child[i], n); } @@ -436,10 +434,8 @@ static void tnode_free(struct tnode *tn) static int inflate(struct trie *t, struct tnode *oldtnode) { - unsigned long olen = tnode_child_length(oldtnode); - struct tnode *tp = node_parent(oldtnode); - struct tnode *tn; - unsigned long i; + struct tnode *inode, *node0, *node1, *tn, *tp; + unsigned long i, j, k; t_key m; pr_debug("In inflate\n"); @@ -448,43 +444,13 @@ static int inflate(struct trie *t, struct tnode *oldtnode) if (!tn) return -ENOMEM; - /* - * Preallocate and store tnodes before the actual work so we - * don't get into an inconsistent state if memory allocation - * fails. In case of failure we return the oldnode and inflate - * of tnode is ignored. + /* Assemble all of the pointers in our cluster, in this case that + * represents all of the pointers out of our allocated nodes that + * point to existing tnodes and the links between our allocated + * nodes. */ - for (i = 0, m = 1u << tn->pos; i < olen; i++) { - struct tnode *inode = tnode_get_child(oldtnode, i); - - if (tnode_full(oldtnode, inode) && (inode->bits > 1)) { - struct tnode *left, *right; - - left = tnode_new(inode->key & ~m, inode->pos, - inode->bits - 1); - if (!left) - goto nomem; - tnode_free_append(tn, left); - - right = tnode_new(inode->key | m, inode->pos, - inode->bits - 1); - - if (!right) - goto nomem; - tnode_free_append(tn, right); - - put_child(tn, 2*i, left); - put_child(tn, 2*i+1, right); - } - } - - /* prepare oldtnode to be freed */ - tnode_free_init(oldtnode); - - for (i = 0; i < olen; i++) { - struct tnode *inode = tnode_get_child(oldtnode, i); - struct tnode *left, *right; - unsigned long size, j; + for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) { + inode = tnode_get_child(oldtnode, --i); /* An empty child */ if (inode == NULL) @@ -496,65 +462,99 @@ static int inflate(struct trie *t, struct tnode *oldtnode) continue; } - /* drop the node in the old tnode free list */ - tnode_free_append(oldtnode, inode); - /* An internal node with two children */ if (inode->bits == 1) { - put_child(tn, 2*i, rtnl_dereference(inode->child[0])); - put_child(tn, 2*i+1, rtnl_dereference(inode->child[1])); + put_child(tn, 2 * i + 1, tnode_get_child(inode, 1)); + put_child(tn, 2 * i, tnode_get_child(inode, 0)); continue; } - /* An internal node with more than two children */ - /* We will replace this node 'inode' with two new - * ones, 'left' and 'right', each with half of the + * ones, 'node0' and 'node1', each with half of the * original children. The two new nodes will have * a position one bit further down the key and this * means that the "significant" part of their keys * (see the discussion near the top of this file) * will differ by one bit, which will be "0" in - * left's key and "1" in right's key. Since we are + * node0's key and "1" in node1's key. Since we are * moving the key position by one step, the bit that * we are moving away from - the bit at position - * (inode->pos) - is the one that will differ between - * left and right. So... we synthesize that bit in the - * two new keys. - * The mask 'm' below will be a single "one" bit at - * the position (inode->pos) + * (tn->pos) - is the one that will differ between + * node0 and node1. So... we synthesize that bit in the + * two new keys. */ + node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1); + if (!node1) + goto nomem; + tnode_free_append(tn, node1); - /* Use the old key, but set the new significant - * bit to zero. - */ + node0 = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1); + if (!node0) + goto nomem; + tnode_free_append(tn, node0); - left = tnode_get_child(tn, 2*i); - put_child(tn, 2*i, NULL); - - BUG_ON(!left); - - right = tnode_get_child(tn, 2*i+1); - put_child(tn, 2*i+1, NULL); - - BUG_ON(!right); - - size = tnode_child_length(left); - for (j = 0; j < size; j++) { - put_child(left, j, rtnl_dereference(inode->child[j])); - put_child(right, j, rtnl_dereference(inode->child[j + size])); + /* populate child pointers in new nodes */ + for (k = tnode_child_length(inode), j = k / 2; j;) { + put_child(node1, --j, tnode_get_child(inode, --k)); + put_child(node0, j, tnode_get_child(inode, j)); + put_child(node1, --j, tnode_get_child(inode, --k)); + put_child(node0, j, tnode_get_child(inode, j)); } - put_child(tn, 2 * i, left); - put_child(tn, 2 * i + 1, right); + /* link new nodes to parent */ + NODE_INIT_PARENT(node1, tn); + NODE_INIT_PARENT(node0, tn); - /* resize child nodes */ - resize(t, left); - resize(t, right); + /* link parent to nodes */ + put_child(tn, 2 * i + 1, node1); + put_child(tn, 2 * i, node0); } + /* setup the parent pointer into and out of this node */ + tp = node_parent(oldtnode); + NODE_INIT_PARENT(tn, tp); put_child_root(tp, t, tn->key, tn); + /* prepare oldtnode to be freed */ + tnode_free_init(oldtnode); + + /* update all child nodes parent pointers to route to us */ + for (i = tnode_child_length(oldtnode); i;) { + inode = tnode_get_child(oldtnode, --i); + + /* A leaf or an internal node with skipped bits */ + if (!tnode_full(oldtnode, inode)) { + node_set_parent(inode, tn); + continue; + } + + /* drop the node in the old tnode free list */ + tnode_free_append(oldtnode, inode); + + /* fetch new nodes */ + node1 = tnode_get_child(tn, 2 * i + 1); + node0 = tnode_get_child(tn, 2 * i); + + /* bits == 1 then node0 and node1 represent inode's children */ + if (inode->bits == 1) { + node_set_parent(node1, tn); + node_set_parent(node0, tn); + continue; + } + + /* update parent pointers in child node's children */ + for (k = tnode_child_length(inode), j = k / 2; j;) { + node_set_parent(tnode_get_child(inode, --k), node1); + node_set_parent(tnode_get_child(inode, --j), node0); + node_set_parent(tnode_get_child(inode, --k), node1); + node_set_parent(tnode_get_child(inode, --j), node0); + } + + /* resize child nodes */ + resize(t, node1); + resize(t, node0); + } + /* we completed without error, prepare to free old node */ tnode_free(oldtnode); return 0; @@ -566,10 +566,8 @@ static int inflate(struct trie *t, struct tnode *oldtnode) static int halve(struct trie *t, struct tnode *oldtnode) { - unsigned long olen = tnode_child_length(oldtnode); - struct tnode *tp = node_parent(oldtnode); - struct tnode *tn, *left, *right; - int i; + struct tnode *tn, *tp, *inode, *node0, *node1; + unsigned long i; pr_debug("In halve\n"); @@ -577,68 +575,64 @@ static int halve(struct trie *t, struct tnode *oldtnode) if (!tn) return -ENOMEM; - /* - * Preallocate and store tnodes before the actual work so we - * don't get into an inconsistent state if memory allocation - * fails. In case of failure we return the oldnode and halve - * of tnode is ignored. + /* Assemble all of the pointers in our cluster, in this case that + * represents all of the pointers out of our allocated nodes that + * point to existing tnodes and the links between our allocated + * nodes. */ + for (i = tnode_child_length(oldtnode); i;) { + node1 = tnode_get_child(oldtnode, --i); + node0 = tnode_get_child(oldtnode, --i); - for (i = 0; i < olen; i += 2) { - left = tnode_get_child(oldtnode, i); - right = tnode_get_child(oldtnode, i+1); - - /* Two nonempty children */ - if (left && right) { - struct tnode *newn; - - newn = tnode_new(left->key, oldtnode->pos, 1); - if (!newn) { - tnode_free(tn); - return -ENOMEM; - } - tnode_free_append(tn, newn); - - put_child(tn, i/2, newn); + /* At least one of the children is empty */ + if (!node1 || !node0) { + put_child(tn, i / 2, node1 ? : node0); + continue; } + /* Two nonempty children */ + inode = tnode_new(node0->key, oldtnode->pos, 1); + if (!inode) { + tnode_free(tn); + return -ENOMEM; + } + tnode_free_append(tn, inode); + + /* initialize pointers out of node */ + put_child(inode, 1, node1); + put_child(inode, 0, node0); + NODE_INIT_PARENT(inode, tn); + + /* link parent to node */ + put_child(tn, i / 2, inode); } + /* setup the parent pointer out of and back into this node */ + tp = node_parent(oldtnode); + NODE_INIT_PARENT(tn, tp); + put_child_root(tp, t, tn->key, tn); + /* prepare oldtnode to be freed */ tnode_free_init(oldtnode); - for (i = 0; i < olen; i += 2) { - struct tnode *newBinNode; + /* update all of the child parent pointers */ + for (i = tnode_child_length(tn); i;) { + inode = tnode_get_child(tn, --i); - left = tnode_get_child(oldtnode, i); - right = tnode_get_child(oldtnode, i+1); - - /* At least one of the children is empty */ - if (left == NULL) { - if (right == NULL) /* Both are empty */ - continue; - put_child(tn, i/2, right); - continue; - } - - if (right == NULL) { - put_child(tn, i/2, left); + /* only new tnodes will be considered "full" nodes */ + if (!tnode_full(tn, inode)) { + node_set_parent(inode, tn); continue; } /* Two nonempty children */ - newBinNode = tnode_get_child(tn, i/2); - put_child(newBinNode, 0, left); - put_child(newBinNode, 1, right); - - put_child(tn, i / 2, newBinNode); + node_set_parent(tnode_get_child(inode, 1), inode); + node_set_parent(tnode_get_child(inode, 0), inode); /* resize child node */ - resize(t, newBinNode); + resize(t, inode); } - put_child_root(tp, t, tn->key, tn); - /* all pointers should be clean so we are done */ tnode_free(oldtnode); From 21d1f11db0e2f20a549ad8322879507850544670 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:57:02 -0800 Subject: [PATCH 16/17] fib_trie: Remove checks for index >= tnode_child_length from tnode_get_child For some reason the compiler doesn't seem to understand that when we are in a loop that runs from tnode_child_length - 1 to 0 we don't expect the value of tn->bits to change. As such every call to tnode_get_child was rerunning tnode_chile_length which ended up consuming quite a bit of space in the resultant assembly code. I have gone though and verified that in all cases where tnode_get_child is used we are either winding though a fixed loop from tnode_child_length - 1 to 0, or are in a fastpath case where we are verifying the value by either checking for any remaining bits after shifting index by bits and testing for leaf, or by using tnode_child_length. size net/ipv4/fib_trie.o Before: text data bss dec hex filename 15506 376 8 15890 3e12 net/ipv4/fib_trie.o After: text data bss dec hex filename 14827 376 8 15211 3b6b net/ipv4/fib_trie.o Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 0c88df09d25d..87fc9a466fa2 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -186,8 +186,6 @@ static inline unsigned long tnode_child_length(const struct tnode *tn) static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned long i) { - BUG_ON(i >= tnode_child_length(tn)); - return rtnl_dereference(tn->child[i]); } @@ -195,8 +193,6 @@ static inline struct tnode *tnode_get_child(const struct tnode *tn, static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned long i) { - BUG_ON(i >= tnode_child_length(tn)); - return rcu_dereference_rtnl(tn->child[i]); } @@ -371,7 +367,7 @@ static inline int tnode_full(const struct tnode *tn, const struct tnode *n) */ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) { - struct tnode *chi = rtnl_dereference(tn->child[i]); + struct tnode *chi = tnode_get_child(tn, i); int isfull, wasfull; BUG_ON(i >= tnode_child_length(tn)); @@ -867,7 +863,7 @@ static struct tnode *fib_find_node(struct trie *t, u32 key) if (IS_LEAF(n)) break; - n = rcu_dereference_rtnl(n->child[index]); + n = tnode_get_child_rcu(n, index); } return n; @@ -934,7 +930,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) } tp = n; - n = rcu_dereference_rtnl(n->child[index]); + n = tnode_get_child_rcu(n, index); } l = leaf_new(key); @@ -1215,7 +1211,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, cindex = index; } - n = rcu_dereference(n->child[index]); + n = tnode_get_child_rcu(n, index); if (unlikely(!n)) goto backtrace; } @@ -1835,7 +1831,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) if (n->bits < MAX_STAT_DEPTH) s->nodesizes[n->bits]++; - for (i = 0; i < tnode_child_length(n); i++) { + for (i = tnode_child_length(n); i--;) { if (!rcu_access_pointer(n->child[i])) s->nullpointers++; } From 5405afd1a30620de8601436bae739c67c0bc7324 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 31 Dec 2014 10:57:08 -0800 Subject: [PATCH 17/17] fib_trie: Add tracking value for suffix length This change adds a tracking value for the maximum suffix length of all prefixes stored in any given tnode. With this value we can determine if we need to backtrace or not based on if the suffix is greater than the pos value. By doing this we can reduce the CPU overhead for lookups in the local table as many of the prefixes there are 32b long and have a suffix length of 0 meaning we can immediately backtrace to the root node without needing to test any of the nodes between it and where we ended up. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 122 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 116 insertions(+), 6 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 87fc9a466fa2..281e5e00025f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -96,6 +96,7 @@ struct tnode { t_key key; unsigned char bits; /* 2log(KEYLENGTH) bits needed */ unsigned char pos; /* 2log(KEYLENGTH) bits needed */ + unsigned char slen; struct tnode __rcu *parent; struct rcu_head rcu; union { @@ -311,6 +312,7 @@ static struct tnode *leaf_new(t_key key) * as the nodes are searched */ l->key = key; + l->slen = 0; l->pos = 0; /* set bits to 0 indicating we are not a tnode */ l->bits = 0; @@ -342,6 +344,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) if (tn) { tn->parent = NULL; + tn->slen = pos; tn->pos = pos; tn->bits = bits; tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0; @@ -387,6 +390,9 @@ static void put_child(struct tnode *tn, unsigned long i, struct tnode *n) else if (!wasfull && isfull) tn->full_children++; + if (n && (tn->slen < n->slen)) + tn->slen = n->slen; + rcu_assign_pointer(tn->child[i], n); } @@ -635,6 +641,41 @@ static int halve(struct trie *t, struct tnode *oldtnode) return 0; } +static unsigned char update_suffix(struct tnode *tn) +{ + unsigned char slen = tn->pos; + unsigned long stride, i; + + /* search though the list of children looking for nodes that might + * have a suffix greater than the one we currently have. This is + * why we start with a stride of 2 since a stride of 1 would + * represent the nodes with suffix length equal to tn->pos + */ + for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) { + struct tnode *n = tnode_get_child(tn, i); + + if (!n || (n->slen <= slen)) + continue; + + /* update stride and slen based on new value */ + stride <<= (n->slen - slen); + slen = n->slen; + i &= ~(stride - 1); + + /* if slen covers all but the last bit we can stop here + * there will be nothing longer than that since only node + * 0 and 1 << (bits - 1) could have that as their suffix + * length. + */ + if ((slen + 1) >= (tn->pos + tn->bits)) + break; + } + + tn->slen = slen; + + return slen; +} + /* From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: @@ -790,6 +831,19 @@ static void resize(struct trie *t, struct tnode *tn) /* drop dead node */ tnode_free_init(tn); tnode_free(tn); + return; + } + + /* Return if at least one deflate was run */ + if (max_work != MAX_WORK) + return; + + /* push the suffix length to the parent node */ + if (tn->slen > tn->pos) { + unsigned char slen = update_suffix(tn); + + if (tp && (slen > tp->slen)) + tp->slen = slen; } } @@ -818,8 +872,58 @@ static inline struct list_head *get_fa_head(struct tnode *l, int plen) return &li->falh; } -static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) +static void leaf_pull_suffix(struct tnode *l) { + struct tnode *tp = node_parent(l); + + while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) { + if (update_suffix(tp) > l->slen) + break; + tp = node_parent(tp); + } +} + +static void leaf_push_suffix(struct tnode *l) +{ + struct tnode *tn = node_parent(l); + + /* if this is a new leaf then tn will be NULL and we can sort + * out parent suffix lengths as a part of trie_rebalance + */ + while (tn && (tn->slen < l->slen)) { + tn->slen = l->slen; + tn = node_parent(tn); + } +} + +static void remove_leaf_info(struct tnode *l, struct leaf_info *old) +{ + struct hlist_node *prev; + + /* record the location of the pointer to this object */ + prev = rtnl_dereference(hlist_pprev_rcu(&old->hlist)); + + /* remove the leaf info from the list */ + hlist_del_rcu(&old->hlist); + + /* if we emptied the list this leaf will be freed and we can sort + * out parent suffix lengths as a part of trie_rebalance + */ + if (hlist_empty(&l->list)) + return; + + /* if we removed the tail then we need to update slen */ + if (!rcu_access_pointer(hlist_next_rcu(prev))) { + struct leaf_info *li = hlist_entry(prev, typeof(*li), hlist); + + l->slen = KEYLENGTH - li->plen; + leaf_pull_suffix(l); + } +} + +static void insert_leaf_info(struct tnode *l, struct leaf_info *new) +{ + struct hlist_head *head = &l->list; struct leaf_info *li = NULL, *last = NULL; if (hlist_empty(head)) { @@ -836,6 +940,12 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) else hlist_add_before_rcu(&new->hlist, &li->hlist); } + + /* if we added to the tail node then we need to update slen */ + if (!rcu_access_pointer(hlist_next_rcu(&new->hlist))) { + l->slen = KEYLENGTH - new->plen; + leaf_push_suffix(l); + } } /* rcu_read_lock needs to be hold by caller from readside */ @@ -925,7 +1035,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) /* we have found a leaf. Prefixes have already been compared */ if (IS_LEAF(n)) { /* Case 1: n is a leaf, and prefixes match*/ - insert_leaf_info(&n->list, li); + insert_leaf_info(n, li); return fa_head; } @@ -939,7 +1049,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen) return NULL; } - insert_leaf_info(&l->list, li); + insert_leaf_info(l, li); /* Case 2: n is a LEAF or a TNODE and the key doesn't match. * @@ -1206,7 +1316,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, /* only record pn and cindex if we are going to be chopping * bits later. Otherwise we are just wasting cycles. */ - if (index) { + if (n->slen > n->pos) { pn = n; cindex = index; } @@ -1225,7 +1335,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, * between the key and the prefix exist in the region of * the lsb and higher in the prefix. */ - if (unlikely(prefix_mismatch(key, n))) + if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos)) goto backtrace; /* exit out and process leaf */ @@ -1425,7 +1535,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) tb->tb_num_default--; if (list_empty(fa_head)) { - hlist_del_rcu(&li->hlist); + remove_leaf_info(l, li); free_leaf_info(li); }