diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2597077442e5..31906f90706e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -451,6 +451,7 @@ struct nft_set_ext; * @init: initialize private data of new set instance * @destroy: destroy private data of set instance * @gc_init: initialize garbage collection + * @abort_skip_removal: skip removal of elements from abort path * @elemsize: element private size * * Operations lookup, update and delete have simpler interfaces, are faster @@ -508,6 +509,7 @@ struct nft_set_ops { const struct nft_set *set); void (*gc_init)(const struct nft_set *set); + bool abort_skip_removal; unsigned int elemsize; }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e7247363c643..be4924aeaf0e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -7807,7 +7807,8 @@ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, continue; } - if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) + if (!te->set->ops->abort_skip_removal || + nft_setelem_is_catchall(te->set, te->elems[i].priv)) nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 6d77a5f0088a..18e1903b1d3d 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2370,6 +2370,7 @@ const struct nft_set_type nft_set_pipapo_type = { .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, + .abort_skip_removal = true, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2394,6 +2395,7 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, + .abort_skip_removal = true, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index ca594161b840..7598c368c4e5 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -10,15 +10,29 @@ #include #include #include +#include #include #include #include #include +struct nft_array_interval { + struct nft_set_ext *from; + struct nft_set_ext *to; +}; + +struct nft_array { + u32 max_intervals; + u32 num_intervals; + struct nft_array_interval *intervals; + struct rcu_head rcu_head; +}; + struct nft_rbtree { struct rb_root root; rwlock_t lock; - seqcount_rwlock_t count; + struct nft_array __rcu *array; + struct nft_array *array_next; unsigned long last_gc; }; @@ -47,67 +61,33 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } -static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) +struct nft_array_lookup_ctx { + const u32 *key; + u32 klen; +}; + +static int nft_array_lookup_cmp(const void *pkey, const void *entry) { - return nft_set_elem_expired(&rbe->ext); -} + const struct nft_array_interval *interval = entry; + const struct nft_array_lookup_ctx *ctx = pkey; + int a, b; -static const struct nft_set_ext * -__nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, unsigned int seq) -{ - struct nft_rbtree *priv = nft_set_priv(set); - const struct nft_rbtree_elem *rbe, *interval = NULL; - u8 genmask = nft_genmask_cur(net); - const struct rb_node *parent; - int d; + if (!interval->from) + return 1; - parent = rcu_dereference_raw(priv->root.rb_node); - while (parent != NULL) { - if (read_seqcount_retry(&priv->count, seq)) - return NULL; + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); + if (!interval->to) + b = -1; + else + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); - rbe = rb_entry(parent, struct nft_rbtree_elem, node); + if (a >= 0 && b < 0) + return 0; - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); - if (d < 0) { - parent = rcu_dereference_raw(parent->rb_left); - if (interval && - !nft_rbtree_cmp(set, rbe, interval) && - nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(interval)) - continue; - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_rbtree_elem_expired(rbe)) - interval = rbe; - } else if (d > 0) - parent = rcu_dereference_raw(parent->rb_right); - else { - if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = rcu_dereference_raw(parent->rb_left); - continue; - } + if (a < 0) + return -1; - if (nft_rbtree_elem_expired(rbe)) - return NULL; - - if (nft_rbtree_interval_end(rbe)) { - if (nft_set_is_anonymous(set)) - return NULL; - parent = rcu_dereference_raw(parent->rb_left); - interval = NULL; - continue; - } - - return &rbe->ext; - } - } - - if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_rbtree_interval_start(interval)) - return &interval->ext; - - return NULL; + return 1; } INDIRECT_CALLABLE_SCOPE @@ -116,83 +96,57 @@ nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_rbtree *priv = nft_set_priv(set); - unsigned int seq = read_seqcount_begin(&priv->count); - const struct nft_set_ext *ext; + struct nft_array *array = rcu_dereference(priv->array); + const struct nft_array_interval *interval; + struct nft_array_lookup_ctx ctx = { + .key = key, + .klen = set->klen, + }; - ext = __nft_rbtree_lookup(net, set, key, seq); - if (ext || !read_seqcount_retry(&priv->count, seq)) - return ext; + if (!array) + return NULL; - read_lock_bh(&priv->lock); - seq = read_seqcount_begin(&priv->count); - ext = __nft_rbtree_lookup(net, set, key, seq); - read_unlock_bh(&priv->lock); + interval = bsearch(&ctx, array->intervals, array->num_intervals, + sizeof(struct nft_array_interval), + nft_array_lookup_cmp); + if (!interval || nft_set_elem_expired(interval->from)) + return NULL; - return ext; + return interval->from; } -static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, - const u32 *key, struct nft_rbtree_elem **elem, - unsigned int seq, unsigned int flags, u8 genmask) +struct nft_array_get_ctx { + const u32 *key; + unsigned int flags; + u32 klen; +}; + +static int nft_array_get_cmp(const void *pkey, const void *entry) { - struct nft_rbtree_elem *rbe, *interval = NULL; - struct nft_rbtree *priv = nft_set_priv(set); - const struct rb_node *parent; - const void *this; - int d; + const struct nft_array_interval *interval = entry; + const struct nft_array_get_ctx *ctx = pkey; + int a, b; - parent = rcu_dereference_raw(priv->root.rb_node); - while (parent != NULL) { - if (read_seqcount_retry(&priv->count, seq)) - return false; + if (!interval->from) + return 1; - rbe = rb_entry(parent, struct nft_rbtree_elem, node); + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); + if (!interval->to) + b = -1; + else + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); - if (d < 0) { - parent = rcu_dereference_raw(parent->rb_left); - if (!(flags & NFT_SET_ELEM_INTERVAL_END)) - interval = rbe; - } else if (d > 0) { - parent = rcu_dereference_raw(parent->rb_right); - if (flags & NFT_SET_ELEM_INTERVAL_END) - interval = rbe; - } else { - if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = rcu_dereference_raw(parent->rb_left); - continue; - } - - if (nft_set_elem_expired(&rbe->ext)) - return false; - - if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || - (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == - (flags & NFT_SET_ELEM_INTERVAL_END)) { - *elem = rbe; - return true; - } - - if (nft_rbtree_interval_end(rbe)) - interval = NULL; - - parent = rcu_dereference_raw(parent->rb_left); - } + if (a >= 0) { + if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0) + return 0; + else if (b < 0) + return 0; } - if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && - ((!nft_rbtree_interval_end(interval) && - !(flags & NFT_SET_ELEM_INTERVAL_END)) || - (nft_rbtree_interval_end(interval) && - (flags & NFT_SET_ELEM_INTERVAL_END)))) { - *elem = interval; - return true; - } + if (a < 0) + return -1; - return false; + return 1; } static struct nft_elem_priv * @@ -200,24 +154,28 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); - unsigned int seq = read_seqcount_begin(&priv->count); - struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); - const u32 *key = (const u32 *)&elem->key.val; - u8 genmask = nft_genmask_cur(net); - bool ret; + struct nft_array *array = rcu_dereference(priv->array); + const struct nft_array_interval *interval; + struct nft_array_get_ctx ctx = { + .key = (const u32 *)&elem->key.val, + .flags = flags, + .klen = set->klen, + }; + struct nft_rbtree_elem *rbe; - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - if (ret || !read_seqcount_retry(&priv->count, seq)) - return &rbe->priv; - - read_lock_bh(&priv->lock); - seq = read_seqcount_begin(&priv->count); - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - read_unlock_bh(&priv->lock); - - if (!ret) + if (!array) return ERR_PTR(-ENOENT); + interval = bsearch(&ctx, array->intervals, array->num_intervals, + sizeof(struct nft_array_interval), nft_array_get_cmp); + if (!interval || nft_set_elem_expired(interval->from)) + return ERR_PTR(-ENOENT); + + if (flags & NFT_SET_ELEM_INTERVAL_END) + rbe = container_of(interval->to, struct nft_rbtree_elem, ext); + else + rbe = container_of(interval->from, struct nft_rbtree_elem, ext); + return &rbe->priv; } @@ -481,6 +439,87 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, return 0; } +static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals) +{ + struct nft_array_interval *intervals; + + intervals = kvcalloc(max_intervals, sizeof(struct nft_array_interval), + GFP_KERNEL_ACCOUNT); + if (!intervals) + return -ENOMEM; + + if (array->intervals) + kvfree(array->intervals); + + array->intervals = intervals; + array->max_intervals = max_intervals; + + return 0; +} + +static struct nft_array *nft_array_alloc(u32 max_intervals) +{ + struct nft_array *array; + + array = kzalloc(sizeof(*array), GFP_KERNEL_ACCOUNT); + if (!array) + return NULL; + + if (nft_array_intervals_alloc(array, max_intervals) < 0) { + kfree(array); + return NULL; + } + + return array; +} + +#define NFT_ARRAY_EXTRA_SIZE 10240 + +/* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider + * packed representation coming from userspace for anonymous sets too. + */ +static u32 nft_array_elems(const struct nft_set *set) +{ + u32 nelems = atomic_read(&set->nelems); + + /* Adjacent intervals are represented with a single start element in + * anonymous sets, use the current element counter as is. + */ + if (nft_set_is_anonymous(set)) + return nelems; + + /* Add extra room for never matching interval at the beginning and open + * interval at the end which only use a single element to represent it. + * The conversion to array will compact intervals, this allows reduce + * memory consumption. + */ + return (nelems / 2) + 2; +} + +static int nft_array_may_resize(const struct nft_set *set) +{ + u32 nelems = nft_array_elems(set), new_max_intervals; + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_array *array; + + if (!priv->array_next) { + array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE); + if (!array) + return -ENOMEM; + + priv->array_next = array; + } + + if (nelems < priv->array_next->max_intervals) + return 0; + + new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE; + if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) + return -ENOMEM; + + return 0; +} + static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv) @@ -489,6 +528,9 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree *priv = nft_set_priv(set); int err; + if (nft_array_may_resize(set) < 0) + return -ENOMEM; + do { if (fatal_signal_pending(current)) return -EINTR; @@ -496,9 +538,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, cond_resched(); write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); err = __nft_rbtree_insert(net, set, rbe, elem_priv); - write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } while (err == -EAGAIN); @@ -508,9 +548,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe) { write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); rb_erase(&rbe->node, &priv->root); - write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } @@ -553,6 +591,9 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, u64 tstamp = nft_net_tstamp(net); int d; + if (nft_array_may_resize(set) < 0) + return NULL; + while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -615,6 +656,11 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, switch (iter->type) { case NFT_ITER_UPDATE: lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (nft_array_may_resize(set) < 0) { + iter->err = -ENOMEM; + break; + } nft_rbtree_do_walk(ctx, set, iter); break; case NFT_ITER_READ: @@ -714,17 +760,26 @@ static int nft_rbtree_init(const struct nft_set *set, BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); rwlock_init(&priv->lock); - seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; + priv->array = NULL; + priv->array_next = NULL; + return 0; } +static void __nft_array_free(struct nft_array *array) +{ + kvfree(array->intervals); + kfree(array); +} + static void nft_rbtree_destroy(const struct nft_ctx *ctx, const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; + struct nft_array *array; struct rb_node *node; while ((node = priv->root.rb_node) != NULL) { @@ -732,6 +787,12 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx, rbe = rb_entry(node, struct nft_rbtree_elem, node); nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } + + array = rcu_dereference_protected(priv->array, true); + if (array) + __nft_array_free(array); + if (priv->array_next) + __nft_array_free(priv->array_next); } static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, @@ -752,12 +813,91 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static void nft_array_free_rcu(struct rcu_head *rcu_head) +{ + struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head); + + __nft_array_free(array); +} + static void nft_rbtree_commit(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe, *prev_rbe; + struct nft_array *old; + u32 num_intervals = 0; + struct rb_node *node; if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) nft_rbtree_gc(set); + + /* No changes, skip, eg. elements updates only. */ + if (!priv->array_next) + return; + + /* Reverse walk to create an array from smaller to largest interval. */ + node = rb_last(&priv->root); + if (node) + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); + else + prev_rbe = NULL; + + while (prev_rbe) { + rbe = prev_rbe; + + if (nft_rbtree_interval_start(rbe)) + priv->array_next->intervals[num_intervals].from = &rbe->ext; + else if (nft_rbtree_interval_end(rbe)) + priv->array_next->intervals[num_intervals++].to = &rbe->ext; + + if (num_intervals >= priv->array_next->max_intervals) { + pr_warn_once("malformed interval set from userspace?"); + goto err_out; + } + + node = rb_prev(node); + if (!node) + break; + + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); + + /* For anonymous sets, when adjacent ranges are found, + * the end element is not added to the set to pack the set + * representation. Use next start element to complete this + * interval. + */ + if (nft_rbtree_interval_start(rbe) && + nft_rbtree_interval_start(prev_rbe) && + priv->array_next->intervals[num_intervals].from) + priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext; + + if (num_intervals >= priv->array_next->max_intervals) { + pr_warn_once("malformed interval set from userspace?"); + goto err_out; + } + } + + if (priv->array_next->intervals[num_intervals].from) + num_intervals++; +err_out: + priv->array_next->num_intervals = num_intervals; + old = rcu_replace_pointer(priv->array, priv->array_next, true); + priv->array_next = NULL; + if (old) + call_rcu(&old->rcu_head, nft_array_free_rcu); +} + +static void nft_rbtree_abort(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_array *array_next; + + if (!priv->array_next) + return; + + array_next = priv->array_next; + priv->array_next = NULL; + __nft_array_free(array_next); } static void nft_rbtree_gc_init(const struct nft_set *set) @@ -821,6 +961,7 @@ const struct nft_set_type nft_set_rbtree_type = { .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, .commit = nft_rbtree_commit, + .abort = nft_rbtree_abort, .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk,