mirror of
https://github.com/torvalds/linux.git
synced 2026-06-02 19:43:40 +02:00
netfilter pull request nf-next-26-02-06
-----BEGIN PGP SIGNATURE----- iQJdBAABCABHFiEEgKkgxbID4Gn1hq6fcJGo2a1f9gAFAmmGB20bFIAAAAAABAAO bWFudTIsMi41KzEuMTEsMiwyDRxmd0BzdHJsZW4uZGUACgkQcJGo2a1f9gC/tQ/7 B7/akiCP/QeGF7go78PZQlpIGmjtoCOcQ9uxymlmpLkArepcIEkgZ04tFH0FClY6 d3QPfT9iNap222aCQxZwCiaWrXqUNynW7RwH72SkqGmO8JTLKlzW8CQC+yGkyznj FxwRKzB8XO5Ohtw0wED3mzcf9DelsvJpX5rCU5gEjsHZjKA/rEwYgovyM+es+xSx JbHHc2tzLQuDZ1BL7rEW8TJDxmJ2bCsFJHKeIvykk3D2nVg01P0AwhUeIy+7ObV7 bQh7B8DhYwKNLtgZvDi8D6o4nWQvkjfF5BadrWusumDCtIupcwbelpcUeCsUWBqC oCjLMcH7TwmT513RXWMId50z93FWciduCHUGrQt4BxLBZmkQ9kE0iamZVIAAzLl8 VYIM9qb+nUk58jnLFl3xTqW2GetSj/p31bp6e78+SQFvqjie2z9/I+nGBr7A8aAB bNd5vpvHSEg5OP7oKk+Dhr26MiCDowtuzvdC4lYR+loFYoI+a1FS6a1w/kcw9/VA XmR6Y8is+CTy4XYTQZ4klYTVpoTkWa/D/t1CTC4IlELzYS49L6qSyef6m91IWeQ6 Way5+3ZON7sA6SM1PZ/zjsKDxYLo/hQz2+dw6YLVflfY62khvuK2Yc56MQcZEjsH 7x0b3MaKvNn9yqKC+Mk7QZ55nCjV3wyGp3GQ+ClAqZ4= =wU6p -----END PGP SIGNATURE----- Merge tag 'nf-next-26-02-06' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next Florian Westphal says: ==================== netfilter: updates for net-next The following patchset contains Netfilter updates for *net-next*: 1) Fix net-next-only use-after-free bug in nf_tables rbtree set: Expired elements cannot be released right away after unlink anymore because there is no guarantee that the binary-search blob is going to be updated. Spotted by syzkaller. 2) Fix esoteric bug in nf_queue with udp fraglist gro, broken since 6.11. Patch 3 adds extends the nfqueue selftest for this. 4) Use dedicated slab for flowtable entries, currently the -512 cache is used, which is wasteful. From Qingfang Deng. 5) Recent net-next update extended existing test for ip6ip6 tunnels, add the required /config entry. Test still passed by accident because the previous tests network setup gets re-used, so also update the test so it will fail in case the ip6ip6 tunnel interface cannot be added. 
6) Fix 'nft get element mytable myset { 1.2.3.4 }' on big endian platforms, this was broken since code was added in v5.1. 7) Fix nf_tables counter reset support on 32bit platforms, where counter reset may cause huge values to appear due to wraparound. Broken since the reset feature was added in v6.11. From Anders Grahn. 8-11) update nf_tables rbtree set type to detect partial overlaps. This will eventually speed up nftables userspace: at this time userspace does a netlink dump of the set content which slows down incremental updates on interval sets. From Pablo Neira Ayuso. * tag 'nf-next-26-02-06' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next: netfilter: nft_set_rbtree: validate open interval overlap netfilter: nft_set_rbtree: validate element belonging to interval netfilter: nft_set_rbtree: check for partial overlaps in anonymous sets netfilter: nft_set_rbtree: fix bogus EEXIST with NLM_F_CREATE with null interval netfilter: nft_counter: fix reset of counters on 32bit archs netfilter: nft_set_hash: fix get operation on big endian selftests: netfilter: add IPV6_TUNNEL to config netfilter: flowtable: dedicated slab for flow entry selftests: netfilter: nft_queue.sh: add udp fraglist gro test case netfilter: nfnetlink_queue: do shared-unconfirmed check before segmentation netfilter: nft_set_rbtree: don't gc elements on insert ==================== Link: https://patch.msgid.link/20260206153048.17570-1-fw@strlen.de Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
792aaea994
|
|
@ -97,6 +97,11 @@ static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
|
|||
local64_add(val, &p->v);
|
||||
}
|
||||
|
||||
/*
 * Subtract @val from the counter held in @p.
 *
 * This variant stores the value in a local64 and delegates to
 * local64_sub().
 */
static inline void u64_stats_sub(u64_stats_t *p, s64 val)
{
	local64_sub(val, &p->v);
}
|
||||
|
||||
static inline void u64_stats_inc(u64_stats_t *p)
|
||||
{
|
||||
local64_inc(&p->v);
|
||||
|
|
@ -145,6 +150,11 @@ static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
|
|||
p->v += val;
|
||||
}
|
||||
|
||||
/*
 * Subtract @val from the counter held in @p.
 *
 * This variant operates directly on the plain value member.
 */
static inline void u64_stats_sub(u64_stats_t *p, s64 val)
{
	p->v = p->v - val;
}
|
||||
|
||||
static inline void u64_stats_inc(u64_stats_t *p)
|
||||
{
|
||||
p->v++;
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ struct nf_queue_entry {
|
|||
struct net_device *physout;
|
||||
#endif
|
||||
struct nf_hook_state state;
|
||||
bool nf_ct_is_unconfirmed;
|
||||
u16 size; /* sizeof(entry) + saved route keys */
|
||||
u16 queue_num;
|
||||
|
||||
|
|
|
|||
|
|
@ -277,6 +277,8 @@ struct nft_userdata {
|
|||
unsigned char data[];
|
||||
};
|
||||
|
||||
#define NFT_SET_ELEM_INTERNAL_LAST 0x1
|
||||
|
||||
/* placeholder structure for opaque set element backend representation. */
|
||||
struct nft_elem_priv { };
|
||||
|
||||
|
|
@ -286,6 +288,7 @@ struct nft_elem_priv { };
|
|||
* @key: element key
|
||||
* @key_end: closing element key
|
||||
* @data: element data
|
||||
* @flags: flags
|
||||
* @priv: element private data and extensions
|
||||
*/
|
||||
struct nft_set_elem {
|
||||
|
|
@ -301,6 +304,7 @@ struct nft_set_elem {
|
|||
u32 buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
|
||||
struct nft_data val;
|
||||
} data;
|
||||
u32 flags;
|
||||
struct nft_elem_priv *priv;
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
static DEFINE_MUTEX(flowtable_lock);
|
||||
static LIST_HEAD(flowtables);
|
||||
static __read_mostly struct kmem_cache *flow_offload_cachep;
|
||||
|
||||
static void
|
||||
flow_offload_fill_dir(struct flow_offload *flow,
|
||||
|
|
@ -56,7 +57,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
|
|||
if (unlikely(nf_ct_is_dying(ct)))
|
||||
return NULL;
|
||||
|
||||
flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
|
||||
flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
|
||||
if (!flow)
|
||||
return NULL;
|
||||
|
||||
|
|
@ -812,9 +813,13 @@ static int __init nf_flow_table_module_init(void)
|
|||
{
|
||||
int ret;
|
||||
|
||||
flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
|
||||
if (!flow_offload_cachep)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = register_pernet_subsys(&nf_flow_table_net_ops);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
goto out_pernet;
|
||||
|
||||
ret = nf_flow_table_offload_init();
|
||||
if (ret)
|
||||
|
|
@ -830,6 +835,8 @@ static int __init nf_flow_table_module_init(void)
|
|||
nf_flow_table_offload_exit();
|
||||
out_offload:
|
||||
unregister_pernet_subsys(&nf_flow_table_net_ops);
|
||||
out_pernet:
|
||||
kmem_cache_destroy(flow_offload_cachep);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
@ -837,6 +844,7 @@ static void __exit nf_flow_table_module_exit(void)
|
|||
{
|
||||
nf_flow_table_offload_exit();
|
||||
unregister_pernet_subsys(&nf_flow_table_net_ops);
|
||||
kmem_cache_destroy(flow_offload_cachep);
|
||||
}
|
||||
|
||||
module_init(nf_flow_table_module_init);
|
||||
|
|
|
|||
|
|
@ -7270,7 +7270,8 @@ static u32 nft_set_maxsize(const struct nft_set *set)
|
|||
}
|
||||
|
||||
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
|
||||
const struct nlattr *attr, u32 nlmsg_flags)
|
||||
const struct nlattr *attr, u32 nlmsg_flags,
|
||||
bool last)
|
||||
{
|
||||
struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
|
||||
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
|
||||
|
|
@ -7556,6 +7557,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
|
|||
if (flags)
|
||||
*nft_set_ext_flags(ext) = flags;
|
||||
|
||||
if (last)
|
||||
elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
|
||||
else
|
||||
elem.flags = 0;
|
||||
|
||||
if (obj)
|
||||
*nft_set_ext_obj(ext) = obj;
|
||||
|
||||
|
|
@ -7636,6 +7642,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
|
|||
* and an existing one.
|
||||
*/
|
||||
err = -EEXIST;
|
||||
} else if (err == -ECANCELED) {
|
||||
/* ECANCELED reports an existing nul-element in
|
||||
* interval sets.
|
||||
*/
|
||||
err = 0;
|
||||
}
|
||||
goto err_element_clash;
|
||||
}
|
||||
|
|
@ -7714,7 +7725,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
|
|||
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
|
||||
|
||||
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
|
||||
err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
|
||||
err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags,
|
||||
nla_is_last(attr, rem));
|
||||
if (err < 0) {
|
||||
NL_SET_BAD_ATTR(extack, attr);
|
||||
return err;
|
||||
|
|
@ -7838,7 +7850,7 @@ static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
|
|||
}
|
||||
|
||||
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
|
||||
const struct nlattr *attr)
|
||||
const struct nlattr *attr, bool last)
|
||||
{
|
||||
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
|
||||
struct nft_set_ext_tmpl tmpl;
|
||||
|
|
@ -7906,6 +7918,11 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
|
|||
if (flags)
|
||||
*nft_set_ext_flags(ext) = flags;
|
||||
|
||||
if (last)
|
||||
elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
|
||||
else
|
||||
elem.flags = 0;
|
||||
|
||||
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
|
||||
if (trans == NULL)
|
||||
goto fail_trans;
|
||||
|
|
@ -8053,7 +8070,8 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
|
|||
return nft_set_flush(&ctx, set, genmask);
|
||||
|
||||
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
|
||||
err = nft_del_setelem(&ctx, set, attr);
|
||||
err = nft_del_setelem(&ctx, set, attr,
|
||||
nla_is_last(attr, rem));
|
||||
if (err == -ENOENT &&
|
||||
NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -435,6 +435,34 @@ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
|
|||
nf_queue_entry_free(entry);
|
||||
}
|
||||
|
||||
/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us
|
||||
* exclusively.
|
||||
*/
|
||||
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
|
||||
{
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
|
||||
|
||||
if (!ct || nf_ct_is_confirmed(ct))
|
||||
return false;
|
||||
|
||||
if (is_unconfirmed)
|
||||
*is_unconfirmed = true;
|
||||
|
||||
/* in some cases skb_clone() can occur after initial conntrack
|
||||
* pickup, but conntrack assumes exclusive skb->_nfct ownership for
|
||||
* unconfirmed entries.
|
||||
*
|
||||
* This happens for br_netfilter and with ip multicast routing.
|
||||
* This can't be solved with serialization here because one clone
|
||||
* could have been queued for local delivery or could be transmitted
|
||||
* in parallel on another CPU.
|
||||
*/
|
||||
return refcount_read(&ct->ct_general.use) > 1;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
|
||||
{
|
||||
const struct nf_ct_hook *ct_hook;
|
||||
|
|
@ -462,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
|
|||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
|
||||
/* If first queued segment was already reinjected then
|
||||
* there is a good chance the ct entry is now confirmed.
|
||||
*
|
||||
* Handle the rare cases:
|
||||
* - out-of-order verdict
|
||||
* - threaded userspace reinjecting in parallel
|
||||
* - first segment was dropped
|
||||
*
|
||||
* In all of those cases we can't handle this packet
|
||||
* because we can't be sure that another CPU won't modify
|
||||
* nf_conn->ext in parallel which isn't allowed.
|
||||
*/
|
||||
if (nf_ct_drop_unconfirmed(entry, NULL))
|
||||
verdict = NF_DROP;
|
||||
}
|
||||
|
||||
nf_reinject(entry, verdict);
|
||||
}
|
||||
|
||||
|
|
@ -891,49 +937,6 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
|
||||
{
|
||||
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
|
||||
static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
|
||||
struct nf_conn *ct = (void *)skb_nfct(entry->skb);
|
||||
unsigned long status;
|
||||
unsigned int use;
|
||||
|
||||
if (!ct)
|
||||
return false;
|
||||
|
||||
status = READ_ONCE(ct->status);
|
||||
if ((status & flags) == IPS_DYING)
|
||||
return true;
|
||||
|
||||
if (status & IPS_CONFIRMED)
|
||||
return false;
|
||||
|
||||
/* in some cases skb_clone() can occur after initial conntrack
|
||||
* pickup, but conntrack assumes exclusive skb->_nfct ownership for
|
||||
* unconfirmed entries.
|
||||
*
|
||||
* This happens for br_netfilter and with ip multicast routing.
|
||||
* We can't be solved with serialization here because one clone could
|
||||
* have been queued for local delivery.
|
||||
*/
|
||||
use = refcount_read(&ct->ct_general.use);
|
||||
if (likely(use == 1))
|
||||
return false;
|
||||
|
||||
/* Can't decrement further? Exclusive ownership. */
|
||||
if (!refcount_dec_not_one(&ct->ct_general.use))
|
||||
return false;
|
||||
|
||||
skb_set_nfct(entry->skb, 0);
|
||||
/* No nf_ct_put(): we already decremented .use and it cannot
|
||||
* drop down to 0.
|
||||
*/
|
||||
return true;
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
||||
static int
|
||||
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
|
||||
struct nf_queue_entry *entry)
|
||||
|
|
@ -950,9 +953,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
|
|||
}
|
||||
spin_lock_bh(&queue->lock);
|
||||
|
||||
if (nf_ct_drop_unconfirmed(entry))
|
||||
goto err_out_free_nskb;
|
||||
|
||||
if (queue->queue_total >= queue->queue_maxlen)
|
||||
goto err_out_queue_drop;
|
||||
|
||||
|
|
@ -995,7 +995,6 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
|
|||
else
|
||||
net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
|
||||
}
|
||||
err_out_free_nskb:
|
||||
kfree_skb(nskb);
|
||||
err_out_unlock:
|
||||
spin_unlock_bh(&queue->lock);
|
||||
|
|
@ -1074,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
|
|||
static int
|
||||
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
|
||||
{
|
||||
unsigned int queued;
|
||||
struct nfqnl_instance *queue;
|
||||
struct sk_buff *skb, *segs, *nskb;
|
||||
bool ct_is_unconfirmed = false;
|
||||
struct nfqnl_instance *queue;
|
||||
unsigned int queued;
|
||||
int err = -ENOBUFS;
|
||||
struct net *net = entry->state.net;
|
||||
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
|
||||
|
|
@ -1100,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
|
|||
break;
|
||||
}
|
||||
|
||||
/* Check if someone already holds another reference to
|
||||
* unconfirmed ct. If so, we cannot queue the skb:
|
||||
* concurrent modifications of nf_conn->ext are not
|
||||
* allowed and we can't know if another CPU isn't
|
||||
* processing the same nf_conn entry in parallel.
|
||||
*/
|
||||
if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
|
||||
return -EINVAL;
|
||||
|
||||
if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
|
||||
return __nfqnl_enqueue_packet(net, queue, entry);
|
||||
|
||||
|
|
@ -1113,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
|
|||
goto out_err;
|
||||
queued = 0;
|
||||
err = 0;
|
||||
|
||||
skb_list_walk_safe(segs, segs, nskb) {
|
||||
if (ct_is_unconfirmed && queued > 0) {
|
||||
/* skb_gso_segment() increments the ct refcount.
|
||||
* This is a problem for unconfirmed (not in hash)
|
||||
* entries, those can race when reinjections happen
|
||||
* in parallel.
|
||||
*
|
||||
* Annotate this for all queued entries except the
|
||||
* first one.
|
||||
*
|
||||
* As long as the first one is reinjected first it
|
||||
* will do the confirmation for us.
|
||||
*/
|
||||
entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
|
||||
}
|
||||
|
||||
if (err == 0)
|
||||
err = __nfqnl_enqueue_packet_gso(net, queue,
|
||||
segs, entry);
|
||||
|
|
|
|||
|
|
@ -117,8 +117,8 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
|
|||
nft_sync = this_cpu_ptr(&nft_counter_sync);
|
||||
|
||||
u64_stats_update_begin(nft_sync);
|
||||
u64_stats_add(&this_cpu->packets, -total->packets);
|
||||
u64_stats_add(&this_cpu->bytes, -total->bytes);
|
||||
u64_stats_sub(&this_cpu->packets, total->packets);
|
||||
u64_stats_sub(&this_cpu->bytes, total->bytes);
|
||||
u64_stats_update_end(nft_sync);
|
||||
|
||||
local_bh_enable();
|
||||
|
|
|
|||
|
|
@ -619,15 +619,20 @@ static struct nft_elem_priv *
|
|||
nft_hash_get(const struct net *net, const struct nft_set *set,
|
||||
const struct nft_set_elem *elem, unsigned int flags)
|
||||
{
|
||||
const u32 *key = (const u32 *)&elem->key.val;
|
||||
struct nft_hash *priv = nft_set_priv(set);
|
||||
u8 genmask = nft_genmask_cur(net);
|
||||
struct nft_hash_elem *he;
|
||||
u32 hash;
|
||||
|
||||
hash = jhash(elem->key.val.data, set->klen, priv->seed);
|
||||
if (set->klen == 4)
|
||||
hash = jhash_1word(*key, priv->seed);
|
||||
else
|
||||
hash = jhash(key, set->klen, priv->seed);
|
||||
|
||||
hash = reciprocal_scale(hash, priv->buckets);
|
||||
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
|
||||
if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
|
||||
if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
|
||||
nft_set_elem_active(&he->ext, genmask))
|
||||
return &he->priv;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,12 +33,18 @@ struct nft_rbtree {
|
|||
rwlock_t lock;
|
||||
struct nft_array __rcu *array;
|
||||
struct nft_array *array_next;
|
||||
unsigned long start_rbe_cookie;
|
||||
unsigned long last_gc;
|
||||
struct list_head expired;
|
||||
u64 last_tstamp;
|
||||
};
|
||||
|
||||
struct nft_rbtree_elem {
|
||||
struct nft_elem_priv priv;
|
||||
struct rb_node node;
|
||||
union {
|
||||
struct rb_node node;
|
||||
struct list_head list;
|
||||
};
|
||||
struct nft_set_ext ext;
|
||||
};
|
||||
|
||||
|
|
@ -53,6 +59,13 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
|
|||
return !nft_rbtree_interval_end(rbe);
|
||||
}
|
||||
|
||||
static bool nft_rbtree_interval_null(const struct nft_set *set,
|
||||
const struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) &&
|
||||
nft_rbtree_interval_end(rbe));
|
||||
}
|
||||
|
||||
static int nft_rbtree_cmp(const struct nft_set *set,
|
||||
const struct nft_rbtree_elem *e1,
|
||||
const struct nft_rbtree_elem *e2)
|
||||
|
|
@ -179,13 +192,16 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set,
|
|||
return &rbe->priv;
|
||||
}
|
||||
|
||||
static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
|
||||
struct nft_rbtree *priv,
|
||||
struct nft_rbtree_elem *rbe)
|
||||
static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
|
||||
struct nft_rbtree *priv,
|
||||
struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
lockdep_assert_held_write(&priv->lock);
|
||||
nft_setelem_data_deactivate(net, set, &rbe->priv);
|
||||
rb_erase(&rbe->node, &priv->root);
|
||||
|
||||
/* collected later on in commit callback */
|
||||
list_add(&rbe->list, &priv->expired);
|
||||
}
|
||||
|
||||
static const struct nft_rbtree_elem *
|
||||
|
|
@ -196,11 +212,6 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
|
|||
struct rb_node *prev = rb_prev(&rbe->node);
|
||||
struct net *net = read_pnet(&set->net);
|
||||
struct nft_rbtree_elem *rbe_prev;
|
||||
struct nft_trans_gc *gc;
|
||||
|
||||
gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
|
||||
if (!gc)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
/* search for end interval coming before this element.
|
||||
* end intervals don't carry a timeout extension, they
|
||||
|
|
@ -218,28 +229,10 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
|
|||
rbe_prev = NULL;
|
||||
if (prev) {
|
||||
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
|
||||
nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
|
||||
|
||||
/* There is always room in this trans gc for this element,
|
||||
* memory allocation never actually happens, hence, the warning
|
||||
* splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
|
||||
* this is synchronous gc which never fails.
|
||||
*/
|
||||
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
|
||||
if (WARN_ON_ONCE(!gc))
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
nft_trans_gc_elem_add(gc, rbe_prev);
|
||||
nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
|
||||
}
|
||||
|
||||
nft_rbtree_gc_elem_remove(net, set, priv, rbe);
|
||||
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
|
||||
if (WARN_ON_ONCE(!gc))
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
nft_trans_gc_elem_add(gc, rbe);
|
||||
|
||||
nft_trans_gc_queue_sync_done(gc);
|
||||
nft_rbtree_gc_elem_move(net, set, priv, rbe);
|
||||
|
||||
return rbe_prev;
|
||||
}
|
||||
|
|
@ -260,16 +253,107 @@ static bool nft_rbtree_update_first(const struct nft_set *set,
|
|||
return false;
|
||||
}
|
||||
|
||||
/* Only for anonymous sets which do not allow updates, all element are active. */
|
||||
static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
struct rb_node *node;
|
||||
|
||||
node = rb_prev(&rbe->node);
|
||||
if (!node)
|
||||
return NULL;
|
||||
|
||||
return rb_entry(node, struct nft_rbtree_elem, node);
|
||||
}
|
||||
|
||||
static struct nft_rbtree_elem *
|
||||
__nft_rbtree_next_active(struct rb_node *node, u8 genmask)
|
||||
{
|
||||
struct nft_rbtree_elem *next_rbe;
|
||||
|
||||
while (node) {
|
||||
next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
|
||||
if (!nft_set_elem_active(&next_rbe->ext, genmask)) {
|
||||
node = rb_next(node);
|
||||
continue;
|
||||
}
|
||||
|
||||
return next_rbe;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct nft_rbtree_elem *
|
||||
nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask)
|
||||
{
|
||||
return __nft_rbtree_next_active(rb_next(&rbe->node), genmask);
|
||||
}
|
||||
|
||||
static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv,
|
||||
u64 tstamp)
|
||||
{
|
||||
if (priv->last_tstamp != tstamp) {
|
||||
priv->start_rbe_cookie = 0;
|
||||
priv->last_tstamp = tstamp;
|
||||
}
|
||||
}
|
||||
|
||||
static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv,
|
||||
const struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
priv->start_rbe_cookie = (unsigned long)rbe;
|
||||
}
|
||||
|
||||
static void nft_rbtree_set_start_cookie_open(struct nft_rbtree *priv,
|
||||
const struct nft_rbtree_elem *rbe,
|
||||
unsigned long open_interval)
|
||||
{
|
||||
priv->start_rbe_cookie = (unsigned long)rbe | open_interval;
|
||||
}
|
||||
|
||||
#define NFT_RBTREE_OPEN_INTERVAL 1UL
|
||||
|
||||
static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv,
|
||||
const struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
return (priv->start_rbe_cookie & ~NFT_RBTREE_OPEN_INTERVAL) == (unsigned long)rbe;
|
||||
}
|
||||
|
||||
static bool nft_rbtree_insert_same_interval(const struct net *net,
|
||||
struct nft_rbtree *priv,
|
||||
struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
u8 genmask = nft_genmask_next(net);
|
||||
struct nft_rbtree_elem *next_rbe;
|
||||
|
||||
if (!priv->start_rbe_cookie)
|
||||
return true;
|
||||
|
||||
next_rbe = nft_rbtree_next_active(rbe, genmask);
|
||||
if (next_rbe) {
|
||||
/* Closest start element differs from last element added. */
|
||||
if (nft_rbtree_interval_start(next_rbe) &&
|
||||
nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
|
||||
priv->start_rbe_cookie = 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
priv->start_rbe_cookie = 0;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
|
||||
struct nft_rbtree_elem *new,
|
||||
struct nft_elem_priv **elem_priv)
|
||||
struct nft_elem_priv **elem_priv, u64 tstamp, bool last)
|
||||
{
|
||||
struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
|
||||
struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev;
|
||||
struct rb_node *node, *next, *parent, **p, *first = NULL;
|
||||
struct nft_rbtree *priv = nft_set_priv(set);
|
||||
u8 cur_genmask = nft_genmask_cur(net);
|
||||
u8 genmask = nft_genmask_next(net);
|
||||
u64 tstamp = nft_net_tstamp(net);
|
||||
unsigned long open_interval = 0;
|
||||
int d;
|
||||
|
||||
/* Descend the tree to search for an existing element greater than the
|
||||
|
|
@ -375,12 +459,46 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
|
|||
}
|
||||
}
|
||||
|
||||
if (nft_rbtree_interval_null(set, new)) {
|
||||
priv->start_rbe_cookie = 0;
|
||||
} else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) {
|
||||
if (nft_set_is_anonymous(set)) {
|
||||
priv->start_rbe_cookie = 0;
|
||||
} else if (priv->start_rbe_cookie & NFT_RBTREE_OPEN_INTERVAL) {
|
||||
/* Previous element is an open interval that partially
|
||||
* overlaps with an existing non-open interval.
|
||||
*/
|
||||
return -ENOTEMPTY;
|
||||
}
|
||||
}
|
||||
|
||||
/* - new start element matching existing start element: full overlap
|
||||
* reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
|
||||
*/
|
||||
if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
|
||||
nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
|
||||
*elem_priv = &rbe_ge->priv;
|
||||
|
||||
/* - Corner case: new start element of open interval (which
|
||||
* comes as last element in the batch) overlaps the start of
|
||||
* an existing interval with an end element: partial overlap.
|
||||
*/
|
||||
node = rb_first(&priv->root);
|
||||
rbe = __nft_rbtree_next_active(node, genmask);
|
||||
if (rbe && nft_rbtree_interval_end(rbe)) {
|
||||
rbe = nft_rbtree_next_active(rbe, genmask);
|
||||
if (rbe &&
|
||||
nft_rbtree_interval_start(rbe) &&
|
||||
!nft_rbtree_cmp(set, new, rbe)) {
|
||||
if (last)
|
||||
return -ENOTEMPTY;
|
||||
|
||||
/* Maybe open interval? */
|
||||
open_interval = NFT_RBTREE_OPEN_INTERVAL;
|
||||
}
|
||||
}
|
||||
nft_rbtree_set_start_cookie_open(priv, rbe_ge, open_interval);
|
||||
|
||||
return -EEXIST;
|
||||
}
|
||||
|
||||
|
|
@ -389,18 +507,37 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
|
|||
*/
|
||||
if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
|
||||
nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
|
||||
/* - ignore null interval, otherwise NLM_F_CREATE bogusly
|
||||
* reports EEXIST.
|
||||
*/
|
||||
if (nft_rbtree_interval_null(set, new))
|
||||
return -ECANCELED;
|
||||
|
||||
*elem_priv = &rbe_le->priv;
|
||||
|
||||
/* - start and end element belong to the same interval. */
|
||||
if (!nft_rbtree_insert_same_interval(net, priv, rbe_le))
|
||||
return -ENOTEMPTY;
|
||||
|
||||
return -EEXIST;
|
||||
}
|
||||
|
||||
/* - new start element with existing closest, less or equal key value
|
||||
* being a start element: partial overlap, reported as -ENOTEMPTY.
|
||||
* Anonymous sets allow for two consecutive start element since they
|
||||
* are constant, skip them to avoid bogus overlap reports.
|
||||
* are constant, but validate that this new start element does not
|
||||
* sit in between an existing start and end elements: partial overlap,
|
||||
* reported as -ENOTEMPTY.
|
||||
*/
|
||||
if (!nft_set_is_anonymous(set) && rbe_le &&
|
||||
nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
|
||||
return -ENOTEMPTY;
|
||||
if (rbe_le &&
|
||||
nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) {
|
||||
if (!nft_set_is_anonymous(set))
|
||||
return -ENOTEMPTY;
|
||||
|
||||
rbe_prev = nft_rbtree_prev_active(rbe_le);
|
||||
if (rbe_prev && nft_rbtree_interval_end(rbe_prev))
|
||||
return -ENOTEMPTY;
|
||||
}
|
||||
|
||||
/* - new end element with existing closest, less or equal key value
|
||||
* being an end element: partial overlap, reported as -ENOTEMPTY.
|
||||
|
|
@ -416,6 +553,12 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
|
|||
nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
|
||||
return -ENOTEMPTY;
|
||||
|
||||
/* - start element overlaps an open interval but end element is new:
|
||||
* partial overlap, reported as -ENOTEMPTY.
|
||||
*/
|
||||
if (!rbe_ge && priv->start_rbe_cookie && nft_rbtree_interval_end(new))
|
||||
return -ENOTEMPTY;
|
||||
|
||||
/* Accepted element: pick insertion point depending on key value */
|
||||
parent = NULL;
|
||||
p = &priv->root.rb_node;
|
||||
|
|
@ -525,9 +668,13 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
|
|||
struct nft_elem_priv **elem_priv)
|
||||
{
|
||||
struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
|
||||
bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST);
|
||||
struct nft_rbtree *priv = nft_set_priv(set);
|
||||
u64 tstamp = nft_net_tstamp(net);
|
||||
int err;
|
||||
|
||||
nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
|
||||
|
||||
if (nft_array_may_resize(set) < 0)
|
||||
return -ENOMEM;
|
||||
|
||||
|
|
@ -538,8 +685,12 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
|
|||
cond_resched();
|
||||
|
||||
write_lock_bh(&priv->lock);
|
||||
err = __nft_rbtree_insert(net, set, rbe, elem_priv);
|
||||
err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp, last);
|
||||
write_unlock_bh(&priv->lock);
|
||||
|
||||
if (nft_rbtree_interval_end(rbe))
|
||||
priv->start_rbe_cookie = 0;
|
||||
|
||||
} while (err == -EAGAIN);
|
||||
|
||||
return err;
|
||||
|
|
@ -571,6 +722,48 @@ static void nft_rbtree_activate(const struct net *net,
|
|||
nft_clear(net, &rbe->ext);
|
||||
}
|
||||
|
||||
static struct nft_rbtree_elem *
|
||||
nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask)
|
||||
{
|
||||
struct nft_rbtree_elem *next_rbe;
|
||||
struct rb_node *node;
|
||||
|
||||
node = rb_next(&rbe->node);
|
||||
if (node) {
|
||||
next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
|
||||
if (nft_rbtree_interval_start(next_rbe) &&
|
||||
!nft_set_elem_active(&next_rbe->ext, genmask))
|
||||
return next_rbe;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static bool nft_rbtree_deactivate_same_interval(const struct net *net,
|
||||
struct nft_rbtree *priv,
|
||||
struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
u8 genmask = nft_genmask_next(net);
|
||||
struct nft_rbtree_elem *next_rbe;
|
||||
|
||||
if (!priv->start_rbe_cookie)
|
||||
return true;
|
||||
|
||||
next_rbe = nft_rbtree_next_inactive(rbe, genmask);
|
||||
if (next_rbe) {
|
||||
/* Closest start element differs from last element added. */
|
||||
if (nft_rbtree_interval_start(next_rbe) &&
|
||||
nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
|
||||
priv->start_rbe_cookie = 0;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
priv->start_rbe_cookie = 0;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static void nft_rbtree_flush(const struct net *net,
|
||||
const struct nft_set *set,
|
||||
struct nft_elem_priv *elem_priv)
|
||||
|
|
@ -585,12 +778,19 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
|
|||
const struct nft_set_elem *elem)
|
||||
{
|
||||
struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
|
||||
const struct nft_rbtree *priv = nft_set_priv(set);
|
||||
bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST);
|
||||
struct nft_rbtree *priv = nft_set_priv(set);
|
||||
const struct rb_node *parent = priv->root.rb_node;
|
||||
u8 genmask = nft_genmask_next(net);
|
||||
u64 tstamp = nft_net_tstamp(net);
|
||||
int d;
|
||||
|
||||
nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
|
||||
|
||||
if (nft_rbtree_interval_start(this) ||
|
||||
nft_rbtree_interval_null(set, this))
|
||||
priv->start_rbe_cookie = 0;
|
||||
|
||||
if (nft_array_may_resize(set) < 0)
|
||||
return NULL;
|
||||
|
||||
|
|
@ -618,6 +818,13 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
|
|||
parent = parent->rb_left;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (nft_rbtree_interval_start(rbe)) {
|
||||
if (!last)
|
||||
nft_rbtree_set_start_cookie(priv, rbe);
|
||||
} else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe))
|
||||
return NULL;
|
||||
|
||||
nft_rbtree_flush(net, set, &rbe->priv);
|
||||
return &rbe->priv;
|
||||
}
|
||||
|
|
@ -675,29 +882,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
|
|||
}
|
||||
}
|
||||
|
||||
static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
|
||||
struct nft_rbtree *priv,
|
||||
struct nft_rbtree_elem *rbe)
|
||||
{
|
||||
nft_setelem_data_deactivate(net, set, &rbe->priv);
|
||||
nft_rbtree_erase(priv, rbe);
|
||||
}
|
||||
|
||||
static void nft_rbtree_gc(struct nft_set *set)
|
||||
static void nft_rbtree_gc_scan(struct nft_set *set)
|
||||
{
|
||||
struct nft_rbtree *priv = nft_set_priv(set);
|
||||
struct nft_rbtree_elem *rbe, *rbe_end = NULL;
|
||||
struct net *net = read_pnet(&set->net);
|
||||
u64 tstamp = nft_net_tstamp(net);
|
||||
struct rb_node *node, *next;
|
||||
struct nft_trans_gc *gc;
|
||||
|
||||
set = nft_set_container_of(priv);
|
||||
net = read_pnet(&set->net);
|
||||
|
||||
gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
|
||||
if (!gc)
|
||||
return;
|
||||
|
||||
for (node = rb_first(&priv->root); node ; node = next) {
|
||||
next = rb_next(node);
|
||||
|
|
@ -715,34 +906,46 @@ static void nft_rbtree_gc(struct nft_set *set)
|
|||
if (!__nft_set_elem_expired(&rbe->ext, tstamp))
|
||||
continue;
|
||||
|
||||
gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
|
||||
if (!gc)
|
||||
goto try_later;
|
||||
|
||||
/* end element needs to be removed first, it has
|
||||
* no timeout extension.
|
||||
*/
|
||||
write_lock_bh(&priv->lock);
|
||||
if (rbe_end) {
|
||||
nft_rbtree_gc_remove(net, set, priv, rbe_end);
|
||||
nft_trans_gc_elem_add(gc, rbe_end);
|
||||
nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
|
||||
rbe_end = NULL;
|
||||
}
|
||||
|
||||
nft_rbtree_gc_elem_move(net, set, priv, rbe);
|
||||
write_unlock_bh(&priv->lock);
|
||||
}
|
||||
|
||||
priv->last_gc = jiffies;
|
||||
}
|
||||
|
||||
static void nft_rbtree_gc_queue(struct nft_set *set)
|
||||
{
|
||||
struct nft_rbtree *priv = nft_set_priv(set);
|
||||
struct nft_rbtree_elem *rbe, *rbe_end;
|
||||
struct nft_trans_gc *gc;
|
||||
|
||||
if (list_empty(&priv->expired))
|
||||
return;
|
||||
|
||||
gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
|
||||
if (!gc)
|
||||
return;
|
||||
|
||||
list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) {
|
||||
list_del(&rbe->list);
|
||||
nft_trans_gc_elem_add(gc, rbe);
|
||||
|
||||
gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
|
||||
if (!gc)
|
||||
goto try_later;
|
||||
|
||||
nft_rbtree_gc_remove(net, set, priv, rbe);
|
||||
nft_trans_gc_elem_add(gc, rbe);
|
||||
return;
|
||||
}
|
||||
|
||||
try_later:
|
||||
|
||||
if (gc) {
|
||||
gc = nft_trans_gc_catchall_sync(gc);
|
||||
nft_trans_gc_queue_sync_done(gc);
|
||||
priv->last_gc = jiffies;
|
||||
}
|
||||
gc = nft_trans_gc_catchall_sync(gc);
|
||||
nft_trans_gc_queue_sync_done(gc);
|
||||
}
|
||||
|
||||
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
|
||||
|
|
@ -761,6 +964,7 @@ static int nft_rbtree_init(const struct nft_set *set,
|
|||
|
||||
rwlock_init(&priv->lock);
|
||||
priv->root = RB_ROOT;
|
||||
INIT_LIST_HEAD(&priv->expired);
|
||||
|
||||
priv->array = NULL;
|
||||
priv->array_next = NULL;
|
||||
|
|
@ -778,10 +982,15 @@ static void nft_rbtree_destroy(const struct nft_ctx *ctx,
|
|||
const struct nft_set *set)
|
||||
{
|
||||
struct nft_rbtree *priv = nft_set_priv(set);
|
||||
struct nft_rbtree_elem *rbe;
|
||||
struct nft_rbtree_elem *rbe, *next;
|
||||
struct nft_array *array;
|
||||
struct rb_node *node;
|
||||
|
||||
list_for_each_entry_safe(rbe, next, &priv->expired, list) {
|
||||
list_del(&rbe->list);
|
||||
nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
|
||||
}
|
||||
|
||||
while ((node = priv->root.rb_node) != NULL) {
|
||||
rb_erase(node, &priv->root);
|
||||
rbe = rb_entry(node, struct nft_rbtree_elem, node);
|
||||
|
|
@ -828,13 +1037,21 @@ static void nft_rbtree_commit(struct nft_set *set)
|
|||
u32 num_intervals = 0;
|
||||
struct rb_node *node;
|
||||
|
||||
if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
|
||||
nft_rbtree_gc(set);
|
||||
|
||||
/* No changes, skip, eg. elements updates only. */
|
||||
if (!priv->array_next)
|
||||
return;
|
||||
|
||||
/* GC can be performed if the binary search blob is going
|
||||
* to be rebuilt. It has to be done in two phases: first
|
||||
* scan tree and move all expired elements to the expired
|
||||
* list.
|
||||
*
|
||||
* Then, after blob has been re-built and published to other
|
||||
* CPUs, queue collected entries for freeing.
|
||||
*/
|
||||
if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
|
||||
nft_rbtree_gc_scan(set);
|
||||
|
||||
/* Reverse walk to create an array from smaller to largest interval. */
|
||||
node = rb_last(&priv->root);
|
||||
if (node)
|
||||
|
|
@ -881,10 +1098,16 @@ static void nft_rbtree_commit(struct nft_set *set)
|
|||
num_intervals++;
|
||||
err_out:
|
||||
priv->array_next->num_intervals = num_intervals;
|
||||
old = rcu_replace_pointer(priv->array, priv->array_next, true);
|
||||
old = rcu_replace_pointer(priv->array, priv->array_next,
|
||||
lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
|
||||
priv->array_next = NULL;
|
||||
if (old)
|
||||
call_rcu(&old->rcu_head, nft_array_free_rcu);
|
||||
|
||||
/* New blob is public, queue collected entries for freeing.
|
||||
* call_rcu ensures elements stay around until readers are done.
|
||||
*/
|
||||
nft_rbtree_gc_queue(set);
|
||||
}
|
||||
|
||||
static void nft_rbtree_abort(const struct nft_set *set)
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ CONFIG_IP_NF_RAW=m
|
|||
CONFIG_IP_SCTP=m
|
||||
CONFIG_IPV6=y
|
||||
CONFIG_IPV6_MULTIPLE_TABLES=y
|
||||
CONFIG_IPV6_TUNNEL=m
|
||||
CONFIG_IP_VS=m
|
||||
CONFIG_IP_VS_PROTO_TCP=y
|
||||
CONFIG_IP_VS_RR=m
|
||||
|
|
|
|||
|
|
@ -601,14 +601,19 @@ ip -net "$nsr2" link set tun0 up
|
|||
ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
|
||||
ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
|
||||
|
||||
ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1
|
||||
ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 || ret=1
|
||||
ip -net "$nsr2" link set tun6 up
|
||||
ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
|
||||
|
||||
ip -net "$nsr1" route change default via 192.168.100.2
|
||||
ip -net "$nsr2" route change default via 192.168.100.1
|
||||
ip -6 -net "$nsr1" route change default via fee1:3::2
|
||||
ip -6 -net "$nsr2" route change default via fee1:3::1
|
||||
|
||||
# do not use "route change" and delete old default so
|
||||
# socat fails to connect in case new default can't be added.
|
||||
ip -6 -net "$nsr1" route delete default
|
||||
ip -6 -net "$nsr1" route add default via fee1:3::2
|
||||
ip -6 -net "$nsr2" route delete default
|
||||
ip -6 -net "$nsr2" route add default via fee1:3::1
|
||||
ip -net "$ns2" route add default via 10.0.2.1
|
||||
ip -6 -net "$ns2" route add default via dead:2::1
|
||||
|
||||
|
|
@ -649,7 +654,8 @@ ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 a
|
|||
ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
|
||||
ip -net "$nsr1" link set tun6.10 up
|
||||
ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
|
||||
ip -6 -net "$nsr1" route change default via fee1:5::2
|
||||
ip -6 -net "$nsr1" route delete default
|
||||
ip -6 -net "$nsr1" route add default via fee1:5::2
|
||||
ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept'
|
||||
|
||||
ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10
|
||||
|
|
@ -664,10 +670,11 @@ ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10
|
|||
ip -net "$nsr2" route change default via 192.168.200.1
|
||||
ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
|
||||
|
||||
ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1
|
||||
ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 || ret=1
|
||||
ip -net "$nsr2" link set tun6.10 up
|
||||
ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
|
||||
ip -6 -net "$nsr2" route change default via fee1:5::1
|
||||
ip -6 -net "$nsr2" route delete default
|
||||
ip -6 -net "$nsr2" route add default via fee1:5::1
|
||||
|
||||
if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
|
||||
echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2
|
||||
|
|
|
|||
|
|
@ -510,7 +510,7 @@ EOF
|
|||
|
||||
udp_listener_ready()
|
||||
{
|
||||
ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345
|
||||
ss -S -N "$1" -uln -o "sport = :$2" | grep -q "$2"
|
||||
}
|
||||
|
||||
output_files_written()
|
||||
|
|
@ -518,7 +518,7 @@ output_files_written()
|
|||
test -s "$1" && test -s "$2"
|
||||
}
|
||||
|
||||
test_udp_ct_race()
|
||||
test_udp_nat_race()
|
||||
{
|
||||
ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
|
||||
flush ruleset
|
||||
|
|
@ -545,8 +545,8 @@ EOF
|
|||
ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 &
|
||||
local nfqpid=$!
|
||||
|
||||
busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2"
|
||||
busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3"
|
||||
busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12345
|
||||
busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" 12345
|
||||
busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12
|
||||
|
||||
# Send two packets, one should end up in ns2, other in ns3.
|
||||
|
|
@ -557,7 +557,7 @@ EOF
|
|||
|
||||
busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2"
|
||||
|
||||
kill "$nfqpid"
|
||||
kill "$nfqpid" "$rpid1" "$rpid2"
|
||||
|
||||
if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then
|
||||
echo "FAIL: Expected One udp conntrack entry"
|
||||
|
|
@ -585,6 +585,135 @@ EOF
|
|||
echo "PASS: both udp receivers got one packet each"
|
||||
}
|
||||
|
||||
# Make sure UDPGRO aggregated packets don't lose
# their skb->nfct entry when nfqueue passes the
# skb to userspace with software gso segmentation on.
#
# Uses the global namespaces $ns1, $ns2, $nsrouter and the scratch files
# $TMPFILE1 / $TMPFILE2.  Sets the global 'ret' to 1 on failure.  Prints
# SKIP and returns early when no packet was queued to userspace.
test_udp_gro_ct()
{
	local errprefix="FAIL: test_udp_gro_ct:"

	ip netns exec "$nsrouter" conntrack -F 2>/dev/null

	ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF
flush ruleset
table inet udpq {
	# Number of packets/bytes queued to userspace
	counter toqueue { }
	# Number of packets/bytes reinjected from userspace with 'ct new' intact
	counter fromqueue { }
	# These two counters should be identical and not 0.

	chain prerouting {
		type filter hook prerouting priority -300; policy accept;

		# userspace sends small packets, if < 1000, UDPGRO did
		# not kick in, but test needs a 'new' conntrack with udpgro skb.
		meta iifname veth0 meta l4proto udp meta length > 1000 accept

		# don't pick up non-gso packets and don't queue them to
		# userspace.
		notrack
	}

	chain postrouting {
		type filter hook postrouting priority 0; policy accept;

		# Only queue unconfirmed fraglist gro skbs to userspace.
		udp dport 12346 ct status ! confirmed counter name "toqueue" mark set 1 queue num 1
	}

	chain validate {
		type filter hook postrouting priority 1; policy accept;
		# ... and only count those that were reinjected with the
		# skb->nfct intact.
		mark 1 counter name "fromqueue"
	}
}
EOF
	timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12346,fork,pf=ipv4 OPEN:"$TMPFILE1",trunc &
	local rpid=$!

	ip netns exec "$nsrouter" ./nf_queue -G -c -q 1 -t 2 > "$TMPFILE2" &
	local nfqpid=$!

	ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding on rx-gro-list on generic-receive-offload on

	busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 12346
	busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 1

	# Feed 32 MiB of zeroes, in 512 byte datagrams, to 16 parallel senders
	# that share one source port.
	local bs=512
	local count=$(((32 * 1024 * 1024) / bs))
	dd if=/dev/zero bs="$bs" count="$count" 2>/dev/null | for i in $(seq 1 16); do
		timeout 5 ip netns exec "$ns1" \
			socat -u -b 512 STDIN UDP-DATAGRAM:10.0.2.99:12346,reuseport,bind=0.0.0.0:55221 &
	done

	busywait 10000 test -s "$TMPFILE1"

	kill "$rpid"

	# Reap all remaining background jobs before reading the counters.
	wait

	# 'c' holds the "packets N bytes M" line of a counter; keep it local
	# so it does not leak into the rest of the script.
	local c
	local p
	local b
	local pqueued
	local bqueued

	c=$(ip netns exec "$nsrouter" nft list counter inet udpq "toqueue" | grep packets)
	read p pqueued b bqueued <<EOF
$c
EOF
	local preinject
	local breinject
	c=$(ip netns exec "$nsrouter" nft list counter inet udpq "fromqueue" | grep packets)
	read p preinject b breinject <<EOF
$c
EOF
	ip netns exec "$nsrouter" ethtool -K "veth0" rx-udp-gro-forwarding off
	ip netns exec "$nsrouter" ethtool -K "veth1" rx-udp-gro-forwarding off

	if [ "$pqueued" -eq 0 ];then
		# happens when gro did not build at least one aggregate
		echo "SKIP: No packets were queued"
		return
	fi

	local saw_ct_entry=0
	if ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12346 2>/dev/null | wc -l | grep -q "^1"'; then
		saw_ct_entry=1
	else
		echo "$errprefix Expected udp conntrack entry"
		ip netns exec "$nsrouter" conntrack -L
		ret=1
	fi

	# More packets must come back than were queued, otherwise no
	# segmentation took place.
	if [ "$pqueued" -ge "$preinject" ] ;then
		echo "$errprefix Expected software segmentation to occur, had $pqueued and $preinject"
		ret=1
		return
	fi

	# sw segmentation adds extra udp and ip headers.
	local breinject_expect=$((preinject * (512 + 20 + 8)))

	if [ "$breinject" -eq "$breinject_expect" ]; then
		if [ "$saw_ct_entry" -eq 1 ];then
			echo "PASS: fraglist gro skb passed with conntrack entry"
		else
			echo "$errprefix fraglist gro skb passed without conntrack entry"
			ret=1
		fi
	else
		echo "$errprefix Counter mismatch, conntrack entry dropped by nfqueue? Queued: $pqueued, $bqueued. Post-queue: $preinject, $breinject. Expected $breinject_expect"
		ret=1
	fi

	if ! ip netns exec "$nsrouter" nft delete table inet udpq; then
		# errprefix already ends in ':', do not add another one.
		echo "$errprefix Could not delete udpq table"
		ret=1
	fi
}
|
||||
|
||||
test_queue_removal()
|
||||
{
|
||||
read tainted_then < /proc/sys/kernel/tainted
|
||||
|
|
@ -663,7 +792,8 @@ test_tcp_localhost_connectclose
|
|||
test_tcp_localhost_requeue
|
||||
test_sctp_forward
|
||||
test_sctp_output
|
||||
test_udp_ct_race
|
||||
test_udp_nat_race
|
||||
test_udp_gro_ct
|
||||
|
||||
# should be last, adds vrf device in ns1 and changes routes
|
||||
test_icmp_vrf
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user