mirror of
https://github.com/torvalds/linux.git
synced 2026-05-22 06:01:53 +02:00
Merge branch 'inetpeer-reduce-false-sharing-and-atomic-operations'
Eric Dumazet says:
====================
inetpeer: reduce false sharing and atomic operations
After commit 8c2bd38b95 ("icmp: change the order of rate limits"),
there is a risk that a host receiving packets from a unique
source targeting closed ports is using a common inet_peer structure
from many cpus.
All these cpus have to acquire/release a refcount and update
the inet_peer timestamp (p->dtime).
Switch to pure RCU to avoid changing the refcount, and update
p->dtime only once per jiffy.
Tested:
DUT : 128 cores, 32 hw rx queues.
receiving 8,400,000 UDP packets per second, targeting closed ports.
Before the series:
- napi poll cannot keep up, NIC drops 1,200,000 packets
per second.
- We use 20 % of cpu cycles.
After this series:
- All packets are received (no more hw drops)
- We use 12 % of cpu cycles.
v1: https://lore.kernel.org/20241213130212.1783302-1-edumazet@google.com
====================
Link: https://patch.msgid.link/20241215175629.1248773-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
3a41305509
|
|
@ -96,30 +96,28 @@ static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr)
|
|||
|
||||
/* can be called with or without local BH being disabled */
|
||||
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
|
||||
const struct inetpeer_addr *daddr,
|
||||
int create);
|
||||
const struct inetpeer_addr *daddr);
|
||||
|
||||
static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base,
|
||||
__be32 v4daddr,
|
||||
int vif, int create)
|
||||
int vif)
|
||||
{
|
||||
struct inetpeer_addr daddr;
|
||||
|
||||
daddr.a4.addr = v4daddr;
|
||||
daddr.a4.vif = vif;
|
||||
daddr.family = AF_INET;
|
||||
return inet_getpeer(base, &daddr, create);
|
||||
return inet_getpeer(base, &daddr);
|
||||
}
|
||||
|
||||
static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base,
|
||||
const struct in6_addr *v6daddr,
|
||||
int create)
|
||||
const struct in6_addr *v6daddr)
|
||||
{
|
||||
struct inetpeer_addr daddr;
|
||||
|
||||
daddr.a6 = *v6daddr;
|
||||
daddr.family = AF_INET6;
|
||||
return inet_getpeer(base, &daddr, create);
|
||||
return inet_getpeer(base, &daddr);
|
||||
}
|
||||
|
||||
static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
|
||||
|
|
|
|||
|
|
@ -312,7 +312,6 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
|
|||
struct dst_entry *dst = &rt->dst;
|
||||
struct inet_peer *peer;
|
||||
bool rc = true;
|
||||
int vif;
|
||||
|
||||
if (!apply_ratelimit)
|
||||
return true;
|
||||
|
|
@ -321,12 +320,12 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
|
|||
if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
|
||||
goto out;
|
||||
|
||||
vif = l3mdev_master_ifindex(dst->dev);
|
||||
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
|
||||
rcu_read_lock();
|
||||
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
|
||||
l3mdev_master_ifindex_rcu(dst->dev));
|
||||
rc = inet_peer_xrlim_allow(peer,
|
||||
READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
rcu_read_unlock();
|
||||
out:
|
||||
if (!rc)
|
||||
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
|
||||
|
|
|
|||
|
|
@ -95,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
|
|||
{
|
||||
struct rb_node **pp, *parent, *next;
|
||||
struct inet_peer *p;
|
||||
u32 now;
|
||||
|
||||
pp = &base->rb_root.rb_node;
|
||||
parent = NULL;
|
||||
|
|
@ -108,8 +109,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
|
|||
p = rb_entry(parent, struct inet_peer, rb_node);
|
||||
cmp = inetpeer_addr_cmp(daddr, &p->daddr);
|
||||
if (cmp == 0) {
|
||||
if (!refcount_inc_not_zero(&p->refcnt))
|
||||
break;
|
||||
now = jiffies;
|
||||
if (READ_ONCE(p->dtime) != now)
|
||||
WRITE_ONCE(p->dtime, now);
|
||||
return p;
|
||||
}
|
||||
if (gc_stack) {
|
||||
|
|
@ -150,9 +152,6 @@ static void inet_peer_gc(struct inet_peer_base *base,
|
|||
for (i = 0; i < gc_cnt; i++) {
|
||||
p = gc_stack[i];
|
||||
|
||||
/* The READ_ONCE() pairs with the WRITE_ONCE()
|
||||
* in inet_putpeer()
|
||||
*/
|
||||
delta = (__u32)jiffies - READ_ONCE(p->dtime);
|
||||
|
||||
if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
|
||||
|
|
@ -168,31 +167,23 @@ static void inet_peer_gc(struct inet_peer_base *base,
|
|||
}
|
||||
}
|
||||
|
||||
/* Must be called under RCU : No refcount change is done here. */
|
||||
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
|
||||
const struct inetpeer_addr *daddr,
|
||||
int create)
|
||||
const struct inetpeer_addr *daddr)
|
||||
{
|
||||
struct inet_peer *p, *gc_stack[PEER_MAX_GC];
|
||||
struct rb_node **pp, *parent;
|
||||
unsigned int gc_cnt, seq;
|
||||
int invalidated;
|
||||
|
||||
/* Attempt a lockless lookup first.
|
||||
* Because of a concurrent writer, we might not find an existing entry.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
seq = read_seqbegin(&base->lock);
|
||||
p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
|
||||
invalidated = read_seqretry(&base->lock, seq);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (p)
|
||||
return p;
|
||||
|
||||
/* If no writer did a change during our lookup, we can return early. */
|
||||
if (!create && !invalidated)
|
||||
return NULL;
|
||||
|
||||
/* retry an exact lookup, taking the lock before.
|
||||
* At least, nodes should be hot in our cache.
|
||||
*/
|
||||
|
|
@ -201,12 +192,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
|
|||
|
||||
gc_cnt = 0;
|
||||
p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
|
||||
if (!p && create) {
|
||||
if (!p) {
|
||||
p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
|
||||
if (p) {
|
||||
p->daddr = *daddr;
|
||||
p->dtime = (__u32)jiffies;
|
||||
refcount_set(&p->refcnt, 2);
|
||||
refcount_set(&p->refcnt, 1);
|
||||
atomic_set(&p->rid, 0);
|
||||
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
|
||||
p->rate_tokens = 0;
|
||||
|
|
@ -231,15 +222,9 @@ EXPORT_SYMBOL_GPL(inet_getpeer);
|
|||
|
||||
void inet_putpeer(struct inet_peer *p)
|
||||
{
|
||||
/* The WRITE_ONCE() pairs with itself (we run lockless)
|
||||
* and the READ_ONCE() in inet_peer_gc()
|
||||
*/
|
||||
WRITE_ONCE(p->dtime, (__u32)jiffies);
|
||||
|
||||
if (refcount_dec_and_test(&p->refcnt))
|
||||
kfree_rcu(p, rcu);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(inet_putpeer);
|
||||
|
||||
/*
|
||||
* Check transmit rate limitation for given message.
|
||||
|
|
|
|||
|
|
@ -82,15 +82,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
|
|||
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
|
||||
{
|
||||
struct ipq *qp = container_of(q, struct ipq, q);
|
||||
struct net *net = q->fqdir->net;
|
||||
|
||||
const struct frag_v4_compare_key *key = a;
|
||||
struct net *net = q->fqdir->net;
|
||||
struct inet_peer *p = NULL;
|
||||
|
||||
q->key.v4 = *key;
|
||||
qp->ecn = 0;
|
||||
qp->peer = q->fqdir->max_dist ?
|
||||
inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
|
||||
NULL;
|
||||
if (q->fqdir->max_dist) {
|
||||
rcu_read_lock();
|
||||
p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif);
|
||||
if (p && !refcount_inc_not_zero(&p->refcnt))
|
||||
p = NULL;
|
||||
rcu_read_unlock();
|
||||
}
|
||||
qp->peer = p;
|
||||
}
|
||||
|
||||
static void ip4_frag_free(struct inet_frag_queue *q)
|
||||
|
|
|
|||
|
|
@ -870,11 +870,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
|
|||
}
|
||||
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
|
||||
vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
|
||||
rcu_read_unlock();
|
||||
|
||||
net = dev_net(rt->dst.dev);
|
||||
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
|
||||
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
|
||||
if (!peer) {
|
||||
rcu_read_unlock();
|
||||
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
|
||||
rt_nexthop(rt, ip_hdr(skb)->daddr));
|
||||
return;
|
||||
|
|
@ -893,7 +893,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
|
|||
*/
|
||||
if (peer->n_redirects >= ip_rt_redirect_number) {
|
||||
peer->rate_last = jiffies;
|
||||
goto out_put_peer;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/* Check for load limit; set rate_last to the latest sent
|
||||
|
|
@ -914,8 +914,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
|
|||
&ip_hdr(skb)->saddr, inet_iif(skb),
|
||||
&ip_hdr(skb)->daddr, &gw);
|
||||
}
|
||||
out_put_peer:
|
||||
inet_putpeer(peer);
|
||||
out_unlock:
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static int ip_error(struct sk_buff *skb)
|
||||
|
|
@ -975,9 +975,9 @@ static int ip_error(struct sk_buff *skb)
|
|||
break;
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
|
||||
l3mdev_master_ifindex(skb->dev), 1);
|
||||
|
||||
l3mdev_master_ifindex_rcu(skb->dev));
|
||||
send = true;
|
||||
if (peer) {
|
||||
now = jiffies;
|
||||
|
|
@ -989,8 +989,9 @@ static int ip_error(struct sk_buff *skb)
|
|||
peer->rate_tokens -= ip_rt_error_cost;
|
||||
else
|
||||
send = false;
|
||||
inet_putpeer(peer);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (send)
|
||||
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
|
||||
|
||||
|
|
|
|||
|
|
@ -222,10 +222,10 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
|
|||
if (rt->rt6i_dst.plen < 128)
|
||||
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
|
||||
|
||||
peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
|
||||
rcu_read_lock();
|
||||
peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
|
||||
res = inet_peer_xrlim_allow(peer, tmo);
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
if (!res)
|
||||
__ICMP6_INC_STATS(net, ip6_dst_idev(dst),
|
||||
|
|
|
|||
|
|
@ -613,15 +613,15 @@ int ip6_forward(struct sk_buff *skb)
|
|||
else
|
||||
target = &hdr->daddr;
|
||||
|
||||
peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
|
||||
rcu_read_lock();
|
||||
peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
|
||||
|
||||
/* Limit redirects both by destination (here)
|
||||
and by source (inside ndisc_send_redirect)
|
||||
*/
|
||||
if (inet_peer_xrlim_allow(peer, 1*HZ))
|
||||
ndisc_send_redirect(skb, target);
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
rcu_read_unlock();
|
||||
} else {
|
||||
int addrtype = ipv6_addr_type(&hdr->saddr);
|
||||
|
||||
|
|
|
|||
|
|
@ -1731,10 +1731,12 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
|
|||
"Redirect: destination is not a neighbour\n");
|
||||
goto release;
|
||||
}
|
||||
peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1);
|
||||
|
||||
rcu_read_lock();
|
||||
peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr);
|
||||
ret = inet_peer_xrlim_allow(peer, 1*HZ);
|
||||
if (peer)
|
||||
inet_putpeer(peer);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!ret)
|
||||
goto release;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user