mirror of
https://github.com/torvalds/linux.git
synced 2026-05-27 00:22:00 +02:00
Merge branch 'veth-qdisc-backpressure-and-qdisc-check-refactor'
Jesper Dangaard Brouer says: ==================== veth: qdisc backpressure and qdisc check refactor This patch series addresses TX drops seen on veth devices under load, particularly when using threaded NAPI, which is our setup in production. The root cause is that the NAPI consumer often runs on a different CPU than the producer. Combined with scheduling delays or simply slower consumption, this increases the chance that the ptr_ring fills up before packets are drained, resulting in drops from veth_xmit() (ndo_start_xmit()). To make this easier to reproduce, we’ve created a script that sets up a test scenario using network namespaces. The script inserts 1000 iptables rules in the consumer namespace to slow down packet processing and amplify the issue. Reproducer script: https://github.com/xdp-project/xdp-project/blob/main/areas/core/veth_setup01_NAPI_TX_drops.sh This series first introduces a helper to detect no-queue qdiscs and then uses it in the veth driver to conditionally apply qdisc-level backpressure when a real qdisc is attached. The behavior is off by default and opt-in, ensuring minimal impact and easy activation. v6: https://lore.kernel.org/174549933665.608169.392044991754158047.stgit@firesoul v5: https://lore.kernel.org/174489803410.355490.13216831426556849084.stgit@firesoul v4: https://lore.kernel.org/174472463778.274639.12670590457453196991.stgit@firesoul v3: https://lore.kernel.org/174464549885.20396.6987653753122223942.stgit@firesoul v2: https://lore.kernel.org/174412623473.3702169.4235683143719614624.stgit@firesoul RFC-v1: https://lore.kernel.org/174377814192.3376479.16481605648460889310.stgit@firesoul ==================== Link: https://patch.msgid.link/174559288731.827981.8748257839971869213.stgit@firesoul Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
c0b0a360ed
|
|
@ -307,12 +307,10 @@ static void __veth_xdp_flush(struct veth_rq *rq)
|
|||
|
||||
static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
|
||||
{
|
||||
if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
|
||||
dev_kfree_skb_any(skb);
|
||||
return NET_RX_DROP;
|
||||
}
|
||||
if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb)))
|
||||
return NETDEV_TX_BUSY; /* signal qdisc layer */
|
||||
|
||||
return NET_RX_SUCCESS;
|
||||
return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */
|
||||
}
|
||||
|
||||
static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
|
||||
|
|
@ -346,11 +344,11 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
|
|||
{
|
||||
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
|
||||
struct veth_rq *rq = NULL;
|
||||
int ret = NETDEV_TX_OK;
|
||||
struct netdev_queue *txq;
|
||||
struct net_device *rcv;
|
||||
int length = skb->len;
|
||||
bool use_napi = false;
|
||||
int rxq;
|
||||
int ret, rxq;
|
||||
|
||||
rcu_read_lock();
|
||||
rcv = rcu_dereference(priv->peer);
|
||||
|
|
@ -373,17 +371,45 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
|
|||
}
|
||||
|
||||
skb_tx_timestamp(skb);
|
||||
if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
|
||||
|
||||
ret = veth_forward_skb(rcv, skb, rq, use_napi);
|
||||
switch (ret) {
|
||||
case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */
|
||||
if (!use_napi)
|
||||
dev_sw_netstats_tx_add(dev, 1, length);
|
||||
else
|
||||
__veth_xdp_flush(rq);
|
||||
} else {
|
||||
break;
|
||||
case NETDEV_TX_BUSY:
|
||||
/* If a qdisc is attached to our virtual device, returning
|
||||
* NETDEV_TX_BUSY is allowed.
|
||||
*/
|
||||
txq = netdev_get_tx_queue(dev, rxq);
|
||||
|
||||
if (qdisc_txq_has_no_queue(txq)) {
|
||||
dev_kfree_skb_any(skb);
|
||||
goto drop;
|
||||
}
|
||||
/* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */
|
||||
__skb_push(skb, ETH_HLEN);
|
||||
/* Rely on prior successful packets having started the NAPI consumer
|
||||
* via __veth_xdp_flush(). Cancel the TXQ stop if the consumer stopped;
|
||||
* this is paired with the empty check in veth_poll().
|
||||
*/
|
||||
netif_tx_stop_queue(txq);
|
||||
smp_mb__after_atomic();
|
||||
if (unlikely(__ptr_ring_empty(&rq->xdp_ring)))
|
||||
netif_tx_wake_queue(txq);
|
||||
break;
|
||||
case NET_RX_DROP: /* same as NET_XMIT_DROP */
|
||||
drop:
|
||||
atomic64_inc(&priv->dropped);
|
||||
ret = NET_XMIT_DROP;
|
||||
break;
|
||||
default:
|
||||
net_crit_ratelimited("%s(%s): Invalid return code(%d)",
|
||||
__func__, dev->name, ret);
|
||||
}
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
|
|
@ -874,9 +900,17 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
|
|||
struct veth_xdp_tx_bq *bq,
|
||||
struct veth_stats *stats)
|
||||
{
|
||||
struct veth_priv *priv = netdev_priv(rq->dev);
|
||||
int queue_idx = rq->xdp_rxq.queue_index;
|
||||
struct netdev_queue *peer_txq;
|
||||
struct net_device *peer_dev;
|
||||
int i, done = 0, n_xdpf = 0;
|
||||
void *xdpf[VETH_XDP_BATCH];
|
||||
|
||||
/* NAPI functions as RCU section */
|
||||
peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
|
||||
peer_txq = netdev_get_tx_queue(peer_dev, queue_idx);
|
||||
|
||||
for (i = 0; i < budget; i++) {
|
||||
void *ptr = __ptr_ring_consume(&rq->xdp_ring);
|
||||
|
||||
|
|
@ -925,6 +959,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
|
|||
rq->stats.vs.xdp_packets += done;
|
||||
u64_stats_update_end(&rq->stats.syncp);
|
||||
|
||||
if (unlikely(netif_tx_queue_stopped(peer_txq)))
|
||||
netif_tx_wake_queue(peer_txq);
|
||||
|
||||
return done;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -343,15 +343,13 @@ static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
|
|||
static bool qdisc_tx_is_default(const struct net_device *dev)
|
||||
{
|
||||
struct netdev_queue *txq;
|
||||
struct Qdisc *qdisc;
|
||||
|
||||
if (dev->num_tx_queues > 1)
|
||||
return false;
|
||||
|
||||
txq = netdev_get_tx_queue(dev, 0);
|
||||
qdisc = rcu_access_pointer(txq->qdisc);
|
||||
|
||||
return !qdisc->enqueue;
|
||||
return qdisc_txq_has_no_queue(txq);
|
||||
}
|
||||
|
||||
/* Local traffic destined to local address. Reinsert the packet to rx
|
||||
|
|
|
|||
|
|
@ -803,6 +803,14 @@ static inline bool qdisc_tx_changing(const struct net_device *dev)
|
|||
return false;
|
||||
}
|
||||
|
||||
/* "noqueue" qdisc identified by not having any enqueue, see noqueue_init().
 *
 * Returns true when the qdisc currently attached to @txq has a NULL
 * ->enqueue callback, false otherwise.
 */
|
||||
static inline bool qdisc_txq_has_no_queue(const struct netdev_queue *txq)
|
||||
{
/* NOTE(review): the pointer fetched with rcu_access_pointer() is
 * dereferenced below; presumably callers hold the RCU read lock and
 * txq->qdisc is never NULL here - confirm against callers.
 */
struct Qdisc *qdisc = rcu_access_pointer(txq->qdisc);
|
||||
|
||||
return qdisc->enqueue == NULL;
|
||||
}
|
||||
|
||||
/* Is the device using the noop qdisc on all queues? */
|
||||
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
|
||||
{
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user