diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 813a19122ebb..cc232508e695 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -2,6 +2,12 @@ #ifndef _LINUX_NETDEVICE_XMIT_H #define _LINUX_NETDEVICE_XMIT_H +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) +#define MIRRED_NEST_LIMIT 4 +#endif + +struct net_device; + struct netdev_xmit { u16 recursion; u8 more; @@ -9,7 +15,8 @@ struct netdev_xmit { u8 skip_txqueue; #endif #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) - u8 sched_mirred_nest; + u8 sched_mirred_nest; + struct net_device *sched_mirred_dev[MIRRED_NEST_LIMIT]; #endif #if IS_ENABLED(CONFIG_NF_DUP_NETDEV) u8 nf_dup_skb_recursion; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 738cd5b13c62..94966692ccdf 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -41,13 +41,6 @@ enum qdisc_state_t { __QDISC_STATE_DRAINING, }; -enum qdisc_state2_t { - /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. - * Use qdisc_run_begin/end() or qdisc_is_running() instead. 
- */ - __QDISC_STATE2_RUNNING, -}; - #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) @@ -117,13 +110,14 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; - int owner; + bool running; /* must be written under qdisc spinlock */ unsigned long state; - unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; - spinlock_t busylock ____cacheline_aligned_in_smp; + atomic_long_t defer_count ____cacheline_aligned_in_smp; + struct llist_head defer_list; + spinlock_t seqlock; struct rcu_head rcu; @@ -168,7 +162,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); - return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + return READ_ONCE(qdisc->running); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) @@ -211,7 +205,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) */ return spin_trylock(&qdisc->seqlock); } - return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + if (READ_ONCE(qdisc->running)) + return false; + WRITE_ONCE(qdisc->running, true); + return true; } static inline void qdisc_run_end(struct Qdisc *qdisc) @@ -229,7 +226,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) &qdisc->state))) __netif_schedule(qdisc); } else { - __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + WRITE_ONCE(qdisc->running, false); } } diff --git a/net/core/dev.c b/net/core/dev.c index 35010faf0b78..821e7c718924 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4125,9 +4125,10 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { + struct sk_buff *next, *to_free = NULL; spinlock_t *root_lock = qdisc_lock(q); - struct sk_buff *to_free = NULL; - bool contended; + struct llist_node 
*ll_list, *first_n; + unsigned long defer_count = 0; int rc; qdisc_calculate_pkt_len(skb, q); @@ -4167,67 +4168,81 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } - if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { - kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); - return NET_XMIT_DROP; - } - /* - * Heuristic to force contended enqueues to serialize on a - * separate lock before trying to get qdisc main lock. - * This permits qdisc->running owner to get the lock more - * often and dequeue packets faster. - * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit - * and then other tasks will only enqueue packets. The packets will be - * sent after the qdisc owner is scheduled again. To prevent this - * scenario the task always serialize on the lock. + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. + * In the try_cmpxchg() loop, we want to increment q->defer_count + * at most once to limit the number of skbs in defer_list. + * We perform the defer_count increment only if the list is not empty, + * because some arches have slow atomic_long_inc_return(). */ - contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); - if (unlikely(contended)) - spin_lock(&q->busylock); + first_n = READ_ONCE(q->defer_list.first); + do { + if (first_n && !defer_count) { + defer_count = atomic_long_inc_return(&q->defer_count); + if (unlikely(defer_count > q->limit)) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + return NET_XMIT_DROP; + } + } + skb->ll_node.next = first_n; + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); + + /* If defer_list was not empty, we know the cpu which queued + * the first skb will process the whole list for us. + */ + if (first_n) + return NET_XMIT_SUCCESS; spin_lock(root_lock); + + ll_list = llist_del_all(&q->defer_list); + /* There is a small race because we clear defer_count not atomically + * with the prior llist_del_all(). 
This means defer_list could grow + * over q->limit. + */ + atomic_long_set(&q->defer_count, 0); + + ll_list = llist_reverse_order(ll_list); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - __qdisc_drop(skb, &to_free); + llist_for_each_entry_safe(skb, next, ll_list, ll_node) + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - qdisc_run_begin(q)) { + goto unlock; + } + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, + struct sk_buff, + ll_node)); qdisc_bstats_update(q, skb); - - if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - } - qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - WRITE_ONCE(q->owner, smp_processor_id()); - rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - WRITE_ONCE(q->owner, -1); - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } - __qdisc_run(q); - qdisc_run_end(q); + int count = 0; + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + prefetch(next); + skb_mark_not_on_list(skb); + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + count++; } + qdisc_run(q); + if (count != 1) + rc = NET_XMIT_SUCCESS; } +unlock: spin_unlock(root_lock); if (unlikely(to_free)) kfree_skb_list_reason(to_free, tcf_get_drop_reason(to_free)); - if (unlikely(contended)) - spin_unlock(&q->busylock); return rc; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6be01454f262..8eb3c5820724 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1136,7 +1136,16 @@ void skb_release_head_state(struct sk_buff 
*skb) skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); - skb->destructor(skb); +#ifdef CONFIG_INET + INDIRECT_CALL_3(skb->destructor, + tcp_wfree, __sock_wfree, sock_wfree, + skb); +#else + INDIRECT_CALL_1(skb->destructor, + sock_wfree, + skb); + +#endif } #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb_nfct(skb)); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5f01f567c934..f27b583def78 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,31 +29,6 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); -#define MIRRED_NEST_LIMIT 4 - -#ifndef CONFIG_PREEMPT_RT -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); -} - -static void tcf_mirred_nest_level_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); -} - -#else -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return current->net_xmit.sched_mirred_nest++; -} - -static void tcf_mirred_nest_level_dec(void) -{ - current->net_xmit.sched_mirred_nest--; -} -#endif - static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; @@ -439,44 +414,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); - unsigned int nest_level; + struct netdev_xmit *xmit; bool m_mac_header_xmit; struct net_device *dev; - int m_eaction; + int i, m_eaction; u32 blockid; - nest_level = tcf_mirred_nest_level_inc_return(); - if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { +#ifdef CONFIG_PREEMPT_RT + xmit = &current->net_xmit; +#else + xmit = this_cpu_ptr(&softnet_data.xmit); +#endif + if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); - retval = TC_ACT_SHOT; - goto dec_nest_level; + return TC_ACT_SHOT; } 
tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); blockid = READ_ONCE(m->tcfm_blockid); - if (blockid) { - retval = tcf_blockcast(skb, m, blockid, res, retval); - goto dec_nest_level; - } + if (blockid) + return tcf_blockcast(skb, m, blockid, res, retval); dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); - goto dec_nest_level; + return retval; } + for (i = 0; i < xmit->sched_mirred_nest; i++) { + if (xmit->sched_mirred_dev[i] != dev) + continue; + pr_notice_once("tc mirred: loop on device %s\n", + netdev_name(dev)); + tcf_action_inc_overlimit_qstats(&m->common); + return retval; + } + + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); - -dec_nest_level: - tcf_mirred_nest_level_dec(); + xmit->sched_mirred_nest--; return retval; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 1e008a228ebd..d9a98d02a55f 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -666,7 +666,6 @@ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, - .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, @@ -679,7 +678,6 @@ struct Qdisc noop_qdisc = { .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, - .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); @@ -971,10 +969,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } } - spin_lock_init(&sch->busylock); - lockdep_set_class(&sch->busylock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - /* seqlock has the same scope of busylock, for NOLOCK qdisc */ 
spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, @@ -985,7 +979,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; - sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); diff --git a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt index 183051ba0cae..6882b8240a8a 100644 --- a/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt +++ b/tools/testing/selftests/net/packetdrill/tcp_user_timeout_user-timeout-probe.pkt @@ -23,14 +23,16 @@ // install a qdisc dropping all packets +0 `tc qdisc delete dev tun0 root 2>/dev/null ; tc qdisc add dev tun0 root pfifo limit 0` + +0 write(4, ..., 24) = 24 // When qdisc is congested we retry every 500ms // (TCP_RESOURCE_PROBE_INTERVAL) and therefore // we retry 6 times before hitting 3s timeout. // First verify that the connection is alive: -+3.250 write(4, ..., 24) = 24 ++3 write(4, ..., 24) = 24 + // Now verify that shortly after that the socket is dead: - +.100 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) ++1 write(4, ..., 24) = -1 ETIMEDOUT (Connection timed out) +0 %{ assert tcpi_probes == 6, tcpi_probes; \ assert tcpi_backoff == 0, tcpi_backoff }%