diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 424f8a2728a3..74660e7fe674 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -457,22 +457,8 @@ static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work) { struct mlx5e_ptpsq *ptpsq = container_of(work, struct mlx5e_ptpsq, report_unhealthy_work); - struct mlx5e_txqsq *sq = &ptpsq->txqsq; - - /* Recovering the PTP SQ means re-enabling NAPI, which requires the - * netdev instance lock. However, SQ closing has to wait for this work - * task to finish while also holding the same lock. So either get the - * lock or find that the SQ is no longer enabled and thus this work is - * not relevant anymore. - */ - while (!netdev_trylock(sq->netdev)) { - if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)) - return; - msleep(20); - } mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq); - netdev_unlock(sq->netdev); } static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c index 0686fbdd5a05..6efb626b5506 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_rx.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Mellanox Technologies. +#include <net/netdev_lock.h> + #include "health.h" #include "params.h" #include "txrx.h" @@ -177,6 +179,16 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx) rq = ctx; priv = rq->priv; + /* Acquire netdev instance lock to synchronize with channel close and + * reopen flows. Either successfully obtain the lock, or detect that + * channels are closing for another reason, making this work no longer + * necessary. 
+ */ + while (!netdev_trylock(rq->netdev)) { + if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state)) + return 0; + msleep(20); + } mutex_lock(&priv->state_lock); eq = rq->cq.mcq.eq; @@ -186,6 +198,7 @@ static int mlx5e_rx_reporter_timeout_recover(void *ctx) clear_bit(MLX5E_SQ_STATE_ENABLED, &rq->icosq->state); mutex_unlock(&priv->state_lock); + netdev_unlock(rq->netdev); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index 4adc1adf9897..60ba840e00fa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (c) 2019 Mellanox Technologies. */ +#include <net/netdev_lock.h> + #include "health.h" #include "en/ptp.h" #include "en/devlink.h" @@ -79,6 +81,18 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) return 0; + /* Recovering queues means re-enabling NAPI, which requires the netdev + * instance lock. However, SQ closing flows have to wait for work tasks + * to finish while also holding the netdev instance lock. So either get + * the lock or find that the SQ is no longer enabled and thus this work + * is not relevant anymore. + */ + while (!netdev_trylock(dev)) { + if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)) + return 0; + msleep(20); + } + err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); if (err) { netdev_err(dev, "Failed to query SQ 0x%x state. 
err = %d\n", @@ -114,9 +128,11 @@ static int mlx5e_tx_reporter_err_cqe_recover(void *ctx) else mlx5e_trigger_napi_sched(sq->cq.napi); + netdev_unlock(dev); return 0; out: clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); + netdev_unlock(dev); return err; } @@ -137,10 +153,24 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) sq = to_ctx->sq; eq = sq->cq.mcq.eq; priv = sq->priv; + + /* Recovering the TX queues implies re-enabling NAPI, which requires + * the netdev instance lock. + * However, channel closing flows have to wait for this work to finish + * while holding the same lock. So either get the lock or find that + * channels are being closed for other reason and this work is not + * relevant anymore. + */ + while (!netdev_trylock(sq->netdev)) { + if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state)) + return 0; + msleep(20); + } + err = mlx5e_health_channel_eq_recover(sq->netdev, eq, sq->cq.ch_stats); if (!err) { to_ctx->status = 0; /* this sq recovered */ - return err; + goto out; } mutex_lock(&priv->state_lock); @@ -148,7 +178,7 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) mutex_unlock(&priv->state_lock); if (!err) { to_ctx->status = 1; /* all channels recovered */ - return err; + goto out; } to_ctx->status = err; @@ -156,7 +186,8 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) netdev_err(priv->netdev, "mlx5e_safe_reopen_channels failed recovering from a tx_timeout, err(%d).\n", err); - +out: + netdev_unlock(sq->netdev); return err; } @@ -173,10 +204,22 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) return 0; priv = ptpsq->txqsq.priv; + netdev = priv->netdev; + + /* Recovering the PTP SQ means re-enabling NAPI, which requires the + * netdev instance lock. However, SQ closing has to wait for this work + * task to finish while also holding the same lock. So either get the + * lock or find that the SQ is no longer enabled and thus this work is + * not relevant anymore. 
+ */ + while (!netdev_trylock(netdev)) { + if (!test_bit(MLX5E_SQ_STATE_ENABLED, &ptpsq->txqsq.state)) + return 0; + msleep(20); + } mutex_lock(&priv->state_lock); chs = &priv->channels; - netdev = priv->netdev; carrier_ok = netif_carrier_ok(netdev); netif_carrier_off(netdev); @@ -193,6 +236,7 @@ static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) netif_carrier_on(netdev); mutex_unlock(&priv->state_lock); + netdev_unlock(netdev); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 6a7ca4571c19..7eb691c2a1bd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -631,19 +631,7 @@ static void mlx5e_rq_timeout_work(struct work_struct *timeout_work) struct mlx5e_rq, rx_timeout_work); - /* Acquire netdev instance lock to synchronize with channel close and - * reopen flows. Either successfully obtain the lock, or detect that - * channels are closing for another reason, making this work no longer - * necessary. - */ - while (!netdev_trylock(rq->netdev)) { - if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &rq->priv->state)) - return; - msleep(20); - } - mlx5e_reporter_rx_timeout(rq); - netdev_unlock(rq->netdev); } static int mlx5e_alloc_mpwqe_rq_drop_page(struct mlx5e_rq *rq) @@ -1952,20 +1940,7 @@ void mlx5e_tx_err_cqe_work(struct work_struct *recover_work) struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq, recover_work); - /* Recovering queues means re-enabling NAPI, which requires the netdev - * instance lock. However, SQ closing flows have to wait for work tasks - * to finish while also holding the netdev instance lock. So either get - * the lock or find that the SQ is no longer enabled and thus this work - * is not relevant anymore. 
- */ - while (!netdev_trylock(sq->netdev)) { - if (!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state)) - return; - msleep(20); - } - mlx5e_reporter_tx_err_cqe(sq); - netdev_unlock(sq->netdev); } static struct dim_cq_moder mlx5e_get_def_tx_moderation(u8 cq_period_mode) @@ -5115,19 +5090,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work) struct net_device *netdev = priv->netdev; int i; - /* Recovering the TX queues implies re-enabling NAPI, which requires - * the netdev instance lock. - * However, channel closing flows have to wait for this work to finish - * while holding the same lock. So either get the lock or find that - * channels are being closed for other reason and this work is not - * relevant anymore. - */ - while (!netdev_trylock(netdev)) { - if (!test_bit(MLX5E_STATE_CHANNELS_ACTIVE, &priv->state)) - return; - msleep(20); - } - for (i = 0; i < netdev->real_num_tx_queues; i++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(netdev, i); @@ -5140,8 +5102,6 @@ static void mlx5e_tx_timeout_work(struct work_struct *work) /* break if tried to reopened channels */ break; } - - netdev_unlock(netdev); } static void mlx5e_tx_timeout(struct net_device *dev, unsigned int txqueue)