net/mlx5: Propagate LAG effective max_tx_speed to vports

Currently, vports report only their parent's uplink speed, which in LAG
setups does not reflect the true aggregated bandwidth. This makes it
hard for upper-layer software to optimize load balancing decisions
based on accurate bandwidth information.

Fix the issue by calculating the possible maximum speed of a LAG as
the sum of speeds of all active uplinks that are part of the LAG.
Propagate this effective max speed to vports associated with the LAG
whenever a relevant event occurs, such as physical port link state
changes or LAG creation/modification.

With this change, upper-layer components receive accurate bandwidth
information corresponding to the active members of the LAG and can
make better load balancing decisions.

Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
Reviewed-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>
This commit is contained in:
Or Har-Toov 2025-12-18 17:58:05 +02:00 committed by Leon Romanovsky
parent 3df5dd46fc
commit 50f1d188c5
6 changed files with 241 additions and 0 deletions

View File

@ -996,6 +996,126 @@ static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
ldev->mode != MLX5_LAG_MODE_MPESW;
}
#ifdef CONFIG_MLX5_ESWITCH
/*
 * Accumulate a per-device speed over the PF devices of a LAG.
 *
 * @ldev:      LAG whose PF devices are visited
 * @sum_speed: output; zeroed first, then increased by each device's speed
 * @get_speed: callback reporting one device's speed; returns 0 on success
 *
 * Returns 0 on success. The first callback failure is logged at debug level
 * and returned immediately; *sum_speed then holds only the partial sum
 * gathered up to that point.
 */
static int
mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed,
			   int (*get_speed)(struct mlx5_core_dev *, u32 *))
{
	struct mlx5_core_dev *dev;
	u32 dev_speed;
	int err;
	int i;

	*sum_speed = 0;

	mlx5_ldev_for_each(i, 0, ldev) {
		dev = ldev->pf[i].dev;
		if (dev) {
			err = get_speed(dev, &dev_speed);
			if (err) {
				mlx5_core_dbg(dev,
					      "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n",
					      get_speed, dev_name(dev->device),
					      err);
				return err;
			}

			*sum_speed += dev_speed;
		}
	}

	return 0;
}
/*
 * Sum, into @max_speed, the speed reported by mlx5_port_max_linkspeed() for
 * every PF device of the LAG. Returns 0 on success or the first error hit.
 */
static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed)
{
	int err;

	err = mlx5_lag_sum_devices_speed(ldev, max_speed,
					 mlx5_port_max_linkspeed);

	return err;
}
/*
 * Program @speed as the max TX speed of every non-uplink vport of @mdev's
 * eswitch.
 *
 * @mdev:  device whose eswitch vports are updated
 * @speed: value to program, already scaled by the caller (callers in this
 *         file divide by MLX5_MAX_TX_SPEED_UNIT before calling)
 *
 * Does nothing when the device has no eswitch or does not advertise the
 * esw_vport_state_max_tx_speed capability. A failure on one vport is logged
 * at debug level and does not stop the walk over the remaining vports.
 */
static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev,
						u32 speed)
{
	u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT;
	struct mlx5_eswitch *esw = mdev->priv.eswitch;
	const u16 max_tx_speed_limit = (u16)~0U;
	struct mlx5_vport *vport;
	unsigned long i;
	int ret;

	if (!esw)
		return;

	if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed))
		return;

	/*
	 * mlx5_modify_vport_max_tx_speed() takes a u16. Saturate here instead
	 * of letting the implicit conversion wrap a large aggregate speed
	 * around to a small, misleading value.
	 */
	if (speed > max_tx_speed_limit)
		speed = max_tx_speed_limit;

	mlx5_esw_for_each_vport(esw, i, vport) {
		if (!vport)
			continue;

		/* The uplink vport is skipped; only the other vports are updated. */
		if (vport->vport == MLX5_VPORT_UPLINK)
			continue;

		ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod,
						     vport->vport, true, speed);
		if (ret)
			mlx5_core_dbg(mdev,
				      "Failed to set vport %d speed %u, err=%d\n",
				      vport->vport, speed, ret);
	}
}
/*
 * Push the LAG's aggregate speed to the vports of every PF device in the LAG.
 *
 * The starting value is tracker.bond_speed_mbps:
 *  - SPEED_UNKNOWN: nothing is programmed;
 *  - 0: the sum of the PFs' max speeds is used instead (and nothing is
 *    programmed if that sum cannot be obtained);
 *  - anything else: used as is.
 * The chosen value is divided by MLX5_MAX_TX_SPEED_UNIT before it is applied.
 */
void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev)
{
	u32 agg_speed = ldev->tracker.bond_speed_mbps;
	struct mlx5_core_dev *pf_dev;
	int i;

	if (agg_speed == SPEED_UNKNOWN)
		return;

	/* If speed is not set, use the sum of max speeds of all PFs */
	if (agg_speed == 0) {
		if (mlx5_lag_sum_devices_max_speed(ldev, &agg_speed))
			return;
	}

	agg_speed /= MLX5_MAX_TX_SPEED_UNIT;

	mlx5_ldev_for_each(i, 0, ldev) {
		pf_dev = ldev->pf[i].dev;
		if (pf_dev)
			mlx5_lag_modify_device_vports_speed(pf_dev, agg_speed);
	}
}
/*
 * Set the vports of each PF device in the LAG back to that device's own
 * operational link speed (divided by MLX5_MAX_TX_SPEED_UNIT).
 *
 * A device whose operational speed cannot be queried is logged at debug
 * level and left untouched; the remaining devices are still processed.
 */
void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *pf_dev;
	u32 oper_speed;
	int err;
	int i;

	mlx5_ldev_for_each(i, 0, ldev) {
		pf_dev = ldev->pf[i].dev;
		if (!pf_dev)
			continue;

		err = mlx5_port_oper_linkspeed(pf_dev, &oper_speed);
		if (!err) {
			mlx5_lag_modify_device_vports_speed(pf_dev,
							    oper_speed / MLX5_MAX_TX_SPEED_UNIT);
			continue;
		}

		mlx5_core_dbg(pf_dev,
			      "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n",
			      dev_name(pf_dev->device), err);
	}
}
#endif
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
@ -1083,9 +1203,12 @@ static void mlx5_do_bond(struct mlx5_lag *ldev)
ndev);
dev_put(ndev);
}
mlx5_lag_set_vports_agg_speed(ldev);
} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
mlx5_modify_lag(ldev, &tracker);
mlx5_lag_set_vports_agg_speed(ldev);
} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
mlx5_lag_reset_vports_speed(ldev);
mlx5_disable_lag(ldev);
}
}
@ -1286,6 +1409,38 @@ static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
return 1;
}
static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker,
struct net_device *ndev)
{
struct ethtool_link_ksettings lksettings;
struct net_device *bond_dev;
int err;
if (netif_is_lag_master(ndev))
bond_dev = ndev;
else
bond_dev = netdev_master_upper_dev_get(ndev);
if (!bond_dev) {
tracker->bond_speed_mbps = SPEED_UNKNOWN;
return;
}
err = __ethtool_get_link_ksettings(bond_dev, &lksettings);
if (err) {
netdev_dbg(bond_dev,
"Failed to get speed for bond dev %s, err=%d\n",
bond_dev->name, err);
tracker->bond_speed_mbps = SPEED_UNKNOWN;
return;
}
if (lksettings.base.speed == SPEED_UNKNOWN)
tracker->bond_speed_mbps = 0;
else
tracker->bond_speed_mbps = lksettings.base.speed;
}
/* this handler is always registered to netdev events */
static int mlx5_lag_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
@ -1317,6 +1472,9 @@ static int mlx5_lag_netdev_event(struct notifier_block *this,
break;
}
if (changed)
mlx5_lag_update_tracker_speed(&tracker, ndev);
ldev->tracker = tracker;
if (changed)

View File

@ -48,6 +48,7 @@ struct lag_tracker {
unsigned int is_bonded:1;
unsigned int has_inactive:1;
enum netdev_lag_hash hash_type;
u32 bond_speed_mbps;
};
/* LAG data of a ConnectX card.
@ -116,6 +117,14 @@ int mlx5_deactivate_lag(struct mlx5_lag *ldev);
void mlx5_lag_add_devices(struct mlx5_lag *ldev);
struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
/*
 * Vport speed propagation for LAG. The real implementations exist only when
 * CONFIG_MLX5_ESWITCH is enabled; otherwise both calls are empty inline stubs
 * so callers need no #ifdef of their own.
 */
#ifdef CONFIG_MLX5_ESWITCH
/* Apply the LAG's aggregate speed to the vports of the LAG's PF devices. */
void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev);
/* Restore each PF device's vports to that device's own operational speed. */
void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev);
#else
static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {}
static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {}
#endif
static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
{
if (!MLX5_CAP_GEN(dev, vport_group_manager) ||

View File

@ -381,6 +381,7 @@ const struct mlx5_link_info *mlx5_port_ptys2info(struct mlx5_core_dev *mdev,
u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
struct mlx5_link_info *info,
bool force_legacy);
/*
 * Store in *speed the highest speed among the operational link modes of
 * port 1 (0 if none is set). Returns 0 on success or the query error.
 */
int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
#define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \

View File

@ -1200,6 +1200,30 @@ u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
return link_modes;
}
/*
 * Report the operational link speed of port 1.
 *
 * @mdev:  device to query
 * @speed: output; the highest speed among the link modes whose bit is set in
 *         the port's operational protocol mask, or 0 if no bit is set
 *
 * Returns 0 on success, or the error from the eth proto query (in which case
 * *speed is not written).
 */
int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
{
	struct mlx5_port_eth_proto eproto;
	const struct mlx5_link_info *modes;
	u32 fastest = 0;
	u32 nr_modes;
	int err;
	int bit;

	err = mlx5_port_query_eth_proto(mdev, 1, mlx5_ptys_ext_supported(mdev),
					&eproto);
	if (err)
		return err;

	mlx5e_port_get_link_mode_info_arr(mdev, &modes, &nr_modes, false);

	for (bit = 0; bit < nr_modes; ++bit) {
		if (!(eproto.oper & MLX5E_PROT_MASK(bit)))
			continue;

		fastest = max(fastest, modes[bit].speed);
	}

	*speed = fastest;

	return 0;
}
int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
{
const struct mlx5_link_info *table;

View File

@ -62,6 +62,28 @@ u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
return MLX5_GET(query_vport_state_out, out, state);
}
/*
 * Read the admin state of a vport with a QUERY_VPORT_STATE command.
 *
 * @mdev:        device the command is issued on
 * @opmod:       op_mod field of the command
 * @vport:       vport number to query
 * @other_vport: other_vport field of the command
 * @admin_state: output; written only when the command succeeds
 *
 * Returns 0 on success or the command execution error.
 */
static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
					u16 vport, u8 other_vport,
					u8 *admin_state)
{
	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
	int ret;

	MLX5_SET(query_vport_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VPORT_STATE);
	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
	MLX5_SET(query_vport_state_in, in, vport_number, vport);
	MLX5_SET(query_vport_state_in, in, other_vport, other_vport);

	ret = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
	if (!ret)
		*admin_state = MLX5_GET(query_vport_state_out, out,
					admin_state);

	return ret;
}
int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
u16 vport, u8 other_vport, u8 state)
{
@ -77,6 +99,29 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
}
/*
 * Set the max_tx_speed of a vport with a MODIFY_VPORT_STATE command.
 *
 * @mdev:         device the command is issued on
 * @opmod:        op_mod field of the command
 * @vport:        vport number to modify
 * @other_vport:  other_vport field of the command
 * @max_tx_speed: value written to the command's max_tx_speed field
 *
 * The vport's current admin state is queried first and written back in the
 * same command, presumably so that the modify does not disturb it (confirm
 * against the command definition).
 *
 * Returns 0 on success, or the error from the query or the modify command.
 */
int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
				   u16 vport, u8 other_vport, u16 max_tx_speed)
{
	u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};
	u8 cur_admin_state;
	int ret;

	ret = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport,
					   &cur_admin_state);
	if (ret)
		return ret;

	MLX5_SET(modify_vport_state_in, in, opcode,
		 MLX5_CMD_OP_MODIFY_VPORT_STATE);
	MLX5_SET(modify_vport_state_in, in, op_mod, opmod);
	MLX5_SET(modify_vport_state_in, in, vport_number, vport);
	MLX5_SET(modify_vport_state_in, in, other_vport, other_vport);
	MLX5_SET(modify_vport_state_in, in, admin_state, cur_admin_state);
	MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed);

	return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
}
static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport,
bool other_vport, u32 *out)
{

View File

@ -41,6 +41,8 @@
(MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \
mlx5_core_is_pf(mdev))
/*
 * Divisor applied to a link speed before it is programmed as a vport
 * max_tx_speed. NOTE(review): presumably the field is in units of 100 Mbps;
 * confirm against the device specification.
 */
#define MLX5_MAX_TX_SPEED_UNIT 100
enum {
MLX5_CAP_INLINE_MODE_L2,
MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
@ -58,6 +60,8 @@ enum {
u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
u16 vport, u8 other_vport, u8 state);
/*
 * Set a vport's max_tx_speed while writing back its current admin state.
 * Returns 0 on success or the command error.
 */
int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
u16 vport, u8 other_vport, u16 max_tx_speed);
int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
u16 vport, bool other, u8 *addr);
int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr);