mirror of
https://github.com/torvalds/linux.git
synced 2026-05-24 15:12:13 +02:00
eth: fbnic: add FW health reporter
Add a health reporter to catch FW crashes. Dumping the reporter if FW has not crashed will create a snapshot of FW memory. Reviewed-by: Lee Trager <lee@trager.us> Reviewed-by: Simon Horman <horms@kernel.org> Signed-off-by: Jakub Kicinski <kuba@kernel.org> Link: https://patch.msgid.link/20250916231420.1693955-8-kuba@kernel.org Signed-off-by: Paolo Abeni <pabeni@redhat.com>
This commit is contained in:
parent
5df1d0a084
commit
005a54722e
|
|
@ -69,6 +69,16 @@ On host boot the latest UEFI driver is always used, no explicit activation
|
|||
is required. Firmware activation is required to run new control firmware. cmrt
|
||||
firmware can only be activated by power cycling the NIC.
|
||||
|
||||
Health reporters
|
||||
----------------
|
||||
|
||||
fw reporter
|
||||
~~~~~~~~~~~
|
||||
|
||||
The ``fw`` health reporter tracks FW crashes. Dumping the reporter will
|
||||
show the core dump of the most recent FW crash, and if no FW crash has
|
||||
happened since the last power cycle, a snapshot of the FW memory.
|
||||
|
||||
Statistics
|
||||
----------
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ struct fbnic_dev {
|
|||
struct net_device *netdev;
|
||||
struct dentry *dbg_fbd;
|
||||
struct device *hwmon;
|
||||
struct devlink_health_reporter *fw_reporter;
|
||||
|
||||
u32 __iomem *uc_addr0;
|
||||
u32 __iomem *uc_addr4;
|
||||
|
|
@ -159,8 +160,12 @@ extern char fbnic_driver_name[];
|
|||
|
||||
void fbnic_devlink_free(struct fbnic_dev *fbd);
|
||||
struct fbnic_dev *fbnic_devlink_alloc(struct pci_dev *pdev);
|
||||
int fbnic_devlink_health_create(struct fbnic_dev *fbd);
|
||||
void fbnic_devlink_health_destroy(struct fbnic_dev *fbd);
|
||||
void fbnic_devlink_register(struct fbnic_dev *fbd);
|
||||
void fbnic_devlink_unregister(struct fbnic_dev *fbd);
|
||||
void __printf(2, 3)
|
||||
fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...);
|
||||
|
||||
int fbnic_fw_request_mbx(struct fbnic_dev *fbd);
|
||||
void fbnic_fw_free_mbx(struct fbnic_dev *fbd);
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@
|
|||
#include <net/devlink.h>
|
||||
|
||||
#include "fbnic.h"
|
||||
#include "fbnic_fw.h"
|
||||
#include "fbnic_tlv.h"
|
||||
|
||||
#define FBNIC_SN_STR_LEN 24
|
||||
|
|
@ -369,6 +370,160 @@ static const struct devlink_ops fbnic_devlink_ops = {
|
|||
.flash_update = fbnic_devlink_flash_update,
|
||||
};
|
||||
|
||||
static int fbnic_fw_reporter_dump(struct devlink_health_reporter *reporter,
|
||||
struct devlink_fmsg *fmsg, void *priv_ctx,
|
||||
struct netlink_ext_ack *extack)
|
||||
{
|
||||
struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter);
|
||||
u32 offset, index, index_count, length, size;
|
||||
struct fbnic_fw_completion *fw_cmpl;
|
||||
u8 *dump_data, **data;
|
||||
int err;
|
||||
|
||||
fw_cmpl = fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP);
|
||||
if (!fw_cmpl)
|
||||
return -ENOMEM;
|
||||
|
||||
err = fbnic_fw_xmit_coredump_info_msg(fbd, fw_cmpl, true);
|
||||
if (err) {
|
||||
NL_SET_ERR_MSG_MOD(extack,
|
||||
"Failed to transmit core dump info msg");
|
||||
goto cmpl_free;
|
||||
}
|
||||
if (!wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) {
|
||||
NL_SET_ERR_MSG_MOD(extack,
|
||||
"Timed out waiting on core dump info");
|
||||
err = -ETIMEDOUT;
|
||||
goto cmpl_cleanup;
|
||||
}
|
||||
|
||||
size = fw_cmpl->u.coredump_info.size;
|
||||
err = fw_cmpl->result;
|
||||
|
||||
fbnic_mbx_clear_cmpl(fbd, fw_cmpl);
|
||||
fbnic_fw_put_cmpl(fw_cmpl);
|
||||
|
||||
/* Handle error returned by firmware */
|
||||
if (err) {
|
||||
NL_SET_ERR_MSG_MOD(extack, "Firmware core dump returned error");
|
||||
return err;
|
||||
}
|
||||
if (!size) {
|
||||
NL_SET_ERR_MSG_MOD(extack,
|
||||
"Firmware core dump returned size 0");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
/* Read the dump, we can only transfer TLV_MAX_DATA at a time */
|
||||
index_count = DIV_ROUND_UP(size, TLV_MAX_DATA);
|
||||
|
||||
fw_cmpl = __fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP,
|
||||
sizeof(void *) * index_count + size);
|
||||
if (!fw_cmpl)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Populate pointer table w/ pointer offsets */
|
||||
dump_data = (void *)&fw_cmpl->u.coredump.data[index_count];
|
||||
data = fw_cmpl->u.coredump.data;
|
||||
fw_cmpl->u.coredump.size = size;
|
||||
fw_cmpl->u.coredump.stride = TLV_MAX_DATA;
|
||||
|
||||
for (index = 0; index < index_count; index++) {
|
||||
/* First iteration installs completion */
|
||||
struct fbnic_fw_completion *cmpl_arg = index ? NULL : fw_cmpl;
|
||||
|
||||
offset = index * TLV_MAX_DATA;
|
||||
length = min(size - offset, TLV_MAX_DATA);
|
||||
|
||||
data[index] = dump_data + offset;
|
||||
err = fbnic_fw_xmit_coredump_read_msg(fbd, cmpl_arg,
|
||||
offset, length);
|
||||
if (err) {
|
||||
NL_SET_ERR_MSG_MOD(extack,
|
||||
"Failed to transmit core dump msg");
|
||||
if (cmpl_arg)
|
||||
goto cmpl_free;
|
||||
else
|
||||
goto cmpl_cleanup;
|
||||
}
|
||||
|
||||
if (wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) {
|
||||
reinit_completion(&fw_cmpl->done);
|
||||
} else {
|
||||
NL_SET_ERR_MSG_FMT_MOD(extack,
|
||||
"Timed out waiting on core dump (%d/%d)",
|
||||
index + 1, index_count);
|
||||
err = -ETIMEDOUT;
|
||||
goto cmpl_cleanup;
|
||||
}
|
||||
|
||||
/* If we didn't see the reply record as incomplete */
|
||||
if (fw_cmpl->u.coredump.data[index]) {
|
||||
NL_SET_ERR_MSG_FMT_MOD(extack,
|
||||
"No data for core dump chunk (%d/%d)",
|
||||
index + 1, index_count);
|
||||
err = -EIO;
|
||||
goto cmpl_cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
devlink_fmsg_binary_pair_nest_start(fmsg, "FW coredump");
|
||||
|
||||
for (offset = 0; offset < size; offset += length) {
|
||||
length = min_t(u32, size - offset, TLV_MAX_DATA);
|
||||
|
||||
devlink_fmsg_binary_put(fmsg, dump_data + offset, length);
|
||||
}
|
||||
|
||||
devlink_fmsg_binary_pair_nest_end(fmsg);
|
||||
|
||||
cmpl_cleanup:
|
||||
fbnic_mbx_clear_cmpl(fbd, fw_cmpl);
|
||||
cmpl_free:
|
||||
fbnic_fw_put_cmpl(fw_cmpl);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/**
 * fbnic_devlink_fw_report - report a firmware event through the fw reporter
 * @fbd: device whose fw health reporter receives the report
 * @format: printf-style format string for the message
 * @...: arguments consumed by @format
 *
 * The message is formatted into a stack buffer of FBNIC_FW_LOG_MAX_SIZE
 * bytes and passed to devlink_health_report() with @fbd as the context.
 * If fbnic_fw_log_ready() says the firmware log is available, the same
 * text is also written there, stamped with fbd->firmware_time.
 */
void __printf(2, 3)
fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...)
{
	char msg[FBNIC_FW_LOG_MAX_SIZE];
	va_list ap;

	va_start(ap, format);
	vsnprintf(msg, sizeof(msg), format, ap);
	va_end(ap);

	devlink_health_report(fbd->fw_reporter, msg, fbd);

	/* Nothing more to do when the firmware log is not set up */
	if (!fbnic_fw_log_ready(fbd))
		return;

	fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg);
}
|
||||
|
||||
static const struct devlink_health_reporter_ops fbnic_fw_ops = {
|
||||
.name = "fw",
|
||||
.dump = fbnic_fw_reporter_dump,
|
||||
};
|
||||
|
||||
int fbnic_devlink_health_create(struct fbnic_dev *fbd)
|
||||
{
|
||||
fbd->fw_reporter = devlink_health_reporter_create(priv_to_devlink(fbd),
|
||||
&fbnic_fw_ops, fbd);
|
||||
if (IS_ERR(fbd->fw_reporter)) {
|
||||
dev_warn(fbd->dev,
|
||||
"Failed to create FW fault reporter: %pe\n",
|
||||
fbd->fw_reporter);
|
||||
return PTR_ERR(fbd->fw_reporter);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void fbnic_devlink_health_destroy(struct fbnic_dev *fbd)
|
||||
{
|
||||
devlink_health_reporter_destroy(fbd->fw_reporter);
|
||||
}
|
||||
|
||||
void fbnic_devlink_free(struct fbnic_dev *fbd)
|
||||
{
|
||||
struct devlink *devlink = priv_to_devlink(fbd);
|
||||
|
|
|
|||
|
|
@ -196,6 +196,8 @@ static void fbnic_health_check(struct fbnic_dev *fbd)
|
|||
if (tx_mbx->head != tx_mbx->tail)
|
||||
return;
|
||||
|
||||
fbnic_devlink_fw_report(fbd, "Firmware crashed detected!");
|
||||
|
||||
if (fbnic_fw_config_after_crash(fbd))
|
||||
dev_err(fbd->dev, "Firmware recovery failed after crash\n");
|
||||
}
|
||||
|
|
@ -278,6 +280,10 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
|
|||
return -ENOMEM;
|
||||
}
|
||||
|
||||
err = fbnic_devlink_health_create(fbd);
|
||||
if (err)
|
||||
goto free_fbd;
|
||||
|
||||
/* Populate driver with hardware-specific info and handlers */
|
||||
fbd->max_num_queues = info->max_num_queues;
|
||||
|
||||
|
|
@ -288,7 +294,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
|
|||
|
||||
err = fbnic_alloc_irqs(fbd);
|
||||
if (err)
|
||||
goto free_fbd;
|
||||
goto err_destroy_health;
|
||||
|
||||
err = fbnic_mac_init(fbd);
|
||||
if (err) {
|
||||
|
|
@ -357,6 +363,8 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
|
|||
return 0;
|
||||
free_irqs:
|
||||
fbnic_free_irqs(fbd);
|
||||
err_destroy_health:
|
||||
fbnic_devlink_health_destroy(fbd);
|
||||
free_fbd:
|
||||
fbnic_devlink_free(fbd);
|
||||
|
||||
|
|
@ -391,6 +399,7 @@ static void fbnic_remove(struct pci_dev *pdev)
|
|||
fbnic_fw_free_mbx(fbd);
|
||||
fbnic_free_irqs(fbd);
|
||||
|
||||
fbnic_devlink_health_destroy(fbd);
|
||||
fbnic_devlink_free(fbd);
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user