From e6c8ab0a11293ac44a2d5550cafaf775ccf64ea0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:12 -0700 Subject: [PATCH 1/9] eth: fbnic: make fbnic_fw_log_write() parameter const Make the log message parameter const, it's not modified and this lets us pass in strings which are const for the caller. Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250916231420.1693955-2-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c | 2 +- drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c index c1663f042245..85a883dba385 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.c @@ -72,7 +72,7 @@ void fbnic_fw_log_free(struct fbnic_dev *fbd) } int fbnic_fw_log_write(struct fbnic_dev *fbd, u64 index, u32 timestamp, - char *msg) + const char *msg) { struct fbnic_fw_log_entry *entry, *head, *tail, *next; struct fbnic_fw_log *log = &fbd->fw_log; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h index cb6555f40a24..50ec79003108 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw_log.h @@ -41,5 +41,5 @@ void fbnic_fw_log_disable(struct fbnic_dev *fbd); int fbnic_fw_log_init(struct fbnic_dev *fbd); void fbnic_fw_log_free(struct fbnic_dev *fbd); int fbnic_fw_log_write(struct fbnic_dev *fbd, u64 index, u32 timestamp, - char *msg); + const char *msg); #endif /* _FBNIC_FW_LOG_H_ */ From 7fd1f7bac2b878eb203fb316e03e1b87e8b86f6e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:13 -0700 Subject: [PATCH 2/9] eth: fbnic: use fw uptime to detect fw crashes Currently we only detect FW crashes when it stops responding to heartbeat messages. 
FW has a watchdog which will reset it in case of crashes. Use FW uptime sent in the ownership and heartbeat messages to detect that the watchdog has fired (uptime went down). Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250916231420.1693955-3-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic.h | 4 ++++ drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 17 ++++++++++++++++- drivers/net/ethernet/meta/fbnic/fbnic_fw.h | 7 +++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 311c7dda911a..09058d847729 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -84,6 +84,10 @@ struct fbnic_dev { /* Local copy of hardware statistics */ struct fbnic_hw_stats hw_stats; + /* Firmware time since boot in milliseconds */ + u64 firmware_time; + u64 prev_firmware_time; + struct fbnic_fw_log fw_log; }; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index 6e580654493c..9b39a73e4c35 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -495,6 +495,11 @@ int fbnic_fw_xmit_ownership_msg(struct fbnic_dev *fbd, bool take_ownership) fbd->last_heartbeat_request = req_time; + /* Set prev_firmware_time to 0 to avoid triggering firmware crash + * detection until we receive the second uptime in a heartbeat resp. 
+ */ + fbd->prev_firmware_time = 0; + /* Set heartbeat detection based on if we are taking ownership */ fbd->fw_heartbeat_enabled = take_ownership; @@ -660,6 +665,7 @@ static int fbnic_fw_parse_cap_resp(void *opaque, struct fbnic_tlv_msg **results) } static const struct fbnic_tlv_index fbnic_ownership_resp_index[] = { + FBNIC_TLV_ATTR_U64(FBNIC_FW_OWNERSHIP_TIME), FBNIC_TLV_ATTR_LAST }; @@ -671,10 +677,14 @@ static int fbnic_fw_parse_ownership_resp(void *opaque, /* Count the ownership response as a heartbeat reply */ fbd->last_heartbeat_response = jiffies; + /* Capture firmware time for logging and firmware crash check */ + fbd->firmware_time = fta_get_uint(results, FBNIC_FW_OWNERSHIP_TIME); + return 0; } static const struct fbnic_tlv_index fbnic_heartbeat_resp_index[] = { + FBNIC_TLV_ATTR_U64(FBNIC_FW_HEARTBEAT_UPTIME), FBNIC_TLV_ATTR_LAST }; @@ -685,6 +695,9 @@ static int fbnic_fw_parse_heartbeat_resp(void *opaque, fbd->last_heartbeat_response = jiffies; + /* Capture firmware time for logging and firmware crash check */ + fbd->firmware_time = fta_get_uint(results, FBNIC_FW_HEARTBEAT_UPTIME); + return 0; } @@ -706,6 +719,7 @@ static int fbnic_fw_xmit_heartbeat_message(struct fbnic_dev *fbd) goto free_message; fbd->last_heartbeat_request = req_time; + fbd->prev_firmware_time = fbd->firmware_time; return err; @@ -766,7 +780,8 @@ void fbnic_fw_check_heartbeat(struct fbnic_dev *fbd) return; /* Was the last heartbeat response long time ago? 
*/ - if (!fbnic_fw_heartbeat_current(fbd)) { + if (!fbnic_fw_heartbeat_current(fbd) || + fbd->firmware_time < fbd->prev_firmware_time) { dev_warn(fbd->dev, "Firmware did not respond to heartbeat message\n"); fbd->fw_heartbeat_enabled = false; diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h index ec67b80809b0..be7f2dc88698 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h @@ -198,9 +198,16 @@ enum { enum { FBNIC_FW_OWNERSHIP_FLAG = 0x0, + FBNIC_FW_OWNERSHIP_TIME = 0x1, FBNIC_FW_OWNERSHIP_MSG_MAX }; +enum { + FBNIC_FW_HEARTBEAT_UPTIME = 0x0, + FBNIC_FW_HEARTBEAT_NUMBER_OF_MESSAGES = 0x1, + FBNIC_FW_HEARTBEAT_MSG_MAX +}; + enum { FBNIC_FW_START_UPGRADE_ERROR = 0x0, FBNIC_FW_START_UPGRADE_SECTION = 0x1, From 504f8b7119eb44a9b04b5249356405ebb302e382 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:14 -0700 Subject: [PATCH 3/9] eth: fbnic: factor out clearing the action TCAM We'll want to wipe the driver TCAM state after FW crash, to force a re-programming. Factor out the clearing logic. Remove the micro-optimization to skip clearing the BMC entry twice, it doesn't hurt. 
Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250916231420.1693955-4-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_rpc.c | 36 ++++++++++++--------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c index 4284b3cb7fcc..d944d0fdd3b7 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c @@ -1124,13 +1124,25 @@ void fbnic_write_ip_addr(struct fbnic_dev *fbd) } } -void fbnic_clear_rules(struct fbnic_dev *fbd) +static void fbnic_clear_valid_act_tcam(struct fbnic_dev *fbd) { - u32 dest = FIELD_PREP(FBNIC_RPC_ACT_TBL0_DEST_MASK, - FBNIC_RPC_ACT_TBL0_DEST_BMC); int i = FBNIC_RPC_TCAM_ACT_NUM_ENTRIES - 1; struct fbnic_act_tcam *act_tcam; + /* Work from the bottom up deleting all other rules from hardware */ + do { + act_tcam = &fbd->act_tcam[i]; + + if (act_tcam->state != FBNIC_TCAM_S_VALID) + continue; + + fbnic_clear_act_tcam(fbd, i); + act_tcam->state = FBNIC_TCAM_S_UPDATE; + } while (i--); +} + +void fbnic_clear_rules(struct fbnic_dev *fbd) +{ /* Clear MAC rules */ fbnic_clear_macda(fbd); @@ -1145,6 +1157,11 @@ void fbnic_clear_rules(struct fbnic_dev *fbd) * the interface back up. 
*/ if (fbnic_bmc_present(fbd)) { + u32 dest = FIELD_PREP(FBNIC_RPC_ACT_TBL0_DEST_MASK, + FBNIC_RPC_ACT_TBL0_DEST_BMC); + int i = FBNIC_RPC_TCAM_ACT_NUM_ENTRIES - 1; + struct fbnic_act_tcam *act_tcam; + act_tcam = &fbd->act_tcam[i]; if (act_tcam->state == FBNIC_TCAM_S_VALID && @@ -1153,21 +1170,10 @@ void fbnic_clear_rules(struct fbnic_dev *fbd) wr32(fbd, FBNIC_RPC_ACT_TBL1(i), 0); act_tcam->state = FBNIC_TCAM_S_UPDATE; - - i--; } } - /* Work from the bottom up deleting all other rules from hardware */ - do { - act_tcam = &fbd->act_tcam[i]; - - if (act_tcam->state != FBNIC_TCAM_S_VALID) - continue; - - fbnic_clear_act_tcam(fbd, i); - act_tcam->state = FBNIC_TCAM_S_UPDATE; - } while (i--); + fbnic_clear_valid_act_tcam(fbd); } static void fbnic_delete_act_tcam(struct fbnic_dev *fbd, unsigned int idx) From 6ae7da8e9e069ff63ceea9953d53920b136ff008 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:15 -0700 Subject: [PATCH 4/9] eth: fbnic: reprogram TCAMs after FW crash FW may mess with the TCAM after it boots, to try to restore the traffic flow to the BMC (it may not be aware that the host is already up). Make sure that we reprogram the TCAMs after detecting a crash. 
Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250916231420.1693955-5-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic.h | 2 ++ drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 23 ++++++++++++++------- drivers/net/ethernet/meta/fbnic/fbnic_rpc.c | 21 +++++++++++++++++++ 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 09058d847729..b364c2f0724b 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -191,6 +191,8 @@ void fbnic_dbg_fbd_exit(struct fbnic_dev *fbd); void fbnic_dbg_init(void); void fbnic_dbg_exit(void); +void fbnic_rpc_reset_valid_entries(struct fbnic_dev *fbd); + void fbnic_csr_get_regs(struct fbnic_dev *fbd, u32 *data, u32 *regs_version); int fbnic_csr_regs_len(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 9fdc8f4f36cc..7d9b93f8ebd8 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -167,6 +167,20 @@ void fbnic_down(struct fbnic_net *fbn) fbnic_flush(fbn); } +static int fbnic_fw_config_after_crash(struct fbnic_dev *fbd) +{ + if (fbnic_fw_xmit_ownership_msg(fbd, true)) { + dev_err(fbd->dev, "NIC failed to take ownership\n"); + + return -1; + } + + fbnic_rpc_reset_valid_entries(fbd); + __fbnic_set_rx_mode(fbd); + + return 0; +} + static void fbnic_health_check(struct fbnic_dev *fbd) { struct fbnic_fw_mbx *tx_mbx = &fbd->mbx[FBNIC_IPC_MBX_TX_IDX]; @@ -182,13 +196,8 @@ static void fbnic_health_check(struct fbnic_dev *fbd) if (tx_mbx->head != tx_mbx->tail) return; - /* TBD: Need to add a more thorough recovery here. - * Specifically I need to verify what all the firmware will have - * changed since we had setup and it rebooted. May just need to - * perform a down/up. 
For now we will just reclaim ownership so - * the heartbeat can catch the next fault. - */ - fbnic_fw_xmit_ownership_msg(fbd, true); + if (fbnic_fw_config_after_crash(fbd)) + dev_err(fbd->dev, "Firmware recovery failed after crash\n"); } static void fbnic_service_task(struct work_struct *work) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c index d944d0fdd3b7..7f31e890031c 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c @@ -596,6 +596,21 @@ static void fbnic_clear_macda(struct fbnic_dev *fbd) } } +static void fbnic_clear_valid_macda(struct fbnic_dev *fbd) +{ + int idx; + + for (idx = ARRAY_SIZE(fbd->mac_addr); idx--;) { + struct fbnic_mac_addr *mac_addr = &fbd->mac_addr[idx]; + + if (mac_addr->state == FBNIC_TCAM_S_VALID) { + fbnic_clear_macda_entry(fbd, idx); + + mac_addr->state = FBNIC_TCAM_S_UPDATE; + } + } +} + static void fbnic_write_macda_entry(struct fbnic_dev *fbd, unsigned int idx, struct fbnic_mac_addr *mac_addr) { @@ -1223,3 +1238,9 @@ void fbnic_write_rules(struct fbnic_dev *fbd) fbnic_update_act_tcam(fbd, i); } } + +void fbnic_rpc_reset_valid_entries(struct fbnic_dev *fbd) +{ + fbnic_clear_valid_act_tcam(fbd); + fbnic_clear_valid_macda(fbd); +} From a8896d14fc0c1f1b8d532be5605c05440b15d881 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:16 -0700 Subject: [PATCH 5/9] eth: fbnic: support allocating FW completions with extra space Support allocating extra space after the FW completion. This makes it easy to pass extra variable size buffer space to FW response handlers without worrying about synchronization (completion itself is already refcounted). 
Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250916231420.1693955-6-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 10 ++++++++-- drivers/net/ethernet/meta/fbnic/fbnic_fw.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index 9b39a73e4c35..198922a942b2 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -1544,11 +1544,12 @@ void fbnic_get_fw_ver_commit_str(struct fbnic_dev *fbd, char *fw_version, fw_version, str_sz); } -struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type) +struct fbnic_fw_completion *__fbnic_fw_alloc_cmpl(u32 msg_type, + size_t priv_size) { struct fbnic_fw_completion *cmpl; - cmpl = kzalloc(sizeof(*cmpl), GFP_KERNEL); + cmpl = kzalloc(sizeof(*cmpl) + priv_size, GFP_KERNEL); if (!cmpl) return NULL; @@ -1559,6 +1560,11 @@ struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type) return cmpl; } +struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type) +{ + return __fbnic_fw_alloc_cmpl(msg_type, 0); +} + void fbnic_fw_put_cmpl(struct fbnic_fw_completion *fw_cmpl) { kref_put(&fw_cmpl->ref_count, fbnic_fw_release_cmpl_data); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h index be7f2dc88698..d4c0fb4c94cc 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h @@ -100,6 +100,8 @@ int fbnic_fw_xmit_tsene_read_msg(struct fbnic_dev *fbd, int fbnic_fw_xmit_send_logs(struct fbnic_dev *fbd, bool enable, bool send_log_history); int fbnic_fw_xmit_rpc_macda_sync(struct fbnic_dev *fbd); +struct fbnic_fw_completion *__fbnic_fw_alloc_cmpl(u32 msg_type, + size_t priv_size); struct fbnic_fw_completion *fbnic_fw_alloc_cmpl(u32 msg_type); void fbnic_fw_put_cmpl(struct fbnic_fw_completion *cmpl_data); 
From 5df1d0a08483eff13f0da1cc66883e0bc2cf4fcf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:17 -0700 Subject: [PATCH 6/9] eth: fbnic: support FW communication for core dump To read FW core dump we need to issue two commands to FW: - first get the FW core dump info - second read the dump chunk by chunk Implement these two FW commands. Subsequent commits will use them to expose FW dump via devlink health. Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250916231420.1693955-7-kuba@kernel.org Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_fw.c | 214 +++++++++++++++++++++ drivers/net/ethernet/meta/fbnic/fbnic_fw.h | 38 ++++ 2 files changed, 252 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c index 198922a942b2..6c3e7f81a2ed 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.c @@ -793,6 +793,215 @@ void fbnic_fw_check_heartbeat(struct fbnic_dev *fbd) dev_warn(fbd->dev, "Failed to send heartbeat message\n"); } +/** + * fbnic_fw_xmit_coredump_info_msg - Create and transmit a coredump info message + * @fbd: FBNIC device structure + * @cmpl_data: Structure to store info in + * @force: Force coredump event if one hasn't already occurred + * + * Return: zero on success, negative errno on failure + * + * Asks the FW for info related to coredump. If a coredump doesn't exist it + * can optionally force one if force is true. 
+ */ +int fbnic_fw_xmit_coredump_info_msg(struct fbnic_dev *fbd, + struct fbnic_fw_completion *cmpl_data, + bool force) +{ + struct fbnic_tlv_msg *msg; + int err = 0; + + msg = fbnic_tlv_msg_alloc(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_REQ); + if (!msg) + return -ENOMEM; + + if (force) { + err = fbnic_tlv_attr_put_flag(msg, FBNIC_FW_COREDUMP_REQ_INFO_CREATE); + if (err) + goto free_msg; + } + + err = fbnic_mbx_map_req_w_cmpl(fbd, msg, cmpl_data); + if (err) + goto free_msg; + + return 0; + +free_msg: + free_page((unsigned long)msg); + return err; +} + +static const struct fbnic_tlv_index fbnic_coredump_info_resp_index[] = { + FBNIC_TLV_ATTR_FLAG(FBNIC_FW_COREDUMP_INFO_AVAILABLE), + FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_INFO_SIZE), + FBNIC_TLV_ATTR_S32(FBNIC_FW_COREDUMP_INFO_ERROR), + FBNIC_TLV_ATTR_LAST +}; + +static int +fbnic_fw_parse_coredump_info_resp(void *opaque, struct fbnic_tlv_msg **results) +{ + struct fbnic_fw_completion *cmpl_data; + struct fbnic_dev *fbd = opaque; + u32 msg_type; + s32 err; + + /* Verify we have a completion pointer to provide with data */ + msg_type = FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP; + cmpl_data = fbnic_fw_get_cmpl_by_type(fbd, msg_type); + if (!cmpl_data) + return -ENOSPC; + + err = fta_get_sint(results, FBNIC_FW_COREDUMP_INFO_ERROR); + if (err) + goto msg_err; + + if (!results[FBNIC_FW_COREDUMP_INFO_AVAILABLE]) { + err = -ENOENT; + goto msg_err; + } + + cmpl_data->u.coredump_info.size = + fta_get_uint(results, FBNIC_FW_COREDUMP_INFO_SIZE); + +msg_err: + cmpl_data->result = err; + complete(&cmpl_data->done); + fbnic_fw_put_cmpl(cmpl_data); + + return err; +} + +/** + * fbnic_fw_xmit_coredump_read_msg - Create and transmit a coredump read request + * @fbd: FBNIC device structure + * @cmpl_data: Completion struct to store coredump + * @offset: Offset into coredump requested + * @length: Length of section of cordeump to fetch + * + * Return: zero on success, negative errno on failure + * + * Asks the firmware to provide a section of 
the cordeump back in a message. + * The response will have an offset and size matching the values provided. + */ +int fbnic_fw_xmit_coredump_read_msg(struct fbnic_dev *fbd, + struct fbnic_fw_completion *cmpl_data, + u32 offset, u32 length) +{ + struct fbnic_tlv_msg *msg; + int err = 0; + + msg = fbnic_tlv_msg_alloc(FBNIC_TLV_MSG_ID_COREDUMP_READ_REQ); + if (!msg) + return -ENOMEM; + + if (offset) { + err = fbnic_tlv_attr_put_int(msg, FBNIC_FW_COREDUMP_READ_OFFSET, + offset); + if (err) + goto free_message; + } + + if (length) { + err = fbnic_tlv_attr_put_int(msg, FBNIC_FW_COREDUMP_READ_LENGTH, + length); + if (err) + goto free_message; + } + + err = fbnic_mbx_map_req_w_cmpl(fbd, msg, cmpl_data); + if (err) + goto free_message; + + return 0; + +free_message: + free_page((unsigned long)msg); + return err; +} + +static const struct fbnic_tlv_index fbnic_coredump_resp_index[] = { + FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_READ_OFFSET), + FBNIC_TLV_ATTR_U32(FBNIC_FW_COREDUMP_READ_LENGTH), + FBNIC_TLV_ATTR_RAW_DATA(FBNIC_FW_COREDUMP_READ_DATA), + FBNIC_TLV_ATTR_S32(FBNIC_FW_COREDUMP_READ_ERROR), + FBNIC_TLV_ATTR_LAST +}; + +static int fbnic_fw_parse_coredump_resp(void *opaque, + struct fbnic_tlv_msg **results) +{ + struct fbnic_fw_completion *cmpl_data; + u32 index, last_offset, last_length; + struct fbnic_dev *fbd = opaque; + struct fbnic_tlv_msg *data_hdr; + u32 length, offset; + u32 msg_type; + s32 err; + + /* Verify we have a completion pointer to provide with data */ + msg_type = FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP; + cmpl_data = fbnic_fw_get_cmpl_by_type(fbd, msg_type); + if (!cmpl_data) + return -ENOSPC; + + err = fta_get_sint(results, FBNIC_FW_COREDUMP_READ_ERROR); + if (err) + goto msg_err; + + data_hdr = results[FBNIC_FW_COREDUMP_READ_DATA]; + if (!data_hdr) { + err = -ENODATA; + goto msg_err; + } + + offset = fta_get_uint(results, FBNIC_FW_COREDUMP_READ_OFFSET); + length = fta_get_uint(results, FBNIC_FW_COREDUMP_READ_LENGTH); + + if (length > 
le16_to_cpu(data_hdr->hdr.len) - sizeof(u32)) { + dev_err(fbd->dev, "length greater than size of message\n"); + err = -EINVAL; + goto msg_err; + } + + /* Only the last offset can have a length != stride */ + last_length = + (cmpl_data->u.coredump.size % cmpl_data->u.coredump.stride) ? : + cmpl_data->u.coredump.stride; + last_offset = cmpl_data->u.coredump.size - last_length; + + /* Verify offset and length */ + if (offset % cmpl_data->u.coredump.stride || offset > last_offset) { + dev_err(fbd->dev, "offset %d out of range\n", offset); + err = -EINVAL; + } else if (length != ((offset == last_offset) ? + last_length : cmpl_data->u.coredump.stride)) { + dev_err(fbd->dev, "length %d out of range for offset %d\n", + length, offset); + err = -EINVAL; + } + if (err) + goto msg_err; + + /* If data pointer is NULL it is already filled, just skip the copy */ + index = offset / cmpl_data->u.coredump.stride; + if (!cmpl_data->u.coredump.data[index]) + goto msg_err; + + /* Copy data and mark index filled by setting pointer to NULL */ + memcpy(cmpl_data->u.coredump.data[index], + fbnic_tlv_attr_get_value_ptr(data_hdr), length); + cmpl_data->u.coredump.data[index] = NULL; + +msg_err: + cmpl_data->result = err; + complete(&cmpl_data->done); + fbnic_fw_put_cmpl(cmpl_data); + + return err; +} + int fbnic_fw_xmit_fw_start_upgrade(struct fbnic_dev *fbd, struct fbnic_fw_completion *cmpl_data, unsigned int id, unsigned int len) @@ -1222,6 +1431,11 @@ static const struct fbnic_tlv_parser fbnic_fw_tlv_parser[] = { fbnic_fw_parse_ownership_resp), FBNIC_TLV_PARSER(HEARTBEAT_RESP, fbnic_heartbeat_resp_index, fbnic_fw_parse_heartbeat_resp), + FBNIC_TLV_PARSER(COREDUMP_GET_INFO_RESP, + fbnic_coredump_info_resp_index, + fbnic_fw_parse_coredump_info_resp), + FBNIC_TLV_PARSER(COREDUMP_READ_RESP, fbnic_coredump_resp_index, + fbnic_fw_parse_coredump_resp), FBNIC_TLV_PARSER(FW_START_UPGRADE_RESP, fbnic_fw_start_upgrade_resp_index, fbnic_fw_parse_fw_start_upgrade_resp), diff --git 
a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h index d4c0fb4c94cc..d776be9fc7f7 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_fw.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_fw.h @@ -66,6 +66,14 @@ struct fbnic_fw_completion { struct kref ref_count; int result; union { + struct { + u32 size; + } coredump_info; + struct { + u32 size; + u16 stride; + u8 *data[]; + } coredump; struct { u32 offset; u32 length; @@ -89,6 +97,12 @@ void fbnic_mbx_flush_tx(struct fbnic_dev *fbd); int fbnic_fw_xmit_ownership_msg(struct fbnic_dev *fbd, bool take_ownership); int fbnic_fw_init_heartbeat(struct fbnic_dev *fbd, bool poll); void fbnic_fw_check_heartbeat(struct fbnic_dev *fbd); +int fbnic_fw_xmit_coredump_info_msg(struct fbnic_dev *fbd, + struct fbnic_fw_completion *cmpl_data, + bool force); +int fbnic_fw_xmit_coredump_read_msg(struct fbnic_dev *fbd, + struct fbnic_fw_completion *cmpl_data, + u32 offset, u32 length); int fbnic_fw_xmit_fw_start_upgrade(struct fbnic_dev *fbd, struct fbnic_fw_completion *cmpl_data, unsigned int id, unsigned int len); @@ -137,6 +151,10 @@ enum { FBNIC_TLV_MSG_ID_OWNERSHIP_RESP = 0x13, FBNIC_TLV_MSG_ID_HEARTBEAT_REQ = 0x14, FBNIC_TLV_MSG_ID_HEARTBEAT_RESP = 0x15, + FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_REQ = 0x18, + FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP = 0x19, + FBNIC_TLV_MSG_ID_COREDUMP_READ_REQ = 0x20, + FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP = 0x21, FBNIC_TLV_MSG_ID_FW_START_UPGRADE_REQ = 0x22, FBNIC_TLV_MSG_ID_FW_START_UPGRADE_RESP = 0x23, FBNIC_TLV_MSG_ID_FW_WRITE_CHUNK_REQ = 0x24, @@ -210,6 +228,26 @@ enum { FBNIC_FW_HEARTBEAT_MSG_MAX }; +enum { + FBNIC_FW_COREDUMP_REQ_INFO_CREATE = 0x0, + FBNIC_FW_COREDUMP_REQ_INFO_MSG_MAX +}; + +enum { + FBNIC_FW_COREDUMP_INFO_AVAILABLE = 0x0, + FBNIC_FW_COREDUMP_INFO_SIZE = 0x1, + FBNIC_FW_COREDUMP_INFO_ERROR = 0x2, + FBNIC_FW_COREDUMP_INFO_MSG_MAX +}; + +enum { + FBNIC_FW_COREDUMP_READ_OFFSET = 0x0, + FBNIC_FW_COREDUMP_READ_LENGTH = 0x1, + 
FBNIC_FW_COREDUMP_READ_DATA = 0x2, + FBNIC_FW_COREDUMP_READ_ERROR = 0x3, + FBNIC_FW_COREDUMP_READ_MSG_MAX +}; + enum { FBNIC_FW_START_UPGRADE_ERROR = 0x0, FBNIC_FW_START_UPGRADE_SECTION = 0x1, From 005a54722e9d493be58405a77f2a444e06f03be0 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:18 -0700 Subject: [PATCH 7/9] eth: fbnic: add FW health reporter Add a health reporter to catch FW crashes. Dumping the reporter if FW has not crashed will create a snapshot of FW memory. Reviewed-by: Lee Trager Reviewed-by: Simon Horman Signed-off-by: Jakub Kicinski Link: https://patch.msgid.link/20250916231420.1693955-8-kuba@kernel.org Signed-off-by: Paolo Abeni --- .../device_drivers/ethernet/meta/fbnic.rst | 10 ++ drivers/net/ethernet/meta/fbnic/fbnic.h | 5 + .../net/ethernet/meta/fbnic/fbnic_devlink.c | 155 ++++++++++++++++++ drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 11 +- 4 files changed, 180 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index fb6559fa4be4..62693566ff1f 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -69,6 +69,16 @@ On host boot the latest UEFI driver is always used, no explicit activation is required. Firmware activation is required to run new control firmware. cmrt firmware can only be activated by power cycling the NIC. +Health reporters +---------------- + +fw reporter +~~~~~~~~~~~ + +The ``fw`` health reporter tracks FW crashes. Dumping the reporter will +show the core dump of the most recent FW crash, and if no FW crash has +happened since power cycle - a snapshot of the FW memory. 
+ Statistics ---------- diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index b364c2f0724b..5f99976de0bb 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -27,6 +27,7 @@ struct fbnic_dev { struct net_device *netdev; struct dentry *dbg_fbd; struct device *hwmon; + struct devlink_health_reporter *fw_reporter; u32 __iomem *uc_addr0; u32 __iomem *uc_addr4; @@ -159,8 +160,12 @@ extern char fbnic_driver_name[]; void fbnic_devlink_free(struct fbnic_dev *fbd); struct fbnic_dev *fbnic_devlink_alloc(struct pci_dev *pdev); +int fbnic_devlink_health_create(struct fbnic_dev *fbd); +void fbnic_devlink_health_destroy(struct fbnic_dev *fbd); void fbnic_devlink_register(struct fbnic_dev *fbd); void fbnic_devlink_unregister(struct fbnic_dev *fbd); +void __printf(2, 3) +fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...); int fbnic_fw_request_mbx(struct fbnic_dev *fbd); void fbnic_fw_free_mbx(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c index c5f81f139e7e..195245fb1a96 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c @@ -8,6 +8,7 @@ #include #include "fbnic.h" +#include "fbnic_fw.h" #include "fbnic_tlv.h" #define FBNIC_SN_STR_LEN 24 @@ -369,6 +370,160 @@ static const struct devlink_ops fbnic_devlink_ops = { .flash_update = fbnic_devlink_flash_update, }; +static int fbnic_fw_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter); + u32 offset, index, index_count, length, size; + struct fbnic_fw_completion *fw_cmpl; + u8 *dump_data, **data; + int err; + + fw_cmpl = fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_GET_INFO_RESP); + if (!fw_cmpl) + return -ENOMEM; + + err = 
fbnic_fw_xmit_coredump_info_msg(fbd, fw_cmpl, true); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to transmit core dump info msg"); + goto cmpl_free; + } + if (!wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) { + NL_SET_ERR_MSG_MOD(extack, + "Timed out waiting on core dump info"); + err = -ETIMEDOUT; + goto cmpl_cleanup; + } + + size = fw_cmpl->u.coredump_info.size; + err = fw_cmpl->result; + + fbnic_mbx_clear_cmpl(fbd, fw_cmpl); + fbnic_fw_put_cmpl(fw_cmpl); + + /* Handle error returned by firmware */ + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Firmware core dump returned error"); + return err; + } + if (!size) { + NL_SET_ERR_MSG_MOD(extack, + "Firmware core dump returned size 0"); + return -EIO; + } + + /* Read the dump, we can only transfer TLV_MAX_DATA at a time */ + index_count = DIV_ROUND_UP(size, TLV_MAX_DATA); + + fw_cmpl = __fbnic_fw_alloc_cmpl(FBNIC_TLV_MSG_ID_COREDUMP_READ_RESP, + sizeof(void *) * index_count + size); + if (!fw_cmpl) + return -ENOMEM; + + /* Populate pointer table w/ pointer offsets */ + dump_data = (void *)&fw_cmpl->u.coredump.data[index_count]; + data = fw_cmpl->u.coredump.data; + fw_cmpl->u.coredump.size = size; + fw_cmpl->u.coredump.stride = TLV_MAX_DATA; + + for (index = 0; index < index_count; index++) { + /* First iteration installs completion */ + struct fbnic_fw_completion *cmpl_arg = index ? 
NULL : fw_cmpl; + + offset = index * TLV_MAX_DATA; + length = min(size - offset, TLV_MAX_DATA); + + data[index] = dump_data + offset; + err = fbnic_fw_xmit_coredump_read_msg(fbd, cmpl_arg, + offset, length); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to transmit core dump msg"); + if (cmpl_arg) + goto cmpl_free; + else + goto cmpl_cleanup; + } + + if (wait_for_completion_timeout(&fw_cmpl->done, 2 * HZ)) { + reinit_completion(&fw_cmpl->done); + } else { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Timed out waiting on core dump (%d/%d)", + index + 1, index_count); + err = -ETIMEDOUT; + goto cmpl_cleanup; + } + + /* If we didn't see the reply record as incomplete */ + if (fw_cmpl->u.coredump.data[index]) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "No data for core dump chunk (%d/%d)", + index + 1, index_count); + err = -EIO; + goto cmpl_cleanup; + } + } + + devlink_fmsg_binary_pair_nest_start(fmsg, "FW coredump"); + + for (offset = 0; offset < size; offset += length) { + length = min_t(u32, size - offset, TLV_MAX_DATA); + + devlink_fmsg_binary_put(fmsg, dump_data + offset, length); + } + + devlink_fmsg_binary_pair_nest_end(fmsg); + +cmpl_cleanup: + fbnic_mbx_clear_cmpl(fbd, fw_cmpl); +cmpl_free: + fbnic_fw_put_cmpl(fw_cmpl); + + return err; +} + +void __printf(2, 3) +fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...) 
+{ + char msg[FBNIC_FW_LOG_MAX_SIZE]; + va_list args; + + va_start(args, format); + vsnprintf(msg, FBNIC_FW_LOG_MAX_SIZE, format, args); + va_end(args); + + devlink_health_report(fbd->fw_reporter, msg, fbd); + if (fbnic_fw_log_ready(fbd)) + fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg); +} + +static const struct devlink_health_reporter_ops fbnic_fw_ops = { + .name = "fw", + .dump = fbnic_fw_reporter_dump, +}; + +int fbnic_devlink_health_create(struct fbnic_dev *fbd) +{ + fbd->fw_reporter = devlink_health_reporter_create(priv_to_devlink(fbd), + &fbnic_fw_ops, fbd); + if (IS_ERR(fbd->fw_reporter)) { + dev_warn(fbd->dev, + "Failed to create FW fault reporter: %pe\n", + fbd->fw_reporter); + return PTR_ERR(fbd->fw_reporter); + } + + return 0; +} + +void fbnic_devlink_health_destroy(struct fbnic_dev *fbd) +{ + devlink_health_reporter_destroy(fbd->fw_reporter); +} + void fbnic_devlink_free(struct fbnic_dev *fbd) { struct devlink *devlink = priv_to_devlink(fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 7d9b93f8ebd8..576fc89f8704 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -196,6 +196,8 @@ static void fbnic_health_check(struct fbnic_dev *fbd) if (tx_mbx->head != tx_mbx->tail) return; + fbnic_devlink_fw_report(fbd, "Firmware crashed detected!"); + if (fbnic_fw_config_after_crash(fbd)) dev_err(fbd->dev, "Firmware recovery failed after crash\n"); } @@ -278,6 +280,10 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; } + err = fbnic_devlink_health_create(fbd); + if (err) + goto free_fbd; + /* Populate driver with hardware-specific info and handlers */ fbd->max_num_queues = info->max_num_queues; @@ -288,7 +294,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err = fbnic_alloc_irqs(fbd); if (err) - goto free_fbd; + goto err_destroy_health; err = 
fbnic_mac_init(fbd); if (err) { @@ -357,6 +363,8 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; free_irqs: fbnic_free_irqs(fbd); +err_destroy_health: + fbnic_devlink_health_destroy(fbd); free_fbd: fbnic_devlink_free(fbd); @@ -391,6 +399,7 @@ static void fbnic_remove(struct pci_dev *pdev) fbnic_fw_free_mbx(fbd); fbnic_free_irqs(fbd); + fbnic_devlink_health_destroy(fbd); fbnic_devlink_free(fbd); } From 6da8344f92dfc607069ed10ed5ba76b61e612691 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:19 -0700 Subject: [PATCH 8/9] eth: fbnic: report FW uptime in health diagnose FW crashes are detected based on uptime going back, expose the uptime via devlink health diagnose. $ devlink -j health diagnose pci/0000:01:00.0 reporter fw {"last_heartbeat":{"fw_uptime":{"sec":201,"msec":76}}} $ devlink -j health diagnose pci/0000:01:00.0 reporter fw last_heartbeat: fw_uptime: sec: 201 msec: 76 Reviewed-by: Lee Trager Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250916231420.1693955-9-kuba@kernel.org Signed-off-by: Paolo Abeni --- .../device_drivers/ethernet/meta/fbnic.rst | 4 ++- .../net/ethernet/meta/fbnic/fbnic_devlink.c | 29 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index 62693566ff1f..8b7ae9975bf7 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -77,7 +77,9 @@ fw reporter The ``fw`` health reporter tracks FW crashes. Dumping the reporter will show the core dump of the most recent FW crash, and if no FW crash has -happened since power cycle - a snapshot of the FW memory. +happened since power cycle - a snapshot of the FW memory. 
Diagnose callback +shows FW uptime based on the most recently received heartbeat message +(the crashes are detected by checking if uptime goes down). Statistics ---------- diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c index 195245fb1a96..fd7df44ae7a4 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c @@ -485,6 +485,34 @@ static int fbnic_fw_reporter_dump(struct devlink_health_reporter *reporter, return err; } +static int +fbnic_fw_reporter_diagnose(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, + struct netlink_ext_ack *extack) +{ + struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter); + u32 sec, msec; + + /* Device is most likely down, we're not exchanging heartbeats */ + if (!fbd->prev_firmware_time) + return 0; + + sec = div_u64_rem(fbd->firmware_time, MSEC_PER_SEC, &msec); + + devlink_fmsg_pair_nest_start(fmsg, "last_heartbeat"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_pair_nest_start(fmsg, "fw_uptime"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_u32_pair_put(fmsg, "sec", sec); + devlink_fmsg_u32_pair_put(fmsg, "msec", msec); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + + return 0; +} + void __printf(2, 3) fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...) { @@ -503,6 +531,7 @@ fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...) 
static const struct devlink_health_reporter_ops fbnic_fw_ops = { .name = "fw", .dump = fbnic_fw_reporter_dump, + .diagnose = fbnic_fw_reporter_diagnose, }; int fbnic_devlink_health_create(struct fbnic_dev *fbd) From e6afcd60c26fca227c700825a94020209970c05e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Sep 2025 16:14:20 -0700 Subject: [PATCH 9/9] eth: fbnic: add OTP health reporter OTP memory ("fuses") is used for secure boot and anti-rollback protection. The OTP memory is ECC protected. Check for its health periodically to notice when the chip is starting to go bad. Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250916231420.1693955-10-kuba@kernel.org Signed-off-by: Paolo Abeni --- .../device_drivers/ethernet/meta/fbnic.rst | 7 ++ drivers/net/ethernet/meta/fbnic/fbnic.h | 2 + drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 18 +++++ .../net/ethernet/meta/fbnic/fbnic_devlink.c | 65 +++++++++++++++++++ drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 5 ++ 5 files changed, 97 insertions(+) diff --git a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst index 8b7ae9975bf7..1e82f90d9ad2 100644 --- a/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst +++ b/Documentation/networking/device_drivers/ethernet/meta/fbnic.rst @@ -81,6 +81,13 @@ happened since power cycle - a snapshot of the FW memory. Diagnose callback shows FW uptime based on the most recently received heartbeat message (the crashes are detected by checking if uptime goes down). +otp reporter +~~~~~~~~~~~~ + +OTP memory ("fuses") is used for secure boot and anti-rollback +protection. The OTP memory is ECC protected; ECC errors indicate +either a manufacturing defect or the part deteriorating with age. 
+ Statistics ---------- diff --git a/drivers/net/ethernet/meta/fbnic/fbnic.h b/drivers/net/ethernet/meta/fbnic/fbnic.h index 5f99976de0bb..b03e5a3d5144 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic.h @@ -28,6 +28,7 @@ struct fbnic_dev { struct dentry *dbg_fbd; struct device *hwmon; struct devlink_health_reporter *fw_reporter; + struct devlink_health_reporter *otp_reporter; u32 __iomem *uc_addr0; u32 __iomem *uc_addr4; @@ -166,6 +167,7 @@ void fbnic_devlink_register(struct fbnic_dev *fbd); void fbnic_devlink_unregister(struct fbnic_dev *fbd); void __printf(2, 3) fbnic_devlink_fw_report(struct fbnic_dev *fbd, const char *format, ...); +void fbnic_devlink_otp_check(struct fbnic_dev *fbd, const char *msg); int fbnic_fw_request_mbx(struct fbnic_dev *fbd); void fbnic_fw_free_mbx(struct fbnic_dev *fbd); diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index e2fffe1597e9..d3a7ad921f18 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -1178,4 +1178,22 @@ enum { #define FBNIC_IPC_MBX_DESC_FW_CMPL DESC_BIT(1) #define FBNIC_IPC_MBX_DESC_HOST_CMPL DESC_BIT(0) +/* OTP Registers + * These registers are accessible via bar4 offset and are written by CMRT + * on boot. For the write status, the register is broken up in half with OTP + * Write Data Status occupying the top 16 bits and the ECC status occupying the + * bottom 16 bits. 
+ */ +#define FBNIC_NS_OTP_STATUS 0x0021d +#define FBNIC_NS_OTP_WRITE_STATUS 0x0021e + +#define FBNIC_NS_OTP_WRITE_DATA_STATUS_MASK CSR_GENMASK(31, 16) +#define FBNIC_NS_OTP_WRITE_ECC_STATUS_MASK CSR_GENMASK(15, 0) + +#define FBNIC_REGS_VERSION CSR_GENMASK(31, 16) +#define FBNIC_REGS_HW_TYPE CSR_GENMASK(15, 8) +enum{ + FBNIC_CSR_VERSION_V1_0_ASIC = 1, +}; + #endif /* _FBNIC_CSR_H_ */ diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c index fd7df44ae7a4..b62b1d5b1453 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_devlink.c @@ -534,6 +534,60 @@ static const struct devlink_health_reporter_ops fbnic_fw_ops = { .diagnose = fbnic_fw_reporter_diagnose, }; +static u32 fbnic_read_otp_status(struct fbnic_dev *fbd) +{ + return fbnic_fw_rd32(fbd, FBNIC_NS_OTP_STATUS); +} + +static int +fbnic_otp_reporter_dump(struct devlink_health_reporter *reporter, + struct devlink_fmsg *fmsg, void *priv_ctx, + struct netlink_ext_ack *extack) +{ + struct fbnic_dev *fbd = devlink_health_reporter_priv(reporter); + u32 otp_status, otp_write_status, m; + + otp_status = fbnic_read_otp_status(fbd); + otp_write_status = fbnic_fw_rd32(fbd, FBNIC_NS_OTP_WRITE_STATUS); + + /* Dump OTP status */ + devlink_fmsg_pair_nest_start(fmsg, "OTP"); + devlink_fmsg_obj_nest_start(fmsg); + + devlink_fmsg_u32_pair_put(fmsg, "Status", otp_status); + + /* Extract OTP Write Data status */ + m = FBNIC_NS_OTP_WRITE_DATA_STATUS_MASK; + devlink_fmsg_u32_pair_put(fmsg, "Data", + FIELD_GET(m, otp_write_status)); + + /* Extract OTP Write ECC status */ + m = FBNIC_NS_OTP_WRITE_ECC_STATUS_MASK; + devlink_fmsg_u32_pair_put(fmsg, "ECC", + FIELD_GET(m, otp_write_status)); + + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + + return 0; +} + +void fbnic_devlink_otp_check(struct fbnic_dev *fbd, const char *msg) +{ + /* Check if there is anything to report */ + if (!fbnic_read_otp_status(fbd)) + 
return; + + devlink_health_report(fbd->otp_reporter, msg, fbd); + if (fbnic_fw_log_ready(fbd)) + fbnic_fw_log_write(fbd, 0, fbd->firmware_time, msg); +} + +static const struct devlink_health_reporter_ops fbnic_otp_ops = { + .name = "otp", + .dump = fbnic_otp_reporter_dump, +}; + int fbnic_devlink_health_create(struct fbnic_dev *fbd) { fbd->fw_reporter = devlink_health_reporter_create(priv_to_devlink(fbd), @@ -545,11 +599,22 @@ int fbnic_devlink_health_create(struct fbnic_dev *fbd) return PTR_ERR(fbd->fw_reporter); } + fbd->otp_reporter = devlink_health_reporter_create(priv_to_devlink(fbd), + &fbnic_otp_ops, fbd); + if (IS_ERR(fbd->otp_reporter)) { + devlink_health_reporter_destroy(fbd->fw_reporter); + dev_warn(fbd->dev, + "Failed to create OTP fault reporter: %pe\n", + fbd->otp_reporter); + return PTR_ERR(fbd->otp_reporter); + } + return 0; } void fbnic_devlink_health_destroy(struct fbnic_dev *fbd) { + devlink_health_reporter_destroy(fbd->otp_reporter); devlink_health_reporter_destroy(fbd->fw_reporter); } diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c index 576fc89f8704..a7a6b4db8016 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c @@ -197,6 +197,7 @@ static void fbnic_health_check(struct fbnic_dev *fbd) return; fbnic_devlink_fw_report(fbd, "Firmware crashed detected!"); + fbnic_devlink_otp_check(fbd, "error detected after firmware recovery"); if (fbnic_fw_config_after_crash(fbd)) dev_err(fbd->dev, "Firmware recovery failed after crash\n"); @@ -321,6 +322,7 @@ static int fbnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) err); fbnic_devlink_register(fbd); + fbnic_devlink_otp_check(fbd, "error detected during probe"); fbnic_dbg_fbd_init(fbd); /* Capture snapshot of hardware stats so netdev can calculate delta */ @@ -474,6 +476,9 @@ static int __fbnic_pm_resume(struct device *dev) */ fbnic_fw_log_enable(fbd, 
list_empty(&fbd->fw_log.entries)); + /* Since the FW should be up, check if it reported OTP errors */ + fbnic_devlink_otp_check(fbd, "error detected after PM resume"); + /* No netdev means there isn't a network interface to bring up */ if (fbnic_init_failure(fbd)) return 0;