x86/mce: Move machine_check_poll() status checks to helper functions

There are a number of generic and vendor-specific status checks in
machine_check_poll(). These are used to determine if an error should be
skipped.

Move these into helper functions. Future vendor-specific checks will be
added to the helpers.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/20250908-wip-mca-updates-v6-0-eef5d6c74b9c@amd.com
This commit is contained in:
Yazen Ghannam 2025-09-08 15:40:35 +00:00 committed by Borislav Petkov (AMD)
parent 7eee1e9268
commit 91af6842e9

View File

@ -714,6 +714,52 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i)
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
 * Poll-time filter used when software error recovery is supported
 * (newer Intel systems), where additional checks are needed before an
 * uncorrected error is logged. Other CPUs skip uncorrected errors and
 * log everything else; that case is handled by the caller.
 *
 * Two classes of error are logged from here:
 *  - "not enabled" (speculative) errors, i.e. MCI_STATUS_EN is clear;
 *  - UCNA (SDM: 15.6.3 "UCR Error Classification"):
 *    UC == 1 && PCC == 0 && S == 0. UC itself is not tested here.
 *
 * Returns true if the error should be logged, false otherwise.
 */
static bool ser_should_log_poll_error(struct mce *m)
{
	bool speculative = !(m->status & MCI_STATUS_EN);
	bool ucna = !(m->status & (MCI_STATUS_PCC | MCI_STATUS_S));

	return speculative || ucna;
}
/*
 * Decide whether the bank status held in @err describes an error that
 * the polling path should log.
 *
 * @flags: polling flags; MCP_UC asks for everything to be logged.
 * @err:   error record whose m.status is examined.
 *
 * Returns true if the error should be logged, false if it should be
 * skipped.
 */
static bool should_log_poll_error(enum mcp_flags flags, struct mce_hw_err *err)
{
	struct mce *m = &err->m;

	/* If this entry is not valid, ignore it. */
	if (!(m->status & MCI_STATUS_VAL))
		return false;

	/*
	 * If we are logging everything (at CPU online) or this
	 * is a corrected error, then we must log it.
	 */
	if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
		return true;

	/*
	 * MCI_STATUS_UC is known to be set from here on, because the
	 * check above already returned for the corrected case.
	 *
	 * Systems that support software error recovery need to make
	 * additional checks before an uncorrected error is logged.
	 */
	if (mca_cfg.ser)
		return ser_should_log_poll_error(m);

	/* Other CPUs skip over uncorrected errors when polling. */
	return false;
}
/*
* Poll for corrected events or events that happened before reset.
* Those are just logged through /dev/mcelog.
@ -765,48 +811,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!mca_cfg.cmci_disabled)
mce_track_storm(m);
/* If this entry is not valid, ignore it */
if (!(m->status & MCI_STATUS_VAL))
/* Verify that the error should be logged based on hardware conditions. */
if (!should_log_poll_error(flags, &err))
continue;
/*
* If we are logging everything (at CPU online) or this
* is a corrected error, then we must log it.
*/
if ((flags & MCP_UC) || !(m->status & MCI_STATUS_UC))
goto log_it;
/*
* Newer Intel systems that support software error
* recovery need to make additional checks. Other
* CPUs should skip over uncorrected errors, but log
* everything else.
*/
if (!mca_cfg.ser) {
if (m->status & MCI_STATUS_UC)
continue;
goto log_it;
}
/* Log "not enabled" (speculative) errors */
if (!(m->status & MCI_STATUS_EN))
goto log_it;
/*
* Log UCNA (SDM: 15.6.3 "UCR Error Classification")
* UC == 1 && PCC == 0 && S == 0
*/
if (!(m->status & MCI_STATUS_PCC) && !(m->status & MCI_STATUS_S))
goto log_it;
/*
* Skip anything else. Presumption is that our read of this
* bank is racing with a machine check. Leave the log alone
* for do_machine_check() to deal with it.
*/
continue;
log_it:
mce_read_aux(&err, i);
m->severity = mce_severity(m, NULL, NULL, false);
/*