From 2368048bf5c2ec4b604ac3431564071e89a0bc71 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 May 2022 22:27:14 +0000 Subject: [PATCH 1/3] KVM: x86: Signal #GP, not -EPERM, on bad WRMSR(MCi_CTL/STATUS) Return '1', not '-1', when handling an illegal WRMSR to a MCi_CTL or MCi_STATUS MSR. The behavior of "all zeros' or "all ones" for CTL MSRs is architectural, as is the "only zeros" behavior for STATUS MSRs. I.e. the intent is to inject a #GP, not exit to userspace due to an unhandled emulation case. Returning '-1' gets interpreted as -EPERM up the stack and effecitvely kills the guest. Fixes: 890ca9aefa78 ("KVM: Add MCE support") Fixes: 9ffd986c6e4e ("KVM: X86: #GP when guest attempts to write MCi_STATUS register w/o 0") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220512222716.4112548-2-seanjc@google.com --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ce0c6fe166d..891ca973c838 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3239,13 +3239,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ if ((offset & 0x3) == 0 && data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return -1; + return 1; /* MCi_STATUS */ if (!msr_info->host_initiated && (offset & 0x3) == 1 && data != 0) { if (!can_set_mci_status(vcpu)) - return -1; + return 1; } vcpu->arch.mce_banks[offset] = data; From f5223a332f3647a0e3725e9b4a102e9659c84ce4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 May 2022 22:27:15 +0000 Subject: [PATCH 2/3] KVM: x86: Use explicit case-statements for MCx banks in {g,s}et_msr_mce() Use an explicit case statement to grab the full range of MCx bank MSRs in {g,s}et_msr_mce(), and manually check only the "end" (the number of banks configured by userspace may be less than the max). The "default" trick works, but is a bit odd now, and will be quite odd if/when support for accessing MCx_CTL2 MSRs is added, which has near identical logic. Hoist "offset" to function scope so as to avoid curly braces for the case statement, and because MCx_CTL2 support will need the same variables. Opportunstically clean up the comment about allowing bit 10 to be cleared from bank 4. No functional change intended. Cc: Jue Wang Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220512222716.4112548-3-seanjc@google.com --- arch/x86/kvm/x86.c | 77 ++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 891ca973c838..c18d57027838 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3209,6 +3209,7 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) unsigned bank_num = mcg_cap & 0xff; u32 msr = msr_info->index; u64 data = msr_info->data; + u32 offset, last_msr; switch (msr) { case MSR_IA32_MCG_STATUS: @@ -3222,35 +3223,36 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.mcg_ctl = data; break; + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; + + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + + /* + * Only 0 or all 1s can be written to IA32_MCi_CTL, all other + * values are architecturally undefined. But, some Linux + * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB + * issue on AMD K8s, allow bit 10 to be clear when setting all + * other bits in order to avoid an uncaught #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, + * single-bit ECC data errors. + */ + if ((offset & 0x3) == 0 && + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* MCi_STATUS */ + if (!msr_info->host_initiated && (offset & 0x3) == 1 && + data != 0 && !can_set_mci_status(vcpu)) + return 1; + + vcpu->arch.mce_banks[offset] = data; + break; default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); - - /* only 0 or all 1s can be written to IA32_MCi_CTL - * some Linux kernels though clear bit 10 in bank 4 to - * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest. - * - * UNIXWARE clears bit 0 of MC1_CTL to ignore - * correctable, single-bit ECC data errors. - */ - if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10) | 1) != ~(u64)0) - return 1; - - /* MCi_STATUS */ - if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) { - if (!can_set_mci_status(vcpu)) - return 1; - } - - vcpu->arch.mce_banks[offset] = data; - break; - } return 1; } return 0; @@ -3819,6 +3821,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) u64 data; u64 mcg_cap = vcpu->arch.mcg_cap; unsigned bank_num = mcg_cap & 0xff; + u32 offset, last_msr; switch (msr) { case MSR_IA32_P5_MC_ADDR: @@ -3836,16 +3839,16 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) case MSR_IA32_MCG_STATUS: data = vcpu->arch.mcg_status; break; - default: - if (msr >= MSR_IA32_MC0_CTL && - msr < MSR_IA32_MCx_CTL(bank_num)) { - u32 offset = array_index_nospec( - msr - MSR_IA32_MC0_CTL, - MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL); + case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: + last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; + if (msr > last_msr) + return 1; - data = vcpu->arch.mce_banks[offset]; - break; - } + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); + data = vcpu->arch.mce_banks[offset]; + break; + default: return 1; } *pdata = data; From 54ad60ba9d2673293c72a7c1c4f092622a1f8789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 12 May 2022 22:27:16 +0000 Subject: [PATCH 3/3] KVM: x86: Add helpers to identify CTL and STATUS MCi MSRs Add helpers to identify CTL (control) and STATUS MCi MSR types instead of open coding the checks using the offset. Using the offset is perfectly safe, but unintuitive, as understanding what the code does requires knowing that the offset calcuation will not affect the lower three bits. Opportunistically comment the STATUS logic to save readers a trip to Intel's SDM or AMD's APM to understand the "data != 0" check. No functional change intended. Signed-off-by: Sean Christopherson Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20220512222716.4112548-4-seanjc@google.com --- arch/x86/kvm/x86.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c18d57027838..3b7c9c1b4540 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3191,6 +3191,16 @@ static void kvmclock_sync_fn(struct work_struct *work) KVMCLOCK_SYNC_PERIOD); } +/* These helpers are safe iff @msr is known to be an MCx bank MSR. */ +static bool is_mci_control_msr(u32 msr) +{ + return (msr & 3) == 0; +} +static bool is_mci_status_msr(u32 msr) +{ + return (msr & 3) == 1; +} + /* * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. */ @@ -3228,9 +3238,6 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr > last_msr) return 1; - offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, - last_msr + 1 - MSR_IA32_MC0_CTL); - /* * Only 0 or all 1s can be written to IA32_MCi_CTL, all other * values are architecturally undefined. But, some Linux @@ -3241,15 +3248,21 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, * single-bit ECC data errors. */ - if ((offset & 0x3) == 0 && + if (is_mci_control_msr(msr) && data != 0 && (data | (1 << 10) | 1) != ~(u64)0) return 1; - /* MCi_STATUS */ - if (!msr_info->host_initiated && (offset & 0x3) == 1 && + /* + * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. + * AMD-based CPUs allow non-zero values, but if and only if + * HWCR[McStatusWrEn] is set. + */ + if (!msr_info->host_initiated && is_mci_status_msr(msr) && data != 0 && !can_set_mci_status(vcpu)) return 1; + offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, + last_msr + 1 - MSR_IA32_MC0_CTL); vcpu->arch.mce_banks[offset] = data; break; default: