mirror of
https://github.com/torvalds/linux.git
synced 2026-05-30 10:04:04 +02:00
KVM x86 fixes for 6.14-rcN #2
- Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow.
- Ensure DEBUGCTL is context switched on AMD to avoid running the guest with
  the host's value, which can lead to unexpected bus lock #DBs.
- Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't properly
  emulate BTF. KVM's lack of context switching has meant BTF has always been
  broken to some extent.
- Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as the guest
  can enable DebugSwap without KVM's knowledge.
- Fix a bug in mmu_stress_tests where a vCPU could finish the "writes to RO
  memory" phase without actually generating a write-protection fault.
- Fix a printf() goof in the SEV smoke test that causes build failures with
  -Werror.
- Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when PERFMON_V2
  isn't supported by KVM.
-----BEGIN PGP SIGNATURE-----
iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmfLlhUACgkQOlYIJqCj
N/0x7w/+MhqJdHbshL7Gzw+rcXwCROiCkqsxFP+YoTXte8uaHS5CEfcMYjE8SuGp
KBpgLo4Lj1dVTXiCjemlY5sn6CDiuSs74X8A88ksuu5hVsFByJUgyWU9iw8J/crZ
B2vj8huhqa8OCEPe5JujWfnfyAkKE5tUA4GFi73vhHMcftTNj+ftxT33/Pfg7y7M
xOvWFWS6ZshKrouRKzI7ZFEYLwp0lr4U3dzO5rCRAd5J4MSBWRx6Dx2um5dyEYKJ
xgwl4ylM4S/+78u1+0nQnToM0UWHJ3e7x8nze6UXYTZIrBr/lSeKlbhOPnEWJcJB
Eemnur9ORI2BRPUReqBKluCZsSK+E5B/HPCVt5cxtuRIuUOD+kW17LPgnPyE4Sso
eVt+XAvQc7EjrpWDSHr3ZQZZM89l9zHhuSAQ0npO6y71s0FzEVZQoDamNmOLAPjH
Qg+qhBV2l6pyfqhqiLzADasYLOl57cJsfiMjM331ALLqAn57jzd+B8c4hdB2Xg4s
KPuy8w8uBaY9zpd9YDBLLr7JJVs35KexNZMjT2vqBYXcScyLgmAuSQXy3hub6Mzn
gI5ZXIKG8eO9v2jejfClI6/OEdtEwgSGEVwuBKB16pMrIxqpguMTMTWLVRn5G+oo
qA8anmKaac62GaB66JE/Wjy069OPIGYnHSU2nal0Tej6kG0xv6E=
=as6u
-----END PGP SIGNATURE-----
Merge tag 'kvm-x86-fixes-6.14-rcN.2' of https://github.com/kvm-x86/linux into HEAD
KVM x86 fixes for 6.14-rcN #2
- Set RFLAGS.IF in C code on SVM to get VMRUN out of the STI shadow.
- Ensure DEBUGCTL is context switched on AMD to avoid running the guest with
  the host's value, which can lead to unexpected bus lock #DBs.
- Suppress DEBUGCTL.BTF on AMD (to match Intel), as KVM doesn't properly
  emulate BTF. KVM's lack of context switching has meant BTF has always been
  broken to some extent.
- Always save DR masks for SNP vCPUs if DebugSwap is *supported*, as the guest
  can enable DebugSwap without KVM's knowledge.
- Fix a bug in mmu_stress_tests where a vCPU could finish the "writes to RO
  memory" phase without actually generating a write-protection fault.
- Fix a printf() goof in the SEV smoke test that causes build failures with
  -Werror.
- Explicitly zero EAX and EBX in CPUID.0x8000_0022 output when PERFMON_V2
  isn't supported by KVM.
This commit is contained in:
commit
ea9bd29a9c
|
|
@ -780,6 +780,7 @@ struct kvm_vcpu_arch {
|
|||
u32 pkru;
|
||||
u32 hflags;
|
||||
u64 efer;
|
||||
u64 host_debugctl;
|
||||
u64 apic_base;
|
||||
struct kvm_lapic *apic; /* kernel irqchip context */
|
||||
bool load_eoi_exitmap_pending;
|
||||
|
|
|
|||
|
|
@ -1763,7 +1763,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
|
|||
|
||||
entry->ecx = entry->edx = 0;
|
||||
if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) {
|
||||
entry->eax = entry->ebx;
|
||||
entry->eax = entry->ebx = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -4590,6 +4590,8 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm)
|
|||
|
||||
void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_area *hostsa)
|
||||
{
|
||||
struct kvm *kvm = svm->vcpu.kvm;
|
||||
|
||||
/*
|
||||
* All host state for SEV-ES guests is categorized into three swap types
|
||||
* based on how it is handled by hardware during a world switch:
|
||||
|
|
@ -4613,14 +4615,22 @@ void sev_es_prepare_switch_to_guest(struct vcpu_svm *svm, struct sev_es_save_are
|
|||
|
||||
/*
|
||||
* If DebugSwap is enabled, debug registers are loaded but NOT saved by
|
||||
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU both
|
||||
* saves and loads debug registers (Type-A).
|
||||
* the CPU (Type-B). If DebugSwap is disabled/unsupported, the CPU does
|
||||
* not save or load debug registers. Sadly, KVM can't prevent SNP
|
||||
* guests from lying about DebugSwap on secondary vCPUs, i.e. the
|
||||
* SEV_FEATURES provided at "AP Create" isn't guaranteed to match what
|
||||
* the guest has actually enabled (or not!) in the VMSA.
|
||||
*
|
||||
* If DebugSwap is *possible*, save the masks so that they're restored
|
||||
* if the guest enables DebugSwap. But for the DRs themselves, do NOT
|
||||
* rely on the CPU to restore the host values; KVM will restore them as
|
||||
* needed in common code, via hw_breakpoint_restore(). Note, KVM does
|
||||
* NOT support virtualizing Breakpoint Extensions, i.e. the mask MSRs
|
||||
* don't need to be restored per se, KVM just needs to ensure they are
|
||||
* loaded with the correct values *if* the CPU writes the MSRs.
|
||||
*/
|
||||
if (sev_vcpu_has_debug_swap(svm)) {
|
||||
hostsa->dr0 = native_get_debugreg(0);
|
||||
hostsa->dr1 = native_get_debugreg(1);
|
||||
hostsa->dr2 = native_get_debugreg(2);
|
||||
hostsa->dr3 = native_get_debugreg(3);
|
||||
if (sev_vcpu_has_debug_swap(svm) ||
|
||||
(sev_snp_guest(kvm) && cpu_feature_enabled(X86_FEATURE_DEBUG_SWAP))) {
|
||||
hostsa->dr0_addr_mask = amd_get_dr_addr_mask(0);
|
||||
hostsa->dr1_addr_mask = amd_get_dr_addr_mask(1);
|
||||
hostsa->dr2_addr_mask = amd_get_dr_addr_mask(2);
|
||||
|
|
|
|||
|
|
@ -3165,6 +3165,27 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
|
|||
kvm_pr_unimpl_wrmsr(vcpu, ecx, data);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* AMD changed the architectural behavior of bits 5:2. On CPUs
|
||||
* without BusLockTrap, bits 5:2 control "external pins", but
|
||||
* on CPUs that support BusLockDetect, bit 2 enables BusLockTrap
|
||||
* and bits 5:3 are reserved-to-zero. Sadly, old KVM allowed
|
||||
* the guest to set bits 5:2 despite not actually virtualizing
|
||||
* Performance-Monitoring/Breakpoint external pins. Drop bits
|
||||
* 5:2 for backwards compatibility.
|
||||
*/
|
||||
data &= ~GENMASK(5, 2);
|
||||
|
||||
/*
|
||||
* Suppress BTF as KVM doesn't virtualize BTF, but there's no
|
||||
* way to communicate lack of support to the guest.
|
||||
*/
|
||||
if (data & DEBUGCTLMSR_BTF) {
|
||||
kvm_pr_unimpl_wrmsr(vcpu, MSR_IA32_DEBUGCTLMSR, data);
|
||||
data &= ~DEBUGCTLMSR_BTF;
|
||||
}
|
||||
|
||||
if (data & DEBUGCTL_RESERVED_BITS)
|
||||
return 1;
|
||||
|
||||
|
|
@ -4189,6 +4210,18 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
|
|||
|
||||
guest_state_enter_irqoff();
|
||||
|
||||
/*
|
||||
* Set RFLAGS.IF prior to VMRUN, as the host's RFLAGS.IF at the time of
|
||||
* VMRUN controls whether or not physical IRQs are masked (KVM always
|
||||
* runs with V_INTR_MASKING_MASK). Toggle RFLAGS.IF here to avoid the
|
||||
* temptation to do STI+VMRUN+CLI, as AMD CPUs bleed the STI shadow
|
||||
* into guest state if delivery of an event during VMRUN triggers a
|
||||
* #VMEXIT, and the guest_state transitions already tell lockdep that
|
||||
* IRQs are being enabled/disabled. Note! GIF=0 for the entirety of
|
||||
* this path, so IRQs aren't actually unmasked while running host code.
|
||||
*/
|
||||
raw_local_irq_enable();
|
||||
|
||||
amd_clear_divider();
|
||||
|
||||
if (sev_es_guest(vcpu->kvm))
|
||||
|
|
@ -4197,6 +4230,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_in
|
|||
else
|
||||
__svm_vcpu_run(svm, spec_ctrl_intercepted);
|
||||
|
||||
raw_local_irq_disable();
|
||||
|
||||
guest_state_exit_irqoff();
|
||||
}
|
||||
|
||||
|
|
@ -4253,6 +4288,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
|
|||
clgi();
|
||||
kvm_load_guest_xsave_state(vcpu);
|
||||
|
||||
/*
|
||||
* Hardware only context switches DEBUGCTL if LBR virtualization is
|
||||
* enabled. Manually load DEBUGCTL if necessary (and restore it after
|
||||
* VM-Exit), as running with the host's DEBUGCTL can negatively affect
|
||||
* guest state and can even be fatal, e.g. due to Bus Lock Detect.
|
||||
*/
|
||||
if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
|
||||
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
|
||||
update_debugctlmsr(svm->vmcb->save.dbgctl);
|
||||
|
||||
kvm_wait_lapic_expire(vcpu);
|
||||
|
||||
/*
|
||||
|
|
@ -4280,6 +4325,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu,
|
|||
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
|
||||
kvm_before_interrupt(vcpu, KVM_HANDLING_NMI);
|
||||
|
||||
if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) &&
|
||||
vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl)
|
||||
update_debugctlmsr(vcpu->arch.host_debugctl);
|
||||
|
||||
kvm_load_host_xsave_state(vcpu);
|
||||
stgi();
|
||||
|
||||
|
|
|
|||
|
|
@ -584,7 +584,7 @@ static inline bool is_vnmi_enabled(struct vcpu_svm *svm)
|
|||
/* svm.c */
|
||||
#define MSR_INVALID 0xffffffffU
|
||||
|
||||
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
|
||||
#define DEBUGCTL_RESERVED_BITS (~DEBUGCTLMSR_LBR)
|
||||
|
||||
extern bool dump_invalid_vmcb;
|
||||
|
||||
|
|
|
|||
|
|
@ -170,12 +170,8 @@ SYM_FUNC_START(__svm_vcpu_run)
|
|||
mov VCPU_RDI(%_ASM_DI), %_ASM_DI
|
||||
|
||||
/* Enter guest mode */
|
||||
sti
|
||||
|
||||
3: vmrun %_ASM_AX
|
||||
4:
|
||||
cli
|
||||
|
||||
/* Pop @svm to RAX while it's the only available register. */
|
||||
pop %_ASM_AX
|
||||
|
||||
|
|
@ -340,12 +336,8 @@ SYM_FUNC_START(__svm_sev_es_vcpu_run)
|
|||
mov KVM_VMCB_pa(%rax), %rax
|
||||
|
||||
/* Enter guest mode */
|
||||
sti
|
||||
|
||||
1: vmrun %rax
|
||||
|
||||
2: cli
|
||||
|
||||
2:
|
||||
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
|
||||
FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT
|
||||
|
||||
|
|
|
|||
|
|
@ -1514,16 +1514,12 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
|
|||
*/
|
||||
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
{
|
||||
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
||||
|
||||
if (vcpu->scheduled_out && !kvm_pause_in_guest(vcpu->kvm))
|
||||
shrink_ple_window(vcpu);
|
||||
|
||||
vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
|
||||
|
||||
vmx_vcpu_pi_load(vcpu, cpu);
|
||||
|
||||
vmx->host_debugctlmsr = get_debugctlmsr();
|
||||
}
|
||||
|
||||
void vmx_vcpu_put(struct kvm_vcpu *vcpu)
|
||||
|
|
@ -7458,8 +7454,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
|
|||
}
|
||||
|
||||
/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
|
||||
if (vmx->host_debugctlmsr)
|
||||
update_debugctlmsr(vmx->host_debugctlmsr);
|
||||
if (vcpu->arch.host_debugctl)
|
||||
update_debugctlmsr(vcpu->arch.host_debugctl);
|
||||
|
||||
#ifndef CONFIG_X86_64
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -340,8 +340,6 @@ struct vcpu_vmx {
|
|||
/* apic deadline value in host tsc */
|
||||
u64 hv_deadline_tsc;
|
||||
|
||||
unsigned long host_debugctlmsr;
|
||||
|
||||
/*
|
||||
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
|
||||
* msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
|
||||
|
|
|
|||
|
|
@ -10968,6 +10968,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
|
|||
set_debugreg(0, 7);
|
||||
}
|
||||
|
||||
vcpu->arch.host_debugctl = get_debugctlmsr();
|
||||
|
||||
guest_timing_enter_irqoff();
|
||||
|
||||
for (;;) {
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@
|
|||
#include "ucall_common.h"
|
||||
|
||||
static bool mprotect_ro_done;
|
||||
static bool all_vcpus_hit_ro_fault;
|
||||
|
||||
static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
|
||||
{
|
||||
|
|
@ -36,9 +37,9 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
|
|||
|
||||
/*
|
||||
* Write to the region while mprotect(PROT_READ) is underway. Keep
|
||||
* looping until the memory is guaranteed to be read-only, otherwise
|
||||
* vCPUs may complete their writes and advance to the next stage
|
||||
* prematurely.
|
||||
* looping until the memory is guaranteed to be read-only and a fault
|
||||
* has occurred, otherwise vCPUs may complete their writes and advance
|
||||
* to the next stage prematurely.
|
||||
*
|
||||
* For architectures that support skipping the faulting instruction,
|
||||
* generate the store via inline assembly to ensure the exact length
|
||||
|
|
@ -56,7 +57,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
|
|||
#else
|
||||
vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
|
||||
#endif
|
||||
} while (!READ_ONCE(mprotect_ro_done));
|
||||
} while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault));
|
||||
|
||||
/*
|
||||
* Only architectures that write the entire range can explicitly sync,
|
||||
|
|
@ -81,6 +82,7 @@ struct vcpu_info {
|
|||
|
||||
static int nr_vcpus;
|
||||
static atomic_t rendezvous;
|
||||
static atomic_t nr_ro_faults;
|
||||
|
||||
static void rendezvous_with_boss(void)
|
||||
{
|
||||
|
|
@ -148,12 +150,16 @@ static void *vcpu_worker(void *data)
|
|||
* be stuck on the faulting instruction for other architectures. Go to
|
||||
* stage 3 without a rendezvous
|
||||
*/
|
||||
do {
|
||||
r = _vcpu_run(vcpu);
|
||||
} while (!r);
|
||||
r = _vcpu_run(vcpu);
|
||||
TEST_ASSERT(r == -1 && errno == EFAULT,
|
||||
"Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);
|
||||
|
||||
atomic_inc(&nr_ro_faults);
|
||||
if (atomic_read(&nr_ro_faults) == nr_vcpus) {
|
||||
WRITE_ONCE(all_vcpus_hit_ro_fault, true);
|
||||
sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
|
||||
}
|
||||
|
||||
#if defined(__x86_64__) || defined(__aarch64__)
|
||||
/*
|
||||
* Verify *all* writes from the guest hit EFAULT due to the VMA now
|
||||
|
|
@ -378,7 +384,6 @@ int main(int argc, char *argv[])
|
|||
rendezvous_with_vcpus(&time_run2, "run 2");
|
||||
|
||||
mprotect(mem, slot_size, PROT_READ);
|
||||
usleep(10);
|
||||
mprotect_ro_done = true;
|
||||
sync_global_to_guest(vm, mprotect_ro_done);
|
||||
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ static void svm_run_l2(struct svm_test_data *svm, void *l2_code, int vector,
|
|||
|
||||
GUEST_ASSERT_EQ(ctrl->exit_code, (SVM_EXIT_EXCP_BASE + vector));
|
||||
GUEST_ASSERT_EQ(ctrl->exit_info_1, error_code);
|
||||
GUEST_ASSERT(!ctrl->int_state);
|
||||
}
|
||||
|
||||
static void l1_svm_code(struct svm_test_data *svm)
|
||||
|
|
@ -122,6 +123,7 @@ static void vmx_run_l2(void *l2_code, int vector, uint32_t error_code)
|
|||
GUEST_ASSERT_EQ(vmreadz(VM_EXIT_REASON), EXIT_REASON_EXCEPTION_NMI);
|
||||
GUEST_ASSERT_EQ((vmreadz(VM_EXIT_INTR_INFO) & 0xff), vector);
|
||||
GUEST_ASSERT_EQ(vmreadz(VM_EXIT_INTR_ERROR_CODE), error_code);
|
||||
GUEST_ASSERT(!vmreadz(GUEST_INTERRUPTIBILITY_INFO));
|
||||
}
|
||||
|
||||
static void l1_vmx_code(struct vmx_pages *vmx)
|
||||
|
|
|
|||
|
|
@ -52,7 +52,8 @@ static void compare_xsave(u8 *from_host, u8 *from_guest)
|
|||
bool bad = false;
|
||||
for (i = 0; i < 4095; i++) {
|
||||
if (from_host[i] != from_guest[i]) {
|
||||
printf("mismatch at %02hhx | %02hhx %02hhx\n", i, from_host[i], from_guest[i]);
|
||||
printf("mismatch at %u | %02hhx %02hhx\n",
|
||||
i, from_host[i], from_guest[i]);
|
||||
bad = true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user