KVM VMX changes for 6.20

- Fix an SGX bug where KVM would incorrectly try to handle EPCM #PFs by always
    reflecting EPCM #PFs back into the guest.  KVM doesn't shadow EPCM entries,
    and so EPCM violations cannot be due to KVM interference, and can't be
    resolved by KVM.
 
  - Fix a bug where KVM would register its posted interrupt wakeup handler even
    if loading kvm-intel.ko ultimately failed.
 
  - Disallow access to vmcs12 fields that aren't fully supported, mostly to
    avoid weirdness and complexity for FRED and other features, where KVM wants
    to enable VMCS shadowing for fields that conditionally exist.
 
  - Print out the "bad" offsets and values if kvm-intel.ko refuses to load (or
    refuses to online a CPU) due to a VMCS config mismatch.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKTobbabEP7vbhhN9OlYIJqCjN/0FAmmGstUACgkQOlYIJqCj
 N/3z3w/+NSA+/0/JfeCmw+CiMtmHY4eCOtScPwmrP0RONcee4HzX2LlzhZww9YeL
 GSBouvaU5eyNoYjA14mgOjTHfLUEkhH6/3kULN2LjE8md+oLxD0ZBVQNwkXDKSgX
 BpP8EJ/9vBuAjSzUNWeikBsNlXt8I4+QxZYSHPe+BiKE0kMVtFuua2LQeDtj5qc9
 SYKguN0EmYdCox09a1YOX9tExk4VULrOtwcOnNK0I7m87os5Xl2DLHy1vYLZ0WPT
 R9iSnh/AfTsYuvCfotlGccDW8x9x+5PILZ7zxyipXBOGvRBgTaOmsgho/Rf81vpj
 laj6PDk06ep5PLfX0IPM7I4+8usQCxWB0dTXnB6Fu32BnmwuFRpwYCW3XJsqBMrb
 Q4fa14a0Aj5rviCn/CWDJOmMZtTRbQ/U+AaYT+A1VlaMRo8hkIMvW3coYSqvCuZY
 tceW2/3oobwzad5pi37OPsNws6STQc/UOgQDsmAIX6c5/B+cc8PF/a/DAInHPyX2
 356rpdIBOnF7uheLfHGBefFeD1TdkVZvW9Gy6rHPaVjWAwyc59+C6OZoA8bTJtyP
 x4akIaS0GrJ7Gi9RcHRJpvKQucMWbhOrpZxov9QDMRgkdH00eznVwixVZfYAFLPN
 iyQpYJU+moyhXQBGmVUJlWTuMud3qwwCxhY4DEi/pGT8JtK1v5M=
 =XHNe
 -----END PGP SIGNATURE-----

Merge tag 'kvm-x86-vmx-6.20' of https://github.com/kvm-x86/linux into HEAD

KVM VMX changes for 6.20

 - Fix an SGX bug where KVM would incorrectly try to handle EPCM #PFs by always
   reflecting EPCM #PFs back into the guest.  KVM doesn't shadow EPCM entries,
   and so EPCM violations cannot be due to KVM interference, and can't be
   resolved by KVM.

 - Fix a bug where KVM would register its posted interrupt wakeup handler even
   if loading kvm-intel.ko ultimately failed.

 - Disallow access to vmcs12 fields that aren't fully supported, mostly to
   avoid weirdness and complexity for FRED and other features, where KVM wants
   to enable VMCS shadowing for fields that conditionally exist.

 - Print out the "bad" offsets and values if kvm-intel.ko refuses to load (or
   refuses to online a CPU) due to a VMCS config mismatch.
This commit is contained in:
Paolo Bonzini 2026-02-09 18:50:04 +01:00
commit 687603fb2b
7 changed files with 171 additions and 41 deletions

View File

@ -7,7 +7,7 @@
#include "hyperv_evmcs.h"
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
#define EVMCS1_FIELD(number, name, clean_field)[ENC_TO_VMCS12_IDX(number)] = \
{EVMCS1_OFFSET(name), clean_field}
const struct evmcs_field vmcs_field_to_evmcs_1[] = {

View File

@ -130,7 +130,7 @@ static __always_inline int evmcs_field_offset(unsigned long field,
u16 *clean_field)
{
const struct evmcs_field *evmcs_field;
unsigned int index = ROL16(field, 6);
unsigned int index = ENC_TO_VMCS12_IDX(field);
if (unlikely(index >= nr_evmcs_1_fields))
return -ENOENT;

View File

@ -86,6 +86,9 @@ static void init_vmcs_shadow_fields(void)
pr_err("Missing field from shadow_read_only_field %x\n",
field + 1);
if (get_vmcs12_field_offset(field) < 0)
continue;
clear_bit(field, vmx_vmread_bitmap);
if (field & 1)
#ifdef CONFIG_X86_64
@ -111,10 +114,14 @@ static void init_vmcs_shadow_fields(void)
field <= GUEST_TR_AR_BYTES,
"Update vmcs12_write_any() to drop reserved bits from AR_BYTES");
if (get_vmcs12_field_offset(field) < 0)
continue;
/*
* PML and the preemption timer can be emulated, but the
* processor cannot vmwrite to fields that don't exist
* on bare metal.
* KVM emulates PML and the VMX preemption timer irrespective
* of hardware support, but shadowing their related VMCS fields
* requires hardware support as the CPU will reject VMWRITEs to
* fields that don't exist.
*/
switch (field) {
case GUEST_PML_INDEX:
@ -125,10 +132,6 @@ static void init_vmcs_shadow_fields(void)
if (!cpu_has_vmx_preemption_timer())
continue;
break;
case GUEST_INTR_STATUS:
if (!cpu_has_vmx_apicv())
continue;
break;
default:
break;
}
@ -7074,12 +7077,6 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void)
}
}
/*
* Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo
* that madness to get the encoding for comparison.
*/
#define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10)))
static u64 nested_vmx_calc_vmcs_enum_msr(void)
{
/*
@ -7407,6 +7404,14 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
{
int i;
/*
* Note! The set of supported vmcs12 fields is consumed by both VMX
* MSR and shadow VMCS setup.
*/
nested_vmx_setup_vmcs12_fields();
nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
if (!cpu_has_vmx_shadow_vmcs())
enable_shadow_vmcs = 0;
if (enable_shadow_vmcs) {

View File

@ -11,7 +11,16 @@
#include "capabilities.h"
/*
 * A vmcs12 field is located by rotating its 16-bit VMCS encoding left by 6
 * bits and using the result as an array index.  The rotation is a very
 * rudimentary compression of the range of indices: the ratio is good enough
 * that a (very sparsely populated) array doesn't waste too much memory, and
 * the computation is cheap enough to look up vmcs12 fields on-demand, e.g.
 * for emulation.
 *
 * ROL16()             - rotate the low 16 bits of @val left by @n bits.
 * ENC_TO_VMCS12_IDX() - VMCS encoding => vmcs12 index (rotate left by 6).
 * VMCS12_IDX_TO_ENC() - vmcs12 index => VMCS encoding (rotate left by 10,
 *                       i.e. the remaining 16 - 6 bits, undoing the above).
 */
#define ROL16(val, n) \
	((u16)(((u16)(val) >> (16 - (n))) | ((u16)(val) << (n))))
#define ENC_TO_VMCS12_IDX(enc)	ROL16(enc, 6)
#define VMCS12_IDX_TO_ENC(idx)	ROL16(idx, 10)
struct vmcs_hdr {
u32 revision_id:31;

View File

@ -4,12 +4,12 @@
#include "vmcs12.h"
#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name)
#define FIELD(number, name) [ENC_TO_VMCS12_IDX(number)] = VMCS12_OFFSET(name)
#define FIELD64(number, name) \
FIELD(number, name), \
[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
[ENC_TO_VMCS12_IDX(number##_HIGH)] = VMCS12_OFFSET(name) + sizeof(u32)
const unsigned short vmcs12_field_offsets[] = {
static const u16 kvm_supported_vmcs12_field_offsets[] __initconst = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@ -158,4 +158,70 @@ const unsigned short vmcs12_field_offsets[] = {
FIELD(HOST_SSP, host_ssp),
FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl),
};
const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
u16 vmcs12_field_offsets[ARRAY_SIZE(kvm_supported_vmcs12_field_offsets)] __ro_after_init;
unsigned int nr_vmcs12_fields __ro_after_init;
/* Emit case labels for both halves (enc and enc##_HIGH) of a 64-bit field. */
#define VMCS12_CASE64(enc) case enc##_HIGH: case enc

/*
 * Report whether the vmcs12 field stored at index @idx passes the capability
 * check associated with it.  The index is converted back to a VMCS encoding
 * and matched against the fields that are tied to a specific capability;
 * every field without such a check is reported as available.
 */
static __init bool cpu_has_vmcs12_field(unsigned int idx)
{
	switch (VMCS12_IDX_TO_ENC(idx)) {
	/* 16-bit and 32-bit control fields. */
	case VIRTUAL_PROCESSOR_ID:
		return cpu_has_vmx_vpid();
	case SECONDARY_VM_EXEC_CONTROL:
		return cpu_has_secondary_exec_ctrls();

	/* APIC virtualization and posted interrupts. */
	case POSTED_INTR_NV:
	VMCS12_CASE64(POSTED_INTR_DESC_ADDR):
		return cpu_has_vmx_posted_intr();
	case TPR_THRESHOLD:
	VMCS12_CASE64(VIRTUAL_APIC_PAGE_ADDR):
		return cpu_has_vmx_tpr_shadow();
	VMCS12_CASE64(APIC_ACCESS_ADDR):
		return cpu_has_vmx_virtualize_apic_accesses();
	case GUEST_INTR_STATUS:
		return cpu_has_vmx_virtual_intr_delivery();

	/* Other 64-bit control fields. */
	VMCS12_CASE64(TSC_MULTIPLIER):
		return cpu_has_vmx_tsc_scaling();
	VMCS12_CASE64(VM_FUNCTION_CONTROL):
	VMCS12_CASE64(EPTP_LIST_ADDRESS):
		return cpu_has_vmx_vmfunc();
	VMCS12_CASE64(EPT_POINTER):
		return cpu_has_vmx_ept();
	VMCS12_CASE64(XSS_EXIT_BITMAP):
		return cpu_has_vmx_xsaves();
	VMCS12_CASE64(ENCLS_EXITING_BITMAP):
		return cpu_has_vmx_encls_vmexit();

	/* Guest/host state gated on load controls. */
	VMCS12_CASE64(GUEST_IA32_PERF_GLOBAL_CTRL):
	VMCS12_CASE64(HOST_IA32_PERF_GLOBAL_CTRL):
		return cpu_has_load_perf_global_ctrl();
	case GUEST_S_CET:
	case GUEST_SSP:
	case GUEST_INTR_SSP_TABLE:
	case HOST_S_CET:
	case HOST_SSP:
	case HOST_INTR_SSP_TABLE:
		return cpu_has_load_cet_ctrl();

	/*
	 * KVM always emulates PML and the VMX preemption timer in software, so
	 * their fields are listed explicitly next to the catch-all to make it
	 * obvious they are deliberately not tied to a capability check.
	 */
	case GUEST_PML_INDEX:
	case VMX_PREEMPTION_TIMER_VALUE:
	default:
		return true;
	}
}
/*
 * Build the runtime vmcs12 offset table from the table of fields KVM knows
 * about.  A slot is copied only if it is populated (non-zero offset) and
 * cpu_has_vmcs12_field() accepts its index; all other slots are left
 * untouched.  nr_vmcs12_fields is updated on every copy and therefore ends
 * up as one more than the highest index that was copied.
 */
void __init nested_vmx_setup_vmcs12_fields(void)
{
	unsigned int idx;

	for (idx = 0; idx < ARRAY_SIZE(kvm_supported_vmcs12_field_offsets); idx++) {
		const u16 offset = kvm_supported_vmcs12_field_offsets[idx];

		if (offset && cpu_has_vmcs12_field(idx)) {
			vmcs12_field_offsets[idx] = offset;
			nr_vmcs12_fields = idx + 1;
		}
	}
}

View File

@ -374,8 +374,10 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(guest_pml_index, 996);
}
extern const unsigned short vmcs12_field_offsets[];
extern const unsigned int nr_vmcs12_fields;
extern u16 vmcs12_field_offsets[] __ro_after_init;
extern unsigned int nr_vmcs12_fields __ro_after_init;
void __init nested_vmx_setup_vmcs12_fields(void);
static inline short get_vmcs12_field_offset(unsigned long field)
{
@ -385,7 +387,7 @@ static inline short get_vmcs12_field_offset(unsigned long field)
if (field >> 15)
return -ENOENT;
index = ROL16(field, 6);
index = ENC_TO_VMCS12_IDX(field);
if (index >= nr_vmcs12_fields)
return -ENOENT;

View File

@ -2921,8 +2921,23 @@ int vmx_check_processor_compat(void)
}
if (nested)
nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config))) {
pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
u32 *gold = (void *)&vmcs_config;
u32 *mine = (void *)&vmcs_conf;
int i;
BUILD_BUG_ON(sizeof(struct vmcs_config) % sizeof(u32));
pr_err("VMCS config on CPU %d doesn't match reference config:", cpu);
for (i = 0; i < sizeof(struct vmcs_config) / sizeof(u32); i++) {
if (gold[i] == mine[i])
continue;
pr_cont("\n Offset %u REF = 0x%08x, CPU%u = 0x%08x, mismatch = 0x%08x",
i * (int)sizeof(u32), gold[i], cpu, mine[i], gold[i] ^ mine[i]);
}
pr_cont("\n");
return -EIO;
}
return 0;
@ -5303,12 +5318,53 @@ static bool is_xfd_nm_fault(struct kvm_vcpu *vcpu)
!kvm_is_cr0_bit_set(vcpu, X86_CR0_TS);
}
/*
 * Handle an intercepted #PF.  The exit qualification is used as the CR2
 * value for the fault.  Returns 1 when the fault has been injected into the
 * guest directly, otherwise whatever kvm_handle_page_fault() returns.
 */
static int vmx_handle_page_fault(struct kvm_vcpu *vcpu, u32 error_code)
{
	unsigned long cr2 = vmx_get_exit_qual(vcpu);

	/*
	 * If host_apf_flags is set, skip all of the special casing below and
	 * go straight to the common page fault handler.
	 */
	if (!vcpu->arch.apf.host_apf_flags) {
		/* When using EPT, KVM intercepts #PF only to detect illegal GPAs. */
		WARN_ON_ONCE(enable_ept && !allow_smaller_maxphyaddr);

		/*
		 * On SGX2 hardware, EPCM violations are delivered as #PF with
		 * the SGX flag set in the error code (SGX1 hardware generates
		 * #GP(0)).  EPCM violations have nothing to do with shadow
		 * paging and can never be resolved by KVM; always reflect them
		 * into the guest, as a #PF if the guest has SGX2 and as a
		 * #GP(0) otherwise.
		 */
		if (error_code & PFERR_SGX_MASK) {
			WARN_ON_ONCE(!IS_ENABLED(CONFIG_X86_SGX_KVM) ||
				     !cpu_feature_enabled(X86_FEATURE_SGX2));

			if (guest_cpu_cap_has(vcpu, X86_FEATURE_SGX2))
				kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
			else
				kvm_inject_gp(vcpu, 0);
			return 1;
		}

		/*
		 * If EPT is enabled, fixup and inject the #PF.  KVM intercepts
		 * #PFs only to set PFERR_RSVD as appropriate (hardware won't
		 * set RSVD due to the GPA being legal with respect to
		 * host.MAXPHYADDR).
		 */
		if (enable_ept) {
			kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
			return 1;
		}
	}

	return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 intr_info, ex_no, error_code;
unsigned long cr2, dr6;
unsigned long dr6;
u32 vect_info;
vect_info = vmx->idt_vectoring_info;
@ -5383,19 +5439,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
return 0;
}
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu);
if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
/*
* EPT will cause page fault only if we need to
* detect illegal GPAs.
*/
WARN_ON_ONCE(!allow_smaller_maxphyaddr);
kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
return 1;
} else
return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
if (is_page_fault(intr_info))
return vmx_handle_page_fault(vcpu, error_code);
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@ -8672,16 +8717,14 @@ __init int vmx_hardware_setup(void)
* can hide/show features based on kvm_cpu_cap_has().
*/
if (nested) {
nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
if (r)
return r;
}
r = alloc_kvm_area();
if (r && nested)
nested_vmx_hardware_unsetup();
if (r)
goto err_kvm_area;
kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
@ -8708,6 +8751,11 @@ __init int vmx_hardware_setup(void)
kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_IGNORE_GUEST_PAT;
return 0;
err_kvm_area:
if (nested)
nested_vmx_hardware_unsetup();
return r;
}