From 06f2969c6a1237f05f8ba4324b6ddc2570a808d0 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:11 -0700 Subject: [PATCH 01/48] KVM: x86: Introduce KVM_{G,S}ET_ONE_REG uAPIs support Enable KVM_{G,S}ET_ONE_REG uAPIs so that userspace can access MSRs and other non-MSR registers through them, along with support for KVM_GET_REG_LIST to enumerate support for KVM-defined registers. This is in preparation for allowing userspace to read/write the guest SSP register, which is needed for the upcoming CET virtualization support. Currently, two types of registers are supported: KVM_X86_REG_TYPE_MSR and KVM_X86_REG_TYPE_KVM. All MSRs are in the former type; the latter type is added for registers that lack existing KVM uAPIs to access them. The "KVM" in the name is intended to be vague to give KVM flexibility to include other potential registers. More precise names like "SYNTHETIC" and "SYNTHETIC_MSR" were considered, but were deemed too confusing (e.g. can be conflated with synthetic guest-visible MSRs) and may put KVM into a corner (e.g. if KVM wants to change how a KVM-defined register is modeled internally). Enumerate only KVM-defined registers in KVM_GET_REG_LIST to avoid duplicating KVM_GET_MSR_INDEX_LIST, and so that KVM can return _only_ registers that are fully supported (KVM_GET_REG_LIST is vCPU-scoped, i.e. can be precise, whereas KVM_GET_MSR_INDEX_LIST is system-scoped). Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Link: https://lore.kernel.org/all/20240219074733.122080-18-weijiang.yang@intel.com [1] Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20250919223258.1604852-5-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 6 +- arch/x86/include/uapi/asm/kvm.h | 26 +++++++++ arch/x86/kvm/x86.c | 100 ++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index b3f2395a62d1..0e82fc5abd7b 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2908,6 +2908,8 @@ such as set vcpu counter or reset vcpu, and they have the following id bit patte 0x9030 0000 0002 +x86 MSR registers have the following id bit patterns:: + 0x2030 0002 4.69 KVM_GET_ONE_REG -------------------- @@ -3588,7 +3590,7 @@ VCPU matching underlying host. --------------------- :Capability: basic -:Architectures: arm64, mips, riscv +:Architectures: arm64, mips, riscv, x86 (if KVM_CAP_ONE_REG) :Type: vcpu ioctl :Parameters: struct kvm_reg_list (in/out) :Returns: 0 on success; -1 on error @@ -3631,6 +3633,8 @@ Note that s390 does not support KVM_GET_REG_LIST for historical reasons - KVM_REG_S390_GBEA +Note, for x86, all MSRs enumerated by KVM_GET_MSR_INDEX_LIST are supported as +type KVM_X86_REG_TYPE_MSR, but are NOT enumerated via KVM_GET_REG_LIST. 4.85 KVM_ARM_SET_DEVICE_ADDR (deprecated) ----------------------------------------- diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 0f15d683817d..aae1033c8afa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -411,6 +411,32 @@ struct kvm_xcrs { __u64 padding[16]; }; +#define KVM_X86_REG_TYPE_MSR 2 +#define KVM_X86_REG_TYPE_KVM 3 + +#define KVM_X86_KVM_REG_SIZE(reg) \ +({ \ + reg == KVM_REG_GUEST_SSP ? KVM_REG_SIZE_U64 : 0; \ +}) + +#define KVM_X86_REG_TYPE_SIZE(type, reg) \ +({ \ + __u64 type_size = (__u64)type << 32; \ + \ + type_size |= type == KVM_X86_REG_TYPE_MSR ? KVM_REG_SIZE_U64 : \ + type == KVM_X86_REG_TYPE_KVM ? KVM_X86_KVM_REG_SIZE(reg) : \ + 0; \ + type_size; \ +}) + +#define KVM_X86_REG_ID(type, index) \ + (KVM_REG_X86 | KVM_X86_REG_TYPE_SIZE(type, index) | index) + +#define KVM_X86_REG_MSR(index) \ + KVM_X86_REG_ID(KVM_X86_REG_TYPE_MSR, index) +#define KVM_X86_REG_KVM(index) \ + KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index) + #define KVM_SYNC_X86_REGS (1UL << 0) #define KVM_SYNC_X86_SREGS (1UL << 1) #define KVM_SYNC_X86_EVENTS (1UL << 2) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f718834bc00b..bc245e0b0443 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4735,6 +4735,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_MEMORY_FAULT_INFO: case KVM_CAP_X86_GUEST_MODE: + case KVM_CAP_ONE_REG: r = 1; break; case KVM_CAP_PRE_FAULT_MEMORY: @@ -5913,6 +5914,98 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } } +struct kvm_x86_reg_id { + __u32 index; + __u8 type; + __u8 rsvd1; + __u8 rsvd2:4; + __u8 size:4; + __u8 x86; +}; + +static int kvm_translate_kvm_reg(struct kvm_x86_reg_id *reg) +{ + return -EINVAL; +} + +static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (do_get_msr(vcpu, msr, &val)) + return -EINVAL; + + if (put_user(val, user_val)) + return -EFAULT; + + return 0; +} + +static int kvm_set_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) +{ + u64 val; + + if (get_user(val, user_val)) + return -EFAULT; + + if (do_set_msr(vcpu, msr, &val)) + return -EINVAL; + + return 0; +} + +static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, + void __user *argp) +{ + struct kvm_one_reg one_reg; + struct kvm_x86_reg_id *reg; + u64 __user *user_val; + int r; + + if (copy_from_user(&one_reg, argp, sizeof(one_reg))) + return -EFAULT; + + if ((one_reg.id & KVM_REG_ARCH_MASK) != KVM_REG_X86) + return -EINVAL; + + reg = (struct kvm_x86_reg_id *)&one_reg.id; + if (reg->rsvd1 || reg->rsvd2) + return -EINVAL; + + if (reg->type == KVM_X86_REG_TYPE_KVM) { + r = kvm_translate_kvm_reg(reg); + if (r) + return r; + } + + if (reg->type != KVM_X86_REG_TYPE_MSR) + return -EINVAL; + + if ((one_reg.id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64) + return -EINVAL; + + guard(srcu)(&vcpu->kvm->srcu); + + user_val = u64_to_user_ptr(one_reg.addr); + if (ioctl == KVM_GET_ONE_REG) + r = kvm_get_one_msr(vcpu, reg->index, user_val); + else + r = kvm_set_one_msr(vcpu, reg->index, user_val); + + return r; +} + +static int kvm_get_reg_list(struct kvm_vcpu *vcpu, + struct kvm_reg_list __user *user_list) +{ + u64 nr_regs = 0; + + if (put_user(nr_regs, &user_list->n)) + return -EFAULT; + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -6029,6 +6122,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } + case KVM_GET_ONE_REG: + case KVM_SET_ONE_REG: + r = kvm_get_set_one_reg(vcpu, ioctl, argp); + break; + case KVM_GET_REG_LIST: + r = kvm_get_reg_list(vcpu, argp); + break; case KVM_TPR_ACCESS_REPORTING: { struct kvm_tpr_access_ctl tac; From c0a5f298912222f3bb4e8f5dc67c6a1f0e93d83f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:12 -0700 Subject: [PATCH 02/48] KVM: x86: Report XSS as to-be-saved if there are supported features Add MSR_IA32_XSS to list of MSRs reported to userspace if supported_xss is non-zero, i.e. KVM supports at least one XSS based feature. Before enabling CET virtualization series, guest IA32_MSR_XSS is guaranteed to be 0, i.e., XSAVES/XRSTORS is executed in non-root mode with XSS == 0, which equals to the effect of XSAVE/XRSTOR. Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Reviewed-by: Xiaoyao Li Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bc245e0b0443..757878a222a7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -332,7 +332,7 @@ static const u32 msrs_to_save_base[] = { MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, MSR_IA32_UMWAIT_CONTROL, - MSR_IA32_XFD, MSR_IA32_XFD_ERR, + MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, }; static const u32 msrs_to_save_pmu[] = { @@ -7503,6 +7503,10 @@ static void kvm_probe_msr_to_save(u32 msr_index) if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) return; break; + case MSR_IA32_XSS: + if (!kvm_caps.supported_xss) + return; + break; default: break; } From 338543cbe033e56dcc8c13adcdf6c228953c0829 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:13 -0700 Subject: [PATCH 03/48] KVM: x86: Check XSS validity against guest CPUIDs Maintain per-guest valid XSS bits and check XSS validity against them rather than against KVM capabilities. This is to prevent bits that are supported by KVM but not supported for a guest from being set. Opportunistically return KVM_MSR_RET_UNSUPPORTED on IA32_XSS MSR accesses if guest CPUID doesn't enumerate X86_FEATURE_XSAVES. Since KVM_MSR_RET_UNSUPPORTED takes care of host_initiated cases, drop the host_initiated check. Signed-off-by: Chao Gao Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/cpuid.c | 12 ++++++++++++ arch/x86/kvm/x86.c | 7 +++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b1e2a2e033f0..5865c9b77b6d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -815,7 +815,6 @@ struct kvm_vcpu_arch { bool at_instruction_boundary; bool tpr_access_reporting; bool xfd_no_write_intercept; - u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; u64 perf_capabilities; @@ -876,6 +875,8 @@ struct kvm_vcpu_arch { u64 xcr0; u64 guest_supported_xcr0; + u64 ia32_xss; + u64 guest_supported_xss; struct kvm_pio_request pio; void *pio_data; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index efee08fad72e..6b8b5d8b13cc 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -263,6 +263,17 @@ static u64 cpuid_get_supported_xcr0(struct kvm_vcpu *vcpu) return (best->eax | ((u64)best->edx << 32)) & kvm_caps.supported_xcr0; } +static u64 cpuid_get_supported_xss(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry_index(vcpu, 0xd, 1); + if (!best) + return 0; + + return (best->ecx | ((u64)best->edx << 32)) & kvm_caps.supported_xss; +} + static __always_inline void kvm_update_feature_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entry, unsigned int x86_feature, @@ -424,6 +435,7 @@ void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) } vcpu->arch.guest_supported_xcr0 = cpuid_get_supported_xcr0(vcpu); + vcpu->arch.guest_supported_xss = cpuid_get_supported_xss(vcpu); vcpu->arch.pv_cpuid.features = kvm_apply_cpuid_pv_features_quirk(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 757878a222a7..6ae12e8c9d05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3984,15 +3984,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_XSS: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) - return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return KVM_MSR_RET_UNSUPPORTED; /* * KVM supports exposing PT to the guest, but does not support * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data & ~kvm_caps.supported_xss) + if (data & ~vcpu->arch.guest_supported_xss) return 1; vcpu->arch.ia32_xss = data; vcpu->arch.cpuid_dynamic_bits_dirty = true; From 9622e116d0d25f1ddc47bf0328612cc7d87d20f2 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:14 -0700 Subject: [PATCH 04/48] KVM: x86: Refresh CPUID on write to guest MSR_IA32_XSS Update CPUID.(EAX=0DH,ECX=1).EBX to reflect current required xstate size due to XSS MSR modification. CPUID(EAX=0DH,ECX=1).EBX reports the required storage size of all enabled xstate features in (XCR0 | IA32_XSS). The CPUID value can be used by guest before allocate sufficient xsave buffer. Note, KVM does not yet support any XSS based features, i.e. supported_xss is guaranteed to be zero at this time. Opportunistically skip CPUID updates if XSS value doesn't change. Suggested-by: Sean Christopherson Co-developed-by: Zhang Yi Z Signed-off-by: Zhang Yi Z Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Reviewed-by: Xiaoyao Li Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6b8b5d8b13cc..32fde9e80c28 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -316,7 +316,8 @@ static void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry_index(vcpu, 0xD, 1); if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || cpuid_entry_has(best, X86_FEATURE_XSAVEC))) - best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + best->ebx = xstate_required_size(vcpu->arch.xcr0 | + vcpu->arch.ia32_xss, true); } static bool kvm_cpuid_has_hyperv(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6ae12e8c9d05..d142cbc71aaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3993,6 +3993,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ if (data & ~vcpu->arch.guest_supported_xss) return 1; + if (vcpu->arch.ia32_xss == data) + break; vcpu->arch.ia32_xss = data; vcpu->arch.cpuid_dynamic_bits_dirty = true; break; From 779ed05511f2931b89fcd0087aba2fcca8890715 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:15 -0700 Subject: [PATCH 05/48] KVM: x86: Initialize kvm_caps.supported_xss Set original kvm_caps.supported_xss to (host_xss & KVM_SUPPORTED_XSS) if XSAVES is supported. host_xss contains the host supported xstate feature bits for thread FPU context switch, KVM_SUPPORTED_XSS includes all KVM enabled XSS feature bits, the resulting value represents the supervisor xstates that are available to guest and are backed by host FPU framework for swapping {guest,host} XSAVE-managed registers/MSRs. [sean: relocate and enhance comment about PT / XSS[8] ] Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Reviewed-by: Xiaoyao Li Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d142cbc71aaa..831c5e488de3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -217,6 +217,14 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) +/* + * Note, KVM supports exposing PT to the guest, but does not support context + * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping + * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support + * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). + */ +#define KVM_SUPPORTED_XSS 0 + bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); @@ -3986,11 +3994,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_XSS: if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return KVM_MSR_RET_UNSUPPORTED; - /* - * KVM supports exposing PT to the guest, but does not support - * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than - * XSAVES/XRSTORS to save/restore PT MSRs. - */ + if (data & ~vcpu->arch.guest_supported_xss) return 1; if (vcpu->arch.ia32_xss == data) @@ -9822,14 +9826,17 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) kvm_host.xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_caps.supported_xcr0 = kvm_host.xcr0 & KVM_SUPPORTED_XCR0; } + + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + rdmsrq(MSR_IA32_XSS, kvm_host.xss); + kvm_caps.supported_xss = kvm_host.xss & KVM_SUPPORTED_XSS; + } + kvm_caps.supported_quirks = KVM_X86_VALID_QUIRKS; kvm_caps.inapplicable_quirks = KVM_X86_CONDITIONAL_QUIRKS; rdmsrq_safe(MSR_EFER, &kvm_host.efer); - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrq(MSR_IA32_XSS, kvm_host.xss); - kvm_init_pmu_capability(ops->pmu_ops); if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) From e44eb58334bbc28d790ed2851385cc86ea76dc12 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:16 -0700 Subject: [PATCH 06/48] KVM: x86: Load guest FPU state when access XSAVE-managed MSRs Load the guest's FPU state if userspace is accessing MSRs whose values are managed by XSAVES. Introduce two helpers, kvm_{get,set}_xstate_msr(), to facilitate access to such kind of MSRs. If MSRs supported in kvm_caps.supported_xss are passed through to guest, the guest MSRs are swapped with host's before vCPU exits to userspace and after it reenters kernel before next VM-entry. Because the modified code is also used for the KVM_GET_MSRS device ioctl(), explicitly check @vcpu is non-null before attempting to load guest state. The XSAVE-managed MSRs cannot be retrieved via the device ioctl() without loading guest FPU state (which doesn't exist). Note that guest_cpuid_has() is not queried as host userspace is allowed to access MSRs that have not been exposed to the guest, e.g. it might do KVM_SET_MSRS prior to KVM_SET_CPUID2. The two helpers are put here in order to manifest accessing xsave-managed MSRs requires special check and handling to guarantee the correctness of read/write to the MSRs. Co-developed-by: Yang Weijiang Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: drop S_CET, add big comment, move accessors to x86.c] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Reviewed-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250919223258.1604852-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 87 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 831c5e488de3..c2e11f3d50fb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -136,6 +136,9 @@ static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2); static DEFINE_MUTEX(vendor_module_lock); +static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu); +static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu); + struct kvm_x86_ops kvm_x86_ops __read_mostly; #define KVM_X86_OP(func) \ @@ -3801,6 +3804,67 @@ static void record_steal_time(struct kvm_vcpu *vcpu) mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } +/* + * Returns true if the MSR in question is managed via XSTATE, i.e. is context + * switched with the rest of guest FPU state. Note! S_CET is _not_ context + * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS. + * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields, + * the value saved/restored via XSTATE is always the host's value. That detail + * is _extremely_ important, as the guest's S_CET must _never_ be resident in + * hardware while executing in the host. Loading guest values for U_CET and + * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to + * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower + * privilege levels, i.e. are effectively only consumed by userspace as well. + */ +static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + if (!vcpu) + return false; + + switch (msr) { + case MSR_IA32_U_CET: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) || + guest_cpu_cap_has(vcpu, X86_FEATURE_IBT); + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + default: + return false; + } +} + +/* + * Lock (and if necessary, re-load) the guest FPU, i.e. XSTATE, and access an + * MSR that is managed via XSTATE. Note, the caller is responsible for doing + * the initial FPU load, this helper only ensures that guest state is resident + * in hardware (the kernel can load its FPU state in IRQ context). + */ +static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, + struct msr_data *msr_info, + int access) +{ + BUILD_BUG_ON(access != MSR_TYPE_R && access != MSR_TYPE_W); + + KVM_BUG_ON(!is_xstate_managed_msr(vcpu, msr_info->index), vcpu->kvm); + KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm); + + kvm_fpu_get(); + if (access == MSR_TYPE_R) + rdmsrq(msr_info->index, msr_info->data); + else + wrmsrq(msr_info->index, msr_info->data); + kvm_fpu_put(); +} + +static __maybe_unused void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); +} + +static __maybe_unused void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u32 msr = msr_info->index; @@ -4551,11 +4615,25 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { + bool fpu_loaded = false; int i; - for (i = 0; i < msrs->nmsrs; ++i) + for (i = 0; i < msrs->nmsrs; ++i) { + /* + * If userspace is accessing one or more XSTATE-managed MSRs, + * temporarily load the guest's FPU state so that the guest's + * MSR value(s) is resident in hardware and thus can be accessed + * via RDMSR/WRMSR. + */ + if (!fpu_loaded && is_xstate_managed_msr(vcpu, entries[i].index)) { + kvm_load_guest_fpu(vcpu); + fpu_loaded = true; + } if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + } + if (fpu_loaded) + kvm_put_guest_fpu(vcpu); return i; } @@ -5965,6 +6043,7 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, struct kvm_one_reg one_reg; struct kvm_x86_reg_id *reg; u64 __user *user_val; + bool load_fpu; int r; if (copy_from_user(&one_reg, argp, sizeof(one_reg))) @@ -5991,12 +6070,18 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, guard(srcu)(&vcpu->kvm->srcu); + load_fpu = is_xstate_managed_msr(vcpu, reg->index); + if (load_fpu) + kvm_load_guest_fpu(vcpu); + user_val = u64_to_user_ptr(one_reg.addr); if (ioctl == KVM_GET_ONE_REG) r = kvm_get_one_msr(vcpu, reg->index, user_val); else r = kvm_set_one_msr(vcpu, reg->index, user_val); + if (load_fpu) + kvm_put_guest_fpu(vcpu); return r; } From 586ef9dcbb28fc0bc6beb496e6dc8a54276e7a32 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:17 -0700 Subject: [PATCH 07/48] KVM: x86: Add fault checks for guest CR4.CET setting Check potential faults for CR4.CET setting per Intel SDM requirements. CET can be enabled if and only if CR0.WP == 1, i.e. setting CR4.CET == 1 faults if CR0.WP == 0 and setting CR0.WP == 0 fails if CR4.CET == 1. Signed-off-by: Yang Weijiang Reviewed-by: Chao Gao Reviewed-by: Maxim Levitsky Reviewed-by: Xiaoyao Li Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20250919223258.1604852-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c2e11f3d50fb..6d67c969e18a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1176,6 +1176,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))) return 1; + if (!(cr0 & X86_CR0_WP) && kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) + return 1; + kvm_x86_call(set_cr0)(vcpu, cr0); kvm_post_set_cr0(vcpu, old_cr0, cr0); @@ -1376,6 +1379,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } + if ((cr4 & X86_CR4_CET) && !kvm_is_cr0_bit_set(vcpu, X86_CR0_WP)) + return 1; + kvm_x86_call(set_cr4)(vcpu, cr4); kvm_post_set_cr4(vcpu, old_cr4, cr4); From 6a11c860d8a4aa1c7351246bd1ee7b41f26399b6 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:18 -0700 Subject: [PATCH 08/48] KVM: x86: Report KVM supported CET MSRs as to-be-saved Add CET MSRs to the list of MSRs reported to userspace if the feature, i.e. IBT or SHSTK, associated with the MSRs is supported by KVM. Suggested-by: Chao Gao Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d67c969e18a..5f23d2d2731d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -344,6 +344,10 @@ static const u32 msrs_to_save_base[] = { MSR_IA32_UMWAIT_CONTROL, MSR_IA32_XFD, MSR_IA32_XFD_ERR, MSR_IA32_XSS, + + MSR_IA32_U_CET, MSR_IA32_S_CET, + MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, + MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, }; static const u32 msrs_to_save_pmu[] = { @@ -7603,6 +7607,20 @@ static void kvm_probe_msr_to_save(u32 msr_index) if (!kvm_caps.supported_xss) return; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + return; + break; + case MSR_IA32_INT_SSP_TAB: + if (!kvm_cpu_cap_has(X86_FEATURE_LM)) + return; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + return; + break; default: break; } From d6c387fc396b3d2c5aa00cb5b46f6401fb86bb43 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:19 -0700 Subject: [PATCH 09/48] KVM: VMX: Introduce CET VMCS fields and control bits Control-flow Enforcement Technology (CET) is a kind of CPU feature used to prevent Return/CALL/Jump-Oriented Programming (ROP/COP/JOP) attacks. It provides two sub-features(SHSTK,IBT) to defend against ROP/COP/JOP style control-flow subversion attacks. Shadow Stack (SHSTK): A shadow stack is a second stack used exclusively for control transfer operations. The shadow stack is separate from the data/normal stack and can be enabled individually in user and kernel mode. When shadow stack is enabled, CALL pushes the return address on both the data and shadow stack. RET pops the return address from both stacks and compares them. If the return addresses from the two stacks do not match, the processor generates a #CP. Indirect Branch Tracking (IBT): IBT introduces instruction(ENDBRANCH)to mark valid target addresses of indirect branches (CALL, JMP etc...). If an indirect branch is executed and the next instruction is _not_ an ENDBRANCH, the processor generates a #CP. These instruction behaves as a NOP on platforms that have no CET. Several new CET MSRs are defined to support CET: MSR_IA32_{U,S}_CET: CET settings for {user,supervisor} CET respectively. MSR_IA32_PL{0,1,2,3}_SSP: SHSTK pointer linear address for CPL{0,1,2,3}. MSR_IA32_INT_SSP_TAB: Linear address of SHSTK pointer table, whose entry is indexed by IST of interrupt gate desc. Two XSAVES state bits are introduced for CET: IA32_XSS:[bit 11]: Control saving/restoring user mode CET states IA32_XSS:[bit 12]: Control saving/restoring supervisor mode CET states. Six VMCS fields are introduced for CET: {HOST,GUEST}_S_CET: Stores CET settings for kernel mode. {HOST,GUEST}_SSP: Stores current active SSP. {HOST,GUEST}_INTR_SSP_TABLE: Stores current active MSR_IA32_INT_SSP_TAB. On Intel platforms, two additional bits are defined in VM_EXIT and VM_ENTRY control fields: If VM_EXIT_LOAD_CET_STATE = 1, host CET states are loaded from following VMCS fields at VM-Exit: HOST_S_CET HOST_SSP HOST_INTR_SSP_TABLE If VM_ENTRY_LOAD_CET_STATE = 1, guest CET states are loaded from following VMCS fields at VM-Entry: GUEST_S_CET GUEST_SSP GUEST_INTR_SSP_TABLE Co-developed-by: Zhang Yi Z Signed-off-by: Zhang Yi Z Signed-off-by: Yang Weijiang Reviewed-by: Chao Gao Reviewed-by: Maxim Levitsky Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cca7d6641287..ce10a7e2d3d9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -106,6 +106,7 @@ #define VM_EXIT_CLEAR_BNDCFGS 0x00800000 #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 +#define VM_EXIT_LOAD_CET_STATE 0x10000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff @@ -119,6 +120,7 @@ #define VM_ENTRY_LOAD_BNDCFGS 0x00010000 #define VM_ENTRY_PT_CONCEAL_PIP 0x00020000 #define VM_ENTRY_LOAD_IA32_RTIT_CTL 0x00040000 +#define VM_ENTRY_LOAD_CET_STATE 0x00100000 #define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff @@ -369,6 +371,9 @@ enum vmcs_field { GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822, GUEST_SYSENTER_ESP = 0x00006824, GUEST_SYSENTER_EIP = 0x00006826, + GUEST_S_CET = 0x00006828, + GUEST_SSP = 0x0000682a, + GUEST_INTR_SSP_TABLE = 0x0000682c, HOST_CR0 = 0x00006c00, HOST_CR3 = 0x00006c02, HOST_CR4 = 0x00006c04, @@ -381,6 +386,9 @@ enum vmcs_field { HOST_IA32_SYSENTER_EIP = 0x00006c12, HOST_RSP = 0x00006c14, HOST_RIP = 0x00006c16, + HOST_S_CET = 0x00006c18, + HOST_SSP = 0x00006c1a, + HOST_INTR_SSP_TABLE = 0x00006c1c }; /* From 9d6812d415358372aaaf1dfe95bc30d11e4e95db Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:20 -0700 Subject: [PATCH 10/48] KVM: x86: Enable guest SSP read/write interface with new uAPIs Add a KVM-defined ONE_REG register, KVM_REG_GUEST_SSP, to let userspace save and restore the guest's Shadow Stack Pointer (SSP). On both Intel and AMD, SSP is a hardware register that can only be accessed by software via dedicated ISA (e.g. RDSSP) or via VMCS/VMCB fields (used by hardware to context switch SSP at entry/exit). As a result, SSP doesn't fit in any of KVM's existing interfaces for saving/restoring state. Internally, treat SSP as a fake/synthetic MSR, as the semantics of writes to SSP follow that of several other Shadow Stack MSRs, e.g. the PLx_SSP MSRs. Use a translation layer to hide the KVM-internal MSR index so that the arbitrary index doesn't become ABI, e.g. so that KVM can rework its implementation as needed, so long as the ONE_REG ABI is maintained. Explicitly reject accesses to SSP if the vCPU doesn't have Shadow Stack support to avoid running afoul of ignore_msrs, which unfortunately applies to host-initiated accesses (which is a discussion for another day). I.e. ensure consistent behavior for KVM-defined registers irrespective of ignore_msrs. Link: https://lore.kernel.org/all/aca9d389-f11e-4811-90cf-d98e345a5cc2@intel.com Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-14-seanjc@google.com Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/api.rst | 8 +++++++ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++++++++---- arch/x86/kvm/x86.h | 10 +++++++++ 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 0e82fc5abd7b..fac1774031ee 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2911,6 +2911,14 @@ such as set vcpu counter or reset vcpu, and they have the following id bit patte x86 MSR registers have the following id bit patterns:: 0x2030 0002 +Following are the KVM-defined registers for x86: + +======================= ========= ============================================= + Encoding Register Description +======================= ========= ============================================= + 0x2030 0003 0000 0000 SSP Shadow Stack Pointer +======================= ========= ============================================= + 4.69 KVM_GET_ONE_REG -------------------- diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index aae1033c8afa..467116186e71 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -437,6 +437,9 @@ struct kvm_xcrs { #define KVM_X86_REG_KVM(index) \ KVM_X86_REG_ID(KVM_X86_REG_TYPE_KVM, index) +/* KVM-defined registers starting from 0 */ +#define KVM_REG_GUEST_SSP 0 + #define KVM_SYNC_X86_REGS (1UL << 0) #define KVM_SYNC_X86_SREGS (1UL << 1) #define KVM_SYNC_X86_EVENTS (1UL << 2) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f23d2d2731d..d85bb723f25a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6016,9 +6016,27 @@ struct kvm_x86_reg_id { __u8 x86; }; -static int kvm_translate_kvm_reg(struct kvm_x86_reg_id *reg) +static int kvm_translate_kvm_reg(struct kvm_vcpu *vcpu, + struct kvm_x86_reg_id *reg) { - return -EINVAL; + switch (reg->index) { + case KVM_REG_GUEST_SSP: + /* + * FIXME: If host-initiated accesses are ever exempted from + * ignore_msrs (in kvm_do_msr_access()), drop this manual check + * and rely on KVM's standard checks to reject accesses to regs + * that don't exist. + */ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return -EINVAL; + + reg->type = KVM_X86_REG_TYPE_MSR; + reg->index = MSR_KVM_INTERNAL_GUEST_SSP; + break; + default: + return -EINVAL; + } + return 0; } static int kvm_get_one_msr(struct kvm_vcpu *vcpu, u32 msr, u64 __user *user_val) @@ -6067,7 +6085,7 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, return -EINVAL; if (reg->type == KVM_X86_REG_TYPE_KVM) { - r = kvm_translate_kvm_reg(reg); + r = kvm_translate_kvm_reg(vcpu, reg); if (r) return r; } @@ -6098,11 +6116,22 @@ static int kvm_get_set_one_reg(struct kvm_vcpu *vcpu, unsigned int ioctl, static int kvm_get_reg_list(struct kvm_vcpu *vcpu, struct kvm_reg_list __user *user_list) { - u64 nr_regs = 0; + u64 nr_regs = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) ? 1 : 0; + u64 user_nr_regs; + + if (get_user(user_nr_regs, &user_list->n)) + return -EFAULT; if (put_user(nr_regs, &user_list->n)) return -EFAULT; + if (user_nr_regs < nr_regs) + return -E2BIG; + + if (nr_regs && + put_user(KVM_X86_REG_KVM(KVM_REG_GUEST_SSP), &user_list->reg[0])) + return -EFAULT; + return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 786e36fcd0fb..a7c9c72fca93 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -101,6 +101,16 @@ do { \ #define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX #define KVM_SVM_DEFAULT_PLE_WINDOW 3000 +/* + * KVM's internal, non-ABI indices for synthetic MSRs. The values themselves + * are arbitrary and have no meaning, the only requirement is that they don't + * conflict with "real" MSRs that KVM supports. Use values at the upper end + * of KVM's reserved paravirtual MSR range to minimize churn, i.e. these values + * will be usable until KVM exhausts its supply of paravirtual MSR indices. + */ + +#define MSR_KVM_INTERNAL_GUEST_SSP 0x4b564dff + static inline unsigned int __grow_ple_window(unsigned int val, unsigned int base, unsigned int modifier, unsigned int max) { From 8b59d0275c964bcbe573dde46bd3c1f20ed2d2bd Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:21 -0700 Subject: [PATCH 11/48] KVM: VMX: Emulate read and write to CET MSRs Add emulation interface for CET MSR access. The emulation code is split into common part and vendor specific part. The former does common checks for MSRs, e.g., accessibility, data validity etc., then passes operation to either XSAVE-managed MSRs via the helpers or CET VMCS fields. SSP can only be read via RDSSP. Writing even requires destructive and potentially faulting operations such as SAVEPREVSSP/RSTORSSP or SETSSBSY/CLRSSBSY. Let the host use a pseudo-MSR that is just a wrapper for the GUEST_SSP field of the VMCS. Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: drop call to kvm_set_xstate_msr() for S_CET, consolidate code] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-15-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 18 ++++++++++++ arch/x86/kvm/x86.c | 64 ++++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/x86.h | 23 +++++++++++++++ 3 files changed, 103 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d2fa64bddb57..62bf93dc0825 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2093,6 +2093,15 @@ int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; + case MSR_IA32_S_CET: + msr_info->data = vmcs_readl(GUEST_S_CET); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + msr_info->data = vmcs_readl(GUEST_SSP); + break; + case MSR_IA32_INT_SSP_TAB: + msr_info->data = vmcs_readl(GUEST_INTR_SSP_TABLE); + break; case MSR_IA32_DEBUGCTLMSR: msr_info->data = vmx_guest_debugctl_read(); break; @@ -2411,6 +2420,15 @@ int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else vmx->pt_desc.guest.addr_a[index / 2] = data; break; + case MSR_IA32_S_CET: + vmcs_writel(GUEST_S_CET, data); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + vmcs_writel(GUEST_SSP, data); + break; + case MSR_IA32_INT_SSP_TAB: + vmcs_writel(GUEST_INTR_SSP_TABLE, data); + break; case MSR_IA32_PERF_CAPABILITIES: if (data & PERF_CAP_LBR_FMT) { if ((data & PERF_CAP_LBR_FMT) != diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d85bb723f25a..54d280fe9a4b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1890,6 +1890,44 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, data = (u32)data; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + if (!kvm_is_valid_u_s_cet(vcpu, data)) + return 1; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + /* + * Note that the MSR emulation here is flawed when a vCPU + * doesn't support the Intel 64 architecture. The expected + * architectural behavior in this case is that the upper 32 + * bits do not exist and should always read '0'. However, + * because the actual hardware on which the virtual CPU is + * running does support Intel 64, XRSTORS/XSAVES in the + * guest could observe behavior that violates the + * architecture. Intercepting XRSTORS/XSAVES for this + * special case isn't deemed worthwhile. + */ + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + /* + * MSR_IA32_INT_SSP_TAB is not present on processors that do + * not support Intel 64 architecture. + */ + if (index == MSR_IA32_INT_SSP_TAB && !guest_cpu_cap_has(vcpu, X86_FEATURE_LM)) + return KVM_MSR_RET_UNSUPPORTED; + if (is_noncanonical_msr_address(data, vcpu)) + return 1; + /* All SSP MSRs except MSR_IA32_INT_SSP_TAB must be 4-byte aligned */ + if (index != MSR_IA32_INT_SSP_TAB && !IS_ALIGNED(data, 4)) + return 1; + break; } msr.data = data; @@ -1934,6 +1972,20 @@ static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, !guest_cpu_cap_has(vcpu, X86_FEATURE_RDPID)) return 1; break; + case MSR_IA32_U_CET: + case MSR_IA32_S_CET: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT)) + return KVM_MSR_RET_UNSUPPORTED; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + if (!host_initiated) + return 1; + fallthrough; + case MSR_IA32_PL0_SSP ... MSR_IA32_INT_SSP_TAB: + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + return KVM_MSR_RET_UNSUPPORTED; + break; } msr.index = index; @@ -3865,12 +3917,12 @@ static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu, kvm_fpu_put(); } -static __maybe_unused void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static void kvm_set_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_W); } -static __maybe_unused void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +static void kvm_get_xstate_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_access_xstate_msr(vcpu, msr_info, MSR_TYPE_R); } @@ -4256,6 +4308,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.guest_fpu.xfd_err = data; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_set_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); @@ -4605,6 +4661,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.guest_fpu.xfd_err; break; #endif + case MSR_IA32_U_CET: + case MSR_IA32_PL0_SSP ... MSR_IA32_PL3_SSP: + kvm_get_xstate_msr(vcpu, msr_info); + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a7c9c72fca93..076eccba0f7e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -710,4 +710,27 @@ int ____kvm_emulate_hypercall(struct kvm_vcpu *vcpu, int cpl, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); +#define CET_US_RESERVED_BITS GENMASK(9, 6) +#define CET_US_SHSTK_MASK_BITS GENMASK(1, 0) +#define CET_US_IBT_MASK_BITS (GENMASK_ULL(5, 2) | GENMASK_ULL(63, 10)) +#define CET_US_LEGACY_BITMAP_BASE(data) ((data) >> 12) + +static inline bool kvm_is_valid_u_s_cet(struct kvm_vcpu *vcpu, u64 data) +{ + if (data & CET_US_RESERVED_BITS) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + (data & CET_US_SHSTK_MASK_BITS)) + return false; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && + (data & CET_US_IBT_MASK_BITS)) + return false; + if (!IS_ALIGNED(CET_US_LEGACY_BITMAP_BASE(data), 4)) + return false; + /* IBT can be suppressed iff the TRACKER isn't WAIT_ENDBR. */ + if ((data & CET_SUPPRESS) && (data & CET_WAIT_ENDBR)) + return false; + + return true; +} #endif From 1a61bd0d126abbf01c384c44b46bcdd4ef495850 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:22 -0700 Subject: [PATCH 12/48] KVM: x86: Save and reload SSP to/from SMRAM Save CET SSP to SMRAM on SMI and reload it on RSM. KVM emulates HW arch behavior when guest enters/leaves SMM mode,i.e., save registers to SMRAM at the entry of SMM and reload them at the exit to SMM. Per SDM, SSP is one of such registers on 64-bit Arch, and add the support for SSP. Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-16-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/smm.c | 8 ++++++++ arch/x86/kvm/smm.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 5dd8a1646800..b0b14ba37f9a 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -269,6 +269,10 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, enter_smm_save_seg_64(vcpu, &smram->gs, VCPU_SREG_GS); smram->int_shadow = kvm_x86_call(get_interrupt_shadow)(vcpu); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + kvm_msr_read(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, &smram->ssp)) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); } #endif @@ -558,6 +562,10 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, kvm_x86_call(set_interrupt_shadow)(vcpu, 0); ctxt->interruptibility = (u8)smstate->int_shadow; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + kvm_msr_write(vcpu, MSR_KVM_INTERNAL_GUEST_SSP, smstate->ssp)) + return X86EMUL_UNHANDLEABLE; + return X86EMUL_CONTINUE; } #endif diff --git a/arch/x86/kvm/smm.h b/arch/x86/kvm/smm.h index 551703fbe200..db3c88f16138 100644 --- a/arch/x86/kvm/smm.h +++ b/arch/x86/kvm/smm.h @@ -116,8 +116,8 @@ struct kvm_smram_state_64 { u32 smbase; u32 reserved4[5]; - /* ssp and svm_* fields below are not implemented by KVM */ u64 ssp; + /* svm_* fields below are not implemented by KVM */ u64 svm_guest_pat; u64 svm_host_efer; u64 svm_host_cr4; From 25f3840483e6c69e4d1a689cf3edc70f93604f2a Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:23 -0700 Subject: [PATCH 13/48] KVM: VMX: Set up interception for CET MSRs Disable interception for CET MSRs that can be accessed via XSAVES/XRSTORS, and exist accordingly to CPUID, as accesses through XSTATE aren't subject to MSR interception checks, i.e. can't be intercepted without intercepting and emulating XSAVES/XRSTORS, and KVM doesn't support emulating XSAVE/XRSTOR instructions. Don't condition interception on the guest actually having XSAVES as there is no benefit to intercepting the accesses (when the MSRs exist). The MSRs in question are either context switched by the CPU on VM-Enter/VM-Exit or by KVM via XSAVES/XRSTORS (KVM requires XSAVES to virtualization SHSTK), i.e. KVM is going to load guest values into hardware irrespective of guest XSAVES support. Suggested-by: Sean Christopherson Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Reviewed-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250919223258.1604852-17-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 62bf93dc0825..bde768c7111f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4088,6 +4088,8 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { + bool intercept; + if (!cpu_has_vmx_msr_bitmap()) return; @@ -4133,6 +4135,23 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W, !guest_cpu_cap_has(vcpu, X86_FEATURE_FLUSH_L1D)); + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, intercept); + } + + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK) || kvm_cpu_cap_has(X86_FEATURE_IBT)) { + intercept = !guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + vmx_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); + } + /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot be * filtered by userspace. From 584ba3ffb9843fd12d3b4a33cfe056e2264392a0 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:24 -0700 Subject: [PATCH 14/48] KVM: VMX: Set host constant supervisor states to VMCS fields Save constant values to HOST_{S_CET,SSP,INTR_SSP_TABLE} field explicitly. Kernel IBT is supported and the setting in MSR_IA32_S_CET is static after post-boot(The exception is BIOS call case but vCPU thread never across it) and KVM doesn't need to refresh HOST_S_CET field before every VM-Enter/ VM-Exit sequence. Host supervisor shadow stack is not enabled now and SSP is not accessible to kernel mode, thus it's safe to set host IA32_INT_SSP_TAB/SSP VMCS field to 0s. When shadow stack is enabled for CPL3, SSP is reloaded from PL3_SSP before it exits to userspace. Check SDM Vol 2A/B Chapter 3/4 for SYSCALL/ SYSRET/SYSENTER SYSEXIT/RDSSP/CALL etc. Prevent KVM module loading if host supervisor shadow stack SHSTK_EN is set in MSR_IA32_S_CET as KVM cannot co-exit with it correctly. Suggested-by: Sean Christopherson Suggested-by: Chao Gao Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: snapshot host S_CET if SHSTK *or* IBT is supported] Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-18-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/capabilities.h | 4 ++++ arch/x86/kvm/vmx/vmx.c | 15 +++++++++++++++ arch/x86/kvm/x86.c | 12 ++++++++++++ arch/x86/kvm/x86.h | 1 + 4 files changed, 32 insertions(+) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f614428dbeda..59c83888bdc0 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -100,6 +100,10 @@ static inline bool cpu_has_load_perf_global_ctrl(void) return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; } +static inline bool cpu_has_load_cet_ctrl(void) +{ + return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); +} static inline bool cpu_has_vmx_mpx(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bde768c7111f..4ab066a3c22f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4312,6 +4312,21 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) if (cpu_has_load_ia32_efer()) vmcs_write64(HOST_IA32_EFER, kvm_host.efer); + + /* + * Supervisor shadow stack is not enabled on host side, i.e., + * host IA32_S_CET.SHSTK_EN bit is guaranteed to 0 now, per SDM + * description(RDSSP instruction), SSP is not readable in CPL0, + * so resetting the two registers to 0s at VM-Exit does no harm + * to kernel execution. When execution flow exits to userspace, + * SSP is reloaded from IA32_PL3_SSP. Check SDM Vol.2A/B Chapter + * 3 and 4 for details. + */ + if (cpu_has_load_cet_ctrl()) { + vmcs_writel(HOST_S_CET, kvm_host.s_cet); + vmcs_writel(HOST_SSP, 0); + vmcs_writel(HOST_INTR_SSP_TABLE, 0); + } } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54d280fe9a4b..0050509a7de2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9997,6 +9997,18 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) return -EIO; } + if (boot_cpu_has(X86_FEATURE_SHSTK) || boot_cpu_has(X86_FEATURE_IBT)) { + rdmsrq(MSR_IA32_S_CET, kvm_host.s_cet); + /* + * Linux doesn't yet support supervisor shadow stacks (SSS), so + * KVM doesn't save/restore the associated MSRs, i.e. KVM may + * clobber the host values. Yell and refuse to load if SSS is + * unexpectedly enabled, e.g. to avoid crashing the host. + */ + if (WARN_ON_ONCE(kvm_host.s_cet & CET_SHSTK_EN)) + return -EIO; + } + memset(&kvm_caps, 0, sizeof(kvm_caps)); x86_emulator_cache = kvm_alloc_emulator_cache(); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 076eccba0f7e..65cbd454c4f1 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -50,6 +50,7 @@ struct kvm_host_values { u64 efer; u64 xcr0; u64 xss; + u64 s_cet; u64 arch_capabilities; }; From 57c3db7e2e26970ee3630a25913368f849ea803a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:25 -0700 Subject: [PATCH 15/48] KVM: x86: Don't emulate instructions affected by CET features Don't emulate branch instructions, e.g. CALL/RET/JMP etc., that are affected by Shadow Stacks and/or Indirect Branch Tracking when said features are enabled in the guest, as fully emulating CET would require significant complexity for no practical benefit (KVM shouldn't need to emulate branch instructions on modern hosts). Simply doing nothing isn't an option as that would allow a malicious entity to subvert CET protections via the emulator. To detect instructions that are subject to IBT or affect IBT state, use the existing IsBranch flag along with the source operand type to detect indirect branches, and the existing NearBranch flag to detect far JMPs and CALLs, all of which are effectively indirect. Explicitly check for emulation of IRET, FAR RET (IMM), and SYSEXIT (the ret-like far branches) instead of adding another flag, e.g. IsRet, as it's unlikely the emulator will ever need to check for return-like instructions outside of this one specific flow. Use an allow-list instead of a deny-list because (a) it's a shorter list and (b) so that a missed entry gets a false positive, not a false negative (i.e. reject emulation instead of clobbering CET state). For Shadow Stacks, explicitly track instructions that directly affect the current SSP, as KVM's emulator doesn't have existing flags that can be used to precisely detect such instructions. Alternatively, the em_xxx() helpers could directly check for ShadowStack interactions, but using a dedicated flag is arguably easier to audit, and allows for handling both IBT and SHSTK in one fell swoop. Note! On far transfers, do NOT consult the current privilege level and instead treat SHSTK/IBT as being enabled if they're enabled for User *or* Supervisor mode. On inter-privilege level far transfers, SHSTK and IBT can be in play for the target privilege level, i.e. checking the current privilege could get a false negative, and KVM doesn't know the target privilege level until emulation gets under way. Note #2, FAR JMP from 64-bit mode to compatibility mode interacts with the current SSP, but only to ensure SSP[63:32] == 0. Don't tag FAR JMP as SHSTK, which would be rather confusing and would result in FAR JMP being rejected unnecessarily the vast majority of the time (ignoring that it's unlikely to ever be emulated). A future commit will add the #GP(0) check for the specific FAR JMP scenario. Note #3, task switches also modify SSP and so need to be rejected. That too will be addressed in a future commit. Suggested-by: Chao Gao Originally-by: Yang Weijiang Cc: Mathias Krause Cc: John Allen Cc: Rick Edgecombe Reviewed-by: Chao Gao Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-19-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 115 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 102 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 23929151a5b8..48b067b179e0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -178,6 +178,7 @@ #define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */ #define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */ #define IsBranch ((u64)1 << 56) /* Instruction is considered a branch. */ +#define ShadowStack ((u64)1 << 57) /* Instruction affects Shadow Stacks. */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4068,8 +4069,8 @@ static const struct opcode group4[] = { static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), - I(SrcMem | NearBranch | IsBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps | IsBranch, em_call_far), + I(SrcMem | NearBranch | IsBranch | ShadowStack, em_call_near_abs), + I(SrcMemFAddr | ImplicitOps | IsBranch | ShadowStack, em_call_far), I(SrcMem | NearBranch | IsBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps | IsBranch, em_jmp_far), I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined), @@ -4304,7 +4305,7 @@ static const struct opcode opcode_table[256] = { DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), - I(SrcImmFAddr | No64 | IsBranch, em_call_far), N, + I(SrcImmFAddr | No64 | IsBranch | ShadowStack, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), II(ImplicitOps | Stack, em_popf, popf), I(ImplicitOps, em_sahf), I(ImplicitOps, em_lahf), @@ -4324,19 +4325,19 @@ static const struct opcode opcode_table[256] = { X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), - I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch, em_ret_near_imm), - I(ImplicitOps | NearBranch | IsBranch, em_ret), + I(ImplicitOps | NearBranch | SrcImmU16 | IsBranch | ShadowStack, em_ret_near_imm), + I(ImplicitOps | NearBranch | IsBranch | ShadowStack, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), - I(ImplicitOps | SrcImmU16 | IsBranch, em_ret_far_imm), - I(ImplicitOps | IsBranch, em_ret_far), - D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch, intn), + I(ImplicitOps | SrcImmU16 | IsBranch | ShadowStack, em_ret_far_imm), + I(ImplicitOps | IsBranch | ShadowStack, em_ret_far), + D(ImplicitOps | IsBranch), DI(SrcImmByte | IsBranch | ShadowStack, intn), D(ImplicitOps | No64 | IsBranch), - II(ImplicitOps | IsBranch, em_iret, iret), + II(ImplicitOps | IsBranch | ShadowStack, em_iret, iret), /* 0xD0 - 0xD7 */ G(Src2One | ByteOp, group2), G(Src2One, group2), G(Src2CL | ByteOp, group2), G(Src2CL, group2), @@ -4352,7 +4353,7 @@ static const struct opcode opcode_table[256] = { I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in), I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out), /* 0xE8 - 0xEF */ - I(SrcImm | NearBranch | IsBranch, em_call), + I(SrcImm | NearBranch | IsBranch | ShadowStack, em_call), D(SrcImm | ImplicitOps | NearBranch | IsBranch), I(SrcImmFAddr | No64 | IsBranch, em_jmp_far), D(SrcImmByte | ImplicitOps | NearBranch | IsBranch), @@ -4371,7 +4372,7 @@ static const struct opcode opcode_table[256] = { static const struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ G(0, group6), GD(0, &group7), N, N, - N, I(ImplicitOps | EmulateOnUD | IsBranch, em_syscall), + N, I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, @@ -4402,8 +4403,8 @@ static const struct opcode twobyte_table[256] = { IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), II(ImplicitOps | Priv, em_rdmsr, rdmsr), IIP(ImplicitOps, em_rdpmc, rdpmc, check_rdpmc), - I(ImplicitOps | EmulateOnUD | IsBranch, em_sysenter), - I(ImplicitOps | Priv | EmulateOnUD | IsBranch, em_sysexit), + I(ImplicitOps | EmulateOnUD | IsBranch | ShadowStack, em_sysenter), + I(ImplicitOps | Priv | EmulateOnUD | IsBranch | ShadowStack, em_sysexit), N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ @@ -4514,6 +4515,60 @@ static const struct opcode opcode_map_0f_38[256] = { #undef I2bvIP #undef I6ALU +static bool is_shstk_instruction(struct x86_emulate_ctxt *ctxt) +{ + return ctxt->d & ShadowStack; +} + +static bool is_ibt_instruction(struct x86_emulate_ctxt *ctxt) +{ + u64 flags = ctxt->d; + + if (!(flags & IsBranch)) + return false; + + /* + * All far JMPs and CALLs (including SYSCALL, SYSENTER, and INTn) are + * indirect and thus affect IBT state. All far RETs (including SYSEXIT + * and IRET) are protected via Shadow Stacks and thus don't affect IBT + * state. IRET #GPs when returning to virtual-8086 and IBT or SHSTK is + * enabled, but that should be handled by IRET emulation (in the very + * unlikely scenario that KVM adds support for fully emulating IRET). + */ + if (!(flags & NearBranch)) + return ctxt->execute != em_iret && + ctxt->execute != em_ret_far && + ctxt->execute != em_ret_far_imm && + ctxt->execute != em_sysexit; + + switch (flags & SrcMask) { + case SrcReg: + case SrcMem: + case SrcMem16: + case SrcMem32: + return true; + case SrcMemFAddr: + case SrcImmFAddr: + /* Far branches should be handled above. */ + WARN_ON_ONCE(1); + return true; + case SrcNone: + case SrcImm: + case SrcImmByte: + /* + * Note, ImmU16 is used only for the stack adjustment operand on ENTER + * and RET instructions. ENTER isn't a branch and RET FAR is handled + * by the NearBranch check above. RET itself isn't an indirect branch. + */ + case SrcImmU16: + return false; + default: + WARN_ONCE(1, "Unexpected Src operand '%llx' on branch", + flags & SrcMask); + return false; + } +} + static unsigned imm_size(struct x86_emulate_ctxt *ctxt) { unsigned size; @@ -4943,6 +4998,40 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len, int ctxt->execute = opcode.u.execute; + /* + * Reject emulation if KVM might need to emulate shadow stack updates + * and/or indirect branch tracking enforcement, which the emulator + * doesn't support. + */ + if ((is_ibt_instruction(ctxt) || is_shstk_instruction(ctxt)) && + ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET) { + u64 u_cet = 0, s_cet = 0; + + /* + * Check both User and Supervisor on far transfers as inter- + * privilege level transfers are impacted by CET at the target + * privilege level, and that is not known at this time. The + * expectation is that the guest will not require emulation of + * any CET-affected instructions at any privilege level. + */ + if (!(ctxt->d & NearBranch)) + u_cet = s_cet = CET_SHSTK_EN | CET_ENDBR_EN; + else if (ctxt->ops->cpl(ctxt) == 3) + u_cet = CET_SHSTK_EN | CET_ENDBR_EN; + else + s_cet = CET_SHSTK_EN | CET_ENDBR_EN; + + if ((u_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_U_CET, &u_cet)) || + (s_cet && ctxt->ops->get_msr(ctxt, MSR_IA32_S_CET, &s_cet))) + return EMULATION_FAILED; + + if ((u_cet | s_cet) & CET_SHSTK_EN && is_shstk_instruction(ctxt)) + return EMULATION_FAILED; + + if ((u_cet | s_cet) & CET_ENDBR_EN && is_ibt_instruction(ctxt)) + return EMULATION_FAILED; + } + if (unlikely(emulation_type & EMULTYPE_TRAP_UD) && likely(!(ctxt->d & EmulateOnUD))) return EMULATION_FAILED; From 82c0ec02825814e1ef332d635d3441b07c05b1c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:26 -0700 Subject: [PATCH 16/48] KVM: x86: Don't emulate task switches when IBT or SHSTK is enabled Exit to userspace with KVM_INTERNAL_ERROR_EMULATION if the guest triggers task switch emulation with Indirect Branch Tracking or Shadow Stacks enabled, as attempting to do the right thing would require non-trivial effort and complexity, KVM doesn't support emulating CET generally, and it's extremely unlikely that any guest will do task switches while also utilizing CET. Defer taking on the complexity until someone cares enough to put in the time and effort to add support. Per the SDM: If shadow stack is enabled, then the SSP of the task is located at the 4 bytes at offset 104 in the 32-bit TSS and is used by the processor to establish the SSP when a task switch occurs from a task associated with this TSS. Note that the processor does not write the SSP of the task initiating the task switch to the TSS of that task, and instead the SSP of the previous task is pushed onto the shadow stack of the new task. Note, per the SDM's pseudocode on TASK SWITCHING, IBT state for the new privilege level is updated. To keep things simple, check both S_CET and U_CET (again, anyone that wants more precise checking can have the honor of implementing support). Reported-by: Binbin Wu Closes: https://lore.kernel.org/all/819bd98b-2a60-4107-8e13-41f1e4c706b1@linux.intel.com Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-20-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0050509a7de2..31aaff9db083 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12175,6 +12175,25 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; + if (kvm_is_cr4_bit_set(vcpu, X86_CR4_CET)) { + u64 u_cet, s_cet; + + /* + * Check both User and Supervisor on task switches as inter- + * privilege level task switches are impacted by CET at both + * the current privilege level and the new privilege level, and + * that information is not known at this time. The expectation + * is that the guest won't require emulation of task switches + * while using IBT or Shadow Stacks. + */ + if (__kvm_emulate_msr_read(vcpu, MSR_IA32_U_CET, &u_cet) || + __kvm_emulate_msr_read(vcpu, MSR_IA32_S_CET, &s_cet)) + goto unhandled_task_switch; + + if ((u_cet | s_cet) & (CET_ENDBR_EN | CET_SHSTK_EN)) + goto unhandled_task_switch; + } + init_emulate_ctxt(vcpu); ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, @@ -12184,17 +12203,19 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, * Report an error userspace if MMIO is needed, as KVM doesn't support * MMIO during a task switch (or any other complex operation). */ - if (ret || vcpu->mmio_needed) { - vcpu->mmio_needed = false; - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + if (ret || vcpu->mmio_needed) + goto unhandled_task_switch; kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); return 1; + +unhandled_task_switch: + vcpu->mmio_needed = false; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } EXPORT_SYMBOL_GPL(kvm_task_switch); From d4c03f63957c66bc95f5f33052f8b4be804631c3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:27 -0700 Subject: [PATCH 17/48] KVM: x86: Emulate SSP[63:32]!=0 #GP(0) for FAR JMP to 32-bit mode Emulate the Shadow Stack restriction that the current SSP must be a 32-bit value on a FAR JMP from 64-bit mode to compatibility mode. From the SDM's pseudocode for FAR JMP: IF ShadowStackEnabled(CPL) IF (IA32_EFER.LMA and DEST(segment selector).L) = 0 (* If target is legacy or compatibility mode then the SSP must be in low 4GB *) IF (SSP & 0xFFFFFFFF00000000 != 0); THEN #GP(0); FI; FI; FI; Note, only the current CPL needs to be considered, as FAR JMP can't be used for inter-privilege level transfers, and KVM rejects emulation of all other far branch instructions when Shadow Stacks are enabled. To give the emulator access to GUEST_SSP, special case handling MSR_KVM_INTERNAL_GUEST_SSP in emulator_get_msr() to treat the access as a host access (KVM doesn't allow guest accesses to internal "MSRs"). The ->get_msr() API is only used for implicit accesses from the emulator, i.e. is only used with hardcoded MSR indices, and so any access to MSR_KVM_INTERNAL_GUEST_SSP is guaranteed to be from KVM, i.e. not from the guest via RDMSR. Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-21-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/emulate.c | 35 +++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 9 +++++++++ 2 files changed, 44 insertions(+) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 48b067b179e0..59f93f68718a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1554,6 +1554,37 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return linear_write_system(ctxt, addr, desc, sizeof(*desc)); } +static bool emulator_is_ssp_invalid(struct x86_emulate_ctxt *ctxt, u8 cpl) +{ + const u32 MSR_IA32_X_CET = cpl == 3 ? MSR_IA32_U_CET : MSR_IA32_S_CET; + u64 efer = 0, cet = 0, ssp = 0; + + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_CET)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_EFER, &efer)) + return true; + + /* SSP is guaranteed to be valid if the vCPU was already in 32-bit mode. */ + if (!(efer & EFER_LMA)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_IA32_X_CET, &cet)) + return true; + + if (!(cet & CET_SHSTK_EN)) + return false; + + if (ctxt->ops->get_msr(ctxt, MSR_KVM_INTERNAL_GUEST_SSP, &ssp)) + return true; + + /* + * On transfer from 64-bit mode to compatibility mode, SSP[63:32] must + * be 0, i.e. SSP must be a 32-bit value outside of 64-bit mode. + */ + return ssp >> 32; +} + static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg, u8 cpl, enum x86_transfer_type transfer, @@ -1694,6 +1725,10 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (efer & EFER_LMA) goto exception; } + if (!seg_desc.l && emulator_is_ssp_invalid(ctxt, cpl)) { + err_code = 0; + goto exception; + } /* CS(RPL) <- CPL */ selector = (selector & 0xfffc) | cpl; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 31aaff9db083..0a4e58dddf36 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8741,6 +8741,15 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { + /* + * Treat emulator accesses to the current shadow stack pointer as host- + * initiated, as they aren't true MSR accesses (SSP is a "just a reg"), + * and this API is used only for implicit accesses, i.e. not RDMSR, and + * so the index is fully KVM-controlled. + */ + if (unlikely(msr_index == MSR_KVM_INTERNAL_GUEST_SSP)) + return kvm_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); + return __kvm_emulate_msr_read(emul_to_vcpu(ctxt), msr_index, pdata); } From 296599346c671f31854d674db2891ff2e1654b6a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:28 -0700 Subject: [PATCH 18/48] KVM: x86/mmu: WARN on attempt to check permissions for Shadow Stack #PF Add PFERR_SS_MASK, a.k.a. Shadow Stack access, and WARN if KVM attempts to check permissions for a Shadow Stack access as KVM hasn't been taught to understand the magic Writable=0,Dirty=1 combination that is required for Shadow Stack accesses, and likely will never learn. There are no plans to support Shadow Stacks with the Shadow MMU, and the emulator rejects all instructions that affect Shadow Stacks, i.e. it should be impossible for KVM to observe a #PF due to a shadow stack access. Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-22-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5865c9b77b6d..e8d74e949f91 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -267,6 +267,7 @@ enum x86_intercept_stage; #define PFERR_RSVD_MASK BIT(3) #define PFERR_FETCH_MASK BIT(4) #define PFERR_PK_MASK BIT(5) +#define PFERR_SS_MASK BIT(6) #define PFERR_SGX_MASK BIT(15) #define PFERR_GUEST_RMP_MASK BIT_ULL(31) #define PFERR_GUEST_FINAL_MASK BIT_ULL(32) diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index b4b6860ab971..f63074048ec6 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -212,7 +212,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, fault = (mmu->permissions[index] >> pte_access) & 1; - WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK)); + WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK)); if (unlikely(mmu->pkru_mask)) { u32 pkru_bits, offset; From 843af0f2e4617db197034b27a9a658a59271f19f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:29 -0700 Subject: [PATCH 19/48] KVM: x86/mmu: Pretty print PK, SS, and SGX flags in MMU tracepoints Add PK (Protection Keys), SS (Shadow Stacks), and SGX (Software Guard Extensions) to the set of #PF error flags handled via kvm_mmu_trace_pferr_flags. While KVM doesn't expect PK or SS #PFs in particular, pretty print their names instead of the raw hex value saves the user from having to go spelunking in the SDM to figure out what's going on. Reviewed-by: Xiaoyao Li Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-23-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmutrace.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index f35a830ce469..764e3015d021 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -51,6 +51,9 @@ { PFERR_PRESENT_MASK, "P" }, \ { PFERR_WRITE_MASK, "W" }, \ { PFERR_USER_MASK, "U" }, \ + { PFERR_PK_MASK, "PK" }, \ + { PFERR_SS_MASK, "SS" }, \ + { PFERR_SGX_MASK, "SGX" }, \ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } From b3744c59ebc5f2697d62844149c1a1c0e274ead0 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:30 -0700 Subject: [PATCH 20/48] KVM: x86: Allow setting CR4.CET if IBT or SHSTK is supported Drop X86_CR4_CET from CR4_RESERVED_BITS and instead mark CET as reserved if and only if IBT *and* SHSTK are unsupported, i.e. allow CR4.CET to be set if IBT or SHSTK is supported. This creates a virtualization hole if the CPU supports both IBT and SHSTK, but the kernel or vCPU model only supports one of the features. However, it's entirely legal for a CPU to have only one of IBT or SHSTK, i.e. the hole is a flaw in the architecture, not in KVM. More importantly, so long as KVM is careful to initialize and context switch both IBT and SHSTK state (when supported in hardware) if either feature is exposed to the guest, a misbehaving guest can only harm itself. E.g. VMX initializes host CET VMCS fields based solely on hardware capabilities. Signed-off-by: Yang Weijiang Signed-off-by: Mathias Krause Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: split to separate patch, write changelog] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-24-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e8d74e949f91..2f14bdc859fd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -142,7 +142,7 @@ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ - | X86_CR4_LAM_SUP)) + | X86_CR4_LAM_SUP | X86_CR4_CET)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 65cbd454c4f1..f3dc77f006f9 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -680,6 +680,9 @@ static inline bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) __reserved_bits |= X86_CR4_PCIDE; \ if (!__cpu_has(__c, X86_FEATURE_LAM)) \ __reserved_bits |= X86_CR4_LAM_SUP; \ + if (!__cpu_has(__c, X86_FEATURE_SHSTK) && \ + !__cpu_has(__c, X86_FEATURE_IBT)) \ + __reserved_bits |= X86_CR4_CET; \ __reserved_bits; \ }) From 19e6e083f3f9e4ac1794273d72dfb59d19a0fc69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:31 -0700 Subject: [PATCH 21/48] KVM: nVMX: Always forward XSAVES/XRSTORS exits from L2 to L1 Unconditionally forward XSAVES/XRSTORS VM-Exits from L2 to L1, as KVM doesn't utilize the XSS-bitmap (KVM relies on controlling the XSS value in hardware to prevent unauthorized access to XSAVES state). KVM always loads vmcs02 with vmcs12's bitmap, and so any exit _must_ be due to vmcs12's XSS-bitmap. Drop the comment about XSS never being non-zero in anticipation of enabling CET_KERNEL and CET_USER support. Opportunistically WARN if XSAVES is not enabled for L2, as the CPU is supposed to generate #UD before checking the XSS-bitmap. Reviewed-by: Xiaoyao Li Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-25-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2156c9a854f4..846c07380eac 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6570,14 +6570,17 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); case EXIT_REASON_XSETBV: return true; - case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: + case EXIT_REASON_XSAVES: + case EXIT_REASON_XRSTORS: /* - * This should never happen, since it is not possible to - * set XSS to a non-zero value---neither in L1 nor in L2. - * If if it were, XSS would have to be checked against - * the XSS exit bitmap in vmcs12. + * Always forward XSAVES/XRSTORS to L1 as KVM doesn't utilize + * XSS-bitmap, and always loads vmcs02 with vmcs12's XSS-bitmap + * verbatim, i.e. any exit is due to L1's bitmap. WARN if + * XSAVES isn't enabled, as the CPU is supposed to inject #UD + * in that case, before consulting the XSS-bitmap. */ - return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); + WARN_ON_ONCE(!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES)); + return true; case EXIT_REASON_UMWAIT: case EXIT_REASON_TPAUSE: return nested_cpu_has2(vmcs12, From 69cc3e886582891f9c4d5830f18a2664a7f7cf7c Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:32 -0700 Subject: [PATCH 22/48] KVM: x86: Add XSS support for CET_KERNEL and CET_USER Add CET_KERNEL and CET_USER to KVM's set of supported XSS bits when IBT *or* SHSTK is supported. Like CR4.CET, XFEATURE support for IBT and SHSTK are bundle together under the CET umbrella, and thus prone to virtualization holes if KVM or the guest supports only one of IBT or SHSTK, but hardware supports both. However, again like CR4.CET, such virtualization holes are benign from the host's perspective so long as KVM takes care to always honor the "or" logic. Require CET_KERNEL and CET_USER to come as a pair, and refuse to support IBT or SHSTK if one (or both) features is missing, as the (host) kernel expects them to come as a pair, i.e. may get confused and corrupt state if only one of CET_KERNEL or CET_USER is supported. Signed-off-by: Yang Weijiang Signed-off-by: Mathias Krause Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: split to separate patch, write changelog, add XFEATURE_MASK_CET_ALL] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-26-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0a4e58dddf36..8b4c69330a87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -220,13 +220,14 @@ static struct kvm_user_return_msrs __percpu *user_return_msrs; | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE) +#define XFEATURE_MASK_CET_ALL (XFEATURE_MASK_CET_USER | XFEATURE_MASK_CET_KERNEL) /* * Note, KVM supports exposing PT to the guest, but does not support context * switching PT via XSTATE (KVM's PT virtualization relies on perf; swapping * PT via guest XSTATE would clobber perf state), i.e. KVM doesn't support * IA32_XSS[bit 8] (guests can/must use RDMSR/WRMSR to save/restore PT MSRs). */ -#define KVM_SUPPORTED_XSS 0 +#define KVM_SUPPORTED_XSS (XFEATURE_MASK_CET_ALL) bool __read_mostly allow_smaller_maxphyaddr = 0; EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); @@ -10104,6 +10105,16 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) kvm_caps.supported_xss = 0; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + + if ((kvm_caps.supported_xss & XFEATURE_MASK_CET_ALL) != XFEATURE_MASK_CET_ALL) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + kvm_caps.supported_xss &= ~XFEATURE_MASK_CET_ALL; + } + if (kvm_caps.has_tsc_control) { /* * Make sure the user can only configure tsc_khz values that @@ -12772,10 +12783,11 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event) /* * On INIT, only select XSTATE components are zeroed, most components * are unchanged. Currently, the only components that are zeroed and - * supported by KVM are MPX related. + * supported by KVM are MPX and CET related. */ xfeatures_mask = (kvm_caps.supported_xcr0 | kvm_caps.supported_xss) & - (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR | + XFEATURE_MASK_CET_ALL); if (!xfeatures_mask) return; From 1f6f68fcfe43cf26fc0a98fe14e0454cc5c75416 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:33 -0700 Subject: [PATCH 23/48] KVM: x86: Disable support for Shadow Stacks if TDP is disabled Make TDP a hard requirement for Shadow Stacks, as there are no plans to add Shadow Stack support to the Shadow MMU. E.g. KVM hasn't been taught to understand the magic Writable=0,Dirty=1 combination that is required for Shadow Stack accesses, and so enabling Shadow Stacks when using shadow paging will put the guest into an infinite #PF loop (KVM thinks the shadow page tables have a valid mapping, hardware says otherwise). Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-27-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 32fde9e80c28..499c86bd457e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -955,6 +955,14 @@ void kvm_set_cpu_caps(void) if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) kvm_cpu_cap_clear(X86_FEATURE_PKU); + /* + * Shadow Stacks aren't implemented in the Shadow MMU. Shadow Stack + * accesses require "magic" Writable=0,Dirty=1 protection, which KVM + * doesn't know how to emulate or map. + */ + if (!tdp_enabled) + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_init(CPUID_7_EDX, F(AVX512_4VNNIW), F(AVX512_4FMAPS), From f705de12a22c945f55d51baec4ea495aedc5ebd7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 22 Sep 2025 11:47:43 -0700 Subject: [PATCH 24/48] KVM: x86: Initialize allow_smaller_maxphyaddr earlier in setup Initialize allow_smaller_maxphyaddr during hardware setup as soon as KVM knows whether or not TDP will be utilized. To avoid having to teach KVM's emulator all about CET, KVM's upcoming CET virtualization support will be mutually exclusive with allow_smaller_maxphyaddr, i.e. will disable SHSTK and IBT if allow_smaller_maxphyaddr is enabled. In general, allow_smaller_maxphyaddr should be initialized as soon as possible since it's globally visible while its only input is whether or not EPT/NPT is enabled. I.e. there's effectively zero risk of setting allow_smaller_maxphyaddr too early, and substantial risk of setting it too late. Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250922184743.1745778-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 30 +++++++++++++++--------------- arch/x86/kvm/vmx/vmx.c | 16 ++++++++-------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d092102fe1cb..d20e5917b0fe 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5370,6 +5370,21 @@ static __init int svm_hardware_setup(void) get_npt_level(), PG_LEVEL_1G); pr_info("Nested Paging %s\n", str_enabled_disabled(npt_enabled)); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + /* Setup shadow_me_value and shadow_me_mask */ kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); @@ -5449,21 +5464,6 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); - /* - * It seems that on AMD processors PTE's accessed bit is - * being set by the CPU hardware before the NPF vmexit. - * This is not expected behaviour and our tests fail because - * of it. - * A workaround here is to disable support for - * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. - * In this case userspace can know if there is support using - * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle - * it - * If future AMD CPU models change the behaviour described above, - * this variable can be changed accordingly - */ - allow_smaller_maxphyaddr = !npt_enabled; - kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED; return 0; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4ab066a3c22f..4d775983506e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8436,6 +8436,14 @@ __init int vmx_hardware_setup(void) return -EOPNOTSUPP; } + /* + * Shadow paging doesn't have a (further) performance penalty + * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it + * by default + */ + if (!enable_ept) + allow_smaller_maxphyaddr = true; + if (!cpu_has_vmx_ept_ad_bits() || !enable_ept) enable_ept_ad_bits = 0; @@ -8665,14 +8673,6 @@ int __init vmx_init(void) vmx_check_vmcs12_offsets(); - /* - * Shadow paging doesn't have a (further) performance penalty - * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it - * by default - */ - if (!enable_ept) - allow_smaller_maxphyaddr = true; - return 0; err_l1d_flush: From 343acdd158a55dab1cfb0d209cd1d70b0cabb8b2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:34 -0700 Subject: [PATCH 25/48] KVM: x86: Disable support for IBT and SHSTK if allow_smaller_maxphyaddr is true Make IBT and SHSTK virtualization mutually exclusive with "officially" supporting setups with guest.MAXPHYADDR < host.MAXPHYADDR, i.e. if the allow_smaller_maxphyaddr module param is set. Running a guest with a smaller MAXPHYADDR requires intercepting #PF, and can also trigger emulation of arbitrary instructions. Intercepting and reacting to #PFs doesn't play nice with SHSTK, as KVM's MMU hasn't been taught to handle Shadow Stack accesses, and emulating arbitrary instructions doesn't play nice with IBT or SHSTK, as KVM's emulator doesn't handle the various side effects, e.g. doesn't enforce end-branch markers or model Shadow Stack updates. Note, hiding IBT and SHSTK based solely on allow_smaller_maxphyaddr is overkill, as allow_smaller_maxphyaddr is only problematic if the guest is actually configured to have a smaller MAXPHYADDR. However, KVM's ABI doesn't provide a way to express that IBT and SHSTK may break if enabled in conjunction with guest.MAXPHYADDR < host.MAXPHYADDR. I.e. the alternative is to do nothing in KVM and instead update documentation and hope KVM users are thorough readers. Go with the conservative-but-correct approach; worst case scenario, this restriction can be dropped if there's a strong use case for enabling CET on hosts with allow_smaller_maxphyaddr. Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-28-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 499c86bd457e..b0731304bd79 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -982,6 +982,16 @@ void kvm_set_cpu_caps(void) F(FLUSH_L1D), ); + /* + * Disable support for IBT and SHSTK if KVM is configured to emulate + * accesses to reserved GPAs, as KVM's emulator doesn't support IBT or + * SHSTK, nor does KVM handle Shadow Stack #PFs (see above). + */ + if (allow_smaller_maxphyaddr) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + } + if (boot_cpu_has(X86_FEATURE_AMD_IBPB_RET) && boot_cpu_has(X86_FEATURE_AMD_IBPB) && boot_cpu_has(X86_FEATURE_AMD_IBRS)) From e140467bbdafff84f1f346a7c59f404cd9782b82 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:35 -0700 Subject: [PATCH 26/48] KVM: x86: Enable CET virtualization for VMX and advertise to userspace Add support for the LOAD_CET_STATE VM-Enter and VM-Exit controls, the CET XFEATURE bits in XSS, and advertise support for IBT and SHSTK to userspace. Explicitly clear IBT and SHSTK onn SVM, as additional work is needed to enable CET on SVM, e.g. to context switch S_CET and other state. Disable KVM CET feature if unrestricted_guest is unsupported/disabled as KVM does not support emulating CET, as running without Unrestricted Guest can result in KVM emulating large swaths of guest code. While it's highly unlikely any guest will trigger emulation while also utilizing IBT or SHSTK, there's zero reason to allow CET without Unrestricted Guest as that combination should only be possible when explicitly disabling unrestricted_guest for testing purposes. Disable CET if VMX_BASIC[bit56] == 0, i.e. if hardware strictly enforces the presence of an Error Code based on exception vector, as attempting to inject a #CP with an Error Code (#CP architecturally has an Error Code) will fail due to the #CP vector historically not having an Error Code. Clear S_CET and SSP-related VMCS on "reset" to emulate the architectural of CET MSRs and SSP being reset to 0 after RESET, power-up and INIT. Note, KVM already clears guest CET state that is managed via XSTATE in kvm_xstate_reset(). Signed-off-by: Yang Weijiang Signed-off-by: Mathias Krause Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao [sean: move some bits to separate patches, massage changelog] Reviewed-by: Binbin Wu Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20250919223258.1604852-29-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/svm/svm.c | 4 ++++ arch/x86/kvm/vmx/capabilities.h | 5 +++++ arch/x86/kvm/vmx/vmx.c | 30 +++++++++++++++++++++++++++++- arch/x86/kvm/vmx/vmx.h | 6 ++++-- 6 files changed, 45 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index ce10a7e2d3d9..c85c50019523 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -134,6 +134,7 @@ #define VMX_BASIC_DUAL_MONITOR_TREATMENT BIT_ULL(49) #define VMX_BASIC_INOUT BIT_ULL(54) #define VMX_BASIC_TRUE_CTLS BIT_ULL(55) +#define VMX_BASIC_NO_HW_ERROR_CODE_CC BIT_ULL(56) static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b0731304bd79..d290dbc96831 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -946,6 +946,7 @@ void kvm_set_cpu_caps(void) VENDOR_F(WAITPKG), F(SGX_LC), F(BUS_LOCK_DETECT), + X86_64_F(SHSTK), ); /* @@ -980,6 +981,7 @@ void kvm_set_cpu_caps(void) F(AMX_INT8), F(AMX_BF16), F(FLUSH_L1D), + F(IBT), ); /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d20e5917b0fe..ff4925a7bf96 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5222,6 +5222,10 @@ static __init void svm_set_cpu_caps(void) kvm_caps.supported_perf_cap = 0; kvm_caps.supported_xss = 0; + /* KVM doesn't yet support CET virtualization for SVM. */ + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { kvm_cpu_cap_set(X86_FEATURE_SVM); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 59c83888bdc0..02aadb9d730e 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -73,6 +73,11 @@ static inline bool cpu_has_vmx_basic_inout(void) return vmcs_config.basic & VMX_BASIC_INOUT; } +static inline bool cpu_has_vmx_basic_no_hw_errcode_cc(void) +{ + return vmcs_config.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS && diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4d775983506e..dfa6c5cd2d5c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2602,6 +2602,7 @@ static int setup_vmcs_config(struct vmcs_config *vmcs_conf, { VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER }, { VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS }, { VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL }, + { VM_ENTRY_LOAD_CET_STATE, VM_EXIT_LOAD_CET_STATE }, }; memset(vmcs_conf, 0, sizeof(*vmcs_conf)); @@ -4868,6 +4869,14 @@ void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, 0); + vmcs_writel(GUEST_INTR_SSP_TABLE, 0); + } + if (kvm_cpu_cap_has(X86_FEATURE_IBT) || + kvm_cpu_cap_has(X86_FEATURE_SHSTK)) + vmcs_writel(GUEST_S_CET, 0); + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); vpid_sync_context(vmx->vpid); @@ -6335,6 +6344,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(GUEST_S_CET), vmcs_readl(GUEST_SSP), + vmcs_readl(GUEST_INTR_SSP_TABLE)); pr_err("*** Host State ***\n"); pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); @@ -6365,6 +6378,10 @@ void dump_vmcs(struct kvm_vcpu *vcpu) vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL)); if (vmcs_read32(VM_EXIT_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("host autoload", &vmx->msr_autoload.host); + if (vmexit_ctl & VM_EXIT_LOAD_CET_STATE) + pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", + vmcs_readl(HOST_S_CET), vmcs_readl(HOST_SSP), + vmcs_readl(HOST_INTR_SSP_TABLE)); pr_err("*** Control State ***\n"); pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n", @@ -7946,7 +7963,6 @@ static __init void vmx_set_cpu_caps(void) kvm_cpu_cap_set(X86_FEATURE_UMIP); /* CPUID 0xD.1 */ - kvm_caps.supported_xss = 0; if (!cpu_has_vmx_xsaves()) kvm_cpu_cap_clear(X86_FEATURE_XSAVES); @@ -7958,6 +7974,18 @@ static __init void vmx_set_cpu_caps(void) if (cpu_has_vmx_waitpkg()) kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG); + + /* + * Disable CET if unrestricted_guest is unsupported as KVM doesn't + * enforce CET HW behaviors in emulator. On platforms with + * VMX_BASIC[bit56] == 0, inject #CP at VMX entry with error code + * fails, so disable CET in this case too. + */ + if (!cpu_has_load_cet_ctrl() || !enable_unrestricted_guest || + !cpu_has_vmx_basic_no_hw_errcode_cc()) { + kvm_cpu_cap_clear(X86_FEATURE_SHSTK); + kvm_cpu_cap_clear(X86_FEATURE_IBT); + } } static bool vmx_is_io_intercepted(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 23d6e89b96f2..af8224e074ee 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -484,7 +484,8 @@ static inline u8 vmx_get_rvi(void) VM_ENTRY_LOAD_IA32_EFER | \ VM_ENTRY_LOAD_BNDCFGS | \ VM_ENTRY_PT_CONCEAL_PIP | \ - VM_ENTRY_LOAD_IA32_RTIT_CTL) + VM_ENTRY_LOAD_IA32_RTIT_CTL | \ + VM_ENTRY_LOAD_CET_STATE) #define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ @@ -506,7 +507,8 @@ static inline u8 vmx_get_rvi(void) VM_EXIT_LOAD_IA32_EFER | \ VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ - VM_EXIT_CLEAR_IA32_RTIT_CTL) + VM_EXIT_CLEAR_IA32_RTIT_CTL | \ + VM_EXIT_LOAD_CET_STATE) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ From f7336d47be53d0da1e74143b8befea8c0176f3cc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:36 -0700 Subject: [PATCH 27/48] KVM: VMX: Configure nested capabilities after CPU capabilities Swap the order between configuring nested VMX capabilities and base CPU capabilities, so that nested VMX support can be conditioned on core KVM support, e.g. to allow conditioning support for LOAD_CET_STATE on the presence of IBT or SHSTK. Because the sanity checks on nested VMX config performed by vmx_check_processor_compat() run _after_ vmx_hardware_setup(), any use of kvm_cpu_cap_has() when configuring nested VMX support will lead to failures in vmx_check_processor_compat(). While swapping the order of two (or more) configuration flows can lead to a game of whack-a-mole, in this case nested support inarguably should be done after base support. KVM should never condition base support on nested support, because nested support is fully optional, while obviously it's desirable to condition nested support on base support. And there's zero evidence the current ordering was intentional, e.g. commit 66a6950f9995 ("KVM: x86: Introduce kvm_cpu_caps to replace runtime CPUID masking") likely placed the call to kvm_set_cpu_caps() after nested setup because it looked pretty. Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-30-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index dfa6c5cd2d5c..c4b07124689d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8597,6 +8597,13 @@ __init int vmx_hardware_setup(void) setup_default_sgx_lepubkeyhash(); + vmx_set_cpu_caps(); + + /* + * Configure nested capabilities after core CPU capabilities so that + * nested support can be conditional on base support, e.g. so that KVM + * can hide/show features based on kvm_cpu_cap_has(). + */ if (nested) { nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept); @@ -8605,8 +8612,6 @@ __init int vmx_hardware_setup(void) return r; } - vmx_set_cpu_caps(); - r = alloc_kvm_area(); if (r && nested) nested_vmx_hardware_unsetup(); From 033cc166f02920b193f7ec989083bce1a20e9abf Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:37 -0700 Subject: [PATCH 28/48] KVM: nVMX: Virtualize NO_HW_ERROR_CODE_CC for L1 event injection to L2 Per SDM description(Vol.3D, Appendix A.1): "If bit 56 is read as 1, software can use VM entry to deliver a hardware exception with or without an error code, regardless of vector" Modify has_error_code check before inject events to nested guest. Only enforce the check when guest is in real mode, the exception is not hard exception and the platform doesn't enumerate bit56 in VMX_BASIC, in all other case ignore the check to make the logic consistent with SDM. Signed-off-by: Yang Weijiang Reviewed-by: Maxim Levitsky Reviewed-by: Chao Gao Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-31-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 27 ++++++++++++++++++--------- arch/x86/kvm/vmx/nested.h | 5 +++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 846c07380eac..b644f4599f70 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1272,9 +1272,10 @@ static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) { const u64 feature_bits = VMX_BASIC_DUAL_MONITOR_TREATMENT | VMX_BASIC_INOUT | - VMX_BASIC_TRUE_CTLS; + VMX_BASIC_TRUE_CTLS | + VMX_BASIC_NO_HW_ERROR_CODE_CC; - const u64 reserved_bits = GENMASK_ULL(63, 56) | + const u64 reserved_bits = GENMASK_ULL(63, 57) | GENMASK_ULL(47, 45) | BIT_ULL(31); @@ -2949,7 +2950,6 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, u8 vector = intr_info & INTR_INFO_VECTOR_MASK; u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; - bool should_have_error_code; bool urg = nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST); bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; @@ -2966,12 +2966,19 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) return -EINVAL; - /* VM-entry interruption-info field: deliver error code */ - should_have_error_code = - intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && - x86_exception_has_error_code(vector); - if (CC(has_error_code != should_have_error_code)) - return -EINVAL; + /* + * Cannot deliver error code in real mode or if the interrupt + * type is not hardware exception. For other cases, do the + * consistency check only if the vCPU doesn't enumerate + * VMX_BASIC_NO_HW_ERROR_CODE_CC. + */ + if (!prot_mode || intr_type != INTR_TYPE_HARD_EXCEPTION) { + if (CC(has_error_code)) + return -EINVAL; + } else if (!nested_cpu_has_no_hw_errcode_cc(vcpu)) { + if (CC(has_error_code != x86_exception_has_error_code(vector))) + return -EINVAL; + } /* VM-entry exception error code */ if (CC(has_error_code && @@ -7217,6 +7224,8 @@ static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) msrs->basic |= VMX_BASIC_TRUE_CTLS; if (cpu_has_vmx_basic_inout()) msrs->basic |= VMX_BASIC_INOUT; + if (cpu_has_vmx_basic_no_hw_errcode_cc()) + msrs->basic |= VMX_BASIC_NO_HW_ERROR_CODE_CC; } static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs) diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 6eedcfc91070..983484d42ebf 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -309,6 +309,11 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) __kvm_is_valid_cr4(vcpu, val); } +static inline bool nested_cpu_has_no_hw_errcode_cc(struct kvm_vcpu *vcpu) +{ + return to_vmx(vcpu)->nested.msrs.basic & VMX_BASIC_NO_HW_ERROR_CODE_CC; +} + /* No difference in the restrictions on guest and host CR4 in VMX operation. */ #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid From 625884996bff1b25a39834fa4935d695d71ef1b1 Mon Sep 17 00:00:00 2001 From: Yang Weijiang Date: Fri, 19 Sep 2025 15:32:38 -0700 Subject: [PATCH 29/48] KVM: nVMX: Prepare for enabling CET support for nested guest Set up CET MSRs, related VM_ENTRY/EXIT control bits and fixed CR4 setting to enable CET for nested VM. vmcs12 and vmcs02 needs to be synced when L2 exits to L1 or when L1 wants to resume L2, that way correct CET states can be observed by one another. Please note that consistency checks regarding CET state during VM-Entry will be added later to prevent this patch from becoming too large. Advertising the new CET VM_ENTRY/EXIT control bits are also be deferred until after the consistency checks are added. Signed-off-by: Yang Weijiang Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Xin Li (Intel) Tested-by: Xin Li (Intel) Link: https://lore.kernel.org/r/20250919223258.1604852-32-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 77 +++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx/vmcs12.c | 6 +++ arch/x86/kvm/vmx/vmcs12.h | 14 ++++++- arch/x86/kvm/vmx/vmx.c | 2 + arch/x86/kvm/vmx/vmx.h | 3 ++ 5 files changed, 101 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b644f4599f70..11e5d3569933 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -721,6 +721,24 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_MPERF, MSR_TYPE_R); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_U_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_S_CET, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL0_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL1_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL2_SSP, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PL3_SSP, MSR_TYPE_RW); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -2521,6 +2539,32 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 } } +static void vmcs_read_cet_state(struct kvm_vcpu *vcpu, u64 *s_cet, + u64 *ssp, u64 *ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + *s_cet = vmcs_readl(GUEST_S_CET); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + *ssp = vmcs_readl(GUEST_SSP); + *ssp_tbl = vmcs_readl(GUEST_INTR_SSP_TABLE); + } +} + +static void vmcs_write_cet_state(struct kvm_vcpu *vcpu, u64 s_cet, + u64 ssp, u64 ssp_tbl) +{ + if (guest_cpu_cap_has(vcpu, X86_FEATURE_IBT) || + guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) + vmcs_writel(GUEST_S_CET, s_cet); + + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + vmcs_writel(GUEST_SSP, ssp); + vmcs_writel(GUEST_INTR_SSP_TABLE, ssp_tbl); + } +} + static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { struct hv_enlightened_vmcs *hv_evmcs = nested_vmx_evmcs(vmx); @@ -2637,6 +2681,10 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) + vmcs_write_cet_state(&vmx->vcpu, vmcs12->guest_s_cet, + vmcs12->guest_ssp, vmcs12->guest_ssp_tbl); + set_cr4_guest_host_mask(vmx); } @@ -2676,6 +2724,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, kvm_set_dr(vcpu, 7, vcpu->arch.dr7); vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } + + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, + vmx->nested.pre_vmenter_ssp, + vmx->nested.pre_vmenter_ssp_tbl); + if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); @@ -3551,6 +3606,12 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (!vmx->nested.nested_run_pending || + !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) + vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, + &vmx->nested.pre_vmenter_ssp, + &vmx->nested.pre_vmenter_ssp_tbl); + /* * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* * nested early checks are disabled. In the event of a "late" VM-Fail, @@ -4634,6 +4695,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) vmcs12->guest_ia32_efer = vcpu->arch.efer; + + vmcs_read_cet_state(&vmx->vcpu, &vmcs12->guest_s_cet, + &vmcs12->guest_ssp, + &vmcs12->guest_ssp_tbl); } /* @@ -4759,6 +4824,18 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) vmcs_write64(GUEST_BNDCFGS, 0); + /* + * Load CET state from host state if VM_EXIT_LOAD_CET_STATE is set. + * otherwise CET state should be retained across VM-exit, i.e., + * guest values should be propagated from vmcs12 to vmcs01. + */ + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) + vmcs_write_cet_state(vcpu, vmcs12->host_s_cet, vmcs12->host_ssp, + vmcs12->host_ssp_tbl); + else + vmcs_write_cet_state(vcpu, vmcs12->guest_s_cet, vmcs12->guest_ssp, + vmcs12->guest_ssp_tbl); + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); vcpu->arch.pat = vmcs12->host_ia32_pat; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index 106a72c923ca..4233b5ca9461 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -139,6 +139,9 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions), FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp), FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip), + FIELD(GUEST_S_CET, guest_s_cet), + FIELD(GUEST_SSP, guest_ssp), + FIELD(GUEST_INTR_SSP_TABLE, guest_ssp_tbl), FIELD(HOST_CR0, host_cr0), FIELD(HOST_CR3, host_cr3), FIELD(HOST_CR4, host_cr4), @@ -151,5 +154,8 @@ const unsigned short vmcs12_field_offsets[] = { FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip), FIELD(HOST_RSP, host_rsp), FIELD(HOST_RIP, host_rip), + FIELD(HOST_S_CET, host_s_cet), + FIELD(HOST_SSP, host_ssp), + FIELD(HOST_INTR_SSP_TABLE, host_ssp_tbl), }; const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets); diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 56fd150a6f24..4ad6b16525b9 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -117,7 +117,13 @@ struct __packed vmcs12 { natural_width host_ia32_sysenter_eip; natural_width host_rsp; natural_width host_rip; - natural_width paddingl[8]; /* room for future expansion */ + natural_width host_s_cet; + natural_width host_ssp; + natural_width host_ssp_tbl; + natural_width guest_s_cet; + natural_width guest_ssp; + natural_width guest_ssp_tbl; + natural_width paddingl[2]; /* room for future expansion */ u32 pin_based_vm_exec_control; u32 cpu_based_vm_exec_control; u32 exception_bitmap; @@ -294,6 +300,12 @@ static inline void vmx_check_vmcs12_offsets(void) CHECK_OFFSET(host_ia32_sysenter_eip, 656); CHECK_OFFSET(host_rsp, 664); CHECK_OFFSET(host_rip, 672); + CHECK_OFFSET(host_s_cet, 680); + CHECK_OFFSET(host_ssp, 688); + CHECK_OFFSET(host_ssp_tbl, 696); + CHECK_OFFSET(guest_s_cet, 704); + CHECK_OFFSET(guest_ssp, 712); + CHECK_OFFSET(guest_ssp_tbl, 720); CHECK_OFFSET(pin_based_vm_exec_control, 744); CHECK_OFFSET(cpu_based_vm_exec_control, 748); CHECK_OFFSET(exception_bitmap, 752); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c4b07124689d..ad5981ff7097 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7735,6 +7735,8 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP)); cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57)); + cr4_fixed1_update(X86_CR4_CET, ecx, feature_bit(SHSTK)); + cr4_fixed1_update(X86_CR4_CET, edx, feature_bit(IBT)); entry = kvm_find_cpuid_entry_index(vcpu, 0x7, 1); cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM)); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index af8224e074ee..ea93121029f9 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -181,6 +181,9 @@ struct nested_vmx { */ u64 pre_vmenter_debugctl; u64 pre_vmenter_bndcfgs; + u64 pre_vmenter_s_cet; + u64 pre_vmenter_ssp; + u64 pre_vmenter_ssp_tbl; /* to migrate it to L1 if L2 writes to L1's CR8 directly */ int l1_tpr_threshold; From 8060b2bd2dd05a19ad7ec248489d374f2bd2b057 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:39 -0700 Subject: [PATCH 30/48] KVM: nVMX: Add consistency checks for CR0.WP and CR4.CET Add consistency checks for CR4.CET and CR0.WP in guest-state or host-state area in the VMCS12. This ensures that configurations with CR4.CET set and CR0.WP not set result in VM-entry failure, aligning with architectural behavior. Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-33-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11e5d3569933..51c50ce9e011 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3110,6 +3110,9 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_vcpu_is_legal_cr3(vcpu, vmcs12->host_cr3))) return -EINVAL; + if (CC(vmcs12->host_cr4 & X86_CR4_CET && !(vmcs12->host_cr0 & X86_CR0_WP))) + return -EINVAL; + if (CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || CC(is_noncanonical_msr_address(vmcs12->host_ia32_sysenter_eip, vcpu))) return -EINVAL; @@ -3224,6 +3227,9 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) return -EINVAL; + if (CC(vmcs12->guest_cr4 & X86_CR4_CET && !(vmcs12->guest_cr0 & X86_CR0_WP))) + return -EINVAL; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && (CC(!kvm_dr7_valid(vmcs12->guest_dr7)) || CC(!vmx_is_valid_debugctl(vcpu, vmcs12->guest_ia32_debugctl, false)))) From 62f7533a6b3aad9d39c6098dbc3d8e7daeda9399 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:40 -0700 Subject: [PATCH 31/48] KVM: nVMX: Add consistency checks for CET states Introduce consistency checks for CET states during nested VM-entry. A VMCS contains both guest and host CET states, each comprising the IA32_S_CET MSR, SSP, and IA32_INTERRUPT_SSP_TABLE_ADDR MSR. Various checks are applied to CET states during VM-entry as documented in SDM Vol3 Chapter "VM ENTRIES". Implement all these checks during nested VM-entry to emulate the architectural behavior. In summary, there are three kinds of checks on guest/host CET states during VM-entry: A. Checks applied to both guest states and host states: * The IA32_S_CET field must not set any reserved bits; bits 10 (SUPPRESS) and 11 (TRACKER) cannot both be set. * SSP should not have bits 1:0 set. * The IA32_INTERRUPT_SSP_TABLE_ADDR field must be canonical. B. Checks applied to host states only * IA32_S_CET MSR and SSP must be canonical if the CPU enters 64-bit mode after VM-exit. Otherwise, IA32_S_CET and SSP must have their higher 32 bits cleared. C. Checks applied to guest states only: * IA32_S_CET MSR and SSP are not required to be canonical (i.e., 63:N-1 are identical, where N is the CPU's maximum linear-address width). But, bits 63:N of SSP must be identical. Tested-by: Mathias Krause Tested-by: John Allen Tested-by: Rick Edgecombe Signed-off-by: Chao Gao Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-34-seanjc@google.com [sean: have common helper return 0/-EINVAL, not true/false] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 51c50ce9e011..a7bd6fc95057 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3100,6 +3100,16 @@ static bool is_l1_noncanonical_address_on_vmexit(u64 la, struct vmcs12 *vmcs12) return !__is_canonical_address(la, l1_address_bits_on_exit); } +static int nested_vmx_check_cet_state_common(struct kvm_vcpu *vcpu, u64 s_cet, + u64 ssp, u64 ssp_tbl) +{ + if (CC(!kvm_is_valid_u_s_cet(vcpu, s_cet)) || CC(!IS_ALIGNED(ssp, 4)) || + CC(is_noncanonical_msr_address(ssp_tbl, vcpu))) + return -EINVAL; + + return 0; +} + static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { @@ -3169,6 +3179,27 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, return -EINVAL; } + if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_CET_STATE) { + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->host_s_cet, + vmcs12->host_ssp, + vmcs12->host_ssp_tbl)) + return -EINVAL; + + /* + * IA32_S_CET and SSP must be canonical if the host will + * enter 64-bit mode after VM-exit; otherwise, higher + * 32-bits must be all 0s. + */ + if (ia32e) { + if (CC(is_noncanonical_msr_address(vmcs12->host_s_cet, vcpu)) || + CC(is_noncanonical_msr_address(vmcs12->host_ssp, vcpu))) + return -EINVAL; + } else { + if (CC(vmcs12->host_s_cet >> 32) || CC(vmcs12->host_ssp >> 32)) + return -EINVAL; + } + } + return 0; } @@ -3279,6 +3310,23 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) return -EINVAL; + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE) { + if (nested_vmx_check_cet_state_common(vcpu, vmcs12->guest_s_cet, + vmcs12->guest_ssp, + vmcs12->guest_ssp_tbl)) + return -EINVAL; + + /* + * Guest SSP must have 63:N bits identical, rather than + * be canonical (i.e., 63:N-1 bits identical), where N is + * the CPU's maximum linear-address width. Similar to + * is_noncanonical_msr_address(), use the host's + * linear-address width. + */ + if (CC(!__is_canonical_address(vmcs12->guest_ssp, max_host_virt_addr_bits() + 1))) + return -EINVAL; + } + if (nested_check_guest_non_reg_state(vmcs12)) return -EINVAL; From 42ae6448531b5106e9f29794992856e94a52b5cb Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Fri, 19 Sep 2025 15:32:41 -0700 Subject: [PATCH 32/48] KVM: nVMX: Advertise new VM-Entry/Exit control bits for CET state Advertise the LOAD_CET_STATE VM-Entry/Exit control bits in the nested VMX MSRS, as all nested support for CET virtualization, including consistency checks, is in place. Advertise support if and only if KVM supports at least one of IBT or SHSTK. While it's userspace's responsibility to provide a consistent CPU model to the guest, that doesn't mean KVM should set userspace up to fail. Note, the existing {CLEAR,LOAD}_BNDCFGS behavior predates KVM_X86_QUIRK_STUFF_FEATURE_MSRS, i.e. KVM "solved" the inconsistent CPU model problem by overwriting the VMX MSRs provided by userspace. Signed-off-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-35-seanjc@google.com Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/nested.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a7bd6fc95057..76271962cb70 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -7179,13 +7179,17 @@ static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, VM_EXIT_HOST_ADDR_SPACE_SIZE | #endif VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | - VM_EXIT_CLEAR_BNDCFGS; + VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_CET_STATE; msrs->exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->exit_ctls_high &= ~VM_EXIT_LOAD_CET_STATE; + /* We support free control of debug control saving. */ msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; } @@ -7201,11 +7205,16 @@ static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, #ifdef CONFIG_X86_64 VM_ENTRY_IA32E_MODE | #endif - VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; + VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS | + VM_ENTRY_LOAD_CET_STATE; msrs->entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); + if (!kvm_cpu_cap_has(X86_FEATURE_SHSTK) && + !kvm_cpu_cap_has(X86_FEATURE_IBT)) + msrs->entry_ctls_high &= ~VM_ENTRY_LOAD_CET_STATE; + /* We support free control of debug control loading. */ msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; } From 48b2ec0d540c29cebb5119dd2b8e8e7369bc409c Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 19 Sep 2025 15:32:42 -0700 Subject: [PATCH 33/48] KVM: SVM: Emulate reads and writes to shadow stack MSRs Emulate shadow stack MSR access by reading and writing to the corresponding fields in the VMCB. Signed-off-by: John Allen [sean: mark VMCB_CET dirty/clean as appropriate] Link: https://lore.kernel.org/r/20250919223258.1604852-36-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 21 +++++++++++++++++++++ arch/x86/kvm/svm/svm.h | 3 ++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ff4925a7bf96..b977ea4a46e7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2767,6 +2767,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (guest_cpuid_is_intel_compatible(vcpu)) msr_info->data |= (u64)svm->sysenter_esp_hi << 32; break; + case MSR_IA32_S_CET: + msr_info->data = svm->vmcb->save.s_cet; + break; + case MSR_IA32_INT_SSP_TAB: + msr_info->data = svm->vmcb->save.isst_addr; + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + msr_info->data = svm->vmcb->save.ssp; + break; case MSR_TSC_AUX: msr_info->data = svm->tsc_aux; break; @@ -2999,6 +3008,18 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->vmcb01.ptr->save.sysenter_esp = (u32)data; svm->sysenter_esp_hi = guest_cpuid_is_intel_compatible(vcpu) ? (data >> 32) : 0; break; + case MSR_IA32_S_CET: + svm->vmcb->save.s_cet = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_IA32_INT_SSP_TAB: + svm->vmcb->save.isst_addr = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; + case MSR_KVM_INTERNAL_GUEST_SSP: + svm->vmcb->save.ssp = data; + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_CET); + break; case MSR_TSC_AUX: /* * TSC_AUX is always virtualized for SEV-ES guests when the diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 10d5cbc259e1..37de6a4a2665 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -74,6 +74,7 @@ enum { * AVIC PHYSICAL_TABLE pointer, * AVIC LOGICAL_TABLE pointer */ + VMCB_CET, /* S_CET, SSP, ISST_ADDR */ VMCB_SW = 31, /* Reserved for hypervisor/software use */ }; @@ -82,7 +83,7 @@ enum { (1U << VMCB_ASID) | (1U << VMCB_INTR) | \ (1U << VMCB_NPT) | (1U << VMCB_CR) | (1U << VMCB_DR) | \ (1U << VMCB_DT) | (1U << VMCB_SEG) | (1U << VMCB_CR2) | \ - (1U << VMCB_LBR) | (1U << VMCB_AVIC) | \ + (1U << VMCB_LBR) | (1U << VMCB_AVIC) | (1U << VMCB_CET) | \ (1U << VMCB_SW)) /* TPR and CR2 are always written before VMRUN */ From c5ba49458513bd1ecd669d4ec7124e788b19347c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:43 -0700 Subject: [PATCH 34/48] KVM: nSVM: Save/load CET Shadow Stack state to/from vmcb12/vmcb02 Transfer the three CET Shadow Stack VMCB fields (S_CET, ISST_ADDR, and SSP) on VMRUN, #VMEXIT, and loading nested state (saving nested state simply copies the entire save area). SVM doesn't provide a way to disallow L1 from enabling Shadow Stacks for L2, i.e. KVM *must* provide nested support before advertising SHSTK to userspace. Link: https://lore.kernel.org/r/20250919223258.1604852-37-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/nested.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 826473f2d7c7..a6443feab252 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -636,6 +636,14 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 vmcb_mark_dirty(vmcb02, VMCB_DT); } + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && + (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) { + vmcb02->save.s_cet = vmcb12->save.s_cet; + vmcb02->save.isst_addr = vmcb12->save.isst_addr; + vmcb02->save.ssp = vmcb12->save.ssp; + vmcb_mark_dirty(vmcb02, VMCB_CET); + } + kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); svm_set_efer(vcpu, svm->nested.save.efer); @@ -1044,6 +1052,12 @@ void svm_copy_vmrun_state(struct vmcb_save_area *to_save, to_save->rsp = from_save->rsp; to_save->rip = from_save->rip; to_save->cpl = 0; + + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + to_save->s_cet = from_save->s_cet; + to_save->isst_addr = from_save->isst_addr; + to_save->ssp = from_save->ssp; + } } void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -1111,6 +1125,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->save.dr6 = svm->vcpu.arch.dr6; vmcb12->save.cpl = vmcb02->save.cpl; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) { + vmcb12->save.s_cet = vmcb02->save.s_cet; + vmcb12->save.isst_addr = vmcb02->save.isst_addr; + vmcb12->save.ssp = vmcb02->save.ssp; + } + vmcb12->control.int_state = vmcb02->control.int_state; vmcb12->control.exit_code = vmcb02->control.exit_code; vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi; From c7586aa3bed4969cc7345940e5a331efd614f41d Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 19 Sep 2025 15:32:44 -0700 Subject: [PATCH 35/48] KVM: SVM: Update dump_vmcb with shadow stack save area additions Add shadow stack VMCB fields to dump_vmcb. PL0_SSP, PL1_SSP, PL2_SSP, PL3_SSP, and U_CET are part of the SEV-ES save area and are encrypted, but can be decrypted and dumped if the guest policy allows debugging. Reviewed-by: Maxim Levitsky Signed-off-by: John Allen Link: https://lore.kernel.org/r/20250919223258.1604852-38-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b977ea4a46e7..bea8bfea74f9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3410,6 +3410,10 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "rip:", save->rip, "rflags:", save->rflags); pr_err("%-15s %016llx %-13s %016llx\n", "rsp:", save->rsp, "rax:", save->rax); + pr_err("%-15s %016llx %-13s %016llx\n", + "s_cet:", save->s_cet, "ssp:", save->ssp); + pr_err("%-15s %016llx\n", + "isst_addr:", save->isst_addr); pr_err("%-15s %016llx %-13s %016llx\n", "star:", save01->star, "lstar:", save01->lstar); pr_err("%-15s %016llx %-13s %016llx\n", @@ -3434,6 +3438,13 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-15s %016llx\n", "sev_features", vmsa->sev_features); + pr_err("%-15s %016llx %-13s %016llx\n", + "pl0_ssp:", vmsa->pl0_ssp, "pl1_ssp:", vmsa->pl1_ssp); + pr_err("%-15s %016llx %-13s %016llx\n", + "pl2_ssp:", vmsa->pl2_ssp, "pl3_ssp:", vmsa->pl3_ssp); + pr_err("%-15s %016llx\n", + "u_cet:", vmsa->u_cet); + pr_err("%-15s %016llx %-13s %016llx\n", "rax:", vmsa->rax, "rbx:", vmsa->rbx); pr_err("%-15s %016llx %-13s %016llx\n", From 38c46bdbf99812a07a8a7ef48718f0a03acd8a8a Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 19 Sep 2025 15:32:45 -0700 Subject: [PATCH 36/48] KVM: SVM: Pass through shadow stack MSRs as appropriate Pass through XSAVE managed CET MSRs on SVM when KVM supports shadow stack. These cannot be intercepted without also intercepting XSAVE which would likely cause unacceptable performance overhead. MSR_IA32_INT_SSP_TAB is not managed by XSAVE, so it is intercepted. Reviewed-by: Chao Gao Signed-off-by: John Allen Link: https://lore.kernel.org/r/20250919223258.1604852-39-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bea8bfea74f9..1c1792f6a65c 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -844,6 +844,17 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) svm_disable_intercept_for_msr(vcpu, MSR_IA32_MPERF, MSR_TYPE_R); } + if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) { + bool shstk_enabled = guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + + svm_set_intercept_for_msr(vcpu, MSR_IA32_U_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL0_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL1_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL2_SSP, MSR_TYPE_RW, !shstk_enabled); + svm_set_intercept_for_msr(vcpu, MSR_IA32_PL3_SSP, MSR_TYPE_RW, !shstk_enabled); + } + if (sev_es_guest(vcpu->kvm)) sev_es_recalc_msr_intercepts(vcpu); From b5fa221f7b08ca145b0357b77b90dcc998278967 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:46 -0700 Subject: [PATCH 37/48] KVM: SEV: Synchronize MSR_IA32_XSS from the GHCB when it's valid Synchronize XSS from the GHCB to KVM's internal tracking if the guest marks XSS as valid on a #VMGEXIT. Like XCR0, KVM needs an up-to-date copy of XSS in order to compute the required XSTATE size when emulating CPUID.0xD.0x1 for the guest. Treat the incoming XSS change as an emulated write, i.e. validatate the guest-provided value, to avoid letting the guest load garbage into KVM's tracking. Simply ignore bad values, as either the guest managed to get an unsupported value into hardware, or the guest is misbehaving and providing pure garbage. In either case, KVM can't fix the broken guest. Explicitly allow access to XSS at all times, as KVM needs to ensure its copy of XSS stays up-to-date. E.g. KVM supports migration of SEV-ES guests and so needs to allow the host to save/restore XSS, otherwise a guest that *knows* its XSS hasn't change could get stale/bad CPUID emulation if the guest doesn't provide XSS in the GHCB on every exit. This creates a hypothetical problem where a guest could request emulation of RDMSR or WRMSR on XSS, but arguably that's not even a problem, e.g. it would be entirely reasonable for a guest to request "emulation" as a way to inform the hypervisor that its XSS value has been modified. Note, emulating the change as an MSR write also takes care of side effects, e.g. marking dynamic CPUID bits as dirty. Suggested-by: John Allen base-commit: 14298d819d5a6b7180a4089e7d2121ca3551dc6c Link: https://lore.kernel.org/r/20250919223258.1604852-40-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 3 +++ arch/x86/kvm/svm/svm.c | 4 ++-- arch/x86/kvm/svm/svm.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d2ffacd43234..7971e2a4ddd5 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3310,6 +3310,9 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) if (kvm_ghcb_xcr0_is_valid(svm)) __kvm_set_xcr(vcpu, 0, kvm_ghcb_get_xcr0(svm)); + if (kvm_ghcb_xss_is_valid(svm)) + __kvm_emulate_msr_write(vcpu, MSR_IA32_XSS, kvm_ghcb_get_xss(svm)); + /* Copy the GHCB exit information into the VMCB fields */ exit_code = kvm_ghcb_get_sw_exit_code(svm); control->exit_code = lower_32_bits(exit_code); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 1c1792f6a65c..8ca6ef4b35c1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2721,8 +2721,8 @@ static int svm_get_feature_msr(u32 msr, u64 *data) static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return sev_es_guest(vcpu->kvm) && - vcpu->arch.guest_state_protected && + return sev_es_guest(vcpu->kvm) && vcpu->arch.guest_state_protected && + msr_info->index != MSR_IA32_XSS && !msr_write_intercepted(vcpu, msr_info->index); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 37de6a4a2665..99ce9b95f782 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -941,5 +941,6 @@ DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_1) DEFINE_KVM_GHCB_ACCESSORS(sw_exit_info_2) DEFINE_KVM_GHCB_ACCESSORS(sw_scratch) DEFINE_KVM_GHCB_ACCESSORS(xcr0) +DEFINE_KVM_GHCB_ACCESSORS(xss) #endif From 8db428fd5229ba509d515af10d7f33d7d4a54bfe Mon Sep 17 00:00:00 2001 From: John Allen Date: Fri, 19 Sep 2025 15:32:47 -0700 Subject: [PATCH 38/48] KVM: SVM: Enable shadow stack virtualization for SVM Remove the explicit clearing of shadow stack CPU capabilities. Reviewed-by: Chao Gao Signed-off-by: John Allen Link: https://lore.kernel.org/r/20250919223258.1604852-41-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 8ca6ef4b35c1..6068c99bca95 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5263,10 +5263,7 @@ static __init void svm_set_cpu_caps(void) kvm_set_cpu_caps(); kvm_caps.supported_perf_cap = 0; - kvm_caps.supported_xss = 0; - /* KVM doesn't yet support CET virtualization for SVM. */ - kvm_cpu_cap_clear(X86_FEATURE_SHSTK); kvm_cpu_cap_clear(X86_FEATURE_IBT); /* CPUID 0x80000001 and 0x8000000A (SVM features) */ From d37cc4819a489bfe32008e50e3a76eb967d95d42 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:48 -0700 Subject: [PATCH 39/48] KVM: x86: Add human friendly formatting for #XM, and #VE Add XM_VECTOR and VE_VECTOR pretty-printing for trace_kvm_inj_exception(). Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-42-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/trace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 57d79fd31df0..06da19b370c5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -461,8 +461,8 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ - EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ - EXS(MF), EXS(AC), EXS(MC) + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \ + EXS(AC), EXS(MC), EXS(XM), EXS(VE) /* * Tracepoint for kvm interrupt injection: From f2f5519aa4e3ec4e42b009338f773bebb0bfd8a3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:49 -0700 Subject: [PATCH 40/48] KVM: x86: Define Control Protection Exception (#CP) vector Add a CP_VECTOR definition for CET's Control Protection Exception (#CP), along with human friendly formatting for trace_kvm_inj_exception(). Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-43-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/uapi/asm/kvm.h | 1 + arch/x86/kvm/trace.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 467116186e71..73e0e88a0a54 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -35,6 +35,7 @@ #define MC_VECTOR 18 #define XM_VECTOR 19 #define VE_VECTOR 20 +#define CP_VECTOR 21 /* Select x86 specific features in */ #define __KVM_HAVE_PIT diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 06da19b370c5..322913dda626 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -462,7 +462,7 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \ - EXS(AC), EXS(MC), EXS(XM), EXS(VE) + EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP) /* * Tracepoint for kvm interrupt injection: From fddd07626baa419c259ad5f3537a57188b5bb415 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:50 -0700 Subject: [PATCH 41/48] KVM: x86: Define AMD's #HV, #VC, and #SX exception vectors Add {HV,CP,SX}_VECTOR definitions for AMD's Hypervisor Injection Exception, VMM Communication Exception, and SVM Security Exception vectors, along with human friendly formatting for trace_kvm_inj_exception(). Note, KVM is all but guaranteed to never observe or inject #SX, and #HV is also unlikely to go unused. Add the architectural collateral mostly for completeness, and on the off chance that hardware goes off the rails. Link: https://lore.kernel.org/r/20250919223258.1604852-44-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/uapi/asm/kvm.h | 4 ++++ arch/x86/kvm/trace.h | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 73e0e88a0a54..d420c9c066d4 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -37,6 +37,10 @@ #define VE_VECTOR 20 #define CP_VECTOR 21 +#define HV_VECTOR 28 +#define VC_VECTOR 29 +#define SX_VECTOR 30 + /* Select x86 specific features in */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 322913dda626..e79bc9cb7162 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -462,7 +462,8 @@ TRACE_EVENT(kvm_inj_virq, #define kvm_trace_sym_exc \ EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), EXS(MF), \ - EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP) + EXS(AC), EXS(MC), EXS(XM), EXS(VE), EXS(CP), \ + EXS(HV), EXS(VC), EXS(SX) /* * Tracepoint for kvm interrupt injection: From 9c38ddb3df94acdc86485c7073610f01655309f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:52 -0700 Subject: [PATCH 42/48] KVM: selftests: Add an MSR test to exercise guest/host and read/write Add a selftest to verify reads and writes to various MSRs, from both the guest and host, and expect success/failure based on whether or not the vCPU supports the MSR according to supported CPUID. Note, this test is extremely similar to KVM-Unit-Test's "msr" test, but provides more coverage with respect to host accesses, and will be extended to provide addition testing of CPUID-based features, save/restore lists, and KVM_{G,S}ET_ONE_REG, all which are extremely difficult to validate in KUT. If kvm.ignore_msrs=true, skip the unsupported and reserved testcases as KVM's ABI is a mess; what exactly is supposed to be ignored, and when, varies wildly. Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-46-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile.kvm | 1 + .../selftests/kvm/include/x86/processor.h | 5 + tools/testing/selftests/kvm/x86/msrs_test.c | 322 ++++++++++++++++++ 3 files changed, 328 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86/msrs_test.c diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index f6fe7a07a0a2..e6e410b938b9 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -87,6 +87,7 @@ TEST_GEN_PROGS_x86 += x86/kvm_clock_test TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test +TEST_GEN_PROGS_x86 += x86/msrs_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test TEST_GEN_PROGS_x86 += x86/platform_info_test diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index fbe875eafca5..51cd84b9ca66 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1362,6 +1362,11 @@ static inline bool kvm_is_unrestricted_guest_enabled(void) return get_kvm_intel_param_bool("unrestricted_guest"); } +static inline bool kvm_is_ignore_msrs(void) +{ + return get_kvm_param_bool("ignore_msrs"); +} + uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level); uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr); diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c new file mode 100644 index 000000000000..345a39030a0a --- /dev/null +++ b/tools/testing/selftests/kvm/x86/msrs_test.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include + +#include "kvm_util.h" +#include "processor.h" + +/* Use HYPERVISOR for MSRs that are emulated unconditionally (as is HYPERVISOR). */ +#define X86_FEATURE_NONE X86_FEATURE_HYPERVISOR + +struct kvm_msr { + const struct kvm_x86_cpu_feature feature; + const struct kvm_x86_cpu_feature feature2; + const char *name; + const u64 reset_val; + const u64 write_val; + const u64 rsvd_val; + const u32 index; +}; + +#define ____MSR_TEST(msr, str, val, rsvd, reset, feat, f2) \ +{ \ + .index = msr, \ + .name = str, \ + .write_val = val, \ + .rsvd_val = rsvd, \ + .reset_val = reset, \ + .feature = X86_FEATURE_ ##feat, \ + .feature2 = X86_FEATURE_ ##f2, \ +} + +#define __MSR_TEST(msr, str, val, rsvd, reset, feat) \ + ____MSR_TEST(msr, str, val, rsvd, reset, feat, feat) + +#define MSR_TEST_NON_ZERO(msr, val, rsvd, reset, feat) \ + __MSR_TEST(msr, #msr, val, rsvd, reset, feat) + +#define MSR_TEST(msr, val, rsvd, feat) \ + __MSR_TEST(msr, #msr, val, rsvd, 0, feat) + +#define MSR_TEST2(msr, val, rsvd, feat, f2) \ + ____MSR_TEST(msr, #msr, val, rsvd, 0, feat, f2) + +/* + * Note, use a page aligned value for the canonical value so that the value + * is compatible with MSRs that use bits 11:0 for things other than addresses. + */ +static const u64 canonical_val = 0x123456789000ull; + +/* + * Arbitrary value with bits set in every byte, but not all bits set. This is + * also a non-canonical value, but that's coincidental (any 64-bit value with + * an alternating 0s/1s pattern will be non-canonical). + */ +static const u64 u64_val = 0xaaaa5555aaaa5555ull; + +#define MSR_TEST_CANONICAL(msr, feat) \ + __MSR_TEST(msr, #msr, canonical_val, NONCANONICAL, 0, feat) + +/* + * The main struct must be scoped to a function due to the use of structures to + * define features. For the global structure, allocate enough space for the + * foreseeable future without getting too ridiculous, to minimize maintenance + * costs (bumping the array size every time an MSR is added is really annoying). + */ +static struct kvm_msr msrs[128]; +static int idx; + +static bool ignore_unsupported_msrs; + +static u64 fixup_rdmsr_val(u32 msr, u64 want) +{ + /* + * AMD CPUs drop bits 63:32 on some MSRs that Intel CPUs support. KVM + * is supposed to emulate that behavior based on guest vendor model + * (which is the same as the host vendor model for this test). + */ + if (!host_cpu_is_amd) + return want; + + switch (msr) { + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + case MSR_TSC_AUX: + return want & GENMASK_ULL(31, 0); + default: + return want; + } +} + +static void __rdmsr(u32 msr, u64 want) +{ + u64 val; + u8 vec; + + vec = rdmsr_safe(msr, &val); + __GUEST_ASSERT(!vec, "Unexpected %s on RDMSR(0x%x)", ex_str(vec), msr); + + __GUEST_ASSERT(val == want, "Wanted 0x%lx from RDMSR(0x%x), got 0x%lx", + want, msr, val); +} + +static void __wrmsr(u32 msr, u64 val) +{ + u8 vec; + + vec = wrmsr_safe(msr, val); + __GUEST_ASSERT(!vec, "Unexpected %s on WRMSR(0x%x, 0x%lx)", + ex_str(vec), msr, val); + __rdmsr(msr, fixup_rdmsr_val(msr, val)); +} + +static void guest_test_supported_msr(const struct kvm_msr *msr) +{ + __rdmsr(msr->index, msr->reset_val); + __wrmsr(msr->index, msr->write_val); + GUEST_SYNC(fixup_rdmsr_val(msr->index, msr->write_val)); + + __rdmsr(msr->index, msr->reset_val); +} + +static void guest_test_unsupported_msr(const struct kvm_msr *msr) +{ + u64 val; + u8 vec; + + /* + * KVM's ABI with respect to ignore_msrs is a mess and largely beyond + * repair, just skip the unsupported MSR tests. + */ + if (ignore_unsupported_msrs) + goto skip_wrmsr_gp; + + if (this_cpu_has(msr->feature2)) + goto skip_wrmsr_gp; + + vec = rdmsr_safe(msr->index, &val); + __GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on RDMSR(0x%x), got %s", + msr->index, ex_str(vec)); + + vec = wrmsr_safe(msr->index, msr->write_val); + __GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s", + msr->index, msr->write_val, ex_str(vec)); + +skip_wrmsr_gp: + GUEST_SYNC(0); +} + +void guest_test_reserved_val(const struct kvm_msr *msr) +{ + /* Skip reserved value checks as well, ignore_msrs is trully a mess. */ + if (ignore_unsupported_msrs) + return; + + /* + * If the CPU will truncate the written value (e.g. SYSENTER on AMD), + * expect success and a truncated value, not #GP. + */ + if (!this_cpu_has(msr->feature) || + msr->rsvd_val == fixup_rdmsr_val(msr->index, msr->rsvd_val)) { + u8 vec = wrmsr_safe(msr->index, msr->rsvd_val); + + __GUEST_ASSERT(vec == GP_VECTOR, + "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s", + msr->index, msr->rsvd_val, ex_str(vec)); + } else { + __wrmsr(msr->index, msr->rsvd_val); + __wrmsr(msr->index, msr->reset_val); + } +} + +static void guest_main(void) +{ + for (;;) { + const struct kvm_msr *msr = &msrs[READ_ONCE(idx)]; + + if (this_cpu_has(msr->feature)) + guest_test_supported_msr(msr); + else + guest_test_unsupported_msr(msr); + + if (msr->rsvd_val) + guest_test_reserved_val(msr); + + GUEST_SYNC(msr->reset_val); + } +} + +static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val) +{ + u64 reset_val = msrs[idx].reset_val; + u32 msr = msrs[idx].index; + u64 val; + + if (!kvm_cpu_has(msrs[idx].feature)) + return; + + val = vcpu_get_msr(vcpu, msr); + TEST_ASSERT(val == guest_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx", + guest_val, msr, val); + + vcpu_set_msr(vcpu, msr, reset_val); + + val = vcpu_get_msr(vcpu, msr); + TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx", + reset_val, msr, val); +} + +static void do_vcpu_run(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + for (;;) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + host_test_msr(vcpu, uc.args[1]); + return; + case UCALL_PRINTF: + pr_info("%s", uc.buffer); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_DONE: + TEST_FAIL("Unexpected UCALL_DONE"); + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } +} + +static void vcpus_run(struct kvm_vcpu **vcpus, const int NR_VCPUS) +{ + int i; + + for (i = 0; i < NR_VCPUS; i++) + do_vcpu_run(vcpus[i]); +} + +#define MISC_ENABLES_RESET_VAL (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_BTS_UNAVAIL) + +static void test_msrs(void) +{ + const struct kvm_msr __msrs[] = { + MSR_TEST_NON_ZERO(MSR_IA32_MISC_ENABLE, + MISC_ENABLES_RESET_VAL | MSR_IA32_MISC_ENABLE_FAST_STRING, + MSR_IA32_MISC_ENABLE_FAST_STRING, MISC_ENABLES_RESET_VAL, NONE), + MSR_TEST_NON_ZERO(MSR_IA32_CR_PAT, 0x07070707, 0, 0x7040600070406, NONE), + + /* + * TSC_AUX is supported if RDTSCP *or* RDPID is supported. Add + * entries for each features so that TSC_AUX doesn't exists for + * the "unsupported" vCPU, and obviously to test both cases. + */ + MSR_TEST2(MSR_TSC_AUX, 0x12345678, u64_val, RDTSCP, RDPID), + MSR_TEST2(MSR_TSC_AUX, 0x12345678, u64_val, RDPID, RDTSCP), + + MSR_TEST(MSR_IA32_SYSENTER_CS, 0x1234, 0, NONE), + /* + * SYSENTER_{ESP,EIP} are technically non-canonical on Intel, + * but KVM doesn't emulate that behavior on emulated writes, + * i.e. this test will observe different behavior if the MSR + * writes are handed by hardware vs. KVM. KVM's behavior is + * intended (though far from ideal), so don't bother testing + * non-canonical values. + */ + MSR_TEST(MSR_IA32_SYSENTER_ESP, canonical_val, 0, NONE), + MSR_TEST(MSR_IA32_SYSENTER_EIP, canonical_val, 0, NONE), + + MSR_TEST_CANONICAL(MSR_FS_BASE, LM), + MSR_TEST_CANONICAL(MSR_GS_BASE, LM), + MSR_TEST_CANONICAL(MSR_KERNEL_GS_BASE, LM), + MSR_TEST_CANONICAL(MSR_LSTAR, LM), + MSR_TEST_CANONICAL(MSR_CSTAR, LM), + MSR_TEST(MSR_SYSCALL_MASK, 0xffffffff, 0, LM), + + MSR_TEST_CANONICAL(MSR_IA32_PL0_SSP, SHSTK), + MSR_TEST(MSR_IA32_PL0_SSP, canonical_val, canonical_val | 1, SHSTK), + MSR_TEST_CANONICAL(MSR_IA32_PL1_SSP, SHSTK), + MSR_TEST(MSR_IA32_PL1_SSP, canonical_val, canonical_val | 1, SHSTK), + MSR_TEST_CANONICAL(MSR_IA32_PL2_SSP, SHSTK), + MSR_TEST(MSR_IA32_PL2_SSP, canonical_val, canonical_val | 1, SHSTK), + MSR_TEST_CANONICAL(MSR_IA32_PL3_SSP, SHSTK), + MSR_TEST(MSR_IA32_PL3_SSP, canonical_val, canonical_val | 1, SHSTK), + }; + + /* + * Create two vCPUs, but run them on the same task, to validate KVM's + * context switching of MSR state. Don't pin the task to a pCPU to + * also validate KVM's handling of cross-pCPU migration. + */ + const int NR_VCPUS = 2; + struct kvm_vcpu *vcpus[NR_VCPUS]; + struct kvm_vm *vm; + + kvm_static_assert(sizeof(__msrs) <= sizeof(msrs)); + kvm_static_assert(ARRAY_SIZE(__msrs) <= ARRAY_SIZE(msrs)); + memcpy(msrs, __msrs, sizeof(__msrs)); + + ignore_unsupported_msrs = kvm_is_ignore_msrs(); + + vm = vm_create_with_vcpus(NR_VCPUS, guest_main, vcpus); + + sync_global_to_guest(vm, msrs); + sync_global_to_guest(vm, ignore_unsupported_msrs); + + for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { + sync_global_to_guest(vm, idx); + + vcpus_run(vcpus, NR_VCPUS); + vcpus_run(vcpus, NR_VCPUS); + } + + kvm_vm_free(vm); +} + +int main(void) +{ + test_msrs(); +} From 27c41353064f9fdec9276e6b8e618b01110ef282 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:53 -0700 Subject: [PATCH 43/48] KVM: selftests: Add support for MSR_IA32_{S,U}_CET to MSRs test Extend the MSRs test to support {S,U}_CET, which are a bit of a pain to handled due to the MSRs existing if IBT *or* SHSTK is supported. To deal with Intel's wonderful decision to bundle IBT and SHSTK under CET, track the second feature, but skip only RDMSR #GP tests to avoid false failures when running on a CPU with only one of IBT or SHSTK (the WRMSR #GP tests are still valid since the enable bits are per-feature). Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-47-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/msrs_test.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c index 345a39030a0a..1efaebb986c4 100644 --- a/tools/testing/selftests/kvm/x86/msrs_test.c +++ b/tools/testing/selftests/kvm/x86/msrs_test.c @@ -132,13 +132,26 @@ static void guest_test_unsupported_msr(const struct kvm_msr *msr) if (ignore_unsupported_msrs) goto skip_wrmsr_gp; - if (this_cpu_has(msr->feature2)) - goto skip_wrmsr_gp; + /* + * {S,U}_CET exist if IBT or SHSTK is supported, but with bits that are + * writable only if their associated feature is supported. Skip the + * RDMSR #GP test if the secondary feature is supported, but perform + * the WRMSR #GP test as the to-be-written value is tied to the primary + * feature. For all other MSRs, simply do nothing. + */ + if (this_cpu_has(msr->feature2)) { + if (msr->index != MSR_IA32_U_CET && + msr->index != MSR_IA32_S_CET) + goto skip_wrmsr_gp; + + goto skip_rdmsr_gp; + } vec = rdmsr_safe(msr->index, &val); __GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on RDMSR(0x%x), got %s", msr->index, ex_str(vec)); +skip_rdmsr_gp: vec = wrmsr_safe(msr->index, msr->write_val); __GUEST_ASSERT(vec == GP_VECTOR, "Wanted #GP on WRMSR(0x%x, 0x%lx), got %s", msr->index, msr->write_val, ex_str(vec)); @@ -276,6 +289,10 @@ static void test_msrs(void) MSR_TEST_CANONICAL(MSR_CSTAR, LM), MSR_TEST(MSR_SYSCALL_MASK, 0xffffffff, 0, LM), + MSR_TEST2(MSR_IA32_S_CET, CET_SHSTK_EN, CET_RESERVED, SHSTK, IBT), + MSR_TEST2(MSR_IA32_S_CET, CET_ENDBR_EN, CET_RESERVED, IBT, SHSTK), + MSR_TEST2(MSR_IA32_U_CET, CET_SHSTK_EN, CET_RESERVED, SHSTK, IBT), + MSR_TEST2(MSR_IA32_U_CET, CET_ENDBR_EN, CET_RESERVED, IBT, SHSTK), MSR_TEST_CANONICAL(MSR_IA32_PL0_SSP, SHSTK), MSR_TEST(MSR_IA32_PL0_SSP, canonical_val, canonical_val | 1, SHSTK), MSR_TEST_CANONICAL(MSR_IA32_PL1_SSP, SHSTK), From a8b9cca99cf454799ef799f8af9bdb55389cfd94 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:54 -0700 Subject: [PATCH 44/48] KVM: selftests: Extend MSRs test to validate vCPUs without supported features Add a third vCPUs to the MSRs test that runs with all features disabled in the vCPU's CPUID model, to verify that KVM does the right thing with respect to emulating accesses to MSRs that shouldn't exist. Use the same VM to verify that KVM is honoring the vCPU model, e.g. isn't looking at per-VM state when emulating MSR accesses. Link: https://lore.kernel.org/r/20250919223258.1604852-48-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/msrs_test.c | 28 ++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c index 1efaebb986c4..ec94c70b3f06 100644 --- a/tools/testing/selftests/kvm/x86/msrs_test.c +++ b/tools/testing/selftests/kvm/x86/msrs_test.c @@ -303,12 +303,17 @@ static void test_msrs(void) MSR_TEST(MSR_IA32_PL3_SSP, canonical_val, canonical_val | 1, SHSTK), }; + const struct kvm_x86_cpu_feature feat_none = X86_FEATURE_NONE; + const struct kvm_x86_cpu_feature feat_lm = X86_FEATURE_LM; + /* - * Create two vCPUs, but run them on the same task, to validate KVM's + * Create three vCPUs, but run them on the same task, to validate KVM's * context switching of MSR state. Don't pin the task to a pCPU to - * also validate KVM's handling of cross-pCPU migration. + * also validate KVM's handling of cross-pCPU migration. Use the full + * set of features for the first two vCPUs, but clear all features in + * third vCPU in order to test both positive and negative paths. */ - const int NR_VCPUS = 2; + const int NR_VCPUS = 3; struct kvm_vcpu *vcpus[NR_VCPUS]; struct kvm_vm *vm; @@ -323,6 +328,23 @@ static void test_msrs(void) sync_global_to_guest(vm, msrs); sync_global_to_guest(vm, ignore_unsupported_msrs); + /* + * Clear features in the "unsupported features" vCPU. This needs to be + * done before the first vCPU run as KVM's ABI is that guest CPUID is + * immutable once the vCPU has been run. + */ + for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { + /* + * Don't clear LM; selftests are 64-bit only, and KVM doesn't + * honor LM=0 for MSRs that are supposed to exist if and only + * if the vCPU is a 64-bit model. Ditto for NONE; clearing a + * fake feature flag will result in false failures. + */ + if (memcmp(&msrs[idx].feature, &feat_lm, sizeof(feat_lm)) && + memcmp(&msrs[idx].feature, &feat_none, sizeof(feat_none))) + vcpu_clear_cpuid_feature(vcpus[2], msrs[idx].feature); + } + for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { sync_global_to_guest(vm, idx); From 80c2b6d8e7bb194744f5fdfc4f0560e053ababda Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:55 -0700 Subject: [PATCH 45/48] KVM: selftests: Add KVM_{G,S}ET_ONE_REG coverage to MSRs test When KVM_{G,S}ET_ONE_REG are supported, verify that MSRs can be accessed via ONE_REG and through the dedicated MSR ioctls. For simplicity, run the test twice, e.g. instead of trying to get MSR values into the exact right state when switching write methods. Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-49-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/msrs_test.c | 22 ++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c index ec94c70b3f06..680d381d1bf7 100644 --- a/tools/testing/selftests/kvm/x86/msrs_test.c +++ b/tools/testing/selftests/kvm/x86/msrs_test.c @@ -200,6 +200,9 @@ static void guest_main(void) } } +static bool has_one_reg; +static bool use_one_reg; + static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val) { u64 reset_val = msrs[idx].reset_val; @@ -213,11 +216,21 @@ static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val) TEST_ASSERT(val == guest_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx", guest_val, msr, val); - vcpu_set_msr(vcpu, msr, reset_val); + if (use_one_reg) + vcpu_set_reg(vcpu, KVM_X86_REG_MSR(msr), reset_val); + else + vcpu_set_msr(vcpu, msr, reset_val); val = vcpu_get_msr(vcpu, msr); TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_msr(0x%x), got 0x%lx", reset_val, msr, val); + + if (!has_one_reg) + return; + + val = vcpu_get_reg(vcpu, KVM_X86_REG_MSR(msr)); + TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", + reset_val, msr, val); } static void do_vcpu_run(struct kvm_vcpu *vcpu) @@ -357,5 +370,12 @@ static void test_msrs(void) int main(void) { + has_one_reg = kvm_has_cap(KVM_CAP_ONE_REG); + test_msrs(); + + if (has_one_reg) { + use_one_reg = true; + test_msrs(); + } } From 3469fd203bac201ad67d9586041689c4c6a87882 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:56 -0700 Subject: [PATCH 46/48] KVM: selftests: Add coverage for KVM-defined registers in MSRs test Add test coverage for the KVM-defined GUEST_SSP "register" in the MSRs test. While _KVM's_ goal is to not tie the uAPI of KVM-defined registers to any particular internal implementation, i.e. to not commit in uAPI to handling GUEST_SSP as an MSR, treating GUEST_SSP as an MSR for testing purposes is a-ok and is a naturally fit given the semantics of SSP. Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-50-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/msrs_test.c | 97 ++++++++++++++++++++- 1 file changed, 94 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c index 680d381d1bf7..54ef9c539eb3 100644 --- a/tools/testing/selftests/kvm/x86/msrs_test.c +++ b/tools/testing/selftests/kvm/x86/msrs_test.c @@ -17,9 +17,10 @@ struct kvm_msr { const u64 write_val; const u64 rsvd_val; const u32 index; + const bool is_kvm_defined; }; -#define ____MSR_TEST(msr, str, val, rsvd, reset, feat, f2) \ +#define ____MSR_TEST(msr, str, val, rsvd, reset, feat, f2, is_kvm) \ { \ .index = msr, \ .name = str, \ @@ -28,10 +29,11 @@ struct kvm_msr { .reset_val = reset, \ .feature = X86_FEATURE_ ##feat, \ .feature2 = X86_FEATURE_ ##f2, \ + .is_kvm_defined = is_kvm, \ } #define __MSR_TEST(msr, str, val, rsvd, reset, feat) \ - ____MSR_TEST(msr, str, val, rsvd, reset, feat, feat) + ____MSR_TEST(msr, str, val, rsvd, reset, feat, feat, false) #define MSR_TEST_NON_ZERO(msr, val, rsvd, reset, feat) \ __MSR_TEST(msr, #msr, val, rsvd, reset, feat) @@ -40,7 +42,7 @@ struct kvm_msr { __MSR_TEST(msr, #msr, val, rsvd, 0, feat) #define MSR_TEST2(msr, val, rsvd, feat, f2) \ - ____MSR_TEST(msr, #msr, val, rsvd, 0, feat, f2) + ____MSR_TEST(msr, #msr, val, rsvd, 0, feat, f2, false) /* * Note, use a page aligned value for the canonical value so that the value @@ -58,6 +60,9 @@ static const u64 u64_val = 0xaaaa5555aaaa5555ull; #define MSR_TEST_CANONICAL(msr, feat) \ __MSR_TEST(msr, #msr, canonical_val, NONCANONICAL, 0, feat) +#define MSR_TEST_KVM(msr, val, rsvd, feat) \ + ____MSR_TEST(KVM_REG_ ##msr, #msr, val, rsvd, 0, feat, feat, true) + /* * The main struct must be scoped to a function due to the use of structures to * define features. For the global structure, allocate enough space for the @@ -203,6 +208,83 @@ static void guest_main(void) static bool has_one_reg; static bool use_one_reg; +#define KVM_X86_MAX_NR_REGS 1 + +static bool vcpu_has_reg(struct kvm_vcpu *vcpu, u64 reg) +{ + struct { + struct kvm_reg_list list; + u64 regs[KVM_X86_MAX_NR_REGS]; + } regs = {}; + int r, i; + + /* + * If KVM_GET_REG_LIST succeeds with n=0, i.e. there are no supported + * regs, then the vCPU obviously doesn't support the reg. + */ + r = __vcpu_ioctl(vcpu, KVM_GET_REG_LIST, ®s.list); + if (!r) + return false; + + TEST_ASSERT_EQ(errno, E2BIG); + + /* + * KVM x86 is expected to support enumerating a relative small number + * of regs. The majority of registers supported by KVM_{G,S}ET_ONE_REG + * are enumerated via other ioctls, e.g. KVM_GET_MSR_INDEX_LIST. For + * simplicity, hardcode the maximum number of regs and manually update + * the test as necessary. + */ + TEST_ASSERT(regs.list.n <= KVM_X86_MAX_NR_REGS, + "KVM reports %llu regs, test expects at most %u regs, stale test?", + regs.list.n, KVM_X86_MAX_NR_REGS); + + vcpu_ioctl(vcpu, KVM_GET_REG_LIST, ®s.list); + for (i = 0; i < regs.list.n; i++) { + if (regs.regs[i] == reg) + return true; + } + + return false; +} + +static void host_test_kvm_reg(struct kvm_vcpu *vcpu) +{ + bool has_reg = vcpu_cpuid_has(vcpu, msrs[idx].feature); + u64 reset_val = msrs[idx].reset_val; + u64 write_val = msrs[idx].write_val; + u64 rsvd_val = msrs[idx].rsvd_val; + u32 reg = msrs[idx].index; + u64 val; + int r; + + if (!use_one_reg) + return; + + TEST_ASSERT_EQ(vcpu_has_reg(vcpu, KVM_X86_REG_KVM(reg)), has_reg); + + if (!has_reg) { + r = __vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg), &val); + TEST_ASSERT(r && errno == EINVAL, + "Expected failure on get_reg(0x%x)", reg); + rsvd_val = 0; + goto out; + } + + val = vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg)); + TEST_ASSERT(val == reset_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", + reset_val, reg, val); + + vcpu_set_reg(vcpu, KVM_X86_REG_KVM(reg), write_val); + val = vcpu_get_reg(vcpu, KVM_X86_REG_KVM(reg)); + TEST_ASSERT(val == write_val, "Wanted 0x%lx from get_reg(0x%x), got 0x%lx", + write_val, reg, val); + +out: + r = __vcpu_set_reg(vcpu, KVM_X86_REG_KVM(reg), rsvd_val); + TEST_ASSERT(r, "Expected failure on set_reg(0x%x, 0x%lx)", reg, rsvd_val); +} + static void host_test_msr(struct kvm_vcpu *vcpu, u64 guest_val) { u64 reset_val = msrs[idx].reset_val; @@ -314,6 +396,8 @@ static void test_msrs(void) MSR_TEST(MSR_IA32_PL2_SSP, canonical_val, canonical_val | 1, SHSTK), MSR_TEST_CANONICAL(MSR_IA32_PL3_SSP, SHSTK), MSR_TEST(MSR_IA32_PL3_SSP, canonical_val, canonical_val | 1, SHSTK), + + MSR_TEST_KVM(GUEST_SSP, canonical_val, NONCANONICAL, SHSTK), }; const struct kvm_x86_cpu_feature feat_none = X86_FEATURE_NONE; @@ -329,6 +413,7 @@ static void test_msrs(void) const int NR_VCPUS = 3; struct kvm_vcpu *vcpus[NR_VCPUS]; struct kvm_vm *vm; + int i; kvm_static_assert(sizeof(__msrs) <= sizeof(msrs)); kvm_static_assert(ARRAY_SIZE(__msrs) <= ARRAY_SIZE(msrs)); @@ -359,6 +444,12 @@ static void test_msrs(void) } for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { + if (msrs[idx].is_kvm_defined) { + for (i = 0; i < NR_VCPUS; i++) + host_test_kvm_reg(vcpus[i]); + continue; + } + sync_global_to_guest(vm, idx); vcpus_run(vcpus, NR_VCPUS); From 947ab90c91983c965acb994455734aae19317596 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 19 Sep 2025 15:32:57 -0700 Subject: [PATCH 47/48] KVM: selftests: Verify MSRs are (not) in save/restore list when (un)supported Add a check in the MSRs test to verify that KVM's reported support for MSRs with feature bits is consistent between KVM's MSR save/restore lists and KVM's supported CPUID. To deal with Intel's wonderful decision to bundle IBT and SHSTK under CET, track the "second" feature to avoid false failures when running on a CPU with only one of IBT or SHSTK. Reviewed-by: Chao Gao Link: https://lore.kernel.org/r/20250919223258.1604852-51-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/msrs_test.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/msrs_test.c b/tools/testing/selftests/kvm/x86/msrs_test.c index 54ef9c539eb3..40d918aedce6 100644 --- a/tools/testing/selftests/kvm/x86/msrs_test.c +++ b/tools/testing/selftests/kvm/x86/msrs_test.c @@ -444,12 +444,29 @@ static void test_msrs(void) } for (idx = 0; idx < ARRAY_SIZE(__msrs); idx++) { - if (msrs[idx].is_kvm_defined) { + struct kvm_msr *msr = &msrs[idx]; + + if (msr->is_kvm_defined) { for (i = 0; i < NR_VCPUS; i++) host_test_kvm_reg(vcpus[i]); continue; } + /* + * Verify KVM_GET_SUPPORTED_CPUID and KVM_GET_MSR_INDEX_LIST + * are consistent with respect to MSRs whose existence is + * enumerated via CPUID. Skip the check for FS/GS.base MSRs, + * as they aren't reported in the save/restore list since their + * state is managed via SREGS. + */ + TEST_ASSERT(msr->index == MSR_FS_BASE || msr->index == MSR_GS_BASE || + kvm_msr_is_in_save_restore_list(msr->index) == + (kvm_cpu_has(msr->feature) || kvm_cpu_has(msr->feature2)), + "%s %s in save/restore list, but %s according to CPUID", msr->name, + kvm_msr_is_in_save_restore_list(msr->index) ? "is" : "isn't", + (kvm_cpu_has(msr->feature) || kvm_cpu_has(msr->feature2)) ? + "supported" : "unsupported"); + sync_global_to_guest(vm, idx); vcpus_run(vcpus, NR_VCPUS); From d292035fb5d209b78beda356a2a9720154bd7c00 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 19 Sep 2025 15:32:58 -0700 Subject: [PATCH 48/48] KVM: VMX: Make CR4.CET a guest owned bit Make CR4.CET a guest-owned bit under VMX by extending KVM_POSSIBLE_CR4_GUEST_BITS accordingly. There's no need to intercept changes to CR4.CET, as it's neither included in KVM's MMU role bits, nor does KVM specifically care about the actual value of a (nested) guest's CR4.CET value, beside for enforcing architectural constraints, i.e. make sure that CR0.WP=1 if CR4.CET=1. Intercepting writes to CR4.CET is particularly bad for grsecurity kernels with KERNEXEC or, even worse, KERNSEAL enabled. These features heavily make use of read-only kernel objects and use a cpu-local CR0.WP toggle to override it, when needed. Under a CET-enabled kernel, this also requires toggling CR4.CET, hence the motivation to make it guest-owned. Using the old test from [1] gives the following runtime numbers (perf stat -r 5 ssdd 10 50000): * grsec guest on linux-6.16-rc5 + cet patches: 2.4647 +- 0.0706 seconds time elapsed ( +- 2.86% ) * grsec guest on linux-6.16-rc5 + cet patches + CR4.CET guest-owned: 1.5648 +- 0.0240 seconds time elapsed ( +- 1.53% ) Not only does not intercepting CR4.CET make the test run ~35% faster, it's also more stable with less fluctuation due to fewer VMEXITs. Therefore, make CR4.CET a guest-owned bit where possible. This change is VMX-specific, as SVM has no such fine-grained control register intercept control. If KVM's assumptions regarding MMU role handling wrt. a guest's CR4.CET value ever change, the BUILD_BUG_ON()s related to KVM_MMU_CR4_ROLE_BITS and KVM_POSSIBLE_CR4_GUEST_BITS will catch that early. Link: https://lore.kernel.org/kvm/20230322013731.102955-1-minipli@grsecurity.net/ [1] Reviewed-by: Chao Gao Signed-off-by: Mathias Krause Reviewed-by: Binbin Wu Link: https://lore.kernel.org/r/20250919223258.1604852-52-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/kvm_cache_regs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 36a8786db291..8ddb01191d6f 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -7,7 +7,8 @@ #define KVM_POSSIBLE_CR0_GUEST_BITS (X86_CR0_TS | X86_CR0_WP) #define KVM_POSSIBLE_CR4_GUEST_BITS \ (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE) + | X86_CR4_OSXMMEXCPT | X86_CR4_PGE | X86_CR4_TSD | X86_CR4_FSGSBASE \ + | X86_CR4_CET) #define X86_CR0_PDPTR_BITS (X86_CR0_CD | X86_CR0_NW | X86_CR0_PG) #define X86_CR4_TLBFLUSH_BITS (X86_CR4_PGE | X86_CR4_PCIDE | X86_CR4_PAE | X86_CR4_SMEP)