linux/arch/x86/virt/hw.c
Sean Christopherson e30aa03d03 x86/virt: Treat SVM as unsupported when running as an SEV+ guest
When running as an SEV+ guest, treat SVM as unsupported even if CPUID (and
other reporting, e.g. MSRs) enumerate support for SVM, as KVM doesn't
support nested virtualization within an SEV VM (KVM would need to
explicitly share all VMCBs and other assets with the untrusted host), let
alone running nested VMs within SEV-ES+ guests (e.g. emulating VMLOAD,
VMSAVE, and VMRUN all require access to guest register state).  And outside
of KVM, there is no in-tree user of SVM enabling.

Arguably, the hypervisor/VMM (e.g. QEMU) should clear SVM from guest CPUID
for SEV VMs, especially for SEV-ES+, but super duper technically, it's
feasible to run nested VMs in SEV+ guests (with many caveats).  More
importantly, Linux-as-a-guest has played nice with SVM being advertised to
SEV+ guests for a long time.

Treating SVM as unsupported fixes a regression where a clean shutdown of
an SEV-ES+ guest degrades into an abrupt termination.  Due to a gnarly
virtualization hole in SEV-ES (the architecture), where EFER must NOT be
intercepted by the hypervisor (because the untrusted hypervisor can't set
e.g. EFER.LME on behalf of the guest), the _host's_ EFER.SVME is visible to
the guest.  Because EFER.SVME must be always '1' while in guest mode,
Linux-the-guest sees EFER.SVME=1 even when _its_ EFER.SVME is '0', thinks
it has enabled virtualization, and ultimately can cause
x86_svm_emergency_disable_virtualization_cpu() to execute STGI to ensure
GIF is enabled.  Executing STGI _should_ be fine, except Linux is also a
wee bit paranoid when running as an SEV-ES guest.

Because L0 sees EFER.SVME=0 for the guest, a well-behaved L0 hypervisor
will intercept STGI (to inject #UD), and thus generate a #VC on the STGI.
Which, again, should be fine.  Unfortunately, vc_check_opcode_bytes() fails
to account for STGI and other SVM instructions, throws a fatal error, and
triggers a termination request.  In a perfect world, the #VC handler would
be more forgiving of unknown intercepts, especially when the #VC happened
on an instruction with exception fixup.  For now, just fix the immediate
regression.

Fixes: 428afac5a8 ("KVM: x86: Move bulk of emergency virtualizaton logic to virt subsystem")
Reported-by: Srikanth Aithal <sraithal@amd.com>
Closes: https://lore.kernel.org/all/c820e242-9f3a-4210-b414-19d11b022404@amd.com
Link: https://patch.msgid.link/20260409191341.1932853-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
2026-04-09 12:21:53 -07:00

361 lines
8.5 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/kvm_types.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <asm/perf_event.h>
#include <asm/processor.h>
#include <asm/virt.h>
#include <asm/vmx.h>
/*
 * Vendor-specific hooks for enabling and disabling hardware virtualization.
 * At most one implementation (VMX or SVM) is copied into virt_ops during init.
 */
struct x86_virt_ops {
	/* X86_FEATURE_VMX or X86_FEATURE_SVM; zero means no backend is installed. */
	int feature;
	/* Turn virtualization on; returns 0 on success or a negative errno. */
	int (*enable_virtualization_cpu)(void);
	/* Turn virtualization off; returns 0 on success or a negative errno. */
	int (*disable_virtualization_cpu)(void);
	/* Teardown used by the emergency disable path; reports no error. */
	void (*emergency_disable_virtualization_cpu)(void);
};
/* Active backend; left zero-filled (feature == 0) when no backend is installed. */
static struct x86_virt_ops virt_ops __ro_after_init;
/*
 * Set to true by the emergency disable callbacks; x86_virt_put_ref() tolerates
 * a failed disable once this is set.
 */
__visible bool virt_rebooting;
EXPORT_SYMBOL_FOR_KVM(virt_rebooting);
/* Per-CPU count of x86_virt_get_ref() holders. */
static DEFINE_PER_CPU(int, virtualization_nr_users);
/* RCU-managed hook invoked by the emergency disable callbacks. */
static cpu_emergency_virt_cb __rcu *kvm_emergency_callback;
/*
 * Install @callback as the hook that the emergency disable callbacks invoke.
 * Only one hook may be registered at a time; attempting to register while a
 * hook is already installed WARNs (once) and leaves the existing hook alone.
 */
void x86_virt_register_emergency_callback(cpu_emergency_virt_cb *callback)
{
	if (!WARN_ON_ONCE(rcu_access_pointer(kvm_emergency_callback)))
		rcu_assign_pointer(kvm_emergency_callback, callback);
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_register_emergency_callback);
/*
 * Remove @callback as the emergency hook and wait for an RCU grace period
 * before returning.  WARNs (once) and does nothing if @callback is not the
 * hook that is currently installed.
 */
void x86_virt_unregister_emergency_callback(cpu_emergency_virt_cb *callback)
{
	cpu_emergency_virt_cb *registered = rcu_access_pointer(kvm_emergency_callback);

	if (WARN_ON_ONCE(registered != callback))
		return;

	rcu_assign_pointer(kvm_emergency_callback, NULL);
	synchronize_rcu();
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_unregister_emergency_callback);
static void x86_virt_invoke_kvm_emergency_callback(void)
{
cpu_emergency_virt_cb *kvm_callback;
kvm_callback = rcu_dereference(kvm_emergency_callback);
if (kvm_callback)
kvm_callback();
}
#if IS_ENABLED(CONFIG_KVM_INTEL)
/*
 * Per-CPU VMCS page whose physical address is the VMXON operand; allocated in
 * __x86_vmx_init() and released in x86_vmx_exit().
 */
static DEFINE_PER_CPU(struct vmcs *, root_vmcs);
/*
 * Set CR4.VMXE and execute VMXON, passing the physical address of the
 * executing CPU's root_vmcs page as the operand.
 *
 * Returns 0 on success.  If VMXON faults, WARN once with the value of
 * MSR_IA32_FEAT_CTL (or 0xdeadbeef if reading the MSR fails), clear
 * CR4.VMXE again and return -EFAULT.
 */
static int x86_virt_cpu_vmxon(void)
{
	u64 vmxon_pointer = __pa(per_cpu(root_vmcs, raw_smp_processor_id()));
	u64 msr;

	cr4_set_bits(X86_CR4_VMXE);

	/* The exception table entry redirects a fault on VMXON to 'fault'. */
	asm goto("1: vmxon %[vmxon_pointer]\n\t"
		 _ASM_EXTABLE(1b, %l[fault])
		 : : [vmxon_pointer] "m"(vmxon_pointer)
		 : : fault);
	return 0;

fault:
	WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
		  rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
	/* Undo the CR4.VMXE update made above. */
	cr4_clear_bits(X86_CR4_VMXE);
	return -EFAULT;
}
/*
 * Enable VMX on the calling CPU.
 *
 * Returns -EBUSY if the CR4 shadow already has VMXE set, the error from
 * x86_virt_cpu_vmxon() if VMXON fails, and 0 on success.  The
 * intel_pt_handle_vmx(1) notification is reverted if VMXON fails.
 */
static int x86_vmx_enable_virtualization_cpu(void)
{
	int ret;

	if (cr4_read_shadow() & X86_CR4_VMXE)
		return -EBUSY;

	intel_pt_handle_vmx(1);

	ret = x86_virt_cpu_vmxon();
	if (ret)
		intel_pt_handle_vmx(0);

	return ret;
}
/*
 * Disable VMX and clear CR4.VMXE (even if VMXOFF faults).
 *
 * Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
 * atomically track post-VMXON state, e.g. this may be called in NMI context.
 * Eat all faults, as every other fault on VMXOFF is mode related, i.e. a
 * fault is guaranteed to be due to the !post-VMXON check unless the CPU is
 * magically in RM, VM86, compat mode, or at CPL>0.
 *
 * Returns 0 if VMXOFF executed without faulting, -EIO if it faulted.
 */
static int x86_vmx_disable_virtualization_cpu(void)
{
	int r = -EIO;

	/* A fault on VMXOFF jumps to 'fault', skipping the r = 0 assignment. */
	asm goto("1: vmxoff\n\t"
		 _ASM_EXTABLE(1b, %l[fault])
		 ::: "cc", "memory" : fault);
	r = 0;

fault:
	/* Reached on both the success and the fault path. */
	cr4_clear_bits(X86_CR4_VMXE);
	intel_pt_handle_vmx(0);
	return r;
}
/*
 * Emergency VMX teardown: flag that a reboot is in progress, then, if
 * CR4.VMXE is set, run the registered KVM hook and disable VMX.
 */
static void x86_vmx_emergency_disable_virtualization_cpu(void)
{
	virt_rebooting = true;

	/*
	 * CR4.VMXE can only be _set_ in task context, but it can be _cleared_
	 * in NMI context.  If this races with _another_ emergency call from
	 * NMI context, VMCLEAR (in KVM) and VMXOFF may #UD; KVM and the kernel
	 * eat those faults because the interrupting NMI callback will have set
	 * virt_rebooting.
	 */
	if (__read_cr4() & X86_CR4_VMXE) {
		x86_virt_invoke_kvm_emergency_callback();
		x86_vmx_disable_virtualization_cpu();
	}
}
/* Free every possible CPU's root VMCS page and reset its per-CPU pointer. */
static __init void x86_vmx_exit(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct vmcs **slot = &per_cpu(root_vmcs, cpu);

		free_page((unsigned long)*slot);
		*slot = NULL;
	}
}
/*
 * Allocate a root VMCS page for every possible CPU and install the VMX
 * callbacks in virt_ops.
 *
 * Returns 0 on success, -EOPNOTSUPP if VMX isn't enabled, -EIO if the
 * reported VMCS size exceeds a page, or -ENOMEM if a page allocation fails
 * (in which case pages allocated so far are released).
 */
static __init int __x86_vmx_init(void)
{
	const struct x86_virt_ops vmx_ops = {
		.emergency_disable_virtualization_cpu = x86_vmx_emergency_disable_virtualization_cpu,
		.disable_virtualization_cpu = x86_vmx_disable_virtualization_cpu,
		.enable_virtualization_cpu = x86_vmx_enable_virtualization_cpu,
		.feature = X86_FEATURE_VMX,
	};
	u64 vmx_basic;
	u32 revision;
	int cpu;

	if (!cpu_feature_enabled(X86_FEATURE_VMX))
		return -EOPNOTSUPP;

	rdmsrq(MSR_IA32_VMX_BASIC, vmx_basic);

	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
	if (WARN_ON_ONCE(vmx_basic_vmcs_size(vmx_basic) > PAGE_SIZE))
		return -EIO;

	/*
	 * The root VMCS handed to VMXON should carry the revision_id reported
	 * by the physical CPU, even if eVMCS is enabled (or will be enabled?)
	 * and even though TLFS doesn't explicitly document this.
	 */
	revision = vmx_basic_vmcs_revision_id(vmx_basic);

	for_each_possible_cpu(cpu) {
		struct page *page = __alloc_pages_node(cpu_to_node(cpu),
						       GFP_KERNEL | __GFP_ZERO, 0);
		struct vmcs *vmcs;

		if (WARN_ON_ONCE(!page)) {
			x86_vmx_exit();
			return -ENOMEM;
		}

		vmcs = page_address(page);
		vmcs->hdr.revision_id = revision;
		per_cpu(root_vmcs, cpu) = vmcs;
	}

	memcpy(&virt_ops, &vmx_ops, sizeof(virt_ops));
	return 0;
}
/*
 * Initialize VMX support, clearing the X86_FEATURE_VMX capability if that
 * fails for any reason.  Returns the result of __x86_vmx_init().
 */
static __init int x86_vmx_init(void)
{
	int ret = __x86_vmx_init();

	if (ret)
		setup_clear_cpu_cap(X86_FEATURE_VMX);

	return ret;
}
#else
/* Stubs for builds without CONFIG_KVM_INTEL: VMX is reported as unsupported. */
static __init int x86_vmx_init(void) { return -EOPNOTSUPP; }
static __init void x86_vmx_exit(void) { }
#endif
#if IS_ENABLED(CONFIG_KVM_AMD)
/*
 * Enable SVM by setting EFER.SVME.  Returns 0 on success, or -EBUSY if
 * EFER.SVME is already set.
 */
static int x86_svm_enable_virtualization_cpu(void)
{
	u64 efer;

	rdmsrq(MSR_EFER, efer);

	if (!(efer & EFER_SVME)) {
		wrmsrq(MSR_EFER, efer | EFER_SVME);
		return 0;
	}

	return -EBUSY;
}
/*
 * Execute STGI and then clear EFER.SVME.  EFER.SVME is cleared even if STGI
 * faults.
 *
 * Returns 0 if STGI executed without faulting, -EIO if it faulted.
 */
static int x86_svm_disable_virtualization_cpu(void)
{
	int r = -EIO;
	u64 efer;

	/*
	 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
	 * NMI aren't blocked.
	 */
	asm goto("1: stgi\n\t"
		 _ASM_EXTABLE(1b, %l[fault])
		 ::: "memory" : fault);
	r = 0;

fault:
	/* Reached on both the success and the fault path. */
	rdmsrq(MSR_EFER, efer);
	wrmsrq(MSR_EFER, efer & ~EFER_SVME);
	return r;
}
/*
 * Emergency SVM teardown: flag that a reboot is in progress, then, if
 * EFER.SVME reads as set, run the registered KVM hook and disable SVM.
 */
static void x86_svm_emergency_disable_virtualization_cpu(void)
{
	u64 efer;

	virt_rebooting = true;

	rdmsrq(MSR_EFER, efer);
	if (efer & EFER_SVME) {
		x86_virt_invoke_kvm_emergency_callback();
		x86_svm_disable_virtualization_cpu();
	}
}
/*
 * Install the SVM callbacks in virt_ops.
 *
 * Returns 0 on success, or -EOPNOTSUPP if SVM isn't enabled or if the kernel
 * is running as a guest with encrypted memory.
 */
static __init int x86_svm_init(void)
{
	const struct x86_virt_ops svm_ops = {
		.emergency_disable_virtualization_cpu = x86_svm_emergency_disable_virtualization_cpu,
		.disable_virtualization_cpu = x86_svm_disable_virtualization_cpu,
		.enable_virtualization_cpu = x86_svm_enable_virtualization_cpu,
		.feature = X86_FEATURE_SVM,
	};

	if (!cpu_feature_enabled(X86_FEATURE_SVM))
		return -EOPNOTSUPP;

	/* Treat SVM as unsupported when running as a memory-encrypted guest. */
	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
		return -EOPNOTSUPP;

	memcpy(&virt_ops, &svm_ops, sizeof(virt_ops));
	return 0;
}
#else
/* Stub for builds without CONFIG_KVM_AMD: SVM is reported as unsupported. */
static __init int x86_svm_init(void) { return -EOPNOTSUPP; }
#endif
/*
 * Take a reference to hardware virtualization on the current CPU, enabling
 * it in hardware when the per-CPU user count goes from zero to one.
 *
 * @feat: X86_FEATURE_VMX or X86_FEATURE_SVM; must match the installed backend.
 *
 * Returns 0 on success, -EOPNOTSUPP if no backend is installed or @feat
 * doesn't match it, or the backend's error if enabling fails (the reference
 * is dropped again in that case).
 */
int x86_virt_get_ref(int feat)
{
	int ret = 0;

	/* Ensure the !feature check can't get false positives. */
	BUILD_BUG_ON(!X86_FEATURE_SVM || !X86_FEATURE_VMX);

	if (!virt_ops.feature || virt_ops.feature != feat)
		return -EOPNOTSUPP;

	guard(preempt)();

	if (this_cpu_inc_return(virtualization_nr_users) <= 1) {
		ret = virt_ops.enable_virtualization_cpu();
		if (ret)
			WARN_ON_ONCE(this_cpu_dec_return(virtualization_nr_users));
	}

	return ret;
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_get_ref);
/*
 * Drop a reference taken with x86_virt_get_ref(), disabling virtualization in
 * hardware when the per-CPU user count reaches zero.
 *
 * @feat: currently unused.
 *
 * WARNs (once) and returns if the count is already zero.  A failure to
 * disable is fatal (BUG) unless virt_rebooting is set.
 */
void x86_virt_put_ref(int feat)
{
	guard(preempt)();

	if (WARN_ON_ONCE(!this_cpu_read(virtualization_nr_users)))
		return;

	if (this_cpu_dec_return(virtualization_nr_users))
		return;

	BUG_ON(virt_ops.disable_virtualization_cpu() && !virt_rebooting);
}
EXPORT_SYMBOL_FOR_KVM(x86_virt_put_ref);
/*
 * Disable virtualization, i.e. VMX or SVM, to ensure INIT is recognized during
 * reboot. VMX blocks INIT if the CPU is post-VMXON, and SVM blocks INIT if
 * GIF=0, i.e. if the crash occurred between CLGI and STGI.
 *
 * Returns -EOPNOTSUPP if no virtualization backend is installed, otherwise 0
 * after invoking the backend's emergency disable callback.
 */
int x86_virt_emergency_disable_virtualization_cpu(void)
{
	/* feature is zero when no backend is installed. */
	if (!virt_ops.feature)
		return -EOPNOTSUPP;

	/*
	 * IRQs must be disabled as virtualization is enabled in hardware via
	 * function call IPIs, i.e. IRQs need to be disabled to guarantee
	 * virtualization stays disabled.
	 */
	lockdep_assert_irqs_disabled();

	/*
	 * Do the NMI shootdown even if virtualization is off on _this_ CPU, as
	 * other CPUs may have virtualization enabled.
	 *
	 * TODO: Track whether or not virtualization might be enabled on other
	 * CPUs? May not be worth avoiding the NMI shootdown...
	 */
	virt_ops.emergency_disable_virtualization_cpu();
	return 0;
}
/*
 * Probe for VMX and SVM and keep whichever backend initializes successfully.
 * If both succeed, WARN (once), free the VMX allocations and clear virt_ops
 * so that neither backend is used.
 */
void __init x86_virt_init(void)
{
	bool vmx_ok, svm_ok;

	/*
	 * Attempt to initialize both SVM and VMX, and simply use whichever one
	 * is present. Refuse to enable/use SVM or VMX if both are somehow
	 * supported. No known CPU supports both SVM and VMX.
	 */
	vmx_ok = (x86_vmx_init() == 0);
	svm_ok = (x86_svm_init() == 0);

	if (!WARN_ON_ONCE(vmx_ok && svm_ok))
		return;

	x86_vmx_exit();
	memset(&virt_ops, 0, sizeof(virt_ops));
}