diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 5100ec3a20bd..7cf9b9899f86 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1098,6 +1098,21 @@ struct kvm_vcpu_arch { */ bool pdptrs_from_userspace; + /* + * Set if an emulated nested VM-Enter to L2 is pending completion. KVM + * must not synthesize a VM-Exit to L1 before entering L2, as VM-Exits + * can only occur at instruction boundaries. The only exception is + * VMX's "notify" exits, which exist in large part to break the CPU out + * of infinite ucode loops, but can corrupt vCPU state in the process! + * + * For all intents and purposes, this is a boolean, but it's tracked as + * a u8 so that KVM can detect when userspace may have stuffed vCPU + * state and generated an architecturally-impossible VM-Exit. + */ +#define KVM_NESTED_RUN_PENDING 1 +#define KVM_NESTED_RUN_PENDING_UNTRUSTED 2 + u8 nested_run_pending; + #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; #endif diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index edde36097ddc..bcfeb5e7c0ed 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -142,13 +142,13 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 exit_info_2; u32 exit_int_info; u32 exit_int_info_err; - u64 nested_ctl; + u64 misc_ctl; u64 avic_vapic_bar; u64 ghcb_gpa; u32 event_inj; u32 event_inj_err; u64 nested_cr3; - u64 virt_ext; + u64 misc_ctl2; u32 clean; u32 reserved_5; u64 next_rip; @@ -182,6 +182,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define TLB_CONTROL_FLUSH_ASID 3 #define TLB_CONTROL_FLUSH_ASID_LOCAL 7 +#define TLB_CONTROL_MASK GENMASK(2, 0) + #define ERAP_CONTROL_ALLOW_LARGER_RAP BIT(0) #define ERAP_CONTROL_CLEAR_RAP BIT(1) @@ -222,8 +224,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define X2APIC_MODE_SHIFT 30 #define X2APIC_MODE_MASK (1 << X2APIC_MODE_SHIFT) -#define LBR_CTL_ENABLE_MASK BIT_ULL(0) -#define 
VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) +#define SVM_INT_VECTOR_MASK GENMASK(7, 0) #define SVM_INTERRUPT_SHADOW_MASK BIT_ULL(0) #define SVM_GUEST_INTERRUPT_MASK BIT_ULL(1) @@ -239,10 +240,12 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) -#define SVM_NESTED_CTL_NP_ENABLE BIT(0) -#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) -#define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) +#define SVM_MISC_ENABLE_NP BIT(0) +#define SVM_MISC_ENABLE_SEV BIT(1) +#define SVM_MISC_ENABLE_SEV_ES BIT(2) +#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0) +#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1) #define SVM_TSC_RATIO_RSVD 0xffffff0000000000ULL #define SVM_TSC_RATIO_MIN 0x0000000000000001ULL @@ -636,6 +639,9 @@ static inline void __unused_size_checks(void) #define SVM_EVTINJ_VALID (1 << 31) #define SVM_EVTINJ_VALID_ERR (1 << 11) +#define SVM_EVTINJ_RESERVED_BITS ~(SVM_EVTINJ_VEC_MASK | SVM_EVTINJ_TYPE_MASK | \ + SVM_EVTINJ_VALID_ERR | SVM_EVTINJ_VALID) + #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK #define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6145dac4a605..c8c6cc0406d6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3887,8 +3887,7 @@ static int check_svme_pa(struct x86_emulate_ctxt *ctxt) { u64 rax = reg_read(ctxt, VCPU_REGS_RAX); - /* Valid physical address? 
*/ - if (rax & 0xffff000000000000ULL) + if (!ctxt->ops->page_address_valid(ctxt, rax)) return emulate_gp(ctxt, 0); return check_svme(ctxt); diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 6ce160ffa678..6301f79fcbae 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -305,14 +305,6 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu) { return false; } -static inline bool kvm_hv_is_tlb_flush_hcall(struct kvm_vcpu *vcpu) -{ - return false; -} -static inline bool guest_hv_cpuid_has_l2_tlb_flush(struct kvm_vcpu *vcpu) -{ - return false; -} static inline int kvm_hv_verify_vp_assist(struct kvm_vcpu *vcpu) { return 0; diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index fb3dab4b5a53..0abff36d0994 100644 --- a/arch/x86/kvm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -245,6 +245,8 @@ struct x86_emulate_ops { bool (*is_canonical_addr)(struct x86_emulate_ctxt *ctxt, gva_t addr, unsigned int flags); + + bool (*page_address_valid)(struct x86_emulate_ctxt *ctxt, gpa_t gpa); }; /* Type, address-of, and value of an instruction's operand. 
*/ diff --git a/arch/x86/kvm/svm/hyperv.h b/arch/x86/kvm/svm/hyperv.h index d3f8bfc05832..f70d076911a6 100644 --- a/arch/x86/kvm/svm/hyperv.h +++ b/arch/x86/kvm/svm/hyperv.h @@ -41,10 +41,17 @@ static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) return hv_vcpu->vp_assist_page.nested_control.features.directhypercall; } +static inline bool nested_svm_is_l2_tlb_flush_hcall(struct kvm_vcpu *vcpu) +{ + return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && + nested_svm_l2_tlb_flush_enabled(vcpu) && + kvm_hv_is_tlb_flush_hcall(vcpu); +} + void svm_hv_inject_synthetic_vmexit_post_tlb_flush(struct kvm_vcpu *vcpu); #else /* CONFIG_KVM_HYPERV */ static inline void nested_svm_hv_update_vm_vp_ids(struct kvm_vcpu *vcpu) {} -static inline bool nested_svm_l2_tlb_flush_enabled(struct kvm_vcpu *vcpu) +static inline bool nested_svm_is_l2_tlb_flush_hcall(struct kvm_vcpu *vcpu) { return false; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index b36c33255bed..961804df5f45 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -116,31 +116,28 @@ static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm) if (!nested_npt_enabled(svm)) return true; - if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)) + if (!(svm->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE)) return true; return false; } -void recalc_intercepts(struct vcpu_svm *svm) +void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm) { - struct vmcb_control_area *c, *h; - struct vmcb_ctrl_area_cached *g; + struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + struct vmcb *vmcb01 = svm->vmcb01.ptr; unsigned int i; - vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); - - if (!is_guest_mode(&svm->vcpu)) + if (WARN_ON_ONCE(svm->vmcb != vmcb02)) return; - c = &svm->vmcb->control; - h = &svm->vmcb01.ptr->control; - g = &svm->nested.ctl; + vmcb_mark_dirty(vmcb02, VMCB_INTERCEPTS); for (i = 0; i < 
MAX_INTERCEPT; i++) - c->intercepts[i] = h->intercepts[i]; + vmcb02->control.intercepts[i] = vmcb01->control.intercepts[i]; - if (g->int_ctl & V_INTR_MASKING_MASK) { + if (vmcb12_ctrl->int_ctl & V_INTR_MASKING_MASK) { /* * If L2 is active and V_INTR_MASKING is enabled in vmcb12, * disable intercept of CR8 writes as L2's CR8 does not affect @@ -151,24 +148,17 @@ void recalc_intercepts(struct vcpu_svm *svm) * the effective RFLAGS.IF for L1 interrupts will never be set * while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs). */ - vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); - if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF)) - vmcb_clr_intercept(c, INTERCEPT_VINTR); + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_CR8_WRITE); + if (!(vmcb01->save.rflags & X86_EFLAGS_IF)) + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_VINTR); } - /* - * We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB - * flush feature is enabled. - */ - if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu)) - vmcb_clr_intercept(c, INTERCEPT_VMMCALL); - for (i = 0; i < MAX_INTERCEPT; i++) - c->intercepts[i] |= g->intercepts[i]; + vmcb02->control.intercepts[i] |= vmcb12_ctrl->intercepts[i]; /* If SMI is not intercepted, ignore guest SMI intercept as well */ if (!intercept_smi) - vmcb_clr_intercept(c, INTERCEPT_SMI); + vmcb_clr_intercept(&vmcb02->control, INTERCEPT_SMI); if (nested_vmcb_needs_vls_intercept(svm)) { /* @@ -176,10 +166,10 @@ void recalc_intercepts(struct vcpu_svm *svm) * we must intercept these instructions to correctly * emulate them in case L1 doesn't intercept them. 
*/ - vmcb_set_intercept(c, INTERCEPT_VMLOAD); - vmcb_set_intercept(c, INTERCEPT_VMSAVE); + vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMLOAD); + vmcb_set_intercept(&vmcb02->control, INTERCEPT_VMSAVE); } else { - WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK)); + WARN_ON_ONCE(!(vmcb02->control.misc_ctl2 & SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE)); } } @@ -339,8 +329,56 @@ static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size) kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1); } -static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, - struct vmcb_ctrl_area_cached *control) +static bool nested_svm_event_inj_valid_exept(struct kvm_vcpu *vcpu, u8 vector) +{ + /* + * Vectors that do not correspond to a defined exception are invalid + * (including #NMI and reserved vectors). In a best effort to define + * valid exceptions based on the virtual CPU, make all exceptions always + * valid except those obviously tied to a CPU feature. + */ + switch (vector) { + case DE_VECTOR: case DB_VECTOR: case BP_VECTOR: case OF_VECTOR: + case BR_VECTOR: case UD_VECTOR: case NM_VECTOR: case DF_VECTOR: + case TS_VECTOR: case NP_VECTOR: case SS_VECTOR: case GP_VECTOR: + case PF_VECTOR: case MF_VECTOR: case AC_VECTOR: case MC_VECTOR: + case XM_VECTOR: case HV_VECTOR: case SX_VECTOR: + return true; + case CP_VECTOR: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK); + case VC_VECTOR: + return guest_cpu_cap_has(vcpu, X86_FEATURE_SEV_ES); + } + return false; +} + +/* + * According to the APM, VMRUN exits with SVM_EXIT_ERR if SVM_EVTINJ_VALID is + * set and: + * - The type of event_inj is not one of the defined values. + * - The type is SVM_EVTINJ_TYPE_EXEPT, but the vector is not a valid exception. 
+ */ +static bool nested_svm_check_event_inj(struct kvm_vcpu *vcpu, u32 event_inj) +{ + u32 type = event_inj & SVM_EVTINJ_TYPE_MASK; + u8 vector = event_inj & SVM_EVTINJ_VEC_MASK; + + if (!(event_inj & SVM_EVTINJ_VALID)) + return true; + + if (type != SVM_EVTINJ_TYPE_INTR && type != SVM_EVTINJ_TYPE_NMI && + type != SVM_EVTINJ_TYPE_EXEPT && type != SVM_EVTINJ_TYPE_SOFT) + return false; + + if (type == SVM_EVTINJ_TYPE_EXEPT && + !nested_svm_event_inj_valid_exept(vcpu, vector)) + return false; + + return true; +} + +static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu, + struct vmcb_ctrl_area_cached *control) { if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN))) return false; @@ -348,7 +386,8 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, if (CC(control->asid == 0)) return false; - if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled)) + if (CC((control->misc_ctl & SVM_MISC_ENABLE_NP) && + !kvm_vcpu_is_legal_gpa(vcpu, control->nested_cr3))) return false; if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa, @@ -363,12 +402,15 @@ static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu, return false; } + if (CC(!nested_svm_check_event_inj(vcpu, control->event_inj))) + return false; + return true; } /* Common checks that apply to both L1 and L2 state. */ -static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, - struct vmcb_save_area_cached *save) +static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu, + struct vmcb_save_area_cached *save) { if (CC(!(save->efer & EFER_SVME))) return false; @@ -390,6 +432,10 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, CC(!(save->cr0 & X86_CR0_PE)) || CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3))) return false; + + if (CC((save->cs.attrib & SVM_SELECTOR_L_MASK) && + (save->cs.attrib & SVM_SELECTOR_DB_MASK))) + return false; } /* Note, SVM doesn't have any additional restrictions on CR4. 
*/ @@ -402,26 +448,12 @@ static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu, return true; } -static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_save_area_cached *save = &svm->nested.save; - - return __nested_vmcb_check_save(vcpu, save); -} - -static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl; - - return __nested_vmcb_check_controls(vcpu, ctl); -} - int nested_svm_check_cached_vmcb12(struct kvm_vcpu *vcpu) { - if (!nested_vmcb_check_save(vcpu) || - !nested_vmcb_check_controls(vcpu)) + struct vcpu_svm *svm = to_svm(vcpu); + + if (!nested_vmcb_check_save(vcpu, &svm->nested.save) || + !nested_vmcb_check_controls(vcpu, &svm->nested.ctl)) return -EINVAL; return 0; @@ -456,37 +488,39 @@ void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu, nested_svm_sanitize_intercept(vcpu, to, SKINIT); nested_svm_sanitize_intercept(vcpu, to, RDPRU); - to->iopm_base_pa = from->iopm_base_pa; - to->msrpm_base_pa = from->msrpm_base_pa; + /* Always clear SVM_MISC_ENABLE_NP if the guest cannot use NPTs */ + to->misc_ctl = from->misc_ctl; + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NPT)) + to->misc_ctl &= ~SVM_MISC_ENABLE_NP; + + to->iopm_base_pa = from->iopm_base_pa & PAGE_MASK; + to->msrpm_base_pa = from->msrpm_base_pa & PAGE_MASK; to->tsc_offset = from->tsc_offset; - to->tlb_ctl = from->tlb_ctl; + to->tlb_ctl = from->tlb_ctl & TLB_CONTROL_MASK; to->erap_ctl = from->erap_ctl; to->int_ctl = from->int_ctl; - to->int_vector = from->int_vector; - to->int_state = from->int_state; + to->int_vector = from->int_vector & SVM_INT_VECTOR_MASK; + to->int_state = from->int_state & SVM_INTERRUPT_SHADOW_MASK; to->exit_code = from->exit_code; to->exit_info_1 = from->exit_info_1; to->exit_info_2 = from->exit_info_2; to->exit_int_info = from->exit_int_info; to->exit_int_info_err = from->exit_int_info_err; - to->nested_ctl = 
from->nested_ctl; - to->event_inj = from->event_inj; + to->event_inj = from->event_inj & ~SVM_EVTINJ_RESERVED_BITS; to->event_inj_err = from->event_inj_err; to->next_rip = from->next_rip; to->nested_cr3 = from->nested_cr3; - to->virt_ext = from->virt_ext; + to->misc_ctl2 = from->misc_ctl2; to->pause_filter_count = from->pause_filter_count; to->pause_filter_thresh = from->pause_filter_thresh; - /* Copy asid here because nested_vmcb_check_controls will check it. */ + /* Copy asid here because nested_vmcb_check_controls() will check it */ to->asid = from->asid; - to->msrpm_base_pa &= ~0x0fffULL; - to->iopm_base_pa &= ~0x0fffULL; + to->clean = from->clean; #ifdef CONFIG_KVM_HYPERV /* Hyper-V extensions (Enlightened VMCB) */ if (kvm_hv_hypercall_enabled(vcpu)) { - to->clean = from->clean; memcpy(&to->hv_enlightenments, &from->hv_enlightenments, sizeof(to->hv_enlightenments)); } @@ -502,17 +536,34 @@ void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm, static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to, struct vmcb_save_area *from) { - /* - * Copy only fields that are validated, as we need them - * to avoid TOC/TOU races. 
- */ - to->efer = from->efer; - to->cr0 = from->cr0; - to->cr3 = from->cr3; - to->cr4 = from->cr4; + to->es = from->es; + to->cs = from->cs; + to->ss = from->ss; + to->ds = from->ds; + to->gdtr = from->gdtr; + to->idtr = from->idtr; - to->dr6 = from->dr6; + to->cpl = from->cpl; + + to->efer = from->efer; + to->cr4 = from->cr4; + to->cr3 = from->cr3; + to->cr0 = from->cr0; to->dr7 = from->dr7; + to->dr6 = from->dr6; + + to->rflags = from->rflags; + to->rip = from->rip; + to->rsp = from->rsp; + + to->s_cet = from->s_cet; + to->ssp = from->ssp; + to->isst_addr = from->isst_addr; + + to->rax = from->rax; + to->cr2 = from->cr2; + + svm_copy_lbrs(to, from); } void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm, @@ -530,6 +581,7 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) u32 mask; svm->nested.ctl.event_inj = svm->vmcb->control.event_inj; svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err; + svm->nested.ctl.int_state = svm->vmcb->control.int_state; /* Only a few fields of int_ctl are written by the processor. */ mask = V_IRQ_MASK | V_TPR_MASK; @@ -542,7 +594,7 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm) * int_ctl (because it was never recognized while L2 was running). 
*/ if (svm_is_intercept(svm, INTERCEPT_VINTR) && - !test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts)) + !vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_VINTR)) mask &= ~V_IRQ_MASK; if (nested_vgif_enabled(svm)) @@ -648,8 +700,16 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm) svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat; } -static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb12_has_lbrv(struct kvm_vcpu *vcpu) { + return guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && + (to_svm(vcpu)->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR); +} + +static void nested_vmcb02_prepare_save(struct vcpu_svm *svm) +{ + struct vmcb_ctrl_area_cached *control = &svm->nested.ctl; + struct vmcb_save_area_cached *save = &svm->nested.save; bool new_vmcb12 = false; struct vmcb *vmcb01 = svm->vmcb01.ptr; struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; @@ -665,64 +725,64 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12 svm->nested.force_msr_bitmap_recalc = true; } - if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) { - vmcb02->save.es = vmcb12->save.es; - vmcb02->save.cs = vmcb12->save.cs; - vmcb02->save.ss = vmcb12->save.ss; - vmcb02->save.ds = vmcb12->save.ds; - vmcb02->save.cpl = vmcb12->save.cpl; + if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_SEG))) { + vmcb02->save.es = save->es; + vmcb02->save.cs = save->cs; + vmcb02->save.ss = save->ss; + vmcb02->save.ds = save->ds; + vmcb02->save.cpl = save->cpl; vmcb_mark_dirty(vmcb02, VMCB_SEG); } - if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) { - vmcb02->save.gdtr = vmcb12->save.gdtr; - vmcb02->save.idtr = vmcb12->save.idtr; + if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_DT))) { + vmcb02->save.gdtr = save->gdtr; + vmcb02->save.idtr = save->idtr; vmcb_mark_dirty(vmcb02, VMCB_DT); } if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) && - (unlikely(new_vmcb12 || 
vmcb_is_dirty(vmcb12, VMCB_CET)))) { - vmcb02->save.s_cet = vmcb12->save.s_cet; - vmcb02->save.isst_addr = vmcb12->save.isst_addr; - vmcb02->save.ssp = vmcb12->save.ssp; + (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_CET)))) { + vmcb02->save.s_cet = save->s_cet; + vmcb02->save.isst_addr = save->isst_addr; + vmcb02->save.ssp = save->ssp; vmcb_mark_dirty(vmcb02, VMCB_CET); } - kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED); + kvm_set_rflags(vcpu, save->rflags | X86_EFLAGS_FIXED); svm_set_efer(vcpu, svm->nested.save.efer); svm_set_cr0(vcpu, svm->nested.save.cr0); svm_set_cr4(vcpu, svm->nested.save.cr4); - svm->vcpu.arch.cr2 = vmcb12->save.cr2; + svm->vcpu.arch.cr2 = save->cr2; - kvm_rax_write(vcpu, vmcb12->save.rax); - kvm_rsp_write(vcpu, vmcb12->save.rsp); - kvm_rip_write(vcpu, vmcb12->save.rip); + kvm_rax_write(vcpu, save->rax); + kvm_rsp_write(vcpu, save->rsp); + kvm_rip_write(vcpu, save->rip); /* In case we don't even reach vcpu_run, the fields are not updated */ - vmcb02->save.rax = vmcb12->save.rax; - vmcb02->save.rsp = vmcb12->save.rsp; - vmcb02->save.rip = vmcb12->save.rip; + vmcb02->save.rax = save->rax; + vmcb02->save.rsp = save->rsp; + vmcb02->save.rip = save->rip; - if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) { + if (unlikely(new_vmcb12 || vmcb12_is_dirty(control, VMCB_DR))) { vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1; svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW; vmcb_mark_dirty(vmcb02, VMCB_DR); } - if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) { + if (nested_vmcb12_has_lbrv(vcpu)) { /* * Reserved bits of DEBUGCTL are ignored. Be consistent with * svm_set_msr's definition of reserved bits. 
*/ - svm_copy_lbrs(vmcb02, vmcb12); + svm_copy_lbrs(&vmcb02->save, save); vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS; } else { - svm_copy_lbrs(vmcb02, vmcb01); + svm_copy_lbrs(&vmcb02->save, &vmcb01->save); } + vmcb_mark_dirty(vmcb02, VMCB_LBR); svm_update_lbrv(&svm->vcpu); } @@ -750,18 +810,16 @@ static bool is_evtinj_nmi(u32 evtinj) return type == SVM_EVTINJ_TYPE_NMI; } -static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, - unsigned long vmcb12_rip, - unsigned long vmcb12_csbase) +static void nested_vmcb02_prepare_control(struct vcpu_svm *svm) { u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK; u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK; - struct kvm_vcpu *vcpu = &svm->vcpu; - struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb_ctrl_area_cached *vmcb12_ctrl = &svm->nested.ctl; struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; - u32 pause_count12; - u32 pause_thresh12; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct kvm_vcpu *vcpu = &svm->vcpu; + u32 pause_count12, pause_thresh12; nested_svm_transition_tlb_flush(vcpu); @@ -774,7 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, */ if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) && - (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK)) + (vmcb12_ctrl->int_ctl & V_GIF_ENABLE_MASK)) int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); else int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK); @@ -790,8 +848,16 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, V_NMI_BLOCKING_MASK); } - /* Copied from vmcb01. msrpm_base can be overwritten later. */ - vmcb02->control.nested_ctl = vmcb01->control.nested_ctl; + /* + * Copied from vmcb01. msrpm_base can be overwritten later. + * + * SVM_MISC_ENABLE_NP in vmcb12 is only used for consistency checks. If + * L1 enables NPTs, KVM shadows L1's NPTs and uses those to run L2. If + * L1 disables NPT, KVM runs L2 with the same NPTs used to run L1. 
For + * the latter, L1 runs L2 with shadow page tables that translate L2 GVAs + * to L1 GPAs, so the same NPTs can be used for L1 and L2. + */ + vmcb02->control.misc_ctl = vmcb01->control.misc_ctl & SVM_MISC_ENABLE_NP; vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa; vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa; vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP); @@ -818,7 +884,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * L1 re-enters L2, the same instruction will trigger a VM-Exit and the * entire cycle start over. */ - if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip)) + if (vmcb02->save.rip && (svm->nested.last_bus_lock_rip == vmcb02->save.rip)) vmcb02->control.bus_lock_counter = 1; else vmcb02->control.bus_lock_counter = 0; @@ -832,10 +898,9 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(vcpu); - vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( - vcpu->arch.l1_tsc_offset, - svm->nested.ctl.tsc_offset, - svm->tsc_ratio_msr); + vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(vcpu->arch.l1_tsc_offset, + vmcb12_ctrl->tsc_offset, + svm->tsc_ratio_msr); vmcb02->control.tsc_offset = vcpu->arch.tsc_offset; @@ -844,49 +909,49 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, nested_svm_update_tsc_ratio_msr(vcpu); vmcb02->control.int_ctl = - (svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) | + (vmcb12_ctrl->int_ctl & int_ctl_vmcb12_bits) | (vmcb01->control.int_ctl & int_ctl_vmcb01_bits); - vmcb02->control.int_vector = svm->nested.ctl.int_vector; - vmcb02->control.int_state = svm->nested.ctl.int_state; - vmcb02->control.event_inj = svm->nested.ctl.event_inj; - vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err; + vmcb02->control.int_vector = vmcb12_ctrl->int_vector; + vmcb02->control.int_state = vmcb12_ctrl->int_state; + vmcb02->control.event_inj = vmcb12_ctrl->event_inj; + 
vmcb02->control.event_inj_err = vmcb12_ctrl->event_inj_err; /* - * next_rip is consumed on VMRUN as the return address pushed on the - * stack for injected soft exceptions/interrupts. If nrips is exposed - * to L1, take it verbatim from vmcb12. If nrips is supported in - * hardware but not exposed to L1, stuff the actual L2 RIP to emulate - * what a nrips=0 CPU would do (L1 is responsible for advancing RIP - * prior to injecting the event). + * If nrips is exposed to L1, take NextRIP as-is. Otherwise, L1 + * advances L2's RIP before VMRUN instead of using NextRIP. KVM will + * stuff the current RIP as vmcb02's NextRIP before L2 is run. After + * the first run of L2 (e.g. after save+restore), NextRIP is updated by + * the CPU and/or KVM and should be used regardless of L1's support. */ - if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) - vmcb02->control.next_rip = svm->nested.ctl.next_rip; - else if (boot_cpu_has(X86_FEATURE_NRIPS)) - vmcb02->control.next_rip = vmcb12_rip; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || + !vcpu->arch.nested_run_pending) + vmcb02->control.next_rip = vmcb12_ctrl->next_rip; svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj); + + /* + * soft_int_csbase, soft_int_old_rip, and soft_int_next_rip (if L1 + * doesn't have NRIPS) are initialized later, before the vCPU is run. 
+ */ if (is_evtinj_soft(vmcb02->control.event_inj)) { svm->soft_int_injected = true; - svm->soft_int_csbase = vmcb12_csbase; - svm->soft_int_old_rip = vmcb12_rip; - if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) - svm->soft_int_next_rip = svm->nested.ctl.next_rip; - else - svm->soft_int_next_rip = vmcb12_rip; + if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS) || + !vcpu->arch.nested_run_pending) + svm->soft_int_next_rip = vmcb12_ctrl->next_rip; } - /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */ + /* SVM_MISC2_ENABLE_V_LBR is controlled by svm_update_lbrv() */ if (!nested_vmcb_needs_vls_intercept(svm)) - vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + vmcb02->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER)) - pause_count12 = svm->nested.ctl.pause_filter_count; + pause_count12 = vmcb12_ctrl->pause_filter_count; else pause_count12 = 0; if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD)) - pause_thresh12 = svm->nested.ctl.pause_filter_thresh; + pause_thresh12 = vmcb12_ctrl->pause_filter_thresh; else pause_thresh12 = 0; if (kvm_pause_in_guest(svm->vcpu.kvm)) { @@ -900,7 +965,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh; /* ... but ensure filtering is disabled if so requested. */ - if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) { + if (vmcb12_is_intercept(vmcb12_ctrl, INTERCEPT_PAUSE)) { if (!pause_count12) vmcb02->control.pause_filter_count = 0; if (!pause_thresh12) @@ -917,7 +982,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * L2 is the "guest"). 
*/ if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS)) - vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl & + vmcb02->control.erap_ctl = (vmcb12_ctrl->erap_ctl & ERAP_CONTROL_ALLOW_LARGER_RAP) | ERAP_CONTROL_CLEAR_RAP; @@ -925,7 +990,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. */ - recalc_intercepts(svm); + nested_vmcb02_recalc_intercepts(svm); } static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb) @@ -940,28 +1005,29 @@ static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl; } -int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, - struct vmcb *vmcb12, bool from_vmrun) +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, bool from_vmrun) { struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_ctrl_area_cached *control = &svm->nested.ctl; + struct vmcb_save_area_cached *save = &svm->nested.save; int ret; trace_kvm_nested_vmenter(svm->vmcb->save.rip, vmcb12_gpa, - vmcb12->save.rip, - vmcb12->control.int_ctl, - vmcb12->control.event_inj, - vmcb12->control.nested_ctl, - vmcb12->control.nested_cr3, - vmcb12->save.cr3, + save->rip, + control->int_ctl, + control->event_inj, + control->misc_ctl, + control->nested_cr3, + save->cr3, KVM_ISA_SVM); - trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff, - vmcb12->control.intercepts[INTERCEPT_CR] >> 16, - vmcb12->control.intercepts[INTERCEPT_EXCEPTION], - vmcb12->control.intercepts[INTERCEPT_WORD3], - vmcb12->control.intercepts[INTERCEPT_WORD4], - vmcb12->control.intercepts[INTERCEPT_WORD5]); + trace_kvm_nested_intercepts(control->intercepts[INTERCEPT_CR] & 0xffff, + control->intercepts[INTERCEPT_CR] >> 16, + control->intercepts[INTERCEPT_EXCEPTION], + control->intercepts[INTERCEPT_WORD3], + control->intercepts[INTERCEPT_WORD4], + 
control->intercepts[INTERCEPT_WORD5]); svm->nested.vmcb12_gpa = vmcb12_gpa; @@ -971,8 +1037,8 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base); - nested_vmcb02_prepare_save(svm, vmcb12); + nested_vmcb02_prepare_control(svm); + nested_vmcb02_prepare_save(svm); ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3, nested_npt_enabled(svm), from_vmrun); @@ -992,12 +1058,38 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa, return 0; } +static int nested_svm_copy_vmcb12_to_cache(struct kvm_vcpu *vcpu, u64 vmcb12_gpa) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_host_map map; + struct vmcb *vmcb12; + int r = 0; + + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) + return -EFAULT; + + vmcb12 = map.hva; + nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); + nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); + + if (nested_svm_check_cached_vmcb12(vcpu) < 0) { + vmcb12->control.exit_code = SVM_EXIT_ERR; + vmcb12->control.exit_info_1 = 0; + vmcb12->control.exit_info_2 = 0; + vmcb12->control.event_inj = 0; + vmcb12->control.event_inj_err = 0; + svm_set_gif(svm, false); + r = -EINVAL; + } + + kvm_vcpu_unmap(vcpu, &map); + return r; +} + int nested_svm_vmrun(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); int ret; - struct vmcb *vmcb12; - struct kvm_host_map map; u64 vmcb12_gpa; struct vmcb *vmcb01 = svm->vmcb01.ptr; @@ -1018,32 +1110,27 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) return ret; } - vmcb12_gpa = svm->vmcb->save.rax; - ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map); - if (ret == -EINVAL) { - kvm_inject_gp(vcpu, 0); - return 1; - } else if (ret) { - return kvm_skip_emulated_instruction(vcpu); - } - - ret = kvm_skip_emulated_instruction(vcpu); - - vmcb12 = map.hva; - if 
(WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); - nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); - - if (nested_svm_check_cached_vmcb12(vcpu) < 0) { - vmcb12->control.exit_code = SVM_EXIT_ERR; - vmcb12->control.exit_info_1 = 0; - vmcb12->control.exit_info_2 = 0; - goto out; + vmcb12_gpa = kvm_register_read(vcpu, VCPU_REGS_RAX); + if (!page_address_valid(vcpu, vmcb12_gpa)) { + kvm_inject_gp(vcpu, 0); + return 1; } + ret = nested_svm_copy_vmcb12_to_cache(vcpu, vmcb12_gpa); + if (ret) { + if (ret == -EFAULT) + return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); + + /* Advance RIP past VMRUN as part of the nested #VMEXIT. */ + return kvm_skip_emulated_instruction(vcpu); + } + + /* At this point, VMRUN is guaranteed to not fault; advance RIP. */ + ret = kvm_skip_emulated_instruction(vcpu); + /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. @@ -1057,27 +1144,20 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) if (!npt_enabled) vmcb01->save.cr3 = kvm_read_cr3(vcpu); - svm->nested.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; - if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true)) - goto out_exit_err; + if (enter_svm_guest_mode(vcpu, vmcb12_gpa, true) || + !nested_svm_merge_msrpm(vcpu)) { + vcpu->arch.nested_run_pending = 0; + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; - if (nested_svm_merge_msrpm(vcpu)) - goto out; + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; -out_exit_err: - svm->nested.nested_run_pending = 0; - svm->nmi_l1_to_l2 = false; - svm->soft_int_injected = false; - - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; - - nested_svm_vmexit(svm); - -out: - kvm_vcpu_unmap(vcpu, &map); + nested_svm_vmexit(svm); + } return ret; } @@ -1107,6 +1187,11 
@@ void svm_copy_vmrun_state(struct vmcb_save_area *to_save, to_save->isst_addr = from_save->isst_addr; to_save->ssp = from_save->ssp; } + + if (kvm_cpu_cap_has(X86_FEATURE_LBRV)) { + svm_copy_lbrs(to_save, from_save); + to_save->dbgctl &= ~DEBUGCTL_RESERVED_BITS; + } } void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) @@ -1125,36 +1210,20 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb) to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; } -int nested_svm_vmexit(struct vcpu_svm *svm) +static int nested_svm_vmexit_update_vmcb12(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &svm->vcpu; - struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; - struct vmcb *vmcb12; struct kvm_host_map map; + struct vmcb *vmcb12; int rc; rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map); - if (rc) { - if (rc == -EINVAL) - kvm_inject_gp(vcpu, 0); - return 1; - } + if (rc) + return rc; vmcb12 = map.hva; - /* Exit Guest-Mode */ - leave_guest_mode(vcpu); - svm->nested.vmcb12_gpa = 0; - WARN_ON_ONCE(svm->nested.nested_run_pending); - - kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); - - /* in case we halted in L2 */ - kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); - - /* Give the current vmcb to the guest */ - vmcb12->save.es = vmcb02->save.es; vmcb12->save.cs = vmcb02->save.cs; vmcb12->save.ss = vmcb02->save.ss; @@ -1164,7 +1233,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->save.efer = svm->vcpu.arch.efer; vmcb12->save.cr0 = kvm_read_cr0(vcpu); vmcb12->save.cr3 = kvm_read_cr3(vcpu); - vmcb12->save.cr2 = vmcb02->save.cr2; + vmcb12->save.cr2 = vcpu->arch.cr2; vmcb12->save.cr4 = svm->vcpu.arch.cr4; vmcb12->save.rflags = kvm_get_rflags(vcpu); vmcb12->save.rip = kvm_rip_read(vcpu); @@ -1191,9 +1260,43 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) vmcb12->control.next_rip = 
vmcb02->control.next_rip; + if (nested_vmcb12_has_lbrv(vcpu)) + svm_copy_lbrs(&vmcb12->save, &vmcb02->save); + + vmcb12->control.event_inj = 0; + vmcb12->control.event_inj_err = 0; vmcb12->control.int_ctl = svm->nested.ctl.int_ctl; - vmcb12->control.event_inj = svm->nested.ctl.event_inj; - vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err; + + trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, + vmcb12->control.exit_info_1, + vmcb12->control.exit_info_2, + vmcb12->control.exit_int_info, + vmcb12->control.exit_int_info_err, + KVM_ISA_SVM); + + kvm_vcpu_unmap(vcpu, &map); + return 0; +} + +void nested_svm_vmexit(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + struct vmcb *vmcb01 = svm->vmcb01.ptr; + struct vmcb *vmcb02 = svm->nested.vmcb02.ptr; + + if (nested_svm_vmexit_update_vmcb12(vcpu)) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + + /* Exit Guest-Mode */ + leave_guest_mode(vcpu); + svm->nested.vmcb12_gpa = 0; + + kvm_warn_on_nested_run_pending(vcpu); + + kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); + + /* in case we halted in L2 */ + kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE); if (!kvm_pause_in_guest(vcpu->kvm)) { vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count; @@ -1202,11 +1305,11 @@ int nested_svm_vmexit(struct vcpu_svm *svm) } /* - * Invalidate bus_lock_rip unless KVM is still waiting for the guest - * to make forward progress before re-enabling bus lock detection. + * Invalidate last_bus_lock_rip unless KVM is still waiting for the + * guest to make forward progress before re-enabling bus lock detection. 
*/ if (!vmcb02->control.bus_lock_counter) - svm->nested.ctl.bus_lock_rip = INVALID_GPA; + svm->nested.last_bus_lock_rip = INVALID_GPA; nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr); @@ -1239,11 +1342,10 @@ int nested_svm_vmexit(struct vcpu_svm *svm) if (!nested_exit_on_intr(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); - if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) - svm_copy_lbrs(vmcb12, vmcb02); - else - svm_copy_lbrs(vmcb01, vmcb02); + if (!nested_vmcb12_has_lbrv(vcpu)) { + svm_copy_lbrs(&vmcb01->save, &vmcb02->save); + vmcb_mark_dirty(vmcb01, VMCB_LBR); + } svm_update_lbrv(vcpu); @@ -1296,22 +1398,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vcpu.arch.dr7 = DR7_FIXED_1; kvm_update_dr7(&svm->vcpu); - trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code, - vmcb12->control.exit_info_1, - vmcb12->control.exit_info_2, - vmcb12->control.exit_int_info, - vmcb12->control.exit_int_info_err, - KVM_ISA_SVM); - - kvm_vcpu_unmap(vcpu, &map); - nested_svm_transition_tlb_flush(vcpu); nested_svm_uninit_mmu_context(vcpu); - rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true); - if (rc) - return 1; + if (nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true)) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + + /* Drop tracking for L1->L2 injected NMIs and soft IRQs */ + svm->nmi_l1_to_l2 = false; + svm->soft_int_injected = false; /* * Drop what we picked up for L2 via svm_complete_interrupts() so it @@ -1336,8 +1432,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) */ if (kvm_apicv_activated(vcpu->kvm)) __kvm_vcpu_update_apicv(vcpu); - - return 0; } static void nested_svm_triple_fault(struct kvm_vcpu *vcpu) @@ -1407,7 +1501,7 @@ void svm_leave_nested(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (is_guest_mode(vcpu)) { - svm->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; svm->nested.vmcb12_gpa = INVALID_GPA; 
leave_guest_mode(vcpu); @@ -1592,7 +1686,7 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) * previously injected event, the pending exception occurred while said * event was being delivered and thus needs to be handled. */ - bool block_nested_exceptions = svm->nested.nested_run_pending; + bool block_nested_exceptions = vcpu->arch.nested_run_pending; /* * New events (not exceptions) are only recognized at instruction * boundaries. If an event needs reinjection, then KVM is handling a @@ -1682,9 +1776,7 @@ int nested_svm_exit_special(struct vcpu_svm *svm) } case SVM_EXIT_VMMCALL: /* Hyper-V L2 TLB flush hypercall is handled by L0 */ - if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) && - nested_svm_l2_tlb_flush_enabled(vcpu) && - kvm_hv_is_tlb_flush_hcall(vcpu)) + if (nested_svm_is_l2_tlb_flush_hcall(vcpu)) return NESTED_EXIT_HOST; break; default: @@ -1729,12 +1821,12 @@ static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst, dst->exit_info_2 = from->exit_info_2; dst->exit_int_info = from->exit_int_info; dst->exit_int_info_err = from->exit_int_info_err; - dst->nested_ctl = from->nested_ctl; + dst->misc_ctl = from->misc_ctl; dst->event_inj = from->event_inj; dst->event_inj_err = from->event_inj_err; dst->next_rip = from->next_rip; - dst->nested_cr3 = from->nested_cr3; - dst->virt_ext = from->virt_ext; + dst->nested_cr3 = from->nested_cr3; + dst->misc_ctl2 = from->misc_ctl2; dst->pause_filter_count = from->pause_filter_count; dst->pause_filter_thresh = from->pause_filter_thresh; /* 'clean' and 'hv_enlightenments' are not changed by KVM */ @@ -1769,7 +1861,7 @@ static int svm_get_nested_state(struct kvm_vcpu *vcpu, kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; - if (svm->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; } @@ -1869,12 +1961,12 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, ret = -EINVAL; 
__nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl); - if (!__nested_vmcb_check_controls(vcpu, &ctl_cached)) + if (!nested_vmcb_check_controls(vcpu, &ctl_cached)) goto out_free; /* * Processor state contains L2 state. Check that it is - * valid for guest mode (see nested_vmcb_check_save). + * valid for guest mode (see nested_vmcb_check_save()). */ cr0 = kvm_read_cr0(vcpu); if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) @@ -1888,7 +1980,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, if (!(save->cr0 & X86_CR0_PG) || !(save->cr0 & X86_CR0_PE) || (save->rflags & X86_EFLAGS_VM) || - !__nested_vmcb_check_save(vcpu, &save_cached)) + !nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; @@ -1906,8 +1998,10 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); - svm->nested.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; + else + vcpu->arch.nested_run_pending = 0; svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa; @@ -1915,7 +2009,13 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, nested_copy_vmcb_control_to_cache(svm, ctl); svm_switch_vmcb(svm, &svm->nested.vmcb02); - nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base); + nested_vmcb02_prepare_control(svm); + + /* + * Any previously restored state (e.g. KVM_SET_SREGS) would mark fields + * dirty in vmcb01 instead of vmcb02, so mark all of vmcb02 dirty here. 
+ */ + vmcb_mark_all_dirty(svm->vmcb); /* * While the nested guest CR3 is already checked and set by @@ -1930,6 +2030,9 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm->nested.force_msr_bitmap_recalc = true; + if (kvm_vcpu_apicv_active(vcpu)) + kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 52517ef55ace..b1aa85a6ca5a 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -4591,7 +4591,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) struct kvm_sev_info *sev = to_kvm_sev_info(svm->vcpu.kvm); struct vmcb *vmcb = svm->vmcb01.ptr; - svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ES_ENABLE; + svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV_ES; /* * An SEV-ES guest requires a VMSA area that is a separate from the @@ -4631,7 +4631,7 @@ static void sev_es_init_vmcb(struct vcpu_svm *svm, bool init_event) if (!sev_vcpu_has_debug_swap(svm)) { vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - recalc_intercepts(svm); + svm_mark_intercepts_dirty(svm); } else { /* * Disable #DB intercept iff DebugSwap is enabled. 
KVM doesn't @@ -4662,7 +4662,7 @@ void sev_init_vmcb(struct vcpu_svm *svm, bool init_event) { struct kvm_vcpu *vcpu = &svm->vcpu; - svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; + svm->vmcb->control.misc_ctl |= SVM_MISC_ENABLE_SEV; clr_exception_intercept(svm, UD_VECTOR); /* diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d304568588c7..07ed964dacf5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -52,6 +52,7 @@ #include "svm.h" #include "svm_ops.h" +#include "hyperv.h" #include "kvm_onhyperv.h" #include "svm_onhyperv.h" @@ -216,6 +217,19 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) { if (!(efer & EFER_SVME)) { + /* + * Architecturally, clearing EFER.SVME while a guest is + * running yields undefined behavior, i.e. KVM can do + * literally anything. Force the vCPU back into L1 as + * that is the safest option for KVM, but synthesize a + * triple fault (for L1!) so that KVM at least doesn't + * run random L2 code in the context of L1. Do so if + * and only if the vCPU is actively running, e.g. to + * avoid false positives if userspace is stuffing state. 
+ */ + if (is_guest_mode(vcpu) && vcpu->wants_to_run) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + svm_leave_nested(vcpu); /* #GP intercept is still needed for vmware backdoor */ if (!enable_vmware_backdoor) @@ -244,6 +258,8 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm)) set_exception_intercept(svm, GP_VECTOR); } + + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); } svm->vmcb->save.efer = efer | EFER_SVME; @@ -635,7 +651,7 @@ static void set_dr_intercepts(struct vcpu_svm *svm) vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ); vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE); - recalc_intercepts(svm); + svm_mark_intercepts_dirty(svm); } static void clr_dr_intercepts(struct vcpu_svm *svm) @@ -644,7 +660,7 @@ static void clr_dr_intercepts(struct vcpu_svm *svm) vmcb->control.intercepts[INTERCEPT_DR] = 0; - recalc_intercepts(svm); + svm_mark_intercepts_dirty(svm); } static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) @@ -710,7 +726,7 @@ void *svm_alloc_permissions_map(unsigned long size, gfp_t gfp_mask) static void svm_recalc_lbr_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - bool intercept = !(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK); + bool intercept = !(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR); if (intercept == svm->lbr_msrs_intercepted) return; @@ -841,20 +857,9 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) */ } -void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb) -{ - to_vmcb->save.dbgctl = from_vmcb->save.dbgctl; - to_vmcb->save.br_from = from_vmcb->save.br_from; - to_vmcb->save.br_to = from_vmcb->save.br_to; - to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from; - to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to; - - vmcb_mark_dirty(to_vmcb, VMCB_LBR); -} - static void __svm_enable_lbrv(struct kvm_vcpu *vcpu) { - to_svm(vcpu)->vmcb->control.virt_ext |= 
LBR_CTL_ENABLE_MASK; + to_svm(vcpu)->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_LBR; } void svm_enable_lbrv(struct kvm_vcpu *vcpu) @@ -866,16 +871,16 @@ void svm_enable_lbrv(struct kvm_vcpu *vcpu) static void __svm_disable_lbrv(struct kvm_vcpu *vcpu) { KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm); - to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK; + to_svm(vcpu)->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_LBR; } void svm_update_lbrv(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK; + bool current_enable_lbrv = svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR; bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) || (is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) && - (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)); + (svm->nested.ctl.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR)); if (enable_lbrv && !current_enable_lbrv) __svm_enable_lbrv(vcpu); @@ -1009,6 +1014,14 @@ void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu) preempt_enable(); } +static bool svm_has_pending_gif_event(struct vcpu_svm *svm) +{ + return svm->vcpu.arch.smi_pending || + svm->vcpu.arch.nmi_pending || + kvm_cpu_has_injectable_intr(&svm->vcpu) || + kvm_apic_has_pending_init_or_sipi(&svm->vcpu); +} + /* Evaluate instruction intercepts that depend on guest CPUID features. */ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) { @@ -1034,22 +1047,50 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) } /* - * No need to toggle VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK here, it is - * always set if vls is enabled. If the intercepts are set, the bit is - * meaningless anyway. + * Intercept instructions that #UD if EFER.SVME=0, as SVME must be set + * even when running the guest, i.e. hardware will only ever see + * EFER.SVME=1. + * + * No need to toggle any of the vgif/vls/etc. 
enable bits here, as they + * are set when the VMCB is initialized and never cleared (if the + * relevant intercepts are set, the enablements are meaningless anyway). + * + * FIXME: When #GP is not intercepted, a #GP on these instructions (e.g. + * due to CPL > 0) could be injected by hardware before the instruction + * is intercepted, leading to #GP taking precedence over #UD from the + * guest's perspective. */ - if (guest_cpuid_is_intel_compatible(vcpu)) { + if (!(vcpu->arch.efer & EFER_SVME)) { svm_set_intercept(svm, INTERCEPT_VMLOAD); svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm_set_intercept(svm, INTERCEPT_CLGI); + svm_set_intercept(svm, INTERCEPT_STGI); } else { /* * If hardware supports Virtual VMLOAD VMSAVE then enable it * in VMCB and clear intercepts to avoid #VMEXIT. */ - if (vls) { + if (guest_cpuid_is_intel_compatible(vcpu)) { + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + } else if (vls) { svm_clr_intercept(svm, INTERCEPT_VMLOAD); svm_clr_intercept(svm, INTERCEPT_VMSAVE); } + + /* + * Process pending events when clearing STGI/CLGI intercepts if + * there's at least one pending event that is masked by GIF, so + * that KVM re-evaluates if the intercept needs to be set again + * to track when GIF is re-enabled (e.g. for NMI injection). 
+ */ + if (vgif) { + svm_clr_intercept(svm, INTERCEPT_CLGI); + svm_clr_intercept(svm, INTERCEPT_STGI); + + if (svm_has_pending_gif_event(svm)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } } if (kvm_need_rdpmc_intercept(vcpu)) @@ -1162,7 +1203,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) if (npt_enabled) { /* Setup VMCB for Nested Paging */ - control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; + control->misc_ctl |= SVM_MISC_ENABLE_NP; svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); svm_clr_intercept(svm, INTERCEPT_CR3_READ); @@ -1194,14 +1235,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu, bool init_event) if (vnmi) svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK; - if (vgif) { - svm_clr_intercept(svm, INTERCEPT_STGI); - svm_clr_intercept(svm, INTERCEPT_CLGI); + if (vgif) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; - } if (vls) - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; if (vcpu->kvm->arch.bus_lock_detection_enabled) svm_set_intercept(svm, INTERCEPT_BUSLOCK); @@ -2151,6 +2189,7 @@ static int intr_interception(struct kvm_vcpu *vcpu) static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) { + u64 vmcb12_gpa = kvm_register_read(vcpu, VCPU_REGS_RAX); struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb12; struct kvm_host_map map; @@ -2159,13 +2198,14 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) if (nested_svm_check_permissions(vcpu)) return 1; - ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map); - if (ret) { - if (ret == -EINVAL) - kvm_inject_gp(vcpu, 0); + if (!page_address_valid(vcpu, vmcb12_gpa)) { + kvm_inject_gp(vcpu, 0); return 1; } + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map)) + return kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); + vmcb12 = map.hva; ret = kvm_skip_emulated_instruction(vcpu); @@ -2202,58 +2242,28 @@ 
static int vmrun_interception(struct kvm_vcpu *vcpu) return nested_svm_vmrun(vcpu); } -enum { - NONE_SVM_INSTR, - SVM_INSTR_VMRUN, - SVM_INSTR_VMLOAD, - SVM_INSTR_VMSAVE, -}; - -/* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ -static int svm_instr_opcode(struct kvm_vcpu *vcpu) +/* Return 0 if not SVM instr, otherwise return associated exit_code */ +static u64 svm_get_decoded_instr_exit_code(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->b != 0x1 || ctxt->opcode_len != 2) - return NONE_SVM_INSTR; + return 0; + + BUILD_BUG_ON(!SVM_EXIT_VMRUN || !SVM_EXIT_VMLOAD || !SVM_EXIT_VMSAVE); switch (ctxt->modrm) { case 0xd8: /* VMRUN */ - return SVM_INSTR_VMRUN; + return SVM_EXIT_VMRUN; case 0xda: /* VMLOAD */ - return SVM_INSTR_VMLOAD; + return SVM_EXIT_VMLOAD; case 0xdb: /* VMSAVE */ - return SVM_INSTR_VMSAVE; + return SVM_EXIT_VMSAVE; default: break; } - return NONE_SVM_INSTR; -} - -static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) -{ - const int guest_mode_exit_codes[] = { - [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, - [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, - [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, - }; - int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { - [SVM_INSTR_VMRUN] = vmrun_interception, - [SVM_INSTR_VMLOAD] = vmload_interception, - [SVM_INSTR_VMSAVE] = vmsave_interception, - }; - struct vcpu_svm *svm = to_svm(vcpu); - int ret; - - if (is_guest_mode(vcpu)) { - /* Returns '1' or -errno on failure, '0' on success. 
*/ - ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]); - if (ret) - return ret; - return 1; - } - return svm_instr_handlers[opcode](vcpu); + return 0; } /* @@ -2268,7 +2278,7 @@ static int gp_interception(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 error_code = svm->vmcb->control.exit_info_1; - int opcode; + u64 svm_exit_code; /* Both #GP cases have zero error_code */ if (error_code) @@ -2278,27 +2288,37 @@ static int gp_interception(struct kvm_vcpu *vcpu) if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) goto reinject; - opcode = svm_instr_opcode(vcpu); + /* FIXME: Handle SVM instructions through the emulator */ + svm_exit_code = svm_get_decoded_instr_exit_code(vcpu); + if (svm_exit_code) { + if (!is_guest_mode(vcpu)) + return svm_invoke_exit_handler(vcpu, svm_exit_code); - if (opcode == NONE_SVM_INSTR) { - if (!enable_vmware_backdoor) + if (nested_svm_check_permissions(vcpu)) + return 1; + + if (!page_address_valid(vcpu, kvm_register_read(vcpu, VCPU_REGS_RAX))) goto reinject; /* - * VMware backdoor emulation on #GP interception only handles - * IN{S}, OUT{S}, and RDPMC. + * FIXME: Only synthesize a #VMEXIT if L1 sets the intercept, + * but only after the VMLOAD/VMSAVE exit handlers can properly + * handle VMLOAD/VMSAVE from L2 with VLS enabled in L1 (i.e. + * RAX is an L2 GPA that needs translation through L1's NPT). */ - if (!is_guest_mode(vcpu)) - return kvm_emulate_instruction(vcpu, - EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); - } else { - /* All SVM instructions expect page aligned RAX */ - if (svm->vmcb->save.rax & ~PAGE_MASK) - goto reinject; - - return emulate_svm_instr(vcpu, opcode); + nested_svm_simple_vmexit(svm, svm_exit_code); + return 1; } + /* + * VMware backdoor emulation on #GP interception only handles + * IN{S}, OUT{S}, and RDPMC, and only for L1. 
+ */ + if (!enable_vmware_backdoor || is_guest_mode(vcpu)) + goto reinject; + + return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); + reinject: kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); return 1; @@ -2319,10 +2339,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) svm_clear_vintr(svm); enable_gif(svm); - if (svm->vcpu.arch.smi_pending || - svm->vcpu.arch.nmi_pending || - kvm_cpu_has_injectable_intr(&svm->vcpu) || - kvm_apic_has_pending_init_or_sipi(&svm->vcpu)) + if (svm_has_pending_gif_event(svm)) kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } else { disable_gif(svm); @@ -2366,6 +2383,9 @@ static int invlpga_interception(struct kvm_vcpu *vcpu) gva_t gva = kvm_rax_read(vcpu); u32 asid = kvm_rcx_read(vcpu); + if (nested_svm_check_permissions(vcpu)) + return 1; + /* FIXME: Handle an address size prefix. */ if (!is_long_mode(vcpu)) gva = (u32)gva; @@ -2723,6 +2743,24 @@ static int svm_get_feature_msr(u32 msr, u64 *data) return 0; } +static u64 *svm_vmcb_lbr(struct vcpu_svm *svm, u32 msr) +{ + switch (msr) { + case MSR_IA32_LASTBRANCHFROMIP: + return &svm->vmcb->save.br_from; + case MSR_IA32_LASTBRANCHTOIP: + return &svm->vmcb->save.br_to; + case MSR_IA32_LASTINTFROMIP: + return &svm->vmcb->save.last_excp_from; + case MSR_IA32_LASTINTTOIP: + return &svm->vmcb->save.last_excp_to; + default: + break; + } + KVM_BUG_ON(1, svm->vcpu.kvm); + return &svm->vmcb->save.br_from; +} + static bool sev_es_prevent_msr_access(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { @@ -2796,19 +2834,13 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = svm->tsc_aux; break; case MSR_IA32_DEBUGCTLMSR: - msr_info->data = svm->vmcb->save.dbgctl; + msr_info->data = lbrv ? 
svm->vmcb->save.dbgctl : 0; break; case MSR_IA32_LASTBRANCHFROMIP: - msr_info->data = svm->vmcb->save.br_from; - break; case MSR_IA32_LASTBRANCHTOIP: - msr_info->data = svm->vmcb->save.br_to; - break; case MSR_IA32_LASTINTFROMIP: - msr_info->data = svm->vmcb->save.last_excp_from; - break; case MSR_IA32_LASTINTTOIP: - msr_info->data = svm->vmcb->save.last_excp_to; + msr_info->data = lbrv ? *svm_vmcb_lbr(svm, msr_info->index) : 0; break; case MSR_VM_HSAVE_PA: msr_info->data = svm->nested.hsave_msr; @@ -3083,6 +3115,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) vmcb_mark_dirty(svm->vmcb, VMCB_LBR); svm_update_lbrv(vcpu); break; + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: + if (!lbrv) + return KVM_MSR_RET_UNSUPPORTED; + if (!msr->host_initiated) + return 1; + *svm_vmcb_lbr(svm, ecx) = data; + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); + break; case MSR_VM_HSAVE_PA: /* * Old kernels did not validate the value written to @@ -3224,11 +3267,27 @@ static int bus_lock_exit(struct kvm_vcpu *vcpu) vcpu->arch.complete_userspace_io = complete_userspace_buslock; if (is_guest_mode(vcpu)) - svm->nested.ctl.bus_lock_rip = vcpu->arch.cui_linear_rip; + svm->nested.last_bus_lock_rip = vcpu->arch.cui_linear_rip; return 0; } +static int vmmcall_interception(struct kvm_vcpu *vcpu) +{ + /* + * Inject a #UD if L2 is active and the VMMCALL isn't a Hyper-V TLB + * hypercall, as VMMCALL #UDs if it's not intercepted, and this path is + * reachable if and only if L1 doesn't want to intercept VMMCALL or has + * enabled L0 (KVM) handling of Hyper-V L2 TLB flush hypercalls. 
+ */ + if (is_guest_mode(vcpu) && !nested_svm_is_l2_tlb_flush_hcall(vcpu)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + + return kvm_emulate_hypercall(vcpu); +} + static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3279,7 +3338,7 @@ static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, [SVM_EXIT_VMRUN] = vmrun_interception, - [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, + [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = vmload_interception, [SVM_EXIT_VMSAVE] = vmsave_interception, [SVM_EXIT_STGI] = stgi_interception, @@ -3354,13 +3413,13 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); - pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); + pr_err("%-20s%lld\n", "misc_ctl:", control->misc_ctl); pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%016llx\n", "ghcb:", control->ghcb_gpa); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); - pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext); + pr_err("%-20s%lld\n", "misc_ctl2:", control->misc_ctl2); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); @@ -3638,6 +3697,16 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, svm->vmcb->control.exit_code); } +static void svm_set_nested_run_soft_int_state(struct 
kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->soft_int_csbase = svm->vmcb->save.cs.base; + svm->soft_int_old_rip = kvm_rip_read(vcpu); + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) + svm->soft_int_next_rip = kvm_rip_read(vcpu); +} + static int pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); @@ -3739,6 +3808,36 @@ static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) svm->vmcb->control.event_inj = intr->nr | SVM_EVTINJ_VALID | type; } +static void svm_fixup_nested_rips(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!is_guest_mode(vcpu) || !vcpu->arch.nested_run_pending) + return; + + /* + * If nrips is supported in hardware but not exposed to L1, stuff the + * actual L2 RIP to emulate what a nrips=0 CPU would do (L1 is + * responsible for advancing RIP prior to injecting the event). Once L2 + * runs after L1 executes VMRUN, NextRIP is updated by the CPU and/or + * KVM, and this is no longer needed. + * + * This is done here (as opposed to when preparing vmcb02) to use the + * most up-to-date value of RIP regardless of the order of restoring + * registers and nested state in the vCPU save+restore path. + */ + if (boot_cpu_has(X86_FEATURE_NRIPS) && + !guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS)) + svm->vmcb->control.next_rip = kvm_rip_read(vcpu); + + /* + * Similarly, initialize the soft int metadata here to use the most + * up-to-date values of RIP and CS base, regardless of restore order. 
+ */ + if (svm->soft_int_injected) + svm_set_nested_run_soft_int_state(vcpu); +} + void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, int trig_mode, int vector) { @@ -3861,7 +3960,7 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) return -EBUSY; if (svm_nmi_blocked(vcpu)) @@ -3903,7 +4002,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) return -EBUSY; if (svm_interrupt_blocked(vcpu)) @@ -4106,6 +4205,18 @@ static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); struct vcpu_svm *svm = to_svm(vcpu); + /* + * Initialize the soft int fields *before* reading them below if KVM + * aborted entry to the guest with a nested VMRUN pending. To ensure + * KVM uses up-to-date values for RIP and CS base across save/restore, + * regardless of restore order, KVM waits to set the soft int fields + * until VMRUN is imminent. But when canceling injection, KVM requeues + * the soft int and will reinject it via the standard injection flow, + * and so KVM needs to grab the state from the pending nested VMRUN. + */ + if (is_guest_mode(vcpu) && vcpu->arch.nested_run_pending) + svm_set_nested_run_soft_int_state(vcpu); + /* * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's * associated with the original soft exception/interrupt. 
next_rip is @@ -4335,6 +4446,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) kvm_register_is_dirty(vcpu, VCPU_EXREG_ERAPS)) svm->vmcb->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP; + svm_fixup_nested_rips(vcpu); + svm_hv_update_vp_id(svm->vmcb, vcpu); /* @@ -4355,7 +4468,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) * VM-Exit), as running with the host's DEBUGCTL can negatively affect * guest state and can even be fatal, e.g. due to Bus Lock Detect. */ - if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + if (!(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR) && vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) update_debugctlmsr(svm->vmcb->save.dbgctl); @@ -4386,7 +4499,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); - if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + if (!(svm->vmcb->control.misc_ctl2 & SVM_MISC2_ENABLE_V_LBR) && vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) update_debugctlmsr(vcpu->arch.host_debugctl); @@ -4404,11 +4517,11 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) nested_sync_control_from_vmcb02(svm); /* Track VMRUNs that have made past consistency checking */ - if (svm->nested.nested_run_pending && + if (vcpu->arch.nested_run_pending && !svm_is_vmrun_failure(svm->vmcb->control.exit_code)) ++vcpu->stat.nested_run; - svm->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; } svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; @@ -4436,6 +4549,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) svm_complete_interrupts(vcpu); + /* + * Update the cache after completing interrupts to get an accurate + * NextRIP, e.g. when re-injecting a soft interrupt. 
+ * + * FIXME: Rework svm_get_nested_state() to not pull data from the + * cache (except for maybe int_ctl). + */ + if (is_guest_mode(vcpu)) + svm->nested.ctl.next_rip = svm->vmcb->control.next_rip; + return svm_exit_handlers_fastpath(vcpu); } @@ -4767,7 +4890,7 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu) static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) return -EBUSY; if (svm_smi_blocked(vcpu)) @@ -4784,7 +4907,6 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_host_map map_save; - int ret; if (!is_guest_mode(vcpu)) return 0; @@ -4804,9 +4926,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); - if (ret) - return ret; + nested_svm_simple_vmexit(svm, SVM_EXIT_SW); /* * KVM uses VMCB01 to store L1 host state while L2 runs but @@ -4884,12 +5004,11 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) if (nested_svm_check_cached_vmcb12(vcpu) < 0) goto unmap_save; - if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, - vmcb12, false) != 0) + if (enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, false) != 0) goto unmap_save; ret = 0; - svm->nested.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; unmap_save: kvm_vcpu_unmap(vcpu, &map_save); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6942e6b0eda6..c36802285236 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -140,12 +140,32 @@ struct kvm_vmcb_info { }; struct vmcb_save_area_cached { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg gdtr; + struct vmcb_seg idtr; 
+ u8 cpl; u64 efer; u64 cr4; u64 cr3; u64 cr0; u64 dr7; u64 dr6; + u64 rflags; + u64 rip; + u64 rsp; + u64 s_cet; + u64 ssp; + u64 isst_addr; + u64 rax; + u64 cr2; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; }; struct vmcb_ctrl_area_cached { @@ -166,14 +186,13 @@ struct vmcb_ctrl_area_cached { u64 exit_info_2; u32 exit_int_info; u32 exit_int_info_err; - u64 nested_ctl; + u64 misc_ctl; u32 event_inj; u32 event_inj_err; u64 next_rip; u64 nested_cr3; - u64 virt_ext; + u64 misc_ctl2; u32 clean; - u64 bus_lock_rip; union { #if IS_ENABLED(CONFIG_HYPERV) || IS_ENABLED(CONFIG_KVM_HYPERV) struct hv_vmcb_enlightenments hv_enlightenments; @@ -188,6 +207,7 @@ struct svm_nested_state { u64 vm_cr_msr; u64 vmcb12_gpa; u64 last_vmcb12_gpa; + u64 last_bus_lock_rip; /* * The MSR permissions map used for vmcb02, which is the merge result @@ -195,10 +215,6 @@ struct svm_nested_state { */ void *msrpm; - /* A VMRUN has started but has not yet been performed, so - * we cannot inject a nested vmexit yet. 
*/ - bool nested_run_pending; - /* cache for control fields of the guest */ struct vmcb_ctrl_area_cached ctl; @@ -357,8 +373,6 @@ struct svm_cpu_data { DECLARE_PER_CPU(struct svm_cpu_data, svm_data); -void recalc_intercepts(struct vcpu_svm *svm); - static __always_inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) { return container_of(kvm, struct kvm_svm, kvm); @@ -415,9 +429,9 @@ static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) vmcb->control.clean &= ~(1 << bit); } -static inline bool vmcb_is_dirty(struct vmcb *vmcb, int bit) +static inline bool vmcb12_is_dirty(struct vmcb_ctrl_area_cached *control, int bit) { - return !test_bit(bit, (unsigned long *)&vmcb->control.clean); + return !test_bit(bit, (unsigned long *)&control->clean); } static __always_inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) @@ -486,6 +500,22 @@ static inline bool vmcb12_is_intercept(struct vmcb_ctrl_area_cached *control, u3 return __vmcb_is_intercept((unsigned long *)&control->intercepts, bit); } +void nested_vmcb02_recalc_intercepts(struct vcpu_svm *svm); + +static inline void svm_mark_intercepts_dirty(struct vcpu_svm *svm) +{ + vmcb_mark_dirty(svm->vmcb01.ptr, VMCB_INTERCEPTS); + + /* + * If L2 is active, recalculate the intercepts for vmcb02 to account + * for the changes made to vmcb01. All intercept configuration is done + * for vmcb01 and then propagated to vmcb02 to combine KVM's intercepts + * with L1's intercepts (from the vmcb12 snapshot). 
+ */ + if (is_guest_mode(&svm->vcpu)) + nested_vmcb02_recalc_intercepts(svm); +} + static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) { struct vmcb *vmcb = svm->vmcb01.ptr; @@ -493,7 +523,7 @@ static inline void set_exception_intercept(struct vcpu_svm *svm, u32 bit) WARN_ON_ONCE(bit >= 32); vmcb_set_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); - recalc_intercepts(svm); + svm_mark_intercepts_dirty(svm); } static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) @@ -503,7 +533,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, u32 bit) WARN_ON_ONCE(bit >= 32); vmcb_clr_intercept(&vmcb->control, INTERCEPT_EXCEPTION_OFFSET + bit); - recalc_intercepts(svm); + svm_mark_intercepts_dirty(svm); } static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) @@ -512,7 +542,7 @@ static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) vmcb_set_intercept(&vmcb->control, bit); - recalc_intercepts(svm); + svm_mark_intercepts_dirty(svm); } static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) @@ -521,7 +551,7 @@ static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) vmcb_clr_intercept(&vmcb->control, bit); - recalc_intercepts(svm); + svm_mark_intercepts_dirty(svm); } static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) @@ -578,7 +608,7 @@ static inline bool gif_set(struct vcpu_svm *svm) static inline bool nested_npt_enabled(struct vcpu_svm *svm) { - return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; + return svm->nested.ctl.misc_ctl & SVM_MISC_ENABLE_NP; } static inline bool nested_vnmi_enabled(struct vcpu_svm *svm) @@ -713,8 +743,16 @@ static inline void *svm_vcpu_alloc_msrpm(void) return svm_alloc_permissions_map(MSRPM_SIZE, GFP_KERNEL_ACCOUNT); } +#define svm_copy_lbrs(to, from) \ +do { \ + (to)->dbgctl = (from)->dbgctl; \ + (to)->br_from = (from)->br_from; \ + (to)->br_to = (from)->br_to; \ + (to)->last_excp_from = 
(from)->last_excp_from; \ + (to)->last_excp_to = (from)->last_excp_to; \ +} while (0) + void svm_vcpu_free_msrpm(void *msrpm); -void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb); void svm_enable_lbrv(struct kvm_vcpu *vcpu); void svm_update_lbrv(struct kvm_vcpu *vcpu); @@ -776,8 +814,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) int __init nested_svm_init_msrpm_merge_offsets(void); -int enter_svm_guest_mode(struct kvm_vcpu *vcpu, - u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun); +int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb_gpa, bool from_vmrun); void svm_leave_nested(struct kvm_vcpu *vcpu); void svm_free_nested(struct vcpu_svm *svm); int svm_allocate_nested(struct vcpu_svm *svm); @@ -785,14 +822,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu); void svm_copy_vmrun_state(struct vmcb_save_area *to_save, struct vmcb_save_area *from_save); void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb); -int nested_svm_vmexit(struct vcpu_svm *svm); +void nested_svm_vmexit(struct vcpu_svm *svm); -static inline int nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) +static inline void nested_svm_simple_vmexit(struct vcpu_svm *svm, u32 exit_code) { svm->vmcb->control.exit_code = exit_code; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; - return nested_svm_vmexit(svm); + nested_svm_vmexit(svm); } int nested_svm_exit_handled(struct vcpu_svm *svm); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 937aeb474af7..3fe88f29be7a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2273,7 +2273,7 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) { - if (vmx->nested.nested_run_pending && + if (vmx->vcpu.arch.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) return vmcs12->guest_ia32_efer; else if 
(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) @@ -2513,7 +2513,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 /* * Interrupt/Exception Fields */ - if (vmx->nested.nested_run_pending) { + if (vmx->vcpu.arch.nested_run_pending) { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, @@ -2621,7 +2621,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); } - if (kvm_mpx_supported() && vmx->nested.nested_run_pending && + if (kvm_mpx_supported() && vmx->vcpu.arch.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); } @@ -2718,7 +2718,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, !(evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); } - if (vmx->nested.nested_run_pending && + if (vcpu->arch.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmx_guest_debugctl_write(vcpu, vmcs12->guest_ia32_debugctl & @@ -2728,13 +2728,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmx_guest_debugctl_write(vcpu, vmx->nested.pre_vmenter_debugctl); } - if (!vmx->nested.nested_run_pending || + if (!vcpu->arch.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) vmcs_write_cet_state(vcpu, vmx->nested.pre_vmenter_s_cet, vmx->nested.pre_vmenter_ssp, vmx->nested.pre_vmenter_ssp_tbl); - if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || + if (kvm_mpx_supported() && (!vcpu->arch.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); @@ -2747,7 +2747,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); - if (vmx->nested.nested_run_pending && + if (vcpu->arch.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); vcpu->arch.pat = vmcs12->guest_ia32_pat; @@ -3349,7 +3349,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to * CR0.PG) is 1. */ - if (to_vmx(vcpu)->nested.nested_run_pending && + if (vcpu->arch.nested_run_pending && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || @@ -3627,15 +3627,15 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, kvm_service_local_tlb_flush_requests(vcpu); - if (!vmx->nested.nested_run_pending || + if (!vcpu->arch.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.pre_vmenter_debugctl = vmx_guest_debugctl_read(); if (kvm_mpx_supported() && - (!vmx->nested.nested_run_pending || + (!vcpu->arch.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); - if (!vmx->nested.nested_run_pending || + if (!vcpu->arch.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_CET_STATE)) vmcs_read_cet_state(vcpu, &vmx->nested.pre_vmenter_s_cet, &vmx->nested.pre_vmenter_ssp, @@ -3844,7 +3844,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * We're finally done with prerequisite checking, and can start with * the nested entry. 
*/ - vmx->nested.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; vmx->nested.has_preemption_timer_deadline = false; status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) @@ -3876,12 +3876,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && (vmcs12->guest_rflags & X86_EFLAGS_IF))) { - vmx->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; return kvm_emulate_halt_noskip(vcpu); } break; case GUEST_ACTIVITY_WAIT_SIPI: - vmx->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; kvm_set_mp_state(vcpu, KVM_MP_STATE_INIT_RECEIVED); break; default: @@ -3891,7 +3891,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return 1; vmentry_failed: - vmx->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) return 0; if (status == NVMX_VMENTRY_VMEXIT) @@ -4288,7 +4288,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) * previously injected event, the pending exception occurred while said * event was being delivered and thus needs to be handled. */ - bool block_nested_exceptions = vmx->nested.nested_run_pending; + bool block_nested_exceptions = vcpu->arch.nested_run_pending; /* * Events that don't require injection, i.e. 
that are virtualized by * hardware, aren't blocked by a pending VM-Enter as KVM doesn't need @@ -4657,7 +4657,7 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (nested_cpu_has_preemption_timer(vmcs12) && vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && - !vmx->nested.nested_run_pending) + !vcpu->arch.nested_run_pending) vmcs12->vmx_preemption_timer_value = vmx_get_preemption_timer_value(vcpu); @@ -5056,7 +5056,7 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx->nested.mtf_pending = false; /* trying to cancel vmlaunch/vmresume is a bug */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); + kvm_warn_on_nested_run_pending(vcpu); #ifdef CONFIG_KVM_HYPERV if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { @@ -6679,7 +6679,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) unsigned long exit_qual; u32 exit_intr_info; - WARN_ON_ONCE(vmx->nested.nested_run_pending); + kvm_warn_on_nested_run_pending(vcpu); /* * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM @@ -6775,7 +6775,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu)) { kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; - if (vmx->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; if (vmx->nested.mtf_pending) @@ -6850,7 +6850,7 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, void vmx_leave_nested(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) { - to_vmx(vcpu)->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; nested_vmx_vmexit(vcpu, -1, 0, 0); } free_nested(vcpu); @@ -7008,8 +7008,10 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) return 0; - vmx->nested.nested_run_pending = - !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); + if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING) + vcpu->arch.nested_run_pending = 
KVM_NESTED_RUN_PENDING_UNTRUSTED; + else + vcpu->arch.nested_run_pending = 0; vmx->nested.mtf_pending = !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); @@ -7054,7 +7056,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return 0; error_guest_mode: - vmx->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; return ret; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d16427a079f6..d76a21c38506 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5279,7 +5279,7 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { - if (to_vmx(vcpu)->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) return -EBUSY; /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ @@ -5306,7 +5306,7 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { - if (to_vmx(vcpu)->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) return -EBUSY; /* @@ -6118,7 +6118,7 @@ static bool vmx_unhandleable_emulation_required(struct kvm_vcpu *vcpu) * only reachable if userspace modifies L2 guest state after KVM has * performed the nested VM-Enter consistency checks. */ - if (vmx->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) return true; /* @@ -6802,7 +6802,7 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * invalid guest state should never happen as that means KVM knowingly * allowed a nested VM-Enter with an invalid vmcs12. More below. */ - if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + if (KVM_BUG_ON(vcpu->arch.nested_run_pending, vcpu->kvm)) return -EIO; if (is_guest_mode(vcpu)) { @@ -7730,11 +7730,11 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) * Track VMLAUNCH/VMRESUME that have made past guest state * checking. 
*/ - if (vmx->nested.nested_run_pending && + if (vcpu->arch.nested_run_pending && !vmx_get_exit_reason(vcpu).failed_vmentry) ++vcpu->stat.nested_run; - vmx->nested.nested_run_pending = 0; + vcpu->arch.nested_run_pending = 0; } if (unlikely(vmx->fail)) @@ -8491,7 +8491,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu) int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ - if (to_vmx(vcpu)->nested.nested_run_pending) + if (vcpu->arch.nested_run_pending) return -EBUSY; return !is_smm(vcpu); } @@ -8536,7 +8536,7 @@ int vmx_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) if (ret != NVMX_VMENTRY_SUCCESS) return 1; - vmx->nested.nested_run_pending = 1; + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING; vmx->nested.smm.guest_mode = false; } return 0; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 70bfe81dea54..db84e8001da5 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -138,9 +138,6 @@ struct nested_vmx { */ bool enlightened_vmcs_enabled; - /* L2 must run next, and mustn't decide to exit to L1. */ - bool nested_run_pending; - /* Pending MTF VM-exit into L1. 
*/ bool mtf_pending; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 33084df3865c..d65edcf8f30d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -351,6 +351,9 @@ static const u32 msrs_to_save_base[] = { MSR_IA32_U_CET, MSR_IA32_S_CET, MSR_IA32_PL0_SSP, MSR_IA32_PL1_SSP, MSR_IA32_PL2_SSP, MSR_IA32_PL3_SSP, MSR_IA32_INT_SSP_TAB, + MSR_IA32_DEBUGCTLMSR, + MSR_IA32_LASTBRANCHFROMIP, MSR_IA32_LASTBRANCHTOIP, + MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, }; static const u32 msrs_to_save_pmu[] = { @@ -864,9 +867,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned int nr, vcpu->arch.exception.error_code = error_code; vcpu->arch.exception.has_payload = has_payload; vcpu->arch.exception.payload = payload; - if (!is_guest_mode(vcpu)) - kvm_deliver_exception_payload(vcpu, - &vcpu->arch.exception); return; } @@ -5531,18 +5531,8 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, return 0; } -static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) +static struct kvm_queued_exception *kvm_get_exception_to_save(struct kvm_vcpu *vcpu) { - struct kvm_queued_exception *ex; - - process_nmi(vcpu); - -#ifdef CONFIG_KVM_SMM - if (kvm_check_request(KVM_REQ_SMI, vcpu)) - process_smi(vcpu); -#endif - /* * KVM's ABI only allows for one exception to be migrated. 
Luckily, * the only time there can be two queued exceptions is if there's a @@ -5553,21 +5543,46 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, if (vcpu->arch.exception_vmexit.pending && !vcpu->arch.exception.pending && !vcpu->arch.exception.injected) - ex = &vcpu->arch.exception_vmexit; - else - ex = &vcpu->arch.exception; + return &vcpu->arch.exception_vmexit; + + return &vcpu->arch.exception; +} + +static void kvm_handle_exception_payload_quirk(struct kvm_vcpu *vcpu) +{ + struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu); /* - * In guest mode, payload delivery should be deferred if the exception - * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1 - * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability, - * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not - * propagate the payload and so it cannot be safely deferred. Deliver - * the payload if the capability hasn't been requested. + * If KVM_CAP_EXCEPTION_PAYLOAD is disabled, then (prematurely) deliver + * the pending exception payload when userspace saves *any* vCPU state + * that interacts with exception payloads to avoid breaking userspace. + * + * Architecturally, KVM must not deliver an exception payload until the + * exception is actually injected, e.g. to avoid losing pending #DB + * information (which VMX tracks in the VMCS), and to avoid clobbering + * state if the exception is never injected for whatever reason. But + * if KVM_CAP_EXCEPTION_PAYLOAD isn't enabled, then userspace may or + * may not propagate the payload across save+restore, and so KVM can't + * safely defer delivery of the payload. 
*/ if (!vcpu->kvm->arch.exception_payload_enabled && ex->pending && ex->has_payload) kvm_deliver_exception_payload(vcpu, ex); +} + +static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + struct kvm_queued_exception *ex = kvm_get_exception_to_save(vcpu); + + process_nmi(vcpu); + +#ifdef CONFIG_KVM_SMM + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); +#endif + + kvm_handle_exception_payload_quirk(vcpu); memset(events, 0, sizeof(*events)); @@ -5746,6 +5761,8 @@ static int kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, vcpu->arch.guest_state_protected) return -EINVAL; + kvm_handle_exception_payload_quirk(vcpu); + memset(dbgregs, 0, sizeof(*dbgregs)); BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); @@ -8897,6 +8914,11 @@ static bool emulator_is_canonical_addr(struct x86_emulate_ctxt *ctxt, return !is_noncanonical_address(addr, emul_to_vcpu(ctxt), flags); } +static bool emulator_page_address_valid(struct x86_emulate_ctxt *ctxt, gpa_t gpa) +{ + return page_address_valid(emul_to_vcpu(ctxt), gpa); +} + static const struct x86_emulate_ops emulate_ops = { .vm_bugged = emulator_vm_bugged, .read_gpr = emulator_read_gpr, @@ -8944,6 +8966,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_xcr = emulator_set_xcr, .get_untagged_addr = emulator_get_untagged_addr, .is_canonical_addr = emulator_is_canonical_addr, + .page_address_valid = emulator_page_address_valid, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -11916,6 +11939,13 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) static int kvm_x86_vcpu_pre_run(struct kvm_vcpu *vcpu) { + /* + * Userspace may have modified vCPU state, mark nested_run_pending as + * "untrusted" to avoid triggering false-positive WARNs. 
+ */ + if (vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING) + vcpu->arch.nested_run_pending = KVM_NESTED_RUN_PENDING_UNTRUSTED; + /* * SIPI_RECEIVED is obsolete; KVM leaves the vCPU in Wait-For-SIPI and * tracks the pending SIPI separately. SIPI_RECEIVED is still accepted @@ -12156,6 +12186,8 @@ static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) if (vcpu->arch.guest_state_protected) goto skip_protected_regs; + kvm_handle_exception_payload_quirk(vcpu); + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44a28d343d40..38a905fa86de 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -188,6 +188,16 @@ static inline bool kvm_can_set_cpuid_and_feature_msrs(struct kvm_vcpu *vcpu) return vcpu->arch.last_vmentry_cpu == -1 && !is_guest_mode(vcpu); } +/* + * WARN if a nested VM-Enter is pending completion, and userspace hasn't gained + * control since the nested VM-Enter was initiated (in which case, userspace + * may have modified vCPU state to induce an architecturally invalid VM-Exit). 
+ */ +static inline void kvm_warn_on_nested_run_pending(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(vcpu->arch.nested_run_pending == KVM_NESTED_RUN_PENDING); +} + static inline void kvm_set_mp_state(struct kvm_vcpu *vcpu, int mp_state) { vcpu->arch.mp_state = mp_state; diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 8ad649a8e936..e79561deffbf 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -111,8 +111,11 @@ TEST_GEN_PROGS_x86 += x86/state_test TEST_GEN_PROGS_x86 += x86/vmx_preemption_timer_test TEST_GEN_PROGS_x86 += x86/svm_vmcall_test TEST_GEN_PROGS_x86 += x86/svm_int_ctl_test +TEST_GEN_PROGS_x86 += x86/svm_nested_clear_efer_svme TEST_GEN_PROGS_x86 += x86/svm_nested_shutdown_test TEST_GEN_PROGS_x86 += x86/svm_nested_soft_inject_test +TEST_GEN_PROGS_x86 += x86/svm_nested_vmcb12_gpa +TEST_GEN_PROGS_x86 += x86/svm_lbr_nested_state TEST_GEN_PROGS_x86 += x86/tsc_scaling_sync TEST_GEN_PROGS_x86 += x86/sync_regs_test TEST_GEN_PROGS_x86 += x86/ucna_injection_test diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 732a9d816a70..d8634a760a60 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1390,6 +1390,11 @@ static inline bool kvm_is_ignore_msrs(void) return get_kvm_param_bool("ignore_msrs"); } +static inline bool kvm_is_lbrv_enabled(void) +{ + return !!get_kvm_amd_param_integer("lbrv"); +} + uint64_t *vm_get_pte(struct kvm_vm *vm, uint64_t vaddr); uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, diff --git a/tools/testing/selftests/kvm/include/x86/svm.h b/tools/testing/selftests/kvm/include/x86/svm.h index 10b30b38bb3f..c8539166270e 100644 --- a/tools/testing/selftests/kvm/include/x86/svm.h +++ b/tools/testing/selftests/kvm/include/x86/svm.h @@ -97,13 +97,13 @@ struct __attribute__ 
((__packed__)) vmcb_control_area { u64 exit_info_2; u32 exit_int_info; u32 exit_int_info_err; - u64 nested_ctl; + u64 misc_ctl; u64 avic_vapic_bar; u8 reserved_4[8]; u32 event_inj; u32 event_inj_err; u64 nested_cr3; - u64 virt_ext; + u64 misc_ctl2; u32 clean; u32 reserved_5; u64 next_rip; @@ -155,9 +155,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define AVIC_ENABLE_SHIFT 31 #define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) -#define LBR_CTL_ENABLE_MASK BIT_ULL(0) -#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1) - #define SVM_INTERRUPT_SHADOW_MASK 1 #define SVM_IOIO_STR_SHIFT 2 @@ -175,8 +172,11 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL #define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL -#define SVM_NESTED_CTL_NP_ENABLE BIT(0) -#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) +#define SVM_MISC_ENABLE_NP BIT(0) +#define SVM_MISC_ENABLE_SEV BIT(1) + +#define SVM_MISC2_ENABLE_V_LBR BIT_ULL(0) +#define SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE BIT_ULL(1) struct __attribute__ ((__packed__)) vmcb_seg { u16 selector; diff --git a/tools/testing/selftests/kvm/lib/x86/svm.c b/tools/testing/selftests/kvm/lib/x86/svm.c index 2e5c480c9afd..eb20b00112c7 100644 --- a/tools/testing/selftests/kvm/lib/x86/svm.c +++ b/tools/testing/selftests/kvm/lib/x86/svm.c @@ -126,7 +126,7 @@ void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_r guest_regs.rdi = (u64)svm; if (svm->ncr3_gpa) { - ctrl->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; + ctrl->misc_ctl |= SVM_MISC_ENABLE_NP; ctrl->nested_cr3 = svm->ncr3_gpa; } } diff --git a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c index 6764a48f9d4d..71717118d692 100644 --- a/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c +++ b/tools/testing/selftests/kvm/x86/nested_vmsave_vmload_test.c @@ -79,8 +79,8 @@ static void l1_guest_code(struct svm_test_data *svm) svm->vmcb->control.intercept |= 
(BIT_ULL(INTERCEPT_VMSAVE) | BIT_ULL(INTERCEPT_VMLOAD)); - /* ..VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK cleared.. */ - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + /* ..SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE cleared.. */ + svm->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; run_guest(svm->vmcb, svm->vmcb_gpa); @@ -90,8 +90,8 @@ static void l1_guest_code(struct svm_test_data *svm) run_guest(svm->vmcb, svm->vmcb_gpa); GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMLOAD); - /* ..and VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK set */ + /* ..and SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE set */ svm->vmcb->save.rip = (u64)l2_guest_code_vmsave; run_guest(svm->vmcb, svm->vmcb_gpa); @@ -106,20 +106,20 @@ static void l1_guest_code(struct svm_test_data *svm) BIT_ULL(INTERCEPT_VMLOAD)); /* - * Without VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be + * Without SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE, the GPA will be * interpreted as an L1 GPA, so VMCB0 should be used. */ svm->vmcb->save.rip = (u64)l2_guest_code_vmcb0; - svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + svm->vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; run_guest(svm->vmcb, svm->vmcb_gpa); GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); /* - * With VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK, the GPA will be interpeted as + * With SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE, the GPA will be interpreted as * an L2 GPA, and translated through the NPT to VMCB1. 
*/ svm->vmcb->save.rip = (u64)l2_guest_code_vmcb1; - svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; + svm->vmcb->control.misc_ctl2 |= SVM_MISC2_ENABLE_V_VMLOAD_VMSAVE; run_guest(svm->vmcb, svm->vmcb_gpa); GUEST_ASSERT_EQ(svm->vmcb->control.exit_code, SVM_EXIT_VMMCALL); diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index f2c7a1c297e3..992a52504a4a 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -26,7 +26,9 @@ void svm_l2_guest_code(void) GUEST_SYNC(4); /* Exit to L1 */ vmcall(); + clgi(); GUEST_SYNC(6); + stgi(); /* Done, exit to L1 and never come back. */ vmcall(); } @@ -41,6 +43,8 @@ static void svm_l1_guest_code(struct svm_test_data *svm) generic_svm_setup(svm, svm_l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + vmcb->control.int_ctl |= (V_GIF_ENABLE_MASK | V_GIF_MASK); + GUEST_SYNC(3); run_guest(vmcb, svm->vmcb_gpa); GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL); @@ -222,6 +226,35 @@ static void __attribute__((__flatten__)) guest_code(void *arg) GUEST_DONE(); } +void svm_check_nested_state(int stage, struct kvm_x86_state *state) +{ + struct vmcb *vmcb = (struct vmcb *)state->nested.data.svm; + + if (kvm_cpu_has(X86_FEATURE_VGIF)) { + if (stage == 4) + TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 1); + if (stage == 6) + TEST_ASSERT_EQ(!!(vmcb->control.int_ctl & V_GIF_MASK), 0); + } + + if (kvm_cpu_has(X86_FEATURE_NRIPS)) { + /* + * GUEST_SYNC() causes IO emulation in KVM, in which case the + * RIP is advanced before exiting to userspace. Hence, the RIP + * in the saved state should be the same as nRIP saved by the + * CPU in the VMCB. 
+ */ + if (stage == 6) + TEST_ASSERT_EQ(vmcb->control.next_rip, state->regs.rip); + } +} + +void check_nested_state(int stage, struct kvm_x86_state *state) +{ + if (kvm_has_cap(KVM_CAP_NESTED_STATE) && kvm_cpu_has(X86_FEATURE_SVM)) + svm_check_nested_state(stage, state); +} + int main(int argc, char *argv[]) { uint64_t *xstate_bv, saved_xstate_bv; @@ -278,6 +311,8 @@ int main(int argc, char *argv[]) kvm_vm_release(vm); + check_nested_state(stage, state); + /* Restore state in a new VM. */ vcpu = vm_recreate_with_one_vcpu(vm); vcpu_load_state(vcpu, state); diff --git a/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c new file mode 100644 index 000000000000..ff99438824d3 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_lbr_nested_state.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026, Google, Inc. + */ + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" + + +#define L2_GUEST_STACK_SIZE 64 + +#define DO_BRANCH() do { asm volatile("jmp 1f\n 1: nop"); } while (0) + +struct lbr_branch { + u64 from, to; +}; + +volatile struct lbr_branch l2_branch; + +#define RECORD_AND_CHECK_BRANCH(b) \ +do { \ + wrmsr(MSR_IA32_DEBUGCTLMSR, DEBUGCTLMSR_LBR); \ + DO_BRANCH(); \ + (b)->from = rdmsr(MSR_IA32_LASTBRANCHFROMIP); \ + (b)->to = rdmsr(MSR_IA32_LASTBRANCHTOIP); \ + /* Disable LBR right after to avoid overriding the IPs */ \ + wrmsr(MSR_IA32_DEBUGCTLMSR, 0); \ + \ + GUEST_ASSERT_NE((b)->from, 0); \ + GUEST_ASSERT_NE((b)->to, 0); \ +} while (0) + +#define CHECK_BRANCH_MSRS(b) \ +do { \ + GUEST_ASSERT_EQ((b)->from, rdmsr(MSR_IA32_LASTBRANCHFROMIP)); \ + GUEST_ASSERT_EQ((b)->to, rdmsr(MSR_IA32_LASTBRANCHTOIP)); \ +} while (0) + +#define CHECK_BRANCH_VMCB(b, vmcb) \ +do { \ + GUEST_ASSERT_EQ((b)->from, vmcb->save.br_from); \ + GUEST_ASSERT_EQ((b)->to, vmcb->save.br_to); \ +} while (0) + +static void l2_guest_code(struct svm_test_data 
*svm) +{ + /* Record a branch, trigger save/restore, and make sure LBRs are intact */ + RECORD_AND_CHECK_BRANCH(&l2_branch); + GUEST_SYNC(true); + CHECK_BRANCH_MSRS(&l2_branch); + vmmcall(); +} + +static void l1_guest_code(struct svm_test_data *svm, bool nested_lbrv) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + struct vmcb *vmcb = svm->vmcb; + struct lbr_branch l1_branch; + + /* Record a branch, trigger save/restore, and make sure LBRs are intact */ + RECORD_AND_CHECK_BRANCH(&l1_branch); + GUEST_SYNC(true); + CHECK_BRANCH_MSRS(&l1_branch); + + /* Run L2, which will also do the same */ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + if (nested_lbrv) + vmcb->control.misc_ctl2 = SVM_MISC2_ENABLE_V_LBR; + else + vmcb->control.misc_ctl2 &= ~SVM_MISC2_ENABLE_V_LBR; + + run_guest(vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + /* Trigger save/restore one more time before checking, just for kicks */ + GUEST_SYNC(true); + + /* + * If SVM_MISC2_ENABLE_V_LBR is set, L1 and L2 should have separate LBR MSRs, so + * expect L1's LBRs to remain intact and L2 LBRs to be in the VMCB. + * Otherwise, the MSRs are shared between L1 & L2 so expect L2's LBRs. + */ + if (nested_lbrv) { + CHECK_BRANCH_MSRS(&l1_branch); + CHECK_BRANCH_VMCB(&l2_branch, vmcb); + } else { + CHECK_BRANCH_MSRS(&l2_branch); + } + GUEST_DONE(); +} + +void test_lbrv_nested_state(bool nested_lbrv) +{ + struct kvm_x86_state *state = NULL; + struct kvm_vcpu *vcpu; + vm_vaddr_t svm_gva; + struct kvm_vm *vm; + struct ucall uc; + + pr_info("Testing with nested LBRV %s\n", nested_lbrv ?
"enabled" : "disabled"); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + vcpu_alloc_svm(vm, &svm_gva); + vcpu_args_set(vcpu, 2, svm_gva, nested_lbrv); + + for (;;) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + switch (get_ucall(vcpu, &uc)) { + case UCALL_SYNC: + /* Save the vCPU state and restore it in a new VM on sync */ + pr_info("Guest triggered save/restore.\n"); + state = vcpu_save_state(vcpu); + kvm_vm_release(vm); + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + break; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } +done: + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + TEST_REQUIRE(kvm_is_lbrv_enabled()); + + test_lbrv_nested_state(/*nested_lbrv=*/false); + test_lbrv_nested_state(/*nested_lbrv=*/true); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c new file mode 100644 index 000000000000..a521a9eed061 --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_nested_clear_efer_svme.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026, Google LLC. 
+ */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" + + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(void) +{ + unsigned long efer = rdmsr(MSR_EFER); + + /* generic_svm_setup() initializes EFER_SVME set for L2 */ + GUEST_ASSERT(efer & EFER_SVME); + wrmsr(MSR_EFER, efer & ~EFER_SVME); + + /* Unreachable, L1 should be shutdown */ + GUEST_ASSERT(0); +} + +static void l1_guest_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + run_guest(svm->vmcb, svm->vmcb_gpa); + + /* Unreachable, L1 should be shutdown */ + GUEST_ASSERT(0); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t nested_gva = 0; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + vcpu_alloc_svm(vm, &nested_gva); + vcpu_args_set(vcpu, 1, nested_gva); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c new file mode 100644 index 000000000000..569869bed20b --- /dev/null +++ b/tools/testing/selftests/kvm/x86/svm_nested_vmcb12_gpa.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2026, Google LLC. 
+ */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" +#include "kvm_test_harness.h" +#include "test_util.h" + + +#define L2_GUEST_STACK_SIZE 64 + +#define SYNC_GP 101 +#define SYNC_L2_STARTED 102 + +static unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + +static void guest_gp_handler(struct ex_regs *regs) +{ + GUEST_SYNC(SYNC_GP); +} + +static void l2_code(void) +{ + GUEST_SYNC(SYNC_L2_STARTED); + vmcall(); +} + +static void l1_vmrun(struct svm_test_data *svm, u64 gpa) +{ + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + asm volatile ("vmrun %[gpa]" : : [gpa] "a" (gpa) : "memory"); +} + +static void l1_vmload(struct svm_test_data *svm, u64 gpa) +{ + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + asm volatile ("vmload %[gpa]" : : [gpa] "a" (gpa) : "memory"); +} + +static void l1_vmsave(struct svm_test_data *svm, u64 gpa) +{ + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + asm volatile ("vmsave %[gpa]" : : [gpa] "a" (gpa) : "memory"); +} + +static void l1_vmexit(struct svm_test_data *svm, u64 gpa) +{ + generic_svm_setup(svm, l2_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + GUEST_DONE(); +} + +static u64 unmappable_gpa(struct kvm_vcpu *vcpu) +{ + struct userspace_mem_region *region; + u64 region_gpa_end, vm_gpa_end = 0; + int i; + + hash_for_each(vcpu->vm->regions.slot_hash, i, region, slot_node) { + region_gpa_end = region->region.guest_phys_addr + region->region.memory_size; + vm_gpa_end = max(vm_gpa_end, region_gpa_end); + } + + return vm_gpa_end; +} + +static void test_invalid_vmcb12(struct kvm_vcpu *vcpu) +{ + vm_vaddr_t nested_gva = 0; + struct ucall uc; + + + vm_install_exception_handler(vcpu->vm, GP_VECTOR, guest_gp_handler); + vcpu_alloc_svm(vcpu->vm, &nested_gva); + vcpu_args_set(vcpu, 2, nested_gva, -1ULL); + vcpu_run(vcpu); + 
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); + TEST_ASSERT_EQ(uc.args[1], SYNC_GP); +} + +static void test_unmappable_vmcb12(struct kvm_vcpu *vcpu) +{ + vm_vaddr_t nested_gva = 0; + + vcpu_alloc_svm(vcpu->vm, &nested_gva); + vcpu_args_set(vcpu, 2, nested_gva, unmappable_gpa(vcpu)); + vcpu_run(vcpu); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_INTERNAL_ERROR); + TEST_ASSERT_EQ(vcpu->run->emulation_failure.suberror, KVM_INTERNAL_ERROR_EMULATION); +} + +static void test_unmappable_vmcb12_vmexit(struct kvm_vcpu *vcpu) +{ + struct kvm_x86_state *state; + vm_vaddr_t nested_gva = 0; + struct ucall uc; + + /* + * Enter L2 (with a legit vmcb12 GPA), then overwrite vmcb12 GPA with an + * unmappable GPA. KVM will fail to map vmcb12 on nested VM-Exit and + * cause a shutdown. + */ + vcpu_alloc_svm(vcpu->vm, &nested_gva); + vcpu_args_set(vcpu, 2, nested_gva, unmappable_gpa(vcpu)); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + TEST_ASSERT_EQ(get_ucall(vcpu, &uc), UCALL_SYNC); + TEST_ASSERT_EQ(uc.args[1], SYNC_L2_STARTED); + + state = vcpu_save_state(vcpu); + state->nested.hdr.svm.vmcb_pa = unmappable_gpa(vcpu); + vcpu_load_state(vcpu, state); + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_SHUTDOWN); + + kvm_x86_state_cleanup(state); +} + +KVM_ONE_VCPU_TEST_SUITE(vmcb12_gpa); + +KVM_ONE_VCPU_TEST(vmcb12_gpa, vmrun_invalid, l1_vmrun) +{ + test_invalid_vmcb12(vcpu); +} + +KVM_ONE_VCPU_TEST(vmcb12_gpa, vmload_invalid, l1_vmload) +{ + test_invalid_vmcb12(vcpu); +} + +KVM_ONE_VCPU_TEST(vmcb12_gpa, vmsave_invalid, l1_vmsave) +{ + test_invalid_vmcb12(vcpu); +} + +KVM_ONE_VCPU_TEST(vmcb12_gpa, vmrun_unmappable, l1_vmrun) +{ + test_unmappable_vmcb12(vcpu); +} + +KVM_ONE_VCPU_TEST(vmcb12_gpa, vmload_unmappable, l1_vmload) +{ + test_unmappable_vmcb12(vcpu); +} + +KVM_ONE_VCPU_TEST(vmcb12_gpa, vmsave_unmappable, l1_vmsave) +{ + test_unmappable_vmcb12(vcpu); +} + +/* + * Invalid vmcb12_gpa 
cannot be tested for #VMEXIT as KVM_SET_NESTED_STATE will + * reject it. + */ +KVM_ONE_VCPU_TEST(vmcb12_gpa, vmexit_unmappable, l1_vmexit) +{ + test_unmappable_vmcb12_vmexit(vcpu); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_SVM)); + + return test_harness_run(argc, argv); +}