diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index aa0031108bc1..8d62a6fdb152 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3079,6 +3079,26 @@ Kernel parameters Default is Y (on). + kvm.enable_pmu=[KVM,X86] + If enabled, KVM will virtualize PMU functionality based + on the virtual CPU model defined by userspace. This + can be overridden on a per-VM basis via + KVM_CAP_PMU_CAPABILITY. + + If disabled, KVM will not virtualize PMU functionality, + e.g. MSRs, PMCs, PMIs, etc., even if userspace defines + a virtual CPU model that contains PMU assets. + + Note, KVM's vPMU support implicitly requires running + with an in-kernel local APIC, e.g. to deliver PMIs to + the guest. Running without an in-kernel local APIC is + not supported, though KVM will allow such a combination + (with severely degraded functionality). + + See also enable_mediated_pmu. + + Default is Y (on). + kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86] If enabled, KVM will enable virtualization in hardware when KVM is loaded, and disable virtualization when KVM @@ -3125,6 +3145,35 @@ Kernel parameters If the value is 0 (the default), KVM will pick a period based on the ratio, such that a page is zapped after 1 hour on average. + kvm-{amd,intel}.enable_mediated_pmu=[KVM,AMD,INTEL] + If enabled, KVM will provide a mediated virtual PMU, + instead of the default perf-based virtual PMU (if + kvm.enable_pmu is true and PMU is enumerated via the + virtual CPU model). + + With a perf-based vPMU, KVM operates as a user of perf, + i.e. emulates guest PMU counters using perf events. + KVM-created perf events are managed by perf as regular + (guest-only) events, e.g. are scheduled in/out, contend + for hardware resources, etc. 
Using a perf-based vPMU + allows guest and host usage of the PMU to co-exist, but + incurs non-trivial overhead and can result in silently + dropped guest events (due to resource contention). + + With a mediated vPMU, hardware PMU state is context + switched around the world switch to/from the guest. + KVM mediates which events the guest can utilize, but + gives the guest direct access to all other PMU assets + when possible (KVM may intercept some accesses if the + virtual CPU model provides a subset of hardware PMU + functionality). Using a mediated vPMU significantly + reduces PMU virtualization overhead and eliminates lost + guest events, but is mutually exclusive with using perf + to profile KVM guests and adds latency to most VM-Exits + (to context switch PMU state). + + Default is N (off). + kvm-amd.nested= [KVM,AMD] Control nested virtualization feature in KVM/SVM. Default is 1 (enabled). diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0f40d3da0550..94d5b0b99fd1 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2413,7 +2413,7 @@ static int __init init_subsystems(void) if (err) goto out; - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); out: if (err) diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index d1c5156e02d8..ac38d0f19dd3 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -402,7 +402,7 @@ static int kvm_loongarch_env_init(void) } kvm_init_gcsr_flag(); - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); /* Register LoongArch IPI interrupt controller interface. 
*/ ret = kvm_loongarch_register_ipi_device(); diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 45536af521f0..0f3fe3986fc0 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -174,7 +174,7 @@ static int __init riscv_kvm_init(void) kvm_riscv_setup_vendor_features(); - kvm_register_perf_callbacks(NULL); + kvm_register_perf_callbacks(); rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (rc) { diff --git a/arch/x86/entry/entry_fred.c b/arch/x86/entry/entry_fred.c index 94e626cc6a07..a9b72997103d 100644 --- a/arch/x86/entry/entry_fred.c +++ b/arch/x86/entry/entry_fred.c @@ -114,6 +114,7 @@ static idtentry_t sysvec_table[NR_SYSTEM_VECTORS] __ro_after_init = { SYSVEC(IRQ_WORK_VECTOR, irq_work), + SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, perf_guest_mediated_pmi_handler), SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi), diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 44656d2fb555..0c92ed5f464b 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1439,6 +1439,8 @@ static int __init amd_core_pmu_init(void) amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + /* Update PMC handling functions */ x86_pmu.enable_all = amd_pmu_v2_enable_all; x86_pmu.disable_all = amd_pmu_v2_disable_all; diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 576baa9a52c5..73ed4d753ac5 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,8 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .pmu = &pmu, }; +static DEFINE_PER_CPU(bool, guest_lvtpc_loaded); + DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); 
@@ -1760,6 +1763,25 @@ void perf_events_lapic_init(void) apic_write(APIC_LVTPC, APIC_DM_NMI); } +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +void perf_load_guest_lvtpc(u32 guest_lvtpc) +{ + u32 masked = guest_lvtpc & APIC_LVT_MASKED; + + apic_write(APIC_LVTPC, + APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); + this_cpu_write(guest_lvtpc_loaded, true); +} +EXPORT_SYMBOL_FOR_KVM(perf_load_guest_lvtpc); + +void perf_put_guest_lvtpc(void) +{ + this_cpu_write(guest_lvtpc_loaded, false); + apic_write(APIC_LVTPC, APIC_DM_NMI); +} +EXPORT_SYMBOL_FOR_KVM(perf_put_guest_lvtpc); +#endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ + static int perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) { @@ -1767,6 +1789,17 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) u64 finish_clock; int ret; + /* + * Ignore all NMIs when the CPU's LVTPC is configured to route PMIs to + * PERF_GUEST_MEDIATED_PMI_VECTOR, i.e. when an NMI can't be due + * to a PMI. Attempting to handle a PMI while the guest's context is + * loaded will generate false positives and clobber guest state. Note, + * the LVTPC is switched to/from the dedicated mediated PMI IRQ vector + * while host events are quiesced. + */ + if (this_cpu_read(guest_lvtpc_loaded)) + return NMI_DONE; + /* * All PMUs/events that share this PMI handler should make sure to * increment active_events for their events. @@ -3073,11 +3106,12 @@ void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) cap->version = x86_pmu.version; cap->num_counters_gp = x86_pmu_num_counters(NULL); cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); - cap->bit_width_gp = x86_pmu.cntval_bits; - cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->bit_width_gp = cap->num_counters_gp ? x86_pmu.cntval_bits : 0; + cap->bit_width_fixed = cap->num_counters_fixed ? 
x86_pmu.cntval_bits : 0; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; cap->pebs_ept = x86_pmu.pebs_ept; + cap->mediated = !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU); } EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bdf3f0d0fe21..1840ca1918d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -5695,6 +5695,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) else pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; + pmu->pmu.capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + intel_pmu_check_event_constraints_all(&pmu->pmu); intel_pmu_check_extra_regs(pmu->extra_regs); @@ -7314,6 +7316,9 @@ __init int intel_pmu_init(void) pr_cont(" AnyThread deprecated, "); } + /* The perf side of core PMU is ready to support the mediated vPMU. */ + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; + /* * Many features on and after V6 require dynamic constraint, * e.g., Arch PEBS, ACR. 
@@ -7405,6 +7410,7 @@ __init int intel_pmu_init(void) case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_AIRMONT_NP: case INTEL_ATOM_SILVERMONT_MID2: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index fa67fda6e45b..f3d5ee07f8f2 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -41,7 +41,7 @@ * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL - * MTL,SRF,GRR,ARL,LNL,PTL + * MTL,SRF,GRR,ARL,LNL,PTL,WCL,NVL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -53,19 +53,20 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * GRR,ARL,LNL,PTL + * GRR,ARL,LNL,PTL,WCL,NVL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, - * PTL + * PTL,WCL,NVL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, - * RPL,SPR,MTL,ARL,LNL,SRF,PTL + * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL, + * NVL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 @@ -78,7 +79,7 @@ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, - * ARL,LNL,PTL + * ARL,LNL,PTL,WCL,NVL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 @@ -97,11 +98,12 @@ * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 
* perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL, + * WCL,NVL * Scope: Package (physical package) * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. * perf code: 0x00 - * Available model: SRF,GRR + * Available model: SRF,GRR,NVL * Scope: A cluster of cores shared L2 cache * */ @@ -527,6 +529,18 @@ static const struct cstate_model lnl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model nvl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -599,6 +613,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates), + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &slm_cstates), X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates), X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates), @@ -638,6 +653,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates), X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates), + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, &srf_cstates), X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates), X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates), @@ -654,6 +670,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), + 
X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &lnl_cstates), + X86_MATCH_VFM(INTEL_NOVALAKE, &nvl_cstates), + X86_MATCH_VFM(INTEL_NOVALAKE_L, &nvl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index 7f5007a4752a..8052596b8503 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -78,6 +78,7 @@ static bool test_intel(int idx, void *data) case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_AIRMONT: + case INTEL_ATOM_AIRMONT_NP: case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_D: diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 6b6d472baa0b..9314642ae93c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,6 +18,9 @@ typedef struct { unsigned int kvm_posted_intr_ipis; unsigned int kvm_posted_intr_wakeup_ipis; unsigned int kvm_posted_intr_nested_ipis; +#endif +#ifdef CONFIG_GUEST_PERF_EVENTS + unsigned int perf_guest_mediated_pmis; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 3218770670d3..42bf6a58ec36 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -746,6 +746,12 @@ DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested # define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif +# ifdef CONFIG_GUEST_PERF_EVENTS +DECLARE_IDTENTRY_SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, sysvec_perf_guest_mediated_pmi_handler); +#else +# define fred_sysvec_perf_guest_mediated_pmi_handler NULL +#endif + # ifdef CONFIG_X86_POSTED_MSI DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); #else diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 47051871b436..85253fc8e384 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ 
b/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,9 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +/* IRQ vector for PMIs when running a guest with a mediated PMU. */ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index 9159bf1a4730..f0aa6996811f 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -23,5 +23,9 @@ KVM_X86_PMU_OP_OPTIONAL(reset) KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) KVM_X86_PMU_OP_OPTIONAL(cleanup) +KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) +KVM_X86_PMU_OP(mediated_load) +KVM_X86_PMU_OP(mediated_put) + #undef KVM_X86_PMU_OP #undef KVM_X86_PMU_OP_OPTIONAL diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 94cd4dc0e2a1..ff07c45e3c73 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -537,6 +537,7 @@ struct kvm_pmc { */ u64 emulated_counter; u64 eventsel; + u64 eventsel_hw; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* @@ -565,6 +566,7 @@ struct kvm_pmu { unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; + u64 fixed_ctr_ctrl_hw; u64 fixed_ctr_ctrl_rsvd; u64 global_ctrl; u64 global_status; @@ -1503,6 +1505,7 @@ struct kvm_arch { bool bus_lock_detection_enabled; bool enable_pmu; + bool created_mediated_pmu; u32 notify_window; u32 notify_vmexit_flags; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3d0a0950d20a..4d3566bb1a93 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -1219,6 +1219,7 @@ #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 +#define MSR_CORE_PERF_GLOBAL_STATUS_SET 0x00000391 #define MSR_PERF_METRICS 
0x00000329 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7276ba70c88a..0d9af4135e0a 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -301,6 +301,7 @@ struct x86_pmu_capability { unsigned int events_mask; int events_mask_len; unsigned int pebs_ept :1; + unsigned int mediated :1; }; /* @@ -759,6 +760,11 @@ static inline void perf_events_lapic_init(void) { } static inline void perf_check_microcode(void) { } #endif +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +extern void perf_load_guest_lvtpc(u32 guest_lvtpc); +extern void perf_put_guest_lvtpc(void); +#endif + #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data); extern void x86_perf_get_lbr(struct x86_pmu_lbr *lbr); diff --git a/arch/x86/include/asm/unwind_user.h b/arch/x86/include/asm/unwind_user.h index 12064284bc4e..6e469044e4de 100644 --- a/arch/x86/include/asm/unwind_user.h +++ b/arch/x86/include/asm/unwind_user.h @@ -2,11 +2,23 @@ #ifndef _ASM_X86_UNWIND_USER_H #define _ASM_X86_UNWIND_USER_H -#ifdef CONFIG_HAVE_UNWIND_USER_FP +#ifdef CONFIG_UNWIND_USER #include #include +static inline int unwind_user_word_size(struct pt_regs *regs) +{ + /* We can't unwind VM86 stacks */ + if (regs->flags & X86_VM_MASK) + return 0; + return user_64bit_mode(regs) ? 
8 : 4; +} + +#endif /* CONFIG_UNWIND_USER */ + +#ifdef CONFIG_HAVE_UNWIND_USER_FP + #define ARCH_INIT_USER_FP_FRAME(ws) \ .cfa_off = 2*(ws), \ .ra_off = -1*(ws), \ @@ -19,22 +31,11 @@ .fp_off = 0, \ .use_fp = false, -static inline int unwind_user_word_size(struct pt_regs *regs) -{ - /* We can't unwind VM86 stacks */ - if (regs->flags & X86_VM_MASK) - return 0; -#ifdef CONFIG_X86_64 - if (!user_64bit_mode(regs)) - return sizeof(int); -#endif - return sizeof(long); -} - static inline bool unwind_user_at_function_start(struct pt_regs *regs) { return is_uprobe_at_func_entry(regs); } +#define unwind_user_at_function_start unwind_user_at_function_start #endif /* CONFIG_HAVE_UNWIND_USER_FP */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index c85c50019523..b92ff87e3560 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -107,6 +107,7 @@ #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 #define VM_EXIT_LOAD_CET_STATE 0x10000000 +#define VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL 0x40000000 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index f445bec516a0..260456588756 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -158,6 +158,9 @@ static const __initconst struct idt_data apic_idts[] = { INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), # endif +#ifdef CONFIG_GUEST_PERF_EVENTS + INTG(PERF_GUEST_MEDIATED_PMI_VECTOR, asm_sysvec_perf_guest_mediated_pmi_handler), +#endif # ifdef CONFIG_IRQ_WORK INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), # endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index b2fe6181960c..316730e95fc3 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -192,6 +192,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->kvm_posted_intr_wakeup_ipis); 
seq_puts(p, " Posted-interrupt wakeup event\n"); #endif +#ifdef CONFIG_GUEST_PERF_EVENTS + seq_printf(p, "%*s: ", prec, "VPMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->perf_guest_mediated_pmis); + seq_puts(p, " Perf Guest Mediated PMI\n"); +#endif #ifdef CONFIG_X86_POSTED_MSI seq_printf(p, "%*s: ", prec, "PMN"); for_each_online_cpu(j) @@ -349,6 +356,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi) } #endif +#ifdef CONFIG_GUEST_PERF_EVENTS +/* + * Handler for PERF_GUEST_MEDIATED_PMI_VECTOR. + */ +DEFINE_IDTENTRY_SYSVEC(sysvec_perf_guest_mediated_pmi_handler) +{ + apic_eoi(); + inc_irq_stat(perf_guest_mediated_pmis); + perf_guest_handle_mediated_pmi(); +} +#endif + #if IS_ENABLED(CONFIG_KVM) static void dummy_handler(void) {} static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 278f08194ec8..d916bd766c94 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -37,6 +37,7 @@ config KVM_X86 select SCHED_INFO select PERF_EVENTS select GUEST_PERF_EVENTS + select PERF_GUEST_MEDIATED_PMU select HAVE_KVM_MSI select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_NO_POLL diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index ff20b4102173..bd6b785cf261 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -103,7 +103,7 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) +void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; @@ -135,6 +135,13 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) enable_pmu = false; } + if (!enable_pmu || !enable_mediated_pmu || !kvm_host_pmu.mediated || + !pmu_ops->is_mediated_pmu_supported(&kvm_host_pmu)) + enable_mediated_pmu = false; + + if (!enable_mediated_pmu) + 
pmu_ops->write_global_ctrl = NULL; + if (!enable_pmu) { memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); return; @@ -153,6 +160,16 @@ void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); } +void kvm_handle_guest_mediated_pmi(void) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + if (WARN_ON_ONCE(!vcpu || !kvm_vcpu_has_mediated_pmu(vcpu))) + return; + + kvm_make_request(KVM_REQ_PMI, vcpu); +} + static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -362,6 +379,11 @@ static void pmc_update_sample_period(struct kvm_pmc *pmc) void pmc_write_counter(struct kvm_pmc *pmc, u64 val) { + if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) { + pmc->counter = val & pmc_bitmask(pmc); + return; + } + /* * Drop any unconsumed accumulated counts, the WRMSR is a write, not a * read-modify-write. Adjust the counter value so that its value is @@ -498,6 +520,25 @@ static bool pmc_is_event_allowed(struct kvm_pmc *pmc) return is_fixed_event_allowed(filter, pmc->idx); } +static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc) +{ + bool allowed = pmc_is_event_allowed(pmc); + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_gp(pmc)) { + pmc->eventsel_hw &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + if (allowed) + pmc->eventsel_hw |= pmc->eventsel & + ARCH_PERFMON_EVENTSEL_ENABLE; + } else { + u64 mask = intel_fixed_bits_by_idx(pmc->idx - KVM_FIXED_PMC_BASE_IDX, 0xf); + + pmu->fixed_ctr_ctrl_hw &= ~mask; + if (allowed) + pmu->fixed_ctr_ctrl_hw |= pmu->fixed_ctr_ctrl & mask; + } +} + static int reprogram_counter(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); @@ -506,6 +547,11 @@ static int reprogram_counter(struct kvm_pmc *pmc) bool emulate_overflow; u8 fixed_ctr_ctrl; + if (kvm_vcpu_has_mediated_pmu(pmu_to_vcpu(pmu))) { + kvm_mediated_pmu_refresh_event_filter(pmc); + return 0; + } + emulate_overflow = pmc_pause_counter(pmc); if 
(!pmc_is_globally_enabled(pmc) || !pmc_is_locally_enabled(pmc) || @@ -700,6 +746,46 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) return 0; } +static bool kvm_need_any_pmc_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return true; + + /* + * Note! Check *host* PMU capabilities, not KVM's PMU capabilities, as + * KVM's capabilities are constrained based on KVM support, i.e. KVM's + * capabilities themselves may be a subset of hardware capabilities. + */ + return pmu->nr_arch_gp_counters != kvm_host_pmu.num_counters_gp || + pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed; +} + +bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu) +{ + return kvm_need_any_pmc_intercept(vcpu) || + !kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_perf_global_ctrl_intercept); + +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + /* + * VMware allows access to these Pseudo-PMCs even when read via RDPMC + * in Ring3 when CR4.PCE=0. + */ + if (enable_vmware_backdoor) + return true; + + return kvm_need_any_pmc_intercept(vcpu) || + pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) || + pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1); +} +EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_rdpmc_intercept); + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { if (lapic_in_kernel(vcpu)) { @@ -795,6 +881,12 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) pmu->global_ctrl = data; reprogram_counters(pmu, diff); } + /* + * Unconditionally forward writes to vendor code, i.e. to the + * VMC{B,S}, as pmu->global_ctrl is per-VCPU, not per-VMC{B,S}. 
+ */ + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(data); break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: /* @@ -835,11 +927,14 @@ static void kvm_pmu_reset(struct kvm_vcpu *vcpu) pmc->counter = 0; pmc->emulated_counter = 0; - if (pmc_is_gp(pmc)) + if (pmc_is_gp(pmc)) { pmc->eventsel = 0; + pmc->eventsel_hw = 0; + } } - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; + pmu->fixed_ctr_ctrl = pmu->fixed_ctr_ctrl_hw = 0; + pmu->global_ctrl = pmu->global_status = 0; kvm_pmu_call(reset)(vcpu); } @@ -888,9 +983,13 @@ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) * in the global controls). Emulate that behavior when refreshing the * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. */ - if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) + if (pmu->nr_arch_gp_counters && + (kvm_pmu_has_perf_global_ctrl(pmu) || kvm_vcpu_has_mediated_pmu(vcpu))) pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); + if (kvm_vcpu_has_mediated_pmu(vcpu)) + kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, pmu->nr_arch_fixed_counters); @@ -932,10 +1031,45 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); } +static bool pmc_is_pmi_enabled(struct kvm_pmc *pmc) +{ + u8 fixed_ctr_ctrl; + + if (pmc_is_gp(pmc)) + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_INT; + + fixed_ctr_ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, + pmc->idx - KVM_FIXED_PMC_BASE_IDX); + return fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI; +} + static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) { - pmc->emulated_counter++; - kvm_pmu_request_counter_reprogram(pmc); + struct kvm_vcpu *vcpu = pmc->vcpu; + + /* + * For perf-based PMUs, accumulate software-emulated events separately + * from pmc->counter, as pmc->counter is offset by the count of the + * associated perf event. 
Request reprogramming, which will consult + * both emulated and hardware-generated events to detect overflow. + */ + if (!kvm_vcpu_has_mediated_pmu(vcpu)) { + pmc->emulated_counter++; + kvm_pmu_request_counter_reprogram(pmc); + return; + } + + /* + * For mediated PMUs, pmc->counter is updated when the vCPU's PMU is + * put, and will be loaded into hardware when the PMU is loaded. Simply + * increment the counter and signal overflow if it wraps to zero. + */ + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); + if (!pmc->counter) { + pmc_to_pmu(pmc)->global_status |= BIT_ULL(pmc->idx); + if (pmc_is_pmi_enabled(pmc)) + kvm_make_request(KVM_REQ_PMI, vcpu); + } } static inline bool cpl_is_matched(struct kvm_pmc *pmc) @@ -1148,3 +1282,126 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp) kfree(filter); return r; } + +static __always_inline u32 fixed_counter_msr(u32 idx) +{ + return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_counter_msr(u32 idx) +{ + return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static __always_inline u32 gp_eventsel_msr(u32 idx) +{ + return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE; +} + +static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * No need to zero out unexposed GP/fixed counters/selectors since RDPMC + * is intercepted if hardware has counters that aren't visible to the + * guest (KVM will inject #GP as appropriate). 
+ */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + if (pmc->counter != rdpmc(i)) + wrmsrl(gp_counter_msr(i), pmc->counter); + wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); + } + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + if (pmc->counter != rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i)) + wrmsrl(fixed_counter_msr(i), pmc->counter); + } +} + +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + perf_load_guest_context(); + + /* + * Explicitly clear PERF_GLOBAL_CTRL, as "loading" the guest's context + * disables all individual counters (if any were enabled), but doesn't + * globally disable the entire PMU. Loading event selectors and PMCs + * with guest values while PERF_GLOBAL_CTRL is non-zero will generate + * unexpected events and PMIs. + * + * VMX will enable/disable counters at VM-Enter/VM-Exit by atomically + * loading PERF_GLOBAL_CONTROL. SVM effectively performs the switch by + * configuring all events to be GUEST_ONLY. Clear PERF_GLOBAL_CONTROL + * even for SVM to minimize the damage if a perf event is left enabled, + * and to ensure a consistent starting state. + */ + wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0); + + perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC)); + + kvm_pmu_load_guest_pmcs(vcpu); + + kvm_pmu_call(mediated_load)(vcpu); +} + +static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 i; + + /* + * Clear selectors and counters to ensure hardware doesn't count using + * guest controls when the host (perf) restores its state. 
+ */ + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + pmc = &pmu->gp_counters[i]; + + pmc->counter = rdpmc(i); + if (pmc->counter) + wrmsrq(gp_counter_msr(i), 0); + if (pmc->eventsel_hw) + wrmsrq(gp_eventsel_msr(i), 0); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + pmc = &pmu->fixed_counters[i]; + + pmc->counter = rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i); + if (pmc->counter) + wrmsrq(fixed_counter_msr(i), 0); + } +} + +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + if (!kvm_vcpu_has_mediated_pmu(vcpu) || + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) + return; + + lockdep_assert_irqs_disabled(); + + /* + * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's + * atomically cleared on VM-Exit, i.e. doesn't need to be cleared here. + */ + kvm_pmu_call(mediated_put)(vcpu); + + kvm_pmu_put_guest_pmcs(vcpu); + + perf_put_guest_lvtpc(); + + perf_put_guest_context(); +} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5c3939e91f1d..0925246731cb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -37,13 +37,26 @@ struct kvm_pmu_ops { void (*deliver_pmi)(struct kvm_vcpu *vcpu); void (*cleanup)(struct kvm_vcpu *vcpu); + bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); + void (*mediated_load)(struct kvm_vcpu *vcpu); + void (*mediated_put)(struct kvm_vcpu *vcpu); + void (*write_global_ctrl)(u64 global_ctrl); + const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; const int MIN_NR_GP_COUNTERS; + + const u32 PERF_GLOBAL_CTRL; + const u32 GP_EVENTSEL_BASE; + const u32 GP_COUNTER_BASE; + const u32 FIXED_COUNTER_BASE; + const u32 MSR_STRIDE; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); +void kvm_handle_guest_mediated_pmi(void); + static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) { /* @@ -58,6 +71,11 @@ static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) return pmu->version > 1; } +static inline bool kvm_vcpu_has_mediated_pmu(struct kvm_vcpu *vcpu) +{ 
+ return enable_mediated_pmu && vcpu_to_pmu(vcpu)->version; +} + /* * KVM tracks all counters in 64-bit bitmaps, with general purpose counters * mapped to bits 31:0 and fixed counters mapped to 63:32, e.g. fixed counter 0 @@ -101,6 +119,9 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) { u64 counter, enabled, running; + if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) + return pmc->counter & pmc_bitmask(pmc); + counter = pmc->counter + pmc->emulated_counter; if (pmc->perf_event && !pmc->is_paused) @@ -174,7 +195,7 @@ static inline bool pmc_is_locally_enabled(struct kvm_pmc *pmc) extern struct x86_pmu_capability kvm_pmu_cap; -void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); +void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops); void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); @@ -213,6 +234,16 @@ static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); } +static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + return !kvm_vcpu_has_mediated_pmu(vcpu) || + !bitmap_intersects(pmu->pmc_counting_instructions, + (unsigned long *)&pmu->global_ctrl, + X86_PMC_IDX_MAX); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); @@ -227,8 +258,12 @@ void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu); +void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu); bool is_vmware_backdoor_pmc(u32 pmc_idx); +bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); +bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu); extern struct kvm_pmu_ops 
intel_pmu_ops; extern struct kvm_pmu_ops amd_pmu_ops; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 79cb85b8a156..de90b104a0dd 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -193,7 +193,7 @@ void recalc_intercepts(struct vcpu_svm *svm) * Hardcode the capacity of the array based on the maximum number of _offsets_. * MSRs are batched together, so there are fewer offsets than MSRs. */ -static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; +static int nested_svm_msrpm_merge_offsets[10] __ro_after_init; static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; typedef unsigned long nsvm_msrpm_merge_t; @@ -221,6 +221,22 @@ int __init nested_svm_init_msrpm_merge_offsets(void) MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP, MSR_IA32_LASTINTTOIP, + + MSR_K7_PERFCTR0, + MSR_K7_PERFCTR1, + MSR_K7_PERFCTR2, + MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTR0, + MSR_F15H_PERF_CTR1, + MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, + MSR_F15H_PERF_CTR4, + MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, }; int i, j; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index bc062285fbf5..7aa298eeb072 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -166,6 +166,8 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~pmu->reserved_bits; if (data != pmc->eventsel) { pmc->eventsel = data; + pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) | + AMD64_EVENTSEL_GUESTONLY; kvm_pmu_request_counter_reprogram(pmc); } return 0; @@ -227,6 +229,37 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) } } +static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) +{ + return host_pmu->version >= 2; +} + +static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status; + + 
rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status); + /* Clear host global_status MSR if non-zero. */ + if (global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status); + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl); +} + +static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status); + + /* Clear global status bits if non-zero */ + if (pmu->global_status) + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); +} + struct kvm_pmu_ops amd_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, @@ -236,7 +269,18 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .set_msr = amd_pmu_set_msr, .refresh = amd_pmu_refresh, .init = amd_pmu_init, + + .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, + .mediated_load = amd_mediated_pmu_load, + .mediated_put = amd_mediated_pmu_put, + .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, + + .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0, + .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0, + .FIXED_COUNTER_BASE = 0, + .MSR_STRIDE = 2, }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9ee74c57bd51..8f8bc863e214 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -170,6 +170,8 @@ module_param(intercept_smi, bool, 0444); bool vnmi = true; module_param(vnmi, bool, 0444); +module_param(enable_mediated_pmu, bool, 0444); + static bool svm_gp_erratum_intercept = true; static u8 rsm_ins_bytes[] = "\x0f\xaa"; @@ -729,6 +731,40 @@ void svm_vcpu_free_msrpm(void *msrpm) __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); } +static void 
svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) +{ + bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + if (!enable_mediated_pmu) + return; + + /* Legacy counters are always available for AMD CPUs with a PMU. */ + for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++) + svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i, + MSR_TYPE_RW, intercept); + + intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); + for (i = 0; i < pmu->nr_arch_gp_counters; i++) + svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, + MSR_TYPE_RW, intercept); + + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) + svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, + MSR_TYPE_RW); + + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, + MSR_TYPE_RW, intercept); + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, + MSR_TYPE_RW, intercept); +} + static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -797,6 +833,8 @@ static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) if (sev_es_guest(vcpu->kvm)) sev_es_recalc_msr_intercepts(vcpu); + svm_recalc_pmu_msr_intercepts(vcpu); + /* * x2APIC intercepts are modified on-demand and cannot be filtered by * userspace. 
@@ -1013,6 +1051,11 @@ static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_VMSAVE); } } + + if (kvm_need_rdpmc_intercept(vcpu)) + svm_set_intercept(svm, INTERCEPT_RDPMC); + else + svm_clr_intercept(svm, INTERCEPT_RDPMC); } static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) @@ -4385,6 +4428,9 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; + if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); + trace_kvm_exit(vcpu, KVM_ISA_SVM); svm_complete_interrupts(vcpu); diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 02aadb9d730e..4e371c93ae16 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -109,6 +109,12 @@ static inline bool cpu_has_load_cet_ctrl(void) { return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); } + +static inline bool cpu_has_save_perf_global_ctrl(void) +{ + return vmcs_config.vmexit_ctrl & VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; +} + static inline bool cpu_has_vmx_mpx(void) { return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; @@ -395,7 +401,8 @@ static inline bool vmx_pt_mode_is_host_guest(void) static inline bool vmx_pebs_supported(void) { - return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept && + !enable_mediated_pmu; } static inline bool cpu_has_notify_vmexit(void) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 881bb914c164..248635da6766 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -621,6 +621,47 @@ static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, msr_bitmap_l0, msr); } +#define nested_vmx_merge_msr_bitmaps(msr, type) \ + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \ + msr_bitmap_l0, msr, 
type) + +#define nested_vmx_merge_msr_bitmaps_read(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R) + +#define nested_vmx_merge_msr_bitmaps_write(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W) + +#define nested_vmx_merge_msr_bitmaps_rw(msr) \ + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) + +static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; + + /* + * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if + * none of the MSRs can possibly be passed through to L1. + */ + if (!kvm_vcpu_has_mediated_pmu(vcpu)) + return; + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i); + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i); + + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL); + nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS); + nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL); +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. @@ -704,23 +745,13 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 
*/ #ifdef CONFIG_X86_64 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_FS_BASE, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_GS_BASE, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); + nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE); + nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE); + nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE); #endif - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, MSR_TYPE_W); - - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_FLUSH_CMD, MSR_TYPE_W); + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL); + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD); + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD); nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_APERF, MSR_TYPE_R); @@ -746,6 +777,8 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_PL3_SSP, MSR_TYPE_RW); + nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; @@ -1046,16 +1079,12 @@ static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, * does not include the time taken for emulation of the L2->L1 * VM-exit in L0, use the more accurate value. 
*/ - if (msr_index == MSR_IA32_TSC) { - int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, - MSR_IA32_TSC); + if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) { + int slot = vmx->nested.tsc_autostore_slot; + u64 host_tsc = vmx->msr_autostore.val[slot].value; - if (i >= 0) { - u64 val = vmx->msr_autostore.guest.val[i].value; - - *data = kvm_read_l1_tsc(vcpu, val); - return true; - } + *data = kvm_read_l1_tsc(vcpu, host_tsc); + return true; } if (kvm_emulate_msr_read(vcpu, msr_index, data)) { @@ -1134,42 +1163,6 @@ static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) return false; } -static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, - u32 msr_index) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - struct vmx_msrs *autostore = &vmx->msr_autostore.guest; - bool in_vmcs12_store_list; - int msr_autostore_slot; - bool in_autostore_list; - int last; - - msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); - in_autostore_list = msr_autostore_slot >= 0; - in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); - - if (in_vmcs12_store_list && !in_autostore_list) { - if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { - /* - * Emulated VMEntry does not fail here. Instead a less - * accurate value will be returned by - * nested_vmx_get_vmexit_msr_value() by reading KVM's - * internal MSR state instead of reading the value from - * the vmcs02 VMExit MSR-store area. - */ - pr_warn_ratelimited( - "Not enough msr entries in msr_autostore. Can't add msr %x\n", - msr_index); - return; - } - last = autostore->nr++; - autostore->val[last].index = msr_index; - } else if (!in_vmcs12_store_list && in_autostore_list) { - last = --autostore->nr; - autostore->val[msr_autostore_slot] = autostore->val[last]; - } -} - /* * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are * emulating VM-Entry into a guest with EPT enabled. 
On failure, the expected @@ -2337,7 +2330,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode. */ - vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); @@ -2669,12 +2662,25 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) } /* - * Make sure the msr_autostore list is up to date before we set the - * count in the vmcs02. + * If vmcs12 is configured to save TSC on exit via the auto-store list, + * append the MSR to vmcs02's auto-store list so that KVM effectively + * reads TSC at the time of VM-Exit from L2. The saved value will be + * propagated to vmcs12's list on nested VM-Exit. + * + * Don't increment the number of MSRs in the vCPU structure, as saving + * TSC is specific to this particular incarnation of vmcs02, i.e. must + * not bleed into vmcs01.
*/ - prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); + if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) && + !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) { + vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr; + vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC; - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1); + } else { + vmx->nested.tsc_autostore_slot = -1; + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); + } vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); @@ -5118,7 +5124,11 @@ void __nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, kvm_nested_vmexit_handle_ibrs(vcpu); - /* Update any VMCS fields that might have changed while L2 ran */ + /* + * Update any VMCS fields that might have changed while vmcs02 was the + * active VMCS. The tracking is per-vCPU, not per-VMCS. 
+ */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index de1d9785c01f..27eb76e6b6a0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -61,6 +61,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) int i; pmu->fixed_ctr_ctrl = data; + pmu->fixed_ctr_ctrl_hw = data; for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { u8 new_ctrl = fixed_ctrl_field(data, i); u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); @@ -128,19 +129,6 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } -static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) -{ - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) - return 0; - - return vcpu->arch.perf_capabilities; -} - -static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) -{ - return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; -} - static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) { if (!fw_writes_is_enabled(pmu_to_vcpu(pmu))) @@ -443,6 +431,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data != pmc->eventsel) { pmc->eventsel = data; + pmc->eventsel_hw = data; kvm_pmu_request_counter_reprogram(pmc); } break; @@ -767,6 +756,71 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) } } +static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) +{ + u64 host_perf_cap = 0; + + if (boot_cpu_has(X86_FEATURE_PDCM)) + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); + + /* + * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width + * writes so that KVM can precisely load guest counter values. 
+ */ + if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES)) + return false; + + /* + * All CPUs that support a mediated PMU are expected to support loading + * PERF_GLOBAL_CTRL via dedicated VMCS fields. + */ + if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl())) + return false; + + return true; +} + +static void intel_pmu_write_global_ctrl(u64 global_ctrl) +{ + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl); +} + + +static void intel_mediated_pmu_load(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 global_status, toggle; + + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, global_status); + toggle = pmu->global_status ^ global_status; + if (global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle); + if (pmu->global_status & toggle) + wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle); + + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw); +} + +static void intel_mediated_pmu_put(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + /* MSR_CORE_PERF_GLOBAL_CTRL is already saved at VM-exit. */ + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); + + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ + if (pmu->global_status) + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); + + /* + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and + * also to avoid accidentally enabling fixed counters (based on guest + * state) while running in the host, e.g. when setting global ctrl. 
+ */ + if (pmu->fixed_ctr_ctrl_hw) + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); +} + struct kvm_pmu_ops intel_pmu_ops __initdata = { .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, @@ -778,7 +832,19 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .reset = intel_pmu_reset, .deliver_pmi = intel_pmu_deliver_pmi, .cleanup = intel_pmu_cleanup, + + .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, + .mediated_load = intel_mediated_pmu_load, + .mediated_put = intel_mediated_pmu_put, + .write_global_ctrl = intel_pmu_write_global_ctrl, + .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, .MIN_NR_GP_COUNTERS = 1, + + .PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL, + .GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0, + .GP_COUNTER_BASE = MSR_IA32_PMC0, + .FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0, + .MSR_STRIDE = 1, }; diff --git a/arch/x86/kvm/vmx/pmu_intel.h b/arch/x86/kvm/vmx/pmu_intel.h index 5620d0882cdc..5d9357640aa1 100644 --- a/arch/x86/kvm/vmx/pmu_intel.h +++ b/arch/x86/kvm/vmx/pmu_intel.h @@ -4,6 +4,21 @@ #include +#include "cpuid.h" + +static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) +{ + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) + return 0; + + return vcpu->arch.perf_capabilities; +} + +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; +} + bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 49f5caa45e13..967b58a8ab9d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -150,6 +150,8 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); extern bool __read_mostly allow_smaller_maxphyaddr; module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); +module_param(enable_mediated_pmu, bool, 0444); + 
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ @@ -1027,7 +1029,7 @@ static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx vm_exit_controls_clearbit(vmx, exit); } -int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) +static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -1038,9 +1040,22 @@ int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) return -ENOENT; } -static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr, + unsigned long vmcs_count_field) { int i; + + i = vmx_find_loadstore_msr_slot(m, msr); + if (i < 0) + return; + + --m->nr; + m->val[i] = m->val[m->nr]; + vmcs_write32(vmcs_count_field, m->nr); +} + +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ struct msr_autoload *m = &vmx->msr_autoload; switch (msr) { @@ -1061,21 +1076,9 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = vmx_find_loadstore_msr_slot(&m->guest, msr); - if (i < 0) - goto skip_guest; - --m->guest.nr; - m->guest.val[i] = m->guest.val[m->guest.nr]; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); -skip_guest: - i = vmx_find_loadstore_msr_slot(&m->host, msr); - if (i < 0) - return; - - --m->host.nr; - m->host.val[i] = m->host.val[m->host.nr]; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); + vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT); + vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT); } static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -1090,11 +1093,28 @@ static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_setbit(vmx, exit); } -static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, - u64 guest_val, u64 host_val, bool entry_only) +static void 
vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value, + unsigned long vmcs_count_field, struct kvm *kvm) +{ + int i; + + i = vmx_find_loadstore_msr_slot(m, msr); + if (i < 0) { + if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm)) + return; + + i = m->nr++; + m->val[i].index = msr; + vmcs_write32(vmcs_count_field, m->nr); + } + m->val[i].value = value; +} + +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, + u64 guest_val, u64 host_val) { - int i, j = 0; struct msr_autoload *m = &vmx->msr_autoload; + struct kvm *kvm = vmx->vcpu.kvm; switch (msr) { case MSR_EFER: @@ -1128,32 +1148,8 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrq(MSR_IA32_PEBS_ENABLE, 0); } - i = vmx_find_loadstore_msr_slot(&m->guest, msr); - if (!entry_only) - j = vmx_find_loadstore_msr_slot(&m->host, msr); - - if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || - (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { - printk_once(KERN_WARNING "Not enough msr switch entries.
" "Can't add msr %x\n", msr); - return; - } - if (i < 0) { - i = m->guest.nr++; - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); - } - m->guest.val[i].index = msr; - m->guest.val[i].value = guest_val; - - if (entry_only) - return; - - if (j < 0) { - j = m->host.nr++; - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); - } - m->host.val[j].index = msr; - m->host.val[j].value = host_val; + vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm); + vmx_add_auto_msr(&m->host, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm); } static bool update_transition_efer(struct vcpu_vmx *vmx) @@ -1187,8 +1183,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) if (!(guest_efer & EFER_LMA)) guest_efer &= ~EFER_LME; if (guest_efer != kvm_host.efer) - add_atomic_switch_msr(vmx, MSR_EFER, - guest_efer, kvm_host.efer, false); + add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer); else clear_atomic_switch_msr(vmx, MSR_EFER); return false; @@ -1209,6 +1204,17 @@ static bool update_transition_efer(struct vcpu_vmx *vmx) return true; } +static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr) +{ + vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT, + vmx->vcpu.kvm); +} + +static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr) +{ + vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT); +} + #ifdef CONFIG_X86_32 /* * On 32-bit kernels, VM exits still load the FS and GS bases from the @@ -4278,6 +4284,62 @@ void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) } } +static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) +{ + u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; + bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool intercept = !has_mediated_pmu; + int i; + + if (!enable_mediated_pmu) + return; + + if
(!cpu_has_save_perf_global_ctrl()) { + vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; + + if (has_mediated_pmu) + vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); + else + vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); + } + + vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + has_mediated_pmu); + + vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu); + + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW, + intercept || !fw_writes_is_enabled(vcpu)); + } + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) { + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, + MSR_TYPE_RW, true); + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, + MSR_TYPE_RW, true); + } + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_TYPE_RW, intercept); + for ( ; i < kvm_pmu_cap.num_counters_fixed; i++) + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, + MSR_TYPE_RW, true); + + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + MSR_TYPE_RW, intercept); + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_TYPE_RW, intercept); +} + static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) { bool intercept; @@ -4344,14 +4406,23 @@ static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); } + vmx_recalc_pmu_msr_intercepts(vcpu); + /* * x2APIC and LBR MSR intercepts are modified on-demand and cannot be * filtered by userspace. 
*/ } +static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) +{ + exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING, + kvm_need_rdpmc_intercept(vcpu)); +} + void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) { + vmx_recalc_instruction_intercepts(vcpu); vmx_recalc_msr_intercepts(vcpu); } @@ -4519,6 +4590,16 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx) vmcs_writel(HOST_SSP, 0); vmcs_writel(HOST_INTR_SSP_TABLE, 0); } + + /* + * When running a guest with a mediated PMU, guest state is resident in + * hardware after VM-Exit. Zero PERF_GLOBAL_CTRL on exit so that host + * activity doesn't bleed into the guest counters. When running with + * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every + * entry/exit to merge guest and host PMU usage. + */ + if (enable_mediated_pmu) + vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0); } void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) @@ -4586,7 +4667,8 @@ static u32 vmx_get_initial_vmexit_ctrl(void) VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER | + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL); } void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) @@ -4918,6 +5000,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmcs_write64(VM_FUNCTION_CONTROL, 0); vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); @@ -6584,7 +6667,7 @@ void dump_vmcs(struct kvm_vcpu *vcpu) if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) - vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); + 
vmx_dump_msrs("autostore", &vmx->msr_autostore); if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", @@ -7336,6 +7419,9 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) struct perf_guest_switch_msr *msrs; struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); + if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu)) + return; + pmu->host_cross_mapped_mask = 0; if (pmu->pebs_enable & pmu->global_ctrl) intel_pmu_cross_mapped_check(pmu); @@ -7350,7 +7436,30 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) clear_atomic_switch_msr(vmx, msrs[i].msr); else add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, - msrs[i].host, false); + msrs[i].host); +} + +static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) + return; + + if (!cpu_has_save_perf_global_ctrl()) { + int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, + MSR_CORE_PERF_GLOBAL_CTRL); + + if (WARN_ON_ONCE(slot < 0)) + return; + + pmu->global_ctrl = vmx->msr_autostore.val[slot].value; + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl); + return; + } + + pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); } static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) @@ -7638,6 +7747,8 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, u64 run_flags) vmx->loaded_vmcs->launched = 1; + vmx_refresh_guest_perf_global_control(vcpu); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); @@ -8031,7 +8142,8 @@ static __init u64 vmx_get_perf_capabilities(void) if (boot_cpu_has(X86_FEATURE_PDCM)) rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); - if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) && + !enable_mediated_pmu) { x86_perf_get_lbr(&vmx_lbr_caps); /* diff --git a/arch/x86/kvm/vmx/vmx.h 
b/arch/x86/kvm/vmx/vmx.h index a926ce43ad40..70bfe81dea54 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -182,6 +182,7 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + int tsc_autostore_slot; struct nested_vmx_msrs msrs; /* SMM related state */ @@ -236,9 +237,7 @@ struct vcpu_vmx { struct vmx_msrs host; } msr_autoload; - struct msr_autostore { - struct vmx_msrs guest; - } msr_autostore; + struct vmx_msrs msr_autostore; struct { int vm86_active; @@ -376,7 +375,6 @@ void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags); unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, unsigned int flags); -int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); @@ -501,7 +499,8 @@ static inline u8 vmx_get_rvi(void) VM_EXIT_CLEAR_BNDCFGS | \ VM_EXIT_PT_CONCEAL_PIP | \ VM_EXIT_CLEAR_IA32_RTIT_CTL | \ - VM_EXIT_LOAD_CET_STATE) + VM_EXIT_LOAD_CET_STATE | \ + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL) #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ (PIN_BASED_EXT_INTR_MASK | \ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 06f55aa55172..391f4a5ce6dd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -185,6 +185,10 @@ bool __read_mostly enable_pmu = true; EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); module_param(enable_pmu, bool, 0444); +/* Enable/disable mediated PMU virtualization. 
*/ +bool __read_mostly enable_mediated_pmu; +EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); + bool __read_mostly eager_page_split = true; module_param(eager_page_split, bool, 0644); @@ -2213,6 +2217,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_invd); fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + if (!kvm_emulate_invd(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; @@ -2269,6 +2276,9 @@ static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || @@ -3944,6 +3954,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vcpu->arch.perf_capabilities = data; kvm_pmu_refresh(vcpu); + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); break; case MSR_IA32_PRED_CMD: { u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); @@ -6881,7 +6892,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, break; mutex_lock(&kvm->lock); - if (!kvm->created_vcpus) { + if (!kvm->created_vcpus && !kvm->arch.created_mediated_pmu) { kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); r = 0; } @@ -10151,7 +10162,8 @@ int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); + __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, + enable_mediated_pmu ? 
kvm_handle_guest_mediated_pmi : NULL); if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); @@ -11359,6 +11371,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) run_flags |= KVM_RUN_LOAD_DEBUGCTL; vcpu->arch.host_debugctl = debug_ctl; + kvm_mediated_pmu_load(vcpu); + guest_timing_enter_irqoff(); /* @@ -11397,6 +11411,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_host_pkru(vcpu); + kvm_mediated_pmu_put(vcpu); + /* * Do this here before restoring debug registers on the host. And * since we do this before handling the vmexit, a DR access vmexit @@ -11734,6 +11750,9 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_emulate_halt); fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) { + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) + return EXIT_FASTPATH_NONE; + if (!kvm_emulate_halt(vcpu)) return EXIT_FASTPATH_EXIT_USERSPACE; @@ -12673,8 +12692,13 @@ static int sync_regs(struct kvm_vcpu *vcpu) return 0; } +#define PERF_MEDIATED_PMU_MSG \ + "Failed to enable mediated vPMU, try disabling system wide perf events and nmi_watchdog.\n" + int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { + int r; + if (kvm_check_tsc_unstable() && kvm->created_vcpus) pr_warn_once("SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); @@ -12685,7 +12709,29 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) if (id >= kvm->arch.max_vcpu_ids) return -EINVAL; - return kvm_x86_call(vcpu_precreate)(kvm); + /* + * Note, any actions done by .vcpu_precreate() must be idempotent with + * respect to creating multiple vCPUs, and therefore are not undone if + * creating a vCPU fails (including failure during pre-create). 
+ */ + r = kvm_x86_call(vcpu_precreate)(kvm); + if (r) + return r; + + if (enable_mediated_pmu && kvm->arch.enable_pmu && + !kvm->arch.created_mediated_pmu) { + if (irqchip_in_kernel(kvm)) { + r = perf_create_mediated_pmu(); + if (r) { + pr_warn_ratelimited(PERF_MEDIATED_PMU_MSG); + return r; + } + kvm->arch.created_mediated_pmu = true; + } else { + kvm->arch.enable_pmu = false; + } + } + return 0; } int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) @@ -13351,6 +13397,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); mutex_unlock(&kvm->slots_lock); } + if (kvm->arch.created_mediated_pmu) + perf_release_mediated_pmu(); kvm_destroy_vcpus(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); #ifdef CONFIG_KVM_IOAPIC diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index ff20e62d98c6..94d4f07aaaa0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -481,6 +481,7 @@ extern struct kvm_caps kvm_caps; extern struct kvm_host_values kvm_host; extern bool enable_pmu; +extern bool enable_mediated_pmu; void kvm_setup_xss_caps(void); diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 295c94a3ccc1..9aff61e7b8f2 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -32,6 +32,7 @@ mandatory-y += irq_work.h mandatory-y += kdebug.h mandatory-y += kmap_size.h mandatory-y += kprobes.h +mandatory-y += kvm_types.h mandatory-y += linkage.h mandatory-y += local.h mandatory-y += local64.h diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 021d1fa09e92..c05a79c21745 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1756,10 +1756,17 @@ static inline bool kvm_arch_intc_initialized(struct kvm *kvm) #ifdef CONFIG_GUEST_PERF_EVENTS unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); +void __kvm_register_perf_callbacks(unsigned int 
(*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)); + +static inline void kvm_register_perf_callbacks(void) +{ + __kvm_register_perf_callbacks(NULL, NULL); +} + void kvm_unregister_perf_callbacks(void); #else -static inline void kvm_register_perf_callbacks(void *ign) {} +static inline void kvm_register_perf_callbacks(void) {} static inline void kvm_unregister_perf_callbacks(void) {} #endif /* CONFIG_GUEST_PERF_EVENTS */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9ded2e582c60..48d851fbd8ea 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -305,6 +305,7 @@ struct perf_event_pmu_context; #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 +#define PERF_PMU_CAP_MEDIATED_VPMU 0x0800 /** * pmu::scope @@ -998,6 +999,11 @@ struct perf_event_groups { u64 index; }; +struct perf_time_ctx { + u64 time; + u64 stamp; + u64 offset; +}; /** * struct perf_event_context - event context structure @@ -1036,9 +1042,12 @@ struct perf_event_context { /* * Context clock, runs when context enabled. */ - u64 time; - u64 timestamp; - u64 timeoffset; + struct perf_time_ctx time; + + /* + * Context clock, runs when in the guest mode. + */ + struct perf_time_ctx timeguest; /* * These fields let us detect when two contexts have both @@ -1171,9 +1180,8 @@ struct bpf_perf_event_data_kern { * This is a per-cpu dynamically allocated data structure. 
*/ struct perf_cgroup_info { - u64 time; - u64 timestamp; - u64 timeoffset; + struct perf_time_ctx time; + struct perf_time_ctx timeguest; int active; }; @@ -1669,6 +1677,8 @@ struct perf_guest_info_callbacks { unsigned int (*state)(void); unsigned long (*get_ip)(void); unsigned int (*handle_intel_pt_intr)(void); + + void (*handle_mediated_pmi)(void); }; #ifdef CONFIG_GUEST_PERF_EVENTS @@ -1678,6 +1688,7 @@ extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); static inline unsigned int perf_guest_state(void) { @@ -1694,6 +1705,11 @@ static inline unsigned int perf_guest_handle_intel_pt_intr(void) return static_call(__perf_guest_handle_intel_pt_intr)(); } +static inline void perf_guest_handle_mediated_pmi(void) +{ + static_call(__perf_guest_handle_mediated_pmi)(); +} + extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); @@ -1914,6 +1930,13 @@ extern int perf_event_account_interrupt(struct perf_event *event); extern int perf_event_period(struct perf_event *event, u64 value); extern u64 perf_event_pause(struct perf_event *event, bool reset); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +int perf_create_mediated_pmu(void); +void perf_release_mediated_pmu(void); +void perf_load_guest_context(void); +void perf_put_guest_context(void); +#endif + #else /* !CONFIG_PERF_EVENTS: */ static inline void * diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h index 7f7282516bf5..64618618febd 100644 --- a/include/linux/unwind_user.h +++ b/include/linux/unwind_user.h @@ -5,8 +5,22 @@ #include #include -#ifndef 
ARCH_INIT_USER_FP_FRAME - #define ARCH_INIT_USER_FP_FRAME +#ifndef CONFIG_HAVE_UNWIND_USER_FP + +#define ARCH_INIT_USER_FP_FRAME(ws) + +#endif + +#ifndef ARCH_INIT_USER_FP_ENTRY_FRAME +#define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) +#endif + +#ifndef unwind_user_at_function_start +static inline bool unwind_user_at_function_start(struct pt_regs *regs) +{ + return false; +} +#define unwind_user_at_function_start unwind_user_at_function_start #endif int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries); diff --git a/init/Kconfig b/init/Kconfig index fa79feb8fe57..6628ff295cb8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2061,6 +2061,10 @@ config GUEST_PERF_EVENTS bool depends on HAVE_PERF_EVENTS +config PERF_GUEST_MEDIATED_PMU + bool + depends on GUEST_PERF_EVENTS + config PERF_USE_VMALLOC bool help diff --git a/kernel/events/core.c b/kernel/events/core.c index 8cca80094624..e320e06c8af6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "internal.h" @@ -166,6 +167,18 @@ enum event_type_t { EVENT_CPU = 0x10, EVENT_CGROUP = 0x20, + /* + * EVENT_GUEST is set when scheduling in/out events between the host + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST + * is used: + * + * - In for_each_epc() to skip PMUs that don't support events in a + * MEDIATED_VPMU guest, i.e. don't need to be context switched. + * - To indicate the start/end point of the events in a guest. Guest + * running time is deducted for host-only (exclude_guest) events. 
+ */ + EVENT_GUEST = 0x40, + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, /* compound helpers */ EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, @@ -458,6 +471,20 @@ static cpumask_var_t perf_online_pkg_mask; static cpumask_var_t perf_online_sys_mask; static struct kmem_cache *perf_event_cache; +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static DEFINE_PER_CPU(bool, guest_ctx_loaded); + +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return __this_cpu_read(guest_ctx_loaded); +} +#else +static __always_inline bool is_guest_mediated_pmu_loaded(void) +{ + return false; +} +#endif + /* * perf event paranoia level: * -1 - not paranoid at all @@ -779,33 +806,97 @@ do { \ ___p; \ }) -#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ +static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) +{ + if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) + return true; + if ((event_type & EVENT_GUEST) && + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) + return true; + return false; +} + +#define for_each_epc(_epc, _ctx, _pmu, _event_type) \ list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ - if (_cgroup && !_epc->nr_cgroups) \ + if (perf_skip_pmu_ctx(_epc, _event_type)) \ continue; \ else if (_pmu && _epc->pmu != _pmu) \ continue; \ else -static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_disable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, NULL, event_type) perf_pmu_disable(pmu_ctx->pmu); } -static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) +static void perf_ctx_enable(struct perf_event_context *ctx, + enum event_type_t event_type) { struct perf_event_pmu_context *pmu_ctx; - for_each_epc(pmu_ctx, ctx, NULL, cgroup) + for_each_epc(pmu_ctx, ctx, 
NULL, event_type) perf_pmu_enable(pmu_ctx->pmu); } static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); +static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) +{ + if (adv) + time->time += now - time->stamp; + time->stamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time` = now + (time - timestamp), which gives a single value + * offset to compute future time without locks on. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(time->offset, time->time - time->stamp); +} + +static_assert(offsetof(struct perf_event_context, timeguest) - + offsetof(struct perf_event_context, time) == + sizeof(struct perf_time_ctx)); + +#define T_TOTAL 0 +#define T_GUEST 1 + +static inline u64 __perf_event_time_ctx(struct perf_event *event, + struct perf_time_ctx *times) +{ + u64 time = times[T_TOTAL].time; + + if (event->attr.exclude_guest) + time -= times[T_GUEST].time; + + return time; +} + +static inline u64 __perf_event_time_ctx_now(struct perf_event *event, + struct perf_time_ctx *times, + u64 now) +{ + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { + /* + * (now + times[total].offset) - (now + times[guest].offset) := + * times[total].offset - times[guest].offset + */ + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); + } + + return now + READ_ONCE(times[T_TOTAL].offset); +} + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -842,12 +933,16 @@ static inline int is_cgroup_event(struct perf_event *event) return event->cgrp != NULL; } +static_assert(offsetof(struct perf_cgroup_info, timeguest) - + offsetof(struct perf_cgroup_info, time) == + 
sizeof(struct perf_time_ctx)); + static inline u64 perf_cgroup_event_time(struct perf_event *event) { struct perf_cgroup_info *t; t = per_cpu_ptr(event->cgrp->info, event->cpu); - return t->time; + return __perf_event_time_ctx(event, &t->time); } static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) @@ -856,20 +951,21 @@ static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) t = per_cpu_ptr(event->cgrp->info, event->cpu); if (!__load_acquire(&t->active)) - return t->time; - now += READ_ONCE(t->timeoffset); - return now; + return __perf_event_time_ctx(event, &t->time); + + return __perf_event_time_ctx_now(event, &t->time, now); } -static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) { - if (adv) - info->time += now - info->timestamp; - info->timestamp = now; - /* - * see update_context_time() - */ - WRITE_ONCE(info->timeoffset, info->time - info->timestamp); + update_perf_time_ctx(&info->timeguest, now, adv); +} + +static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) +{ + update_perf_time_ctx(&info->time, now, true); + if (is_guest_mediated_pmu_loaded()) + __update_cgrp_guest_time(info, now, true); } static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) @@ -885,7 +981,7 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, now, true); + update_cgrp_time(info, now); if (final) __store_release(&info->active, 0); } @@ -908,11 +1004,11 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) * Do not update time when cgroup is not active */ if (info->active) - __update_cgrp_time(info, perf_clock(), true); + update_cgrp_time(info, perf_clock()); } static inline void 
-perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { struct perf_event_context *ctx = &cpuctx->ctx; struct perf_cgroup *cgrp = cpuctx->cgrp; @@ -932,8 +1028,12 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - __update_cgrp_time(info, ctx->timestamp, false); - __store_release(&info->active, 1); + if (guest) { + __update_cgrp_guest_time(info, ctx->time.stamp, false); + } else { + update_perf_time_ctx(&info->time, ctx->time.stamp, false); + __store_release(&info->active, 1); + } } } @@ -964,8 +1064,7 @@ static void perf_cgroup_switch(struct task_struct *task) return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - - perf_ctx_disable(&cpuctx->ctx, true); + perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); /* @@ -981,7 +1080,7 @@ static void perf_cgroup_switch(struct task_struct *task) */ ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); - perf_ctx_enable(&cpuctx->ctx, true); + perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -1138,7 +1237,7 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) { } @@ -1550,29 +1649,24 @@ static void perf_unpin_context(struct perf_event_context *ctx) */ static void __update_context_time(struct perf_event_context *ctx, bool adv) { - u64 now = perf_clock(); - lockdep_assert_held(&ctx->lock); - if (adv) - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; + update_perf_time_ctx(&ctx->time, perf_clock(), adv); +} - /* - * The above: time' = time + (now - timestamp), can be re-arranged - * into: time` = now + 
(time - timestamp), which gives a single value - * offset to compute future time without locks on. - * - * See perf_event_time_now(), which can be used from NMI context where - * it's (obviously) not possible to acquire ctx->lock in order to read - * both the above values in a consistent manner. - */ - WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) +{ + lockdep_assert_held(&ctx->lock); + + /* must be called after __update_context_time(); */ + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); } static void update_context_time(struct perf_event_context *ctx) { __update_context_time(ctx, true); + if (is_guest_mediated_pmu_loaded()) + __update_context_guest_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) @@ -1585,7 +1679,7 @@ static u64 perf_event_time(struct perf_event *event) if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); } static u64 perf_event_time_now(struct perf_event *event, u64 now) @@ -1599,10 +1693,9 @@ static u64 perf_event_time_now(struct perf_event *event, u64 now) return perf_cgroup_event_time_now(event, now); if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) - return ctx->time; + return __perf_event_time_ctx(event, &ctx->time); - now += READ_ONCE(ctx->timeoffset); - return now; + return __perf_event_time_ctx_now(event, &ctx->time, now); } static enum event_type_t get_event_type(struct perf_event *event) @@ -2422,20 +2515,23 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) } static inline void -__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) +__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, + bool final, enum event_type_t event_type) { if (ctx->is_active & EVENT_TIME) { if (ctx->is_active & EVENT_FROZEN) return; + 
update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx, final); + /* vPMU should not stop time */ + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); } } static inline void ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - __ctx_time_update(cpuctx, ctx, false); + __ctx_time_update(cpuctx, ctx, false, 0); } /* @@ -2861,14 +2957,15 @@ static void task_ctx_sched_out(struct perf_event_context *ctx, static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_PINNED); - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); if (ctx) - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); } /* @@ -2902,11 +2999,11 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, event_type &= EVENT_ALL; - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_disable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_disable(epc->pmu); task_ctx_sched_out(task_ctx, pmu, event_type); @@ -2924,13 +3021,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (event_type & EVENT_PINNED) ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, pmu); + perf_event_sched_in(cpuctx, task_ctx, pmu, 0); - for_each_epc(epc, &cpuctx->ctx, pmu, false) + for_each_epc(epc, &cpuctx->ctx, pmu, 0) perf_pmu_enable(epc->pmu); if (task_ctx) { - for_each_epc(epc, task_ctx, pmu, false) + for_each_epc(epc, task_ctx, pmu, 0) perf_pmu_enable(epc->pmu); } } @@ -3479,11 +3576,10 @@ 
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -3507,14 +3603,14 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t * * would only update time for the pinned events. */ - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() */ barrier(); - ctx->is_active &= ~event_type; + ctx->is_active &= ~active_type; if (!(ctx->is_active & EVENT_ALL)) { /* @@ -3533,9 +3629,20 @@ ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t cpuctx->task_ctx = NULL; } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule out all exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = EVENT_ALL; + __update_context_guest_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx, true); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } - for_each_epc(pmu_ctx, ctx, pmu, cgroup) + for_each_epc(pmu_ctx, ctx, pmu, event_type) __pmu_ctx_sched_out(pmu_ctx, is_active); } @@ -3691,7 +3798,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* PMIs are disabled; ctx->nr_no_switch_fast is stable. 
*/ if (local_read(&ctx->nr_no_switch_fast) || @@ -3715,7 +3822,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) perf_ctx_sched_task_cb(ctx, task, false); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); /* * RCU_INIT_POINTER here is safe because we've not @@ -3739,13 +3846,13 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) if (do_switch) { raw_spin_lock(&ctx->lock); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); inside_switch: perf_ctx_sched_task_cb(ctx, task, false); task_ctx_sched_out(ctx, NULL, EVENT_ALL); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); raw_spin_unlock(&ctx->lock); } } @@ -3992,10 +4099,15 @@ static inline void group_update_userpage(struct perf_event *group_event) event_update_userpage(event); } +struct merge_sched_data { + int can_add_hw; + enum event_type_t event_type; +}; + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - int *can_add_hw = data; + struct merge_sched_data *msd = data; if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -4003,13 +4115,22 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, *can_add_hw)) { + /* + * Don't schedule in any host events from PMU with + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. 
+ */ + if (is_guest_mediated_pmu_loaded() && + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && + !(msd->event_type & EVENT_GUEST)) + return 0; + + if (group_can_go_on(event, msd->can_add_hw)) { if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } if (event->state == PERF_EVENT_STATE_INACTIVE) { - *can_add_hw = 0; + msd->can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); @@ -4032,11 +4153,15 @@ static int merge_sched_in(struct perf_event *event, void *data) static void pmu_groups_sched_in(struct perf_event_context *ctx, struct perf_event_groups *groups, - struct pmu *pmu) + struct pmu *pmu, + enum event_type_t event_type) { - int can_add_hw = 1; + struct merge_sched_data msd = { + .can_add_hw = 1, + .event_type = event_type, + }; visit_groups_merge(ctx, groups, smp_processor_id(), pmu, - merge_sched_in, &can_add_hw); + merge_sched_in, &msd); } static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, @@ -4045,20 +4170,18 @@ static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, struct perf_event_context *ctx = pmu_ctx->ctx; if (event_type & EVENT_PINNED) - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); if (event_type & EVENT_FLEXIBLE) - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); } static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) { struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + enum event_type_t active_type = event_type & ~EVENT_FLAGS; struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; - bool cgroup = event_type & EVENT_CGROUP; - - event_type &= ~EVENT_CGROUP; lockdep_assert_held(&ctx->lock); @@ -4066,9 +4189,11 @@ 
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t return; if (!(is_active & EVENT_TIME)) { + /* EVENT_TIME should be active while the guest runs */ + WARN_ON_ONCE(event_type & EVENT_GUEST); /* start ctx time */ __update_context_time(ctx, false); - perf_cgroup_set_timestamp(cpuctx); + perf_cgroup_set_timestamp(cpuctx, false); /* * CPU-release for the below ->is_active store, * see __load_acquire() in perf_event_time_now() @@ -4076,7 +4201,7 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t barrier(); } - ctx->is_active |= (event_type | EVENT_TIME); + ctx->is_active |= active_type | EVENT_TIME; if (ctx->task) { if (!(is_active & EVENT_ALL)) cpuctx->task_ctx = ctx; @@ -4084,21 +4209,37 @@ ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t WARN_ON_ONCE(cpuctx->task_ctx != ctx); } - is_active ^= ctx->is_active; /* changed bits */ + if (event_type & EVENT_GUEST) { + /* + * Schedule in the required exclude_guest events of PMU + * with PERF_PMU_CAP_MEDIATED_VPMU. + */ + is_active = event_type & EVENT_ALL; + + /* + * Update ctx time to set the new start time for + * the exclude_guest events. + */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx, false); + barrier(); + } else { + is_active ^= ctx->is_active; /* changed bits */ + } /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. 
*/ if (is_active & EVENT_PINNED) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); } /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) { - for_each_epc(pmu_ctx, ctx, pmu, cgroup) - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); + for_each_epc(pmu_ctx, ctx, pmu, event_type) + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); } } @@ -4114,11 +4255,11 @@ static void perf_event_context_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) { perf_ctx_lock(cpuctx, ctx); - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); perf_ctx_sched_task_cb(ctx, task, true); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); perf_ctx_unlock(cpuctx, ctx); goto rcu_unlock; } @@ -4131,7 +4272,7 @@ static void perf_event_context_sched_in(struct task_struct *task) if (!ctx->nr_events) goto unlock; - perf_ctx_disable(ctx, false); + perf_ctx_disable(ctx, 0); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -4141,18 +4282,18 @@ static void perf_event_context_sched_in(struct task_struct *task) * events, no need to flip the cpuctx's events around. 
*/ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { - perf_ctx_disable(&cpuctx->ctx, false); + perf_ctx_disable(&cpuctx->ctx, 0); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); } - perf_event_sched_in(cpuctx, ctx, NULL); + perf_event_sched_in(cpuctx, ctx, NULL, 0); perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - perf_ctx_enable(&cpuctx->ctx, false); + perf_ctx_enable(&cpuctx->ctx, 0); - perf_ctx_enable(ctx, false); + perf_ctx_enable(ctx, 0); unlock: perf_ctx_unlock(cpuctx, ctx); @@ -5594,6 +5735,8 @@ static void __free_event(struct perf_event *event) { struct pmu *pmu = event->pmu; + security_perf_event_free(event); + if (event->attach_state & PERF_ATTACH_CALLCHAIN) put_callchain_buffers(); @@ -5647,6 +5790,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } +static void mediated_pmu_unaccount_event(struct perf_event *event); + DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) /* vs perf_event_alloc() success */ @@ -5656,8 +5801,7 @@ static void _free_event(struct perf_event *event) irq_work_sync(&event->pending_disable_irq); unaccount_event(event); - - security_perf_event_free(event); + mediated_pmu_unaccount_event(event); if (event->rb) { /* @@ -6180,6 +6324,138 @@ u64 perf_event_pause(struct perf_event *event, bool reset) } EXPORT_SYMBOL_GPL(perf_event_pause); +#ifdef CONFIG_PERF_GUEST_MEDIATED_PMU +static atomic_t nr_include_guest_events __read_mostly; + +static atomic_t nr_mediated_pmu_vms __read_mostly; +static DEFINE_MUTEX(perf_mediated_pmu_mutex); + +/* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ +static inline bool is_include_guest_event(struct perf_event *event) +{ + if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && + !event->attr.exclude_guest) + return true; + + return false; +} + +static int mediated_pmu_account_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return 
0; + + if (atomic_inc_not_zero(&nr_include_guest_events)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_mediated_pmu_vms)) + return -EOPNOTSUPP; + + atomic_inc(&nr_include_guest_events); + return 0; +} + +static void mediated_pmu_unaccount_event(struct perf_event *event) +{ + if (!is_include_guest_event(event)) + return; + + if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) + return; + + atomic_dec(&nr_include_guest_events); +} + +/* + * Currently invoked at VM creation to + * - Check whether there are existing !exclude_guest events of PMU with + * PERF_PMU_CAP_MEDIATED_VPMU + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on + * PMUs with PERF_PMU_CAP_MEDIATED_VPMU + * + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf + * still owns all the PMU resources. + */ +int perf_create_mediated_pmu(void) +{ + if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) + return 0; + + guard(mutex)(&perf_mediated_pmu_mutex); + if (atomic_read(&nr_include_guest_events)) + return -EBUSY; + + atomic_inc(&nr_mediated_pmu_vms); + return 0; +} +EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); + +void perf_release_mediated_pmu(void) +{ + if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) + return; + + atomic_dec(&nr_mediated_pmu_vms); +} +EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); + +/* When loading a guest's mediated PMU, schedule out all exclude_guest events. 
*/ +void perf_load_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); + if (cpuctx->task_ctx) { + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); + } + + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, true); +} +EXPORT_SYMBOL_GPL(perf_load_guest_context); + +void perf_put_guest_context(void) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + + lockdep_assert_irqs_disabled(); + + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + + if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) + return; + + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); + if (cpuctx->task_ctx) + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); + + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST); + + if (cpuctx->task_ctx) + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); + + __this_cpu_write(guest_ctx_loaded, false); +} +EXPORT_SYMBOL_GPL(perf_put_guest_context); +#else +static int mediated_pmu_account_event(struct perf_event *event) { return 0; } +static void mediated_pmu_unaccount_event(struct perf_event *event) {} +#endif + /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block @@ -6547,23 +6823,23 @@ void perf_event_update_userpage(struct perf_event *event) if (!rb) goto unlock; - /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. 
- * - * we cannot simply called update_context_time() - * because of locking issue as we can be called in - * NMI context - */ - calc_timer_values(event, &now, &enabled, &running); - - userpg = rb->user_page; /* * Disable preemption to guarantee consistent time stamps are stored to * the user page. */ preempt_disable(); + + /* + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. + * + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. + */ + calc_timer_values(event, &now, &enabled, &running); + + userpg = rb->user_page; + ++userpg->lock; barrier(); userpg->index = perf_event_index(event); @@ -7383,6 +7659,7 @@ struct perf_guest_info_callbacks __rcu *perf_guest_cbs; DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { @@ -7397,6 +7674,10 @@ void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) if (cbs->handle_intel_pt_intr) static_call_update(__perf_guest_handle_intel_pt_intr, cbs->handle_intel_pt_intr); + + if (cbs->handle_mediated_pmi) + static_call_update(__perf_guest_handle_mediated_pmi, + cbs->handle_mediated_pmi); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); @@ -7408,8 +7689,8 @@ void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) rcu_assign_pointer(perf_guest_cbs, NULL); static_call_update(__perf_guest_state, (void *)&__static_call_return0); static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); - static_call_update(__perf_guest_handle_intel_pt_intr, - (void 
*)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); @@ -7869,13 +8150,11 @@ static void perf_output_read(struct perf_output_handle *handle, u64 read_format = event->attr.read_format; /* - * compute total_time_enabled, total_time_running - * based on snapshot values taken when the event - * was last scheduled in. + * Compute total_time_enabled, total_time_running based on snapshot + * values taken when the event was last scheduled in. * - * we cannot simply called update_context_time() - * because of locking issue as we are called in - * NMI context + * We cannot simply call update_context_time() because doing so would + * lead to deadlock when called from NMI context. */ if (read_format & PERF_FORMAT_TOTAL_TIMES) calc_timer_values(event, &now, &enabled, &running); @@ -12043,7 +12322,7 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { event->hw.state = 0; - local64_set(&event->hw.prev_count, event->ctx->time); + local64_set(&event->hw.prev_count, event->ctx->time.time); perf_swevent_start_hrtimer(event); } @@ -12052,7 +12331,7 @@ static void task_clock_event_stop(struct perf_event *event, int flags) event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) - task_clock_event_update(event, event->ctx->time); + task_clock_event_update(event, event->ctx->time.time); } static int task_clock_event_add(struct perf_event *event, int flags) @@ -12072,8 +12351,8 @@ static void task_clock_event_del(struct perf_event *event, int flags) static void task_clock_event_read(struct perf_event *event) { u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - u64 time = event->ctx->time + delta; + u64 delta = 
now - event->ctx->time.stamp; + u64 time = event->ctx->time.time + delta; task_clock_event_update(event, time); } @@ -13155,6 +13434,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) return ERR_PTR(err); + err = mediated_pmu_account_event(event); + if (err) + return ERR_PTR(err); + /* symmetric to unaccount_event() in _free_event() */ account_event(event); diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 39e270789444..90ab3c1a205e 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -31,6 +31,7 @@ static int unwind_user_next_common(struct unwind_user_state *state, { unsigned long cfa, fp, ra; + /* Get the Canonical Frame Address (CFA) */ if (frame->use_fp) { if (state->fp < state->sp) return -EINVAL; @@ -38,11 +39,9 @@ static int unwind_user_next_common(struct unwind_user_state *state, } else { cfa = state->sp; } - - /* Get the Canonical Frame Address (CFA) */ cfa += frame->cfa_off; - /* stack going in wrong direction? */ + /* Make sure that stack is not going in wrong direction */ if (cfa <= state->sp) return -EINVAL; @@ -50,10 +49,11 @@ static int unwind_user_next_common(struct unwind_user_state *state, if (cfa & (state->ws - 1)) return -EINVAL; - /* Find the Return Address (RA) */ + /* Get the Return Address (RA) */ if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; + /* Get the Frame Pointer (FP) */ if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; @@ -67,7 +67,6 @@ static int unwind_user_next_common(struct unwind_user_state *state, static int unwind_user_next_fp(struct unwind_user_state *state) { -#ifdef CONFIG_HAVE_UNWIND_USER_FP struct pt_regs *regs = task_pt_regs(current); if (state->topmost && unwind_user_at_function_start(regs)) { @@ -81,9 +80,6 @@ static int unwind_user_next_fp(struct unwind_user_state *state) ARCH_INIT_USER_FP_FRAME(state->ws) }; return unwind_user_next_common(state, &fp_frame); -#else - return -EINVAL; -#endif } static int 
unwind_user_next(struct unwind_user_state *state) diff --git a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h index 47051871b436..6e1d5b955aae 100644 --- a/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h +++ b/tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h @@ -77,7 +77,8 @@ */ #define IRQ_WORK_VECTOR 0xf6 -/* 0xf5 - unused, was UV_BAU_MESSAGE */ +#define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 + #define DEFERRED_ERROR_VECTOR 0xf4 /* Vector on which hypervisor callbacks will be delivered */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6b1097e76288..61dca8d37abc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6482,11 +6482,15 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { .state = kvm_guest_state, .get_ip = kvm_guest_get_ip, .handle_intel_pt_intr = NULL, + .handle_mediated_pmi = NULL, }; -void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) +void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), + void (*mediated_pmi_handler)(void)) { kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; + kvm_guest_cbs.handle_mediated_pmi = mediated_pmi_handler; + perf_register_guest_info_callbacks(&kvm_guest_cbs); } void kvm_unregister_perf_callbacks(void)