From 8ca19c40c47d80af369c222662445bbf593666b1 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:40:58 -0800 Subject: [PATCH 01/87] KVM: arm64: Drop MDSCR_EL1_DEBUG_MASK Nothing is using this macro, get rid of it. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-2-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/debug.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index ce8886122ed3..587e9eb4372e 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -16,11 +16,6 @@ #include "trace.h" -/* These are the bits of MDSCR_EL1 we may manipulate */ -#define MDSCR_EL1_DEBUG_MASK (DBG_MDSCR_SS | \ - DBG_MDSCR_KDE | \ - DBG_MDSCR_MDE) - static DEFINE_PER_CPU(u64, mdcr_el2); /* From 2417218f2f234fd7880fb193ebf3ae5fcccfa29b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:40:59 -0800 Subject: [PATCH 02/87] KVM: arm64: Get rid of __kvm_get_mdcr_el2() and related warts KVM caches MDCR_EL2 on a per-CPU basis in order to preserve the configuration of MDCR_EL2.HPMN while running a guest. This is a bit gross, since we're relying on some baked configuration rather than the hardware definition of implemented counters. Discover the number of implemented counters by reading PMCR_EL0.N instead. This works because: - In VHE the kernel runs at EL2, and N always returns the number of counters implemented in hardware - In {n,h}VHE, the EL2 setup code programs MDCR_EL2.HPMN with the EL2 view of PMCR_EL0.N for the host Lastly, avoid traps under nested virtualization by saving PMCR_EL0.N in host data. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-3-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 5 +---- arch/arm64/include/asm/kvm_host.h | 7 +++++-- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/debug.c | 29 +++++++++++------------------ arch/arm64/kvm/hyp/nvhe/debug-sr.c | 5 ----- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 ------ arch/arm64/kvm/hyp/vhe/debug-sr.c | 5 ----- 7 files changed, 18 insertions(+), 41 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index ca2590344313..063185c202ce 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -53,8 +53,7 @@ enum __kvm_host_smccc_func { /* Hypercalls available only prior to pKVM finalisation */ /* __KVM_HOST_SMCCC_FUNC___kvm_hyp_init */ - __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2 = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1, - __KVM_HOST_SMCCC_FUNC___pkvm_init, + __KVM_HOST_SMCCC_FUNC___pkvm_init = __KVM_HOST_SMCCC_FUNC___kvm_hyp_init + 1, __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping, __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector, __KVM_HOST_SMCCC_FUNC___kvm_enable_ssbs, @@ -247,8 +246,6 @@ extern void __kvm_adjust_pc(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_gic_config(void); extern void __vgic_v3_init_lrs(void); -extern u64 __kvm_get_mdcr_el2(void); - #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ " .align 3\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e18e9244d17a..064f5dfca7f4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -642,7 +642,7 @@ struct kvm_host_data { * host_debug_state contains the host registers which are * saved and restored during world switches. */ - struct { + struct { /* {Break,watch}point registers */ struct kvm_guest_debug_arch regs; /* Statistical profiling extension */ @@ -652,6 +652,9 @@ struct kvm_host_data { /* Values of trap registers for the host before guest entry. */ u64 mdcr_el2; } host_debug_state; + + /* Number of programmable event counters (PMCR_EL0.N) for this CPU */ + unsigned int nr_event_counters; }; struct kvm_host_psci_config { @@ -1332,7 +1335,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void) static inline void kvm_arch_sync_events(struct kvm *kvm) {} -void kvm_arm_init_debug(void); +void kvm_init_host_debug_data(void); void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a102c3aebdbc..ab1bf9ccf385 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2109,6 +2109,7 @@ static void cpu_set_hyp_vector(void) static void cpu_hyp_init_context(void) { kvm_init_host_cpu_context(host_data_ptr(host_ctxt)); + kvm_init_host_debug_data(); if (!is_kernel_in_hyp_mode()) cpu_init_hyp_mode(); @@ -2117,7 +2118,6 @@ static void cpu_hyp_init_context(void) static void cpu_hyp_init_features(void) { cpu_set_hyp_vector(); - kvm_arm_init_debug(); if (is_kernel_in_hyp_mode()) kvm_timer_init_vhe(); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 587e9eb4372e..d8ea6fe6a2a2 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -16,8 +16,6 @@ #include "trace.h" -static DEFINE_PER_CPU(u64, mdcr_el2); - /* * save/restore_guest_debug_regs * @@ -60,21 +58,6 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; } -/** - * kvm_arm_init_debug - grab what we need for debug - * - * Currently the sole task of this function is to retrieve the initial - * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has - * presumably been set-up by some knowledgeable bootcode. - * - * It is called once per-cpu during CPU hyp initialisation. - */ - -void kvm_arm_init_debug(void) -{ - __this_cpu_write(mdcr_el2, kvm_call_hyp_ret(__kvm_get_mdcr_el2)); -} - /** * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value * @@ -94,7 +77,8 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK * to disable guest access to the profiling and trace buffers */ - vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; + vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, + *host_data_ptr(nr_event_counters)); vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | MDCR_EL2_TTRF | @@ -338,3 +322,12 @@ void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); } + +void kvm_init_host_debug_data(void) +{ + u64 dfr0 = read_sysreg(id_aa64dfr0_el1); + + if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) + *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, + read_sysreg(pmcr_el0)); +} diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 53efda0235cf..1e2a26d0196e 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -106,8 +106,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6aa0b13d86e5..16f5da3a884a 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -264,11 +264,6 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt) __vgic_v3_init_lrs(); } -static void handle___kvm_get_mdcr_el2(struct kvm_cpu_context *host_ctxt) -{ - cpu_reg(host_ctxt, 1) = __kvm_get_mdcr_el2(); -} - static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1); @@ -384,7 +379,6 @@ typedef void (*hcall_t)(struct kvm_cpu_context *); static const hcall_t host_hcall[] = { /* ___kvm_hyp_init */ - HANDLE_FUNC(__kvm_get_mdcr_el2), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_cpu_set_vector), diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index 289689b2682d..0100339b09e0 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -19,8 +19,3 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) { __debug_switch_to_host_common(vcpu); } - -u64 __kvm_get_mdcr_el2(void) -{ - return read_sysreg(mdcr_el2); -} From 38131c02a53ff691e4c496d65d2d087a5ed42bbf Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:00 -0800 Subject: [PATCH 03/87] KVM: arm64: Track presence of SPE/TRBE in kvm_host_data instead of vCPU Add flags to kvm_host_data to track if SPE/TRBE is present + programmable on a per-CPU basis. Set the flags up at init rather than vcpu_load() as the programmability of these buffers is unlikely to change. Reviewed-by: James Clark Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-4-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 19 ++++++++------ arch/arm64/kvm/arm.c | 3 --- arch/arm64/kvm/debug.c | 40 ++++++++---------------------- arch/arm64/kvm/hyp/nvhe/debug-sr.c | 8 +++--- 4 files changed, 26 insertions(+), 44 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 064f5dfca7f4..fb252d540850 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -610,6 +610,10 @@ struct cpu_sve_state { * field. */ struct kvm_host_data { +#define KVM_HOST_DATA_FLAG_HAS_SPE 0 +#define KVM_HOST_DATA_FLAG_HAS_TRBE 1 + unsigned long flags; + struct kvm_cpu_context host_ctxt; /* @@ -911,10 +915,6 @@ struct kvm_vcpu_arch { #define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) /* Guest debug is live */ #define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) -/* Save SPE context if active */ -#define DEBUG_STATE_SAVE_SPE __vcpu_single_flag(iflags, BIT(5)) -/* Save TRBE context if active */ -#define DEBUG_STATE_SAVE_TRBE __vcpu_single_flag(iflags, BIT(6)) /* SVE enabled for host EL0 */ #define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) @@ -1310,6 +1310,13 @@ DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data); &this_cpu_ptr_hyp_sym(kvm_host_data)->f) #endif +#define host_data_test_flag(flag) \ + (test_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags))) +#define host_data_set_flag(flag) \ + set_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags)) +#define host_data_clear_flag(flag) \ + clear_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags)) + /* Check whether the FP regs are owned by the guest */ static inline bool guest_owns_fp_regs(void) { @@ -1370,10 +1377,6 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) return (!has_vhe() && attr->exclude_host); } -/* Flags for host debug state */ -void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu); -void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu); - #ifdef CONFIG_KVM void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u64 clr); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ab1bf9ccf385..3822774840e1 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -617,15 +617,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu_set_pauth_traps(vcpu); - kvm_arch_vcpu_load_debug_state_flags(vcpu); - if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_arch_vcpu_put_debug_state_flags(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_vhe(vcpu); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index d8ea6fe6a2a2..1ee2fd765b62 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -294,35 +294,6 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) } } -void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) -{ - u64 dfr0; - - /* For VHE, there is nothing to do */ - if (has_vhe()) - return; - - dfr0 = read_sysreg(id_aa64dfr0_el1); - /* - * If SPE is present on this CPU and is available at current EL, - * we may need to check if the host state needs to be saved. - */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && - !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(PMBIDR_EL1_P_SHIFT))) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_SPE); - - /* Check if we have TRBE implemented and available at the host */ - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && - !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) - vcpu_set_flag(vcpu, DEBUG_STATE_SAVE_TRBE); -} - -void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) -{ - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_SPE); - vcpu_clear_flag(vcpu, DEBUG_STATE_SAVE_TRBE); -} - void kvm_init_host_debug_data(void) { u64 dfr0 = read_sysreg(id_aa64dfr0_el1); @@ -330,4 +301,15 @@ void kvm_init_host_debug_data(void) if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0) *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, read_sysreg(pmcr_el0)); + + if (has_vhe()) + return; + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) && + !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P)) + host_data_set_flag(HAS_SPE); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) + host_data_set_flag(HAS_TRBE); } diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 1e2a26d0196e..858bb38e273f 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -82,10 +82,10 @@ static void __debug_restore_trace(u64 trfcr_el1) void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) { /* Disable and flush SPE data generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) + if (host_data_test_flag(HAS_SPE)) __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); /* Disable and flush Self-Hosted Trace generation */ - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) + if (host_data_test_flag(HAS_TRBE)) __debug_save_trace(host_data_ptr(host_debug_state.trfcr_el1)); } @@ -96,9 +96,9 @@ void __debug_switch_to_guest(struct kvm_vcpu *vcpu) void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_SPE)) + if (host_data_test_flag(HAS_SPE)) __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); - if (vcpu_get_flag(vcpu, DEBUG_STATE_SAVE_TRBE)) + if (host_data_test_flag(HAS_TRBE)) __debug_restore_trace(*host_data_ptr(host_debug_state.trfcr_el1)); } From d381e53384a69e35aac417cd6e66afc6c8c11583 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:01 -0800 Subject: [PATCH 04/87] KVM: arm64: Move host SME/SVE tracking flags to host data The SME/SVE state tracking flags have no business in the vCPU. Move them to kvm_host_data. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-5-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 22 ++++++++++------------ arch/arm64/kvm/fpsimd.c | 12 ++++++------ 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index fb252d540850..d196bf0fce52 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -610,8 +610,10 @@ struct cpu_sve_state { * field. */ struct kvm_host_data { -#define KVM_HOST_DATA_FLAG_HAS_SPE 0 -#define KVM_HOST_DATA_FLAG_HAS_TRBE 1 +#define KVM_HOST_DATA_FLAG_HAS_SPE 0 +#define KVM_HOST_DATA_FLAG_HAS_TRBE 1 +#define KVM_HOST_DATA_FLAG_HOST_SVE_ENABLED 2 +#define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 unsigned long flags; struct kvm_cpu_context host_ctxt; @@ -916,22 +918,18 @@ struct kvm_vcpu_arch { /* Guest debug is live */ #define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) -/* SVE enabled for host EL0 */ -#define HOST_SVE_ENABLED __vcpu_single_flag(sflags, BIT(0)) -/* SME enabled for EL0 */ -#define HOST_SME_ENABLED __vcpu_single_flag(sflags, BIT(1)) /* Physical CPU not in supported_cpus */ -#define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(2)) +#define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(0)) /* WFIT instruction trapped */ -#define IN_WFIT __vcpu_single_flag(sflags, BIT(3)) +#define IN_WFIT __vcpu_single_flag(sflags, BIT(1)) /* vcpu system registers loaded on physical CPU */ -#define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(4)) +#define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(2)) /* Software step state is Active-pending */ -#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(5)) +#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(3)) /* PMUSERENR for the guest EL0 is on physical CPU */ -#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(6)) +#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(4)) /* WFI instruction trapped */ -#define IN_WFI __vcpu_single_flag(sflags, BIT(7)) +#define IN_WFI __vcpu_single_flag(sflags, BIT(5)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index ea5484ce1f3b..0e0f37d1990a 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -65,14 +65,14 @@ void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) *host_data_ptr(fpsimd_state) = kern_hyp_va(¤t->thread.uw.fpsimd_state); *host_data_ptr(fpmr_ptr) = kern_hyp_va(¤t->thread.uw.fpmr); - vcpu_clear_flag(vcpu, HOST_SVE_ENABLED); + host_data_clear_flag(HOST_SVE_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_ZEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SVE_ENABLED); + host_data_set_flag(HOST_SVE_ENABLED); if (system_supports_sme()) { - vcpu_clear_flag(vcpu, HOST_SME_ENABLED); + host_data_clear_flag(HOST_SME_ENABLED); if (read_sysreg(cpacr_el1) & CPACR_EL1_SMEN_EL0EN) - vcpu_set_flag(vcpu, HOST_SME_ENABLED); + host_data_set_flag(HOST_SME_ENABLED); /* * If PSTATE.SM is enabled then save any pending FP @@ -168,7 +168,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) */ if (has_vhe() && system_supports_sme()) { /* Also restore EL0 state seen on entry */ - if (vcpu_get_flag(vcpu, HOST_SME_ENABLED)) + if (host_data_test_flag(HOST_SME_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_ELx_SMEN); else sysreg_clear_set(CPACR_EL1, @@ -227,7 +227,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) * for EL0. To avoid spurious traps, restore the trap state * seen by kvm_arch_vcpu_load_fp(): */ - if (vcpu_get_flag(vcpu, HOST_SVE_ENABLED)) + if (host_data_test_flag(HOST_SVE_ENABLED)) sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_ZEN_EL0EN); else sysreg_clear_set(CPACR_EL1, CPACR_EL1_ZEN_EL0EN, 0); From b47ffd13fda8275733d573e5799e63e66b5f5361 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:02 -0800 Subject: [PATCH 05/87] KVM: arm64: Write MDCR_EL2 directly from kvm_arm_setup_mdcr_el2() Expecting the callee to know when MDCR_EL2 needs to be written to hardware asking for trouble. Do the deed from kvm_arm_setup_mdcr_el2() instead. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-6-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/debug.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 1ee2fd765b62..fef03456284b 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -73,6 +73,8 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) */ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) { + preempt_disable(); + /* * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK * to disable guest access to the profiling and trace buffers @@ -103,6 +105,12 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) kvm_vcpu_os_lock_enabled(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; + /* Write MDCR_EL2 directly if we're already at EL2 */ + if (has_vhe()) + write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); + + preempt_enable(); + trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); } @@ -148,7 +156,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) { - unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2; + unsigned long mdscr; trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); @@ -250,10 +258,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) vcpu_set_flag(vcpu, DEBUG_DIRTY); - /* Write mdcr_el2 changes since vcpu_load on VHE systems */ - if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) - write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); } From cd9b10102ae38bf0e10b13dbb98c3ead42cf8e1b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:03 -0800 Subject: [PATCH 06/87] KVM: arm64: Evaluate debug owner at vcpu_load() In preparation for tossing the debug_ptr mess, introduce an enumeration to track the ownership of the debug registers while in the guest. Update the owner at vcpu_load() based on whether the host needs to steal the guest's debug context or if breakpoints/watchpoints are actively in use. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-7-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 11 ++++++++ arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/debug.c | 46 +++++++++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 2 ++ 4 files changed, 60 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d196bf0fce52..f65b30bab0ab 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -756,6 +756,12 @@ struct kvm_vcpu_arch { struct kvm_guest_debug_arch vcpu_debug_state; struct kvm_guest_debug_arch external_debug_state; + enum { + VCPU_DEBUG_FREE, + VCPU_DEBUG_HOST_OWNED, + VCPU_DEBUG_GUEST_OWNED, + } debug_owner; + /* VGIC state */ struct vgic_cpu vgic_cpu; struct arch_timer_cpu timer_cpu; @@ -1345,10 +1351,15 @@ void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); +void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); +void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); #define kvm_vcpu_os_lock_enabled(vcpu) \ (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK)) +#define kvm_host_owns_debug_regs(vcpu) \ + ((vcpu)->arch.debug_owner == VCPU_DEBUG_HOST_OWNED) + int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3822774840e1..a068337da52a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -598,6 +598,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vgic_load(vcpu); kvm_timer_vcpu_load(vcpu); + kvm_vcpu_load_debug(vcpu); if (has_vhe()) kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index fef03456284b..548016439aca 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -317,3 +317,49 @@ void kvm_init_host_debug_data(void) !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) host_data_set_flag(HAS_TRBE); } + +void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) +{ + u64 mdscr; + + /* Must be called before kvm_vcpu_load_vhe() */ + KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm); + + /* + * Determine which of the possible debug states we're in: + * + * - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's + * breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do + * software step or emulate the effects of the OS Lock being enabled. + * + * - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and + * the breakpoint/watchpoint registers need to be loaded eagerly. + * + * - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint + * context needs to be loaded on the CPU. + */ + if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { + vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; + } else { + mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); + + if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE)) + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + else + vcpu->arch.debug_owner = VCPU_DEBUG_FREE; + } +} + +/* + * Updates ownership of the debug registers after a trapped guest access to a + * breakpoint/watchpoint register. Host ownership of the debug registers is of + * strictly higher priority, and it is the responsibility of the VMM to emulate + * guest debug exceptions in this configuration. + */ +void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) +{ + if (kvm_host_owns_debug_regs(vcpu)) + return; + + vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e2a5c2918d9e..e45a096be669 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -656,6 +656,7 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, if (p->is_write) vcpu_set_flag(vcpu, DEBUG_DIRTY); + kvm_debug_set_guest_ownership(vcpu); trace_trap_reg(__func__, r->reg, p->is_write, p->regval); return true; @@ -684,6 +685,7 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; + kvm_debug_set_guest_ownership(vcpu); vcpu_set_flag(vcpu, DEBUG_DIRTY); } From 4cefbec97d80247083b84ccc86e32a78c119f705 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:04 -0800 Subject: [PATCH 07/87] KVM: arm64: Clean up KVM_SET_GUEST_DEBUG handler No particular reason other than it isn't nice to look at. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-8-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/guest.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 12dad841f2a5..1fe097c67766 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -917,31 +917,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { - int ret = 0; - trace_kvm_set_guest_debug(vcpu, dbg->control); - if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) { - ret = -EINVAL; - goto out; - } + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) + return -EINVAL; - if (dbg->control & KVM_GUESTDBG_ENABLE) { - vcpu->guest_debug = dbg->control; - - /* Hardware assisted Break and Watch points */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - vcpu->arch.external_debug_state = dbg->arch; - } - - } else { - /* If not enabled clear all flags */ + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + return 0; } -out: - return ret; + vcpu->guest_debug = dbg->control; + + /* Hardware assisted Break and Watch points */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + vcpu->arch.external_debug_state = dbg->arch; + + return 0; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, From 58db67e9accca6d146916203a943568f63754697 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:05 -0800 Subject: [PATCH 08/87] KVM: arm64: Select debug state to save/restore based on debug owner Select the set of debug registers to use based on the owner rather than relying on debug_ptr. Besides the code cleanup, this allows us to eliminate a couple instances kern_hyp_va() as well. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-9-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f65b30bab0ab..5ef1b2f69e89 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1357,6 +1357,8 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); #define kvm_vcpu_os_lock_enabled(vcpu) \ (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK)) +#define kvm_debug_regs_in_use(vcpu) \ + ((vcpu)->arch.debug_owner != VCPU_DEBUG_FREE) #define kvm_host_owns_debug_regs(vcpu) \ ((vcpu)->arch.debug_owner == VCPU_DEBUG_HOST_OWNED) diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index d00093699aaf..9e3051225039 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -88,6 +88,21 @@ default: write_debug(ptr[0], reg, 0); \ } +static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) +{ + switch (vcpu->arch.debug_owner) { + case VCPU_DEBUG_FREE: + WARN_ON_ONCE(1); + fallthrough; + case VCPU_DEBUG_GUEST_OWNED: + return &vcpu->arch.vcpu_debug_state; + case VCPU_DEBUG_HOST_OWNED: + return &vcpu->arch.external_debug_state; + } + + return NULL; +} + static void __debug_save_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { @@ -132,13 +147,13 @@ static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; host_dbg = host_data_ptr(host_debug_state.regs); - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(host_dbg, host_ctxt); __debug_restore_state(guest_dbg, guest_ctxt); @@ -151,13 +166,13 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) struct kvm_guest_debug_arch *host_dbg; struct kvm_guest_debug_arch *guest_dbg; - if (!vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (!kvm_debug_regs_in_use(vcpu)) return; host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; host_dbg = host_data_ptr(host_debug_state.regs); - guest_dbg = kern_hyp_va(vcpu->arch.debug_ptr); + guest_dbg = __vcpu_debug_regs(vcpu); __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); From 3b7780945cc8494793040ab0a9805b77b7826abb Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:06 -0800 Subject: [PATCH 09/87] KVM: arm64: Remove debug tracepoints The debug tracepoints are a useless firehose of information that track implementation detail rather than well-defined events. These are going to be rather difficult to uphold now that the implementation is getting redone, so throw them out instead of bending over backwards. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-10-oliver.upton@linux.dev [maz: fixed compilation after trace-ectomy] Signed-off-by: Marc Zyngier --- arch/arm64/kvm/debug.c | 34 -------------- arch/arm64/kvm/sys_regs.c | 11 ----- arch/arm64/kvm/trace_handle_exit.h | 75 ------------------------------ 3 files changed, 120 deletions(-) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 548016439aca..9979fc1a20bd 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -14,7 +14,6 @@ #include #include -#include "trace.h" /* * save/restore_guest_debug_regs @@ -35,10 +34,6 @@ static void save_guest_debug_regs(struct kvm_vcpu *vcpu) u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); vcpu->arch.guest_debug_preserved.mdscr_el1 = val; - - trace_kvm_arm_set_dreg32("Saved MDSCR_EL1", - vcpu->arch.guest_debug_preserved.mdscr_el1); - vcpu->arch.guest_debug_preserved.pstate_ss = (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); } @@ -49,9 +44,6 @@ static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); - trace_kvm_arm_set_dreg32("Restored MDSCR_EL1", - vcpu_read_sys_reg(vcpu, MDSCR_EL1)); - if (vcpu->arch.guest_debug_preserved.pstate_ss) *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; else @@ -110,8 +102,6 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); preempt_enable(); - - trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); } /** @@ -158,8 +148,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) { unsigned long mdscr; - trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); - kvm_arm_setup_mdcr_el2(vcpu); /* Check if we need to use the debug registers. */ @@ -209,8 +197,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); } - trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu)); - /* * HW Breakpoints and watchpoints * @@ -228,14 +214,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; vcpu_set_flag(vcpu, DEBUG_DIRTY); - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); - - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); - /* * The OS Lock blocks debug exceptions in all ELs when it is * enabled. If the guest has enabled the OS Lock, constrain its @@ -257,14 +235,10 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) /* If KDE or MDE are set, perform a full save/restore cycle. */ if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) vcpu_set_flag(vcpu, DEBUG_DIRTY); - - trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); } void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) { - trace_kvm_arm_clear_debug(vcpu->guest_debug); - /* * Restore the guest's debug registers if we were using them. */ @@ -286,14 +260,6 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { kvm_arm_reset_debug_ptr(vcpu); - - trace_kvm_arm_set_regset("BKPTS", get_num_brps(), - &vcpu->arch.debug_ptr->dbg_bcr[0], - &vcpu->arch.debug_ptr->dbg_bvr[0]); - - trace_kvm_arm_set_regset("WAPTS", get_num_wrps(), - &vcpu->arch.debug_ptr->dbg_wcr[0], - &vcpu->arch.debug_ptr->dbg_wvr[0]); } } } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e45a096be669..8c5771572133 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -657,8 +657,6 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, vcpu_set_flag(vcpu, DEBUG_DIRTY); kvm_debug_set_guest_ownership(vcpu); - trace_trap_reg(__func__, r->reg, p->is_write, p->regval); - return true; } @@ -711,8 +709,6 @@ static bool trap_bvr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - return true; } @@ -748,8 +744,6 @@ static bool trap_bcr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - return true; } @@ -785,9 +779,6 @@ static bool trap_wvr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->CRm, p->is_write, - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]); - return true; } @@ -823,8 +814,6 @@ static bool trap_wcr(struct kvm_vcpu *vcpu, else dbg_to_reg(vcpu, p, rd, dbg_reg); - trace_trap_reg(__func__, rd->CRm, p->is_write, *dbg_reg); - return true; } diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 064a58c19f48..f85415db7713 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -46,38 +46,6 @@ TRACE_EVENT(kvm_hvc_arm64, __entry->vcpu_pc, __entry->r0, __entry->imm) ); -TRACE_EVENT(kvm_arm_setup_debug, - TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug), - TP_ARGS(vcpu, guest_debug), - - TP_STRUCT__entry( - __field(struct kvm_vcpu *, vcpu) - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->vcpu = vcpu; - __entry->guest_debug = guest_debug; - ), - - TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug) -); - -TRACE_EVENT(kvm_arm_clear_debug, - TP_PROTO(__u32 guest_debug), - TP_ARGS(guest_debug), - - TP_STRUCT__entry( - __field(__u32, guest_debug) - ), - - TP_fast_assign( - __entry->guest_debug = guest_debug; - ), - - TP_printk("flags: 0x%08x", __entry->guest_debug) -); - /* * The dreg32 name is a leftover from a distant past. This will really * output a 64bit value... @@ -99,49 +67,6 @@ TRACE_EVENT(kvm_arm_set_dreg32, TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); -TRACE_DEFINE_SIZEOF(__u64); - -TRACE_EVENT(kvm_arm_set_regset, - TP_PROTO(const char *type, int len, __u64 *control, __u64 *value), - TP_ARGS(type, len, control, value), - TP_STRUCT__entry( - __field(const char *, name) - __field(int, len) - __array(u64, ctrls, 16) - __array(u64, values, 16) - ), - TP_fast_assign( - __entry->name = type; - __entry->len = len; - memcpy(__entry->ctrls, control, len << 3); - memcpy(__entry->values, value, len << 3); - ), - TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name, - __print_array(__entry->ctrls, __entry->len, sizeof(__u64)), - __print_array(__entry->values, __entry->len, sizeof(__u64))) -); - -TRACE_EVENT(trap_reg, - TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value), - TP_ARGS(fn, reg, is_write, write_value), - - TP_STRUCT__entry( - __field(const char *, fn) - __field(int, reg) - __field(bool, is_write) - __field(u64, write_value) - ), - - TP_fast_assign( - __entry->fn = fn; - __entry->reg = reg; - __entry->is_write = is_write; - __entry->write_value = write_value; - ), - - TP_printk("%s %s reg %d (0x%016llx)", __entry->fn, __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value) -); - TRACE_EVENT(kvm_handle_sys_reg, TP_PROTO(unsigned long hsr), TP_ARGS(hsr), From 803602b0d94168bd25f5ff6eafdfd9388a6dd2ec Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:07 -0800 Subject: [PATCH 10/87] KVM: arm64: Remove vestiges of debug_ptr Delete the remnants of debug_ptr now that debug registers are selected based on the debug owner instead. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-11-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 5 ----- arch/arm64/kvm/arm.c | 2 -- arch/arm64/kvm/debug.c | 30 +----------------------------- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 -- 4 files changed, 1 insertion(+), 38 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5ef1b2f69e89..905b84e59c24 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -748,11 +748,7 @@ struct kvm_vcpu_arch { * * external_debug_state contains the debug values we want to debug the * guest. This is set via the KVM_SET_GUEST_DEBUG ioctl. - * - * debug_ptr points to the set of debug registers that should be loaded - * onto the hardware when running the guest. */ - struct kvm_guest_debug_arch *debug_ptr; struct kvm_guest_debug_arch vcpu_debug_state; struct kvm_guest_debug_arch external_debug_state; @@ -1350,7 +1346,6 @@ void kvm_init_host_debug_data(void); void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); -void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a068337da52a..44a6093b0d9e 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -476,8 +476,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_pmu_vcpu_init(vcpu); - kvm_arm_reset_debug_ptr(vcpu); - kvm_arm_pvtime_vcpu_init(&vcpu->arch); vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 9979fc1a20bd..f39004c52d33 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -118,16 +118,6 @@ void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) preempt_enable(); } -/** - * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state - * @vcpu: the vcpu pointer - */ - -void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state; -} - /** * kvm_arm_setup_debug - set up debug related stuff * @@ -198,20 +188,13 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) } /* - * HW Breakpoints and watchpoints - * - * We simply switch the debug_ptr to point to our new - * external_debug_state which has been populated by the - * debug ioctl. The existing DEBUG_DIRTY mechanism ensures - * the registers are updated on the world switch. + * Enable breakpoints and watchpoints if userspace wants them. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - /* Enable breakpoints/watchpoints */ mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); mdscr |= DBG_MDSCR_MDE; vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; vcpu_set_flag(vcpu, DEBUG_DIRTY); /* @@ -229,9 +212,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) } } - BUG_ON(!vcpu->guest_debug && - vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state); - /* If KDE or MDE are set, perform a full save/restore cycle. */ if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) vcpu_set_flag(vcpu, DEBUG_DIRTY); @@ -253,14 +233,6 @@ void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) } restore_guest_debug_regs(vcpu); - - /* - * If we were using HW debug we need to restore the - * debug_ptr to the guest debug state. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - kvm_arm_reset_debug_ptr(vcpu); - } } } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 16f5da3a884a..f98ef98af183 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -112,8 +112,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; - hyp_vcpu->vcpu.arch.debug_ptr = kern_hyp_va(host_vcpu->arch.debug_ptr); - hyp_vcpu->vcpu.arch.vsesr_el2 = host_vcpu->arch.vsesr_el2; hyp_vcpu->vcpu.arch.vgic_cpu.vgic_v3 = host_vcpu->arch.vgic_cpu.vgic_v3; From beb470d96cec8dd8f4e05b2135c74d828f7b114b Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:08 -0800 Subject: [PATCH 11/87] KVM: arm64: Use debug_owner to track if debug regs need save/restore Use the debug owner to determine if the debug regs are in use instead of keeping around the DEBUG_DIRTY flag. Debug registers are now saved/restored after the first trap, regardless of whether it was a read or a write. This also shifts the point at which KVM becomes lazy to vcpu_put() rather than the next exception taken from the guest. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-12-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 4 +-- arch/arm64/kvm/debug.c | 19 ++----------- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 2 -- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 4 +-- arch/arm64/kvm/sys_regs.c | 33 ---------------------- 5 files changed, 7 insertions(+), 55 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 905b84e59c24..a6796a952af7 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -917,8 +917,6 @@ struct kvm_vcpu_arch { #define EXCEPT_AA64_EL2_IRQ __vcpu_except_flags(5) #define EXCEPT_AA64_EL2_FIQ __vcpu_except_flags(6) #define EXCEPT_AA64_EL2_SERR __vcpu_except_flags(7) -/* Guest debug is live */ -#define DEBUG_DIRTY __vcpu_single_flag(iflags, BIT(4)) /* Physical CPU not in supported_cpus */ #define ON_UNSUPPORTED_CPU __vcpu_single_flag(sflags, BIT(0)) @@ -1356,6 +1354,8 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); ((vcpu)->arch.debug_owner != VCPU_DEBUG_FREE) #define kvm_host_owns_debug_regs(vcpu) \ ((vcpu)->arch.debug_owner == VCPU_DEBUG_HOST_OWNED) +#define kvm_guest_owns_debug_regs(vcpu) \ + ((vcpu)->arch.debug_owner == VCPU_DEBUG_GUEST_OWNED) int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index f39004c52d33..a4ae17c31fa8 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -86,15 +86,9 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; /* - * Trap debug register access when one of the following is true: - * - Userspace is using the hardware to debug the guest - * (KVM_GUESTDBG_USE_HW is set). - * - The guest is not using debug (DEBUG_DIRTY clear). - * - The guest has enabled the OS Lock (debug exceptions are blocked). + * Trap debug registers if the guest doesn't have ownership of them. */ - if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || - !vcpu_get_flag(vcpu, DEBUG_DIRTY) || - kvm_vcpu_os_lock_enabled(vcpu)) + if (!kvm_guest_owns_debug_regs(vcpu)) vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; /* Write MDCR_EL2 directly if we're already at EL2 */ @@ -127,8 +121,7 @@ void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) * debug related registers. * * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them (see the DEBUG_DIRTY - * flag on vcpu->arch.iflags). Since the guest must not interfere + * the guest is not actively using them. Since the guest must not interfere * with the hardware state when debugging the guest, we must ensure that * trapping is enabled whenever we are debugging the guest using the * debug registers. @@ -195,8 +188,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) mdscr |= DBG_MDSCR_MDE; vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - vcpu_set_flag(vcpu, DEBUG_DIRTY); - /* * The OS Lock blocks debug exceptions in all ELs when it is * enabled. If the guest has enabled the OS Lock, constrain its @@ -211,10 +202,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); } } - - /* If KDE or MDE are set, perform a full save/restore cycle. */ - if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) - vcpu_set_flag(vcpu, DEBUG_DIRTY); } void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 9e3051225039..c14586d87361 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -176,8 +176,6 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - vcpu_clear_flag(vcpu, DEBUG_DIRTY); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index a651c43ad679..a998b2f6abcb 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -283,7 +283,7 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); } @@ -300,7 +300,7 @@ static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2); write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2); - if (has_vhe() || vcpu_get_flag(vcpu, DEBUG_DIRTY)) + if (has_vhe() || kvm_debug_regs_in_use(vcpu)) write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 8c5771572133..66db77fb6201 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -621,40 +621,11 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, } } -/* - * We want to avoid world-switching all the DBG registers all the - * time: - * - * - If we've touched any debug register, it is likely that we're - * going to touch more of them. It then makes sense to disable the - * traps and start doing the save/restore dance - * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is - * then mandatory to save/restore the registers, as the guest - * depends on them. - * - * For this, we use a DIRTY bit, indicating the guest has modified the - * debug registers, used as follow: - * - * On guest entry: - * - If the dirty bit is set (because we're coming back from trapping), - * disable the traps, save host registers, restore guest registers. - * - If debug is actively in use (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), - * set the dirty bit, disable the traps, save host registers, - * restore guest registers. - * - Otherwise, enable the traps - * - * On guest exit: - * - If the dirty bit is set, save guest registers, restore host - * registers and clear the dirty bit. This ensure that the host can - * now use the debug registers. - */ static bool trap_debug_regs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { access_rw(vcpu, p, r); - if (p->is_write) - vcpu_set_flag(vcpu, DEBUG_DIRTY); kvm_debug_set_guest_ownership(vcpu); return true; @@ -665,9 +636,6 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu, * * A 32 bit write to a debug register leave top bits alone * A 32 bit read from a debug register only returns the bottom bits - * - * All writes will set the DEBUG_DIRTY flag to ensure the hyp code - * switches between host and guest values in future. */ static void reg_to_dbg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -684,7 +652,6 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, *dbg_reg = val; kvm_debug_set_guest_ownership(vcpu); - vcpu_set_flag(vcpu, DEBUG_DIRTY); } static void dbg_to_reg(struct kvm_vcpu *vcpu, From 06d22a9c1b94006ebfa67693a38baed86b9a75e8 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:09 -0800 Subject: [PATCH 12/87] KVM: arm64: Reload vCPU for accesses to OSLAR_EL1 KVM takes ownership of the debug regs if the guest enables the OS lock, as it needs to use MDSCR_EL1 to mask debug exceptions. Just reload the vCPU if the guest toggles the OS lock, relying on kvm_vcpu_load_debug() to update the debug owner and get the right trap configuration in place. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-13-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/debug.c | 13 +++++++++++++ arch/arm64/kvm/sys_regs.c | 9 +-------- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index a6796a952af7..d61b8ed32a21 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1346,6 +1346,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); +void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val); #define kvm_vcpu_os_lock_enabled(vcpu) \ (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK)) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index a4ae17c31fa8..dec023673d4c 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -288,3 +288,16 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; } + +void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) +{ + if (val & OSLAR_EL1_OSLK) + __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK; + else + __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK; + + preempt_disable(); + kvm_arch_vcpu_put(vcpu); + kvm_arch_vcpu_load(vcpu, smp_processor_id()); + preempt_enable(); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 66db77fb6201..ed4adbf5a25e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -570,17 +570,10 @@ static bool trap_oslar_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 oslsr; - if (!p->is_write) return read_from_write_only(vcpu, p, r); - /* Forward the OSLK bit to OSLSR */ - oslsr = __vcpu_sys_reg(vcpu, OSLSR_EL1) & ~OSLSR_EL1_OSLK; - if (p->regval & OSLAR_EL1_OSLK) - oslsr |= OSLSR_EL1_OSLK; - - __vcpu_sys_reg(vcpu, OSLSR_EL1) = oslsr; + kvm_debug_handle_oslar(vcpu, p->regval); return true; } From 75a5fbaf6623328d3ac69719145c2247f7b4e299 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:10 -0800 Subject: [PATCH 13/87] KVM: arm64: Compute MDCR_EL2 at vcpu_load() KVM has picked up several hacks to cope with vcpu->arch.mdcr_el2 needing to be prepared before vcpu_load(), which is when it gets programmed into hardware on VHE. Now that the flows for reprogramming MDCR_EL2 have been simplified, move that computation to vcpu_load(). Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-14-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/arm.c | 2 -- arch/arm64/kvm/debug.c | 19 +++---------------- 3 files changed, 3 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d61b8ed32a21..c630d59e7f4c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1341,7 +1341,6 @@ static inline bool kvm_system_needs_idmapped_vectors(void) static inline void kvm_arch_sync_events(struct kvm *kvm) {} void kvm_init_host_debug_data(void); -void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 44a6093b0d9e..ec581aeb41f9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -804,8 +804,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) kvm_init_mpidr_data(kvm); - kvm_arm_vcpu_init_debug(vcpu); - if (likely(irqchip_in_kernel(kvm))) { /* * Map the VGIC hardware resources before running a vcpu the diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index dec023673d4c..cc9599a7df5f 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -98,20 +98,6 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) preempt_enable(); } -/** - * kvm_arm_vcpu_init_debug - setup vcpu debug traps - * - * @vcpu: the vcpu pointer - * - * Set vcpu initial mdcr_el2 value. - */ -void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) -{ - preempt_disable(); - kvm_arm_setup_mdcr_el2(vcpu); - preempt_enable(); -} - /** * kvm_arm_setup_debug - set up debug related stuff * @@ -131,8 +117,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) { unsigned long mdscr; - kvm_arm_setup_mdcr_el2(vcpu); - /* Check if we need to use the debug registers. */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { /* Save guest debug state */ @@ -273,6 +257,8 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) else vcpu->arch.debug_owner = VCPU_DEBUG_FREE; } + + kvm_arm_setup_mdcr_el2(vcpu); } /* @@ -287,6 +273,7 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) return; vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + kvm_arm_setup_mdcr_el2(vcpu); } void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) From 4ad3a0b87f2ec4714fdfa6bd5de57b4c30e15753 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:11 -0800 Subject: [PATCH 14/87] KVM: arm64: Don't hijack guest context MDSCR_EL1 Stealing MDSCR_EL1 in the guest's kvm_cpu_context for external debugging is rather gross. Just add a field for this instead and let the context switch code pick the correct one based on the debug owner. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-15-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/debug.c | 75 +++++++++++----------- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 39 +++++++---- 3 files changed, 64 insertions(+), 52 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index c630d59e7f4c..e7036096d9c2 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -751,6 +751,7 @@ struct kvm_vcpu_arch { */ struct kvm_guest_debug_arch vcpu_debug_state; struct kvm_guest_debug_arch external_debug_state; + u64 external_mdscr_el1; enum { VCPU_DEBUG_FREE, @@ -771,7 +772,6 @@ struct kvm_vcpu_arch { * are using guest debug. */ struct { - u32 mdscr_el1; bool pstate_ss; } guest_debug_preserved; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index cc9599a7df5f..db3d74e63bc3 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -31,19 +31,12 @@ */ static void save_guest_debug_regs(struct kvm_vcpu *vcpu) { - u64 val = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - - vcpu->arch.guest_debug_preserved.mdscr_el1 = val; vcpu->arch.guest_debug_preserved.pstate_ss = (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); } static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) { - u64 val = vcpu->arch.guest_debug_preserved.mdscr_el1; - - vcpu_write_sys_reg(vcpu, val, MDSCR_EL1); - if (vcpu->arch.guest_debug_preserved.pstate_ss) *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; else @@ -115,8 +108,6 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) { - unsigned long mdscr; - /* Check if we need to use the debug registers. */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { /* Save guest debug state */ @@ -154,36 +145,6 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; else *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; - - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } else { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_SS; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - } - - /* - * Enable breakpoints and watchpoints if userspace wants them. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr |= DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); - - /* - * The OS Lock blocks debug exceptions in all ELs when it is - * enabled. If the guest has enabled the OS Lock, constrain its - * effects to the guest. Emulate the behavior by clearing - * MDSCR_EL1.MDE. In so doing, we ensure that host debug - * exceptions are unaffected by guest configuration of the OS - * Lock. - */ - } else if (kvm_vcpu_os_lock_enabled(vcpu)) { - mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); - mdscr &= ~DBG_MDSCR_MDE; - vcpu_write_sys_reg(vcpu, mdscr, MDSCR_EL1); } } } @@ -227,6 +188,41 @@ void kvm_init_host_debug_data(void) host_data_set_flag(HAS_TRBE); } +/* + * Configures the 'external' MDSCR_EL1 value for the guest, i.e. when the host + * has taken over MDSCR_EL1. + * + * - Userspace is single-stepping the guest, and MDSCR_EL1.SS is forced to 1. + * + * - Userspace is using the breakpoint/watchpoint registers to debug the + * guest, and MDSCR_EL1.MDE is forced to 1. + * + * - The guest has enabled the OS Lock, and KVM is forcing MDSCR_EL1.MDE to 0, + * masking all debug exceptions affected by the OS Lock. + */ +static void setup_external_mdscr(struct kvm_vcpu *vcpu) +{ + /* + * Use the guest's MDSCR_EL1 as a starting point, since there are + * several other features controlled by MDSCR_EL1 that are not relevant + * to the host. + * + * Clear the bits that KVM may use which also satisfies emulation of + * the OS Lock as MDSCR_EL1.MDE is cleared. + */ + u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1) & ~(MDSCR_EL1_SS | + MDSCR_EL1_MDE | + MDSCR_EL1_KDE); + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + mdscr |= MDSCR_EL1_SS; + + if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) + mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE; + + vcpu->arch.external_mdscr_el1 = mdscr; +} + void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) { u64 mdscr; @@ -249,6 +245,7 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; + setup_external_mdscr(vcpu); } else { mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index a998b2f6abcb..76ff095c6b6e 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -18,9 +18,34 @@ static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt); +static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; + + if (!vcpu) + vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); + + return vcpu; +} + +static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt) +{ + return host_data_ptr(host_ctxt) != ctxt; +} + +static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt) +{ + struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); + + if (ctxt_is_guest(ctxt) && kvm_host_owns_debug_regs(vcpu)) + return &vcpu->arch.external_mdscr_el1; + + return &ctxt_sys_reg(ctxt, MDSCR_EL1); +} + static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt) { - ctxt_sys_reg(ctxt, MDSCR_EL1) = read_sysreg(mdscr_el1); + *ctxt_mdscr_el1(ctxt) = read_sysreg(mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) @@ -33,16 +58,6 @@ static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt) ctxt_sys_reg(ctxt, TPIDRRO_EL0) = read_sysreg(tpidrro_el0); } -static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt) -{ - struct kvm_vcpu *vcpu = ctxt->__hyp_running_vcpu; - - if (!vcpu) - vcpu = container_of(ctxt, struct kvm_vcpu, arch.ctxt); - - return vcpu; -} - static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt) { struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt); @@ -139,7 +154,7 @@ static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt) static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt) { - write_sysreg(ctxt_sys_reg(ctxt, MDSCR_EL1), mdscr_el1); + write_sysreg(*ctxt_mdscr_el1(ctxt), mdscr_el1); // POR_EL0 can affect uaccess, so must be saved/restored early. if (ctxt_has_s1poe(ctxt)) From 2ca3f03bf524c98afa421c479689f1e7dc030bf0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 20 Dec 2024 08:59:48 +0000 Subject: [PATCH 15/87] KVM: arm64: Manage software step state at load/put KVM takes over the guest's software step state machine if the VMM is debugging the guest, but it does the save/restore fiddling for every guest entry. Note that the only constraint on host usage of software step is that the guest's configuration remains visible to userspace via the ONE_REG ioctls. So, we can cut down on the amount of fiddling by doing this at load/put instead. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-16-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 24 ++--- arch/arm64/kvm/arm.c | 4 +- arch/arm64/kvm/debug.c | 145 ++++++++---------------------- arch/arm64/kvm/guest.c | 2 +- arch/arm64/kvm/handle_exit.c | 2 +- 5 files changed, 48 insertions(+), 129 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e7036096d9c2..d48516de778d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -764,17 +764,6 @@ struct kvm_vcpu_arch { struct arch_timer_cpu timer_cpu; struct kvm_pmu pmu; - /* - * Guest registers we preserve during guest debugging. - * - * These shadow registers are updated by the kvm_handle_sys_reg - * trap handler if the guest accesses or updates them while we - * are using guest debug. - */ - struct { - bool pstate_ss; - } guest_debug_preserved; - /* vcpu power state */ struct kvm_mp_state mp_state; spinlock_t mp_state_lock; @@ -924,12 +913,14 @@ struct kvm_vcpu_arch { #define IN_WFIT __vcpu_single_flag(sflags, BIT(1)) /* vcpu system registers loaded on physical CPU */ #define SYSREGS_ON_CPU __vcpu_single_flag(sflags, BIT(2)) -/* Software step state is Active-pending */ -#define DBG_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(3)) +/* Software step state is Active-pending for external debug */ +#define HOST_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(3)) +/* Software step state is Active pending for guest debug */ +#define GUEST_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(4)) /* PMUSERENR for the guest EL0 is on physical CPU */ -#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(4)) +#define PMUSERENR_ON_CPU __vcpu_single_flag(sflags, BIT(5)) /* WFI instruction trapped */ -#define IN_WFI __vcpu_single_flag(sflags, BIT(5)) +#define IN_WFI __vcpu_single_flag(sflags, BIT(6)) /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */ @@ -1341,9 +1332,8 @@ static inline bool kvm_system_needs_idmapped_vectors(void) static inline void kvm_arch_sync_events(struct kvm *kvm) {} void kvm_init_host_debug_data(void); -void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); -void kvm_arm_clear_debug(struct kvm_vcpu *vcpu); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); +void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu); void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu); void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ec581aeb41f9..563cd0b626b9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -622,6 +622,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + kvm_vcpu_put_debug(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) kvm_vcpu_put_vhe(vcpu); @@ -1181,7 +1182,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } - kvm_arm_setup_debug(vcpu); kvm_arch_vcpu_ctxflush_fp(vcpu); /************************************************************** @@ -1198,8 +1198,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Back from guest *************************************************************/ - kvm_arm_clear_debug(vcpu); - /* * We must sync the PMU state before the vgic state so * that the vgic can properly sample the updated state of the diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index db3d74e63bc3..7d3c71d65518 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -3,7 +3,8 @@ * Debug and Guest Debug support * * Copyright (C) 2015 - Linaro Ltd - * Author: Alex Bennée + * Authors: Alex Bennée + * Oliver Upton */ #include @@ -14,35 +15,6 @@ #include #include - -/* - * save/restore_guest_debug_regs - * - * For some debug operations we need to tweak some guest registers. As - * a result we need to save the state of those registers before we - * make those modifications. - * - * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled - * after we have restored the preserved value to the main context. - * - * When single-step is enabled by userspace, we tweak PSTATE.SS on every - * guest entry. Preserve PSTATE.SS so we can restore the original value - * for the vcpu after the single-step is disabled. - */ -static void save_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - vcpu->arch.guest_debug_preserved.pstate_ss = - (*vcpu_cpsr(vcpu) & DBG_SPSR_SS); -} - -static void restore_guest_debug_regs(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.guest_debug_preserved.pstate_ss) - *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; - else - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; -} - /** * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value * @@ -91,83 +63,6 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) preempt_enable(); } -/** - * kvm_arm_setup_debug - set up debug related stuff - * - * @vcpu: the vcpu pointer - * - * This is called before each entry into the hypervisor to setup any - * debug related registers. - * - * Additionally, KVM only traps guest accesses to the debug registers if - * the guest is not actively using them. Since the guest must not interfere - * with the hardware state when debugging the guest, we must ensure that - * trapping is enabled whenever we are debugging the guest using the - * debug registers. - */ - -void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) -{ - /* Check if we need to use the debug registers. */ - if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - /* Save guest debug state */ - save_guest_debug_regs(vcpu); - - /* - * Single Step (ARM ARM D2.12.3 The software step state - * machine) - * - * If we are doing Single Step we need to manipulate - * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the - * step has occurred the hypervisor will trap the - * debug exception and we return to userspace. - * - * If the guest attempts to single step its userspace - * we would have to deal with a trapped exception - * while in the guest kernel. Because this would be - * hard to unwind we suppress the guest's ability to - * do so by masking MDSCR_EL.SS. - * - * This confuses guest debuggers which use - * single-step behind the scenes but everything - * returns to normal once the host is no longer - * debugging the system. - */ - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - /* - * If the software step state at the last guest exit - * was Active-pending, we don't set DBG_SPSR_SS so - * that the state is maintained (to not run another - * single-step until the pending Software Step - * exception is taken). - */ - if (!vcpu_get_flag(vcpu, DBG_SS_ACTIVE_PENDING)) - *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; - else - *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; - } - } -} - -void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) -{ - /* - * Restore the guest's debug registers if we were using them. - */ - if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) - /* - * Mark the vcpu as ACTIVE_PENDING - * until Software Step exception is taken. - */ - vcpu_set_flag(vcpu, DBG_SS_ACTIVE_PENDING); - } - - restore_guest_debug_regs(vcpu); - } -} - void kvm_init_host_debug_data(void) { u64 dfr0 = read_sysreg(id_aa64dfr0_el1); @@ -246,6 +141,22 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; setup_external_mdscr(vcpu); + + /* + * Steal the guest's single-step state machine if userspace wants + * single-step the guest. + */ + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { + if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS) + vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + else + vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING); + + if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + } } else { mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1); @@ -258,6 +169,26 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) kvm_arm_setup_mdcr_el2(vcpu); } +void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu) +{ + if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + return; + + /* + * Save the host's software step state and restore the guest's before + * potentially returning to userspace. + */ + if (!(*vcpu_cpsr(vcpu) & DBG_SPSR_SS)) + vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING); + else + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); + + if (vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING)) + *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS; + else + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; +} + /* * Updates ownership of the debug registers after a trapped guest access to a * breakpoint/watchpoint register. Host ownership of the debug registers is of diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1fe097c67766..2196979a24a3 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -924,7 +924,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->guest_debug = 0; - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING); return 0; } diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index d7c2990e7c9e..1e302f0c8903 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -193,7 +193,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) run->debug.arch.far = vcpu->arch.fault.far_el2; break; case ESR_ELx_EC_SOFTSTP_LOW: - vcpu_clear_flag(vcpu, DBG_SS_ACTIVE_PENDING); + *vcpu_cpsr(vcpu) |= DBG_SPSR_SS; break; } From b0ee51033ae35461ae98c465426f0002b8370679 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:13 -0800 Subject: [PATCH 16/87] KVM: arm64: nv: Honor MDCR_EL2.TDE routing for debug exceptions Inject debug exceptions into vEL2 if MDCR_EL2.TDE is set. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-17-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 1 + arch/arm64/kvm/emulate-nested.c | 23 +++++++++++++++++++---- arch/arm64/kvm/handle_exit.c | 3 +++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 233e65522716..ec6e28d63d26 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -64,6 +64,7 @@ static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0) } extern bool forward_smc_trap(struct kvm_vcpu *vcpu); +extern bool forward_debug_exception(struct kvm_vcpu *vcpu); extern void kvm_init_nested(struct kvm *kvm); extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu); extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu); diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 1ffbfd1c3cf2..e37fb598cc24 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -2345,14 +2345,14 @@ bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index) return true; } -static bool forward_traps(struct kvm_vcpu *vcpu, u64 control_bit) +static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit) { bool control_bit_set; if (!vcpu_has_nv(vcpu)) return false; - control_bit_set = __vcpu_sys_reg(vcpu, HCR_EL2) & control_bit; + control_bit_set = __vcpu_sys_reg(vcpu, reg) & control_bit; if (!is_hyp_ctxt(vcpu) && control_bit_set) { kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu)); return true; @@ -2360,9 +2360,24 @@ static bool forward_traps(struct kvm_vcpu *vcpu, u64 control_bit) return false; } +static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, HCR_EL2, control_bit); +} + bool forward_smc_trap(struct kvm_vcpu *vcpu) { - return forward_traps(vcpu, HCR_TSC); + return forward_hcr_traps(vcpu, HCR_TSC); +} + +static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit) +{ + return __forward_traps(vcpu, MDCR_EL2, control_bit); +} + +bool forward_debug_exception(struct kvm_vcpu *vcpu) +{ + return forward_mdcr_traps(vcpu, MDCR_EL2_TDE); } static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr) @@ -2406,7 +2421,7 @@ void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu) * Forward this trap to the virtual EL2 if the virtual * HCR_EL2.NV bit is set and this is coming from !EL2. */ - if (forward_traps(vcpu, HCR_NV)) + if (forward_hcr_traps(vcpu, HCR_NV)) return; spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 1e302f0c8903..512d152233ff 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -183,6 +183,9 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) struct kvm_run *run = vcpu->run; u64 esr = kvm_vcpu_get_esr(vcpu); + if (!vcpu->guest_debug && forward_debug_exception(vcpu)) + return 1; + run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = lower_32_bits(esr); run->debug.arch.hsr_high = upper_32_bits(esr); From 8c02c2bbd64375e603df79449f0eb2c57e1a597c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:14 -0800 Subject: [PATCH 17/87] KVM: arm64: Avoid reading ID_AA64DFR0_EL1 for debug save/restore Similar to other per-CPU profiling/debug features we handle, store the number of breakpoints/watchpoints in kvm_host_data to avoid reading the ID register 4 times on every guest entry/exit. And if you're in the nested virt business that's quite a few avoidable exits to the L0 hypervisor. Tested-by: James Clark Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-18-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 4 ++++ arch/arm64/kvm/debug.c | 3 +++ arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 17 ++++------------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d48516de778d..e7c740c99ee3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -661,6 +661,10 @@ struct kvm_host_data { /* Number of programmable event counters (PMCR_EL0.N) for this CPU */ unsigned int nr_event_counters; + + /* Number of debug breakpoints/watchpoints for this CPU (minus 1) */ + unsigned int debug_brps; + unsigned int debug_wrps; }; struct kvm_host_psci_config { diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 7d3c71d65518..d921e7f7bd59 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -71,6 +71,9 @@ void kvm_init_host_debug_data(void) *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N, read_sysreg(pmcr_el0)); + *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0); + *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0); + if (has_vhe()) return; diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index c14586d87361..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -106,12 +106,8 @@ static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) static void __debug_save_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); save_debug(dbg->dbg_bcr, dbgbcr, brps); save_debug(dbg->dbg_bvr, dbgbvr, brps); @@ -124,13 +120,8 @@ static void __debug_save_state(struct kvm_guest_debug_arch *dbg, static void __debug_restore_state(struct kvm_guest_debug_arch *dbg, struct kvm_cpu_context *ctxt) { - u64 aa64dfr0; - int brps, wrps; - - aa64dfr0 = read_sysreg(id_aa64dfr0_el1); - - brps = (aa64dfr0 >> 12) & 0xf; - wrps = (aa64dfr0 >> 20) & 0xf; + int brps = *host_data_ptr(debug_brps); + int wrps = *host_data_ptr(debug_wrps); restore_debug(dbg->dbg_bcr, dbgbcr, brps); restore_debug(dbg->dbg_bvr, dbgbvr, brps); From 3ce9f3357e9e099f02feaa002769eb99ef1138cd Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:15 -0800 Subject: [PATCH 18/87] KVM: arm64: Fold DBGxVR/DBGxCR accessors into common set There is a nauseating amount of boilerplate for accessing the breakpoint and watchpoint registers. Fold everything together into a single set of accessors and select the right storage based on the sysreg encoding. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-19-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 207 ++++++++++++++------------------------ 1 file changed, 74 insertions(+), 133 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ed4adbf5a25e..d9c7911ec58a 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -658,143 +658,78 @@ static void dbg_to_reg(struct kvm_vcpu *vcpu, p->regval = (*dbg_reg & mask) >> shift; } -static bool trap_bvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) +static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; + struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state; + + switch (rd->Op2) { + case 0b100: + return &dbg->dbg_bvr[rd->CRm]; + case 0b101: + return &dbg->dbg_bcr[rd->CRm]; + case 0b110: + return &dbg->dbg_wvr[rd->CRm]; + case 0b111: + return &dbg->dbg_wcr[rd->CRm]; + default: + KVM_BUG_ON(1, vcpu->kvm); + return NULL; + } +} + +static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + const struct sys_reg_desc *rd) +{ + u64 *reg = demux_wb_reg(vcpu, rd); + + if (!reg) + return false; if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); + reg_to_dbg(vcpu, p, rd, reg); else - dbg_to_reg(vcpu, p, rd, dbg_reg); + dbg_to_reg(vcpu, p, rd, reg); return true; } -static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) +static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = val; + u64 *reg = demux_wb_reg(vcpu, rd); + + if (!reg) + return -EINVAL; + + *reg = val; return 0; } -static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) +static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 *val) { - *val = vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm]; + u64 *reg = demux_wb_reg(vcpu, rd); + + if (!reg) + return -EINVAL; + + *val = *reg; return 0; } -static u64 reset_bvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) +static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { - vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; - return rd->val; -} + u64 *reg = demux_wb_reg(vcpu, rd); -static bool trap_bcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; + /* + * Bail early if we couldn't find storage for the register, the + * KVM_BUG_ON() in demux_wb_reg() will prevent this VM from ever + * being run. + */ + if (!reg) + return 0; - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - return true; -} - -static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = val; - return 0; -} - -static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm]; - return 0; -} - -static u64 reset_bcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; - return rd->val; -} - -static bool trap_wvr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - return true; -} - -static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = val; - return 0; -} - -static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm]; - return 0; -} - -static u64 reset_wvr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; - return rd->val; -} - -static bool trap_wcr(struct kvm_vcpu *vcpu, - struct sys_reg_params *p, - const struct sys_reg_desc *rd) -{ - u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - - if (p->is_write) - reg_to_dbg(vcpu, p, rd, dbg_reg); - else - dbg_to_reg(vcpu, p, rd, dbg_reg); - - return true; -} - -static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 val) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = val; - return 0; -} - -static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, - u64 *val) -{ - *val = vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm]; - return 0; -} - -static u64 reset_wcr(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd) -{ - vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; + *reg = rd->val; return rd->val; } @@ -1303,13 +1238,17 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ - trap_bvr, reset_bvr, 0, 0, get_bvr, set_bvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ - trap_bcr, reset_bcr, 0, 0, get_bcr, set_bcr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ - trap_wvr, reset_wvr, 0, 0, get_wvr, set_wvr }, \ + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg }, \ { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ - trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } + trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0, \ + get_dbg_wb_reg, set_dbg_wb_reg } #define PMU_SYS_REG(name) \ SYS_DESC(SYS_##name), .reset = reset_pmu_reg, \ @@ -3523,18 +3462,20 @@ static bool trap_dbgdidr(struct kvm_vcpu *vcpu, * None of the other registers share their location, so treat them as * if they were 64bit. */ -#define DBG_BCR_BVR_WCR_WVR(n) \ - /* DBGBVRn */ \ - { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, \ - /* DBGBCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n }, \ - /* DBGWVRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n }, \ - /* DBGWCRn */ \ - { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n } +#define DBG_BCR_BVR_WCR_WVR(n) \ + /* DBGBVRn */ \ + { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4), \ + trap_dbg_wb_reg, NULL, n }, \ + /* DBGBCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWVRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n }, \ + /* DBGWCRn */ \ + { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n } -#define DBGBXVR(n) \ - { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_bvr, NULL, n } +#define DBGBXVR(n) \ + { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1), \ + trap_dbg_wb_reg, NULL, n } /* * Trapped cp14 registers. We generally ignore most of the external From c4a6ed85455979ef3fbadc2f1bdf18734b0ecea6 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 19 Dec 2024 14:41:16 -0800 Subject: [PATCH 19/87] KVM: arm64: Promote guest ownership for DBGxVR/DBGxCR reads Only yielding control of the debug registers for writes is a bit silly, unless of course you're a fan of pointless traps. Give control of the debug registers to the guest upon the first access, regardless of direction. Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20241219224116.3941496-20-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d9c7911ec58a..aac79e34cd50 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -643,8 +643,6 @@ static void reg_to_dbg(struct kvm_vcpu *vcpu, val &= ~mask; val |= (p->regval & (mask >> shift)) << shift; *dbg_reg = val; - - kvm_debug_set_guest_ownership(vcpu); } static void dbg_to_reg(struct kvm_vcpu *vcpu, @@ -690,6 +688,7 @@ static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, else dbg_to_reg(vcpu, p, rd, reg); + kvm_debug_set_guest_ownership(vcpu); return true; } From a1a1f1ff1f28d52deb39aa29b89663de2afefd67 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:42 +0000 Subject: [PATCH 20/87] KVM: arm64: Change the layout of enum pkvm_page_state The 'concrete' (a.k.a non-meta) page states are currently encoded using software bits in PTEs. For performance reasons, the abstract pkvm_page_state enum uses the same bits to encode these states as that makes conversions from and to PTEs easy. In order to prepare the ground for moving the 'concrete' state storage to the hyp vmemmap, re-arrange the enum to use bits 0 and 1 for this purpose. No functional changes intended. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-2-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0972faccc2af..5462faf6bfee 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -24,25 +24,27 @@ */ enum pkvm_page_state { PKVM_PAGE_OWNED = 0ULL, - PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, - PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, - __PKVM_PAGE_RESERVED = KVM_PGTABLE_PROT_SW0 | - KVM_PGTABLE_PROT_SW1, + PKVM_PAGE_SHARED_OWNED = BIT(0), + PKVM_PAGE_SHARED_BORROWED = BIT(1), + __PKVM_PAGE_RESERVED = BIT(0) | BIT(1), /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE, + PKVM_NOPAGE = BIT(2), }; +#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED) #define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, enum pkvm_page_state state) { - return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; + prot &= ~PKVM_PAGE_STATE_PROT_MASK; + prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); + return prot; } static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) { - return prot & PKVM_PAGE_STATE_PROT_MASK; + return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); } struct host_mmu { From d4fc42a479c8e913fd61dbb432e67f35587c336a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:43 +0000 Subject: [PATCH 21/87] KVM: arm64: Move enum pkvm_page_state to memory.h In order to prepare the way for storing page-tracking information in pKVM's vmemmap, move the enum pkvm_page_state definition to nvhe/memory.h. No functional changes intended. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-3-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 34 +------------------ arch/arm64/kvm/hyp/include/nvhe/memory.h | 33 ++++++++++++++++++ 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 5462faf6bfee..25038ac705d8 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -11,42 +11,10 @@ #include #include #include +#include #include #include -/* - * SW bits 0-1 are reserved to track the memory ownership state of each page: - * 00: The page is owned exclusively by the page-table owner. - * 01: The page is owned by the page-table owner, but is shared - * with another entity. - * 10: The page is shared with, but not owned by the page-table owner. - * 11: Reserved for future use (lending). - */ -enum pkvm_page_state { - PKVM_PAGE_OWNED = 0ULL, - PKVM_PAGE_SHARED_OWNED = BIT(0), - PKVM_PAGE_SHARED_BORROWED = BIT(1), - __PKVM_PAGE_RESERVED = BIT(0) | BIT(1), - - /* Meta-states which aren't encoded directly in the PTE's SW bits */ - PKVM_NOPAGE = BIT(2), -}; -#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED) - -#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) -static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, - enum pkvm_page_state state) -{ - prot &= ~PKVM_PAGE_STATE_PROT_MASK; - prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); - return prot; -} - -static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) -{ - return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); -} - struct host_mmu { struct kvm_arch arch; struct kvm_pgtable pgt; diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index ab205c4d6774..0964c461da92 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -7,6 +7,39 @@ #include +/* + * SW bits 0-1 are reserved to track the memory ownership state of each page: + * 00: The page is owned exclusively by the page-table owner. + * 01: The page is owned by the page-table owner, but is shared + * with another entity. + * 10: The page is shared with, but not owned by the page-table owner. + * 11: Reserved for future use (lending). + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = BIT(0), + PKVM_PAGE_SHARED_BORROWED = BIT(1), + __PKVM_PAGE_RESERVED = BIT(0) | BIT(1), + + /* Meta-states which aren't encoded directly in the PTE's SW bits */ + PKVM_NOPAGE = BIT(2), +}; +#define PKVM_PAGE_META_STATES_MASK (~__PKVM_PAGE_RESERVED) + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + prot &= ~PKVM_PAGE_STATE_PROT_MASK; + prot |= FIELD_PREP(PKVM_PAGE_STATE_PROT_MASK, state); + return prot; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return FIELD_GET(PKVM_PAGE_STATE_PROT_MASK, prot); +} + struct hyp_page { unsigned short refcount; unsigned short order; From b35875d466ad3cb08866eac067cca0581d4293d7 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:44 +0000 Subject: [PATCH 22/87] KVM: arm64: Make hyp_page::order a u8 We don't need 16 bits to store the hyp page order, and we'll need some bits to store page ownership data soon, so let's reduce the order member. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-4-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/nvhe/gfp.h | 6 +++--- arch/arm64/kvm/hyp/include/nvhe/memory.h | 5 +++-- arch/arm64/kvm/hyp/nvhe/page_alloc.c | 14 +++++++------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index 97c527ef53c2..3766333bace9 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -7,7 +7,7 @@ #include #include -#define HYP_NO_ORDER USHRT_MAX +#define HYP_NO_ORDER ((u8)(~0)) struct hyp_pool { /* @@ -19,11 +19,11 @@ struct hyp_pool { struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; - unsigned short max_order; + u8 max_order; }; /* Allocation */ -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order); +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order); void hyp_split_page(struct hyp_page *page); void hyp_get_page(struct hyp_pool *pool, void *addr); void hyp_put_page(struct hyp_pool *pool, void *addr); diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 0964c461da92..8f2b42bcc8e1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -41,8 +41,9 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) } struct hyp_page { - unsigned short refcount; - unsigned short order; + u16 refcount; + u8 order; + u8 reserved; }; extern u64 __hyp_vmemmap; diff --git a/arch/arm64/kvm/hyp/nvhe/page_alloc.c b/arch/arm64/kvm/hyp/nvhe/page_alloc.c index e691290d3765..a1eb27a1a747 100644 --- a/arch/arm64/kvm/hyp/nvhe/page_alloc.c +++ b/arch/arm64/kvm/hyp/nvhe/page_alloc.c @@ -32,7 +32,7 @@ u64 __hyp_vmemmap; */ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { phys_addr_t addr = hyp_page_to_phys(p); @@ -51,7 +51,7 @@ static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, /* Find a buddy page currently available for allocation */ static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); @@ -94,7 +94,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, struct hyp_page *p) { phys_addr_t phys = hyp_page_to_phys(p); - unsigned short order = p->order; + u8 order = p->order; struct hyp_page *buddy; memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); @@ -129,7 +129,7 @@ static void __hyp_attach_page(struct hyp_pool *pool, static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, struct hyp_page *p, - unsigned short order) + u8 order) { struct hyp_page *buddy; @@ -183,7 +183,7 @@ void hyp_get_page(struct hyp_pool *pool, void *addr) void hyp_split_page(struct hyp_page *p) { - unsigned short order = p->order; + u8 order = p->order; unsigned int i; p->order = 0; @@ -195,10 +195,10 @@ void hyp_split_page(struct hyp_page *p) } } -void *hyp_alloc_pages(struct hyp_pool *pool, unsigned short order) +void *hyp_alloc_pages(struct hyp_pool *pool, u8 order) { - unsigned short i = order; struct hyp_page *p; + u8 i = order; hyp_spin_lock(&pool->lock); From e94a7dea2972bd9a5ee3ed4312f7198370969407 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:45 +0000 Subject: [PATCH 23/87] KVM: arm64: Move host page ownership tracking to the hyp vmemmap We currently store part of the page-tracking state in PTE software bits for the host, guests and the hypervisor. This is sub-optimal when e.g. sharing pages as this forces to break block mappings purely to support this software tracking. This causes an unnecessarily fragmented stage-2 page-table for the host in particular when it shares pages with Secure, which can lead to measurable regressions. Moreover, having this state stored in the page-table forces us to do multiple costly walks on the page transition path, hence causing overhead. In order to work around these problems, move the host-side page-tracking logic from SW bits in its stage-2 PTEs to the hypervisor's vmemmap. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-5-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/nvhe/memory.h | 14 +++- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 100 ++++++++++++++++------- arch/arm64/kvm/hyp/nvhe/setup.c | 7 +- 3 files changed, 84 insertions(+), 37 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 8f2b42bcc8e1..2a5eabf4b753 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -8,7 +8,7 @@ #include /* - * SW bits 0-1 are reserved to track the memory ownership state of each page: + * Bits 0-1 are reserved to track the memory ownership state of each page: * 00: The page is owned exclusively by the page-table owner. * 01: The page is owned by the page-table owner, but is shared * with another entity. @@ -43,7 +43,9 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) struct hyp_page { u16 refcount; u8 order; - u8 reserved; + + /* Host (non-meta) state. Guarded by the host stage-2 lock. */ + enum pkvm_page_state host_state : 8; }; extern u64 __hyp_vmemmap; @@ -63,7 +65,13 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) #define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) #define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) -#define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) + +static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) +{ + BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u32)); + return &hyp_vmemmap[hyp_phys_to_pfn(phys)]; +} + #define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) #define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index caba3e4bd09e..12bb5445fe47 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -201,8 +201,8 @@ static void *guest_s2_zalloc_page(void *mc) memset(addr, 0, PAGE_SIZE); p = hyp_virt_to_page(addr); - memset(p, 0, sizeof(*p)); p->refcount = 1; + p->order = 0; return addr; } @@ -268,6 +268,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd) void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) { + struct hyp_page *page; void *addr; /* Dump all pgtable pages in the hyp_pool */ @@ -279,7 +280,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc) /* Drain the hyp_pool into the memcache */ addr = hyp_alloc_pages(&vm->pool, 0); while (addr) { - memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page)); + page = hyp_virt_to_page(addr); + page->refcount = 0; + page->order = 0; push_hyp_memcache(mc, addr, hyp_virt_to_phys); WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1)); addr = hyp_alloc_pages(&vm->pool, 0); @@ -382,19 +385,28 @@ bool addr_is_memory(phys_addr_t phys) return !!find_mem_range(phys, &range); } -static bool addr_is_allowed_memory(phys_addr_t phys) +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + +static int check_range_allowed_memory(u64 start, u64 end) { struct memblock_region *reg; struct kvm_mem_range range; - reg = find_mem_range(phys, &range); + /* + * Callers can't check the state of a range that overlaps memory and + * MMIO regions, so ensure [start, end[ is in the same kvm_mem_range. + */ + reg = find_mem_range(start, &range); + if (!is_in_mem_range(end - 1, &range)) + return -EINVAL; - return reg && !(reg->flags & MEMBLOCK_NOMAP); -} + if (!reg || reg->flags & MEMBLOCK_NOMAP) + return -EPERM; -static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) -{ - return range->start <= addr && addr < range->end; + return 0; } static bool range_is_memory(u64 start, u64 end) @@ -454,8 +466,10 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) if (kvm_pte_valid(pte)) return -EAGAIN; - if (pte) + if (pte) { + WARN_ON(addr_is_memory(addr) && hyp_phys_to_page(addr)->host_state != PKVM_NOPAGE); return -EPERM; + } do { u64 granule = kvm_granule_size(level); @@ -477,10 +491,33 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size, return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); } +static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state) +{ + phys_addr_t end = addr + size; + + for (; addr < end; addr += PAGE_SIZE) + hyp_phys_to_page(addr)->host_state = state; +} + int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { - return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, - addr, size, &host_s2_pool, owner_id); + int ret; + + if (!addr_is_memory(addr)) + return -EPERM; + + ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, + addr, size, &host_s2_pool, owner_id); + if (ret) + return ret; + + /* Don't forget to update the vmemmap tracking for the host */ + if (owner_id == PKVM_ID_HOST) + __host_update_page_state(addr, size, PKVM_PAGE_OWNED); + else + __host_update_page_state(addr, size, PKVM_NOPAGE); + + return 0; } static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) @@ -604,35 +641,38 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size, return kvm_pgtable_walk(pgt, addr, size, &walker); } -static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr) -{ - if (!addr_is_allowed_memory(addr)) - return PKVM_NOPAGE; - - if (!kvm_pte_valid(pte) && pte) - return PKVM_NOPAGE; - - return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); -} - static int __host_check_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - struct check_walk_data d = { - .desired = state, - .get_page_state = host_get_page_state, - }; + u64 end = addr + size; + int ret; + + ret = check_range_allowed_memory(addr, end); + if (ret) + return ret; hyp_assert_lock_held(&host_mmu.lock); - return check_page_state_range(&host_mmu.pgt, addr, size, &d); + for (; addr < end; addr += PAGE_SIZE) { + if (hyp_phys_to_page(addr)->host_state != state) + return -EPERM; + } + + return 0; } static int __host_set_page_state_range(u64 addr, u64 size, enum pkvm_page_state state) { - enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state); + if (hyp_phys_to_page(addr)->host_state == PKVM_NOPAGE) { + int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT); - return host_stage2_idmap_locked(addr, size, prot); + if (ret) + return ret; + } + + __host_update_page_state(addr, size, state); + + return 0; } static int host_request_owned_transition(u64 *completer_addr, diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index cbdd18cd3f98..7e04d1c2a03d 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -180,7 +180,6 @@ static void hpool_put_page(void *addr) static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, enum kvm_pgtable_walk_flags visit) { - enum kvm_pgtable_prot prot; enum pkvm_page_state state; phys_addr_t phys; @@ -203,16 +202,16 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx, case PKVM_PAGE_OWNED: return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP); case PKVM_PAGE_SHARED_OWNED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED; break; case PKVM_PAGE_SHARED_BORROWED: - prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED; break; default: return -EINVAL; } - return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); + return 0; } static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx, From 5398ddc5c90bd418b90d859e9267aa39399021af Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:46 +0000 Subject: [PATCH 24/87] KVM: arm64: Pass walk flags to kvm_pgtable_stage2_mkyoung kvm_pgtable_stage2_mkyoung currently assumes that it is being called from a 'shared' walker, which will not be true once called from pKVM. To allow for the re-use of that function, make the walk flags one of its parameters. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-6-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pgtable.h | 4 +++- arch/arm64/kvm/hyp/pgtable.c | 7 +++---- arch/arm64/kvm/mmu.c | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index aab04097b505..38b7ec1c8614 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -669,13 +669,15 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. + * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored. * * If there is a valid, leaf page-table entry used to translate @addr, then * set the access flag in that entry. */ -void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr); +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 40bd55966540..0470aedb4bf4 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1245,14 +1245,13 @@ int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) NULL, NULL, 0); } -void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr) +void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) { int ret; ret = stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0, - NULL, NULL, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + NULL, NULL, flags); if (!ret) dsb(ishst); } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c9d46ad57e52..a2339b76c826 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1718,13 +1718,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* Resolve the access fault by making the page young again. */ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; struct kvm_s2_mmu *mmu; trace_kvm_access_fault(fault_ipa); read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa); + kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa, flags); read_unlock(&vcpu->kvm->mmu_lock); } From e279c25d78d6729e39a0221c98185bd0e7aa0c99 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:47 +0000 Subject: [PATCH 25/87] KVM: arm64: Pass walk flags to kvm_pgtable_stage2_relax_perms kvm_pgtable_stage2_relax_perms currently assumes that it is being called from a 'shared' walker, which will not be true once called from pKVM. To allow for the re-use of that function, make the walk flags one of its parameters. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-7-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pgtable.h | 4 +++- arch/arm64/kvm/hyp/pgtable.c | 6 ++---- arch/arm64/kvm/mmu.c | 7 +++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 38b7ec1c8614..c2f4149283ef 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -707,6 +707,7 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). * @addr: Intermediate physical address to identify the page-table entry. * @prot: Additional permissions to grant for the mapping. + * @flags: Flags to control the page-table walk (ex. a shared walk) * * The offset of @addr within a page is ignored. * @@ -719,7 +720,8 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, * Return: 0 on success, negative error code on failure. */ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot); + enum kvm_pgtable_prot prot, + enum kvm_pgtable_walk_flags flags); /** * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 0470aedb4bf4..b7a3b5363235 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1307,7 +1307,7 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, } int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot) + enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags) { int ret; s8 level; @@ -1325,9 +1325,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, if (prot & KVM_PGTABLE_PROT_X) clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags); if (!ret || ret == -EAGAIN) kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a2339b76c826..641e4fec1659 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1452,6 +1452,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; struct page *page; + enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1695,13 +1696,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * PTE, which will be preserved. */ prot &= ~KVM_NV_GUEST_MAP_SZ; - ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); + ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot, flags); } else { ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, - memcache, - KVM_PGTABLE_WALK_HANDLE_FAULT | - KVM_PGTABLE_WALK_SHARED); + memcache, flags); } out_unlock: From c77e5181fed54b25d489eb7d2ccb5c1c72a1063c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:48 +0000 Subject: [PATCH 26/87] KVM: arm64: Make kvm_pgtable_stage2_init() a static inline function Turn kvm_pgtable_stage2_init() into a static inline function instead of a macro. This will allow the usage of typeof() on it later on. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-8-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pgtable.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index c2f4149283ef..04418b5e3004 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -526,8 +526,11 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, enum kvm_pgtable_stage2_flags flags, kvm_pgtable_force_pte_cb_t force_pte_cb); -#define kvm_pgtable_stage2_init(pgt, mmu, mm_ops) \ - __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL) +static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) +{ + return __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL); +} /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. From 99996d575ee69d4327bad98a0148729b73dde23a Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:49 +0000 Subject: [PATCH 27/87] KVM: arm64: Add {get,put}_pkvm_hyp_vm() helpers In preparation for accessing pkvm_hyp_vm structures at EL2 in a context where we can't always expect a vCPU to be loaded (e.g. MMU notifiers), introduce get/put helpers to get temporary references to hyp VMs from any context. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-9-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 3 +++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 24a9a8330d19..f361d8b91930 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -70,4 +70,7 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 071993c16de8..d46a02e24e4a 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -327,6 +327,26 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm; + + hyp_spin_lock(&vm_table_lock); + hyp_vm = get_vm_by_handle(handle); + if (hyp_vm) + hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); + + return hyp_vm; +} + +void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) +{ + hyp_spin_lock(&vm_table_lock); + hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); + hyp_spin_unlock(&vm_table_lock); +} + static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) { struct kvm *kvm = &hyp_vm->kvm; From f7d03fcbf1f482069e9afac55b17de3bd323b8f6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 18 Dec 2024 19:40:50 +0000 Subject: [PATCH 28/87] KVM: arm64: Introduce __pkvm_vcpu_{load,put}() Rather than look-up the hyp vCPU on every run hypercall at EL2, introduce a per-CPU 'loaded_hyp_vcpu' tracking variable which is updated by a pair of load/put hypercalls called directly from kvm_arch_vcpu_{load,put}() when pKVM is enabled. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-10-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 2 ++ arch/arm64/kvm/arm.c | 14 ++++++++ arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 7 ++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 47 ++++++++++++++++++++------ arch/arm64/kvm/hyp/nvhe/pkvm.c | 29 ++++++++++++++++ arch/arm64/kvm/vgic/vgic-v3.c | 6 ++-- 6 files changed, 93 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index ca2590344313..89c0fac69551 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -79,6 +79,8 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load, + __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a102c3aebdbc..55cc62b2f469 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -619,12 +619,26 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_arch_vcpu_load_debug_state_flags(vcpu); + if (is_protected_kvm_enabled()) { + kvm_call_hyp_nvhe(__pkvm_vcpu_load, + vcpu->kvm->arch.pkvm.handle, + vcpu->vcpu_idx, vcpu->arch.hcr_el2); + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + } + if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus)) vcpu_set_on_unsupported_cpu(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + if (is_protected_kvm_enabled()) { + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, + &vcpu->arch.vgic_cpu.vgic_v3); + kvm_call_hyp_nvhe(__pkvm_vcpu_put); + } + kvm_arch_vcpu_put_debug_state_flags(vcpu); kvm_arch_vcpu_put_fp(vcpu); if (has_vhe()) diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index f361d8b91930..be52c5b15e21 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -20,6 +20,12 @@ struct pkvm_hyp_vcpu { /* Backpointer to the host's (untrusted) vCPU instance. */ struct kvm_vcpu *host_vcpu; + + /* + * If this hyp vCPU is loaded, then this is a backpointer to the + * per-cpu pointer tracking us. Otherwise, NULL if not loaded. + */ + struct pkvm_hyp_vcpu **loaded_hyp_vcpu; }; /* @@ -69,6 +75,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle); struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 6aa0b13d86e5..95d78db315b3 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -141,16 +141,46 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i]; } +static void handle___pkvm_vcpu_load(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(unsigned int, vcpu_idx, host_ctxt, 2); + DECLARE_REG(u64, hcr_el2, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_load_hyp_vcpu(handle, vcpu_idx); + if (!hyp_vcpu) + return; + + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) { + /* Propagate WFx trapping flags */ + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWE | HCR_TWI); + hyp_vcpu->vcpu.arch.hcr_el2 |= hcr_el2 & (HCR_TWE | HCR_TWI); + } +} + +static void handle___pkvm_vcpu_put(struct kvm_cpu_context *host_ctxt) +{ + struct pkvm_hyp_vcpu *hyp_vcpu; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (hyp_vcpu) + pkvm_put_hyp_vcpu(hyp_vcpu); +} + static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 1); int ret; - host_vcpu = kern_hyp_va(host_vcpu); - if (unlikely(is_protected_kvm_enabled())) { - struct pkvm_hyp_vcpu *hyp_vcpu; - struct kvm *host_kvm; + struct pkvm_hyp_vcpu *hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); /* * KVM (and pKVM) doesn't support SME guests for now, and @@ -163,9 +193,6 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) goto out; } - host_kvm = kern_hyp_va(host_vcpu->kvm); - hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle, - host_vcpu->vcpu_idx); if (!hyp_vcpu) { ret = -EINVAL; goto out; @@ -176,12 +203,10 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) ret = __kvm_vcpu_run(&hyp_vcpu->vcpu); sync_hyp_vcpu(hyp_vcpu); - pkvm_put_hyp_vcpu(hyp_vcpu); } else { /* The host is fully trusted, run its vCPU directly. */ - ret = __kvm_vcpu_run(host_vcpu); + ret = __kvm_vcpu_run(kern_hyp_va(host_vcpu)); } - out: cpu_reg(host_ctxt, 1) = ret; } @@ -409,6 +434,8 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), + HANDLE_FUNC(__pkvm_vcpu_load), + HANDLE_FUNC(__pkvm_vcpu_put), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index d46a02e24e4a..496d186efb03 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -23,6 +23,12 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; +/* + * The currently loaded hyp vCPU for each physical CPU. Used only when + * protected KVM is enabled, but for both protected and non-protected VMs. + */ +static DEFINE_PER_CPU(struct pkvm_hyp_vcpu *, loaded_hyp_vcpu); + /* * Set trap register values based on features in ID_AA64PFR0. */ @@ -306,15 +312,30 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, struct pkvm_hyp_vcpu *hyp_vcpu = NULL; struct pkvm_hyp_vm *hyp_vm; + /* Cannot load a new vcpu without putting the old one first. */ + if (__this_cpu_read(loaded_hyp_vcpu)) + return NULL; + hyp_spin_lock(&vm_table_lock); hyp_vm = get_vm_by_handle(handle); if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx) goto unlock; hyp_vcpu = hyp_vm->vcpus[vcpu_idx]; + + /* Ensure vcpu isn't loaded on more than one cpu simultaneously. */ + if (unlikely(hyp_vcpu->loaded_hyp_vcpu)) { + hyp_vcpu = NULL; + goto unlock; + } + + hyp_vcpu->loaded_hyp_vcpu = this_cpu_ptr(&loaded_hyp_vcpu); hyp_page_ref_inc(hyp_virt_to_page(hyp_vm)); unlock: hyp_spin_unlock(&vm_table_lock); + + if (hyp_vcpu) + __this_cpu_write(loaded_hyp_vcpu, hyp_vcpu); return hyp_vcpu; } @@ -323,10 +344,18 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu); hyp_spin_lock(&vm_table_lock); + hyp_vcpu->loaded_hyp_vcpu = NULL; + __this_cpu_write(loaded_hyp_vcpu, NULL); hyp_page_ref_dec(hyp_virt_to_page(hyp_vm)); hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void) +{ + return __this_cpu_read(loaded_hyp_vcpu); + +} + struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle) { struct pkvm_hyp_vm *hyp_vm; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index f267bc2486a1..c2ef41fff079 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -734,7 +734,8 @@ void vgic_v3_load(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if); if (has_vhe()) __vgic_v3_activate_traps(cpu_if); @@ -746,7 +747,8 @@ void vgic_v3_put(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); + if (likely(!is_protected_kvm_enabled())) + kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if); WARN_ON(vgic_v4_put(vcpu)); if (has_vhe()) From d0bd3e6570aee42766e7bd884734ae078667ea1e Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:51 +0000 Subject: [PATCH 29/87] KVM: arm64: Introduce __pkvm_host_share_guest() In preparation for handling guest stage-2 mappings at EL2, introduce a new pKVM hypercall allowing to share pages with non-protected guests. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-11-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/include/asm/kvm_host.h | 3 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 2 + arch/arm64/kvm/hyp/include/nvhe/memory.h | 4 +- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 34 +++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 72 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 8 +++ 7 files changed, 123 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 89c0fac69551..449337f5b2a3 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -65,6 +65,7 @@ enum __kvm_host_smccc_func { /* Hypercalls available after pKVM finalisation */ __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, + __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e18e9244d17a..1246f1d01dbf 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -771,6 +771,9 @@ struct kvm_vcpu_arch { /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; + /* Pages to top-up the pKVM/EL2 guest pool */ + struct kvm_hyp_memcache pkvm_memcache; + /* Virtual SError ESR to restore when HCR_EL2.VSE is set */ u64 vsesr_el2; diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 25038ac705d8..15b8956051b6 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -39,6 +39,8 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h index 2a5eabf4b753..34233d586060 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/memory.h +++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h @@ -46,6 +46,8 @@ struct hyp_page { /* Host (non-meta) state. Guarded by the host stage-2 lock. */ enum pkvm_page_state host_state : 8; + + u32 host_share_guest_count; }; extern u64 __hyp_vmemmap; @@ -68,7 +70,7 @@ static inline phys_addr_t hyp_virt_to_phys(void *addr) static inline struct hyp_page *hyp_phys_to_page(phys_addr_t phys) { - BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u32)); + BUILD_BUG_ON(sizeof(struct hyp_page) != sizeof(u64)); return &hyp_vmemmap[hyp_phys_to_pfn(phys)]; } diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 95d78db315b3..d659462fbf5d 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -211,6 +211,39 @@ static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = ret; } +static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu; + + return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, + host_vcpu->arch.pkvm_memcache.nr_pages, + &host_vcpu->arch.pkvm_memcache); +} + +static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, pfn, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = pkvm_refill_memcache(hyp_vcpu); + if (ret) + goto out; + + ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -420,6 +453,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), + HANDLE_FUNC(__pkvm_host_share_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 12bb5445fe47..fb9592e721cf 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -867,6 +867,27 @@ static int hyp_complete_donation(u64 addr, return pkvm_create_mappings_locked(start, end, prot); } +static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) +{ + if (!kvm_pte_valid(pte)) + return PKVM_NOPAGE; + + return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte)); +} + +static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, + u64 size, enum pkvm_page_state state) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + struct check_walk_data d = { + .desired = state, + .get_page_state = guest_get_page_state, + }; + + hyp_assert_lock_held(&vm->lock); + return check_page_state_range(&vm->pgt, addr, size, &d); +} + static int check_share(struct pkvm_mem_share *share) { const struct pkvm_mem_transition *tx = &share->tx; @@ -1349,3 +1370,54 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) return ret; } + +int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, + enum kvm_pgtable_prot prot) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 phys = hyp_pfn_to_phys(pfn); + u64 ipa = hyp_pfn_to_phys(gfn); + struct hyp_page *page; + int ret; + + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + if (ret) + return ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __guest_check_page_state_range(vcpu, ipa, PAGE_SIZE, PKVM_NOPAGE); + if (ret) + goto unlock; + + page = hyp_phys_to_page(phys); + switch (page->host_state) { + case PKVM_PAGE_OWNED: + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED)); + break; + case PKVM_PAGE_SHARED_OWNED: + if (page->host_share_guest_count) + break; + /* Only host to np-guest multi-sharing is tolerated */ + WARN_ON(1); + fallthrough; + default: + ret = -EPERM; + goto unlock; + } + + WARN_ON(kvm_pgtable_stage2_map(&vm->pgt, ipa, PAGE_SIZE, phys, + pkvm_mkstate(prot, PKVM_PAGE_SHARED_BORROWED), + &vcpu->vcpu.arch.pkvm_memcache, 0)); + page->host_share_guest_count++; + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 496d186efb03..0109c36566c8 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -795,6 +795,14 @@ int __pkvm_teardown_vm(pkvm_handle_t handle) /* Push the metadata pages to the teardown memcache */ for (idx = 0; idx < hyp_vm->nr_vcpus; ++idx) { struct pkvm_hyp_vcpu *hyp_vcpu = hyp_vm->vcpus[idx]; + struct kvm_hyp_memcache *vcpu_mc = &hyp_vcpu->vcpu.arch.pkvm_memcache; + + while (vcpu_mc->nr_pages) { + void *addr = pop_hyp_memcache(vcpu_mc, hyp_phys_to_virt); + + push_hyp_memcache(mc, addr, hyp_virt_to_phys); + unmap_donated_memory_noclear(addr, PAGE_SIZE); + } teardown_donated_memory(mc, hyp_vcpu, sizeof(*hyp_vcpu)); } From 72db3d3fbaa77ba649201c9e9f9d1a54fa76b217 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:52 +0000 Subject: [PATCH 30/87] KVM: arm64: Introduce __pkvm_host_unshare_guest() In preparation for letting the host unmap pages from non-protected guests, introduce a new hypercall implementing the host-unshare-guest transition. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-12-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 6 ++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 21 ++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 67 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/pkvm.c | 12 ++++ 6 files changed, 108 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 449337f5b2a3..0b6c4d325134 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -66,6 +66,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp, __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 15b8956051b6..e6d080b71779 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -41,6 +41,7 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); +int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index be52c5b15e21..0cc2a429f1fb 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -64,6 +64,11 @@ static inline bool pkvm_hyp_vcpu_is_protected(struct pkvm_hyp_vcpu *hyp_vcpu) return vcpu_is_protected(&hyp_vcpu->vcpu); } +static inline bool pkvm_hyp_vm_is_protected(struct pkvm_hyp_vm *hyp_vm) +{ + return kvm_vm_is_protected(&hyp_vm->kvm); +} + void pkvm_hyp_vm_table_init(void *tbl); int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva, @@ -78,6 +83,7 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); struct pkvm_hyp_vcpu *pkvm_get_loaded_hyp_vcpu(void); struct pkvm_hyp_vm *get_pkvm_hyp_vm(pkvm_handle_t handle); +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle); void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm); #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index d659462fbf5d..3c3a27c985a2 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -244,6 +244,26 @@ static void handle___pkvm_host_share_guest(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = ret; } +static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_unshare_guest(gfn, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -454,6 +474,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_host_unshare_hyp), HANDLE_FUNC(__pkvm_host_share_guest), + HANDLE_FUNC(__pkvm_host_unshare_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index fb9592e721cf..30243b7922f1 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1421,3 +1421,70 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, return ret; } + +static int __check_host_shared_guest(struct pkvm_hyp_vm *vm, u64 *__phys, u64 ipa) +{ + enum pkvm_page_state state; + struct hyp_page *page; + kvm_pte_t pte; + u64 phys; + s8 level; + int ret; + + ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, &level); + if (ret) + return ret; + if (level != KVM_PGTABLE_LAST_LEVEL) + return -E2BIG; + if (!kvm_pte_valid(pte)) + return -ENOENT; + + state = guest_get_page_state(pte, ipa); + if (state != PKVM_PAGE_SHARED_BORROWED) + return -EPERM; + + phys = kvm_pte_to_phys(pte); + ret = check_range_allowed_memory(phys, phys + PAGE_SIZE); + if (WARN_ON(ret)) + return ret; + + page = hyp_phys_to_page(phys); + if (page->host_state != PKVM_PAGE_SHARED_OWNED) + return -EPERM; + if (WARN_ON(!page->host_share_guest_count)) + return -EINVAL; + + *__phys = phys; + + return 0; +} + +int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + struct hyp_page *page; + u64 phys; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa); + if (ret) + goto unlock; + + ret = kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE); + if (ret) + goto unlock; + + page = hyp_phys_to_page(phys); + page->host_share_guest_count--; + if (!page->host_share_guest_count) + WARN_ON(__host_set_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_OWNED)); + +unlock: + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 0109c36566c8..2c618f2f2769 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -376,6 +376,18 @@ void put_pkvm_hyp_vm(struct pkvm_hyp_vm *hyp_vm) hyp_spin_unlock(&vm_table_lock); } +struct pkvm_hyp_vm *get_np_pkvm_hyp_vm(pkvm_handle_t handle) +{ + struct pkvm_hyp_vm *hyp_vm = get_pkvm_hyp_vm(handle); + + if (hyp_vm && pkvm_hyp_vm_is_protected(hyp_vm)) { + put_pkvm_hyp_vm(hyp_vm); + hyp_vm = NULL; + } + + return hyp_vm; +} + static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) { struct kvm *kvm = &hyp_vm->kvm; From 34884a0a4a53f9544f78e7e9556cb4d202e170d5 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:53 +0000 Subject: [PATCH 31/87] KVM: arm64: Introduce __pkvm_host_relax_guest_perms() Introduce a new hypercall allowing the host to relax the stage-2 permissions of mappings in a non-protected guest page-table. It will be used later once we start allowing RO memslots and dirty logging. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-13-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 20 ++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 23 +++++++++++++++++++ 4 files changed, 45 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 0b6c4d325134..66ee8542dcc9 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -67,6 +67,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp, __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest, __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index e6d080b71779..181aec2d5bc1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -42,6 +42,7 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 3c3a27c985a2..287e4ee93ef2 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -264,6 +264,25 @@ static void handle___pkvm_host_unshare_guest(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = ret; } +static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 2); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_relax_perms_guest(gfn, hyp_vcpu, prot); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -475,6 +494,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_unshare_hyp), HANDLE_FUNC(__pkvm_host_share_guest), HANDLE_FUNC(__pkvm_host_unshare_guest), + HANDLE_FUNC(__pkvm_host_relax_perms_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 30243b7922f1..aa8e0408aebb 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1488,3 +1488,26 @@ int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *vm) return ret; } + +int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; + int ret; + + if (prot & ~KVM_PGTABLE_PROT_RWX) + return -EINVAL; + + host_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + ret = kvm_pgtable_stage2_relax_perms(&vm->pgt, ipa, prot, 0); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} From 26117e4c636c813be4fa31e9ec9410b7d02ced5c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:54 +0000 Subject: [PATCH 32/87] KVM: arm64: Introduce __pkvm_host_wrprotect_guest() Introduce a new hypercall to remove the write permission from a non-protected guest stage-2 mapping. This will be used for e.g. enabling dirty logging. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-14-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 21 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 19 +++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 66ee8542dcc9..8663a588cf34 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -68,6 +68,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_host_share_guest, __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest, __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 181aec2d5bc1..0bbfb0e1734c 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -43,6 +43,7 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 287e4ee93ef2..98d317735107 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -283,6 +283,26 @@ static void handle___pkvm_host_relax_perms_guest(struct kvm_cpu_context *host_ct cpu_reg(host_ctxt, 1) = ret; } +static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_wrprotect_guest(gfn, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -495,6 +515,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_share_guest), HANDLE_FUNC(__pkvm_host_unshare_guest), HANDLE_FUNC(__pkvm_host_relax_perms_guest), + HANDLE_FUNC(__pkvm_host_wrprotect_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index aa8e0408aebb..94e4251b5077 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1511,3 +1511,22 @@ int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_ return ret; } + +int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + ret = kvm_pgtable_stage2_wrprotect(&vm->pgt, ipa, PAGE_SIZE); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} From 56ab4de37f4e13231e964cf7fa304818f791fea7 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:55 +0000 Subject: [PATCH 33/87] KVM: arm64: Introduce __pkvm_host_test_clear_young_guest() Plumb the kvm_stage2_test_clear_young() callback into pKVM for non-protected guest. It will be later be called from MMU notifiers. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-15-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 22 +++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 19 ++++++++++++++++ 4 files changed, 43 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 8663a588cf34..4f97155d6323 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -69,6 +69,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_guest, __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest, __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 0bbfb0e1734c..74bd6c72fff2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -44,6 +44,7 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu, int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 98d317735107..616e172a9c48 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -303,6 +303,27 @@ static void handle___pkvm_host_wrprotect_guest(struct kvm_cpu_context *host_ctxt cpu_reg(host_ctxt, 1) = ret; } +static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + DECLARE_REG(u64, gfn, host_ctxt, 2); + DECLARE_REG(bool, mkold, host_ctxt, 3); + struct pkvm_hyp_vm *hyp_vm; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + goto out; + + ret = __pkvm_host_test_clear_young_guest(gfn, mkold, hyp_vm); + put_pkvm_hyp_vm(hyp_vm); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -516,6 +537,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_unshare_guest), HANDLE_FUNC(__pkvm_host_relax_perms_guest), HANDLE_FUNC(__pkvm_host_wrprotect_guest), + HANDLE_FUNC(__pkvm_host_test_clear_young_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 94e4251b5077..0e42c3baaf4b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1530,3 +1530,22 @@ int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *vm) return ret; } + +int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm) +{ + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + ret = kvm_pgtable_stage2_test_clear_young(&vm->pgt, ipa, PAGE_SIZE, mkold); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} From 76f0b18b3db57868fb0cabe691201aad3085b712 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:56 +0000 Subject: [PATCH 34/87] KVM: arm64: Introduce __pkvm_host_mkyoung_guest() Plumb the kvm_pgtable_stage2_mkyoung() callback into pKVM for non-protected guests. It will be called later from the fault handling path. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-16-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 19 ++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 20 +++++++++++++++++++ 4 files changed, 41 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 4f97155d6323..a3b07db2776c 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -70,6 +70,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_host_relax_perms_guest, __KVM_HOST_SMCCC_FUNC___pkvm_host_wrprotect_guest, __KVM_HOST_SMCCC_FUNC___pkvm_host_test_clear_young_guest, + __KVM_HOST_SMCCC_FUNC___pkvm_host_mkyoung_guest, __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc, __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 74bd6c72fff2..978f38c386ee 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -45,6 +45,7 @@ int __pkvm_host_unshare_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); int __pkvm_host_relax_perms_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu, enum kvm_pgtable_prot prot); int __pkvm_host_wrprotect_guest(u64 gfn, struct pkvm_hyp_vm *hyp_vm); int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm *vm); +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 616e172a9c48..32c4627b5b5b 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -324,6 +324,24 @@ static void handle___pkvm_host_test_clear_young_guest(struct kvm_cpu_context *ho cpu_reg(host_ctxt, 1) = ret; } +static void handle___pkvm_host_mkyoung_guest(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, gfn, host_ctxt, 1); + struct pkvm_hyp_vcpu *hyp_vcpu; + int ret = -EINVAL; + + if (!is_protected_kvm_enabled()) + goto out; + + hyp_vcpu = pkvm_get_loaded_hyp_vcpu(); + if (!hyp_vcpu || pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + goto out; + + ret = __pkvm_host_mkyoung_guest(gfn, hyp_vcpu); +out: + cpu_reg(host_ctxt, 1) = ret; +} + static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); @@ -538,6 +556,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_host_relax_perms_guest), HANDLE_FUNC(__pkvm_host_wrprotect_guest), HANDLE_FUNC(__pkvm_host_test_clear_young_guest), + HANDLE_FUNC(__pkvm_host_mkyoung_guest), HANDLE_FUNC(__kvm_adjust_pc), HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 0e42c3baaf4b..eae03509d371 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1549,3 +1549,23 @@ int __pkvm_host_test_clear_young_guest(u64 gfn, bool mkold, struct pkvm_hyp_vm * return ret; } + +int __pkvm_host_mkyoung_guest(u64 gfn, struct pkvm_hyp_vcpu *vcpu) +{ + struct pkvm_hyp_vm *vm = pkvm_hyp_vcpu_to_hyp_vm(vcpu); + u64 ipa = hyp_pfn_to_phys(gfn); + u64 phys; + int ret; + + host_lock_component(); + guest_lock_component(vm); + + ret = __check_host_shared_guest(vm, &phys, ipa); + if (!ret) + kvm_pgtable_stage2_mkyoung(&vm->pgt, ipa, 0); + + guest_unlock_component(vm); + host_unlock_component(); + + return ret; +} From 0adce4d42f249b1701c136907055d9b12f8f6e1c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:57 +0000 Subject: [PATCH 35/87] KVM: arm64: Introduce __pkvm_tlb_flush_vmid() Introduce a new hypercall to flush the TLBs of non-protected guests. The host kernel will be responsible for issuing this hypercall after changing stage-2 permissions using the __pkvm_host_relax_guest_perms() or __pkvm_host_wrprotect_guest() paths. This is left under the host's responsibility for performance reasons. Note however that the TLB maintenance for all *unmap* operations still remains entirely under the hypervisor's responsibility for security reasons -- an unmapped page may be donated to another entity, so a stale TLB entry could be used to leak private data. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-17-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_asm.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-main.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index a3b07db2776c..002088c6e297 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -87,6 +87,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load, __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put, + __KVM_HOST_SMCCC_FUNC___pkvm_tlb_flush_vmid, }; #define DECLARE_KVM_VHE_SYM(sym) extern char sym[] diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 32c4627b5b5b..130f5f23bcb5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -389,6 +389,22 @@ static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid(kern_hyp_va(mmu)); } +static void handle___pkvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1); + struct pkvm_hyp_vm *hyp_vm; + + if (!is_protected_kvm_enabled()) + return; + + hyp_vm = get_np_pkvm_hyp_vm(handle); + if (!hyp_vm) + return; + + __kvm_tlb_flush_vmid(&hyp_vm->kvm.arch.mmu); + put_pkvm_hyp_vm(hyp_vm); +} + static void handle___kvm_flush_cpu_context(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -573,6 +589,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__pkvm_teardown_vm), HANDLE_FUNC(__pkvm_vcpu_load), HANDLE_FUNC(__pkvm_vcpu_put), + HANDLE_FUNC(__pkvm_tlb_flush_vmid), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) From e912efed485a4c50bdc3934ae647e257ef568ef6 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:58 +0000 Subject: [PATCH 36/87] KVM: arm64: Introduce the EL1 pKVM MMU Introduce a set of helper functions allowing to manipulate the pKVM guest stage-2 page-tables from EL1 using pKVM's HVC interface. Each helper has an exact one-to-one correspondance with the traditional kvm_pgtable_stage2_*() functions from pgtable.c, with a strictly matching prototype. This will ease plumbing later on in mmu.c. These callbacks track the gfn->pfn mappings in a simple rb_tree indexed by IPA in lieu of a page-table. This rb-tree is kept in sync with pKVM's state and is protected by the mmu_lock like a traditional stage-2 page-table. Signed-off-by: Quentin Perret Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20241218194059.3670226-18-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_pgtable.h | 21 +-- arch/arm64/include/asm/kvm_pkvm.h | 26 ++++ arch/arm64/kvm/pkvm.c | 201 +++++++++++++++++++++++++++ 4 files changed, 241 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 1246f1d01dbf..f23f4ea9ec8b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -85,6 +85,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); struct kvm_hyp_memcache { phys_addr_t head; unsigned long nr_pages; + struct pkvm_mapping *mapping; /* only used from EL1 */ }; static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc, diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 04418b5e3004..6b9d274052c7 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -412,15 +412,20 @@ static inline bool kvm_pgtable_walk_lock_held(void) * be used instead of block mappings. */ struct kvm_pgtable { - u32 ia_bits; - s8 start_level; - kvm_pteref_t pgd; - struct kvm_pgtable_mm_ops *mm_ops; + union { + struct rb_root pkvm_mappings; + struct { + u32 ia_bits; + s8 start_level; + kvm_pteref_t pgd; + struct kvm_pgtable_mm_ops *mm_ops; - /* Stage-2 only */ - struct kvm_s2_mmu *mmu; - enum kvm_pgtable_stage2_flags flags; - kvm_pgtable_force_pte_cb_t force_pte_cb; + /* Stage-2 only */ + enum kvm_pgtable_stage2_flags flags; + kvm_pgtable_force_pte_cb_t force_pte_cb; + }; + }; + struct kvm_s2_mmu *mmu; }; /** diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index cd56acd9a842..65f988b6fe0d 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -137,4 +137,30 @@ static inline size_t pkvm_host_sve_state_size(void) SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl))); } +struct pkvm_mapping { + struct rb_node node; + u64 gfn; + u64 pfn; +}; + +int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops); +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); +int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, + enum kvm_pgtable_prot prot, void *mc, + enum kvm_pgtable_walk_flags flags); +int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); +int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size); +int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); +bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold); +int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, + enum kvm_pgtable_walk_flags flags); +void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags); +int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc); +void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level); +kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, + enum kvm_pgtable_prot prot, void *mc, + bool force_pte); #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 85117ea8f351..930b677eb9b0 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -268,3 +269,203 @@ static int __init finalize_pkvm(void) return ret; } device_initcall_sync(finalize_pkvm); + +static int cmp_mappings(struct rb_node *node, const struct rb_node *parent) +{ + struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node); + struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node); + + if (a->gfn < b->gfn) + return -1; + if (a->gfn > b->gfn) + return 1; + return 0; +} + +static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn) +{ + struct rb_node *node = root->rb_node, *prev = NULL; + struct pkvm_mapping *mapping; + + while (node) { + mapping = rb_entry(node, struct pkvm_mapping, node); + if (mapping->gfn == gfn) + return node; + prev = node; + node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right; + } + + return prev; +} + +/* + * __tmp is updated to rb_next(__tmp) *before* entering the body of the loop to allow freeing + * of __map inline. + */ +#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map) \ + for (struct rb_node *__tmp = find_first_mapping_node(&(__pgt)->pkvm_mappings, \ + ((__start) >> PAGE_SHIFT)); \ + __tmp && ({ \ + __map = rb_entry(__tmp, struct pkvm_mapping, node); \ + __tmp = rb_next(__tmp); \ + true; \ + }); \ + ) \ + if (__map->gfn < ((__start) >> PAGE_SHIFT)) \ + continue; \ + else if (__map->gfn >= ((__end) >> PAGE_SHIFT)) \ + break; \ + else + +int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, + struct kvm_pgtable_mm_ops *mm_ops) +{ + pgt->pkvm_mappings = RB_ROOT; + pgt->mmu = mmu; + + return 0; +} + +void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + struct rb_node *node; + + if (!handle) + return; + + node = rb_first(&pgt->pkvm_mappings); + while (node) { + mapping = rb_entry(node, struct pkvm_mapping, node); + kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); + node = rb_next(node); + rb_erase(&mapping->node, &pgt->pkvm_mappings); + kfree(mapping); + } +} + +int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, + u64 phys, enum kvm_pgtable_prot prot, + void *mc, enum kvm_pgtable_walk_flags flags) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping = NULL; + struct kvm_hyp_memcache *cache = mc; + u64 gfn = addr >> PAGE_SHIFT; + u64 pfn = phys >> PAGE_SHIFT; + int ret; + + if (size != PAGE_SIZE) + return -EINVAL; + + lockdep_assert_held_write(&kvm->mmu_lock); + ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot); + if (ret) { + /* Is the gfn already mapped due to a racing vCPU? */ + if (ret == -EPERM) + return -EAGAIN; + } + + swap(mapping, cache->mapping); + mapping->gfn = gfn; + mapping->pfn = pfn; + WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm_mappings, cmp_mappings)); + + return ret; +} + +int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held_write(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn); + if (WARN_ON(ret)) + break; + rb_erase(&mapping->node, &pgt->pkvm_mappings); + kfree(mapping); + } + + return ret; +} + +int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + int ret = 0; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) { + ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn); + if (WARN_ON(ret)) + break; + } + + return ret; +} + +int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + struct pkvm_mapping *mapping; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE); + + return 0; +} + +bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold) +{ + struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu); + pkvm_handle_t handle = kvm->arch.pkvm.handle; + struct pkvm_mapping *mapping; + bool young = false; + + lockdep_assert_held(&kvm->mmu_lock); + for_each_mapping_in_range_safe(pgt, addr, addr + size, mapping) + young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn, + mkold); + + return young; +} + +int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot, + enum kvm_pgtable_walk_flags flags) +{ + return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, addr >> PAGE_SHIFT, prot); +} + +void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr, + enum kvm_pgtable_walk_flags flags) +{ + WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT)); +} + +void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level) +{ + WARN_ON_ONCE(1); +} + +kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level, + enum kvm_pgtable_prot prot, void *mc, bool force_pte) +{ + WARN_ON_ONCE(1); + return NULL; +} + +int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} From fce886a6020734d6253c2c5a3bc285e385cc5496 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Wed, 18 Dec 2024 19:40:59 +0000 Subject: [PATCH 37/87] KVM: arm64: Plumb the pKVM MMU in KVM Introduce the KVM_PGT_CALL() helper macro to allow switching from the traditional pgtable code to the pKVM version easily in mmu.c. The cost of this 'indirection' is expected to be very minimal due to is_protected_kvm_enabled() being backed by a static key. With this, everything is in place to allow the delegation of non-protected guest stage-2 page-tables to pKVM, so let's stop using the host's kvm_s2_mmu from EL2 and enjoy the ride. Tested-by: Fuad Tabba Reviewed-by: Fuad Tabba Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20241218194059.3670226-19-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 16 ++++++ arch/arm64/kvm/arm.c | 9 +++- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 2 - arch/arm64/kvm/mmu.c | 87 ++++++++++++++++++++---------- 4 files changed, 82 insertions(+), 32 deletions(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 66d93e320ec8..d116ab4230e8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -353,6 +353,22 @@ static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) return &kvm->arch.mmu != mmu; } +static inline void kvm_fault_lock(struct kvm *kvm) +{ + if (is_protected_kvm_enabled()) + write_lock(&kvm->mmu_lock); + else + read_lock(&kvm->mmu_lock); +} + +static inline void kvm_fault_unlock(struct kvm *kvm) +{ + if (is_protected_kvm_enabled()) + write_unlock(&kvm->mmu_lock); + else + read_unlock(&kvm->mmu_lock); +} + #ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS void kvm_s2_ptdump_create_debugfs(struct kvm *kvm); #else diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 55cc62b2f469..9bcbc7b8ed38 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -502,7 +502,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + if (!is_protected_kvm_enabled()) + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + else + free_hyp_memcache(&vcpu->arch.pkvm_memcache); kvm_timer_vcpu_terminate(vcpu); kvm_pmu_vcpu_destroy(vcpu); kvm_vgic_vcpu_destroy(vcpu); @@ -574,6 +577,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) struct kvm_s2_mmu *mmu; int *last_ran; + if (is_protected_kvm_enabled()) + goto nommu; + if (vcpu_has_nv(vcpu)) kvm_vcpu_load_hw_mmu(vcpu); @@ -594,6 +600,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) *last_ran = vcpu->vcpu_idx; } +nommu: vcpu->cpu = cpu; kvm_vgic_load(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 130f5f23bcb5..258d572eed62 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -103,8 +103,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) /* Limit guest vector length to the maximum supported by the host. */ hyp_vcpu->vcpu.arch.sve_max_vl = min(host_vcpu->arch.sve_max_vl, kvm_host_sve_max_vl); - hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 641e4fec1659..9403524c11c6 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,8 @@ static phys_addr_t __ro_after_init hyp_idmap_vector; static unsigned long __ro_after_init io_map_base; +#define KVM_PGT_FN(fn) (!is_protected_kvm_enabled() ? fn : p ## fn) + static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, phys_addr_t size) { @@ -147,7 +150,7 @@ static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, return -EINVAL; next = __stage2_range_addr_end(addr, end, chunk_size); - ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache); + ret = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr, next - addr, cache); if (ret) break; } while (addr = next, addr != end); @@ -168,15 +171,23 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { - kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); return 0; } int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { - kvm_tlb_flush_vmid_range(&kvm->arch.mmu, - gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); + u64 size = nr_pages << PAGE_SHIFT; + u64 addr = gfn << PAGE_SHIFT; + + if (is_protected_kvm_enabled()) + kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle); + else + kvm_tlb_flush_vmid_range(&kvm->arch.mmu, addr, size); return 0; } @@ -225,7 +236,7 @@ static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) void *pgtable = page_to_virt(page); s8 level = page_private(page); - kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); + KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops, pgtable, level); } static void stage2_free_unlinked_table(void *addr, s8 level) @@ -324,7 +335,7 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 lockdep_assert_held_write(&kvm->mmu_lock); WARN_ON(size & ~PAGE_MASK); - WARN_ON(stage2_apply_range(mmu, start, end, kvm_pgtable_stage2_unmap, + WARN_ON(stage2_apply_range(mmu, start, end, KVM_PGT_FN(kvm_pgtable_stage2_unmap), may_block)); } @@ -336,7 +347,7 @@ void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_flush); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_flush)); } static void stage2_flush_memslot(struct kvm *kvm, @@ -942,10 +953,14 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return -ENOMEM; mmu->arch = &kvm->arch; - err = kvm_pgtable_stage2_init(pgt, mmu, &kvm_s2_mm_ops); + err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops); if (err) goto out_free_pgtable; + mmu->pgt = pgt; + if (is_protected_kvm_enabled()) + return 0; + mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran)); if (!mmu->last_vcpu_ran) { err = -ENOMEM; @@ -959,7 +974,6 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; mmu->split_page_cache.gfp_zero = __GFP_ZERO; - mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); if (kvm_is_nested_s2_mmu(kvm, mmu)) @@ -968,7 +982,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t return 0; out_destroy_pgtable: - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); out_free_pgtable: kfree(pgt); return err; @@ -1065,7 +1079,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) write_unlock(&kvm->mmu_lock); if (pgt) { - kvm_pgtable_stage2_destroy(pgt); + KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt); kfree(pgt); } } @@ -1082,9 +1096,11 @@ static void *hyp_mc_alloc_fn(void *unused) void free_hyp_memcache(struct kvm_hyp_memcache *mc) { - if (is_protected_kvm_enabled()) - __free_hyp_memcache(mc, hyp_mc_free_fn, - kvm_host_va, NULL); + if (!is_protected_kvm_enabled()) + return; + + kfree(mc->mapping); + __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, NULL); } int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) @@ -1092,6 +1108,12 @@ int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages) if (!is_protected_kvm_enabled()) return 0; + if (!mc->mapping) { + mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT); + if (!mc->mapping) + return -ENOMEM; + } + return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn, kvm_host_pa, NULL); } @@ -1130,8 +1152,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, break; write_lock(&kvm->mmu_lock); - ret = kvm_pgtable_stage2_map(pgt, addr, PAGE_SIZE, pa, prot, - &cache, 0); + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, addr, PAGE_SIZE, + pa, prot, &cache, 0); write_unlock(&kvm->mmu_lock); if (ret) break; @@ -1151,7 +1173,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, */ void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) { - stage2_apply_range_resched(mmu, addr, end, kvm_pgtable_stage2_wrprotect); + stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect)); } /** @@ -1442,9 +1464,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long mmu_seq; phys_addr_t ipa = fault_ipa; struct kvm *kvm = vcpu->kvm; - struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; struct vm_area_struct *vma; short vma_shift; + void *memcache; gfn_t gfn; kvm_pfn_t pfn; bool logging_active = memslot_is_logging(memslot); @@ -1472,8 +1494,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * and a write fault needs to collapse a block entry into a table. */ if (!fault_is_perm || (logging_active && write_fault)) { - ret = kvm_mmu_topup_memory_cache(memcache, - kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu)); + int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); + + if (!is_protected_kvm_enabled()) { + memcache = &vcpu->arch.mmu_page_cache; + ret = kvm_mmu_topup_memory_cache(memcache, min_pages); + } else { + memcache = &vcpu->arch.pkvm_memcache; + ret = topup_hyp_memcache(memcache, min_pages); + } if (ret) return ret; } @@ -1494,7 +1523,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * logging_active is guaranteed to never be true for VM_PFNMAP * memslots. */ - if (logging_active) { + if (logging_active || is_protected_kvm_enabled()) { force_pte = true; vma_shift = PAGE_SHIFT; } else { @@ -1634,7 +1663,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, prot |= kvm_encode_nested_level(nested); } - read_lock(&kvm->mmu_lock); + kvm_fault_lock(kvm); pgt = vcpu->arch.hw_mmu->pgt; if (mmu_invalidate_retry(kvm, mmu_seq)) { ret = -EAGAIN; @@ -1696,16 +1725,16 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * PTE, which will be preserved. */ prot &= ~KVM_NV_GUEST_MAP_SZ; - ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot, flags); + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags); } else { - ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize, __pfn_to_phys(pfn), prot, memcache, flags); } out_unlock: kvm_release_faultin_page(kvm, page, !!ret, writable); - read_unlock(&kvm->mmu_lock); + kvm_fault_unlock(kvm); /* Mark the page dirty only if the fault is handled successfully */ if (writable && !ret) @@ -1724,7 +1753,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) read_lock(&vcpu->kvm->mmu_lock); mmu = vcpu->arch.hw_mmu; - kvm_pgtable_stage2_mkyoung(mmu->pgt, fault_ipa, flags); + KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(mmu->pgt, fault_ipa, flags); read_unlock(&vcpu->kvm->mmu_lock); } @@ -1764,7 +1793,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) } /* Falls between the IPA range and the PARange? */ - if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) { + if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) { fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0); if (is_iabt) @@ -1930,7 +1959,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, true); /* @@ -1946,7 +1975,7 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.mmu.pgt) return false; - return kvm_pgtable_stage2_test_clear_young(kvm->arch.mmu.pgt, + return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT, size, false); } From 2589dbd72797a4163dd998b05c4663ff98bd0771 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 20 Dec 2024 11:33:05 +0000 Subject: [PATCH 38/87] KVM: arm64: Consolidate allowed and restricted VM feature checks The definitions for features allowed and allowed with restrictions for protected guests, which are based on feature registers, were defined and checked for separately, even though they are handled in the same way. This could result in missing checks for certain features, e.g., pointer authentication, causing traps for allowed features. Consolidate the definitions into one. Use that new definition to construct the guest view of the feature registers for consistency. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-2-tabba@google.com Signed-off-by: Marc Zyngier --- .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 55 +++++++------------ arch/arm64/kvm/hyp/nvhe/pkvm.c | 8 +-- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 6 +- 3 files changed, 26 insertions(+), 43 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index f957890c7e38..d1e59b88ff66 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -14,11 +14,8 @@ * guest virtual machines, depending on the mode KVM is running in and on the * type of guest that is running. * - * The ALLOW masks represent a bitmask of feature fields that are allowed - * without any restrictions as long as they are supported by the system. - * - * The RESTRICT_UNSIGNED masks, if present, represent unsigned fields for - * features that are restricted to support at most the specified feature. + * Each field in the masks represents the highest supported *unsigned* value for + * the feature, if supported by the system. * * If a feature field is not present in either, than it is not supported. * @@ -34,16 +31,7 @@ * - Floating-point and Advanced SIMD * - Data Independent Timing * - Spectre/Meltdown Mitigation - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) \ - ) - -/* + * * Restrict to the following *unsigned* features for protected VMs: * - AArch64 guests only (no support for AArch32 guests): * AArch32 adds complexity in trap handling, emulation, condition codes, @@ -51,7 +39,12 @@ * - RAS (v1) * Supported by KVM */ -#define PVM_ID_AA64PFR0_RESTRICT_UNSIGNED (\ +#define PVM_ID_AA64PFR0_ALLOW (\ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ + ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) | \ SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP) | \ SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP) | \ SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP) | \ @@ -77,20 +70,16 @@ * - Distinction between Secure and Non-secure Memory * - Mixed-endian at EL0 only * - Non-context synchronizing exception entry and exit + * + * Restrict to the following *unsigned* features for protected VMs: + * - 40-bit IPA + * - 16-bit ASID */ #define PVM_ID_AA64MMFR0_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) \ - ) - -/* - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED (\ + ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) | \ FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ ) @@ -185,15 +174,6 @@ ) /* Restrict pointer authentication to the basic version. */ -#define PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ - ) - -#define PVM_ID_AA64ISAR2_RESTRICT_UNSIGNED (\ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ - ) - #define PVM_ID_AA64ISAR1_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ @@ -206,13 +186,16 @@ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) \ + ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ ) #define PVM_ID_AA64ISAR2_ALLOW (\ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \ ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) \ + ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) | \ + FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ ) u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 071993c16de8..42b4d6e3b15c 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -36,9 +36,9 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) /* Protected KVM does not support AArch32 guests. */ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL0_IMP); + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) != ID_AA64PFR0_EL1_EL1_IMP); + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); /* * Linux guests assume support for floating-point and Advanced SIMD. Do @@ -362,8 +362,8 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) set_bit(KVM_ARM_VCPU_SVE, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED)) + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_ALLOW)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 2860548d4250..59fb2f056177 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -89,7 +89,7 @@ static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED); + PVM_ID_AA64PFR0_ALLOW); return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; } @@ -189,7 +189,7 @@ static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) u64 set_mask; set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_RESTRICT_UNSIGNED); + PVM_ID_AA64MMFR0_ALLOW); return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; } @@ -276,7 +276,7 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, * of AArch32 feature id registers. */ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_RESTRICT_UNSIGNED) > ID_AA64PFR0_EL1_EL1_IMP); + PVM_ID_AA64PFR0_ALLOW) > ID_AA64PFR0_EL1_EL1_IMP); return pvm_access_raz_wi(vcpu, p, r); } From f50758260bfff393f2a800469b37c45a7ef50376 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 20 Dec 2024 11:33:05 +0000 Subject: [PATCH 39/87] KVM: arm64: Group setting traps for protected VMs by control register Group setting protected VM traps by control register rather than feature id register, since some trap values (e.g., PAuth), depend on more than one feature id register. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-3-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 351 +++++++++++++++------------------ 1 file changed, 161 insertions(+), 190 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 42b4d6e3b15c..3af5fca64f67 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -23,187 +23,6 @@ unsigned int kvm_arm_vmid_bits; unsigned int kvm_host_sve_max_vl; -/* - * Set trap register values based on features in ID_AA64PFR0. - */ -static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - u64 hcr_set = HCR_RW; - u64 hcr_clear = 0; - u64 cptr_set = 0; - u64 cptr_clear = 0; - - /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); - - /* - * Linux guests assume support for floating-point and Advanced SIMD. Do - * not change the trapping behavior for these from the KVM default. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); - - if (has_hvhe()) - hcr_set |= HCR_E2H; - - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < - ID_AA64PFR0_EL1_RAS_V1P1) { - hcr_set |= HCR_TERR | HCR_TEA; - hcr_clear |= HCR_FIEN; - } - - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), feature_ids)) { - hcr_clear |= HCR_AMVOFFEN; - cptr_set |= CPTR_EL2_TAM; - } - - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { - if (has_hvhe()) - cptr_clear |= CPACR_ELx_ZEN; - else - cptr_set |= CPTR_EL2_TZ; - } - - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; - vcpu->arch.cptr_el2 &= ~cptr_clear; -} - -/* - * Set trap register values based on features in ID_AA64PFR1. - */ -static void pvm_init_traps_aa64pfr1(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - u64 hcr_set = 0; - u64 hcr_clear = 0; - - /* Memory Tagging: Trap and Treat as Untagged if not supported. */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), feature_ids)) { - hcr_set |= HCR_TID5; - hcr_clear |= HCR_DCT | HCR_ATA; - } - - vcpu->arch.hcr_el2 |= hcr_set; - vcpu->arch.hcr_el2 &= ~hcr_clear; -} - -/* - * Set trap register values based on features in ID_AA64DFR0. - */ -static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - u64 mdcr_set = 0; - u64 mdcr_clear = 0; - u64 cptr_set = 0; - - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; - mdcr_clear |= MDCR_EL2_HPME | MDCR_EL2_MTPME | - MDCR_EL2_HPMN_MASK; - } - - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), feature_ids)) - mdcr_set |= MDCR_EL2_TDRA | MDCR_EL2_TDA | MDCR_EL2_TDE; - - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), feature_ids)) - mdcr_set |= MDCR_EL2_TDOSA; - - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), feature_ids)) { - mdcr_set |= MDCR_EL2_TPMS; - mdcr_clear |= MDCR_EL2_E2PB_MASK; - } - - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), feature_ids)) - mdcr_set |= MDCR_EL2_TTRF; - - /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { - if (has_hvhe()) - cptr_set |= CPACR_EL1_TTA; - else - cptr_set |= CPTR_EL2_TTA; - } - - /* Trap External Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), feature_ids)) - mdcr_clear |= MDCR_EL2_E2TB_MASK; - - vcpu->arch.mdcr_el2 |= mdcr_set; - vcpu->arch.mdcr_el2 &= ~mdcr_clear; - vcpu->arch.cptr_el2 |= cptr_set; -} - -/* - * Set trap register values based on features in ID_AA64MMFR0. - */ -static void pvm_init_traps_aa64mmfr0(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); - u64 mdcr_set = 0; - - /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), feature_ids)) - mdcr_set |= MDCR_EL2_TDCC; - - vcpu->arch.mdcr_el2 |= mdcr_set; -} - -/* - * Set trap register values based on features in ID_AA64MMFR1. - */ -static void pvm_init_traps_aa64mmfr1(struct kvm_vcpu *vcpu) -{ - const u64 feature_ids = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); - u64 hcr_set = 0; - - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), feature_ids)) - hcr_set |= HCR_TLOR; - - vcpu->arch.hcr_el2 |= hcr_set; -} - -/* - * Set baseline trap register values. - */ -static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) -{ - const u64 hcr_trap_feat_regs = HCR_TID3; - const u64 hcr_trap_impdef = HCR_TACR | HCR_TIDCP | HCR_TID1; - - /* - * Always trap: - * - Feature id registers: to control features exposed to guests - * - Implementation-defined features - */ - vcpu->arch.hcr_el2 |= hcr_trap_feat_regs | hcr_trap_impdef; - - /* Clear res0 and set res1 bits to trap potential new features. */ - vcpu->arch.hcr_el2 &= ~(HCR_RES0); - vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - if (!has_hvhe()) { - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); - } -} - static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; @@ -231,25 +50,177 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); } +static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) +{ + const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + const u64 id_aa64mmfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + u64 val = vcpu->arch.hcr_el2; + + /* No support for AArch32. */ + val |= HCR_RW; + + if (has_hvhe()) + val |= HCR_E2H; + + /* + * Always trap: + * - Feature id registers: to control features exposed to guests + * - Implementation-defined features + */ + val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; + + /* Trap RAS unless all current versions are supported */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0) < + ID_AA64PFR0_EL1_RAS_V1P1) { + val |= HCR_TERR | HCR_TEA; + val &= ~(HCR_FIEN); + } + + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + val &= ~(HCR_AMVOFFEN); + + /* Memory Tagging: Trap and Treat as Untagged if not supported. */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), id_aa64pfr1)) { + val |= HCR_TID5; + val &= ~(HCR_DCT | HCR_ATA); + } + + /* Trap LOR */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), id_aa64mmfr1)) + val |= HCR_TLOR; + + vcpu->arch.hcr_el2 = val; +} + +static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) +{ + const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); + const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); + const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + u64 val = vcpu->arch.cptr_el2; + + if (!has_hvhe()) { + val |= CPTR_NVHE_EL2_RES1; + val &= ~(CPTR_NVHE_EL2_RES0); + } + + /* Trap AMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + val |= CPTR_EL2_TAM; + + /* Trap SVE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), id_aa64pfr0)) { + if (has_hvhe()) + val &= ~(CPACR_ELx_ZEN); + else + val |= CPTR_EL2_TZ; + } + + /* No SME support in KVM. */ + BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME), id_aa64pfr1)); + if (has_hvhe()) + val &= ~(CPACR_ELx_SMEN); + else + val |= CPTR_EL2_TSM; + + /* Trap Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), id_aa64dfr0)) { + if (has_hvhe()) + val |= CPACR_EL1_TTA; + else + val |= CPTR_EL2_TTA; + } + + vcpu->arch.cptr_el2 = val; +} + +static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) +{ + const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + const u64 id_aa64mmfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + u64 val = vcpu->arch.mdcr_el2; + + /* Trap/constrain PMU */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), id_aa64dfr0)) { + val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); + } + + /* Trap Debug */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0)) + val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; + + /* Trap OS Double Lock */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), id_aa64dfr0)) + val |= MDCR_EL2_TDOSA; + + /* Trap SPE */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), id_aa64dfr0)) { + val |= MDCR_EL2_TPMS; + val &= ~MDCR_EL2_E2PB_MASK; + } + + /* Trap Trace Filter */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), id_aa64dfr0)) + val |= MDCR_EL2_TTRF; + + /* Trap External Trace */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), id_aa64dfr0)) + val |= MDCR_EL2_E2TB_MASK; + + /* Trap Debug Communications Channel registers */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), id_aa64mmfr0)) + val |= MDCR_EL2_TDCC; + + vcpu->arch.mdcr_el2 = val; +} + /* * Initialize trap register values in protected mode. */ -static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) { + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); vcpu->arch.mdcr_el2 = 0; pkvm_vcpu_reset_hcr(vcpu); - if ((!vcpu_is_protected(vcpu))) + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) return; - pvm_init_trap_regs(vcpu); - pvm_init_traps_aa64pfr0(vcpu); - pvm_init_traps_aa64pfr1(vcpu); - pvm_init_traps_aa64dfr0(vcpu); - pvm_init_traps_aa64mmfr0(vcpu); - pvm_init_traps_aa64mmfr1(vcpu); + /* + * PAuth is allowed if supported by the system and the vcpu. + * Properly checking for PAuth requires checking various fields in + * ID_AA64ISAR1_EL1 and ID_AA64ISAR2_EL1. The way that fixed config + * is controlled now in pKVM does not easily allow that. This will + * change later to follow the changes upstream wrt fixed configuration + * and nested virt. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), + PVM_ID_AA64ISAR1_ALLOW)); + + /* Protected KVM does not support AArch32 guests. */ + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); + BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), + PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); + + /* + * Linux guests assume support for floating-point and Advanced SIMD. Do + * not change the trapping behavior for these from the KVM default. + */ + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), + PVM_ID_AA64PFR0_ALLOW)); + BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), + PVM_ID_AA64PFR0_ALLOW)); + + pvm_init_traps_hcr(vcpu); + pvm_init_traps_cptr(vcpu); + pvm_init_traps_mdcr(vcpu); } /* @@ -448,7 +419,7 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); pkvm_vcpu_init_ptrauth(hyp_vcpu); - pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); + pkvm_vcpu_init_traps(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); From 1fea164ccf19750c5bea688afd9122eb84eb3a72 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:43 +0000 Subject: [PATCH 40/87] KVM: arm64: Move checking protected vcpu features to a separate function At the moment, checks for supported vcpu features for protected VMs are build-time bugs. In the following patch, they will become runtime checks based on the vcpu's features registers. Therefore, consolidate them into one function that would return an error if it encounters an unsupported feature. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-4-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 45 ++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 3af5fca64f67..6a39f68563ae 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -178,20 +178,11 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) } /* - * Initialize trap register values in protected mode. + * Check that cpu features that are neither trapped nor supported are not + * enabled for protected VMs. */ -static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) +static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; - - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); - vcpu->arch.mdcr_el2 = 0; - - pkvm_vcpu_reset_hcr(vcpu); - - if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) - return; - /* * PAuth is allowed if supported by the system and the vcpu. * Properly checking for PAuth requires checking various fields in @@ -218,9 +209,34 @@ static void pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), PVM_ID_AA64PFR0_ALLOW)); + return 0; +} + +/* + * Initialize trap register values in protected mode. + */ +static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + int ret; + + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!pkvm_hyp_vcpu_is_protected(hyp_vcpu))) + return 0; + + ret = pkvm_check_pvm_cpu_features(vcpu); + if (ret) + return ret; + pvm_init_traps_hcr(vcpu); pvm_init_traps_cptr(vcpu); pvm_init_traps_mdcr(vcpu); + + return 0; } /* @@ -417,9 +433,12 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + ret = pkvm_vcpu_init_traps(hyp_vcpu); + if (ret) + goto done; + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); pkvm_vcpu_init_ptrauth(hyp_vcpu); - pkvm_vcpu_init_traps(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); From 27f5cf8ad5224033a711aef3fde90b60c9a8d7d5 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:44 +0000 Subject: [PATCH 41/87] KVM: arm64: Remove KVM_ARM_VCPU_POWER_OFF from protected VMs allowed features in pKVM The hypervisor is responsible for the power state of protected VMs in pKVM. Therefore, remove KVM_ARM_VCPU_POWER_OFF from the list of allowed features for protected VMs. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-5-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 6a39f68563ae..d7ca7e9ccea2 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -331,10 +331,8 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc /* * For protected VMs, always allow: - * - CPU starting in poweroff state * - PSCI v0.2 */ - set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); /* From a3163dca4817e9a30b154a14c793641e39a00592 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:45 +0000 Subject: [PATCH 42/87] KVM: arm64: Use KVM extension checks for allowed protected VM capabilities Use KVM extension checks as the source for determining which capabilities are allowed for protected VMs. KVM extension checks is the natural place for this, since it is also the interface exposed to users. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-6-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_pkvm.h | 25 +++++++++++++++++++++++++ arch/arm64/kvm/arm.c | 29 ++--------------------------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 24 ++++++------------------ 3 files changed, 33 insertions(+), 45 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index cd56acd9a842..400f7cef1e81 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -20,6 +20,31 @@ int pkvm_init_host_vm(struct kvm *kvm); int pkvm_create_hyp_vm(struct kvm *kvm); void pkvm_destroy_hyp_vm(struct kvm *kvm); +/* + * This functions as an allow-list of protected VM capabilities. + * Features not explicitly allowed by this function are denied. + */ +static inline bool kvm_pvm_ext_allowed(long ext) +{ + switch (ext) { + case KVM_CAP_IRQCHIP: + case KVM_CAP_ARM_PSCI: + case KVM_CAP_ARM_PSCI_0_2: + case KVM_CAP_NR_VCPUS: + case KVM_CAP_MAX_VCPUS: + case KVM_CAP_MAX_VCPU_ID: + case KVM_CAP_MSI_DEVID: + case KVM_CAP_ARM_VM_IPA_SIZE: + case KVM_CAP_ARM_PMU_V3: + case KVM_CAP_ARM_SVE: + case KVM_CAP_ARM_PTRAUTH_ADDRESS: + case KVM_CAP_ARM_PTRAUTH_GENERIC: + return true; + default: + return false; + } +} + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a102c3aebdbc..b295218cdc24 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -80,31 +80,6 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE; } -/* - * This functions as an allow-list of protected VM capabilities. - * Features not explicitly allowed by this function are denied. - */ -static bool pkvm_ext_allowed(struct kvm *kvm, long ext) -{ - switch (ext) { - case KVM_CAP_IRQCHIP: - case KVM_CAP_ARM_PSCI: - case KVM_CAP_ARM_PSCI_0_2: - case KVM_CAP_NR_VCPUS: - case KVM_CAP_MAX_VCPUS: - case KVM_CAP_MAX_VCPU_ID: - case KVM_CAP_MSI_DEVID: - case KVM_CAP_ARM_VM_IPA_SIZE: - case KVM_CAP_ARM_PMU_V3: - case KVM_CAP_ARM_SVE: - case KVM_CAP_ARM_PTRAUTH_ADDRESS: - case KVM_CAP_ARM_PTRAUTH_GENERIC: - return true; - default: - return false; - } -} - int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { @@ -113,7 +88,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, if (cap->flags) return -EINVAL; - if (kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, cap->cap)) + if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap)) return -EINVAL; switch (cap->cap) { @@ -311,7 +286,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; - if (kvm && kvm_vm_is_protected(kvm) && !pkvm_ext_allowed(kvm, ext)) + if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext)) return 0; switch (ext) { diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index d7ca7e9ccea2..c39d4e92dd3c 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -329,32 +329,20 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); - /* - * For protected VMs, always allow: - * - PSCI v0.2 - */ set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); - /* - * Check if remaining features are allowed: - * - Performance Monitoring - * - Scalable Vectors - * - Pointer Authentication - */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PMU_V3)) set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) - set_bit(KVM_ARM_VCPU_SVE, allowed_features); - - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_ALLOW) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_ADDRESS)) set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && - FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, allowed_features, KVM_VCPU_MAX_FEATURES); } From 7ba5b8f80475e48b486f095ee9fb67dc9f9d02df Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:46 +0000 Subject: [PATCH 43/87] KVM: arm64: Initialize feature id registers for protected VMs The hypervisor maintains the state of protected VMs. Initialize the values for feature ID registers for protected VMs, to be used when setting traps and when advertising features to protected VMs. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-7-tabba@google.com Signed-off-by: Marc Zyngier --- .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 1 + arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 2 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 10 +++-- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 45 +++++++++++++++++-- 4 files changed, 52 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index d1e59b88ff66..69e26d1a0ebe 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -201,6 +201,7 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); int kvm_check_pvm_sysreg_table(void); #endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 24a9a8330d19..698bc20ab80b 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -47,6 +47,8 @@ struct pkvm_hyp_vm { struct pkvm_hyp_vcpu *vcpus[]; }; +extern hyp_spinlock_t vm_table_lock; + static inline struct pkvm_hyp_vm * pkvm_hyp_vcpu_to_hyp_vm(struct pkvm_hyp_vcpu *hyp_vcpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index c39d4e92dd3c..c7958183e40d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -257,10 +257,10 @@ static pkvm_handle_t idx_to_vm_handle(unsigned int idx) /* * Spinlock for protecting state related to the VM table. Protects writes - * to 'vm_table' and 'nr_table_entries' as well as reads and writes to - * 'last_hyp_vcpu_lookup'. + * to 'vm_table', 'nr_table_entries', and other per-vm state on initialization. + * Also protects reads and writes to 'last_hyp_vcpu_lookup'. */ -static DEFINE_HYP_SPINLOCK(vm_table_lock); +DEFINE_HYP_SPINLOCK(vm_table_lock); /* * The table of VM entries for protected VMs in hyp. @@ -381,6 +381,7 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + hyp_vm->kvm.arch.flags = 0; pkvm_init_features_from_host(hyp_vm, host_kvm); } @@ -419,6 +420,9 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + if (pkvm_hyp_vcpu_is_protected(hyp_vcpu)) + kvm_init_pvm_id_regs(&hyp_vcpu->vcpu); + ret = pkvm_vcpu_init_traps(hyp_vcpu); if (ret) goto done; diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 59fb2f056177..2aea44c911bd 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -12,6 +12,7 @@ #include #include +#include #include "../../sys_regs.h" @@ -204,8 +205,7 @@ static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; } -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) { switch (id) { case SYS_ID_AA64PFR0_EL1: @@ -240,10 +240,25 @@ u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) } } +/* Read a sanitized cpufeature ID register by its encoding */ +u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + return pvm_calc_id_reg(vcpu, id); +} + static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { - return pvm_read_id_reg(vcpu, reg_to_encoding(r)); + struct kvm *kvm = vcpu->kvm; + u32 reg = reg_to_encoding(r); + + if (WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))) + return 0; + + if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7)) + return kvm->arch.id_regs[IDREG_IDX(reg)]; + + return 0; } /* Handler to RAZ/WI sysregs */ @@ -448,6 +463,30 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = { /* Performance Monitoring Registers are restricted. */ }; +/* + * Initializes feature registers for protected vms. + */ +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_arch *ka = &kvm->arch; + u32 r; + + hyp_assert_lock_held(&vm_table_lock); + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + /* + * Initialize only AArch64 id registers since AArch32 isn't supported + * for protected VMs. + */ + for (r = sys_reg(3, 0, 0, 4, 0); r <= sys_reg(3, 0, 0, 7, 7); r += sys_reg(0, 0, 0, 0, 1)) + ka->id_regs[IDREG_IDX(r)] = pvm_calc_id_reg(vcpu, r); + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + /* * Checks that the sysreg table is unique and in-order. * From 9df9186f8df513dc9bf9f95f68525c7ebc941bcd Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:47 +0000 Subject: [PATCH 44/87] KVM: arm64: Fix RAS trapping in pKVM for protected VMs Trap RAS in pKVM if not supported at all for protected VMs. The RAS version doesn't matter in this case. Fixes: 2a0c343386ae ("KVM: arm64: Initialize trap registers for protected VMs") Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-8-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index c7958183e40d..6eddd29b264b 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -70,9 +70,8 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) */ val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; - /* Trap RAS unless all current versions are supported */ - if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0) < - ID_AA64PFR0_EL1_RAS_V1P1) { + /* Trap RAS */ + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0)) { val |= HCR_TERR | HCR_TEA; val &= ~(HCR_FIEN); } From 0401f7e76d707741d2562f4988743cc5daf445e4 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:48 +0000 Subject: [PATCH 45/87] KVM: arm64: Set protected VM traps based on its view of feature registers Now that the VM's feature id registers are initialized with the values of the supported features, use those values to determine which traps to set using kvm_has_feature(). Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-9-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 84 +++++++++++------------------- arch/arm64/kvm/hyp/nvhe/sys_regs.c | 7 --- 2 files changed, 30 insertions(+), 61 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 6eddd29b264b..23afc63cac55 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -52,9 +52,7 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) { - const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - const u64 id_aa64mmfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR1_EL1); + struct kvm *kvm = vcpu->kvm; u64 val = vcpu->arch.hcr_el2; /* No support for AArch32. */ @@ -70,24 +68,20 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) */ val |= HCR_TACR | HCR_TIDCP | HCR_TID3 | HCR_TID1; - /* Trap RAS */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), id_aa64pfr0)) { + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, IMP)) { val |= HCR_TERR | HCR_TEA; val &= ~(HCR_FIEN); } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) val &= ~(HCR_AMVOFFEN); - /* Memory Tagging: Trap and Treat as Untagged if not supported. */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE), id_aa64pfr1)) { + if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, MTE, IMP)) { val |= HCR_TID5; val &= ~(HCR_DCT | HCR_ATA); } - /* Trap LOR */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_LO), id_aa64mmfr1)) + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, LO, IMP)) val |= HCR_TLOR; vcpu->arch.hcr_el2 = val; @@ -95,9 +89,7 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) { - const u64 id_aa64pfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR0_EL1); - const u64 id_aa64pfr1 = pvm_read_id_reg(vcpu, SYS_ID_AA64PFR1_EL1); - const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); + struct kvm *kvm = vcpu->kvm; u64 val = vcpu->arch.cptr_el2; if (!has_hvhe()) { @@ -105,12 +97,11 @@ static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) val &= ~(CPTR_NVHE_EL2_RES0); } - /* Trap AMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU), id_aa64pfr0)) + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) val |= CPTR_EL2_TAM; - /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), id_aa64pfr0)) { + /* SVE can be disabled by userspace even if supported. */ + if (!vcpu_has_sve(vcpu)) { if (has_hvhe()) val &= ~(CPACR_ELx_ZEN); else @@ -118,14 +109,13 @@ static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) } /* No SME support in KVM. */ - BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME), id_aa64pfr1)); + BUG_ON(kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)); if (has_hvhe()) val &= ~(CPACR_ELx_SMEN); else val |= CPTR_EL2_TSM; - /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), id_aa64dfr0)) { + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP)) { if (has_hvhe()) val |= CPACR_EL1_TTA; else @@ -137,40 +127,33 @@ static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) { - const u64 id_aa64dfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64DFR0_EL1); - const u64 id_aa64mmfr0 = pvm_read_id_reg(vcpu, SYS_ID_AA64MMFR0_EL1); + struct kvm *kvm = vcpu->kvm; u64 val = vcpu->arch.mdcr_el2; - /* Trap/constrain PMU */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), id_aa64dfr0)) { + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP)) { val |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; val &= ~(MDCR_EL2_HPME | MDCR_EL2_MTPME | MDCR_EL2_HPMN_MASK); } - /* Trap Debug */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, IMP)) val |= MDCR_EL2_TDRA | MDCR_EL2_TDA; - /* Trap OS Double Lock */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DoubleLock), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DoubleLock, IMP)) val |= MDCR_EL2_TDOSA; - /* Trap SPE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer), id_aa64dfr0)) { + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP)) { val |= MDCR_EL2_TPMS; val &= ~MDCR_EL2_E2PB_MASK; } - /* Trap Trace Filter */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceFilt), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP)) val |= MDCR_EL2_TTRF; - /* Trap External Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_ExtTrcBuff), id_aa64dfr0)) + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, ExtTrcBuff, IMP)) val |= MDCR_EL2_E2TB_MASK; /* Trap Debug Communications Channel registers */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_FGT), id_aa64mmfr0)) + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP)) val |= MDCR_EL2_TDCC; vcpu->arch.mdcr_el2 = val; @@ -182,31 +165,24 @@ static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) */ static int pkvm_check_pvm_cpu_features(struct kvm_vcpu *vcpu) { - /* - * PAuth is allowed if supported by the system and the vcpu. - * Properly checking for PAuth requires checking various fields in - * ID_AA64ISAR1_EL1 and ID_AA64ISAR2_EL1. The way that fixed config - * is controlled now in pKVM does not easily allow that. This will - * change later to follow the changes upstream wrt fixed configuration - * and nested virt. - */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), - PVM_ID_AA64ISAR1_ALLOW)); + struct kvm *kvm = vcpu->kvm; /* Protected KVM does not support AArch32 guests. */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL0_IMP); - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_ALLOW) != ID_AA64PFR0_EL1_EL1_IMP); + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL0, AARCH32) || + kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL1, AARCH32)) + return -EINVAL; /* * Linux guests assume support for floating-point and Advanced SIMD. Do * not change the trapping behavior for these from the KVM default. */ - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP), - PVM_ID_AA64PFR0_ALLOW)); - BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), - PVM_ID_AA64PFR0_ALLOW)); + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, FP, IMP) || + !kvm_has_feat(kvm, ID_AA64PFR0_EL1, AdvSIMD, IMP)) + return -EINVAL; + + /* No SME support in KVM right now. Check to catch if it changes. */ + if (kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)) + return -EINVAL; return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 2aea44c911bd..398563d3a266 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -286,13 +286,6 @@ static bool pvm_access_id_aarch32(struct kvm_vcpu *vcpu, return false; } - /* - * No support for AArch32 guests, therefore, pKVM has no sanitized copy - * of AArch32 feature id registers. - */ - BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL1), - PVM_ID_AA64PFR0_ALLOW) > ID_AA64PFR0_EL1_EL1_IMP); - return pvm_access_raz_wi(vcpu, p, r); } From 3d7ff00700d1a4d0c8f092f2c1bf67553a8c7c4c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:49 +0000 Subject: [PATCH 46/87] KVM: arm64: Rework specifying restricted features for protected VMs The existing code didn't properly distinguish between signed and unsigned features, and was difficult to read and to maintain. Rework it using the same method used in other parts of KVM when handling vcpu features. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-10-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 + .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 1 - arch/arm64/kvm/hyp/nvhe/sys_regs.c | 411 +++++++++--------- 3 files changed, 216 insertions(+), 197 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e18e9244d17a..d3230a22846f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1422,6 +1422,7 @@ static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature) return test_bit(feature, ka->vcpu_features); } +#define kvm_vcpu_has_feature(k, f) __vcpu_has_feature(&(k)->arch, (f)) #define vcpu_has_feature(v, f) __vcpu_has_feature(&(v)->kvm->arch, (f)) #define kvm_vcpu_initialized(v) vcpu_get_flag(vcpu, VCPU_INITIALIZED) diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h index 69e26d1a0ebe..37a6d2434e47 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h @@ -198,7 +198,6 @@ FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ ) -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id); bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 398563d3a266..934d7c4b04b4 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -29,6 +29,221 @@ u64 id_aa64mmfr1_el1_sys_val; u64 id_aa64mmfr2_el1_sys_val; u64 id_aa64smfr0_el1_sys_val; +struct pvm_ftr_bits { + bool sign; + u8 shift; + u8 width; + u8 max_val; + bool (*vm_supported)(const struct kvm *kvm); +}; + +#define __MAX_FEAT_FUNC(id, fld, max, func, sgn) \ + { \ + .sign = sgn, \ + .shift = id##_##fld##_SHIFT, \ + .width = id##_##fld##_WIDTH, \ + .max_val = id##_##fld##_##max, \ + .vm_supported = func, \ + } + +#define MAX_FEAT_FUNC(id, fld, max, func) \ + __MAX_FEAT_FUNC(id, fld, max, func, id##_##fld##_SIGNED) + +#define MAX_FEAT(id, fld, max) \ + MAX_FEAT_FUNC(id, fld, max, NULL) + +#define MAX_FEAT_ENUM(id, fld, max) \ + __MAX_FEAT_FUNC(id, fld, max, NULL, false) + +#define FEAT_END { .width = 0, } + +static bool vm_has_ptrauth(const struct kvm *kvm) +{ + if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH)) + return false; + + return (cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || + cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC); +} + +static bool vm_has_sve(const struct kvm *kvm) +{ + return system_supports_sve() && kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_SVE); +} + +/* + * Definitions for features to be allowed or restricted for protected guests. + * + * Each field in the masks represents the highest supported value for the + * feature. If a feature field is not present, it is not supported. Moreover, + * these are used to generate the guest's view of the feature registers. + * + * The approach for protected VMs is to at least support features that are: + * - Needed by common Linux distributions (e.g., floating point) + * - Trivial to support, e.g., supporting the feature does not introduce or + * require tracking of additional state in KVM + * - Cannot be trapped or prevent the guest from using anyway + */ + +static const struct pvm_ftr_bits pvmid_aa64pfr0[] = { + MAX_FEAT(ID_AA64PFR0_EL1, EL0, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL1, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, EL3, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, FP, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, AdvSIMD, FP16), + MAX_FEAT(ID_AA64PFR0_EL1, GIC, IMP), + MAX_FEAT_FUNC(ID_AA64PFR0_EL1, SVE, IMP, vm_has_sve), + MAX_FEAT(ID_AA64PFR0_EL1, RAS, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, DIT, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV2, IMP), + MAX_FEAT(ID_AA64PFR0_EL1, CSV3, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64pfr1[] = { + MAX_FEAT(ID_AA64PFR1_EL1, BT, IMP), + MAX_FEAT(ID_AA64PFR1_EL1, SSBS, SSBS2), + MAX_FEAT_ENUM(ID_AA64PFR1_EL1, MTE_frac, NI), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64mmfr0[] = { + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, PARANGE, 40), + MAX_FEAT_ENUM(ID_AA64MMFR0_EL1, ASIDBITS, 16), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGEND, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, SNSMEM, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, BIGENDEL0, IMP), + MAX_FEAT(ID_AA64MMFR0_EL1, EXS, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64mmfr1[] = { + MAX_FEAT(ID_AA64MMFR1_EL1, HAFDBS, DBM), + MAX_FEAT_ENUM(ID_AA64MMFR1_EL1, VMIDBits, 16), + MAX_FEAT(ID_AA64MMFR1_EL1, HPDS, HPDS2), + MAX_FEAT(ID_AA64MMFR1_EL1, PAN, PAN3), + MAX_FEAT(ID_AA64MMFR1_EL1, SpecSEI, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, ETS, IMP), + MAX_FEAT(ID_AA64MMFR1_EL1, CMOW, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64mmfr2[] = { + MAX_FEAT(ID_AA64MMFR2_EL1, CnP, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, UAO, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, IESB, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, AT, IMP), + MAX_FEAT_ENUM(ID_AA64MMFR2_EL1, IDS, 0x18), + MAX_FEAT(ID_AA64MMFR2_EL1, TTL, IMP), + MAX_FEAT(ID_AA64MMFR2_EL1, BBM, 2), + MAX_FEAT(ID_AA64MMFR2_EL1, E0PD, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64isar1[] = { + MAX_FEAT(ID_AA64ISAR1_EL1, DPB, DPB2), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, APA, PAuth, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR1_EL1, API, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR1_EL1, JSCVT, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FCMA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, LRCPC, LRCPC3), + MAX_FEAT(ID_AA64ISAR1_EL1, GPA, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, GPI, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, FRINTTS, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SB, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX), + MAX_FEAT(ID_AA64ISAR1_EL1, BF16, EBF16), + MAX_FEAT(ID_AA64ISAR1_EL1, DGH, IMP), + MAX_FEAT(ID_AA64ISAR1_EL1, I8MM, IMP), + FEAT_END +}; + +static const struct pvm_ftr_bits pvmid_aa64isar2[] = { + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, GPA3, IMP, vm_has_ptrauth), + MAX_FEAT_FUNC(ID_AA64ISAR2_EL1, APA3, PAuth, vm_has_ptrauth), + MAX_FEAT(ID_AA64ISAR2_EL1, ATS1A, IMP), + FEAT_END +}; + +/* + * None of the features in ID_AA64DFR0_EL1 nor ID_AA64MMFR4_EL1 are supported. + * However, both have Not-Implemented values that are non-zero. Define them + * so they can be used when getting the value of these registers. + */ +#define ID_AA64DFR0_EL1_NONZERO_NI \ +( \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DoubleLock, NI) | \ + SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, MTPMU, NI) \ +) + +#define ID_AA64MMFR4_EL1_NONZERO_NI \ + SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI) + +/* + * Returns the value of the feature registers based on the system register + * value, the vcpu support for the revelant features, and the additional + * restrictions for protected VMs. + */ +static u64 get_restricted_features(const struct kvm_vcpu *vcpu, + u64 sys_reg_val, + const struct pvm_ftr_bits restrictions[]) +{ + u64 val = 0UL; + int i; + + for (i = 0; restrictions[i].width != 0; i++) { + bool (*vm_supported)(const struct kvm *) = restrictions[i].vm_supported; + bool sign = restrictions[i].sign; + int shift = restrictions[i].shift; + int width = restrictions[i].width; + u64 min_signed = (1UL << width) - 1UL; + u64 sign_bit = 1UL << (width - 1); + u64 mask = GENMASK_ULL(width + shift - 1, shift); + u64 sys_val = (sys_reg_val & mask) >> shift; + u64 pvm_max = restrictions[i].max_val; + + if (vm_supported && !vm_supported(vcpu->kvm)) + val |= (sign ? min_signed : 0) << shift; + else if (sign && (sys_val >= sign_bit || pvm_max >= sign_bit)) + val |= max(sys_val, pvm_max) << shift; + else + val |= min(sys_val, pvm_max) << shift; + } + + return val; +} + +static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) +{ + switch (id) { + case SYS_ID_AA64PFR0_EL1: + return get_restricted_features(vcpu, id_aa64pfr0_el1_sys_val, pvmid_aa64pfr0); + case SYS_ID_AA64PFR1_EL1: + return get_restricted_features(vcpu, id_aa64pfr1_el1_sys_val, pvmid_aa64pfr1); + case SYS_ID_AA64ISAR0_EL1: + return id_aa64isar0_el1_sys_val; + case SYS_ID_AA64ISAR1_EL1: + return get_restricted_features(vcpu, id_aa64isar1_el1_sys_val, pvmid_aa64isar1); + case SYS_ID_AA64ISAR2_EL1: + return get_restricted_features(vcpu, id_aa64isar2_el1_sys_val, pvmid_aa64isar2); + case SYS_ID_AA64MMFR0_EL1: + return get_restricted_features(vcpu, id_aa64mmfr0_el1_sys_val, pvmid_aa64mmfr0); + case SYS_ID_AA64MMFR1_EL1: + return get_restricted_features(vcpu, id_aa64mmfr1_el1_sys_val, pvmid_aa64mmfr1); + case SYS_ID_AA64MMFR2_EL1: + return get_restricted_features(vcpu, id_aa64mmfr2_el1_sys_val, pvmid_aa64mmfr2); + case SYS_ID_AA64DFR0_EL1: + return ID_AA64DFR0_EL1_NONZERO_NI; + case SYS_ID_AA64MMFR4_EL1: + return ID_AA64MMFR4_EL1_NONZERO_NI; + default: + /* Unhandled ID register, RAZ */ + return 0; + } +} + /* * Inject an unknown/undefined exception to an AArch64 guest while most of its * sysregs are live. @@ -50,202 +265,6 @@ static void inject_undef64(struct kvm_vcpu *vcpu) write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); } -/* - * Returns the restricted features values of the feature register based on the - * limitations in restrict_fields. - * A feature id field value of 0b0000 does not impose any restrictions. - * Note: Use only for unsigned feature field values. - */ -static u64 get_restricted_features_unsigned(u64 sys_reg_val, - u64 restrict_fields) -{ - u64 value = 0UL; - u64 mask = GENMASK_ULL(ARM64_FEATURE_FIELD_BITS - 1, 0); - - /* - * According to the Arm Architecture Reference Manual, feature fields - * use increasing values to indicate increases in functionality. - * Iterate over the restricted feature fields and calculate the minimum - * unsigned value between the one supported by the system, and what the - * value is being restricted to. - */ - while (sys_reg_val && restrict_fields) { - value |= min(sys_reg_val & mask, restrict_fields & mask); - sys_reg_val &= ~mask; - restrict_fields &= ~mask; - mask <<= ARM64_FEATURE_FIELD_BITS; - } - - return value; -} - -/* - * Functions that return the value of feature id registers for protected VMs - * based on allowed features, system features, and KVM support. - */ - -static u64 get_pvm_id_aa64pfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask = 0; - u64 allow_mask = PVM_ID_AA64PFR0_ALLOW; - - set_mask |= get_restricted_features_unsigned(id_aa64pfr0_el1_sys_val, - PVM_ID_AA64PFR0_ALLOW); - - return (id_aa64pfr0_el1_sys_val & allow_mask) | set_mask; -} - -static u64 get_pvm_id_aa64pfr1(const struct kvm_vcpu *vcpu) -{ - const struct kvm *kvm = (const struct kvm *)kern_hyp_va(vcpu->kvm); - u64 allow_mask = PVM_ID_AA64PFR1_ALLOW; - - if (!kvm_has_mte(kvm)) - allow_mask &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); - - return id_aa64pfr1_el1_sys_val & allow_mask; -} - -static u64 get_pvm_id_aa64zfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for Scalable Vectors, therefore, hyp has no sanitized - * copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64ZFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, including breakpoints, and watchpoints, - * therefore, pKVM has no sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64dfr1(const struct kvm_vcpu *vcpu) -{ - /* - * No support for debug, therefore, hyp has no sanitized copy of the - * feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64DFR1_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64afr0(const struct kvm_vcpu *vcpu) -{ - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR0_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64afr1(const struct kvm_vcpu *vcpu) -{ - /* - * No support for implementation defined features, therefore, hyp has no - * sanitized copy of the feature id register. - */ - BUILD_BUG_ON(PVM_ID_AA64AFR1_ALLOW != 0ULL); - return 0; -} - -static u64 get_pvm_id_aa64isar0(const struct kvm_vcpu *vcpu) -{ - return id_aa64isar0_el1_sys_val & PVM_ID_AA64ISAR0_ALLOW; -} - -static u64 get_pvm_id_aa64isar1(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR1_ALLOW; - - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI)); - - return id_aa64isar1_el1_sys_val & allow_mask; -} - -static u64 get_pvm_id_aa64isar2(const struct kvm_vcpu *vcpu) -{ - u64 allow_mask = PVM_ID_AA64ISAR2_ALLOW; - - if (!vcpu_has_ptrauth(vcpu)) - allow_mask &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - - return id_aa64isar2_el1_sys_val & allow_mask; -} - -static u64 get_pvm_id_aa64mmfr0(const struct kvm_vcpu *vcpu) -{ - u64 set_mask; - - set_mask = get_restricted_features_unsigned(id_aa64mmfr0_el1_sys_val, - PVM_ID_AA64MMFR0_ALLOW); - - return (id_aa64mmfr0_el1_sys_val & PVM_ID_AA64MMFR0_ALLOW) | set_mask; -} - -static u64 get_pvm_id_aa64mmfr1(const struct kvm_vcpu *vcpu) -{ - return id_aa64mmfr1_el1_sys_val & PVM_ID_AA64MMFR1_ALLOW; -} - -static u64 get_pvm_id_aa64mmfr2(const struct kvm_vcpu *vcpu) -{ - return id_aa64mmfr2_el1_sys_val & PVM_ID_AA64MMFR2_ALLOW; -} - -static u64 pvm_calc_id_reg(const struct kvm_vcpu *vcpu, u32 id) -{ - switch (id) { - case SYS_ID_AA64PFR0_EL1: - return get_pvm_id_aa64pfr0(vcpu); - case SYS_ID_AA64PFR1_EL1: - return get_pvm_id_aa64pfr1(vcpu); - case SYS_ID_AA64ZFR0_EL1: - return get_pvm_id_aa64zfr0(vcpu); - case SYS_ID_AA64DFR0_EL1: - return get_pvm_id_aa64dfr0(vcpu); - case SYS_ID_AA64DFR1_EL1: - return get_pvm_id_aa64dfr1(vcpu); - case SYS_ID_AA64AFR0_EL1: - return get_pvm_id_aa64afr0(vcpu); - case SYS_ID_AA64AFR1_EL1: - return get_pvm_id_aa64afr1(vcpu); - case SYS_ID_AA64ISAR0_EL1: - return get_pvm_id_aa64isar0(vcpu); - case SYS_ID_AA64ISAR1_EL1: - return get_pvm_id_aa64isar1(vcpu); - case SYS_ID_AA64ISAR2_EL1: - return get_pvm_id_aa64isar2(vcpu); - case SYS_ID_AA64MMFR0_EL1: - return get_pvm_id_aa64mmfr0(vcpu); - case SYS_ID_AA64MMFR1_EL1: - return get_pvm_id_aa64mmfr1(vcpu); - case SYS_ID_AA64MMFR2_EL1: - return get_pvm_id_aa64mmfr2(vcpu); - default: - /* Unhandled ID register, RAZ */ - return 0; - } -} - -/* Read a sanitized cpufeature ID register by its encoding */ -u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id) -{ - return pvm_calc_id_reg(vcpu, id); -} - static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) { From 81403c8d04e1d85209cdb0e0ce32aa0019620c65 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:50 +0000 Subject: [PATCH 47/87] KVM: arm64: Remove fixed_config.h header The few remaining items needed in fixed_config.h are better suited for pkvm.h. Move them there and delete it. No functional change intended. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-11-tabba@google.com Signed-off-by: Marc Zyngier --- .../arm64/kvm/hyp/include/nvhe/fixed_config.h | 206 ------------------ arch/arm64/kvm/hyp/include/nvhe/pkvm.h | 5 + arch/arm64/kvm/hyp/nvhe/pkvm.c | 1 - arch/arm64/kvm/hyp/nvhe/setup.c | 1 - arch/arm64/kvm/hyp/nvhe/switch.c | 1 - arch/arm64/kvm/hyp/nvhe/sys_regs.c | 1 - 6 files changed, 5 insertions(+), 210 deletions(-) delete mode 100644 arch/arm64/kvm/hyp/include/nvhe/fixed_config.h diff --git a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h b/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h deleted file mode 100644 index 37a6d2434e47..000000000000 --- a/arch/arm64/kvm/hyp/include/nvhe/fixed_config.h +++ /dev/null @@ -1,206 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2021 Google LLC - * Author: Fuad Tabba - */ - -#ifndef __ARM64_KVM_FIXED_CONFIG_H__ -#define __ARM64_KVM_FIXED_CONFIG_H__ - -#include - -/* - * This file contains definitions for features to be allowed or restricted for - * guest virtual machines, depending on the mode KVM is running in and on the - * type of guest that is running. - * - * Each field in the masks represents the highest supported *unsigned* value for - * the feature, if supported by the system. - * - * If a feature field is not present in either, than it is not supported. - * - * The approach taken for protected VMs is to allow features that are: - * - Needed by common Linux distributions (e.g., floating point) - * - Trivial to support, e.g., supporting the feature does not introduce or - * require tracking of additional state in KVM - * - Cannot be trapped or prevent the guest from using anyway - */ - -/* - * Allow for protected VMs: - * - Floating-point and Advanced SIMD - * - Data Independent Timing - * - Spectre/Meltdown Mitigation - * - * Restrict to the following *unsigned* features for protected VMs: - * - AArch64 guests only (no support for AArch32 guests): - * AArch32 adds complexity in trap handling, emulation, condition codes, - * etc... - * - RAS (v1) - * Supported by KVM - */ -#define PVM_ID_AA64PFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_FP) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_DIT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | \ - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP) | \ - SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, RAS, IMP) \ - ) - -/* - * Allow for protected VMs: - * - Branch Target Identification - * - Speculative Store Bypassing - */ -#define PVM_ID_AA64PFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_BT) | \ - ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SSBS) \ - ) - -#define PVM_ID_AA64PFR2_ALLOW 0ULL - -/* - * Allow for protected VMs: - * - Mixed-endian - * - Distinction between Secure and Non-secure Memory - * - Mixed-endian at EL0 only - * - Non-context synchronizing exception entry and exit - * - * Restrict to the following *unsigned* features for protected VMs: - * - 40-bit IPA - * - 16-bit ASID - */ -#define PVM_ID_AA64MMFR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGEND) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_SNSMEM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_BIGENDEL0) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_EXS) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_PARANGE), ID_AA64MMFR0_EL1_PARANGE_40) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64MMFR0_EL1_ASIDBITS), ID_AA64MMFR0_EL1_ASIDBITS_16) \ - ) - -/* - * Allow for protected VMs: - * - Hardware translation table updates to Access flag and Dirty state - * - Number of VMID bits from CPU - * - Hierarchical Permission Disables - * - Privileged Access Never - * - SError interrupt exceptions from speculative reads - * - Enhanced Translation Synchronization - * - Control for cache maintenance permission - */ -#define PVM_ID_AA64MMFR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HAFDBS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_VMIDBits) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_HPDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_PAN) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_SpecSEI) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_ETS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR1_EL1_CMOW) \ - ) - -/* - * Allow for protected VMs: - * - Common not Private translations - * - User Access Override - * - IESB bit in the SCTLR_ELx registers - * - Unaligned single-copy atomicity and atomic functions - * - ESR_ELx.EC value on an exception by read access to feature ID space - * - TTL field in address operations. - * - Break-before-make sequences when changing translation block size - * - E0PDx mechanism - */ -#define PVM_ID_AA64MMFR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_CnP) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_UAO) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IESB) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_AT) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_IDS) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_TTL) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_BBM) | \ - ARM64_FEATURE_MASK(ID_AA64MMFR2_EL1_E0PD) \ - ) - -#define PVM_ID_AA64MMFR3_ALLOW (0ULL) - -/* - * No support for Scalable Vectors for protected VMs: - * Requires additional support from KVM, e.g., context-switching and - * trapping at EL2 - */ -#define PVM_ID_AA64ZFR0_ALLOW (0ULL) - -/* - * No support for debug, including breakpoints, and watchpoints for protected - * VMs: - * The Arm architecture mandates support for at least the Armv8 debug - * architecture, which would include at least 2 hardware breakpoints and - * watchpoints. Providing that support to protected guests adds - * considerable state and complexity. Therefore, the reserved value of 0 is - * used for debug-related fields. - */ -#define PVM_ID_AA64DFR0_ALLOW (0ULL) -#define PVM_ID_AA64DFR1_ALLOW (0ULL) - -/* - * No support for implementation defined features. - */ -#define PVM_ID_AA64AFR0_ALLOW (0ULL) -#define PVM_ID_AA64AFR1_ALLOW (0ULL) - -/* - * No restrictions on instructions implemented in AArch64. - */ -#define PVM_ID_AA64ISAR0_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_AES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA1) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA2) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_CRC32) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_ATOMIC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RDM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SHA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_SM4) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_DP) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_FHM) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_TLB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR0_EL1_RNDR) \ - ) - -/* Restrict pointer authentication to the basic version. */ -#define PVM_ID_AA64ISAR1_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DPB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_JSCVT) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FCMA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_LRCPC) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_FRINTTS) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SB) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_SPECRES) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_BF16) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_DGH) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_I8MM) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), ID_AA64ISAR1_EL1_APA_PAuth) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), ID_AA64ISAR1_EL1_API_PAuth) \ - ) - -#define PVM_ID_AA64ISAR2_ALLOW (\ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_ATS1A)| \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3) | \ - ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_MOPS) | \ - FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3), ID_AA64ISAR2_EL1_APA3_PAuth) \ - ) - -bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); -bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); -void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); -int kvm_check_pvm_sysreg_table(void); - -#endif /* __ARM64_KVM_FIXED_CONFIG_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h index 698bc20ab80b..3888aabd78f3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h @@ -72,4 +72,9 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle, unsigned int vcpu_idx); void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu); +bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code); +bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code); +void kvm_init_pvm_id_regs(struct kvm_vcpu *vcpu); +int kvm_check_pvm_sysreg_table(void); + #endif /* __ARM64_KVM_NVHE_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 23afc63cac55..df30d46b7542 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -9,7 +9,6 @@ #include -#include #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index cbdd18cd3f98..31bd729ea45c 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index cc69106734ca..7786a83d0fa8 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -26,7 +26,6 @@ #include #include -#include #include /* Non-VHE specific context */ diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 934d7c4b04b4..1ddd9ed3cbb3 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -11,7 +11,6 @@ #include -#include #include #include "../../sys_regs.h" From 092e7b2c3b1a5591bbabc358f3b709dfa2289b91 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:51 +0000 Subject: [PATCH 48/87] KVM: arm64: Remove redundant setting of HCR_EL2 trap bit In hVHE mode, HCR_E2H should be set for both protected and non-protected VMs. Since commit b56680de9c64 ("KVM: arm64: Initialize trap register values in hyp in pKVM"), this has been fixed, and the setting of the flag here is redundant. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-12-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index df30d46b7542..2beab633d721 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -57,9 +57,6 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) /* No support for AArch32. */ val |= HCR_RW; - if (has_hvhe()) - val |= HCR_E2H; - /* * Always trap: * - Feature id registers: to control features exposed to guests From 2fd5b4b0e7b440602455b79977bfa64dea101e6c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:52 +0000 Subject: [PATCH 49/87] KVM: arm64: Calculate cptr_el2 traps on activating traps Similar to VHE, calculate the value of cptr_el2 from scratch on activate traps. This removes the need to store cptr_el2 in every vcpu structure. Moreover, some traps, such as whether the guest owns the fp registers, need to be set on every vcpu run. Reported-by: James Clark Fixes: 5294afdbf45a ("KVM: arm64: Exclude FP ownership from kvm_vcpu_arch") Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-13-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 1 - arch/arm64/kvm/arm.c | 1 - arch/arm64/kvm/hyp/nvhe/pkvm.c | 42 ------------------------- arch/arm64/kvm/hyp/nvhe/switch.c | 51 +++++++++++++++++++------------ 4 files changed, 32 insertions(+), 63 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d3230a22846f..ba9536c3076d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -708,7 +708,6 @@ struct kvm_vcpu_arch { u64 hcr_el2; u64 hcrx_el2; u64 mdcr_el2; - u64 cptr_el2; /* Exception Information */ struct kvm_vcpu_fault_info fault; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b295218cdc24..8a3d02cf0a7a 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1546,7 +1546,6 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); /* * Handle the "start in power-off" case. diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 2beab633d721..4ef08fb3bca4 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -83,44 +83,6 @@ static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 = val; } -static void pvm_init_traps_cptr(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - u64 val = vcpu->arch.cptr_el2; - - if (!has_hvhe()) { - val |= CPTR_NVHE_EL2_RES1; - val &= ~(CPTR_NVHE_EL2_RES0); - } - - if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, AMU, IMP)) - val |= CPTR_EL2_TAM; - - /* SVE can be disabled by userspace even if supported. */ - if (!vcpu_has_sve(vcpu)) { - if (has_hvhe()) - val &= ~(CPACR_ELx_ZEN); - else - val |= CPTR_EL2_TZ; - } - - /* No SME support in KVM. */ - BUG_ON(kvm_has_feat(kvm, ID_AA64PFR1_EL1, SME, IMP)); - if (has_hvhe()) - val &= ~(CPACR_ELx_SMEN); - else - val |= CPTR_EL2_TSM; - - if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceVer, IMP)) { - if (has_hvhe()) - val |= CPACR_EL1_TTA; - else - val |= CPTR_EL2_TTA; - } - - vcpu->arch.cptr_el2 = val; -} - static void pvm_init_traps_mdcr(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -191,7 +153,6 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; int ret; - vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); vcpu->arch.mdcr_el2 = 0; pkvm_vcpu_reset_hcr(vcpu); @@ -204,7 +165,6 @@ static int pkvm_vcpu_init_traps(struct pkvm_hyp_vcpu *hyp_vcpu) return ret; pvm_init_traps_hcr(vcpu); - pvm_init_traps_cptr(vcpu); pvm_init_traps_mdcr(vcpu); return 0; @@ -644,8 +604,6 @@ int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu, return ret; } - hyp_vcpu->vcpu.arch.cptr_el2 = kvm_get_reset_cptr_el2(&hyp_vcpu->vcpu); - return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 7786a83d0fa8..0ebf84a9f9e2 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -35,33 +35,46 @@ DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_traps(struct kvm_vcpu *vcpu) +static void __activate_cptr_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - ___activate_traps(vcpu, vcpu->arch.hcr_el2); - __activate_traps_common(vcpu); + if (has_hvhe()) { + val |= CPACR_ELx_TTA; - val = vcpu->arch.cptr_el2; - val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; - if (cpus_have_final_cap(ARM64_SME)) { - if (has_hvhe()) - val &= ~CPACR_ELx_SMEN; - else - val |= CPTR_EL2_TSM; + if (guest_owns_fp_regs()) { + val |= CPACR_ELx_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_ELx_ZEN; + } + } else { + val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; } - if (!guest_owns_fp_regs()) { - if (has_hvhe()) - val &= ~(CPACR_ELx_FPEN | CPACR_ELx_ZEN); - else - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; - + if (!guest_owns_fp_regs()) __activate_traps_fpsimd32(vcpu); - } kvm_write_cptr_el2(val); +} + +static void __activate_traps(struct kvm_vcpu *vcpu) +{ + ___activate_traps(vcpu, vcpu->arch.hcr_el2); + __activate_traps_common(vcpu); + __activate_cptr_traps(vcpu); + write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { From 8f7df795b2da0564b22a03c4aceec90bfc5e1b1b Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:53 +0000 Subject: [PATCH 50/87] KVM: arm64: Refactor kvm_reset_cptr_el2() Fold kvm_get_reset_cptr_el2() into kvm_reset_cptr_el2(), since it is its only caller. Add a comment to clarify that this function is meant for the host value of cptr_el2. No functional change intended. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-14-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index cf811009a33c..7b3dc52248ce 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -619,7 +619,8 @@ static __always_inline void kvm_write_cptr_el2(u64 val) write_sysreg(val, cptr_el2); } -static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) +/* Resets the value of cptr_el2 when returning to the host. */ +static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) { u64 val; @@ -643,13 +644,6 @@ static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) val &= ~CPTR_EL2_TSM; } - return val; -} - -static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) -{ - u64 val = kvm_get_reset_cptr_el2(vcpu); - kvm_write_cptr_el2(val); } From 1eccad35c9268f1ad4be3d72d37167a58c0ac2db Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:54 +0000 Subject: [PATCH 51/87] KVM: arm64: Fix the value of the CPTR_EL2 RES1 bitmask for nVHE Since the introduction of SME, bit 12 in CPTR_EL2 (nVHE) is TSM for trapping SME, instead of RES1, as per ARM ARM DDI 0487K.a, section D23.2.34. Fix the value of CPTR_NVHE_EL2_RES1 to reflect that, and adjust the code that relies on it accordingly. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-15-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/include/asm/kvm_emulate.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 3e0f0de1d2da..24e4ac7c50f2 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -300,7 +300,7 @@ #define CPTR_EL2_TSM (1 << 12) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) -#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ +#define CPTR_NVHE_EL2_RES1 (BIT(13) | BIT(9) | GENMASK(7, 0)) #define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \ GENMASK(29, 21) | \ GENMASK(19, 14) | \ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 7b3dc52248ce..6602a4c091ac 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -640,8 +640,8 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) if (vcpu_has_sve(vcpu) && guest_owns_fp_regs()) val |= CPTR_EL2_TZ; - if (cpus_have_final_cap(ARM64_SME)) - val &= ~CPTR_EL2_TSM; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; } kvm_write_cptr_el2(val); From c5c1763596660fcd77a1190b3bd78bbe24bcfd6a Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:55 +0000 Subject: [PATCH 52/87] KVM: arm64: Remove PtrAuth guest vcpu flag The vcpu flag GUEST_HAS_PTRAUTH is always associated with the vcpu PtrAuth features, which are defined per vm rather than per vcpu. Remove the flag, and replace it with checks for the features instead. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-16-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 5 ----- arch/arm64/include/asm/kvm_host.h | 7 +++---- arch/arm64/kvm/hyp/nvhe/pkvm.c | 13 ------------- arch/arm64/kvm/reset.c | 4 ---- 4 files changed, 3 insertions(+), 26 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 6602a4c091ac..406e99a452bf 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -691,9 +691,4 @@ static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu) { return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN); } - -static inline void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); -} #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index ba9536c3076d..86fb40d35051 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -866,10 +866,8 @@ struct kvm_vcpu_arch { #define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) /* SVE config completed */ #define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) -/* PTRAUTH exposed to guest */ -#define GUEST_HAS_PTRAUTH __vcpu_single_flag(cflags, BIT(2)) /* KVM_ARM_VCPU_INIT completed */ -#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(3)) +#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(2)) /* Exception pending */ #define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) @@ -965,7 +963,8 @@ struct kvm_vcpu_arch { #define vcpu_has_ptrauth(vcpu) \ ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) || \ cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) && \ - vcpu_get_flag(vcpu, GUEST_HAS_PTRAUTH)) + (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || \ + vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC))) #else #define vcpu_has_ptrauth(vcpu) false #endif diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 4ef08fb3bca4..5ad271d91cac 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -278,18 +278,6 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc allowed_features, KVM_VCPU_MAX_FEATURES); } -static void pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) -{ - struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; - - if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || - vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) { - kvm_vcpu_enable_ptrauth(vcpu); - } else { - vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); - } -} - static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -359,7 +347,6 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, goto done; pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); - pkvm_vcpu_init_ptrauth(hyp_vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 470524b31951..1cfab6a5d8a5 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -211,10 +211,6 @@ void kvm_reset_vcpu(struct kvm_vcpu *vcpu) kvm_vcpu_reset_sve(vcpu); } - if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || - vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) - kvm_vcpu_enable_ptrauth(vcpu); - if (vcpu_el1_is_32bit(vcpu)) pstate = VCPU_RESET_PSTATE_SVC; else if (vcpu_has_nv(vcpu)) From 41d6028e28bd474298ff10409c292ec46cf43a90 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:56 +0000 Subject: [PATCH 53/87] KVM: arm64: Convert the SVE guest vcpu flag to a vm flag The vcpu flag GUEST_HAS_SVE is per-vcpu, but it is based on what is now a per-vm feature. Make the flag per-vm. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-17-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 12 +++++++++--- arch/arm64/include/asm/kvm_host.h | 18 ++++++++++++------ arch/arm64/kvm/hyp/nvhe/pkvm.c | 11 +++++++---- arch/arm64/kvm/reset.c | 2 +- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 406e99a452bf..2d91fb88298a 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -620,7 +620,7 @@ static __always_inline void kvm_write_cptr_el2(u64 val) } /* Resets the value of cptr_el2 when returning to the host. */ -static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) +static __always_inline void __kvm_reset_cptr_el2(struct kvm *kvm) { u64 val; @@ -631,14 +631,14 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) } else if (has_hvhe()) { val = CPACR_ELx_FPEN; - if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + if (!kvm_has_sve(kvm) || !guest_owns_fp_regs()) val |= CPACR_ELx_ZEN; if (cpus_have_final_cap(ARM64_SME)) val |= CPACR_ELx_SMEN; } else { val = CPTR_NVHE_EL2_RES1; - if (vcpu_has_sve(vcpu) && guest_owns_fp_regs()) + if (kvm_has_sve(kvm) && guest_owns_fp_regs()) val |= CPTR_EL2_TZ; if (!cpus_have_final_cap(ARM64_SME)) val |= CPTR_EL2_TSM; @@ -647,6 +647,12 @@ static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) kvm_write_cptr_el2(val); } +#ifdef __KVM_NVHE_HYPERVISOR__ +#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2(kern_hyp_va((v)->kvm)) +#else +#define kvm_reset_cptr_el2(v) __kvm_reset_cptr_el2((v)->kvm) +#endif + /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 86fb40d35051..7ba742b9067e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -331,6 +331,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 7 /* Fine-Grained UNDEF initialised */ #define KVM_ARCH_FLAG_FGU_INITIALIZED 8 + /* SVE exposed to guest */ +#define KVM_ARCH_FLAG_GUEST_HAS_SVE 9 unsigned long flags; /* VM-wide vCPU feature set */ @@ -862,12 +864,10 @@ struct kvm_vcpu_arch { #define vcpu_set_flag(v, ...) __vcpu_set_flag((v), __VA_ARGS__) #define vcpu_clear_flag(v, ...) __vcpu_clear_flag((v), __VA_ARGS__) -/* SVE exposed to guest */ -#define GUEST_HAS_SVE __vcpu_single_flag(cflags, BIT(0)) +/* KVM_ARM_VCPU_INIT completed */ +#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(0)) /* SVE config completed */ #define VCPU_SVE_FINALIZED __vcpu_single_flag(cflags, BIT(1)) -/* KVM_ARM_VCPU_INIT completed */ -#define VCPU_INITIALIZED __vcpu_single_flag(cflags, BIT(2)) /* Exception pending */ #define PENDING_EXCEPTION __vcpu_single_flag(iflags, BIT(0)) @@ -956,8 +956,14 @@ struct kvm_vcpu_arch { KVM_GUESTDBG_USE_HW | \ KVM_GUESTDBG_SINGLESTEP) -#define vcpu_has_sve(vcpu) (system_supports_sve() && \ - vcpu_get_flag(vcpu, GUEST_HAS_SVE)) +#define kvm_has_sve(kvm) (system_supports_sve() && \ + test_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &(kvm)->arch.flags)) + +#ifdef __KVM_NVHE_HYPERVISOR__ +#define vcpu_has_sve(vcpu) kvm_has_sve(kern_hyp_va((vcpu)->kvm)) +#else +#define vcpu_has_sve(vcpu) kvm_has_sve((vcpu)->kvm) +#endif #ifdef CONFIG_ARM64_PTR_AUTH #define vcpu_has_ptrauth(vcpu) \ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 5ad271d91cac..4b2622f8e250 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -248,10 +248,13 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) { struct kvm *kvm = &hyp_vm->kvm; + unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); /* No restrictions for non-protected VMs. */ if (!kvm_vm_is_protected(kvm)) { + hyp_vm->kvm.arch.flags = host_arch_flags; + bitmap_copy(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); @@ -271,8 +274,10 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc if (kvm_pvm_ext_allowed(KVM_CAP_ARM_PTRAUTH_GENERIC)) set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); - if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) + if (kvm_pvm_ext_allowed(KVM_CAP_ARM_SVE)) { set_bit(KVM_ARM_VCPU_SVE, allowed_features); + kvm->arch.flags |= host_arch_flags & BIT(KVM_ARCH_FLAG_GUEST_HAS_SVE); + } bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, allowed_features, KVM_VCPU_MAX_FEATURES); @@ -308,10 +313,8 @@ static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu * { struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; - if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { - vcpu_clear_flag(vcpu, GUEST_HAS_SVE); + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); - } } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 1cfab6a5d8a5..803e11b0dc8f 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -85,7 +85,7 @@ static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu) * KVM_REG_ARM64_SVE_VLS. Allocation is deferred until * kvm_arm_vcpu_finalize(), which freezes the configuration. */ - vcpu_set_flag(vcpu, GUEST_HAS_SVE); + set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags); } /* From aac64ad36955268d65375c32415d5bcf1bd1dd47 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Mon, 16 Dec 2024 10:50:57 +0000 Subject: [PATCH 54/87] KVM: arm64: Use kvm_vcpu_has_feature() directly for struct kvm Now that we have introduced kvm_vcpu_has_feature(), use it in the remaining code that checks for features in struct kvm, instead of using the __vcpu_has_feature() helper. No functional change intended. Suggested-by: Quentin Perret Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241216105057.579031-18-tabba@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9b36218b48de..4cada42788f7 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1021,8 +1021,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= HCR_NV2; if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, IMP)) res0 |= (HCR_AT | HCR_NV1 | HCR_NV); - if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HCR_API | HCR_APK); if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TME, IMP)) res0 |= BIT(39); @@ -1078,8 +1078,8 @@ int kvm_init_nv_sysregs(struct kvm *kvm) /* HFG[RW]TR_EL2 */ res0 = res1 = 0; - if (!(__vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && - __vcpu_has_feature(&kvm->arch, KVM_ARM_VCPU_PTRAUTH_GENERIC))) + if (!(kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_ADDRESS) && + kvm_vcpu_has_feature(kvm, KVM_ARM_VCPU_PTRAUTH_GENERIC))) res0 |= (HFGxTR_EL2_APDAKey | HFGxTR_EL2_APDBKey | HFGxTR_EL2_APGAKey | HFGxTR_EL2_APIAKey | HFGxTR_EL2_APIBKey); From e891432cf7171ab2054222e2ce4c94f8080b92e7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 21 Dec 2024 10:06:17 +0000 Subject: [PATCH 55/87] KVM: arm64: nv: Advertise the lack of AArch32 EL0 support Although we never supported 32bit anywhere in NV, we fail to advertise so for EL0, probably owing to the relative lack of hardware supporting both NV2 and 32bit EL0. Add some sanitising to ID_AA64PFR0_EL1.EL0, and reaffirm that "in 64bit-only we trust". Reported-by: Oliver Upton Acked-by: Oliver Upton Signed-off-by: Marc Zyngier --- arch/arm64/kvm/nested.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9b36218b48de..9e74f7b38e05 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -830,8 +830,10 @@ static void limit_nv_id_regs(struct kvm *kvm) NV_FTR(PFR0, RAS) | NV_FTR(PFR0, EL3) | NV_FTR(PFR0, EL2) | - NV_FTR(PFR0, EL1)); - /* 64bit EL1/EL2/EL3 only */ + NV_FTR(PFR0, EL1) | + NV_FTR(PFR0, EL0)); + /* 64bit only at any EL */ + val |= FIELD_PREP(NV_FTR(PFR0, EL0), 0b0001); val |= FIELD_PREP(NV_FTR(PFR0, EL1), 0b0001); val |= FIELD_PREP(NV_FTR(PFR0, EL2), 0b0001); val |= FIELD_PREP(NV_FTR(PFR0, EL3), 0b0001); From b59dbb91f7636a89b54ab8fff756afe320ba6549 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:09 +0000 Subject: [PATCH 56/87] KVM: arm64: nv: Add handling of EL2-specific timer registers Add the required handling for EL2 and EL02 registers, as well as EL1 registers used in the E2H context. This includes handling the virtual timer accesses when CNTHCTL_EL2.EL1TVT or CNTHCTL_EL2.EL1TVCT are set. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 4 + arch/arm64/kvm/sys_regs.c | 143 +++++++++++++++++++++++++++ include/clocksource/arm_arch_timer.h | 2 + 3 files changed, 149 insertions(+) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b8303a83c0bf..2ed33737c7a9 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -477,6 +477,7 @@ #define SYS_CNTFRQ_EL0 sys_reg(3, 3, 14, 0, 0) #define SYS_CNTPCT_EL0 sys_reg(3, 3, 14, 0, 1) +#define SYS_CNTVCT_EL0 sys_reg(3, 3, 14, 0, 2) #define SYS_CNTPCTSS_EL0 sys_reg(3, 3, 14, 0, 5) #define SYS_CNTVCTSS_EL0 sys_reg(3, 3, 14, 0, 6) @@ -484,14 +485,17 @@ #define SYS_CNTP_CTL_EL0 sys_reg(3, 3, 14, 2, 1) #define SYS_CNTP_CVAL_EL0 sys_reg(3, 3, 14, 2, 2) +#define SYS_CNTV_TVAL_EL0 sys_reg(3, 3, 14, 3, 0) #define SYS_CNTV_CTL_EL0 sys_reg(3, 3, 14, 3, 1) #define SYS_CNTV_CVAL_EL0 sys_reg(3, 3, 14, 3, 2) #define SYS_AARCH32_CNTP_TVAL sys_reg(0, 0, 14, 2, 0) #define SYS_AARCH32_CNTP_CTL sys_reg(0, 0, 14, 2, 1) #define SYS_AARCH32_CNTPCT sys_reg(0, 0, 0, 14, 0) +#define SYS_AARCH32_CNTVCT sys_reg(0, 1, 0, 14, 0) #define SYS_AARCH32_CNTP_CVAL sys_reg(0, 2, 0, 14, 0) #define SYS_AARCH32_CNTPCTSS sys_reg(0, 8, 0, 14, 0) +#define SYS_AARCH32_CNTVCTSS sys_reg(0, 9, 0, 14, 0) #define __PMEV_op2(n) ((n) & 0x7) #define __CNTR_CRm(n) (0x8 | (((n) >> 3) & 0x3)) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 83c6b4a07ef5..986e63d4f9fa 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1412,26 +1412,146 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, switch (reg) { case SYS_CNTP_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTV_TVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_AARCH32_CNTP_TVAL: + case SYS_CNTP_TVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_TVAL; break; + + case SYS_CNTV_TVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHP_TVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_TVAL; + break; + + case SYS_CNTHV_TVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_TVAL; + break; + case SYS_CNTP_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTV_CTL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_AARCH32_CNTP_CTL: + case SYS_CNTP_CTL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CTL; break; + + case SYS_CNTV_CTL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHP_CTL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CTL; + break; + + case SYS_CNTHV_CTL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CTL; + break; + case SYS_CNTP_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTV_CVAL_EL0: + if (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_AARCH32_CNTP_CVAL: + case SYS_CNTP_CVAL_EL02: tmr = TIMER_PTIMER; treg = TIMER_REG_CVAL; break; + + case SYS_CNTV_CVAL_EL02: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHP_CVAL_EL2: + tmr = TIMER_HPTIMER; + treg = TIMER_REG_CVAL; + break; + + case SYS_CNTHV_CVAL_EL2: + tmr = TIMER_HVTIMER; + treg = TIMER_REG_CVAL; + break; + case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HPTIMER; + else + tmr = TIMER_PTIMER; + treg = TIMER_REG_CNT; + break; + case SYS_AARCH32_CNTPCT: + case SYS_AARCH32_CNTPCTSS: tmr = TIMER_PTIMER; treg = TIMER_REG_CNT; break; + + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (is_hyp_ctxt(vcpu)) + tmr = TIMER_HVTIMER; + else + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + + case SYS_AARCH32_CNTVCT: + case SYS_AARCH32_CNTVCTSS: + tmr = TIMER_VTIMER; + treg = TIMER_REG_CNT; + break; + default: print_sys_reg_msg(p, "%s", "Unhandled trapped timer register"); return undef_access(vcpu, p, r); @@ -2901,11 +3021,17 @@ static const struct sys_reg_desc sys_reg_descs[] = { AMU_AMEVTYPER1_EL0(15), { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer }, { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer }, + /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), PMU_PMEVCNTR_EL0(1), @@ -3057,9 +3183,24 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0), EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0), + { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer }, + EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0), + EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0), + + { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_arch_timer }, + EL2_REG(CNTHV_CTL_EL2, access_arch_timer, reset_val, 0), + EL2_REG(CNTHV_CVAL_EL2, access_arch_timer, reset_val, 0), { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 }, + { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer }, + + { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer }, + { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer }, + EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; @@ -3879,9 +4020,11 @@ static const struct sys_reg_desc cp15_64_regs[] = { { SYS_DESC(SYS_AARCH32_CNTPCT), access_arch_timer }, { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 }, { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */ + { SYS_DESC(SYS_AARCH32_CNTVCT), access_arch_timer }, { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */ { SYS_DESC(SYS_AARCH32_CNTP_CVAL), access_arch_timer }, { SYS_DESC(SYS_AARCH32_CNTPCTSS), access_arch_timer }, + { SYS_DESC(SYS_AARCH32_CNTVCTSS), access_arch_timer }, }; static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index cbbc9a6dc571..877dcbb2601a 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -22,6 +22,8 @@ #define CNTHCTL_EVNTDIR (1 << 3) #define CNTHCTL_EVNTI (0xF << 4) #define CNTHCTL_ECV (1 << 12) +#define CNTHCTL_EL1TVT (1 << 13) +#define CNTHCTL_EL1TVCT (1 << 14) enum arch_timer_reg { ARCH_TIMER_REG_CTRL, From 4bad3068cfa9fc38dd767441871e0edab821105b Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:10 +0000 Subject: [PATCH 57/87] KVM: arm64: nv: Sync nested timer state with FEAT_NV2 Emulating the timers with FEAT_NV2 is a bit odd, as the timers can be reconfigured behind our back without the hypervisor even noticing. In the VHE case, that's an actual regression in the architecture... Co-developed-by: Christoffer Dall Signed-off-by: Christoffer Dall Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 44 ++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/arm.c | 3 +++ include/kvm/arm_arch_timer.h | 1 + 3 files changed, 48 insertions(+) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 1215df590418..ee5f732fbbec 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -905,6 +905,50 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) kvm_timer_blocking(vcpu); } +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) +{ + /* + * When NV2 is on, guest hypervisors have their EL1 timer register + * accesses redirected to the VNCR page. Any guest action taken on + * the timer is postponed until the next exit, leading to a very + * poor quality of emulation. + */ + if (!is_hyp_ctxt(vcpu)) + return; + + if (!vcpu_el2_e2h_is_set(vcpu)) { + /* + * A non-VHE guest hypervisor doesn't have any direct access + * to its timers: the EL2 registers trap (and the HW is + * fully emulated), while the EL0 registers access memory + * despite the access being notionally direct. Boo. + * + * We update the hardware timer registers with the + * latest value written by the guest to the VNCR page + * and let the hardware take care of the rest. + */ + write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CTL_EL0), SYS_CNTV_CTL); + write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0), SYS_CNTV_CVAL); + write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CTL_EL0), SYS_CNTP_CTL); + write_sysreg_el0(__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0), SYS_CNTP_CVAL); + } else { + /* + * For a VHE guest hypervisor, the EL2 state is directly + * stored in the host EL1 timers, while the emulated EL0 + * state is stored in the VNCR page. The latter could have + * been updated behind our back, and we must reset the + * emulation of the timers. + */ + struct timer_map map; + get_timer_map(vcpu, &map); + + soft_timer_cancel(&map.emul_vtimer->hrtimer); + soft_timer_cancel(&map.emul_ptimer->hrtimer); + timer_emulate(map.emul_vtimer); + timer_emulate(map.emul_ptimer); + } +} + /* * With a userspace irqchip we have to check if the guest de-asserted the * timer and if so, unmask the timer irq signal on the host interrupt diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a102c3aebdbc..fa3089822f9f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1228,6 +1228,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); + if (vcpu_has_nv(vcpu)) + kvm_timer_sync_nested(vcpu); + kvm_arch_vcpu_ctxsync_fp(vcpu); /* diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index fd650a8789b9..6e3f6b7ff2b2 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -98,6 +98,7 @@ int __init kvm_timer_hyp_init(bool has_gic); int kvm_timer_enable(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); +void kvm_timer_sync_nested(struct kvm_vcpu *vcpu); void kvm_timer_sync_user(struct kvm_vcpu *vcpu); bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); void kvm_timer_update_run(struct kvm_vcpu *vcpu); From cc45963cbf6334d2b9078f06efef9864639cddd0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:11 +0000 Subject: [PATCH 58/87] KVM: arm64: nv: Publish emulated timer interrupt state in the in-memory state With FEAT_NV2, the EL0 timer state is entirely stored in memory, meaning that the hypervisor can only provide a very poor emulation. The only thing we can really do is to publish the interrupt state in the guest view of CNT{P,V}_CTL_EL0, and defer everything else to the next exit. Only FEAT_ECV will allow us to fix it, at the cost of extra trapping. Suggested-by: Chase Conklin Suggested-by: Ganapatrao Kulkarni Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 21 +++++++++++++++++++++ arch/arm64/kvm/arm.c | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index ee5f732fbbec..8bff913ed126 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -441,11 +441,30 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } +static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level) +{ + /* + * Paper over NV2 brokenness by publishing the interrupt status + * bit. This still results in a poor quality of emulation (guest + * writes will have no effect until the next exit). + * + * But hey, it's fast, right? + */ + if (is_hyp_ctxt(ctx->vcpu) && + (ctx == vcpu_vtimer(ctx->vcpu) || ctx == vcpu_ptimer(ctx->vcpu))) { + unsigned long val = timer_get_ctl(ctx); + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &val, level); + timer_set_ctl(ctx, val); + } +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { int ret; + kvm_timer_update_status(timer_ctx, new_level); + timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx), timer_ctx->irq.level); @@ -471,6 +490,8 @@ static void timer_emulate(struct arch_timer_context *ctx) return; } + kvm_timer_update_status(ctx, should_fire); + /* * If the timer can fire now, we don't need to have a soft timer * scheduled for the future. If the timer cannot fire at all, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index fa3089822f9f..bda905022df4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1228,7 +1228,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (unlikely(!irqchip_in_kernel(vcpu->kvm))) kvm_timer_sync_user(vcpu); - if (vcpu_has_nv(vcpu)) + if (is_hyp_ctxt(vcpu)) kvm_timer_sync_nested(vcpu); kvm_arch_vcpu_ctxsync_fp(vcpu); From 2cd2a77f9c32f1eaf599fb72cbcd0394938a8b58 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:12 +0000 Subject: [PATCH 59/87] KVM: arm64: nv: Use FEAT_ECV to trap access to EL0 timers Although FEAT_NV2 makes most things fast, it also makes it impossible to correctly emulate the timers, as the sysreg accesses are redirected to memory. FEAT_ECV addresses this by giving a hypervisor the ability to trap the EL02 sysregs as well as the virtual timer. Add the required trap setting to make use of the feature, allowing us to elide the ugly resync in the middle of the run loop. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 36 +++++++++++++++++++++++++--- include/clocksource/arm_arch_timer.h | 2 ++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 8bff913ed126..b6a06bda9e53 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -783,7 +783,7 @@ static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu, static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) { - bool tpt, tpc; + bool tvt, tpt, tvc, tpc, tvt02, tpt02; u64 clr, set; /* @@ -798,7 +798,29 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) * within this function, reality kicks in and we start adding * traps based on emulation requirements. */ - tpt = tpc = false; + tvt = tpt = tvc = tpc = false; + tvt02 = tpt02 = false; + + /* + * NV2 badly breaks the timer semantics by redirecting accesses to + * the EL1 timer state to memory, so let's call ECV to the rescue if + * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses. + * + * The treatment slightly varies depending whether we run a nVHE or + * VHE guest: nVHE will use the _EL0 registers directly, while VHE + * will use the _EL02 accessors. This translates in different trap + * bits. + * + * None of the trapping is required when running in non-HYP context, + * unless required by the L1 hypervisor settings once we advertise + * ECV+NV in the guest, or that we need trapping for other reasons. + */ + if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) { + if (vcpu_el2_e2h_is_set(vcpu)) + tvt02 = tpt02 = true; + else + tvt = tpt = true; + } /* * We have two possibility to deal with a physical offset: @@ -838,6 +860,10 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr); assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr); + assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set); + assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set); + assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set); + assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set); /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */ sysreg_clear_set(cnthctl_el2, clr, set); @@ -933,8 +959,12 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) * accesses redirected to the VNCR page. Any guest action taken on * the timer is postponed until the next exit, leading to a very * poor quality of emulation. + * + * This is an unmitigated disaster, only papered over by FEAT_ECV, + * which allows trapping of the timer registers even with NV2. + * Still, this is still worse than FEAT_NV on its own. Meh. */ - if (!is_hyp_ctxt(vcpu)) + if (cpus_have_final_cap(ARM64_HAS_ECV) || !is_hyp_ctxt(vcpu)) return; if (!vcpu_el2_e2h_is_set(vcpu)) { diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index 877dcbb2601a..c62811fb4130 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -24,6 +24,8 @@ #define CNTHCTL_ECV (1 << 12) #define CNTHCTL_EL1TVT (1 << 13) #define CNTHCTL_EL1TVCT (1 << 14) +#define CNTHCTL_EL1NVPCT (1 << 15) +#define CNTHCTL_EL1NVVCT (1 << 16) enum arch_timer_reg { ARCH_TIMER_REG_CTRL, From 338f8ea51944d02ea29eadb3d5fa9196e74a100d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:13 +0000 Subject: [PATCH 60/87] KVM: arm64: nv: Accelerate EL0 timer read accesses when FEAT_ECV in use Although FEAT_ECV allows us to correctly emulate the timers, it also reduces performances pretty badly. Mitigate this by emulating the CTL/CVAL register reads in the inner run loop, without returning to the general kernel. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-6-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 21 +----- arch/arm64/kvm/hyp/include/hyp/switch.h | 5 ++ arch/arm64/kvm/hyp/vhe/switch.c | 99 +++++++++++++++++++++++++ include/kvm/arm_arch_timer.h | 15 ++++ 4 files changed, 122 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index b6a06bda9e53..6f04f31c0a7f 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -101,21 +101,6 @@ u64 timer_get_cval(struct arch_timer_context *ctxt) } } -static u64 timer_get_offset(struct arch_timer_context *ctxt) -{ - u64 offset = 0; - - if (!ctxt) - return 0; - - if (ctxt->offset.vm_offset) - offset += *ctxt->offset.vm_offset; - if (ctxt->offset.vcpu_offset) - offset += *ctxt->offset.vcpu_offset; - - return offset; -} - static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) { struct kvm_vcpu *vcpu = ctxt->vcpu; @@ -964,10 +949,10 @@ void kvm_timer_sync_nested(struct kvm_vcpu *vcpu) * which allows trapping of the timer registers even with NV2. * Still, this is still worse than FEAT_NV on its own. Meh. */ - if (cpus_have_final_cap(ARM64_HAS_ECV) || !is_hyp_ctxt(vcpu)) - return; - if (!vcpu_el2_e2h_is_set(vcpu)) { + if (cpus_have_final_cap(ARM64_HAS_ECV)) + return; + /* * A non-VHE guest hypervisor doesn't have any direct access * to its timers: the EL2 registers trap (and the HW is diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 34f53707892d..30e572de2874 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -501,6 +501,11 @@ static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu) return true; } +static inline u64 compute_counter_value(struct arch_timer_context *ctxt) +{ + return arch_timer_read_cntpct_el0() - timer_get_offset(ctxt); +} + static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) { struct arch_timer_context *ctxt; diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 80581b1c3995..51119d58ecff 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -256,6 +256,102 @@ void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu) host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL; } +static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu, + enum vcpu_sysreg reg) +{ + unsigned long ctl; + u64 cval, cnt; + bool stat; + + switch (reg) { + case CNTP_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0); + cnt = compute_counter_value(vcpu_ptimer(vcpu)); + break; + case CNTV_CTL_EL0: + cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0); + cnt = compute_counter_value(vcpu_vtimer(vcpu)); + break; + default: + BUG(); + } + + stat = cval <= cnt; + __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, stat); + + return ctl; +} + +static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + u64 esr, val; + + /* + * Having FEAT_ECV allows for a better quality of timer emulation. + * However, this comes at a huge cost in terms of traps. Try and + * satisfy the reads from guest's hypervisor context without + * returning to the kernel if we can. + */ + if (!is_hyp_ctxt(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + if ((esr & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ) + return false; + + switch (esr_sys64_to_sysreg(esr)) { + case SYS_CNTP_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTP_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0); + break; + case SYS_CNTP_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + break; + case SYS_CNTP_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) { + val = read_sysreg_el0(SYS_CNTP_CVAL); + + if (!has_cntpoff()) + val -= timer_get_offset(vcpu_hptimer(vcpu)); + } else { + val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); + } + break; + case SYS_CNTV_CTL_EL02: + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CTL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CTL); + else + val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); + break; + case SYS_CNTV_CVAL_EL02: + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + case SYS_CNTV_CVAL_EL0: + if (vcpu_el2_e2h_is_set(vcpu)) + val = read_sysreg_el0(SYS_CNTV_CVAL); + else + val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); + break; + default: + return false; + } + + vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); + __kvm_skip_instr(vcpu); + + return true; +} + static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code) { u64 esr = kvm_vcpu_get_esr(vcpu); @@ -409,6 +505,9 @@ static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code) if (kvm_hyp_handle_tlbi_el2(vcpu, exit_code)) return true; + if (kvm_hyp_handle_timer(vcpu, exit_code)) + return true; + if (kvm_hyp_handle_cpacr_el1(vcpu, exit_code)) return true; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index 6e3f6b7ff2b2..c1ba31fab6f5 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -156,4 +156,19 @@ static inline bool has_cntpoff(void) return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); } +static inline u64 timer_get_offset(struct arch_timer_context *ctxt) +{ + u64 offset = 0; + + if (!ctxt) + return 0; + + if (ctxt->offset.vm_offset) + offset += *ctxt->offset.vm_offset; + if (ctxt->offset.vcpu_offset) + offset += *ctxt->offset.vcpu_offset; + + return offset; +} + #endif From 9b3b2f00291e1abd54bff345761a7fadd8df4daa Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:14 +0000 Subject: [PATCH 61/87] KVM: arm64: nv: Accelerate EL0 counter accesses from hypervisor context Similarly to handling the physical timer accesses early when FEAT_ECV causes a trap, we try to handle the physical counter without returning to the general sysreg handling. More surprisingly, we introduce something similar for the virtual counter. Although this isn't necessary yet, it will prove useful on systems that have a broken CNTVOFF_EL2 implementation. Yes, they exist. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-7-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 51119d58ecff..ef344bcff09a 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -324,6 +324,10 @@ static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0); } break; + case SYS_CNTPCT_EL0: + case SYS_CNTPCTSS_EL0: + val = compute_counter_value(vcpu_hptimer(vcpu)); + break; case SYS_CNTV_CTL_EL02: val = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0); break; @@ -342,6 +346,10 @@ static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code) else val = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0); break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + val = compute_counter_value(vcpu_hvtimer(vcpu)); + break; default: return false; } From b86fc215dc26d8e1bb274f0a7990b5deab740ac8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:15 +0000 Subject: [PATCH 62/87] KVM: arm64: Handle counter access early in non-HYP context We already deal with CNTPCT_EL0 accesses in non-HYP context. Let's add CNTVCT_EL0 as a good measure. This is also an opportunity to simplify things and make it plain that this code is only for non-HYP context handling. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-8-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 34 +++++++++++++++---------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 30e572de2874..719479b42b32 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -506,7 +506,7 @@ static inline u64 compute_counter_value(struct arch_timer_context *ctxt) return arch_timer_read_cntpct_el0() - timer_get_offset(ctxt); } -static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) +static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu) { struct arch_timer_context *ctxt; u32 sysreg; @@ -516,18 +516,19 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) * We only get here for 64bit guests, 32bit guests will hit * the long and winding road all the way to the standard * handling. Yes, it sucks to be irrelevant. + * + * Also, we only deal with non-hypervisor context here (either + * an EL1 guest, or a non-HYP context of an EL2 guest). */ + if (is_hyp_ctxt(vcpu)) + return false; + sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); switch (sysreg) { case SYS_CNTPCT_EL0: case SYS_CNTPCTSS_EL0: if (vcpu_has_nv(vcpu)) { - if (is_hyp_ctxt(vcpu)) { - ctxt = vcpu_hptimer(vcpu); - break; - } - /* Check for guest hypervisor trapping */ val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); if (!vcpu_el2_e2h_is_set(vcpu)) @@ -539,16 +540,23 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) ctxt = vcpu_ptimer(vcpu); break; + case SYS_CNTVCT_EL0: + case SYS_CNTVCTSS_EL0: + if (vcpu_has_nv(vcpu)) { + /* Check for guest hypervisor trapping */ + val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); + + if (val & CNTHCTL_EL1TVCT) + return false; + } + + ctxt = vcpu_vtimer(vcpu); + break; default: return false; } - val = arch_timer_read_cntpct_el0(); - - if (ctxt->offset.vm_offset) - val -= *kern_hyp_va(ctxt->offset.vm_offset); - if (ctxt->offset.vcpu_offset) - val -= *kern_hyp_va(ctxt->offset.vcpu_offset); + val = compute_counter_value(ctxt); vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), val); __kvm_skip_instr(vcpu); @@ -593,7 +601,7 @@ static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; - if (kvm_hyp_handle_cntpct(vcpu)) + if (kvm_handle_cntxct(vcpu)) return true; return false; From c271269e3570766724820bcb76a144125dead272 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:16 +0000 Subject: [PATCH 63/87] KVM: arm64: nv: Add trap routing for CNTHCTL_EL2.EL1{NVPCT,NVVCT,TVT,TVCT} For completeness, fun, and cerebral meltdown, add the virtualisation related traps to the counter and timers. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-9-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/emulate-nested.c | 58 +++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c index 1ffbfd1c3cf2..03d21044c41f 100644 --- a/arch/arm64/kvm/emulate-nested.c +++ b/arch/arm64/kvm/emulate-nested.c @@ -89,6 +89,9 @@ enum cgt_group_id { CGT_HCRX_EnFPM, CGT_HCRX_TCR2En, + CGT_CNTHCTL_EL1TVT, + CGT_CNTHCTL_EL1TVCT, + CGT_ICH_HCR_TC, CGT_ICH_HCR_TALL0, CGT_ICH_HCR_TALL1, @@ -124,6 +127,8 @@ enum cgt_group_id { __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__, CGT_CNTHCTL_EL1PTEN, + CGT_CNTHCTL_EL1NVPCT, + CGT_CNTHCTL_EL1NVVCT, CGT_CPTR_TTA, CGT_MDCR_HPMN, @@ -393,6 +398,18 @@ static const struct trap_bits coarse_trap_bits[] = { .mask = HCRX_EL2_TCR2En, .behaviour = BEHAVE_FORWARD_RW, }, + [CGT_CNTHCTL_EL1TVT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVT, + .mask = CNTHCTL_EL1TVT, + .behaviour = BEHAVE_FORWARD_RW, + }, + [CGT_CNTHCTL_EL1TVCT] = { + .index = CNTHCTL_EL2, + .value = CNTHCTL_EL1TVCT, + .mask = CNTHCTL_EL1TVCT, + .behaviour = BEHAVE_FORWARD_READ, + }, [CGT_ICH_HCR_TC] = { .index = ICH_HCR_EL2, .value = ICH_HCR_TC, @@ -487,6 +504,32 @@ static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu) return BEHAVE_FORWARD_RW; } +static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu) +{ + u64 val; + + val = __vcpu_sys_reg(vcpu, HCR_EL2); + return ((val & (HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV)) == (HCR_E2H | HCR_NV2 | HCR_NV)); +} + +static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + +static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu) +{ + if (!is_nested_nv2_guest(vcpu) || + !(__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT)) + return BEHAVE_HANDLE_LOCALLY; + + return BEHAVE_FORWARD_RW; +} + static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu) { u64 val = __vcpu_sys_reg(vcpu, CPTR_EL2); @@ -534,6 +577,8 @@ static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu) static const complex_condition_check ccc[] = { CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten), CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten), + CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct), + CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct), CCC(CGT_CPTR_TTA, check_cptr_tta), CCC(CGT_MDCR_HPMN, check_mdcr_hpmn), }; @@ -850,11 +895,15 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SYS_CNTHP_CVAL_EL2, CGT_HCR_NV), SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2, SYS_CNTHV_CVAL_EL2, CGT_HCR_NV), - /* All _EL02, _EL12 registers */ + /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/ SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0), sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV), SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0), - sys_reg(3, 5, 14, 15, 7), CGT_HCR_NV), + sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV), + SR_TRAP(SYS_CNTP_CTL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTP_CVAL_EL02, CGT_CNTHCTL_EL1NVPCT), + SR_TRAP(SYS_CNTV_CTL_EL02, CGT_CNTHCTL_EL1NVVCT), + SR_TRAP(SYS_CNTV_CVAL_EL02, CGT_CNTHCTL_EL1NVVCT), SR_TRAP(OP_AT_S1E2R, CGT_HCR_NV), SR_TRAP(OP_AT_S1E2W, CGT_HCR_NV), SR_TRAP(OP_AT_S12E1R, CGT_HCR_NV), @@ -1184,6 +1233,11 @@ static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { SR_TRAP(SYS_CNTP_CTL_EL0, CGT_CNTHCTL_EL1PTEN), SR_TRAP(SYS_CNTPCT_EL0, CGT_CNTHCTL_EL1PCTEN), SR_TRAP(SYS_CNTPCTSS_EL0, CGT_CNTHCTL_EL1PCTEN), + SR_TRAP(SYS_CNTV_TVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CVAL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTV_CTL_EL0, CGT_CNTHCTL_EL1TVT), + SR_TRAP(SYS_CNTVCT_EL0, CGT_CNTHCTL_EL1TVCT), + SR_TRAP(SYS_CNTVCTSS_EL0, CGT_CNTHCTL_EL1TVCT), SR_TRAP(SYS_FPMR, CGT_HCRX_EnFPM), /* * IMPDEF choice: From 479428cc3dc99bbe28954b62b053b22accbfd1fd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:17 +0000 Subject: [PATCH 64/87] KVM: arm64: nv: Propagate CNTHCTL_EL2.EL1NV{P,V}CT bits Allow a guest hypervisor to trap accesses to CNT{P,V}CT_EL02 by propagating these trap bits to the host trap configuration. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-10-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 6f04f31c0a7f..e5951e6eaf23 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -824,6 +824,10 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) * Apply the enable bits that the guest hypervisor has requested for * its own guest. We can only add traps that wouldn't have been set * above. + * Implementation choices: we do not support NV when E2H=0 in the + * guest, and we don't support configuration where E2H is writable + * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but + * not both). This simplifies the handling of the EL1NV* bits. */ if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) { u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2); @@ -834,6 +838,9 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) tpt |= !(val & (CNTHCTL_EL1PCEN << 10)); tpc |= !(val & (CNTHCTL_EL1PCTEN << 10)); + + tpt02 |= (val & CNTHCTL_EL1NVPCT); + tvt02 |= (val & CNTHCTL_EL1NVVCT); } /* From d1e37a50e1d781201768c89314532f6ab87e5a42 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:18 +0000 Subject: [PATCH 65/87] KVM: arm64: nv: Sanitise CNTHCTL_EL2 Inject some sanity in CNTHCTL_EL2, ensuring that we don't handle more than we advertise to the guest. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-11-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/nested.c | 15 +++++++++++++++ include/clocksource/arm_arch_timer.h | 2 ++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e18e9244d17a..cf571d41faa8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -490,7 +490,6 @@ enum vcpu_sysreg { VBAR_EL2, /* Vector Base Address Register (EL2) */ RVBAR_EL2, /* Reset Vector Base Address Register */ CONTEXTIDR_EL2, /* Context ID Register (EL2) */ - CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ SP_EL2, /* EL2 Stack Pointer */ CNTHP_CTL_EL2, CNTHP_CVAL_EL2, @@ -501,6 +500,7 @@ enum vcpu_sysreg { MARKER(__SANITISED_REG_START__), TCR2_EL2, /* Extended Translation Control Register (EL2) */ MDCR_EL2, /* Monitor Debug Configuration Register (EL2) */ + CNTHCTL_EL2, /* Counter-timer Hypervisor Control register */ /* Any VNCR-capable reg goes after this point */ MARKER(__VNCR_START__), diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9b36218b48de..9113c6025d6f 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1271,6 +1271,21 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= MDCR_EL2_EnSTEPOP; set_sysreg_masks(kvm, MDCR_EL2, res0, res1); + /* CNTHCTL_EL2 */ + res0 = GENMASK(63, 20); + res1 = 0; + if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP)) + res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) { + res0 |= CNTHCTL_ECV; + if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP)) + res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT | + CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT); + } + if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP)) + res0 |= GENMASK(11, 8); + set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1); + return 0; } diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index c62811fb4130..ce6521ad04d1 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -26,6 +26,8 @@ #define CNTHCTL_EL1TVCT (1 << 14) #define CNTHCTL_EL1NVPCT (1 << 15) #define CNTHCTL_EL1NVVCT (1 << 16) +#define CNTHCTL_CNTVMASK (1 << 18) +#define CNTHCTL_CNTPMASK (1 << 19) enum arch_timer_reg { ARCH_TIMER_REG_CTRL, From 0bc9a9e85fcf4ffb69846b961273fde4eb0d03ab Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:19 +0000 Subject: [PATCH 66/87] KVM: arm64: Work around x1e's CNTVOFF_EL2 bogosity It appears that on Qualcomm's x1e CPU, CNTVOFF_EL2 doesn't really work, specially with HCR_EL2.E2H=1. A non-zero offset results in a screaming virtual timer interrupt, to the tune of a few 100k interrupts per second on a 4 vcpu VM. This is also evidenced by this CPU's inability to correctly run any of the timer selftests. The only case this doesn't break is when this register is set to 0, which breaks VM migration. When HCR_EL2.E2H=0, the timer seems to behave normally, and does not result in an interrupt storm. As a workaround, use the fact that this CPU implements FEAT_ECV, and trap all accesses to the virtual timer and counter, keeping CNTVOFF_EL2 set to zero, and emulate accesses to CVAL/TVAL/CTL and the counter itself, fixing up the timer to account for the missing offset. And if you think this is disgusting, you'd probably be right. Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-12-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/cpu_errata.c | 8 +++++ arch/arm64/kernel/image-vars.h | 3 ++ arch/arm64/kvm/arch_timer.c | 58 ++++++++++++++++++++++++++++-- arch/arm64/kvm/hyp/nvhe/timer-sr.c | 16 ++++++--- arch/arm64/kvm/sys_regs.c | 3 +- arch/arm64/tools/cpucaps | 1 + include/kvm/arm_arch_timer.h | 7 ++++ 8 files changed, 90 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 488f8e751349..6f3f4142e214 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -122,6 +122,7 @@ #define QCOM_CPU_PART_KRYO_3XX_SILVER 0x803 #define QCOM_CPU_PART_KRYO_4XX_GOLD 0x804 #define QCOM_CPU_PART_KRYO_4XX_SILVER 0x805 +#define QCOM_CPU_PART_ORYON_X1 0x001 #define NVIDIA_CPU_PART_DENVER 0x003 #define NVIDIA_CPU_PART_CARMEL 0x004 @@ -198,6 +199,7 @@ #define MIDR_QCOM_KRYO_3XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_3XX_SILVER) #define MIDR_QCOM_KRYO_4XX_GOLD MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_GOLD) #define MIDR_QCOM_KRYO_4XX_SILVER MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO_4XX_SILVER) +#define MIDR_QCOM_ORYON_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_ORYON_X1) #define MIDR_NVIDIA_DENVER MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_DENVER) #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index a78f247029ae..7ce555862895 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -786,6 +786,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(erratum_ac03_cpu_38_list), }, #endif + { + .desc = "Broken CNTVOFF_EL2", + .capability = ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF, + ERRATA_MIDR_RANGE_LIST(((const struct midr_range[]) { + MIDR_ALL_VERSIONS(MIDR_QCOM_ORYON_X1), + {} + })), + }, { } }; diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 8f5422ed1b75..ef3a69cc398e 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -105,6 +105,9 @@ KVM_NVHE_ALIAS(__hyp_stub_vectors); KVM_NVHE_ALIAS(vgic_v2_cpuif_trap); KVM_NVHE_ALIAS(vgic_v3_cpuif_trap); +/* Static key which is set if CNTVOFF_EL2 is unusable */ +KVM_NVHE_ALIAS(broken_cntvoff_key); + /* EL2 exception handling */ KVM_NVHE_ALIAS(__start___kvm_ex_table); KVM_NVHE_ALIAS(__stop___kvm_ex_table); diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index e5951e6eaf23..d3d243366536 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -30,6 +30,7 @@ static u32 host_vtimer_irq_flags; static u32 host_ptimer_irq_flags; static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); +DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key); static const u8 default_ppi[] = { [TIMER_PTIMER] = 30, @@ -519,7 +520,12 @@ static void timer_save_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL)); - timer_set_cval(ctx, read_sysreg_el0(SYS_CNTV_CVAL)); + cval = read_sysreg_el0(SYS_CNTV_CVAL); + + if (has_broken_cntvoff()) + cval -= timer_get_offset(ctx); + + timer_set_cval(ctx, cval); /* Disable the timer */ write_sysreg_el0(0, SYS_CNTV_CTL); @@ -624,8 +630,15 @@ static void timer_restore_state(struct arch_timer_context *ctx) case TIMER_VTIMER: case TIMER_HVTIMER: - set_cntvoff(timer_get_offset(ctx)); - write_sysreg_el0(timer_get_cval(ctx), SYS_CNTV_CVAL); + cval = timer_get_cval(ctx); + offset = timer_get_offset(ctx); + if (has_broken_cntvoff()) { + set_cntvoff(0); + cval += offset; + } else { + set_cntvoff(offset); + } + write_sysreg_el0(cval, SYS_CNTV_CVAL); isb(); write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL); break; @@ -820,6 +833,13 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map) if (!has_cntpoff() && timer_get_offset(map->direct_ptimer)) tpt = tpc = true; + /* + * For the poor sods that could not correctly substract one value + * from another, trap the full virtual timer and counter. + */ + if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer)) + tvt = tvc = true; + /* * Apply the enable bits that the guest hypervisor has requested for * its own guest. We can only add traps that wouldn't have been set @@ -1450,6 +1470,37 @@ static int kvm_irq_init(struct arch_timer_kvm_info *info) return 0; } +static void kvm_timer_handle_errata(void) +{ + u64 mmfr0, mmfr1, mmfr4; + + /* + * CNTVOFF_EL2 is broken on some implementations. For those, we trap + * all virtual timer/counter accesses, requiring FEAT_ECV. + * + * However, a hypervisor supporting nesting is likely to mitigate the + * erratum at L0, and not require other levels to mitigate it (which + * would otherwise be a terrible performance sink due to trap + * amplification). + * + * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0, + * and that NV is likely not to (because of limitations of the + * architecture), only enable the workaround when FEAT_VHE and + * FEAT_E2H0 are both detected. Time will tell if this actually holds. + */ + mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); + mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1); + if (SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1) && + !SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4) && + SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0) && + (has_vhe() || has_hvhe()) && + cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF)) { + static_branch_enable(&broken_cntvoff_key); + kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n"); + } +} + int __init kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; @@ -1518,6 +1569,7 @@ int __init kvm_timer_hyp_init(bool has_gic) goto out_free_vtimer_irq; } + kvm_timer_handle_errata(); return 0; out_free_ptimer_irq: diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index 3aaab20ae5b4..ff176f4ce7de 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -22,15 +22,16 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val, shift = 0; + u64 set, clr, shift = 0; if (has_hvhe()) shift = 10; /* Allow physical timer/counter access for the host */ - val = read_sysreg(cnthctl_el2); - val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; - write_sysreg(val, cnthctl_el2); + set = (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; + clr = CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + + sysreg_clear_set(cnthctl_el2, clr, set); } /* @@ -58,5 +59,12 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu) set <<= 10; } + /* + * Trap the virtual counter/timer if we have a broken cntvoff + * implementation. + */ + if (has_broken_cntvoff()) + set |= CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT; + sysreg_clear_set(cnthctl_el2, clr, set); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 986e63d4f9fa..d161d6c05707 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1721,7 +1721,8 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, if (!vcpu_has_ptrauth(vcpu)) val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3) | ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3)); - if (!cpus_have_final_cap(ARM64_HAS_WFXT)) + if (!cpus_have_final_cap(ARM64_HAS_WFXT) || + has_broken_cntvoff()) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; case SYS_ID_AA64MMFR2_EL1: diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index eb17f59e543c..1e65f2fb45bd 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -105,6 +105,7 @@ WORKAROUND_CLEAN_CACHE WORKAROUND_DEVICE_LOAD_ACQUIRE WORKAROUND_NVIDIA_CARMEL_CNP WORKAROUND_QCOM_FALKOR_E1003 +WORKAROUND_QCOM_ORYON_CNTVOFF WORKAROUND_REPEAT_TLBI WORKAROUND_SPECULATIVE_AT WORKAROUND_SPECULATIVE_SSBS diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index c1ba31fab6f5..681cf0c8b9df 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -151,6 +151,13 @@ void kvm_timer_cpu_down(void); /* CNTKCTL_EL1 valid bits as of DDI0487J.a */ #define CNTKCTL_VALID_BITS (BIT(17) | GENMASK_ULL(9, 0)) +DECLARE_STATIC_KEY_FALSE(broken_cntvoff_key); + +static inline bool has_broken_cntvoff(void) +{ + return static_branch_unlikely(&broken_cntvoff_key); +} + static inline bool has_cntpoff(void) { return (has_vhe() && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF)); From affd1c83e090133a3d1750916c7911b20f8911c0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 17 Dec 2024 14:23:20 +0000 Subject: [PATCH 67/87] KVM: arm64: nv: Document EL2 timer API Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20241217142321.763801-13-maz@kernel.org Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/devices/vcpu.rst | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index 31f14ec4a65b..d62ba86ee166 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -142,8 +142,9 @@ the cpu field to the processor id. :Architectures: ARM64 -2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER ------------------------------------------------------------------------------ +2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER, + KVM_ARM_VCPU_TIMER_IRQ_HVTIMER, KVM_ARM_VCPU_TIMER_IRQ_HPTIMER, +-------------------------------------------------------------------------------- :Parameters: in kvm_device_attr.addr the address for the timer interrupt is a pointer to an int @@ -159,10 +160,12 @@ A value describing the architected timer interrupt number when connected to an in-kernel virtual GIC. These must be a PPI (16 <= intid < 32). Setting the attribute overrides the default values (see below). -============================= ========================================== -KVM_ARM_VCPU_TIMER_IRQ_VTIMER The EL1 virtual timer intid (default: 27) -KVM_ARM_VCPU_TIMER_IRQ_PTIMER The EL1 physical timer intid (default: 30) -============================= ========================================== +============================== ========================================== +KVM_ARM_VCPU_TIMER_IRQ_VTIMER The EL1 virtual timer intid (default: 27) +KVM_ARM_VCPU_TIMER_IRQ_PTIMER The EL1 physical timer intid (default: 30) +KVM_ARM_VCPU_TIMER_IRQ_HVTIMER The EL2 virtual timer intid (default: 28) +KVM_ARM_VCPU_TIMER_IRQ_HPTIMER The EL2 physical timer intid (default: 26) +============================== ========================================== Setting the same PPI for different timers will prevent the VCPUs from running. Setting the interrupt number on a VCPU configures all VCPUs created at that From e8440c1e2d23a9ca5e0af1a18be637cbd5a5d44f Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Fri, 25 Oct 2024 09:32:59 +0000 Subject: [PATCH 68/87] Documentation: Update the behaviour of "kvm-arm.mode" Commit 5053c3f0519c ("KVM: arm64: Use hVHE in pKVM by default on CPUs with VHE support") modified the behaviour of "kvm-arm.mode=protected" without the updating the kernel parameters doc. Update it to match the current implementation. Also, update required architecture version for nested virtualization as suggested by Marc. Cc: Will Deacon Cc: Marc Zyngier Signed-off-by: Mostafa Saleh Link: https://lore.kernel.org/r/20241025093259.2216093-1-smostafa@google.com Signed-off-by: Marc Zyngier --- Documentation/admin-guide/kernel-parameters.txt | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3872bc6ec49d..7c870883af8e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2748,17 +2748,21 @@ nvhe: Standard nVHE-based mode, without support for protected guests. - protected: nVHE-based mode with support for guests whose - state is kept private from the host. + protected: Mode with support for guests whose state is + kept private from the host, using VHE or + nVHE depending on HW support. nested: VHE-based mode with support for nested - virtualization. Requires at least ARMv8.3 - hardware. + virtualization. Requires at least ARMv8.4 + hardware (with FEAT_NV2). Defaults to VHE/nVHE based on hardware support. Setting mode to "protected" will disable kexec and hibernation - for the host. "nested" is experimental and should be - used with extreme caution. + for the host. To force nVHE on VHE hardware, add + "arm64_sw.hvhe=0 id_aa64mmfr1.vh=0" to the + command-line. + "nested" is experimental and should be used with + extreme caution. kvm-arm.vgic_v3_group0_trap= [KVM,ARM,EARLY] Trap guest accesses to GICv3 group-0 From b7f345fbc32afab0f0b03c71c7eaf48b9a0ad7ed Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Mon, 6 Jan 2025 11:24:21 +0000 Subject: [PATCH 69/87] KVM: arm64: Fix FEAT_MTE in pKVM Make sure we do not trap access to Allocation Tags. Fixes: b56680de9c64 ("KVM: arm64: Initialize trap register values in hyp in pKVM") Signed-off-by: Vladimir Murzin Reviewed-by: Oliver Upton Reviewed-by: Fuad Tabba Link: https://lore.kernel.org/r/20250106112421.65355-1-vladimir.murzin@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 4b2622f8e250..f76adddc52de 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -47,6 +47,9 @@ static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) if (vcpu_has_ptrauth(vcpu)) vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); + + if (kvm_has_mte(vcpu->kvm)) + vcpu->arch.hcr_el2 |= HCR_ATA; } static void pvm_init_traps_hcr(struct kvm_vcpu *vcpu) @@ -251,6 +254,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc unsigned long host_arch_flags = READ_ONCE(host_kvm->arch.flags); DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags)) + set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags); + /* No restrictions for non-protected VMs. */ if (!kvm_vm_is_protected(kvm)) { hyp_vm->kvm.arch.flags = host_arch_flags; From 68344037b764401f751c66661c53334ea1e15324 Mon Sep 17 00:00:00 2001 From: Vincent Donnefort Date: Tue, 7 Jan 2025 11:28:21 +0000 Subject: [PATCH 70/87] KVM: arm64: Fix nVHE stacktrace VA bits mask The hypervisor VA space size depends on both the ID map's (IDMAP_VA_BITS) and the kernel stage-1 (VA_BITS). However, the hypervisor stacktrace decoding is solely relying on VA_BITS. This is especially an issue when VA_BITS < IDMAP_VA_BITS (i.e. VA_BITS is 39-bit): the hypervisor may have addresses bigger than the stacktrace is masking. Align this mask with hyp_va_bits. Signed-off-by: Vincent Donnefort Link: https://lore.kernel.org/r/20250107112821.416591-1-vdonnefort@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_mmu.h | 2 ++ arch/arm64/kvm/mmu.c | 3 +++ arch/arm64/kvm/stacktrace.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 66d93e320ec8..2ddd98fd2df4 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -139,6 +139,8 @@ static __always_inline unsigned long __kern_hyp_va(unsigned long v) #define kern_hyp_va(v) ((typeof(v))(__kern_hyp_va((unsigned long)(v)))) +extern u32 __hyp_va_bits; + /* * We currently support using a VM-specified IPA size. For backward * compatibility, the default IPA size is fixed to 40bits. diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index c9d46ad57e52..d36be6d2ac91 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -29,6 +29,8 @@ static unsigned long __ro_after_init hyp_idmap_start; static unsigned long __ro_after_init hyp_idmap_end; static phys_addr_t __ro_after_init hyp_idmap_vector; +u32 __ro_after_init __hyp_va_bits; + static unsigned long __ro_after_init io_map_base; static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, @@ -2056,6 +2058,7 @@ int __init kvm_mmu_init(u32 *hyp_va_bits) goto out_destroy_pgtable; io_map_base = hyp_idmap_start; + __hyp_va_bits = *hyp_va_bits; return 0; out_destroy_pgtable: diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index 3ace5b75813b..fdedd8a3ed6f 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -19,6 +19,7 @@ #include #include +#include #include static struct stack_info stackinfo_get_overflow(void) @@ -145,7 +146,7 @@ static void unwind(struct unwind_state *state, */ static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) { - unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long va_mask = GENMASK_ULL(__hyp_va_bits - 1, 0); unsigned long hyp_offset = (unsigned long)arg; /* Mask tags and convert to kern addr */ From 38f9e4b905a00047a96fbdc6cefe9ceb4dae34c3 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 11 Nov 2024 16:32:50 -0800 Subject: [PATCH 71/87] arm64: kvm: Introduce nvhe stack size constants Refactor nvhe stack code to use NVHE_STACK_SIZE/SHIFT constants, instead of directly using PAGE_SIZE/SHIFT. This makes the code a bit easier to read, without introducing any functional changes. Cc: Marc Zyngier Cc: Mark Brown Cc: Mark Rutland Cc: Will Deacon Signed-off-by: Kalesh Singh Link: https://lore.kernel.org/r/20241112003336.1375584-1-kaleshsingh@google.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/memory.h | 5 ++++- arch/arm64/include/asm/stacktrace/nvhe.h | 2 +- arch/arm64/kvm/arm.c | 18 +++++++++--------- arch/arm64/kvm/hyp/nvhe/host.S | 4 ++-- arch/arm64/kvm/hyp/nvhe/mm.c | 12 ++++++------ arch/arm64/kvm/hyp/nvhe/stacktrace.c | 4 ++-- arch/arm64/kvm/mmu.c | 12 ++++++------ arch/arm64/kvm/stacktrace.c | 6 +++--- 8 files changed, 33 insertions(+), 30 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 8b9f33cf561b..717829df294e 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -145,13 +145,16 @@ #define OVERFLOW_STACK_SIZE SZ_4K +#define NVHE_STACK_SHIFT PAGE_SHIFT +#define NVHE_STACK_SIZE (UL(1) << NVHE_STACK_SHIFT) + /* * With the minimum frame size of [x29, x30], exactly half the combined * sizes of the hyp and overflow stacks is the maximum size needed to * save the unwinded stacktrace; plus an additional entry to delimit the * end. */ -#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long)) +#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + NVHE_STACK_SIZE) / 2 + sizeof(long)) /* * Alignment of kernel segments (e.g. .text, .data). diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h index 44759281d0d4..171f9edef49f 100644 --- a/arch/arm64/include/asm/stacktrace/nvhe.h +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -47,7 +47,7 @@ static inline void kvm_nvhe_unwind_init(struct unwind_state *state, DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); -DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a102c3aebdbc..9ac45e23770f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -61,7 +61,7 @@ static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTR DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); @@ -2339,7 +2339,7 @@ static void __init teardown_hyp_mode(void) free_hyp_pgds(); for_each_possible_cpu(cpu) { - free_page(per_cpu(kvm_arm_hyp_stack_page, cpu)); + free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu), NVHE_STACK_SHIFT - PAGE_SHIFT); free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu], nvhe_percpu_order()); if (free_sve) { @@ -2527,15 +2527,15 @@ static int __init init_hyp_mode(void) * Allocate stack pages for Hypervisor-mode */ for_each_possible_cpu(cpu) { - unsigned long stack_page; + unsigned long stack_base; - stack_page = __get_free_page(GFP_KERNEL); - if (!stack_page) { + stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT); + if (!stack_base) { err = -ENOMEM; goto out_err; } - per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page; + per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base; } /* @@ -2604,9 +2604,9 @@ static int __init init_hyp_mode(void) */ for_each_possible_cpu(cpu) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu); + char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu); - err = create_hyp_stack(__pa(stack_page), ¶ms->stack_hyp_va); + err = create_hyp_stack(__pa(stack_base), ¶ms->stack_hyp_va); if (err) { kvm_err("Cannot map hyp stack\n"); goto out_err; @@ -2618,7 +2618,7 @@ static int __init init_hyp_mode(void) * __hyp_pa() won't do the right thing there, since the stack * has been mapped in the flexible private VA space. */ - params->stack_pa = __pa(stack_page); + params->stack_pa = __pa(stack_base); } for_each_possible_cpu(cpu) { diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 3d610fc51f4d..58f0cb2298cc 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -188,12 +188,12 @@ SYM_FUNC_END(__host_hvc) /* * Test whether the SP has overflowed, without corrupting a GPR. - * nVHE hypervisor stacks are aligned so that the PAGE_SHIFT bit + * nVHE hypervisor stacks are aligned so that the NVHE_STACK_SHIFT bit * of SP should always be 1. */ add sp, sp, x0 // sp' = sp + x0 sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp - tbz x0, #PAGE_SHIFT, .L__hyp_sp_overflow\@ + tbz x0, #NVHE_STACK_SHIFT, .L__hyp_sp_overflow\@ sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index 8850b591d775..f41c7440b34b 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -360,10 +360,10 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) prev_base = __io_map_base; /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; addr = ALIGN(__io_map_base, size); ret = __pkvm_alloc_private_va_range(addr, size); @@ -373,12 +373,12 @@ int pkvm_create_stack(phys_addr_t phys, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + PAGE_SIZE, - PAGE_SIZE, phys, PAGE_HYP); + ret = kvm_pgtable_hyp_map(&pkvm_pgtable, addr + NVHE_STACK_SIZE, + NVHE_STACK_SIZE, phys, PAGE_HYP); if (ret) __io_map_base = prev_base; } diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c index ed6b58b19cfa..5b6eeab1a774 100644 --- a/arch/arm64/kvm/hyp/nvhe/stacktrace.c +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -28,7 +28,7 @@ static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); - stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - NVHE_STACK_SIZE); stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); stacktrace_info->fp = fp; stacktrace_info->pc = pc; @@ -54,7 +54,7 @@ static struct stack_info stackinfo_get_hyp(void) { struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); unsigned long high = params->stack_hyp_va; - unsigned long low = high - PAGE_SIZE; + unsigned long low = high - NVHE_STACK_SIZE; return (struct stack_info) { .low = low, diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d36be6d2ac91..8850741243bb 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -706,10 +706,10 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) mutex_lock(&kvm_hyp_pgd_mutex); /* - * Efficient stack verification using the PAGE_SHIFT bit implies + * Efficient stack verification using the NVHE_STACK_SHIFT bit implies * an alignment of our allocation on the order of the size. */ - size = PAGE_SIZE * 2; + size = NVHE_STACK_SIZE * 2; base = ALIGN_DOWN(io_map_base - size, size); ret = __hyp_alloc_private_va_range(base); @@ -726,12 +726,12 @@ int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr) * at the higher address and leave the lower guard page * unbacked. * - * Any valid stack address now has the PAGE_SHIFT bit as 1 + * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1 * and addresses corresponding to the guard page have the - * PAGE_SHIFT bit as 0 - this is used for overflow detection. + * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection. */ - ret = __create_hyp_mappings(base + PAGE_SIZE, PAGE_SIZE, phys_addr, - PAGE_HYP); + ret = __create_hyp_mappings(base + NVHE_STACK_SIZE, NVHE_STACK_SIZE, + phys_addr, PAGE_HYP); if (ret) kvm_err("Cannot map hyp stack\n"); diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c index fdedd8a3ed6f..af5eec681127 100644 --- a/arch/arm64/kvm/stacktrace.c +++ b/arch/arm64/kvm/stacktrace.c @@ -51,7 +51,7 @@ static struct stack_info stackinfo_get_hyp(void) struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); unsigned long low = (unsigned long)stacktrace_info->stack_base; - unsigned long high = low + PAGE_SIZE; + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, @@ -61,8 +61,8 @@ static struct stack_info stackinfo_get_hyp(void) static struct stack_info stackinfo_get_hyp_kern_va(void) { - unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); - unsigned long high = low + PAGE_SIZE; + unsigned long low = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_base); + unsigned long high = low + NVHE_STACK_SIZE; return (struct stack_info) { .low = low, From dea8838128c580cc6b563c901576b580f93a703e Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 10 Jan 2025 23:53:09 +0100 Subject: [PATCH 72/87] KVM: arm64: vgic: Use str_enabled_disabled() in vgic_v3_probe() Remove hard-coded strings by using the str_enabled_disabled() helper function. Suggested-by: Christophe JAILLET Signed-off-by: Thorsten Blum Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250110225310.369980-2-thorsten.blum@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/kvm/vgic/vgic-v3.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index f267bc2486a1..528ee93fefd4 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -663,9 +664,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info) if (info->has_v4) { kvm_vgic_global_state.has_gicv4 = gicv4_enable; kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable; - kvm_info("GICv4%s support %sabled\n", + kvm_info("GICv4%s support %s\n", kvm_vgic_global_state.has_gicv4_1 ? ".1" : "", - gicv4_enable ? "en" : "dis"); + str_enabled_disabled(gicv4_enable)); } kvm_vgic_global_state.vcpu_base = 0; From a7f1fa5564be565bd4bc18875bb46ffd0c01d292 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 9 Jan 2025 16:38:36 -0600 Subject: [PATCH 73/87] KVM: arm64: Explicitly handle BRBE traps as UNDEFINED The Branch Record Buffer Extension (BRBE) adds a number of system registers and instructions which we don't currently intend to expose to guests. Our existing logic handles this safely, but this could be improved with some explicit handling of BRBE. KVM currently hides BRBE from guests: the cpufeature code's ftr_id_aa64dfr0[] table doesn't have an entry for the BRBE field, and so this will be zero in the sanitised value of ID_AA64DFR0 exposed to guests via read_sanitised_id_aa64dfr0_el1(). KVM currently traps BRBE usage from guests: the default configuration of the fine-grained trap controls HDFGRTR_EL2.{nBRBDATA,nBRBCTL,nBRBIDR} and HFGITR_EL2.{nBRBINJ_nBRBIALL} cause these to be trapped to EL2. Well-behaved guests shouldn't try to use the registers or instructions, but badly-behaved guests could use these, resulting in unnecessary warnings from KVM before it injects an UNDEF, e.g. | kvm [197]: Unsupported guest access at: 401c98 | { Op0( 2), Op1( 1), CRn( 9), CRm( 0), Op2( 0), func_read }, | kvm [197]: Unsupported guest access at: 401d04 | { Op0( 2), Op1( 1), CRn( 9), CRm( 0), Op2( 1), func_read }, | kvm [197]: Unsupported guest access at: 401d70 | { Op0( 2), Op1( 1), CRn( 9), CRm( 2), Op2( 0), func_read }, | kvm [197]: Unsupported guest access at: 401ddc | { Op0( 2), Op1( 1), CRn( 9), CRm( 1), Op2( 0), func_read }, | kvm [197]: Unsupported guest access at: 401e48 | { Op0( 2), Op1( 1), CRn( 9), CRm( 1), Op2( 1), func_read }, | kvm [197]: Unsupported guest access at: 401eb4 | { Op0( 2), Op1( 1), CRn( 9), CRm( 1), Op2( 2), func_read }, | kvm [197]: Unsupported guest access at: 401f20 | { Op0( 2), Op1( 1), CRn( 9), CRm( 0), Op2( 2), func_read }, | kvm [197]: Unsupported guest access at: 401f8c | { Op0( 1), Op1( 1), CRn( 7), CRm( 2), Op2( 4), func_write }, | kvm [197]: Unsupported guest access at: 401ff8 | { Op0( 1), Op1( 1), CRn( 7), CRm( 2), Op2( 5), func_write }, As with other features that we know how to handle, these warnings aren't particularly interesting, and we can simply treat these as UNDEFINED without any warning. Add the necessary fine-grained undef configuration to make this happen, as suggested by Marc Zyngier: https://lore.kernel.org/linux-arm-kernel/86r0czk6wd.wl-maz@kernel.org/ At the same time, update read_sanitised_id_aa64dfr0_el1() to hide BRBE from guests, as we do for SPE. This will prevent accidentally exposing BRBE to guests if/when ftr_id_aa64dfr0[] gains a BRBE entry. Cc: Anshuman Khandual Signed-off-by: Mark Rutland Signed-off-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250109223836.419240-1-robh@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/sys_regs.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e2a5c2918d9e..bc5a10b590ad 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1806,6 +1806,9 @@ static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val) /* Hide SPE from guests */ val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; + /* Hide BRBE from guests */ + val &= ~ID_AA64DFR0_EL1_BRBE_MASK; + return val; } @@ -4973,6 +4976,14 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) kvm->arch.fgu[HAFGRTR_GROUP] |= ~(HAFGRTR_EL2_RES0 | HAFGRTR_EL2_RES1); + if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, BRBE, IMP)) { + kvm->arch.fgu[HDFGRTR_GROUP] |= (HDFGRTR_EL2_nBRBDATA | + HDFGRTR_EL2_nBRBCTL | + HDFGRTR_EL2_nBRBIDR); + kvm->arch.fgu[HFGITR_GROUP] |= (HFGITR_EL2_nBRBINJ | + HFGITR_EL2_nBRBIALL); + } + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: mutex_unlock(&kvm->arch.config_lock); From 7a0688832f580e5f4eda2dd92b4806fb4348f9c6 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 10 Jan 2025 12:19:34 +0000 Subject: [PATCH 74/87] KVM: arm64: Drop pkvm_mem_transition for FF-A Simplify the __pkvm_host_{un}share_ffa() paths by using {check,set}_page_state_range(). No functional changes intended. Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20250110121936.1559655-2-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 36 ++++++++------------------- 1 file changed, 10 insertions(+), 26 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 392f036bc69e..3a3d9fcbc508 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -1324,22 +1324,14 @@ void hyp_unpin_shared_mem(void *from, void *to) int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, - }, - }, - }; host_lock_component(); - ret = do_share(&share); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); host_unlock_component(); return ret; @@ -1347,22 +1339,14 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; int ret; - struct pkvm_mem_share share = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = hyp_pfn_to_phys(pfn), - }, - .completer = { - .id = PKVM_ID_FFA, - }, - }, - }; host_lock_component(); - ret = do_unshare(&share); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (!ret) + ret = __host_set_page_state_range(phys, size, PKVM_PAGE_OWNED); host_unlock_component(); return ret; From 7cbf7c37718e67f2c994d4ed4ca7128fe06a4f60 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 10 Jan 2025 12:19:35 +0000 Subject: [PATCH 75/87] KVM: arm64: Drop pkvm_mem_transition for host/hyp sharing Simplify the __pkvm_host_{un}share_hyp() paths by not using the pkvm_mem_transition machinery. As there are the last users of the do_share()/do_unshare(), remove all the now-unused code as well. No functional changes intended. Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20250110121936.1559655-3-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 319 +++----------------------- 1 file changed, 34 insertions(+), 285 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 3a3d9fcbc508..cd8d233b9527 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -685,36 +685,6 @@ static int host_request_owned_transition(u64 *completer_addr, return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); } -static int host_request_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} - -static int host_initiate_share(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_SHARED_OWNED); -} - -static int host_initiate_unshare(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_set_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - static int host_initiate_donation(u64 *completer_addr, const struct pkvm_mem_transition *tx) { @@ -802,31 +772,6 @@ static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) tx->initiator.id != PKVM_ID_HOST); } -static int hyp_ack_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - - if (perms != PAGE_HYP) - return -EPERM; - - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; - - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); -} - -static int hyp_ack_unshare(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - - if (tx->initiator.id == PKVM_ID_HOST && hyp_page_count((void *)addr)) - return -EBUSY; - - return __hyp_check_page_state_range(addr, size, - PKVM_PAGE_SHARED_BORROWED); -} - static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) { u64 size = tx->nr_pages * PAGE_SIZE; @@ -837,24 +782,6 @@ static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); } -static int hyp_complete_share(u64 addr, const struct pkvm_mem_transition *tx, - enum kvm_pgtable_prot perms) -{ - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot; - - prot = pkvm_mkstate(perms, PKVM_PAGE_SHARED_BORROWED); - return pkvm_create_mappings_locked(start, end, prot); -} - -static int hyp_complete_unshare(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - int ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, addr, size); - - return (ret != size) ? -EFAULT : 0; -} - static int hyp_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) { @@ -885,180 +812,6 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, return check_page_state_range(&vm->pgt, addr, size, &d); } -static int check_share(struct pkvm_mem_share *share) -{ - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We only check the host; the secure side will check the other - * end when we forward the FFA call. - */ - ret = 0; - break; - default: - ret = -EINVAL; - } - - return ret; -} - -static int __do_share(struct pkvm_mem_share *share) -{ - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_share(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_share(completer_addr, tx, share->completer_prot); - break; - case PKVM_ID_FFA: - /* - * We're not responsible for any secure page-tables, so there's - * nothing to do here. - */ - ret = 0; - break; - default: - ret = -EINVAL; - } - - return ret; -} - -/* - * do_share(): - * - * The page owner grants access to another component with a given set - * of permissions. - * - * Initiator: OWNED => SHARED_OWNED - * Completer: NOPAGE => SHARED_BORROWED - */ -static int do_share(struct pkvm_mem_share *share) -{ - int ret; - - ret = check_share(share); - if (ret) - return ret; - - return WARN_ON(__do_share(share)); -} - -static int check_unshare(struct pkvm_mem_share *share) -{ - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_ack_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See check_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } - - return ret; -} - -static int __do_unshare(struct pkvm_mem_share *share) -{ - const struct pkvm_mem_transition *tx = &share->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_unshare(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HYP: - ret = hyp_complete_unshare(completer_addr, tx); - break; - case PKVM_ID_FFA: - /* See __do_share() */ - ret = 0; - break; - default: - ret = -EINVAL; - } - - return ret; -} - -/* - * do_unshare(): - * - * The page owner revokes access from another component for a range of - * pages which were previously shared using do_share(). - * - * Initiator: SHARED_OWNED => OWNED - * Completer: SHARED_BORROWED => NOPAGE - */ -static int do_unshare(struct pkvm_mem_share *share) -{ - int ret; - - ret = check_unshare(share); - if (ret) - return ret; - - return WARN_ON(__do_unshare(share)); -} - static int check_donation(struct pkvm_mem_donation *donation) { const struct pkvm_mem_transition *tx = &donation->tx; @@ -1149,31 +902,29 @@ static int do_donate(struct pkvm_mem_donation *donation) int __pkvm_host_share_hyp(u64 pfn) { + u64 phys = hyp_pfn_to_phys(pfn); + void *virt = __hyp_va(phys); + enum kvm_pgtable_prot prot; + u64 size = PAGE_SIZE; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; host_lock_component(); hyp_lock_component(); - ret = do_share(&share); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); + if (ret) + goto unlock; + } + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED)); + +unlock: hyp_unlock_component(); host_unlock_component(); @@ -1182,31 +933,29 @@ int __pkvm_host_share_hyp(u64 pfn) int __pkvm_host_unshare_hyp(u64 pfn) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 virt = (u64)__hyp_va(phys); + u64 size = PAGE_SIZE; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_share share = { - .tx = { - .nr_pages = 1, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - .completer_prot = PAGE_HYP, - }; host_lock_component(); hyp_lock_component(); - ret = do_unshare(&share); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED); + if (ret) + goto unlock; + ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_SHARED_BORROWED); + if (ret) + goto unlock; + if (hyp_page_count((void *)virt)) { + ret = -EBUSY; + goto unlock; + } + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(__host_set_page_state_range(phys, size, PKVM_PAGE_OWNED)); + +unlock: hyp_unlock_component(); host_unlock_component(); From 6f91d31d47c57953da68e1a91240ae2543b00172 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 10 Jan 2025 12:19:36 +0000 Subject: [PATCH 76/87] KVM: arm64: Drop pkvm_mem_transition for host/hyp donations Simplify the __pkvm_host_donate_hyp() and pkvm_hyp_donate_host() paths by not using the pkvm_mem_transition machinery. As the last users of this, also remove all the now unused code. No functional changes intended. Signed-off-by: Quentin Perret Link: https://lore.kernel.org/r/20250110121936.1559655-4-qperret@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 285 +++----------------------- 1 file changed, 32 insertions(+), 253 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index cd8d233b9527..7ad7b133b81a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -583,39 +583,6 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) BUG_ON(ret && ret != -EAGAIN); } -struct pkvm_mem_transition { - u64 nr_pages; - - struct { - enum pkvm_component_id id; - /* Address in the initiator's address space */ - u64 addr; - - union { - struct { - /* Address in the completer's address space */ - u64 completer_addr; - } host; - struct { - u64 completer_addr; - } hyp; - }; - } initiator; - - struct { - enum pkvm_component_id id; - } completer; -}; - -struct pkvm_mem_share { - const struct pkvm_mem_transition tx; - const enum kvm_pgtable_prot completer_prot; -}; - -struct pkvm_mem_donation { - const struct pkvm_mem_transition tx; -}; - struct check_walk_data { enum pkvm_page_state desired; enum pkvm_page_state (*get_page_state)(kvm_pte_t pte, u64 addr); @@ -675,56 +642,6 @@ static int __host_set_page_state_range(u64 addr, u64 size, return 0; } -static int host_request_owned_transition(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.host.completer_addr; - return __host_check_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int host_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u8 owner_id = tx->completer.id; - u64 size = tx->nr_pages * PAGE_SIZE; - - *completer_addr = tx->initiator.host.completer_addr; - return host_stage2_set_owner_locked(tx->initiator.addr, size, owner_id); -} - -static bool __host_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) -{ - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HYP); -} - -static int __host_ack_transition(u64 addr, const struct pkvm_mem_transition *tx, - enum pkvm_page_state state) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - - if (__host_ack_skip_pgtable_check(tx)) - return 0; - - return __host_check_page_state_range(addr, size, state); -} - -static int host_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - return __host_ack_transition(addr, tx, PKVM_NOPAGE); -} - -static int host_complete_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u8 host_id = tx->completer.id; - - return host_stage2_set_owner_locked(addr, size, host_id); -} - static enum pkvm_page_state hyp_get_page_state(kvm_pte_t pte, u64 addr) { if (!kvm_pte_valid(pte)) @@ -745,52 +662,6 @@ static int __hyp_check_page_state_range(u64 addr, u64 size, return check_page_state_range(&pkvm_pgtable, addr, size, &d); } -static int hyp_request_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - u64 addr = tx->initiator.addr; - - *completer_addr = tx->initiator.hyp.completer_addr; - return __hyp_check_page_state_range(addr, size, PKVM_PAGE_OWNED); -} - -static int hyp_initiate_donation(u64 *completer_addr, - const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - int ret; - - *completer_addr = tx->initiator.hyp.completer_addr; - ret = kvm_pgtable_hyp_unmap(&pkvm_pgtable, tx->initiator.addr, size); - return (ret != size) ? -EFAULT : 0; -} - -static bool __hyp_ack_skip_pgtable_check(const struct pkvm_mem_transition *tx) -{ - return !(IS_ENABLED(CONFIG_NVHE_EL2_DEBUG) || - tx->initiator.id != PKVM_ID_HOST); -} - -static int hyp_ack_donation(u64 addr, const struct pkvm_mem_transition *tx) -{ - u64 size = tx->nr_pages * PAGE_SIZE; - - if (__hyp_ack_skip_pgtable_check(tx)) - return 0; - - return __hyp_check_page_state_range(addr, size, PKVM_NOPAGE); -} - -static int hyp_complete_donation(u64 addr, - const struct pkvm_mem_transition *tx) -{ - void *start = (void *)addr, *end = start + (tx->nr_pages * PAGE_SIZE); - enum kvm_pgtable_prot prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); - - return pkvm_create_mappings_locked(start, end, prot); -} - static enum pkvm_page_state guest_get_page_state(kvm_pte_t pte, u64 addr) { if (!kvm_pte_valid(pte)) @@ -812,94 +683,6 @@ static int __guest_check_page_state_range(struct pkvm_hyp_vcpu *vcpu, u64 addr, return check_page_state_range(&vm->pgt, addr, size, &d); } -static int check_donation(struct pkvm_mem_donation *donation) -{ - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_request_owned_transition(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_request_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_ack_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_ack_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - return ret; -} - -static int __do_donate(struct pkvm_mem_donation *donation) -{ - const struct pkvm_mem_transition *tx = &donation->tx; - u64 completer_addr; - int ret; - - switch (tx->initiator.id) { - case PKVM_ID_HOST: - ret = host_initiate_donation(&completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_initiate_donation(&completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - if (ret) - return ret; - - switch (tx->completer.id) { - case PKVM_ID_HOST: - ret = host_complete_donation(completer_addr, tx); - break; - case PKVM_ID_HYP: - ret = hyp_complete_donation(completer_addr, tx); - break; - default: - ret = -EINVAL; - } - - return ret; -} - -/* - * do_donate(): - * - * The page owner transfers ownership to another component, losing access - * as a consequence. - * - * Initiator: OWNED => NOPAGE - * Completer: NOPAGE => OWNED - */ -static int do_donate(struct pkvm_mem_donation *donation) -{ - int ret; - - ret = check_donation(donation); - if (ret) - return ret; - - return WARN_ON(__do_donate(donation)); -} - int __pkvm_host_share_hyp(u64 pfn) { u64 phys = hyp_pfn_to_phys(pfn); @@ -964,30 +747,29 @@ int __pkvm_host_unshare_hyp(u64 pfn) int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + void *virt = __hyp_va(phys); + enum kvm_pgtable_prot prot; int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HOST, - .addr = host_addr, - .host = { - .completer_addr = hyp_addr, - }, - }, - .completer = { - .id = PKVM_ID_HYP, - }, - }, - }; host_lock_component(); hyp_lock_component(); - ret = do_donate(&donation); + ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __hyp_check_page_state_range((u64)virt, size, PKVM_NOPAGE); + if (ret) + goto unlock; + } + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_OWNED); + WARN_ON(pkvm_create_mappings_locked(virt, virt + size, prot)); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HYP)); + +unlock: hyp_unlock_component(); host_unlock_component(); @@ -996,30 +778,27 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages) int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages) { + u64 phys = hyp_pfn_to_phys(pfn); + u64 size = PAGE_SIZE * nr_pages; + u64 virt = (u64)__hyp_va(phys); int ret; - u64 host_addr = hyp_pfn_to_phys(pfn); - u64 hyp_addr = (u64)__hyp_va(host_addr); - struct pkvm_mem_donation donation = { - .tx = { - .nr_pages = nr_pages, - .initiator = { - .id = PKVM_ID_HYP, - .addr = hyp_addr, - .hyp = { - .completer_addr = host_addr, - }, - }, - .completer = { - .id = PKVM_ID_HOST, - }, - }, - }; host_lock_component(); hyp_lock_component(); - ret = do_donate(&donation); + ret = __hyp_check_page_state_range(virt, size, PKVM_PAGE_OWNED); + if (ret) + goto unlock; + if (IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + ret = __host_check_page_state_range(phys, size, PKVM_NOPAGE); + if (ret) + goto unlock; + } + WARN_ON(kvm_pgtable_hyp_unmap(&pkvm_pgtable, virt, size) != size); + WARN_ON(host_stage2_set_owner_locked(phys, size, PKVM_ID_HOST)); + +unlock: hyp_unlock_component(); host_unlock_component(); From 38138762faffeb923d9f49efbcc09884f1530786 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Jan 2025 14:24:37 +0000 Subject: [PATCH 77/87] tools: arm64: Update sysreg.h header files Created with the following: cp include/linux/kasan-tags.h tools/include/linux/ cp arch/arm64/include/asm/sysreg.h tools/arch/arm64/include/asm/ Update the tools copy of sysreg.h so that the next commit to add a new register doesn't have unrelated changes in it. Because the new version of sysreg.h includes kasan-tags.h, that file also now needs to be copied into tools. Acked-by: Mark Brown Reviewed-by: Suzuki K Poulose Signed-off-by: James Clark Signed-off-by: James Clark Link: https://lore.kernel.org/r/20250106142446.628923-3-james.clark@linaro.org Signed-off-by: Marc Zyngier --- tools/arch/arm64/include/asm/sysreg.h | 398 +++++++++++++++++++++++++- tools/include/linux/kasan-tags.h | 15 + 2 files changed, 405 insertions(+), 8 deletions(-) create mode 100644 tools/include/linux/kasan-tags.h diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index cd8420e8c3ad..345e81e0d2b3 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -11,6 +11,7 @@ #include #include +#include #include @@ -108,6 +109,9 @@ #define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x)) #define set_pstate_dit(x) asm volatile(SET_PSTATE_DIT(x)) +/* Register-based PAN access, for save/restore purposes */ +#define SYS_PSTATE_PAN sys_reg(3, 0, 4, 2, 3) + #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) @@ -123,6 +127,37 @@ #define SYS_DC_CIGSW sys_insn(1, 0, 7, 14, 4) #define SYS_DC_CIGDSW sys_insn(1, 0, 7, 14, 6) +#define SYS_IC_IALLUIS sys_insn(1, 0, 7, 1, 0) +#define SYS_IC_IALLU sys_insn(1, 0, 7, 5, 0) +#define SYS_IC_IVAU sys_insn(1, 3, 7, 5, 1) + +#define SYS_DC_IVAC sys_insn(1, 0, 7, 6, 1) +#define SYS_DC_IGVAC sys_insn(1, 0, 7, 6, 3) +#define SYS_DC_IGDVAC sys_insn(1, 0, 7, 6, 5) + +#define SYS_DC_CVAC sys_insn(1, 3, 7, 10, 1) +#define SYS_DC_CGVAC sys_insn(1, 3, 7, 10, 3) +#define SYS_DC_CGDVAC sys_insn(1, 3, 7, 10, 5) + +#define SYS_DC_CVAU sys_insn(1, 3, 7, 11, 1) + +#define SYS_DC_CVAP sys_insn(1, 3, 7, 12, 1) +#define SYS_DC_CGVAP sys_insn(1, 3, 7, 12, 3) +#define SYS_DC_CGDVAP sys_insn(1, 3, 7, 12, 5) + +#define SYS_DC_CVADP sys_insn(1, 3, 7, 13, 1) +#define SYS_DC_CGVADP sys_insn(1, 3, 7, 13, 3) +#define SYS_DC_CGDVADP sys_insn(1, 3, 7, 13, 5) + +#define SYS_DC_CIVAC sys_insn(1, 3, 7, 14, 1) +#define SYS_DC_CIGVAC sys_insn(1, 3, 7, 14, 3) +#define SYS_DC_CIGDVAC sys_insn(1, 3, 7, 14, 5) + +/* Data cache zero operations */ +#define SYS_DC_ZVA sys_insn(1, 3, 7, 4, 1) +#define SYS_DC_GVA sys_insn(1, 3, 7, 4, 3) +#define SYS_DC_GZVA sys_insn(1, 3, 7, 4, 4) + /* * Automatically generated definitions for system registers, the * manual encodings below are in the process of being converted to @@ -162,6 +197,84 @@ #define SYS_DBGDTRTX_EL0 sys_reg(2, 3, 0, 5, 0) #define SYS_DBGVCR32_EL2 sys_reg(2, 4, 0, 7, 0) +#define SYS_BRBINF_EL1(n) sys_reg(2, 1, 8, (n & 15), (((n & 16) >> 2) | 0)) +#define SYS_BRBINFINJ_EL1 sys_reg(2, 1, 9, 1, 0) +#define SYS_BRBSRC_EL1(n) sys_reg(2, 1, 8, (n & 15), (((n & 16) >> 2) | 1)) +#define SYS_BRBSRCINJ_EL1 sys_reg(2, 1, 9, 1, 1) +#define SYS_BRBTGT_EL1(n) sys_reg(2, 1, 8, (n & 15), (((n & 16) >> 2) | 2)) +#define SYS_BRBTGTINJ_EL1 sys_reg(2, 1, 9, 1, 2) +#define SYS_BRBTS_EL1 sys_reg(2, 1, 9, 0, 2) + +#define SYS_BRBCR_EL1 sys_reg(2, 1, 9, 0, 0) +#define SYS_BRBFCR_EL1 sys_reg(2, 1, 9, 0, 1) +#define SYS_BRBIDR0_EL1 sys_reg(2, 1, 9, 2, 0) + +#define SYS_TRCITECR_EL1 sys_reg(3, 0, 1, 2, 3) +#define SYS_TRCACATR(m) sys_reg(2, 1, 2, ((m & 7) << 1), (2 | (m >> 3))) +#define SYS_TRCACVR(m) sys_reg(2, 1, 2, ((m & 7) << 1), (0 | (m >> 3))) +#define SYS_TRCAUTHSTATUS sys_reg(2, 1, 7, 14, 6) +#define SYS_TRCAUXCTLR sys_reg(2, 1, 0, 6, 0) +#define SYS_TRCBBCTLR sys_reg(2, 1, 0, 15, 0) +#define SYS_TRCCCCTLR sys_reg(2, 1, 0, 14, 0) +#define SYS_TRCCIDCCTLR0 sys_reg(2, 1, 3, 0, 2) +#define SYS_TRCCIDCCTLR1 sys_reg(2, 1, 3, 1, 2) +#define SYS_TRCCIDCVR(m) sys_reg(2, 1, 3, ((m & 7) << 1), 0) +#define SYS_TRCCLAIMCLR sys_reg(2, 1, 7, 9, 6) +#define SYS_TRCCLAIMSET sys_reg(2, 1, 7, 8, 6) +#define SYS_TRCCNTCTLR(m) sys_reg(2, 1, 0, (4 | (m & 3)), 5) +#define SYS_TRCCNTRLDVR(m) sys_reg(2, 1, 0, (0 | (m & 3)), 5) +#define SYS_TRCCNTVR(m) sys_reg(2, 1, 0, (8 | (m & 3)), 5) +#define SYS_TRCCONFIGR sys_reg(2, 1, 0, 4, 0) +#define SYS_TRCDEVARCH sys_reg(2, 1, 7, 15, 6) +#define SYS_TRCDEVID sys_reg(2, 1, 7, 2, 7) +#define SYS_TRCEVENTCTL0R sys_reg(2, 1, 0, 8, 0) +#define SYS_TRCEVENTCTL1R sys_reg(2, 1, 0, 9, 0) +#define SYS_TRCEXTINSELR(m) sys_reg(2, 1, 0, (8 | (m & 3)), 4) +#define SYS_TRCIDR0 sys_reg(2, 1, 0, 8, 7) +#define SYS_TRCIDR10 sys_reg(2, 1, 0, 2, 6) +#define SYS_TRCIDR11 sys_reg(2, 1, 0, 3, 6) +#define SYS_TRCIDR12 sys_reg(2, 1, 0, 4, 6) +#define SYS_TRCIDR13 sys_reg(2, 1, 0, 5, 6) +#define SYS_TRCIDR1 sys_reg(2, 1, 0, 9, 7) +#define SYS_TRCIDR2 sys_reg(2, 1, 0, 10, 7) +#define SYS_TRCIDR3 sys_reg(2, 1, 0, 11, 7) +#define SYS_TRCIDR4 sys_reg(2, 1, 0, 12, 7) +#define SYS_TRCIDR5 sys_reg(2, 1, 0, 13, 7) +#define SYS_TRCIDR6 sys_reg(2, 1, 0, 14, 7) +#define SYS_TRCIDR7 sys_reg(2, 1, 0, 15, 7) +#define SYS_TRCIDR8 sys_reg(2, 1, 0, 0, 6) +#define SYS_TRCIDR9 sys_reg(2, 1, 0, 1, 6) +#define SYS_TRCIMSPEC(m) sys_reg(2, 1, 0, (m & 7), 7) +#define SYS_TRCITEEDCR sys_reg(2, 1, 0, 2, 1) +#define SYS_TRCOSLSR sys_reg(2, 1, 1, 1, 4) +#define SYS_TRCPRGCTLR sys_reg(2, 1, 0, 1, 0) +#define SYS_TRCQCTLR sys_reg(2, 1, 0, 1, 1) +#define SYS_TRCRSCTLR(m) sys_reg(2, 1, 1, (m & 15), (0 | (m >> 4))) +#define SYS_TRCRSR sys_reg(2, 1, 0, 10, 0) +#define SYS_TRCSEQEVR(m) sys_reg(2, 1, 0, (m & 3), 4) +#define SYS_TRCSEQRSTEVR sys_reg(2, 1, 0, 6, 4) +#define SYS_TRCSEQSTR sys_reg(2, 1, 0, 7, 4) +#define SYS_TRCSSCCR(m) sys_reg(2, 1, 1, (m & 7), 2) +#define SYS_TRCSSCSR(m) sys_reg(2, 1, 1, (8 | (m & 7)), 2) +#define SYS_TRCSSPCICR(m) sys_reg(2, 1, 1, (m & 7), 3) +#define SYS_TRCSTALLCTLR sys_reg(2, 1, 0, 11, 0) +#define SYS_TRCSTATR sys_reg(2, 1, 0, 3, 0) +#define SYS_TRCSYNCPR sys_reg(2, 1, 0, 13, 0) +#define SYS_TRCTRACEIDR sys_reg(2, 1, 0, 0, 1) +#define SYS_TRCTSCTLR sys_reg(2, 1, 0, 12, 0) +#define SYS_TRCVICTLR sys_reg(2, 1, 0, 0, 2) +#define SYS_TRCVIIECTLR sys_reg(2, 1, 0, 1, 2) +#define SYS_TRCVIPCSSCTLR sys_reg(2, 1, 0, 3, 2) +#define SYS_TRCVISSCTLR sys_reg(2, 1, 0, 2, 2) +#define SYS_TRCVMIDCCTLR0 sys_reg(2, 1, 3, 2, 2) +#define SYS_TRCVMIDCCTLR1 sys_reg(2, 1, 3, 3, 2) +#define SYS_TRCVMIDCVR(m) sys_reg(2, 1, 3, ((m & 7) << 1), 1) + +/* ETM */ +#define SYS_TRCOSLAR sys_reg(2, 1, 1, 0, 4) + +#define SYS_BRBCR_EL2 sys_reg(2, 4, 9, 0, 0) + #define SYS_MIDR_EL1 sys_reg(3, 0, 0, 0, 0) #define SYS_MPIDR_EL1 sys_reg(3, 0, 0, 0, 5) #define SYS_REVIDR_EL1 sys_reg(3, 0, 0, 0, 6) @@ -202,15 +315,38 @@ #define SYS_ERXCTLR_EL1 sys_reg(3, 0, 5, 4, 1) #define SYS_ERXSTATUS_EL1 sys_reg(3, 0, 5, 4, 2) #define SYS_ERXADDR_EL1 sys_reg(3, 0, 5, 4, 3) +#define SYS_ERXPFGF_EL1 sys_reg(3, 0, 5, 4, 4) +#define SYS_ERXPFGCTL_EL1 sys_reg(3, 0, 5, 4, 5) +#define SYS_ERXPFGCDN_EL1 sys_reg(3, 0, 5, 4, 6) #define SYS_ERXMISC0_EL1 sys_reg(3, 0, 5, 5, 0) #define SYS_ERXMISC1_EL1 sys_reg(3, 0, 5, 5, 1) +#define SYS_ERXMISC2_EL1 sys_reg(3, 0, 5, 5, 2) +#define SYS_ERXMISC3_EL1 sys_reg(3, 0, 5, 5, 3) #define SYS_TFSR_EL1 sys_reg(3, 0, 5, 6, 0) #define SYS_TFSRE0_EL1 sys_reg(3, 0, 5, 6, 1) #define SYS_PAR_EL1 sys_reg(3, 0, 7, 4, 0) #define SYS_PAR_EL1_F BIT(0) +/* When PAR_EL1.F == 1 */ #define SYS_PAR_EL1_FST GENMASK(6, 1) +#define SYS_PAR_EL1_PTW BIT(8) +#define SYS_PAR_EL1_S BIT(9) +#define SYS_PAR_EL1_AssuredOnly BIT(12) +#define SYS_PAR_EL1_TopLevel BIT(13) +#define SYS_PAR_EL1_Overlay BIT(14) +#define SYS_PAR_EL1_DirtyBit BIT(15) +#define SYS_PAR_EL1_F1_IMPDEF GENMASK_ULL(63, 48) +#define SYS_PAR_EL1_F1_RES0 (BIT(7) | BIT(10) | GENMASK_ULL(47, 16)) +#define SYS_PAR_EL1_RES1 BIT(11) +/* When PAR_EL1.F == 0 */ +#define SYS_PAR_EL1_SH GENMASK_ULL(8, 7) +#define SYS_PAR_EL1_NS BIT(9) +#define SYS_PAR_EL1_F0_IMPDEF BIT(10) +#define SYS_PAR_EL1_NSE BIT(11) +#define SYS_PAR_EL1_PA GENMASK_ULL(51, 12) +#define SYS_PAR_EL1_ATTR GENMASK_ULL(63, 56) +#define SYS_PAR_EL1_F0_RES0 (GENMASK_ULL(6, 1) | GENMASK_ULL(55, 52)) /*** Statistical Profiling Extension ***/ #define PMSEVFR_EL1_RES0_IMP \ @@ -274,6 +410,8 @@ #define SYS_ICC_IGRPEN0_EL1 sys_reg(3, 0, 12, 12, 6) #define SYS_ICC_IGRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) +#define SYS_ACCDATA_EL1 sys_reg(3, 0, 13, 0, 5) + #define SYS_CNTKCTL_EL1 sys_reg(3, 0, 14, 1, 0) #define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7) @@ -286,7 +424,6 @@ #define SYS_PMCNTENCLR_EL0 sys_reg(3, 3, 9, 12, 2) #define SYS_PMOVSCLR_EL0 sys_reg(3, 3, 9, 12, 3) #define SYS_PMSWINC_EL0 sys_reg(3, 3, 9, 12, 4) -#define SYS_PMSELR_EL0 sys_reg(3, 3, 9, 12, 5) #define SYS_PMCEID0_EL0 sys_reg(3, 3, 9, 12, 6) #define SYS_PMCEID1_EL0 sys_reg(3, 3, 9, 12, 7) #define SYS_PMCCNTR_EL0 sys_reg(3, 3, 9, 13, 0) @@ -369,6 +506,7 @@ #define SYS_SCTLR_EL2 sys_reg(3, 4, 1, 0, 0) #define SYS_ACTLR_EL2 sys_reg(3, 4, 1, 0, 1) +#define SYS_SCTLR2_EL2 sys_reg(3, 4, 1, 0, 3) #define SYS_HCR_EL2 sys_reg(3, 4, 1, 1, 0) #define SYS_MDCR_EL2 sys_reg(3, 4, 1, 1, 1) #define SYS_CPTR_EL2 sys_reg(3, 4, 1, 1, 2) @@ -382,12 +520,15 @@ #define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2) #define SYS_TRFCR_EL2 sys_reg(3, 4, 1, 2, 1) -#define SYS_HDFGRTR_EL2 sys_reg(3, 4, 3, 1, 4) -#define SYS_HDFGWTR_EL2 sys_reg(3, 4, 3, 1, 5) +#define SYS_VNCR_EL2 sys_reg(3, 4, 2, 2, 0) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) #define SYS_ELR_EL2 sys_reg(3, 4, 4, 0, 1) #define SYS_SP_EL1 sys_reg(3, 4, 4, 1, 0) +#define SYS_SPSR_irq sys_reg(3, 4, 4, 3, 0) +#define SYS_SPSR_abt sys_reg(3, 4, 4, 3, 1) +#define SYS_SPSR_und sys_reg(3, 4, 4, 3, 2) +#define SYS_SPSR_fiq sys_reg(3, 4, 4, 3, 3) #define SYS_IFSR32_EL2 sys_reg(3, 4, 5, 0, 1) #define SYS_AFSR0_EL2 sys_reg(3, 4, 5, 1, 0) #define SYS_AFSR1_EL2 sys_reg(3, 4, 5, 1, 1) @@ -449,24 +590,49 @@ #define SYS_CONTEXTIDR_EL2 sys_reg(3, 4, 13, 0, 1) #define SYS_TPIDR_EL2 sys_reg(3, 4, 13, 0, 2) +#define SYS_SCXTNUM_EL2 sys_reg(3, 4, 13, 0, 7) + +#define __AMEV_op2(m) (m & 0x7) +#define __AMEV_CRm(n, m) (n | ((m & 0x8) >> 3)) +#define __SYS__AMEVCNTVOFF0n_EL2(m) sys_reg(3, 4, 13, __AMEV_CRm(0x8, m), __AMEV_op2(m)) +#define SYS_AMEVCNTVOFF0n_EL2(m) __SYS__AMEVCNTVOFF0n_EL2(m) +#define __SYS__AMEVCNTVOFF1n_EL2(m) sys_reg(3, 4, 13, __AMEV_CRm(0xA, m), __AMEV_op2(m)) +#define SYS_AMEVCNTVOFF1n_EL2(m) __SYS__AMEVCNTVOFF1n_EL2(m) #define SYS_CNTVOFF_EL2 sys_reg(3, 4, 14, 0, 3) #define SYS_CNTHCTL_EL2 sys_reg(3, 4, 14, 1, 0) +#define SYS_CNTHP_TVAL_EL2 sys_reg(3, 4, 14, 2, 0) +#define SYS_CNTHP_CTL_EL2 sys_reg(3, 4, 14, 2, 1) +#define SYS_CNTHP_CVAL_EL2 sys_reg(3, 4, 14, 2, 2) +#define SYS_CNTHV_TVAL_EL2 sys_reg(3, 4, 14, 3, 0) +#define SYS_CNTHV_CTL_EL2 sys_reg(3, 4, 14, 3, 1) +#define SYS_CNTHV_CVAL_EL2 sys_reg(3, 4, 14, 3, 2) /* VHE encodings for architectural EL0/1 system registers */ +#define SYS_BRBCR_EL12 sys_reg(2, 5, 9, 0, 0) #define SYS_SCTLR_EL12 sys_reg(3, 5, 1, 0, 0) +#define SYS_CPACR_EL12 sys_reg(3, 5, 1, 0, 2) +#define SYS_SCTLR2_EL12 sys_reg(3, 5, 1, 0, 3) +#define SYS_ZCR_EL12 sys_reg(3, 5, 1, 2, 0) +#define SYS_TRFCR_EL12 sys_reg(3, 5, 1, 2, 1) +#define SYS_SMCR_EL12 sys_reg(3, 5, 1, 2, 6) #define SYS_TTBR0_EL12 sys_reg(3, 5, 2, 0, 0) #define SYS_TTBR1_EL12 sys_reg(3, 5, 2, 0, 1) #define SYS_TCR_EL12 sys_reg(3, 5, 2, 0, 2) +#define SYS_TCR2_EL12 sys_reg(3, 5, 2, 0, 3) #define SYS_SPSR_EL12 sys_reg(3, 5, 4, 0, 0) #define SYS_ELR_EL12 sys_reg(3, 5, 4, 0, 1) #define SYS_AFSR0_EL12 sys_reg(3, 5, 5, 1, 0) #define SYS_AFSR1_EL12 sys_reg(3, 5, 5, 1, 1) #define SYS_ESR_EL12 sys_reg(3, 5, 5, 2, 0) #define SYS_TFSR_EL12 sys_reg(3, 5, 5, 6, 0) +#define SYS_FAR_EL12 sys_reg(3, 5, 6, 0, 0) +#define SYS_PMSCR_EL12 sys_reg(3, 5, 9, 9, 0) #define SYS_MAIR_EL12 sys_reg(3, 5, 10, 2, 0) #define SYS_AMAIR_EL12 sys_reg(3, 5, 10, 3, 0) #define SYS_VBAR_EL12 sys_reg(3, 5, 12, 0, 0) +#define SYS_CONTEXTIDR_EL12 sys_reg(3, 5, 13, 0, 1) +#define SYS_SCXTNUM_EL12 sys_reg(3, 5, 13, 0, 7) #define SYS_CNTKCTL_EL12 sys_reg(3, 5, 14, 1, 0) #define SYS_CNTP_TVAL_EL02 sys_reg(3, 5, 14, 2, 0) #define SYS_CNTP_CTL_EL02 sys_reg(3, 5, 14, 2, 1) @@ -477,6 +643,183 @@ #define SYS_SP_EL2 sys_reg(3, 6, 4, 1, 0) +/* AT instructions */ +#define AT_Op0 1 +#define AT_CRn 7 + +#define OP_AT_S1E1R sys_insn(AT_Op0, 0, AT_CRn, 8, 0) +#define OP_AT_S1E1W sys_insn(AT_Op0, 0, AT_CRn, 8, 1) +#define OP_AT_S1E0R sys_insn(AT_Op0, 0, AT_CRn, 8, 2) +#define OP_AT_S1E0W sys_insn(AT_Op0, 0, AT_CRn, 8, 3) +#define OP_AT_S1E1RP sys_insn(AT_Op0, 0, AT_CRn, 9, 0) +#define OP_AT_S1E1WP sys_insn(AT_Op0, 0, AT_CRn, 9, 1) +#define OP_AT_S1E1A sys_insn(AT_Op0, 0, AT_CRn, 9, 2) +#define OP_AT_S1E2R sys_insn(AT_Op0, 4, AT_CRn, 8, 0) +#define OP_AT_S1E2W sys_insn(AT_Op0, 4, AT_CRn, 8, 1) +#define OP_AT_S12E1R sys_insn(AT_Op0, 4, AT_CRn, 8, 4) +#define OP_AT_S12E1W sys_insn(AT_Op0, 4, AT_CRn, 8, 5) +#define OP_AT_S12E0R sys_insn(AT_Op0, 4, AT_CRn, 8, 6) +#define OP_AT_S12E0W sys_insn(AT_Op0, 4, AT_CRn, 8, 7) +#define OP_AT_S1E2A sys_insn(AT_Op0, 4, AT_CRn, 9, 2) + +/* TLBI instructions */ +#define TLBI_Op0 1 + +#define TLBI_Op1_EL1 0 /* Accessible from EL1 or higher */ +#define TLBI_Op1_EL2 4 /* Accessible from EL2 or higher */ + +#define TLBI_CRn_XS 8 /* Extra Slow (the common one) */ +#define TLBI_CRn_nXS 9 /* not Extra Slow (which nobody uses)*/ + +#define TLBI_CRm_IPAIS 0 /* S2 Inner-Shareable */ +#define TLBI_CRm_nROS 1 /* non-Range, Outer-Sharable */ +#define TLBI_CRm_RIS 2 /* Range, Inner-Sharable */ +#define TLBI_CRm_nRIS 3 /* non-Range, Inner-Sharable */ +#define TLBI_CRm_IPAONS 4 /* S2 Outer and Non-Shareable */ +#define TLBI_CRm_ROS 5 /* Range, Outer-Sharable */ +#define TLBI_CRm_RNS 6 /* Range, Non-Sharable */ +#define TLBI_CRm_nRNS 7 /* non-Range, Non-Sharable */ + +#define OP_TLBI_VMALLE1OS sys_insn(1, 0, 8, 1, 0) +#define OP_TLBI_VAE1OS sys_insn(1, 0, 8, 1, 1) +#define OP_TLBI_ASIDE1OS sys_insn(1, 0, 8, 1, 2) +#define OP_TLBI_VAAE1OS sys_insn(1, 0, 8, 1, 3) +#define OP_TLBI_VALE1OS sys_insn(1, 0, 8, 1, 5) +#define OP_TLBI_VAALE1OS sys_insn(1, 0, 8, 1, 7) +#define OP_TLBI_RVAE1IS sys_insn(1, 0, 8, 2, 1) +#define OP_TLBI_RVAAE1IS sys_insn(1, 0, 8, 2, 3) +#define OP_TLBI_RVALE1IS sys_insn(1, 0, 8, 2, 5) +#define OP_TLBI_RVAALE1IS sys_insn(1, 0, 8, 2, 7) +#define OP_TLBI_VMALLE1IS sys_insn(1, 0, 8, 3, 0) +#define OP_TLBI_VAE1IS sys_insn(1, 0, 8, 3, 1) +#define OP_TLBI_ASIDE1IS sys_insn(1, 0, 8, 3, 2) +#define OP_TLBI_VAAE1IS sys_insn(1, 0, 8, 3, 3) +#define OP_TLBI_VALE1IS sys_insn(1, 0, 8, 3, 5) +#define OP_TLBI_VAALE1IS sys_insn(1, 0, 8, 3, 7) +#define OP_TLBI_RVAE1OS sys_insn(1, 0, 8, 5, 1) +#define OP_TLBI_RVAAE1OS sys_insn(1, 0, 8, 5, 3) +#define OP_TLBI_RVALE1OS sys_insn(1, 0, 8, 5, 5) +#define OP_TLBI_RVAALE1OS sys_insn(1, 0, 8, 5, 7) +#define OP_TLBI_RVAE1 sys_insn(1, 0, 8, 6, 1) +#define OP_TLBI_RVAAE1 sys_insn(1, 0, 8, 6, 3) +#define OP_TLBI_RVALE1 sys_insn(1, 0, 8, 6, 5) +#define OP_TLBI_RVAALE1 sys_insn(1, 0, 8, 6, 7) +#define OP_TLBI_VMALLE1 sys_insn(1, 0, 8, 7, 0) +#define OP_TLBI_VAE1 sys_insn(1, 0, 8, 7, 1) +#define OP_TLBI_ASIDE1 sys_insn(1, 0, 8, 7, 2) +#define OP_TLBI_VAAE1 sys_insn(1, 0, 8, 7, 3) +#define OP_TLBI_VALE1 sys_insn(1, 0, 8, 7, 5) +#define OP_TLBI_VAALE1 sys_insn(1, 0, 8, 7, 7) +#define OP_TLBI_VMALLE1OSNXS sys_insn(1, 0, 9, 1, 0) +#define OP_TLBI_VAE1OSNXS sys_insn(1, 0, 9, 1, 1) +#define OP_TLBI_ASIDE1OSNXS sys_insn(1, 0, 9, 1, 2) +#define OP_TLBI_VAAE1OSNXS sys_insn(1, 0, 9, 1, 3) +#define OP_TLBI_VALE1OSNXS sys_insn(1, 0, 9, 1, 5) +#define OP_TLBI_VAALE1OSNXS sys_insn(1, 0, 9, 1, 7) +#define OP_TLBI_RVAE1ISNXS sys_insn(1, 0, 9, 2, 1) +#define OP_TLBI_RVAAE1ISNXS sys_insn(1, 0, 9, 2, 3) +#define OP_TLBI_RVALE1ISNXS sys_insn(1, 0, 9, 2, 5) +#define OP_TLBI_RVAALE1ISNXS sys_insn(1, 0, 9, 2, 7) +#define OP_TLBI_VMALLE1ISNXS sys_insn(1, 0, 9, 3, 0) +#define OP_TLBI_VAE1ISNXS sys_insn(1, 0, 9, 3, 1) +#define OP_TLBI_ASIDE1ISNXS sys_insn(1, 0, 9, 3, 2) +#define OP_TLBI_VAAE1ISNXS sys_insn(1, 0, 9, 3, 3) +#define OP_TLBI_VALE1ISNXS sys_insn(1, 0, 9, 3, 5) +#define OP_TLBI_VAALE1ISNXS sys_insn(1, 0, 9, 3, 7) +#define OP_TLBI_RVAE1OSNXS sys_insn(1, 0, 9, 5, 1) +#define OP_TLBI_RVAAE1OSNXS sys_insn(1, 0, 9, 5, 3) +#define OP_TLBI_RVALE1OSNXS sys_insn(1, 0, 9, 5, 5) +#define OP_TLBI_RVAALE1OSNXS sys_insn(1, 0, 9, 5, 7) +#define OP_TLBI_RVAE1NXS sys_insn(1, 0, 9, 6, 1) +#define OP_TLBI_RVAAE1NXS sys_insn(1, 0, 9, 6, 3) +#define OP_TLBI_RVALE1NXS sys_insn(1, 0, 9, 6, 5) +#define OP_TLBI_RVAALE1NXS sys_insn(1, 0, 9, 6, 7) +#define OP_TLBI_VMALLE1NXS sys_insn(1, 0, 9, 7, 0) +#define OP_TLBI_VAE1NXS sys_insn(1, 0, 9, 7, 1) +#define OP_TLBI_ASIDE1NXS sys_insn(1, 0, 9, 7, 2) +#define OP_TLBI_VAAE1NXS sys_insn(1, 0, 9, 7, 3) +#define OP_TLBI_VALE1NXS sys_insn(1, 0, 9, 7, 5) +#define OP_TLBI_VAALE1NXS sys_insn(1, 0, 9, 7, 7) +#define OP_TLBI_IPAS2E1IS sys_insn(1, 4, 8, 0, 1) +#define OP_TLBI_RIPAS2E1IS sys_insn(1, 4, 8, 0, 2) +#define OP_TLBI_IPAS2LE1IS sys_insn(1, 4, 8, 0, 5) +#define OP_TLBI_RIPAS2LE1IS sys_insn(1, 4, 8, 0, 6) +#define OP_TLBI_ALLE2OS sys_insn(1, 4, 8, 1, 0) +#define OP_TLBI_VAE2OS sys_insn(1, 4, 8, 1, 1) +#define OP_TLBI_ALLE1OS sys_insn(1, 4, 8, 1, 4) +#define OP_TLBI_VALE2OS sys_insn(1, 4, 8, 1, 5) +#define OP_TLBI_VMALLS12E1OS sys_insn(1, 4, 8, 1, 6) +#define OP_TLBI_RVAE2IS sys_insn(1, 4, 8, 2, 1) +#define OP_TLBI_RVALE2IS sys_insn(1, 4, 8, 2, 5) +#define OP_TLBI_ALLE2IS sys_insn(1, 4, 8, 3, 0) +#define OP_TLBI_VAE2IS sys_insn(1, 4, 8, 3, 1) +#define OP_TLBI_ALLE1IS sys_insn(1, 4, 8, 3, 4) +#define OP_TLBI_VALE2IS sys_insn(1, 4, 8, 3, 5) +#define OP_TLBI_VMALLS12E1IS sys_insn(1, 4, 8, 3, 6) +#define OP_TLBI_IPAS2E1OS sys_insn(1, 4, 8, 4, 0) +#define OP_TLBI_IPAS2E1 sys_insn(1, 4, 8, 4, 1) +#define OP_TLBI_RIPAS2E1 sys_insn(1, 4, 8, 4, 2) +#define OP_TLBI_RIPAS2E1OS sys_insn(1, 4, 8, 4, 3) +#define OP_TLBI_IPAS2LE1OS sys_insn(1, 4, 8, 4, 4) +#define OP_TLBI_IPAS2LE1 sys_insn(1, 4, 8, 4, 5) +#define OP_TLBI_RIPAS2LE1 sys_insn(1, 4, 8, 4, 6) +#define OP_TLBI_RIPAS2LE1OS sys_insn(1, 4, 8, 4, 7) +#define OP_TLBI_RVAE2OS sys_insn(1, 4, 8, 5, 1) +#define OP_TLBI_RVALE2OS sys_insn(1, 4, 8, 5, 5) +#define OP_TLBI_RVAE2 sys_insn(1, 4, 8, 6, 1) +#define OP_TLBI_RVALE2 sys_insn(1, 4, 8, 6, 5) +#define OP_TLBI_ALLE2 sys_insn(1, 4, 8, 7, 0) +#define OP_TLBI_VAE2 sys_insn(1, 4, 8, 7, 1) +#define OP_TLBI_ALLE1 sys_insn(1, 4, 8, 7, 4) +#define OP_TLBI_VALE2 sys_insn(1, 4, 8, 7, 5) +#define OP_TLBI_VMALLS12E1 sys_insn(1, 4, 8, 7, 6) +#define OP_TLBI_IPAS2E1ISNXS sys_insn(1, 4, 9, 0, 1) +#define OP_TLBI_RIPAS2E1ISNXS sys_insn(1, 4, 9, 0, 2) +#define OP_TLBI_IPAS2LE1ISNXS sys_insn(1, 4, 9, 0, 5) +#define OP_TLBI_RIPAS2LE1ISNXS sys_insn(1, 4, 9, 0, 6) +#define OP_TLBI_ALLE2OSNXS sys_insn(1, 4, 9, 1, 0) +#define OP_TLBI_VAE2OSNXS sys_insn(1, 4, 9, 1, 1) +#define OP_TLBI_ALLE1OSNXS sys_insn(1, 4, 9, 1, 4) +#define OP_TLBI_VALE2OSNXS sys_insn(1, 4, 9, 1, 5) +#define OP_TLBI_VMALLS12E1OSNXS sys_insn(1, 4, 9, 1, 6) +#define OP_TLBI_RVAE2ISNXS sys_insn(1, 4, 9, 2, 1) +#define OP_TLBI_RVALE2ISNXS sys_insn(1, 4, 9, 2, 5) +#define OP_TLBI_ALLE2ISNXS sys_insn(1, 4, 9, 3, 0) +#define OP_TLBI_VAE2ISNXS sys_insn(1, 4, 9, 3, 1) +#define OP_TLBI_ALLE1ISNXS sys_insn(1, 4, 9, 3, 4) +#define OP_TLBI_VALE2ISNXS sys_insn(1, 4, 9, 3, 5) +#define OP_TLBI_VMALLS12E1ISNXS sys_insn(1, 4, 9, 3, 6) +#define OP_TLBI_IPAS2E1OSNXS sys_insn(1, 4, 9, 4, 0) +#define OP_TLBI_IPAS2E1NXS sys_insn(1, 4, 9, 4, 1) +#define OP_TLBI_RIPAS2E1NXS sys_insn(1, 4, 9, 4, 2) +#define OP_TLBI_RIPAS2E1OSNXS sys_insn(1, 4, 9, 4, 3) +#define OP_TLBI_IPAS2LE1OSNXS sys_insn(1, 4, 9, 4, 4) +#define OP_TLBI_IPAS2LE1NXS sys_insn(1, 4, 9, 4, 5) +#define OP_TLBI_RIPAS2LE1NXS sys_insn(1, 4, 9, 4, 6) +#define OP_TLBI_RIPAS2LE1OSNXS sys_insn(1, 4, 9, 4, 7) +#define OP_TLBI_RVAE2OSNXS sys_insn(1, 4, 9, 5, 1) +#define OP_TLBI_RVALE2OSNXS sys_insn(1, 4, 9, 5, 5) +#define OP_TLBI_RVAE2NXS sys_insn(1, 4, 9, 6, 1) +#define OP_TLBI_RVALE2NXS sys_insn(1, 4, 9, 6, 5) +#define OP_TLBI_ALLE2NXS sys_insn(1, 4, 9, 7, 0) +#define OP_TLBI_VAE2NXS sys_insn(1, 4, 9, 7, 1) +#define OP_TLBI_ALLE1NXS sys_insn(1, 4, 9, 7, 4) +#define OP_TLBI_VALE2NXS sys_insn(1, 4, 9, 7, 5) +#define OP_TLBI_VMALLS12E1NXS sys_insn(1, 4, 9, 7, 6) + +/* Misc instructions */ +#define OP_GCSPUSHX sys_insn(1, 0, 7, 7, 4) +#define OP_GCSPOPCX sys_insn(1, 0, 7, 7, 5) +#define OP_GCSPOPX sys_insn(1, 0, 7, 7, 6) +#define OP_GCSPUSHM sys_insn(1, 3, 7, 7, 0) + +#define OP_BRB_IALL sys_insn(1, 1, 7, 2, 4) +#define OP_BRB_INJ sys_insn(1, 1, 7, 2, 5) +#define OP_CFP_RCTX sys_insn(1, 3, 7, 3, 4) +#define OP_DVP_RCTX sys_insn(1, 3, 7, 3, 5) +#define OP_COSP_RCTX sys_insn(1, 3, 7, 3, 6) +#define OP_CPP_RCTX sys_insn(1, 3, 7, 3, 7) + /* Common SCTLR_ELx flags. */ #define SCTLR_ELx_ENTP2 (BIT(60)) #define SCTLR_ELx_DSSBS (BIT(44)) @@ -555,16 +898,14 @@ /* Position the attr at the correct index */ #define MAIR_ATTRIDX(attr, idx) ((attr) << ((idx) * 8)) -/* id_aa64pfr0 */ -#define ID_AA64PFR0_EL1_ELx_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL1_ELx_32BIT_64BIT 0x2 - /* id_aa64mmfr0 */ #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN 0x0 +#define ID_AA64MMFR0_EL1_TGRAN4_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX 0x7 #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN 0x1 +#define ID_AA64MMFR0_EL1_TGRAN16_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX 0xf #define ARM64_MIN_PARANGE_BITS 32 @@ -572,6 +913,7 @@ #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN 0x2 +#define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2 0x3 #define ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX 0x7 #ifdef CONFIG_ARM64_PA_BITS_52 @@ -582,11 +924,13 @@ #if defined(CONFIG_ARM64_4K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN4_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN4_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_EL1_TGRAN_SHIFT ID_AA64MMFR0_EL1_TGRAN16_SHIFT +#define ID_AA64MMFR0_EL1_TGRAN_LPA2 ID_AA64MMFR0_EL1_TGRAN16_52_BIT #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_EL1_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX #define ID_AA64MMFR0_EL1_TGRAN_2_SHIFT ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT @@ -610,6 +954,19 @@ #define SYS_GCR_EL1_RRND (BIT(16)) #define SYS_GCR_EL1_EXCL_MASK 0xffffUL +#ifdef CONFIG_KASAN_HW_TAGS +/* + * KASAN always uses a whole byte for its tags. With CONFIG_KASAN_HW_TAGS it + * only uses tags in the range 0xF0-0xFF, which we map to MTE tags 0x0-0xF. + */ +#define __MTE_TAG_MIN (KASAN_TAG_MIN & 0xf) +#define __MTE_TAG_MAX (KASAN_TAG_MAX & 0xf) +#define __MTE_TAG_INCL GENMASK(__MTE_TAG_MAX, __MTE_TAG_MIN) +#define KERNEL_GCR_EL1_EXCL (SYS_GCR_EL1_EXCL_MASK & ~__MTE_TAG_INCL) +#else +#define KERNEL_GCR_EL1_EXCL SYS_GCR_EL1_EXCL_MASK +#endif + #define KERNEL_GCR_EL1 (SYS_GCR_EL1_RRND | KERNEL_GCR_EL1_EXCL) /* RGSR_EL1 Definitions */ @@ -716,6 +1073,22 @@ #define PIRx_ELx_PERM(idx, perm) ((perm) << ((idx) * 4)) +/* + * Permission Overlay Extension (POE) permission encodings. + */ +#define POE_NONE UL(0x0) +#define POE_R UL(0x1) +#define POE_X UL(0x2) +#define POE_RX UL(0x3) +#define POE_W UL(0x4) +#define POE_RW UL(0x5) +#define POE_XW UL(0x6) +#define POE_RXW UL(0x7) +#define POE_MASK UL(0xf) + +/* Initial value for Permission Overlay Extension for EL0 */ +#define POR_EL0_INIT POE_RXW + #define ARM64_FEATURE_FIELD_BITS 4 /* Defined for compatibility only, do not add new users. */ @@ -789,15 +1162,21 @@ /* * For registers without architectural names, or simply unsupported by * GAS. + * + * __check_r forces warnings to be generated by the compiler when + * evaluating r which wouldn't normally happen due to being passed to + * the assembler via __stringify(r). */ #define read_sysreg_s(r) ({ \ u64 __val; \ + u32 __maybe_unused __check_r = (u32)(r); \ asm volatile(__mrs_s("%0", r) : "=r" (__val)); \ __val; \ }) #define write_sysreg_s(v, r) do { \ u64 __val = (u64)(v); \ + u32 __maybe_unused __check_r = (u32)(r); \ asm volatile(__msr_s(r, "%x0") : : "rZ" (__val)); \ } while (0) @@ -827,6 +1206,8 @@ par; \ }) +#define SYS_FIELD_VALUE(reg, field, val) reg##_##field##_##val + #define SYS_FIELD_GET(reg, field, val) \ FIELD_GET(reg##_##field##_MASK, val) @@ -834,7 +1215,8 @@ FIELD_PREP(reg##_##field##_MASK, val) #define SYS_FIELD_PREP_ENUM(reg, field, val) \ - FIELD_PREP(reg##_##field##_MASK, reg##_##field##_##val) + FIELD_PREP(reg##_##field##_MASK, \ + SYS_FIELD_VALUE(reg, field, val)) #endif diff --git a/tools/include/linux/kasan-tags.h b/tools/include/linux/kasan-tags.h new file mode 100644 index 000000000000..4f85f562512c --- /dev/null +++ b/tools/include/linux/kasan-tags.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KASAN_TAGS_H +#define _LINUX_KASAN_TAGS_H + +#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */ +#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ +#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ + +#ifdef CONFIG_KASAN_HW_TAGS +#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */ +#else +#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */ +#endif + +#endif /* LINUX_KASAN_TAGS_H */ From c382ee674c8b5005798606267d660cf995218b18 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Jan 2025 14:24:38 +0000 Subject: [PATCH 78/87] arm64/sysreg/tools: Move TRFCR definitions to sysreg Convert TRFCR to automatic generation. Add separate definitions for ELx and EL2 as TRFCR_EL1 doesn't have CX. This also mirrors the previous definition so no code change is required. Also add TRFCR_EL12 which will start to be used in a later commit. Unfortunately, to avoid breaking the Perf build with duplicate definition errors, the tools copy of the sysreg.h header needs to be updated at the same time rather than the usual second commit. This is because the generated version of sysreg (arch/arm64/include/generated/asm/sysreg-defs.h), is currently shared and tools/ does not have its own copy. Reviewed-by: Mark Brown Signed-off-by: James Clark Signed-off-by: James Clark Link: https://lore.kernel.org/r/20250106142446.628923-4-james.clark@linaro.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/sysreg.h | 12 --------- arch/arm64/tools/sysreg | 36 +++++++++++++++++++++++++++ tools/arch/arm64/include/asm/sysreg.h | 12 --------- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b8303a83c0bf..808f65818b91 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -283,8 +283,6 @@ #define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5) #define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6) -#define SYS_TRFCR_EL1 sys_reg(3, 0, 1, 2, 1) - #define SYS_TCR_EL1 sys_reg(3, 0, 2, 0, 2) #define SYS_APIAKEYLO_EL1 sys_reg(3, 0, 2, 1, 0) @@ -519,7 +517,6 @@ #define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0) #define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2) -#define SYS_TRFCR_EL2 sys_reg(3, 4, 1, 2, 1) #define SYS_VNCR_EL2 sys_reg(3, 4, 2, 2, 0) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) @@ -983,15 +980,6 @@ /* Safe value for MPIDR_EL1: Bit31:RES1, Bit30:U:0, Bit24:MT:0 */ #define SYS_MPIDR_SAFE_VAL (BIT(31)) -#define TRFCR_ELx_TS_SHIFT 5 -#define TRFCR_ELx_TS_MASK ((0x3UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_ELx_TS_VIRTUAL ((0x1UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_ELx_TS_GUEST_PHYSICAL ((0x2UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_ELx_TS_PHYSICAL ((0x3UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_EL2_CX BIT(3) -#define TRFCR_ELx_ExTRE BIT(1) -#define TRFCR_ELx_E0TRE BIT(0) - /* GIC Hypervisor interface registers */ /* ICH_MISR_EL2 bit definitions */ #define ICH_MISR_EOI (1 << 0) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index b081b54d6d22..c28870251220 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1995,6 +1995,22 @@ Sysreg CPACR_EL1 3 0 1 0 2 Fields CPACR_ELx EndSysreg +SysregFields TRFCR_ELx +Res0 63:7 +UnsignedEnum 6:5 TS + 0b0001 VIRTUAL + 0b0010 GUEST_PHYSICAL + 0b0011 PHYSICAL +EndEnum +Res0 4:2 +Field 1 ExTRE +Field 0 E0TRE +EndSysregFields + +Sysreg TRFCR_EL1 3 0 1 2 1 +Fields TRFCR_ELx +EndSysreg + Sysreg SMPRI_EL1 3 0 1 2 4 Res0 63:4 Field 3:0 PRIORITY @@ -2544,6 +2560,22 @@ Field 1 ICIALLU Field 0 ICIALLUIS EndSysreg +Sysreg TRFCR_EL2 3 4 1 2 1 +Res0 63:7 +UnsignedEnum 6:5 TS + 0b0000 USE_TRFCR_EL1_TS + 0b0001 VIRTUAL + 0b0010 GUEST_PHYSICAL + 0b0011 PHYSICAL +EndEnum +Res0 4 +Field 3 CX +Res0 2 +Field 1 E2TRE +Field 0 E0HTRE +EndSysreg + + Sysreg HDFGRTR_EL2 3 4 3 1 4 Field 63 PMBIDR_EL1 Field 62 nPMSNEVFR_EL1 @@ -2954,6 +2986,10 @@ Sysreg ZCR_EL12 3 5 1 2 0 Fields ZCR_ELx EndSysreg +Sysreg TRFCR_EL12 3 5 1 2 1 +Fields TRFCR_ELx +EndSysreg + Sysreg SMCR_EL12 3 5 1 2 6 Fields SMCR_ELx EndSysreg diff --git a/tools/arch/arm64/include/asm/sysreg.h b/tools/arch/arm64/include/asm/sysreg.h index 345e81e0d2b3..150416682e2c 100644 --- a/tools/arch/arm64/include/asm/sysreg.h +++ b/tools/arch/arm64/include/asm/sysreg.h @@ -283,8 +283,6 @@ #define SYS_RGSR_EL1 sys_reg(3, 0, 1, 0, 5) #define SYS_GCR_EL1 sys_reg(3, 0, 1, 0, 6) -#define SYS_TRFCR_EL1 sys_reg(3, 0, 1, 2, 1) - #define SYS_TCR_EL1 sys_reg(3, 0, 2, 0, 2) #define SYS_APIAKEYLO_EL1 sys_reg(3, 0, 2, 1, 0) @@ -519,7 +517,6 @@ #define SYS_VTTBR_EL2 sys_reg(3, 4, 2, 1, 0) #define SYS_VTCR_EL2 sys_reg(3, 4, 2, 1, 2) -#define SYS_TRFCR_EL2 sys_reg(3, 4, 1, 2, 1) #define SYS_VNCR_EL2 sys_reg(3, 4, 2, 2, 0) #define SYS_HAFGRTR_EL2 sys_reg(3, 4, 3, 1, 6) #define SYS_SPSR_EL2 sys_reg(3, 4, 4, 0, 0) @@ -983,15 +980,6 @@ /* Safe value for MPIDR_EL1: Bit31:RES1, Bit30:U:0, Bit24:MT:0 */ #define SYS_MPIDR_SAFE_VAL (BIT(31)) -#define TRFCR_ELx_TS_SHIFT 5 -#define TRFCR_ELx_TS_MASK ((0x3UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_ELx_TS_VIRTUAL ((0x1UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_ELx_TS_GUEST_PHYSICAL ((0x2UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_ELx_TS_PHYSICAL ((0x3UL) << TRFCR_ELx_TS_SHIFT) -#define TRFCR_EL2_CX BIT(3) -#define TRFCR_ELx_ExTRE BIT(1) -#define TRFCR_ELx_E0TRE BIT(0) - /* GIC Hypervisor interface registers */ /* ICH_MISR_EL2 bit definitions */ #define ICH_MISR_EOI (1 << 0) From a2b579c41fe9c295804abd167751f9fdc73c7006 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Jan 2025 14:24:39 +0000 Subject: [PATCH 79/87] coresight: trbe: Remove redundant disable call trbe_drain_and_disable_local() just clears TRBLIMITR and drains. TRBLIMITR is already cleared on the next line after this call, so replace it with only drain. This is so we can make a kvm call that has a preempt enabled warning from set_trbe_disabled() in the next commit, where trbe_reset_local() is called from a preemptible hotplug path. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20250106142446.628923-5-james.clark@linaro.org Signed-off-by: Marc Zyngier --- drivers/hwtracing/coresight/coresight-trbe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 919804b12a67..03d3695ba5aa 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -253,8 +253,8 @@ static void trbe_drain_and_disable_local(struct trbe_cpudata *cpudata) static void trbe_reset_local(struct trbe_cpudata *cpudata) { - trbe_drain_and_disable_local(cpudata); write_sysreg_s(0, SYS_TRBLIMITR_EL1); + trbe_drain_buffer(); write_sysreg_s(0, SYS_TRBPTR_EL1); write_sysreg_s(0, SYS_TRBBASER_EL1); write_sysreg_s(0, SYS_TRBSR_EL1); From a665e3bc88081dd65642d83fc22a1abdb6a901bc Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Jan 2025 14:24:40 +0000 Subject: [PATCH 80/87] KVM: arm64: coresight: Give TRBE enabled state to KVM Currently in nVHE, KVM has to check if TRBE is enabled on every guest switch even if it was never used. Because it's a debug feature and is more likely to not be used than used, give KVM the TRBE buffer status to allow a much simpler and faster do-nothing path in the hyp. Protected mode now disables trace regardless of TRBE (because trfcr_while_in_guest is always 0), which was not previously done. However, it continues to flush whenever the buffer is enabled regardless of the filter status. This avoids the hypothetical case of a host that had disabled the filter but not flushed which would arise if only doing the flush when the filter was enabled. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20250106142446.628923-6-james.clark@linaro.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 9 +++ arch/arm64/kvm/debug.c | 32 +++++++++- arch/arm64/kvm/hyp/nvhe/debug-sr.c | 65 ++++++++++++-------- drivers/hwtracing/coresight/coresight-trbe.c | 3 + 4 files changed, 80 insertions(+), 29 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e7c740c99ee3..fec53d84e990 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -614,6 +614,8 @@ struct kvm_host_data { #define KVM_HOST_DATA_FLAG_HAS_TRBE 1 #define KVM_HOST_DATA_FLAG_HOST_SVE_ENABLED 2 #define KVM_HOST_DATA_FLAG_HOST_SME_ENABLED 3 +#define KVM_HOST_DATA_FLAG_TRBE_ENABLED 4 +#define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED 5 unsigned long flags; struct kvm_cpu_context host_ctxt; @@ -659,6 +661,9 @@ struct kvm_host_data { u64 mdcr_el2; } host_debug_state; + /* Guest trace filter value */ + u64 trfcr_while_in_guest; + /* Number of programmable event counters (PMCR_EL0.N) for this CPU */ unsigned int nr_event_counters; @@ -1381,6 +1386,8 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u64 clr); bool kvm_set_pmuserenr(u64 val); +void kvm_enable_trbe(void); +void kvm_disable_trbe(void); #else static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u64 clr) {} @@ -1388,6 +1395,8 @@ static inline bool kvm_set_pmuserenr(u64 val) { return false; } +static inline void kvm_enable_trbe(void) {} +static inline void kvm_disable_trbe(void) {} #endif void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index d921e7f7bd59..92a24adadb0e 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -81,9 +81,15 @@ void kvm_init_host_debug_data(void) !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P)) host_data_set_flag(HAS_SPE); - if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && - !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) - host_data_set_flag(HAS_TRBE); + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) { + /* Force disable trace in protected mode in case of no TRBE */ + if (is_protected_kvm_enabled()) + host_data_set_flag(EL1_TRACING_CONFIGURED); + + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) && + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P)) + host_data_set_flag(HAS_TRBE); + } } /* @@ -219,3 +225,23 @@ void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) kvm_arch_vcpu_load(vcpu, smp_processor_id()); preempt_enable(); } + +void kvm_enable_trbe(void) +{ + if (has_vhe() || is_protected_kvm_enabled() || + WARN_ON_ONCE(preemptible())) + return; + + host_data_set_flag(TRBE_ENABLED); +} +EXPORT_SYMBOL_GPL(kvm_enable_trbe); + +void kvm_disable_trbe(void) +{ + if (has_vhe() || is_protected_kvm_enabled() || + WARN_ON_ONCE(preemptible())) + return; + + host_data_clear_flag(TRBE_ENABLED); +} +EXPORT_SYMBOL_GPL(kvm_disable_trbe); diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 858bb38e273f..2f4a4f5036bb 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -51,32 +51,45 @@ static void __debug_restore_spe(u64 pmscr_el1) write_sysreg_el1(pmscr_el1, SYS_PMSCR); } -static void __debug_save_trace(u64 *trfcr_el1) +static void __trace_do_switch(u64 *saved_trfcr, u64 new_trfcr) { - *trfcr_el1 = 0; - - /* Check if the TRBE is enabled */ - if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E)) - return; - /* - * Prohibit trace generation while we are in guest. - * Since access to TRFCR_EL1 is trapped, the guest can't - * modify the filtering set by the host. - */ - *trfcr_el1 = read_sysreg_el1(SYS_TRFCR); - write_sysreg_el1(0, SYS_TRFCR); - isb(); - /* Drain the trace buffer to memory */ - tsb_csync(); + *saved_trfcr = read_sysreg_el1(SYS_TRFCR); + write_sysreg_el1(new_trfcr, SYS_TRFCR); } -static void __debug_restore_trace(u64 trfcr_el1) +static bool __trace_needs_drain(void) { - if (!trfcr_el1) - return; + if (is_protected_kvm_enabled() && host_data_test_flag(HAS_TRBE)) + return read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_EL1_E; - /* Restore trace filter controls */ - write_sysreg_el1(trfcr_el1, SYS_TRFCR); + return host_data_test_flag(TRBE_ENABLED); +} + +static bool __trace_needs_switch(void) +{ + return host_data_test_flag(TRBE_ENABLED) || + host_data_test_flag(EL1_TRACING_CONFIGURED); +} + +static void __trace_switch_to_guest(void) +{ + /* Unsupported with TRBE so disable */ + if (host_data_test_flag(TRBE_ENABLED)) + *host_data_ptr(trfcr_while_in_guest) = 0; + + __trace_do_switch(host_data_ptr(host_debug_state.trfcr_el1), + *host_data_ptr(trfcr_while_in_guest)); + + if (__trace_needs_drain()) { + isb(); + tsb_csync(); + } +} + +static void __trace_switch_to_host(void) +{ + __trace_do_switch(host_data_ptr(trfcr_while_in_guest), + *host_data_ptr(host_debug_state.trfcr_el1)); } void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) @@ -84,9 +97,9 @@ void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) /* Disable and flush SPE data generation */ if (host_data_test_flag(HAS_SPE)) __debug_save_spe(host_data_ptr(host_debug_state.pmscr_el1)); - /* Disable and flush Self-Hosted Trace generation */ - if (host_data_test_flag(HAS_TRBE)) - __debug_save_trace(host_data_ptr(host_debug_state.trfcr_el1)); + + if (__trace_needs_switch()) + __trace_switch_to_guest(); } void __debug_switch_to_guest(struct kvm_vcpu *vcpu) @@ -98,8 +111,8 @@ void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) { if (host_data_test_flag(HAS_SPE)) __debug_restore_spe(*host_data_ptr(host_debug_state.pmscr_el1)); - if (host_data_test_flag(HAS_TRBE)) - __debug_restore_trace(*host_data_ptr(host_debug_state.trfcr_el1)); + if (__trace_needs_switch()) + __trace_switch_to_host(); } void __debug_switch_to_host(struct kvm_vcpu *vcpu) diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index 03d3695ba5aa..a728802d2206 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -17,6 +17,7 @@ #include #include +#include #include #include "coresight-self-hosted-trace.h" @@ -221,6 +222,7 @@ static inline void set_trbe_enabled(struct trbe_cpudata *cpudata, u64 trblimitr) */ trblimitr |= TRBLIMITR_EL1_E; write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1); + kvm_enable_trbe(); /* Synchronize the TRBE enable event */ isb(); @@ -239,6 +241,7 @@ static inline void set_trbe_disabled(struct trbe_cpudata *cpudata) */ trblimitr &= ~TRBLIMITR_EL1_E; write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1); + kvm_disable_trbe(); if (trbe_needs_drain_after_disable(cpudata)) trbe_drain_buffer(); From 054b88391bbe2e470c5484cb91622238314344fb Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Jan 2025 14:24:41 +0000 Subject: [PATCH 81/87] KVM: arm64: Support trace filtering for guests For nVHE, switch the filter value in and out if the Coresight driver asks for it. This will support filters for guests when sinks other than TRBE are used. For VHE, just write the filter directly to TRFCR_EL1 where trace can be used even with TRBE sinks. Signed-off-by: James Clark Link: https://lore.kernel.org/r/20250106142446.628923-7-james.clark@linaro.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/debug.c | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index fec53d84e990..05931a846e4e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1388,6 +1388,7 @@ void kvm_clr_pmu_events(u64 clr); bool kvm_set_pmuserenr(u64 val); void kvm_enable_trbe(void); void kvm_disable_trbe(void); +void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest); #else static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} static inline void kvm_clr_pmu_events(u64 clr) {} @@ -1397,6 +1398,7 @@ static inline bool kvm_set_pmuserenr(u64 val) } static inline void kvm_enable_trbe(void) {} static inline void kvm_disable_trbe(void) {} +static inline void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) {} #endif void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 92a24adadb0e..0e4c805e7e89 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -245,3 +245,21 @@ void kvm_disable_trbe(void) host_data_clear_flag(TRBE_ENABLED); } EXPORT_SYMBOL_GPL(kvm_disable_trbe); + +void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) +{ + if (is_protected_kvm_enabled() || WARN_ON_ONCE(preemptible())) + return; + + if (has_vhe()) { + write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12); + return; + } + + *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest; + if (read_sysreg_s(SYS_TRFCR_EL1) != trfcr_while_in_guest) + host_data_set_flag(EL1_TRACING_CONFIGURED); + else + host_data_clear_flag(EL1_TRACING_CONFIGURED); +} +EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration); From aaf69eff6cdb8613ff1f6a520821f769dc92f969 Mon Sep 17 00:00:00 2001 From: James Clark Date: Mon, 6 Jan 2025 14:24:42 +0000 Subject: [PATCH 82/87] coresight: Pass guest TRFCR value to KVM Currently the userspace and kernel filters for guests are never set, so no trace will be generated for them. Add support for tracing guests by passing the desired TRFCR value to KVM so it can be applied to the guest. By writing either E1TRE or E0TRE, filtering on either guest kernel or guest userspace is also supported. And if both E1TRE and E0TRE are cleared when exclude_guest is set, that option is supported too. This change also brings exclude_host support which is difficult to add as a separate commit without excess churn and resulting in no trace at all. cpu_prohibit_trace() gets moved to TRBE because the ETM driver doesn't need the read, it already has the base TRFCR value. TRBE only needs the read to disable it and then restore. Testing ======= The addresses were counted with the following: $ perf report -D | grep -Eo 'EL2|EL1|EL0' | sort | uniq -c Guest kernel only: $ perf record -e cs_etm//Gk -a -- true 535 EL1 1 EL2 Guest user only (only 5 addresses because the guest runs slowly in the model): $ perf record -e cs_etm//Gu -a -- true 5 EL0 Host kernel only: $ perf record -e cs_etm//Hk -a -- true 3501 EL2 Host userspace only: $ perf record -e cs_etm//Hu -a -- true 408 EL0 1 EL2 Signed-off-by: James Clark Link: https://lore.kernel.org/r/20250106142446.628923-8-james.clark@linaro.org Signed-off-by: Marc Zyngier --- .../coresight/coresight-etm4x-core.c | 49 ++++++++++++++++--- drivers/hwtracing/coresight/coresight-etm4x.h | 2 +- drivers/hwtracing/coresight/coresight-priv.h | 3 ++ .../coresight/coresight-self-hosted-trace.h | 9 ---- drivers/hwtracing/coresight/coresight-trbe.c | 10 ++++ 5 files changed, 56 insertions(+), 17 deletions(-) diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index dd8c74f893db..fbc4aa378527 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -268,10 +269,28 @@ struct etm4_enable_arg { */ static void etm4x_prohibit_trace(struct etmv4_drvdata *drvdata) { + u64 trfcr; + /* If the CPU doesn't support FEAT_TRF, nothing to do */ if (!drvdata->trfcr) return; - cpu_prohibit_trace(); + + trfcr = drvdata->trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE); + + write_trfcr(trfcr); + kvm_tracing_set_el1_configuration(trfcr); +} + +static u64 etm4x_get_kern_user_filter(struct etmv4_drvdata *drvdata) +{ + u64 trfcr = drvdata->trfcr; + + if (drvdata->config.mode & ETM_MODE_EXCL_KERN) + trfcr &= ~TRFCR_ELx_ExTRE; + if (drvdata->config.mode & ETM_MODE_EXCL_USER) + trfcr &= ~TRFCR_ELx_E0TRE; + + return trfcr; } /* @@ -286,18 +305,28 @@ static void etm4x_prohibit_trace(struct etmv4_drvdata *drvdata) */ static void etm4x_allow_trace(struct etmv4_drvdata *drvdata) { - u64 trfcr = drvdata->trfcr; + u64 trfcr, guest_trfcr; /* If the CPU doesn't support FEAT_TRF, nothing to do */ - if (!trfcr) + if (!drvdata->trfcr) return; - if (drvdata->config.mode & ETM_MODE_EXCL_KERN) - trfcr &= ~TRFCR_ELx_ExTRE; - if (drvdata->config.mode & ETM_MODE_EXCL_USER) - trfcr &= ~TRFCR_ELx_E0TRE; + if (drvdata->config.mode & ETM_MODE_EXCL_HOST) + trfcr = drvdata->trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE); + else + trfcr = etm4x_get_kern_user_filter(drvdata); write_trfcr(trfcr); + + /* Set filters for guests and pass to KVM */ + if (drvdata->config.mode & ETM_MODE_EXCL_GUEST) + guest_trfcr = drvdata->trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE); + else + guest_trfcr = etm4x_get_kern_user_filter(drvdata); + + /* TRFCR_EL1 doesn't have CX so mask it out. */ + guest_trfcr &= ~TRFCR_EL2_CX; + kvm_tracing_set_el1_configuration(guest_trfcr); } #ifdef CONFIG_ETM4X_IMPDEF_FEATURE @@ -655,6 +684,12 @@ static int etm4_parse_event_config(struct coresight_device *csdev, if (attr->exclude_user) config->mode = ETM_MODE_EXCL_USER; + if (attr->exclude_host) + config->mode |= ETM_MODE_EXCL_HOST; + + if (attr->exclude_guest) + config->mode |= ETM_MODE_EXCL_GUEST; + /* Always start from the default config */ etm4_set_default_config(config); diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h index 9e9165f62e81..1119762b5cec 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.h +++ b/drivers/hwtracing/coresight/coresight-etm4x.h @@ -817,7 +817,7 @@ enum etm_impdef_type { * @s_ex_level: Secure ELs where tracing is supported. */ struct etmv4_config { - u32 mode; + u64 mode; u32 pe_sel; u32 cfg; u32 eventctrl0; diff --git a/drivers/hwtracing/coresight/coresight-priv.h b/drivers/hwtracing/coresight/coresight-priv.h index 05f891ca6b5c..76403530f33e 100644 --- a/drivers/hwtracing/coresight/coresight-priv.h +++ b/drivers/hwtracing/coresight/coresight-priv.h @@ -42,6 +42,9 @@ extern const struct device_type coresight_dev_type[]; #define ETM_MODE_EXCL_KERN BIT(30) #define ETM_MODE_EXCL_USER BIT(31) +#define ETM_MODE_EXCL_HOST BIT(32) +#define ETM_MODE_EXCL_GUEST BIT(33) + struct cs_pair_attribute { struct device_attribute attr; u32 lo_off; diff --git a/drivers/hwtracing/coresight/coresight-self-hosted-trace.h b/drivers/hwtracing/coresight/coresight-self-hosted-trace.h index 53840a2c41f2..303d71911870 100644 --- a/drivers/hwtracing/coresight/coresight-self-hosted-trace.h +++ b/drivers/hwtracing/coresight/coresight-self-hosted-trace.h @@ -21,13 +21,4 @@ static inline void write_trfcr(u64 val) isb(); } -static inline u64 cpu_prohibit_trace(void) -{ - u64 trfcr = read_trfcr(); - - /* Prohibit tracing at EL0 & the kernel EL */ - write_trfcr(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE)); - /* Return the original value of the TRFCR */ - return trfcr; -} #endif /* __CORESIGHT_SELF_HOSTED_TRACE_H */ diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index a728802d2206..d6eb0d525a4d 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -1113,6 +1113,16 @@ static bool is_perf_trbe(struct perf_output_handle *handle) return true; } +static u64 cpu_prohibit_trace(void) +{ + u64 trfcr = read_trfcr(); + + /* Prohibit tracing at EL0 & the kernel EL */ + write_trfcr(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE)); + /* Return the original value of the TRFCR */ + return trfcr; +} + static irqreturn_t arm_trbe_irq_handler(int irq, void *dev) { struct perf_output_handle **handle_ptr = dev; From 9fb4267a759ce50e633a56df6dfdb32bafc24203 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 13 Jan 2025 15:19:52 +0000 Subject: [PATCH 83/87] KVM: arm64: Fix selftests after sysreg field name update Fix KVM selftests that check for EL0's 64bit-ness, and use a now removed definition. Kindly point them at the new one. Reported-by: Mark Brown Signed-off-by: Marc Zyngier --- tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c | 2 +- tools/testing/selftests/kvm/aarch64/set_id_regs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c index 8e5bd07a3727..bd182be48c1b 100644 --- a/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/aarch32_id_regs.c @@ -147,7 +147,7 @@ static bool vcpu_aarch64_only(struct kvm_vcpu *vcpu) vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); - return el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY; + return el0 == ID_AA64PFR0_EL1_EL0_IMP; } int main(void) diff --git a/tools/testing/selftests/kvm/aarch64/set_id_regs.c b/tools/testing/selftests/kvm/aarch64/set_id_regs.c index a79b7f18452d..88f506c45112 100644 --- a/tools/testing/selftests/kvm/aarch64/set_id_regs.c +++ b/tools/testing/selftests/kvm/aarch64/set_id_regs.c @@ -667,7 +667,7 @@ int main(void) /* Check for AARCH64 only system */ vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_ID_AA64PFR0_EL1), &val); el0 = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), val); - aarch64_only = (el0 == ID_AA64PFR0_EL1_ELx_64BIT_ONLY); + aarch64_only = (el0 == ID_AA64PFR0_EL1_EL0_IMP); ksft_print_header(); From c139b6d1b4d27724987af5071177fb5f3d60c1e4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 12 Jan 2025 16:50:28 +0000 Subject: [PATCH 84/87] KVM: arm64: nv: Always evaluate HCR_EL2 using sanitising accessors A lot of the NV code depends on HCR_EL2.{E2H,TGE}, and we assume in places that at least HCR_EL2.E2H is invariant for a given guest. However, we make a point in *not* using the sanitising accessor that would enforce this, and are at the mercy of the guest doing stupid things. Clearly, that's not good. Rework the HCR_EL2 accessors to use __vcpu_sys_reg() instead, guaranteeing that the RESx settings get applied, specially when HCR_EL2.E2H is evaluated. This results in fewer accessors overall. Huge thanks to Joey who spent a long time tracking this bug down. Reported-by: Joey Gouly Tested-by: Joey Gouly Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250112165029.1181056-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 38 ++++++++++++---------------- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 4 +-- 2 files changed, 18 insertions(+), 24 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index cf811009a33c..240b479b15c3 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -184,29 +184,30 @@ static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu) return vcpu_is_el2_ctxt(&vcpu->arch.ctxt); } -static inline bool __vcpu_el2_e2h_is_set(const struct kvm_cpu_context *ctxt) -{ - return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) || - (ctxt_sys_reg(ctxt, HCR_EL2) & HCR_E2H)); -} - static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu) { - return __vcpu_el2_e2h_is_set(&vcpu->arch.ctxt); -} - -static inline bool __vcpu_el2_tge_is_set(const struct kvm_cpu_context *ctxt) -{ - return ctxt_sys_reg(ctxt, HCR_EL2) & HCR_TGE; + return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) || + (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_E2H)); } static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu) { - return __vcpu_el2_tge_is_set(&vcpu->arch.ctxt); + return ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE; } -static inline bool __is_hyp_ctxt(const struct kvm_cpu_context *ctxt) +static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) { + bool e2h, tge; + u64 hcr; + + if (!vcpu_has_nv(vcpu)) + return false; + + hcr = __vcpu_sys_reg(vcpu, HCR_EL2); + + e2h = (hcr & HCR_E2H); + tge = (hcr & HCR_TGE); + /* * We are in a hypervisor context if the vcpu mode is EL2 or * E2H and TGE bits are set. The latter means we are in the user space @@ -215,14 +216,7 @@ static inline bool __is_hyp_ctxt(const struct kvm_cpu_context *ctxt) * Note that the HCR_EL2.{E2H,TGE}={0,1} isn't really handled in the * rest of the KVM code, and will result in a misbehaving guest. */ - return vcpu_is_el2_ctxt(ctxt) || - (__vcpu_el2_e2h_is_set(ctxt) && __vcpu_el2_tge_is_set(ctxt)) || - __vcpu_el2_tge_is_set(ctxt); -} - -static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu) -{ - return vcpu_has_nv(vcpu) && __is_hyp_ctxt(&vcpu->arch.ctxt); + return vcpu_is_el2(vcpu) || (e2h && tge) || tge; } static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 5f78a39053a7..90b018e06f2c 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -216,7 +216,7 @@ void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_user_state(guest_ctxt); - if (unlikely(__is_hyp_ctxt(guest_ctxt))) { + if (unlikely(is_hyp_ctxt(vcpu))) { __sysreg_restore_vel2_state(vcpu); } else { if (vcpu_has_nv(vcpu)) { @@ -260,7 +260,7 @@ void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); - if (unlikely(__is_hyp_ctxt(guest_ctxt))) + if (unlikely(is_hyp_ctxt(vcpu))) __sysreg_save_vel2_state(vcpu); else __sysreg_save_el1_state(guest_ctxt); From 36f998de853cfad60508dfdfb41c9c40a2245f19 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 12 Jan 2025 16:50:29 +0000 Subject: [PATCH 85/87] KVM: arm64: nv: Apply RESx settings to sysreg reset values While we have sanitisation in place for the guest sysregs, we lack that sanitisation out of reset. So some of the fields could be evaluated and not reflect their RESx status, which sounds like a very bad idea. Apply the RESx masks to the the sysreg file in two situations: - when going via a reset of the sysregs - after having computed the RESx masks Having this separate reset phase from the actual reset handling is a bit grotty, but we need to apply this after the ID registers are final. Tested-by: Joey Gouly Reviewed-by: Joey Gouly Link: https://lore.kernel.org/r/20250112165029.1181056-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_nested.h | 2 +- arch/arm64/kvm/nested.c | 9 +++++++-- arch/arm64/kvm/sys_regs.c | 5 ++++- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index 233e65522716..f616e25e204f 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -186,7 +186,7 @@ static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr) return true; } -int kvm_init_nv_sysregs(struct kvm *kvm); +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu); #ifdef CONFIG_ARM64_PTR_AUTH bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 9b36218b48de..dd6480cf90ea 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -963,14 +963,15 @@ static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, kvm->arch.sysreg_masks->mask[i].res1 = res1; } -int kvm_init_nv_sysregs(struct kvm *kvm) +int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; u64 res0, res1; lockdep_assert_held(&kvm->arch.config_lock); if (kvm->arch.sysreg_masks) - return 0; + goto out; kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)), GFP_KERNEL_ACCOUNT); @@ -1271,6 +1272,10 @@ int kvm_init_nv_sysregs(struct kvm *kvm) res0 |= MDCR_EL2_EnSTEPOP; set_sysreg_masks(kvm, MDCR_EL2, res0, res1); +out: + for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) + (void)__vcpu_sys_reg(vcpu, sr); + return 0; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 83c6b4a07ef5..8671d46f53e5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -4396,6 +4396,9 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) reset_vcpu_ftr_id_reg(vcpu, r); else r->reset(vcpu, r); + + if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) + (void)__vcpu_sys_reg(vcpu, r->reg); } set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); @@ -4999,7 +5002,7 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu) } if (vcpu_has_nv(vcpu)) { - int ret = kvm_init_nv_sysregs(kvm); + int ret = kvm_init_nv_sysregs(vcpu); if (ret) return ret; } From 544786361d4b73905b05b9539c2bf401c533f0d6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 16 Jan 2025 10:27:10 +0000 Subject: [PATCH 86/87] KVM: arm64: nv: Fix doc header layout for timers Stephen reports that 'make htmldocs' spits out a warning ("Documentation/virt/kvm/devices/vcpu.rst:147: WARNING: Definition list ends without a blank line; unexpected unindent."). Fix it by keeping all the timer attributes on a single line. Reported-by: Stephen Rothwell Signed-off-by: Marc Zyngier --- Documentation/virt/kvm/devices/vcpu.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/virt/kvm/devices/vcpu.rst b/Documentation/virt/kvm/devices/vcpu.rst index d62ba86ee166..31a9576c07af 100644 --- a/Documentation/virt/kvm/devices/vcpu.rst +++ b/Documentation/virt/kvm/devices/vcpu.rst @@ -142,9 +142,8 @@ the cpu field to the processor id. :Architectures: ARM64 -2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_VTIMER, KVM_ARM_VCPU_TIMER_IRQ_PTIMER, - KVM_ARM_VCPU_TIMER_IRQ_HVTIMER, KVM_ARM_VCPU_TIMER_IRQ_HPTIMER, --------------------------------------------------------------------------------- +2.1. ATTRIBUTES: KVM_ARM_VCPU_TIMER_IRQ_{VTIMER,PTIMER,HVTIMER,HPTIMER} +----------------------------------------------------------------------- :Parameters: in kvm_device_attr.addr the address for the timer interrupt is a pointer to an int From 01009b06a6b52d8439c55b530633a971c13b6cb2 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 12 Jan 2025 13:08:59 +0000 Subject: [PATCH 87/87] arm64/sysreg: Get rid of TRFCR_ELx SysregFields There is no such thing as TRFCR_ELx in the architecture. What we have is TRFCR_EL1, for which TRFCR_EL12 is an accessor. Rename TRFCR_ELx_* to TRFCR_EL1_*, and fix the bit of code using these names. Similarly, TRFCR_EL12 is redefined as a mapping to TRFCR_EL1. Reviewed-by: James Clark Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/87cygsqgkh.wl-maz@kernel.org Cc: Suzuki K Poulose Cc: Mark Brown Cc: Will Deacon Cc: Catalin Marinas --- arch/arm64/tools/sysreg | 8 ++------ .../hwtracing/coresight/coresight-etm4x-core.c | 16 ++++++++-------- .../hwtracing/coresight/coresight-etm4x-sysfs.c | 10 +++++----- drivers/hwtracing/coresight/coresight-trbe.c | 2 +- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 4c2c9c6767c9..ac5202e1df86 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1999,7 +1999,7 @@ Field 17:16 ZEN Res0 15:0 EndSysreg -SysregFields TRFCR_ELx +Sysreg TRFCR_EL1 3 0 1 2 1 Res0 63:7 UnsignedEnum 6:5 TS 0b0001 VIRTUAL @@ -2011,10 +2011,6 @@ Field 1 ExTRE Field 0 E0TRE EndSysregFields -Sysreg TRFCR_EL1 3 0 1 2 1 -Fields TRFCR_ELx -EndSysreg - Sysreg SMPRI_EL1 3 0 1 2 4 Res0 63:4 Field 3:0 PRIORITY @@ -2991,7 +2987,7 @@ Mapping ZCR_EL1 EndSysreg Sysreg TRFCR_EL12 3 5 1 2 1 -Fields TRFCR_ELx +Mapping TRFCR_EL1 EndSysreg Sysreg SMCR_EL12 3 5 1 2 6 diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index fbc4aa378527..2c1a60577728 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -275,7 +275,7 @@ static void etm4x_prohibit_trace(struct etmv4_drvdata *drvdata) if (!drvdata->trfcr) return; - trfcr = drvdata->trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE); + trfcr = drvdata->trfcr & ~(TRFCR_EL1_ExTRE | TRFCR_EL1_E0TRE); write_trfcr(trfcr); kvm_tracing_set_el1_configuration(trfcr); @@ -286,9 +286,9 @@ static u64 etm4x_get_kern_user_filter(struct etmv4_drvdata *drvdata) u64 trfcr = drvdata->trfcr; if (drvdata->config.mode & ETM_MODE_EXCL_KERN) - trfcr &= ~TRFCR_ELx_ExTRE; + trfcr &= ~TRFCR_EL1_ExTRE; if (drvdata->config.mode & ETM_MODE_EXCL_USER) - trfcr &= ~TRFCR_ELx_E0TRE; + trfcr &= ~TRFCR_EL1_E0TRE; return trfcr; } @@ -312,7 +312,7 @@ static void etm4x_allow_trace(struct etmv4_drvdata *drvdata) return; if (drvdata->config.mode & ETM_MODE_EXCL_HOST) - trfcr = drvdata->trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE); + trfcr = drvdata->trfcr & ~(TRFCR_EL1_ExTRE | TRFCR_EL1_E0TRE); else trfcr = etm4x_get_kern_user_filter(drvdata); @@ -320,7 +320,7 @@ static void etm4x_allow_trace(struct etmv4_drvdata *drvdata) /* Set filters for guests and pass to KVM */ if (drvdata->config.mode & ETM_MODE_EXCL_GUEST) - guest_trfcr = drvdata->trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE); + guest_trfcr = drvdata->trfcr & ~(TRFCR_EL1_ExTRE | TRFCR_EL1_E0TRE); else guest_trfcr = etm4x_get_kern_user_filter(drvdata); @@ -1176,9 +1176,9 @@ static void cpu_detect_trace_filtering(struct etmv4_drvdata *drvdata) * tracing at the kernel EL and EL0, forcing to use the * virtual time as the timestamp. */ - trfcr = (TRFCR_ELx_TS_VIRTUAL | - TRFCR_ELx_ExTRE | - TRFCR_ELx_E0TRE); + trfcr = (TRFCR_EL1_TS_VIRTUAL | + TRFCR_EL1_ExTRE | + TRFCR_EL1_E0TRE); /* If we are running at EL2, allow tracing the CONTEXTIDR_EL2. */ if (is_kernel_in_hyp_mode()) diff --git a/drivers/hwtracing/coresight/coresight-etm4x-sysfs.c b/drivers/hwtracing/coresight/coresight-etm4x-sysfs.c index a9f19629f3f8..c767f8ae4cf1 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-sysfs.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-sysfs.c @@ -2319,11 +2319,11 @@ static ssize_t ts_source_show(struct device *dev, goto out; } - switch (drvdata->trfcr & TRFCR_ELx_TS_MASK) { - case TRFCR_ELx_TS_VIRTUAL: - case TRFCR_ELx_TS_GUEST_PHYSICAL: - case TRFCR_ELx_TS_PHYSICAL: - val = FIELD_GET(TRFCR_ELx_TS_MASK, drvdata->trfcr); + switch (drvdata->trfcr & TRFCR_EL1_TS_MASK) { + case TRFCR_EL1_TS_VIRTUAL: + case TRFCR_EL1_TS_GUEST_PHYSICAL: + case TRFCR_EL1_TS_PHYSICAL: + val = FIELD_GET(TRFCR_EL1_TS_MASK, drvdata->trfcr); break; default: val = -1; diff --git a/drivers/hwtracing/coresight/coresight-trbe.c b/drivers/hwtracing/coresight/coresight-trbe.c index d6eb0d525a4d..fff67aac8418 100644 --- a/drivers/hwtracing/coresight/coresight-trbe.c +++ b/drivers/hwtracing/coresight/coresight-trbe.c @@ -1118,7 +1118,7 @@ static u64 cpu_prohibit_trace(void) u64 trfcr = read_trfcr(); /* Prohibit tracing at EL0 & the kernel EL */ - write_trfcr(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE)); + write_trfcr(trfcr & ~(TRFCR_EL1_ExTRE | TRFCR_EL1_E0TRE)); /* Return the original value of the TRFCR */ return trfcr; }