* Single fix for broken usage of 'multi-MIDR' infrastructure in PI
   code, adding an open-coded erratum check for everyone's favorite pile
   of sand: Cavium ThunderX
 
 x86:
 
 * Bugfixes from a planned posted interrupt rework
 
 * Do not use kvm_rip_read() unconditionally to cater for guests
   with inaccessible register state.
 -----BEGIN PGP SIGNATURE-----
 
 iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmgKdOQUHHBib256aW5p
 QHJlZGhhdC5jb20ACgkQv/vSX3jHroMLnAf+KZLEzeeQ7D29licBu8xg9dsiT+p1
 A/wGv+LxA7EzGjWLoX1SVWIaziWfThK9FZaOtSclOoE4e4LHx+m69xuFJTzqiodz
 P6oHDjwUBSclBW05bfW3WNsaa+kGjvoiAtyEmTni/IcMJQs7BSjR/Yh1Jhgi8Ozi
 K8lwUH8BLRpqsXMI++c7IRAimqqecEt2oaf3U22nu3wz7EmAmsW8zlZjoAUD/xKc
 uiKG5WeTx30TKH8YlSXLrNaB13qEUXzdqSeHPEXoTNid8u3ySIPBv1URamQKqEP2
 caKZfPPhIDomeEsDobzKdsXVTZ9Pl9/lDxQ7wyMjnJ7ga3cCOKpy1GNfTQ==
 =5EpY
 -----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "ARM:

   - Single fix for broken usage of 'multi-MIDR' infrastructure in PI
     code, adding an open-coded erratum check for everyone's favorite
     pile of sand: Cavium ThunderX

  x86:

   - Bugfixes from a planned posted interrupt rework

   - Do not use kvm_rip_read() unconditionally to cater for guests with
     inaccessible register state"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: x86: Do not use kvm_rip_read() unconditionally for KVM_PROFILING
  KVM: x86: Do not use kvm_rip_read() unconditionally in KVM tracepoints
  KVM: SVM: WARN if an invalid posted interrupt IRTE entry is added
  iommu/amd: WARN if KVM attempts to set vCPU affinity without posted intrrupts
  iommu/amd: Return an error if vCPU affinity is set for non-vCPU IRTE
  KVM: x86: Take irqfds.lock when adding/deleting IRQ bypass producer
  KVM: x86: Explicitly treat routing entry type changes as changes
  KVM: x86: Reset IRTE to host control if *new* route isn't postable
  KVM: SVM: Allocate IR data using atomic allocation
  KVM: SVM: Don't update IRTEs if APICv/AVIC is disabled
  KVM: arm64, x86: make kvm_arch_has_irq_bypass() inline
  arm64: Rework checks for broken Cavium HW in the PI code
This commit is contained in:
Linus Torvalds 2025-04-25 12:00:56 -07:00
commit c405e182ea
12 changed files with 116 additions and 94 deletions

View File

@ -1588,4 +1588,9 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);
#define kvm_has_s1poe(k) \
(kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP))
static inline bool kvm_arch_has_irq_bypass(void)
{
return true;
}
#endif /* __ARM64_KVM_HOST_H__ */

View File

@ -94,17 +94,6 @@ static inline bool kaslr_requires_kpti(void)
return false;
}
/*
* Systems affected by Cavium erratum 24756 are incompatible
* with KPTI.
*/
if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
extern const struct midr_range cavium_erratum_27456_cpus[];
if (is_midr_in_range_list(cavium_erratum_27456_cpus))
return false;
}
return true;
}

View File

@ -335,7 +335,7 @@ static const struct midr_range cavium_erratum_23154_cpus[] = {
#endif
#ifdef CONFIG_CAVIUM_ERRATUM_27456
const struct midr_range cavium_erratum_27456_cpus[] = {
static const struct midr_range cavium_erratum_27456_cpus[] = {
/* Cavium ThunderX, T88 pass 1.x - 2.1 */
MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1),
/* Cavium ThunderX, T81 pass 1.0 */

View File

@ -47,10 +47,6 @@ PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override);
PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override);
PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override);
PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings);
#ifdef CONFIG_CAVIUM_ERRATUM_27456
PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus);
PROVIDE(__pi_is_midr_in_range_list = is_midr_in_range_list);
#endif
PROVIDE(__pi__ctype = _ctype);
PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed);

View File

@ -207,6 +207,29 @@ static void __init map_fdt(u64 fdt)
dsb(ishst);
}
/*
* PI version of the Cavium Eratum 27456 detection, which makes it
* impossible to use non-global mappings.
*/
static bool __init ng_mappings_allowed(void)
{
static const struct midr_range cavium_erratum_27456_cpus[] __initconst = {
/* Cavium ThunderX, T88 pass 1.x - 2.1 */
MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1),
/* Cavium ThunderX, T81 pass 1.0 */
MIDR_REV(MIDR_THUNDERX_81XX, 0, 0),
{},
};
for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) {
if (midr_is_cpu_model_range(read_cpuid_id(), r->model,
r->rv_min, r->rv_max))
return false;
}
return true;
}
asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt)
{
static char const chosen_str[] __initconst = "/chosen";
@ -246,7 +269,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt)
u64 kaslr_seed = kaslr_early_init(fdt, chosen);
if (kaslr_seed && kaslr_requires_kpti())
arm64_use_ng_mappings = true;
arm64_use_ng_mappings = ng_mappings_allowed();
kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1);
}

View File

@ -2743,11 +2743,6 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
return irqchip_in_kernel(kvm);
}
bool kvm_arch_has_irq_bypass(void)
{
return true;
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{

View File

@ -35,6 +35,7 @@
#include <asm/mtrr.h>
#include <asm/msr-index.h>
#include <asm/asm.h>
#include <asm/irq_remapping.h>
#include <asm/kvm_page_track.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/reboot.h>
@ -2423,4 +2424,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
*/
#define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1)
static inline bool kvm_arch_has_irq_bypass(void)
{
return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
}
#endif /* _ASM_X86_KVM_HOST_H */

View File

@ -796,12 +796,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
struct amd_svm_iommu_ir *ir;
u64 entry;
if (WARN_ON_ONCE(!pi->ir_data))
return -EINVAL;
/**
* In some cases, the existing irte is updated and re-set,
* so we need to check here if it's already been * added
* to the ir_list.
*/
if (pi->ir_data && (pi->prev_ga_tag != 0)) {
if (pi->prev_ga_tag) {
struct kvm *kvm = svm->vcpu.kvm;
u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
@ -820,7 +823,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
*/
ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
@ -896,10 +899,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
bool enable_remapped_mode = true;
int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP))
if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass())
return 0;
pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
@ -933,6 +936,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
kvm_vcpu_apicv_active(&svm->vcpu)) {
struct amd_iommu_pi_data pi;
enable_remapped_mode = false;
/* Try to enable guest_mode in IRTE */
pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
AVIC_HPA_MASK);
@ -951,33 +956,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
*/
if (!ret && pi.is_guest_mode)
svm_ir_list_add(svm, &pi);
} else {
/* Use legacy mode in IRTE */
struct amd_iommu_pi_data pi;
/**
* Here, pi is used to:
* - Tell IOMMU to use legacy mode for this interrupt.
* - Retrieve ga_tag of prior interrupt remapping data.
*/
pi.prev_ga_tag = 0;
pi.is_guest_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &pi);
/**
* Check if the posted interrupt was previously
* setup with the guest_mode by checking if the ga_tag
* was cached. If so, we need to clean up the per-vcpu
* ir_list.
*/
if (!ret && pi.prev_ga_tag) {
int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
struct kvm_vcpu *vcpu;
vcpu = kvm_get_vcpu_by_id(kvm, id);
if (vcpu)
svm_ir_list_del(to_svm(vcpu), &pi);
}
}
if (!ret && svm) {
@ -993,6 +971,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
}
ret = 0;
if (enable_remapped_mode) {
/* Use legacy mode in IRTE */
struct amd_iommu_pi_data pi;
/**
* Here, pi is used to:
* - Tell IOMMU to use legacy mode for this interrupt.
* - Retrieve ga_tag of prior interrupt remapping data.
*/
pi.prev_ga_tag = 0;
pi.is_guest_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &pi);
/**
* Check if the posted interrupt was previously
* setup with the guest_mode by checking if the ga_tag
* was cached. If so, we need to clean up the per-vcpu
* ir_list.
*/
if (!ret && pi.prev_ga_tag) {
int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
struct kvm_vcpu *vcpu;
vcpu = kvm_get_vcpu_by_id(kvm, id);
if (vcpu)
svm_ir_list_del(to_svm(vcpu), &pi);
}
}
out:
srcu_read_unlock(&kvm->irq_srcu, idx);
return ret;

View File

@ -11,6 +11,13 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
#ifdef CREATE_TRACE_POINTS
#define tracing_kvm_rip_read(vcpu) ({ \
typeof(vcpu) __vcpu = vcpu; \
__vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \
})
#endif
/*
* Tracepoint for guest mode entry.
*/
@ -28,7 +35,7 @@ TRACE_EVENT(kvm_entry,
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->rip = kvm_rip_read(vcpu);
__entry->rip = tracing_kvm_rip_read(vcpu);
__entry->immediate_exit = force_immediate_exit;
kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info,
@ -319,7 +326,7 @@ TRACE_EVENT(name, \
), \
\
TP_fast_assign( \
__entry->guest_rip = kvm_rip_read(vcpu); \
__entry->guest_rip = tracing_kvm_rip_read(vcpu); \
__entry->isa = isa; \
__entry->vcpu_id = vcpu->vcpu_id; \
__entry->requests = READ_ONCE(vcpu->requests); \
@ -423,7 +430,7 @@ TRACE_EVENT(kvm_page_fault,
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
__entry->guest_rip = kvm_rip_read(vcpu);
__entry->guest_rip = tracing_kvm_rip_read(vcpu);
__entry->fault_address = fault_address;
__entry->error_code = error_code;
),

View File

@ -297,6 +297,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
bool enable_remapped_mode = true;
struct kvm_lapic_irq irq;
struct kvm_vcpu *vcpu;
struct vcpu_data vcpu_info;
@ -335,21 +336,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
!kvm_irq_is_postable(&irq)) {
/*
* Make sure the IRTE is in remapped mode if
* we don't handle it in posted mode.
*/
ret = irq_set_vcpu_affinity(host_irq, NULL);
if (ret < 0) {
printk(KERN_INFO
"failed to back to remapped mode, irq: %u\n",
host_irq);
goto out;
}
!kvm_irq_is_postable(&irq))
continue;
}
vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
@ -357,11 +345,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
vcpu_info.vector, vcpu_info.pi_desc_addr, set);
if (set)
ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
else
ret = irq_set_vcpu_affinity(host_irq, NULL);
if (!set)
continue;
enable_remapped_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
if (ret < 0) {
printk(KERN_INFO "%s: failed to update PI IRTE\n",
__func__);
@ -369,6 +358,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
}
}
if (enable_remapped_mode)
ret = irq_set_vcpu_affinity(host_irq, NULL);
ret = 0;
out:
srcu_read_unlock(&kvm->irq_srcu, idx);

View File

@ -11098,7 +11098,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* Profile KVM exit RIPs:
*/
if (unlikely(prof_on == KVM_PROFILING)) {
if (unlikely(prof_on == KVM_PROFILING &&
!vcpu->arch.guest_state_protected)) {
unsigned long rip = kvm_rip_read(vcpu);
profile_hit(KVM_PROFILING, (void *)rip);
}
@ -13556,25 +13557,27 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
bool kvm_arch_has_irq_bypass(void)
{
return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
struct irq_bypass_producer *prod)
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
struct kvm *kvm = irqfd->kvm;
int ret;
irqfd->producer = prod;
kvm_arch_start_assignment(irqfd->kvm);
spin_lock_irq(&kvm->irqfds.lock);
irqfd->producer = prod;
ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 1);
if (ret)
kvm_arch_end_assignment(irqfd->kvm);
spin_unlock_irq(&kvm->irqfds.lock);
return ret;
}
@ -13584,9 +13587,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
int ret;
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
struct kvm *kvm = irqfd->kvm;
WARN_ON(irqfd->producer != prod);
irqfd->producer = NULL;
/*
* When producer of consumer is unregistered, we change back to
@ -13594,12 +13597,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
spin_lock_irq(&kvm->irqfds.lock);
irqfd->producer = NULL;
ret = kvm_x86_call(pi_update_irte)(irqfd->kvm,
prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
spin_unlock_irq(&kvm->irqfds.lock);
kvm_arch_end_assignment(irqfd->kvm);
}
@ -13612,7 +13621,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
struct kvm_kernel_irq_routing_entry *new)
{
if (new->type != KVM_IRQ_ROUTING_MSI)
if (old->type != KVM_IRQ_ROUTING_MSI ||
new->type != KVM_IRQ_ROUTING_MSI)
return true;
return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));

View File

@ -3869,6 +3869,9 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
struct irq_2_irte *irte_info = &ir_data->irq_2_irte;
struct iommu_dev_data *dev_data;
if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)))
return -EINVAL;
if (ir_data->iommu == NULL)
return -EINVAL;
@ -3879,21 +3882,11 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
* we should not modify the IRTE
*/
if (!dev_data || !dev_data->use_vapic)
return 0;
return -EINVAL;
ir_data->cfg = irqd_cfg(data);
pi_data->ir_data = ir_data;
/* Note:
* SVM tries to set up for VAPIC mode, but we are in
* legacy mode. So, we force legacy mode instead.
*/
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
pr_debug("%s: Fall back to using intr legacy remap\n",
__func__);
pi_data->is_guest_mode = false;
}
pi_data->prev_ga_tag = ir_data->cached_ga_tag;
if (pi_data->is_guest_mode) {
ir_data->ga_root_ptr = (pi_data->base >> 12);