From 600e9606046ac3b9b7a3f0500d08a179df84c45e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:40 -0700 Subject: [PATCH 1/8] x86/irq: Ensure initial PIR loads are performed exactly once Ensure the PIR is read exactly once at the start of handle_pending_pir(), to guarantee that checking for an outstanding posted interrupt in a given chuck doesn't reload the chunk from the "real" PIR. Functionally, a reload is benign, but it would defeat the purpose of pre-loading into a copy. Fixes: 1b03d82ba15e ("x86/irq: Install posted MSI notification handler") Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20250401163447.846608-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 81f9b78e0f7b..6cd5d2d6c58a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -419,7 +419,7 @@ static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) bool handled = false; for (i = 0; i < 4; i++) - pir_copy[i] = pir[i]; + pir_copy[i] = READ_ONCE(pir[i]); for (i = 0; i < 4; i++) { if (!pir_copy[i]) From 3cdb8261504cc5503a74d57e619cd7915431b088 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:41 -0700 Subject: [PATCH 2/8] x86/irq: Track if IRQ was found in PIR during initial loop (to load PIR vals) Track whether or not at least one IRQ was found in PIR during the initial loop to load PIR chunks from memory. Doing so generates slightly better code (arguably) for processing the for-loop of XCHGs, especially for the case where there are no pending IRQs. Note, while PIR can be modified between the initial load and the XCHG, it can only _gain_ new IRQs, i.e. there is no danger of a false positive due to the final version of pir_copy[] being empty. Opportunistically convert the boolean to an "unsigned long" and compute the effective boolean result via bitwise-OR. Some compilers, e.g. clang-14, need the extra "hint" to elide conditional branches. Opportunistically rename the variable in anticipation of moving the PIR accesses to a common helper that can be shared by posted MSIs and KVM. Old: <+74>: test %rdx,%rdx <+77>: je 0xffffffff812bbeb0 <+88>: mov $0x1,%dl> <+90>: test %rsi,%rsi <+93>: je 0xffffffff812bbe8c <+106>: mov $0x1,%dl <+108>: test %rcx,%rcx <+111>: je 0xffffffff812bbe9e <+124>: mov $0x1,%dl <+126>: test %rax,%rax <+129>: je 0xffffffff812bbeb9 <+142>: jmp 0xffffffff812bbec1 <+144>: xor %edx,%edx <+146>: test %rsi,%rsi <+149>: jne 0xffffffff812bbe7f <+151>: jmp 0xffffffff812bbe8c <+153>: test %dl,%dl <+155>: je 0xffffffff812bbf8e New: <+74>: mov %rax,%r8 <+77>: or %rcx,%r8 <+80>: or %rdx,%r8 <+83>: or %rsi,%r8 <+86>: setne %bl <+89>: je 0xffffffff812bbf88 <+95>: test %rsi,%rsi <+98>: je 0xffffffff812bbe8d <+109>: test %rdx,%rdx <+112>: je 0xffffffff812bbe9d <+125>: test %rcx,%rcx <+128>: je 0xffffffff812bbead <+141>: test %rax,%rax <+144>: je 0xffffffff812bbebd Link: https://lore.kernel.org/r/20250401163447.846608-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kernel/irq.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6cd5d2d6c58a..6d2c8894877e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -414,27 +414,28 @@ void intel_posted_msi_init(void) */ static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) { + unsigned long pir_copy[4], pending = 0; int i, vec = FIRST_EXTERNAL_VECTOR; - unsigned long pir_copy[4]; - bool handled = false; - for (i = 0; i < 4; i++) + for (i = 0; i < 4; i++) { pir_copy[i] = READ_ONCE(pir[i]); + pending |= pir_copy[i]; + } + + if (!pending) + return false; for (i = 0; i < 4; i++) { if (!pir_copy[i]) continue; pir_copy[i] = arch_xchg(&pir[i], 0); - handled = true; } - if (handled) { - for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) - call_irq_handler(vec, regs); - } + for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) + call_irq_handler(vec, regs); - return handled; + return true; } /* From 6433fc01f9f19573894cdcf776679c10d1310801 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:42 -0700 Subject: [PATCH 3/8] KVM: VMX: Ensure vIRR isn't reloaded at odd times when sync'ing PIR Read each vIRR exactly once when shuffling IRQs from the PIR to the vAPIC to ensure getting the highest priority IRQ from the chunk doesn't reload from the vIRR. In practice, a reload is functionally benign as vcpu->mutex is held and so IRQs can be consumed, i.e. new IRQs can appear, but existing IRQs can't disappear. Link: https://lore.kernel.org/r/20250401163447.846608-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c9de81cc27e1..38d793a96686 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -667,7 +667,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); - irr_val = *p_irr; + irr_val = READ_ONCE(*p_irr); pir_val = READ_ONCE(pir[i]); if (pir_val) { From f1459315f4d2ff4bf02896e541e1566a69a8fd00 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:43 -0700 Subject: [PATCH 4/8] x86/irq: KVM: Track PIR bitmap as an "unsigned long" array Track the PIR bitmap in posted interrupt descriptor structures as an array of unsigned longs instead of using unionized arrays for KVM (u32s) versus IRQ management (u64s). In practice, because the non-KVM usage is (sanely) restricted to 64-bit kernels, all existing usage of the u64 variant is already working with unsigned longs. Using "unsigned long" for the array will allow reworking KVM's processing of the bitmap to read/write in 64-bit chunks on 64-bit kernels, i.e. will allow optimizing KVM by reducing the number of atomic accesses to PIR. Opportunstically replace the open coded literals in the posted MSIs code with the appropriate macro. Deliberately don't use ARRAY_SIZE() in the for-loops, even though it would be cleaner from a certain perspective, in anticipation of decoupling the processing from the array declaration. No functional change intended. Link: https://lore.kernel.org/r/20250401163447.846608-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/posted_intr.h | 14 +++++++------- arch/x86/kernel/irq.c | 12 ++++++------ arch/x86/kvm/lapic.c | 9 +++++---- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/vmx/posted_intr.h | 2 +- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index bb107ebbe713..d86aa4badcee 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -8,12 +8,12 @@ #define PID_TABLE_ENTRY_VALID 1 +#define NR_PIR_VECTORS 256 +#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG) + /* Posted-Interrupt Descriptor */ struct pi_desc { - union { - u32 pir[8]; /* Posted interrupt requested */ - u64 pir64[4]; - }; + unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */ union { struct { u16 notifications; /* Suppress and outstanding bits */ @@ -43,12 +43,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc) static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) { - return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); + return test_and_set_bit(vector, pi_desc->pir); } static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) { - return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); + return bitmap_empty(pi_desc->pir, NR_VECTORS); } static inline void pi_set_sn(struct pi_desc *pi_desc) @@ -110,7 +110,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector) if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR)) return false; - return test_bit(vector, (unsigned long *)pid->pir); + return test_bit(vector, pid->pir); } extern void intel_posted_msi_init(void); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6d2c8894877e..c8cf7c61fc57 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -412,12 +412,12 @@ void intel_posted_msi_init(void) * instead of: * read, xchg, read, xchg, read, xchg, read, xchg */ -static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) +static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) { - unsigned long pir_copy[4], pending = 0; + unsigned long pir_copy[NR_PIR_WORDS], pending = 0; int i, vec = FIRST_EXTERNAL_VECTOR; - for (i = 0; i < 4; i++) { + for (i = 0; i < NR_PIR_WORDS; i++) { pir_copy[i] = READ_ONCE(pir[i]); pending |= pir_copy[i]; } @@ -425,7 +425,7 @@ static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs) if (!pending) return false; - for (i = 0; i < 4; i++) { + for (i = 0; i < NR_PIR_WORDS; i++) { if (!pir_copy[i]) continue; @@ -465,7 +465,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here. */ while (++i < MAX_POSTED_MSI_COALESCING_LOOP) { - if (!handle_pending_pir(pid->pir64, regs)) + if (!handle_pending_pir(pid->pir, regs)) break; } @@ -480,7 +480,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification) * process PIR bits one last time such that handling the new interrupts * are not delayed until the next IRQ. */ - handle_pending_pir(pid->pir64, regs); + handle_pending_pir(pid->pir, regs); apic_eoi(); irq_exit(); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 38d793a96686..bd72701256bc 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -655,8 +655,9 @@ static u8 count_vectors(void *bitmap) return count; } -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { + u32 *__pir = (void *)pir; u32 i, vec; u32 pir_val, irr_val, prev_irr_val; int max_updated_irr; @@ -668,10 +669,10 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); irr_val = READ_ONCE(*p_irr); - pir_val = READ_ONCE(pir[i]); + pir_val = READ_ONCE(__pir[i]); if (pir_val) { - pir_val = xchg(&pir[i], 0); + pir_val = xchg(&__pir[i], 0); prev_irr_val = irr_val; do { @@ -691,7 +692,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e33c969439f7..4ce30db65828 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int shorthand, unsigned int dest, int dest_mode); int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2); void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec); -bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); -bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); +bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr); +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h index 68605ca7ef68..8d183983cd91 100644 --- a/arch/x86/kvm/vmx/posted_intr.h +++ b/arch/x86/kvm/vmx/posted_intr.h @@ -20,7 +20,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc) { int vec; - vec = find_last_bit((unsigned long *)pi_desc->pir, 256); + vec = find_last_bit(pi_desc->pir, 256); return vec < 256 ? vec : -1; } From 06b4d0ea226c295e85a9daa6aed0ae9fa3ff8a94 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:44 -0700 Subject: [PATCH 5/8] KVM: VMX: Process PIR using 64-bit accesses on 64-bit kernels Process the PIR at the natural kernel width, i.e. in 64-bit chunks on 64-bit kernels, so that the worst case of having a posted IRQ in each chunk of the vIRR only requires 4 loads and xchgs from/to the PIR, not 8. Deliberately use a "continue" to skip empty entries so that the code is a carbon copy of handle_pending_pir(), in anticipation of deduplicating KVM and posted MSI logic. Suggested-by: Jim Mattson Link: https://lore.kernel.org/r/20250401163447.846608-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bd72701256bc..8b90a537f6ad 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -657,26 +657,32 @@ static u8 count_vectors(void *bitmap) bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { - u32 *__pir = (void *)pir; + unsigned long pir_vals[NR_PIR_WORDS]; + u32 *__pir = (void *)pir_vals; u32 i, vec; - u32 pir_val, irr_val, prev_irr_val; + u32 irr_val, prev_irr_val; int max_updated_irr; max_updated_irr = -1; *max_irr = -1; + for (i = 0; i < NR_PIR_WORDS; i++) { + pir_vals[i] = READ_ONCE(pir[i]); + if (!pir_vals[i]) + continue; + + pir_vals[i] = xchg(&pir[i], 0); + } + for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10); irr_val = READ_ONCE(*p_irr); - pir_val = READ_ONCE(__pir[i]); - - if (pir_val) { - pir_val = xchg(&__pir[i], 0); + if (__pir[i]) { prev_irr_val = irr_val; do { - irr_val = prev_irr_val | pir_val; + irr_val = prev_irr_val | __pir[i]; } while (prev_irr_val != irr_val && !try_cmpxchg(p_irr, &prev_irr_val, irr_val)); From b41f8638b9d30fbe045b4ef83ff4136c56a57397 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:45 -0700 Subject: [PATCH 6/8] KVM: VMX: Isolate pure loads from atomic XCHG when processing PIR Rework KVM's processing of the PIR to use the same algorithm as posted MSIs, i.e. to do READ(x4) => XCHG(x4) instead of (READ+XCHG)(x4). Given KVM's long-standing, sub-optimal use of 32-bit accesses to the PIR, it's safe to say far more thought and investigation was put into handling the PIR for posted MSIs, i.e. there's no reason to assume KVM's existing logic is meaningful, let alone superior. Matching the processing done by posted MSIs will also allow deduplicating the code between KVM and posted MSIs. See the comment for handle_pending_pir() added by commit 1b03d82ba15e ("x86/irq: Install posted MSI notification handler") for details on why isolating loads from XCHG is desirable. Suggested-by: Jim Mattson Link: https://lore.kernel.org/r/20250401163447.846608-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 8b90a537f6ad..bc56775426c9 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -657,7 +657,7 @@ static u8 count_vectors(void *bitmap) bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { - unsigned long pir_vals[NR_PIR_WORDS]; + unsigned long pir_vals[NR_PIR_WORDS], pending = 0; u32 *__pir = (void *)pir_vals; u32 i, vec; u32 irr_val, prev_irr_val; @@ -668,6 +668,13 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) for (i = 0; i < NR_PIR_WORDS; i++) { pir_vals[i] = READ_ONCE(pir[i]); + pending |= pir_vals[i]; + } + + if (!pending) + return false; + + for (i = 0; i < NR_PIR_WORDS; i++) { if (!pir_vals[i]) continue; From baf68a0e3bd624bc300381a757f660451483fb7f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:46 -0700 Subject: [PATCH 7/8] KVM: VMX: Use arch_xchg() when processing PIR to avoid instrumentation Use arch_xchg() when moving IRQs from the PIR to the vIRR, purely to avoid instrumentation so that KVM is compatible with the needs of posted MSI. This will allow extracting the core PIR logic to common code and sharing it between KVM and posted MSI handling. Link: https://lore.kernel.org/r/20250401163447.846608-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index bc56775426c9..70ad5f0942e2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -678,7 +678,7 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) if (!pir_vals[i]) continue; - pir_vals[i] = xchg(&pir[i], 0); + pir_vals[i] = arch_xchg(&pir[i], 0); } for (i = vec = 0; i <= 7; i++, vec += 32) { From edaf3eded386257b0e6504f6b2c29fd8d84c8d29 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:34:47 -0700 Subject: [PATCH 8/8] x86/irq: KVM: Add helper for harvesting PIR to deduplicate KVM and posted MSIs Now that posted MSI and KVM harvesting of PIR is identical, extract the code (and posted MSI's wonderful comment) to a common helper. No functional change intended. Link: https://lore.kernel.org/r/20250401163447.846608-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/posted_intr.h | 64 ++++++++++++++++++++++++++++++ arch/x86/kernel/irq.c | 50 ++--------------------- arch/x86/kvm/lapic.c | 16 +------- 3 files changed, 69 insertions(+), 61 deletions(-) diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h index d86aa4badcee..a5f761fbf45b 100644 --- a/arch/x86/include/asm/posted_intr.h +++ b/arch/x86/include/asm/posted_intr.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X86_POSTED_INTR_H #define _X86_POSTED_INTR_H + +#include +#include #include +#include + #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 @@ -26,6 +31,65 @@ struct pi_desc { u32 rsvd[6]; } __aligned(64); +/* + * De-multiplexing posted interrupts is on the performance path, the code + * below is written to optimize the cache performance based on the following + * considerations: + * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently + * accessed by both CPU and IOMMU. + * 2.During software processing of posted interrupts, the CPU needs to do + * natural width read and xchg for checking and clearing posted interrupt + * request (PIR), a 256 bit field within the PID. + * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache + * line when posting interrupts and setting control bits. + * 4.The CPU can access the cache line a magnitude faster than the IOMMU. + * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID + * cache line. The cache line states after each operation are as follows, + * assuming a 64-bit kernel: + * CPU IOMMU PID Cache line state + * --------------------------------------------------------------- + *...read64 exclusive + *...lock xchg64 modified + *... post/atomic swap invalid + *...------------------------------------------------------------- + * + * To reduce L1 data cache miss, it is important to avoid contention with + * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used + * when processing posted interrupts in software, e.g. to dispatch interrupt + * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR + * in KVM. + * + * In addition, the code is trying to keep the cache line state consistent + * as much as possible. e.g. when making a copy and clearing the PIR + * (assuming non-zero PIR bits are present in the entire PIR), it does: + * read, read, read, read, xchg, xchg, xchg, xchg + * instead of: + * read, xchg, read, xchg, read, xchg, read, xchg + */ +static __always_inline bool pi_harvest_pir(unsigned long *pir, + unsigned long *pir_vals) +{ + unsigned long pending = 0; + int i; + + for (i = 0; i < NR_PIR_WORDS; i++) { + pir_vals[i] = READ_ONCE(pir[i]); + pending |= pir_vals[i]; + } + + if (!pending) + return false; + + for (i = 0; i < NR_PIR_WORDS; i++) { + if (!pir_vals[i]) + continue; + + pir_vals[i] = arch_xchg(&pir[i], 0); + } + + return true; +} + static inline bool pi_test_and_set_on(struct pi_desc *pi_desc) { return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index c8cf7c61fc57..9ed29ff10e59 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -380,58 +380,14 @@ void intel_posted_msi_init(void) this_cpu_write(posted_msi_pi_desc.ndst, destination); } -/* - * De-multiplexing posted interrupts is on the performance path, the code - * below is written to optimize the cache performance based on the following - * considerations: - * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently - * accessed by both CPU and IOMMU. - * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg - * for checking and clearing posted interrupt request (PIR), a 256 bit field - * within the PID. - * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache - * line when posting interrupts and setting control bits. - * 4.The CPU can access the cache line a magnitude faster than the IOMMU. - * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID - * cache line. The cache line states after each operation are as follows: - * CPU IOMMU PID Cache line state - * --------------------------------------------------------------- - *...read64 exclusive - *...lock xchg64 modified - *... post/atomic swap invalid - *...------------------------------------------------------------- - * - * To reduce L1 data cache miss, it is important to avoid contention with - * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used - * to dispatch interrupt handlers. - * - * In addition, the code is trying to keep the cache line state consistent - * as much as possible. e.g. when making a copy and clearing the PIR - * (assuming non-zero PIR bits are present in the entire PIR), it does: - * read, read, read, read, xchg, xchg, xchg, xchg - * instead of: - * read, xchg, read, xchg, read, xchg, read, xchg - */ static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs) { - unsigned long pir_copy[NR_PIR_WORDS], pending = 0; - int i, vec = FIRST_EXTERNAL_VECTOR; + unsigned long pir_copy[NR_PIR_WORDS]; + int vec = FIRST_EXTERNAL_VECTOR; - for (i = 0; i < NR_PIR_WORDS; i++) { - pir_copy[i] = READ_ONCE(pir[i]); - pending |= pir_copy[i]; - } - - if (!pending) + if (!pi_harvest_pir(pir, pir_copy)) return false; - for (i = 0; i < NR_PIR_WORDS; i++) { - if (!pir_copy[i]) - continue; - - pir_copy[i] = arch_xchg(&pir[i], 0); - } - for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR) call_irq_handler(vec, regs); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 70ad5f0942e2..2c325b45b415 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -657,7 +657,7 @@ static u8 count_vectors(void *bitmap) bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) { - unsigned long pir_vals[NR_PIR_WORDS], pending = 0; + unsigned long pir_vals[NR_PIR_WORDS]; u32 *__pir = (void *)pir_vals; u32 i, vec; u32 irr_val, prev_irr_val; @@ -666,21 +666,9 @@ bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr) max_updated_irr = -1; *max_irr = -1; - for (i = 0; i < NR_PIR_WORDS; i++) { - pir_vals[i] = READ_ONCE(pir[i]); - pending |= pir_vals[i]; - } - - if (!pending) + if (!pi_harvest_pir(pir, pir_vals)) return false; - for (i = 0; i < NR_PIR_WORDS; i++) { - if (!pir_vals[i]) - continue; - - pir_vals[i] = arch_xchg(&pir[i], 0); - } - for (i = vec = 0; i <= 7; i++, vec += 32) { u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);