From 9874b2917b9fbc30956fee209d3c4aa47201c64e Mon Sep 17 00:00:00 2001
From: Rick Edgecombe
Date: Thu, 9 Apr 2026 11:43:30 -0700
Subject: [PATCH 1/3] x86/shstk: Prevent deadlock during shstk sigreturn

During sigreturn the shadow stack signal frame is popped. The kernel does
this by reading the shadow stack using normal read accesses. When it can't
assume the memory is shadow stack, it takes extra steps to make sure it is
reading actual shadow stack memory and not other normal readable memory.
It does this by holding the mmap read lock while doing the access and
checking the flags of the VMA.

Unfortunately that is not safe. If the read of the shadow stack sigframe
hits a page fault, the fault handler will try to recursively grab another
mmap read lock. This normally works ok, but if a writer on another CPU is
also waiting, the second read lock can block behind it and cause a
deadlock.

Fix this by not holding the mmap lock during the read access to userspace.
Instead use mmap_lock_speculate_...() to watch for changes between
dropping the mmap lock and the userspace access. Retry if anything grabbed
an mmap write lock in between and could have changed the VMA.

These mmap_lock_speculate_...() helpers use mm::mm_lock_seq, which is only
available when PER_VMA_LOCK is configured. So make X86_USER_SHADOW_STACK
depend on it. On x86, PER_VMA_LOCK is a default configuration for SMP
kernels, so drop support for the other configs under the assumption that
the !SMP shadow stack user base does not exist.

Currently there is a check that skips the lookup work when the SSP can be
assumed to be on a shadow stack. While reorganizing the function, remove
that optimization to make the tricky code flows more common, such that
issues like this cannot escape detection for so long.

Fixes: 7fad2a432cd3 ("x86/shstk: Check that signal frame is shadow stack mem")
Suggested-by: Linus Torvalds
Signed-off-by: Rick Edgecombe
Signed-off-by: Thomas Gleixner
Reviewed-by: Dave Hansen
Reviewed-by: Thomas Gleixner
Cc: stable@vger.kernel.org
---
 arch/x86/Kconfig        |  1 +
 arch/x86/kernel/shstk.c | 42 +++++++++++++++++++++++-------------------
 2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 99bb5217649a..f3f7cb01d69d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1885,6 +1885,7 @@ config X86_USER_SHADOW_STACK
 	bool "X86 userspace shadow stack"
 	depends on AS_WRUSS
 	depends on X86_64
+	depends on PER_VMA_LOCK
 	select ARCH_USES_HIGH_VMA_FLAGS
 	select ARCH_HAS_USER_SHADOW_STACK
 	select X86_CET
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 0962ae4c3017..0ca64900192f 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -326,10 +326,8 @@ static int shstk_push_sigframe(unsigned long *ssp)
 
 static int shstk_pop_sigframe(unsigned long *ssp)
 {
-	struct vm_area_struct *vma;
 	unsigned long token_addr;
-	bool need_to_check_vma;
-	int err = 1;
+	unsigned int seq;
 
 	/*
 	 * It is possible for the SSP to be off the end of a shadow stack by 4
@@ -340,25 +338,35 @@ static int shstk_pop_sigframe(unsigned long *ssp)
 	if (!IS_ALIGNED(*ssp, 8))
 		return -EINVAL;
 
-	need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
+	do {
+		struct vm_area_struct *vma;
+		bool valid_vma;
+		int err;
 
-	if (need_to_check_vma)
 		if (mmap_read_lock_killable(current->mm))
 			return -EINTR;
 
-	err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
-	if (unlikely(err))
-		goto out_err;
-
-	if (need_to_check_vma) {
 		vma = find_vma(current->mm, *ssp);
-		if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
-			err = -EFAULT;
-			goto out_err;
-		}
+		valid_vma = vma && (vma->vm_flags & VM_SHADOW_STACK);
 
+		/*
+		 * VMAs can change between get_shstk_data() and find_vma().
+		 * Watch for changes and ensure that 'token_addr' comes from
+		 * 'vma' by recording a seqcount.
+		 *
+		 * Ignore the return value of mmap_lock_speculate_try_begin()
+		 * because the mmap lock excludes the possibility of writers.
+		 */
+		mmap_lock_speculate_try_begin(current->mm, &seq);
 		mmap_read_unlock(current->mm);
-	}
+
+		if (!valid_vma)
+			return -EINVAL;
+
+		err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
+		if (err)
+			return err;
+	} while (mmap_lock_speculate_retry(current->mm, seq));
 
 	/* Restore SSP aligned? */
 	if (unlikely(!IS_ALIGNED(token_addr, 8)))
@@ -371,10 +379,6 @@ static int shstk_pop_sigframe(unsigned long *ssp)
 	*ssp = token_addr;
 
 	return 0;
-out_err:
-	if (need_to_check_vma)
-		mmap_read_unlock(current->mm);
-	return err;
 }
 
 int setup_signal_shadow_stack(struct ksignal *ksig)
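
(Aside, not part of the patch above.) The reader -> waiting writer -> recursive reader
pattern described in the changelog can be reproduced entirely in userspace with a
writer-preferring rwlock. Below is a minimal sketch assuming glibc on Linux, using the
non-portable PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP rwlock kind as a stand-in
for the kernel's rwsem behavior; the program and its names are illustrative only, and
it intentionally hangs at the second read lock, mirroring the sigreturn deadlock:

/* deadlock_demo.c: reader -> waiting writer -> recursive reader deadlock.
 * Build: gcc -O2 -pthread deadlock_demo.c -o deadlock_demo
 * NOTE: this program intentionally hangs; it is an illustration, not a
 * kernel interface.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t lock;

static void *writer(void *arg)
{
	(void)arg;
	printf("writer: waiting for the write lock...\n");
	pthread_rwlock_wrlock(&lock);	/* blocks behind the first reader */
	pthread_rwlock_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_rwlockattr_t attr;
	pthread_t tid;

	/*
	 * Writer-preferring, like the kernel's rwsem: a new reader queues
	 * behind a waiting writer instead of jumping ahead of it.
	 */
	pthread_rwlockattr_init(&attr);
	pthread_rwlockattr_setkind_np(&attr,
			PTHREAD_RWLOCK_PREFER_WRITER_NONRECURSIVE_NP);
	pthread_rwlock_init(&lock, &attr);

	pthread_rwlock_rdlock(&lock);		/* like the sigreturn path */
	pthread_create(&tid, NULL, writer, NULL);
	sleep(1);				/* let the writer start waiting */

	printf("reader: taking the read lock again (like the page fault)...\n");
	pthread_rwlock_rdlock(&lock);		/* deadlocks here */
	printf("not reached\n");
	return 0;
}

The fix above sidesteps the problem much like a seqlock would: drop the read lock
before the faulting access and retry if a writer slipped in.
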
From 932d922285ef4d0d655a6f5def2779ae86ca0d73 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Tue, 21 Apr 2026 09:31:36 -0700
Subject: [PATCH 2/3] x86/cpu: Disable FRED when PTI is forced on

FRED and PTI were never intended to work together. No FRED hardware is
vulnerable to Meltdown and all of it should have LASS anyway.
Nevertheless, if you boot a system with pti=on and fred=on, the kernel
tries to do what is asked of it and dies a horrible death on the first
attempt to run userspace (since it never switches to the user page
tables).

Disable FRED when PTI is forced on, and print a message about it.

A quick brain dump about what a FRED+PTI implementation would look like
is below. I'm not sure it would make any sense to do it, but never say
never. All I know is that it's way too complicated to be worth it today.

The SWITCH_TO_USER/KERNEL_CR3 bits are simple to fix (or at least we have
the assembly tools to do it already), as is sticking the FRED entry text
in .entry.text (it's not in there today). The nasty part is the stacks.
Today, the CPU enters the kernel and pushes its entry gunk on
MSR_IA32_FRED_RSP0, which currently points at the task stacks: normal old
kernel memory that is not mapped into userspace. MSR_IA32_FRED_RSP0 would
need to point elsewhere, probably cpu_entry_stack(). Then, start playing
games with stacks on entry/exit, including copying gunk to and from the
task stack.

While I'd *like* to have PTI everywhere, I'm not sure it's worth mucking
up the FRED code with PTI kludges. If you want fast entry/exit, you use
FRED. If you want PTI (and sekuritay), you certainly don't care about
fast entry, and FRED isn't going to help you *all* that much anyway, so
you can just stay with the IDT. Plus, FRED hardware should have LASS,
which gives you a similar security profile to PTI without the CR3
munging.

Reported-by: Gayatri Kammela
Signed-off-by: Dave Hansen
Reviewed-by: Borislav Petkov (AMD)
Tested-by: Maciej Wieczor-Retman
Cc: stable@vger.kernel.org
Link: https://patch.msgid.link/20260421163136.E7C6788A@davehans-spike.ostc.intel.com
---
 arch/x86/mm/pti.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index f7546e9e8e89..631f0375bd42 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -105,6 +105,11 @@ void __init pti_check_boottime_disable(void)
 		pr_debug("PTI enabled, disabling INVLPGB\n");
 		setup_clear_cpu_cap(X86_FEATURE_INVLPGB);
 	}
+
+	if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+		pr_debug("PTI enabled, disabling FRED\n");
+		setup_clear_cpu_cap(X86_FEATURE_FRED);
+	}
 }
 
 static int __init pti_parse_cmdline(char *arg)
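
(Aside, not part of the patch above.) The result is visible from userspace:
X86_FEATURE_PTI and X86_FEATURE_FRED are exported as the "pti" and "fred" entries in
the /proc/cpuinfo flags line, and after this change a pti=on boot should no longer
show both at once. The checker below is an illustrative sketch, not a kernel
interface; the file name and helper are made up for the example:

/* fred_pti_check.c: report whether "pti" and "fred" both appear in the
 * /proc/cpuinfo flags line of CPU 0.
 * Build: gcc -O2 fred_pti_check.c -o fred_pti_check
 */
#include <stdio.h>
#include <string.h>

/* Whole-word match inside a space-separated flags line. */
static int flag_present(const char *flags, const char *flag)
{
	size_t len = strlen(flag);
	const char *p = flags;

	while ((p = strstr(p, flag)) != NULL) {
		int start_ok = (p == flags) || (p[-1] == ' ');
		int end_ok = (p[len] == ' ' || p[len] == '\n' || p[len] == '\0');

		if (start_ok && end_ok)
			return 1;
		p += len;
	}
	return 0;
}

int main(void)
{
	char line[4096];	/* the flags line is long, but fits comfortably */
	FILE *f = fopen("/proc/cpuinfo", "r");

	if (!f) {
		perror("/proc/cpuinfo");
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "flags", 5) != 0)
			continue;

		printf("pti : %s\n", flag_present(line, "pti")  ? "yes" : "no");
		printf("fred: %s\n", flag_present(line, "fred") ? "yes" : "no");
		break;	/* CPU 0 is enough for this illustration */
	}

	fclose(f);
	return 0;
}
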
From a39a7014825bd8d10b94fa4f953141b9473c25b4 Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Tue, 21 Apr 2026 08:19:09 -0700
Subject: [PATCH 3/3] x86/mm: Revert INVLPGB optimization for set_memory code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tl;dr: Revert an INVLPGB optimization that did not properly handle
discontiguous virtual addresses.

Full story:

I got a report from some graphics (i915) folks that bisected a regression
in their test suite to 86e6815b316e ("x86/mm: Change cpa_flush() to call
flush_kernel_range() directly"). There was a bit of flip-flopping on the
exact bisect, but the code here does seem wrong to me.

The i915 folks were calling set_pages_array_wc(), so using the
CPA_PAGES_ARRAY mode. Basically, the 'struct cpa_data' can wrap up all
kinds of page table changes. Some of these are virtually contiguous, but
some are very much not, which is one reason why there are ->vaddr and
->pages arrays.

86e6815b316e made the mistake of assuming that the virtual addresses in
the cpa_data are always contiguous. It got things right when neither
CPA_ARRAY nor CPA_PAGES_ARRAY is used, but is theoretically wrong when
either of those is used. In the i915 case, it probably failed to flush
some WB TLB entries and install WC ones, leaving some data in the caches
and not flushing it out to where the device could see it. That eventually
caused graphics problems.

Revert the INVLPGB optimization. It can be reintroduced later, but it
will need to be a bit careful about the array modes.

Fixes: 86e6815b316e ("x86/mm: Change cpa_flush() to call flush_kernel_range() directly")
Reported-by: Cui, Ling
Signed-off-by: Dave Hansen
Signed-off-by: Ingo Molnar
Reviewed-by: Rick Edgecombe
Reviewed-by: Thomas Hellström
Link: https://patch.msgid.link/20260421151909.6B3281C6@davehans-spike.ostc.intel.com
---
 arch/x86/mm/pat/set_memory.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index cba907c39718..d023a40a1e03 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -399,6 +399,15 @@ static void cpa_flush_all(unsigned long cache)
 	on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
 
+static void __cpa_flush_tlb(void *data)
+{
+	struct cpa_data *cpa = data;
+	unsigned int i;
+
+	for (i = 0; i < cpa->numpages; i++)
+		flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
+}
+
 static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
 
 static void cpa_collapse_large_pages(struct cpa_data *cpa)
@@ -435,7 +444,6 @@ static void cpa_collapse_large_pages(struct cpa_data *cpa)
 
 static void cpa_flush(struct cpa_data *cpa, int cache)
 {
-	unsigned long start, end;
 	unsigned int i;
 
 	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
@@ -445,12 +453,10 @@ static void cpa_flush(struct cpa_data *cpa, int cache)
 		goto collapse_large_pages;
 	}
 
-	start = fix_addr(__cpa_addr(cpa, 0));
-	end = start + cpa->numpages * PAGE_SIZE;
-	if (cpa->force_flush_all)
-		end = TLB_FLUSH_ALL;
-
-	flush_tlb_kernel_range(start, end);
+	if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
+		flush_tlb_all();
+	else
+		on_each_cpu(__cpa_flush_tlb, cpa, 1);
 
 	if (!cache)
 		goto collapse_large_pages;
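
(Aside, not part of the patch above.) The bug class is easy to see in isolation:
a single [start, end) range computed from the first address only covers a
discontiguous CPA_ARRAY/CPA_PAGES_ARRAY batch by accident. The sketch below uses
made-up addresses and a stand-in flush_one() helper, not the kernel's cpa_data
machinery, to contrast the reverted range logic with a per-entry walk:

/* cpa_range_demo.c: why a contiguous [start, end) flush cannot cover a
 * discontiguous address array.  Addresses and helpers are invented for
 * the demonstration.
 * Build: gcc -O2 cpa_range_demo.c -o cpa_range_demo
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL

static void flush_one(unsigned long addr)
{
	printf("  flush %#lx\n", addr);
}

int main(void)
{
	/*
	 * Two pages that are nowhere near each other, as callers of
	 * set_pages_array_wc()-style interfaces can legitimately pass in.
	 */
	unsigned long vaddr[] = { 0xffffc90000001000UL, 0xffffc900deadb000UL };
	unsigned long numpages = sizeof(vaddr) / sizeof(vaddr[0]);
	unsigned long start = vaddr[0];
	unsigned long end = start + numpages * PAGE_SIZE;
	unsigned long i;

	printf("range logic: flush [%#lx, %#lx)\n", start, end);
	for (i = 0; i < numpages; i++) {
		if (vaddr[i] < start || vaddr[i] >= end)
			printf("  MISSED %#lx (outside the range)\n", vaddr[i]);
	}

	printf("per-entry logic: flush each address\n");
	for (i = 0; i < numpages; i++)
		flush_one(vaddr[i]);

	return 0;
}

The per-entry loop is what the restored __cpa_flush_tlb() does; presumably an
INVLPGB-based replacement would need to consult each array entry the same way
before the optimization can come back, which is what the changelog means by being
careful about the array modes.
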