From 031b33ef1a6a1129a1a02a16b89608ded2eff9be Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 20 Mar 2025 16:42:52 -0700 Subject: [PATCH 01/32] x86/fpu/xstate: Remove xstate offset check Traditionally, new xstate components have been assigned sequentially, aligning feature numbers with their offsets in the XSAVE buffer. However, this ordering is not architecturally mandated in the non-compacted format, where a component's offset may not correspond to its feature number. The kernel caches CPUID-reported xstate component details, including size and offset in the non-compacted format. As part of this process, a sanity check is also conducted to ensure alignment between feature numbers and offsets. This check was likely intended as a general guideline rather than a strict requirement. Upcoming changes will support out-of-order offsets. Remove the check as becoming obsolete. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250320234301.8342-2-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6a41d1610d8b..542c6981180d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -216,9 +216,6 @@ static bool xfeature_enabled(enum xfeature xfeature) static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, i; - /* start at the beginning of the "extended state" */ - unsigned int last_good_offset = offsetof(struct xregs_state, - extended_state_area); /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form @@ -246,16 +243,6 @@ static void __init setup_xstate_cache(void) continue; xstate_offsets[i] = ebx; - - /* - * In our xstate size checks, we assume that the highest-numbered - * xstate feature has the highest offset in the buffer. Ensure - * it does. - */ - WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); - - last_good_offset = xstate_offsets[i]; } } From 15d51a2f6f3f7057311d6e37d5190d58597a94a9 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 20 Mar 2025 16:42:53 -0700 Subject: [PATCH 02/32] x86/fpu/xstate: Introduce xfeature order table and accessor macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel has largely assumed that higher xstate component numbers correspond to later offsets in the buffer. However, this assumption no longer holds for the non-compacted format, where a newer state component may have a lower offset. When iterating over xstate components in offset order, using the feature number as an index may be misleading. At the same time, the CPU exposes each component’s size and offset based on its feature number, making it a key for state information. To provide flexibility in handling xstate ordering, introduce a mapping table: feature order -> feature number. The table is dynamically populated based on the CPU-exposed features and is sorted in offset order at boot time. Additionally, add an accessor macro to facilitate sequential traversal of xstate components based on their actual buffer positions, given a feature bitmask. This accessor macro will be particularly useful for computing custom non-compacted format sizes and iterating over xstate offsets in non-compacted buffers. Suggested-by: Dave Hansen Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250320234301.8342-3-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 58 +++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 542c6981180d..1e22103a8e17 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -88,6 +89,31 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; +/* + * Ordering of xstate components in uncompacted format: The xfeature + * number does not necessarily indicate its position in the XSAVE buffer. + * This array defines the traversal order of xstate features. + */ +static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = + { [ 0 ... XFEATURE_MAX - 1] = -1}; + +static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) +{ + for (; xfeature_uncompact_order[i] != -1; i++) { + if (mask & BIT_ULL(xfeature_uncompact_order[i])) + break; + } + + return i; +} + +/* Iterate xstate features in uncompacted order: */ +#define for_each_extended_xfeature_in_order(i, mask) \ + for (i = 0; \ + i = next_xfeature_order(i, mask), \ + xfeature_uncompact_order[i] != -1; \ + i++) + #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) @@ -209,13 +235,20 @@ static bool xfeature_enabled(enum xfeature xfeature) return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } +static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) +{ + return xstate_offsets[*(unsigned int *)xfeature1] - + xstate_offsets[*(unsigned int *)xfeature2]; +} + /* * Record the offsets and sizes of various xstates contained - * in the XSAVE state memory layout. + * in the XSAVE state memory layout. Also, create an ordered + * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { - u32 eax, ebx, ecx, edx, i; + u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form @@ -229,21 +262,30 @@ static void __init setup_xstate_cache(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); + for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { + cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); - xstate_sizes[i] = eax; - xstate_flags[i] = ecx; + xstate_sizes[xfeature] = eax; + xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ - if (xfeature_is_supervisor(i)) + if (xfeature_is_supervisor(xfeature)) continue; - xstate_offsets[i] = ebx; + xstate_offsets[xfeature] = ebx; + + /* Populate the list of xfeatures before sorting */ + xfeature_uncompact_order[i++] = xfeature; } + + /* + * Sort xfeatures by their offsets to support out-of-order + * offsets in the uncompacted format. + */ + sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* From a758ae2885eacaa68e4de9a01539a242a6bbb403 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 20 Mar 2025 16:42:54 -0700 Subject: [PATCH 03/32] x86/fpu/xstate: Adjust XSAVE buffer size calculation The current xstate size calculation assumes that the highest-numbered xstate feature has the highest offset in the buffer, determining the size based on the topmost bit in the feature mask. However, this assumption is not architecturally guaranteed -- higher-numbered features may have lower offsets. With the introduction of the xfeature order table and its helper macro, xstate components can now be traversed in their positional order. Update the non-compacted format handling to iterate through the table to determine the last-positioned feature. Then, set the offset accordingly. Since size calculation primarily occurs during initialization or in non-critical paths, looping to find the last feature is not expected to have a meaningful performance impact. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250320234301.8342-4-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 1e22103a8e17..93f94013b094 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -581,13 +581,20 @@ static bool __init check_xstate_against_struct(int nr) static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; - unsigned int offset = xstate_offsets[topmost]; + unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); - if (compacted) + if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); + } else { + /* Walk through the xfeature order to pick the last */ + for_each_extended_xfeature_in_order(i, xfeatures) + topmost = xfeature_uncompact_order[i]; + offset = xstate_offsets[topmost]; + } + return offset + xstate_sizes[topmost]; } From cbe8e4dab16c56ac84765dcd53e418160c8bc0db Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Thu, 20 Mar 2025 16:42:55 -0700 Subject: [PATCH 04/32] x86/fpu/xstate: Adjust xstate copying logic for user ABI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit == Background == As feature positions in the userspace XSAVE buffer do not always align with their feature numbers, the XSAVE format conversion needs to be reconsidered to align with the revised xstate size calculation logic. * For signal handling, XSAVE and XRSTOR are used directly to save and restore extended registers. * For ptrace, KVM, and signal returns (for 32-bit frame), the kernel copies data between its internal buffer and the userspace XSAVE buffer. If memcpy() were used for these cases, existing offset helpers — such as __raw_xsave_addr() or xstate_offsets[] — would be sufficient to handle the format conversion. == Problem == When copying data from the compacted in-kernel buffer to the non-compacted userspace buffer, the function follows the user_regset_get2_fn() prototype. This means it utilizes struct membuf helpers for the destination buffer. As defined in regset.h, these helpers update the memory pointer during the copy process, enforcing sequential writes within the loop. Since xstate components are processed sequentially, any component whose buffer position does not align with its feature number has an issue. == Solution == Replace for_each_extended_xfeature() with the newly introduced for_each_extended_xfeature_in_order(). This macro ensures xstate components are handled in the correct order based on their actual positions in the destination buffer, rather than their feature numbers. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250320234301.8342-5-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 93f94013b094..46c45e2f2a5a 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1107,10 +1107,9 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; + unsigned int zerofrom, i, xfeature; struct xstate_header header; - unsigned int zerofrom; u64 mask; - int i; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; @@ -1179,15 +1178,16 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = header.xfeatures; - for_each_extended_xfeature(i, mask) { + for_each_extended_xfeature_in_order(i, mask) { + xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ - if (zerofrom < xstate_offsets[i]) - membuf_zero(&to, xstate_offsets[i] - zerofrom); + if (zerofrom < xstate_offsets[xfeature]) + membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); - if (i == XFEATURE_PKRU) { + if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the @@ -1197,14 +1197,14 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, - __raw_xsave_addr(xsave, i), - xstate_sizes[i]); + __raw_xsave_addr(xsave, xfeature), + xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ - zerofrom = xstate_offsets[i] + xstate_sizes[i]; + zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: From 77fbccede633a5565cae084348b5459f6849086d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:20 +0200 Subject: [PATCH 05/32] x86/fpu: Introduce the x86_task_fpu() helper method The per-task FPU context/save area is allocated right next to task_struct, currently in a variable-size array via task_struct::thread.fpu[], but we plan to fully hide it from the C type scope. Introduce the x86_task_fpu() accessor that gets to the FPU context pointer explicitly from the task pointer. Right now this is a simple (task)->thread.fpu wrapper. Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250409211127.3544993-2-mingo@kernel.org --- arch/x86/include/asm/processor.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5d2f7e5aff26..2f631e0adea3 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -523,6 +523,8 @@ struct thread_struct { */ }; +#define x86_task_fpu(task) (&(task)->thread.fpu) + extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); static inline void arch_thread_struct_whitelist(unsigned long *offset, From e3bfa3859936da3edd1e16d0b74fdaaa19bb5087 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:21 +0200 Subject: [PATCH 06/32] x86/fpu: Convert task_struct::thread.fpu accesses to use x86_task_fpu() This will make the removal of the task_struct::thread.fpu array easier. No change in functionality - code generated before and after this commit is identical on x86-defconfig: kepler:~/tip> diff -up vmlinux.before.asm vmlinux.after.asm kepler:~/tip> Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250409211127.3544993-3-mingo@kernel.org --- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/kernel/fpu/context.h | 4 ++-- arch/x86/kernel/fpu/core.c | 30 +++++++++++++++--------------- arch/x86/kernel/fpu/init.c | 8 ++++---- arch/x86/kernel/fpu/regset.c | 22 +++++++++++----------- arch/x86/kernel/fpu/signal.c | 18 +++++++++--------- arch/x86/kernel/fpu/xstate.c | 22 +++++++++++----------- arch/x86/kernel/fpu/xstate.h | 6 +++--- arch/x86/kernel/process.c | 6 +++--- arch/x86/kernel/signal.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- arch/x86/math-emu/fpu_aux.c | 2 +- arch/x86/math-emu/fpu_entry.c | 4 ++-- arch/x86/math-emu/fpu_system.h | 2 +- arch/x86/mm/extable.c | 2 +- 15 files changed, 68 insertions(+), 68 deletions(-) diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index c485f1944c5f..1feaa68b7567 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -41,7 +41,7 @@ static inline void switch_fpu_prepare(struct task_struct *old, int cpu) { if (cpu_feature_enabled(X86_FEATURE_FPU) && !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { - struct fpu *old_fpu = &old->thread.fpu; + struct fpu *old_fpu = x86_task_fpu(old); save_fpregs_to_fpstate(old_fpu); /* diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index f6d856bd50bc..10d0a720659c 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -53,7 +53,7 @@ static inline void fpregs_activate(struct fpu *fpu) /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) @@ -67,7 +67,7 @@ static inline void fpregs_restore_userregs(void) * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. + * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 91d6341f281f..dc6d7f93c446 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -211,7 +211,7 @@ static void fpu_init_guest_permissions(struct fpu_guest *gfpu) return; spin_lock_irq(¤t->sighand->siglock); - fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + fpuperm = &x86_task_fpu(current->group_leader)->guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. */ @@ -323,7 +323,7 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); */ void fpu_sync_guest_vmexit_xfd_state(void) { - struct fpstate *fps = current->thread.fpu.fpstate; + struct fpstate *fps = x86_task_fpu(current)->fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { @@ -337,7 +337,7 @@ EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); @@ -438,7 +438,7 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); - save_fpregs_to_fpstate(¤t->thread.fpu); + save_fpregs_to_fpstate(x86_task_fpu(current)); } __cpu_invalidate_fpregs_state(); @@ -467,7 +467,7 @@ EXPORT_SYMBOL_GPL(kernel_fpu_end); */ void fpu_sync_fpstate(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); trace_x86_fpu_before_save(fpu); @@ -552,7 +552,7 @@ void fpstate_reset(struct fpu *fpu) static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { - struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + struct fpu *src_fpu = x86_task_fpu(current->group_leader); spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ @@ -572,7 +572,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) if (!ssp) return 0; - xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave, XFEATURE_CET_USER); /* @@ -593,8 +593,8 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { - struct fpu *src_fpu = ¤t->thread.fpu; - struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = x86_task_fpu(current); + struct fpu *dst_fpu = x86_task_fpu(dst); /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; @@ -686,7 +686,7 @@ void fpu__drop(struct fpu *fpu) { preempt_disable(); - if (fpu == ¤t->thread.fpu) { + if (fpu == x86_task_fpu(current)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -720,7 +720,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) */ static void fpu_reset_fpregs(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); @@ -749,7 +749,7 @@ static void fpu_reset_fpregs(void) */ void fpu__clear_user_states(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { @@ -782,7 +782,7 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); fpu_reset_fpregs(); } /* @@ -823,7 +823,7 @@ void fpregs_lock_and_load(void) */ void fpregs_assert_state_consistent(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; @@ -835,7 +835,7 @@ EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); void fpregs_mark_activate(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 998a08f17e33..ad5cb2943d37 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); + fpstate_init_soft(&x86_task_fpu(current)->fpstate->regs.soft); else #endif asm volatile ("fninit"); @@ -154,7 +154,7 @@ static void __init fpu__init_task_struct_size(void) * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(current->thread.fpu.__fpstate.regs); + task_size -= sizeof(union fpregs_state); /* * Add back the dynamically-calculated register state @@ -204,7 +204,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_kernel_cfg.default_size = size; fpu_user_cfg.max_size = size; fpu_user_cfg.default_size = size; - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); } /* @@ -213,7 +213,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ void __init fpu__init_system(void) { - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); fpu__init_system_early_generic(); /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 887b0b8e21e3..0986c2200adc 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -45,7 +45,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r */ static void sync_fpstate(struct fpu *fpu) { - if (fpu == ¤t->thread.fpu) + if (fpu == x86_task_fpu(current)) fpu_sync_fpstate(fpu); } @@ -63,7 +63,7 @@ static void fpu_force_restore(struct fpu *fpu) * Only stopped child tasks can be used to modify the FPU * state in the fpstate buffer: */ - WARN_ON_FPU(fpu == ¤t->thread.fpu); + WARN_ON_FPU(fpu == x86_task_fpu(current)); __fpu_invalidate_fpregs_state(fpu); } @@ -71,7 +71,7 @@ static void fpu_force_restore(struct fpu *fpu) int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; @@ -91,7 +91,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct fxregs_state newstate; int ret; @@ -133,7 +133,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - sync_fpstate(&target->thread.fpu); + sync_fpstate(x86_task_fpu(target)); copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); return 0; @@ -143,7 +143,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *tmpbuf = NULL; int ret; @@ -187,7 +187,7 @@ int ssp_active(struct task_struct *target, const struct user_regset *regset) int ssp_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct cet_user_state *cetregs; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || @@ -214,7 +214,7 @@ int ssp_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *xsave = &fpu->fpstate->regs.xsave; struct cet_user_state *cetregs; unsigned long user_ssp; @@ -368,7 +368,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); + __convert_from_fxsr(env, tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -401,7 +401,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave, int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; struct fxregs_state fxsave, *fx; @@ -433,7 +433,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; int ret; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6c69cb28b298..b8b4fa9c2d04 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -43,13 +43,13 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + current->thread.fpu.fpstate->user_size))) + if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) return true; setfx: - trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + trace_x86_fpu_xstate_check_failed(x86_task_fpu(current)); /* Set the parameters for fx only state */ fx_sw->magic1 = 0; @@ -64,13 +64,13 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; + struct xregs_state *xsave = &x86_task_fpu(tsk)->fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); + fxsave(&x86_task_fpu(tsk)->fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -184,7 +184,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pk bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; - struct fpstate *fpstate = tsk->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(tsk)->fpstate; bool ia32_fxstate = (buf != buf_fx); int ret; @@ -272,7 +272,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, */ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int ret; /* Restore enabled features only. */ @@ -332,7 +332,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; + struct fpu *fpu = x86_task_fpu(tsk); struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; @@ -452,7 +452,7 @@ static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; @@ -499,7 +499,7 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); + unsigned long frame_size = xstate_sigframe_size(x86_task_fpu(current)->fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 46c45e2f2a5a..253da5aec915 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -763,7 +763,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = 0; - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); } /* @@ -871,7 +871,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; /* Reset the state for the current task */ - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); /* * Update info used for ptrace frames; use standard-format size and no @@ -945,7 +945,7 @@ void fpu__resume_cpu(void) } if (fpu_state_size_dynamic()) - wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); + wrmsrl(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* @@ -1227,8 +1227,8 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, - tsk->thread.fpu.fpstate->user_xfeatures, + __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, + x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1368,7 +1368,7 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { - return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); + return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) @@ -1462,7 +1462,7 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ - if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* @@ -1539,7 +1539,7 @@ void fpstate_free(struct fpu *fpu) static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; @@ -1632,7 +1632,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; @@ -1735,7 +1735,7 @@ int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) return -EPERM; } - fpu = ¤t->group_leader->thread.fpu; + fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; @@ -1840,7 +1840,7 @@ long fpu_xstate_prctl(int option, unsigned long arg2) */ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); + unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); long delta; if (!timestamp) { diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 0fd34f53f025..9a3a8ccf13bf 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -22,7 +22,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) static inline u64 xstate_get_group_perm(bool guest) { - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; /* Pairs with WRITE_ONCE() in xstate_request_perm() */ @@ -288,7 +288,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. */ - struct fpstate *fpstate = current->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; @@ -322,7 +322,7 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 u32 hmask = mask >> 32; int err; - xfd_validate_state(current->thread.fpu.fpstate, mask, true); + xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 962c3ce39323..47694e391506 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -103,7 +103,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.vm86 = NULL; #endif /* Drop the copied pointer to current's fpstate */ - dst->thread.fpu.fpstate = NULL; + x86_task_fpu(dst)->fpstate = NULL; return 0; } @@ -112,7 +112,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) void arch_release_task_struct(struct task_struct *tsk) { if (fpu_state_size_dynamic()) - fpstate_free(&tsk->thread.fpu); + fpstate_free(x86_task_fpu(tsk)); } #endif @@ -122,7 +122,7 @@ void arch_release_task_struct(struct task_struct *tsk) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - struct fpu *fpu = &t->fpu; + struct fpu *fpu = x86_task_fpu(tsk); if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 5f441039b572..2404233336ab 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -255,7 +255,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (v8086_mode(regs)) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); @@ -423,14 +423,14 @@ bool sigaltstack_size_valid(size_t ss_size) if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) return true; - fsize += current->group_leader->thread.fpu.perm.__user_state_size; + fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size; if (likely(ss_size > fsize)) return true; if (strict_sigaltstack_size) return ss_size > fsize; - mask = current->group_leader->thread.fpu.perm.__state_perm; + mask = x86_task_fpu(current->group_leader)->perm.__state_perm; if (mask & XFEATURE_MASK_USER_DYNAMIC) return ss_size > fsize; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f88b8a78e50..f48325dfaa01 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -1295,7 +1295,7 @@ DEFINE_IDTENTRY_RAW(exc_debug) static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; - struct fpu *fpu = &task->thread.fpu; + struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index d62662bdd460..5f253ae406b6 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -53,7 +53,7 @@ void fpstate_init_soft(struct swregs_state *soft) void finit(void) { - fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); + fpstate_init_soft(&x86_task_fpu(current)->fpstate->regs.soft); } /* diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 91c52ead1226..5034df617740 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -641,7 +641,7 @@ int fpregs_soft_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; + struct swregs_state *s387 = &x86_task_fpu(target)->fpstate->regs.soft; void *space = s387->st_space; int ret; int offset, other, i, tags, regnr, tag, newtop; @@ -692,7 +692,7 @@ int fpregs_soft_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct swregs_state *s387 = &target->thread.fpu.fpstate->regs.soft; + struct swregs_state *s387 = &x86_task_fpu(target)->fpstate->regs.soft; const void *space = s387->st_space; int offset = (S387->ftop & 7) * 10, other = 80 - offset; diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index eec3e4805c75..5e238e930fe3 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -73,7 +73,7 @@ static inline bool seg_writable(struct desc_struct *d) return (d->type & SEG_TYPE_EXECUTE_MASK) == SEG_TYPE_WRITABLE; } -#define I387 (¤t->thread.fpu.fpstate->regs) +#define I387 (&x86_task_fpu(current)->fpstate->regs) #define FPU_info (I387->soft.info) #define FPU_CS (*(unsigned short *) &(FPU_info->regs->cs)) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 51986e8a9d35..bf8dab18be97 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -111,7 +111,7 @@ static bool ex_handler_sgx(const struct exception_table_entry *fixup, /* * Handler for when we fail to restore a task's FPU state. We should never get - * here because the FPU state of a task using the FPU (task->thread.fpu.state) + * here because the FPU state of a task using the FPU (struct fpu::fpstate) * should always be valid. However, past bugs have allowed userspace to set * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn(). * These caused XRSTOR to fail when switching to the task, leaking the FPU From cb7ca40a3882360ce87191793449d48df0b29184 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:22 +0200 Subject: [PATCH 07/32] x86/fpu: Make task_struct::thread constant size Turn thread.fpu into a pointer. Since most FPU code internals work by passing around the FPU pointer already, the code generation impact is small. This allows us to remove the old kludge of task_struct being variable size: struct task_struct { ... /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end /* CPU-specific state of this task: */ struct thread_struct thread; /* * WARNING: on x86, 'thread_struct' contains a variable-sized * structure. It *MUST* be at the end of 'task_struct'. * * Do not put anything below here! */ }; ... which creates a number of problems, such as requiring thread_struct to be the last member of the struct - not allowing it to be struct-randomized, etc. But the primary motivation is to allow the decoupling of task_struct from hardware details ( in particular), and to eventually allow the per-task infrastructure: DECLARE_PER_TASK(type, name); ... per_task(current, name) = val; ... which requires task_struct to be a constant size struct. The fpu_thread_struct_whitelist() quirk to hardened usercopy can be removed, now that the FPU structure is not embedded in the task struct anymore, which reduces text footprint a bit. Fixed-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: H. Peter Anvin Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250409211127.3544993-4-mingo@kernel.org --- arch/x86/include/asm/processor.h | 20 +++++++++----------- arch/x86/kernel/fpu/core.c | 23 ++++++++++++----------- arch/x86/kernel/fpu/init.c | 17 ++++++++++------- arch/x86/kernel/process.c | 2 +- include/linux/sched.h | 15 ++++----------- 5 files changed, 36 insertions(+), 41 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2f631e0adea3..5ea7e5d2c4de 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -516,21 +516,19 @@ struct thread_struct { #endif /* Floating point and extended processor state */ - struct fpu fpu; - /* - * WARNING: 'fpu' is dynamically-sized. It *MUST* be at - * the end. - */ + struct fpu *fpu; }; -#define x86_task_fpu(task) (&(task)->thread.fpu) +#define x86_task_fpu(task) ((task)->thread.fpu) -extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); - -static inline void arch_thread_struct_whitelist(unsigned long *offset, - unsigned long *size) +/* + * X86 doesn't need any embedded-FPU-struct quirks: + */ +static inline void +arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - fpu_thread_struct_whitelist(offset, size); + *offset = 0; + *size = 0; } static inline void diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index dc6d7f93c446..853a738fdf2d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -593,8 +593,19 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { + /* + * We allocate the new FPU structure right after the end of the task struct. + * task allocation size already took this into account. + * + * This is safe because task_struct size is a multiple of cacheline size. + */ struct fpu *src_fpu = x86_task_fpu(current); - struct fpu *dst_fpu = x86_task_fpu(dst); + struct fpu *dst_fpu = (void *)dst + sizeof(*dst); + + BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); + BUG_ON(!src_fpu); + + dst->thread.fpu = dst_fpu; /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; @@ -663,16 +674,6 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, return 0; } -/* - * Whitelist the FPU register state embedded into task_struct for hardened - * usercopy. - */ -void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) -{ - *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); - *size = fpu_kernel_cfg.default_size; -} - /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index ad5cb2943d37..848ea79886ba 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -71,8 +71,15 @@ static bool __init fpu__probe_without_cpuid(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } +static struct fpu x86_init_fpu __attribute__ ((aligned (64))) __read_mostly; + static void __init fpu__init_system_early_generic(void) { + fpstate_reset(&x86_init_fpu); + current->thread.fpu = &x86_init_fpu; + set_thread_flag(TIF_NEED_FPU_LOAD); + x86_init_fpu.last_cpu = -1; + if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { if (fpu__probe_without_cpuid()) @@ -150,6 +157,8 @@ static void __init fpu__init_task_struct_size(void) { int task_size = sizeof(struct task_struct); + task_size += sizeof(struct fpu); + /* * Subtract off the static size of the register state. * It potentially has a bunch of padding. @@ -164,14 +173,9 @@ static void __init fpu__init_task_struct_size(void) /* * We dynamically size 'struct fpu', so we require that - * it be at the end of 'thread_struct' and that - * 'thread_struct' be at the end of 'task_struct'. If - * you hit a compile error here, check the structure to - * see if something got added to the end. + * 'state' be at the end of 'it: */ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); - CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); - CHECK_MEMBER_AT_END_OF(struct task_struct, thread); arch_task_struct_size = task_size; } @@ -213,7 +217,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ void __init fpu__init_system(void) { - fpstate_reset(x86_task_fpu(current)); fpu__init_system_early_generic(); /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 47694e391506..3ce4cce46f3f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -103,7 +103,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.vm86 = NULL; #endif /* Drop the copied pointer to current's fpstate */ - x86_task_fpu(dst)->fpstate = NULL; + dst->thread.fpu = NULL; return 0; } diff --git a/include/linux/sched.h b/include/linux/sched.h index f96ac1982893..4ecc0c6b1cb0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1646,22 +1646,15 @@ struct task_struct { struct user_event_mm *user_event_mm; #endif + /* CPU-specific state of this task: */ + struct thread_struct thread; + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. */ randomized_struct_fields_end - - /* CPU-specific state of this task: */ - struct thread_struct thread; - - /* - * WARNING: on x86, 'thread_struct' contains a variable-sized - * structure. It *MUST* be at the end of 'task_struct'. - * - * Do not put anything below here! - */ -}; +} __attribute__ ((aligned (64))); #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) From 55bc30f2e34dcc17a370d1f6c1c992be107c4502 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:23 +0200 Subject: [PATCH 08/32] x86/fpu: Remove the thread::fpu pointer As suggested by Oleg, remove the thread::fpu pointer, as we can calculate it via x86_task_fpu() at compile-time. This improves code generation a bit: kepler:~/tip> size vmlinux.before vmlinux.after text data bss dec hex filename 26475405 10435342 1740804 38651551 24dc69f vmlinux.before 26475339 10959630 1216516 38651485 24dc65d vmlinux.after Suggested-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Uros Bizjak Link: https://lore.kernel.org/r/20250409211127.3544993-5-mingo@kernel.org --- arch/x86/include/asm/processor.h | 5 +---- arch/x86/kernel/fpu/core.c | 4 +--- arch/x86/kernel/fpu/init.c | 1 - arch/x86/kernel/process.c | 2 -- arch/x86/kernel/vmlinux.lds.S | 4 ++++ 5 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ea7e5d2c4de..b7f7c9c83409 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -514,12 +514,9 @@ struct thread_struct { struct thread_shstk shstk; #endif - - /* Floating point and extended processor state */ - struct fpu *fpu; }; -#define x86_task_fpu(task) ((task)->thread.fpu) +#define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task)))) /* * X86 doesn't need any embedded-FPU-struct quirks: diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 853a738fdf2d..974b276ff0da 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -600,13 +600,11 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, * This is safe because task_struct size is a multiple of cacheline size. */ struct fpu *src_fpu = x86_task_fpu(current); - struct fpu *dst_fpu = (void *)dst + sizeof(*dst); + struct fpu *dst_fpu = x86_task_fpu(dst); BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); BUG_ON(!src_fpu); - dst->thread.fpu = dst_fpu; - /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 848ea79886ba..da41a1d2c40f 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -76,7 +76,6 @@ static struct fpu x86_init_fpu __attribute__ ((aligned (64))) __read_mostly; static void __init fpu__init_system_early_generic(void) { fpstate_reset(&x86_init_fpu); - current->thread.fpu = &x86_init_fpu; set_thread_flag(TIF_NEED_FPU_LOAD); x86_init_fpu.last_cpu = -1; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3ce4cce46f3f..88868a90459e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -102,8 +102,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - /* Drop the copied pointer to current's fpstate */ - dst->thread.fpu = NULL; return 0; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ccdc45e5b759..d9ca2d1754da 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -181,6 +181,10 @@ SECTIONS /* equivalent to task_pt_regs(&init_task) */ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE; + __x86_init_fpu_begin = .; + . = __x86_init_fpu_begin + 128*PAGE_SIZE; + __x86_init_fpu_end = .; + #ifdef CONFIG_X86_32 /* 32 bit has nosave before _edata */ NOSAVE_DATA From ec2227e03a46a162f2721917262adc553b212e2d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:24 +0200 Subject: [PATCH 09/32] x86/fpu: Push 'fpu' pointer calculation into the fpu__drop() call This encapsulates the fpu__drop() functionality better, and it will also enable other changes that want to check a task for PF_KTHREAD before calling x86_task_fpu(). Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250409211127.3544993-6-mingo@kernel.org --- arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/kernel/fpu/core.c | 4 +++- arch/x86/kernel/process.c | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 1feaa68b7567..5fd12634bcc4 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -10,7 +10,7 @@ #include extern void save_fpregs_to_fpstate(struct fpu *fpu); -extern void fpu__drop(struct fpu *fpu); +extern void fpu__drop(struct task_struct *tsk); extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long shstk_addr); extern void fpu_flush_thread(void); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 974b276ff0da..e4c20908ee49 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -681,8 +681,10 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, * a state-restore is coming: either an explicit one, * or a reschedule. */ -void fpu__drop(struct fpu *fpu) +void fpu__drop(struct task_struct *tsk) { + struct fpu *fpu = x86_task_fpu(tsk); + preempt_disable(); if (fpu == x86_task_fpu(current)) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 88868a90459e..5fb502c97b08 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -120,7 +120,6 @@ void arch_release_task_struct(struct task_struct *tsk) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - struct fpu *fpu = x86_task_fpu(tsk); if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); @@ -128,7 +127,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); shstk_free(tsk); - fpu__drop(fpu); + fpu__drop(tsk); } static int set_new_tls(struct task_struct *p, unsigned long tls) From c360bdc593b8a8b6e94166026728764085919cff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:25 +0200 Subject: [PATCH 10/32] x86/fpu: Make sure x86_task_fpu() doesn't get called for PF_KTHREAD|PF_USER_WORKER tasks during exit fpu__drop() and arch_release_task_struct() calls x86_task_fpu() unconditionally, while the FPU context area will not be present if it's the init task, and should not be in use when it's some other type of kthread. Return early for PF_KTHREAD or PF_USER_WORKER tasks. The debug warning in x86_task_fpu() will catch any kthreads attempting to use the FPU save area. Fixed-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250409211127.3544993-7-mingo@kernel.org --- arch/x86/kernel/fpu/core.c | 8 +++++++- arch/x86/kernel/process.c | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index e4c20908ee49..4a2193892e5d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -683,7 +683,13 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, */ void fpu__drop(struct task_struct *tsk) { - struct fpu *fpu = x86_task_fpu(tsk); + struct fpu *fpu; + + /* PF_KTHREAD tasks do not use the FPU context area: */ + if (tsk->flags & (PF_KTHREAD | PF_USER_WORKER)) + return; + + fpu = x86_task_fpu(tsk); preempt_disable(); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 5fb502c97b08..7a1bfb61d86f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -109,7 +109,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_X86_64 void arch_release_task_struct(struct task_struct *tsk) { - if (fpu_state_size_dynamic()) + if (fpu_state_size_dynamic() && !(tsk->flags & (PF_KTHREAD | PF_USER_WORKER))) fpstate_free(x86_task_fpu(tsk)); } #endif From 22aafe3bcb67472effdea1ccf0df20280192bbaf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:26 +0200 Subject: [PATCH 11/32] x86/fpu: Remove init_task FPU state dependencies, add debugging warning for PF_KTHREAD tasks init_task's FPU state initialization was a bit of a hack: __x86_init_fpu_begin = .; . = __x86_init_fpu_begin + 128*PAGE_SIZE; __x86_init_fpu_end = .; But the init task isn't supposed to be using the FPU context in any case, so remove the hack and add in some debug warnings. As Linus noted in the discussion, the init task (and other PF_KTHREAD tasks) *can* use the FPU via kernel_fpu_begin()/_end(), but they don't need the context area because their FPU use is not preemptible or reentrant, and they don't return to user-space. Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Uros Bizjak Link: https://lore.kernel.org/r/20250409211127.3544993-8-mingo@kernel.org --- arch/x86/include/asm/processor.h | 6 +++++- arch/x86/kernel/fpu/core.c | 15 +++++++++++---- arch/x86/kernel/fpu/init.c | 3 +-- arch/x86/kernel/fpu/xstate.c | 3 --- arch/x86/kernel/vmlinux.lds.S | 4 ---- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b7f7c9c83409..eaa7214d6953 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -516,7 +516,11 @@ struct thread_struct { #endif }; -#define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task)))) +#ifdef CONFIG_X86_DEBUG_FPU +extern struct fpu *x86_task_fpu(struct task_struct *task); +#else +# define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task)))) +#endif /* * X86 doesn't need any embedded-FPU-struct quirks: diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4a2193892e5d..4d1a205b7ce2 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -51,6 +51,16 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); +#ifdef CONFIG_X86_DEBUG_FPU +struct fpu *x86_task_fpu(struct task_struct *task) +{ + if (WARN_ON_ONCE(task->flags & PF_KTHREAD)) + return NULL; + + return (void *)task + sizeof(*task); +} +#endif + /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? @@ -599,11 +609,9 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, * * This is safe because task_struct size is a multiple of cacheline size. */ - struct fpu *src_fpu = x86_task_fpu(current); - struct fpu *dst_fpu = x86_task_fpu(dst); + struct fpu *dst_fpu = (void *)dst + sizeof(*dst); BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); - BUG_ON(!src_fpu); /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; @@ -666,7 +674,6 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, if (update_fpu_shstk(dst, ssp)) return 1; - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index da41a1d2c40f..16b6611634c3 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(&x86_task_fpu(current)->fpstate->regs.soft); + ; else #endif asm volatile ("fninit"); @@ -207,7 +207,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_kernel_cfg.default_size = size; fpu_user_cfg.max_size = size; fpu_user_cfg.default_size = size; - fpstate_reset(x86_task_fpu(current)); } /* diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 253da5aec915..4c771b9bd270 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -870,9 +870,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (err) goto out_disable; - /* Reset the state for the current task */ - fpstate_reset(x86_task_fpu(current)); - /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d9ca2d1754da..ccdc45e5b759 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -181,10 +181,6 @@ SECTIONS /* equivalent to task_pt_regs(&init_task) */ __top_init_kernel_stack = __end_init_stack - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE; - __x86_init_fpu_begin = .; - . = __x86_init_fpu_begin + 128*PAGE_SIZE; - __x86_init_fpu_end = .; - #ifdef CONFIG_X86_32 /* 32 bit has nosave before _edata */ NOSAVE_DATA From 8b2a7a7294b34fa00adbecbd352ef19eb780261b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Apr 2025 23:11:27 +0200 Subject: [PATCH 12/32] x86/fpu: Use 'fpstate' variable names consistently A few uses of 'fps' snuck in, which is rather confusing (to me) as it suggests frames-per-second. ;-) Rename them to the canonical 'fpstate' name. No change in functionality. Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S. Bae Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250409211127.3544993-9-mingo@kernel.org --- arch/x86/include/asm/fpu/api.h | 2 +- arch/x86/kernel/fpu/core.c | 14 +++++++------- arch/x86/kernel/fpu/xstate.c | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index f42de5f05e7e..8e6848f55dcd 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -136,7 +136,7 @@ static inline void fpstate_free(struct fpu *fpu) { } #endif /* fpstate-related functions which are exported to KVM */ -extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); +extern void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature); extern u64 xstate_get_guest_group_perm(void); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 4d1a205b7ce2..d0a45f6492cb 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -273,16 +273,16 @@ EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { - struct fpstate *fps = gfpu->fpstate; + struct fpstate *fpstate = gfpu->fpstate; - if (!fps) + if (!fpstate) return; - if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use)) return; gfpu->fpstate = NULL; - vfree(fps); + vfree(fpstate); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); @@ -333,12 +333,12 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); */ void fpu_sync_guest_vmexit_xfd_state(void) { - struct fpstate *fps = x86_task_fpu(current)->fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { - rdmsrl(MSR_IA32_XFD, fps->xfd); - __this_cpu_write(xfd_state, fps->xfd); + rdmsrl(MSR_IA32_XFD, fpstate->xfd); + __this_cpu_write(xfd_state, fpstate->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4c771b9bd270..a288597065fd 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1431,9 +1431,9 @@ void xrstors(struct xregs_state *xstate, u64 mask) } #if IS_ENABLED(CONFIG_KVM) -void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { - void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); From e3a52b67f54aa36ab21265eeea016460b5fe1c46 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 10 Apr 2025 12:52:16 +0200 Subject: [PATCH 13/32] x86/fpu: Clarify FPU context cacheline alignment Suggested-by: Peter Zijlstra Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Chang S. Bae Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/Z_ejggklB5-IWB5W@gmail.com --- arch/x86/kernel/fpu/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index d0a45f6492cb..3a19877a314e 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -607,7 +607,8 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, * We allocate the new FPU structure right after the end of the task struct. * task allocation size already took this into account. * - * This is safe because task_struct size is a multiple of cacheline size. + * This is safe because task_struct size is a multiple of cacheline size, + * thus x86_task_fpu() will always be cacheline aligned as well. */ struct fpu *dst_fpu = (void *)dst + sizeof(*dst); From b02dc185ee86836cf1d8a37b81349374e4018ee0 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:51 -0700 Subject: [PATCH 14/32] x86/cpufeatures: Add X86_FEATURE_APX Intel Advanced Performance Extensions (APX) introduce a new set of general-purpose registers, managed as an extended state component via the xstate management facility. Before enabling this new xstate, define a feature flag to clarify the dependency in xsave_cpuid_features[]. APX is enumerated under CPUID level 7 with EDX=1. Since this CPUID leaf is not yet allocated, place the flag in a scattered feature word. While this feature is intended only for userspace, exposing it via /proc/cpuinfo is unnecessary. Instead, the existing arch_prctl(2) mechanism with the ARCH_GET_XCOMP_SUPP option can be used to query the feature availability. Finally, clarify that APX depends on XSAVE. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-2-chang.seok.bae@intel.com --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/cpuid-deps.c | 1 + arch/x86/kernel/cpu/scattered.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bc81b9d1aeca..478ab362fda2 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -481,6 +481,7 @@ #define X86_FEATURE_AMD_HTR_CORES (21*32+ 6) /* Heterogeneous Core Topology */ #define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32+ 7) /* Workload Classification */ #define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ +#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ /* * BUG word(s) diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 94c062cddfa4..46efcbd6afa4 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_APX, X86_FEATURE_XSAVE }, { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, { X86_FEATURE_MMX, X86_FEATURE_FXSR }, { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index c75c57b32b74..dbf6d71bdf18 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, From bd0b10b795c5c4c587e83c0498251356874c655c Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:52 -0700 Subject: [PATCH 15/32] x86/fpu/apx: Define APX state component Advanced Performance Extensions (APX) is associated with a new state component number 19. To support saving and restoring of the corresponding registers via the XSAVE mechanism, introduce the component definition along with the necessary sanity checks. Define the new component number, state name, and those register data type. Then, extend the size checker to validate the register data type and explicitly list the APX feature flag as a dependency for the new component in xsave_cpuid_features[]. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-3-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/types.h | 9 +++++++++ arch/x86/kernel/fpu/xstate.c | 3 +++ 2 files changed, 12 insertions(+) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index de16862bf230..97310df3ea13 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -125,6 +125,7 @@ enum xfeature { XFEATURE_RSRVD_COMP_16, XFEATURE_XTILE_CFG, XFEATURE_XTILE_DATA, + XFEATURE_APX, XFEATURE_MAX, }; @@ -145,6 +146,7 @@ enum xfeature { #define XFEATURE_MASK_LBR (1 << XFEATURE_LBR) #define XFEATURE_MASK_XTILE_CFG (1 << XFEATURE_XTILE_CFG) #define XFEATURE_MASK_XTILE_DATA (1 << XFEATURE_XTILE_DATA) +#define XFEATURE_MASK_APX (1 << XFEATURE_APX) #define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE) #define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \ @@ -303,6 +305,13 @@ struct xtile_data { struct reg_1024_byte tmm; } __packed; +/* + * State component 19: 8B extended general purpose register. + */ +struct apx_state { + u64 egpr[16]; +} __packed; + /* * State component 10 is supervisor state used for context-switching the * PASID state. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a288597065fd..dfd07af10037 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -63,6 +63,7 @@ static const char *xfeature_names[] = "unknown xstate feature", "AMX Tile config", "AMX Tile data", + "APX registers", "unknown xstate feature", }; @@ -81,6 +82,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, + [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -569,6 +571,7 @@ static bool __init check_xstate_against_struct(int nr) case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); From ea68e39190cff86f457bd286c70b535e2a99a94d Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:53 -0700 Subject: [PATCH 16/32] x86/fpu/apx: Disallow conflicting MPX presence XSTATE components are architecturally independent. There is no rule requiring their offsets in the non-compacted format to be strictly ascending or mutually non-overlapping. However, in practice, such overlaps have not occurred -- until now. APX is introduced as xstate component 19, following AMX. In the non-compacted XSAVE format, its offset overlaps with the space previously occupied by the now-deprecated MPX feature: 45fc24e89b7c ("x86/mpx: remove MPX from arch/x86") To prevent conflicts, the kernel must ensure the CPU never expose both features at the same time. If so, it indicates unreliable hardware. In such cases, XSAVE should be disabled entirely as a precautionary measure. Add a sanity check to detect this condition and disable XSAVE if an invalid hardware configuration is identified. Note: MPX state components remain enabled on legacy systems solely for KVM guest support. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-4-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index dfd07af10037..14f5c1bb2080 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -814,6 +814,17 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && + fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { + /* + * This is a problematic CPU configuration where two + * conflicting state components are both enumerated. + */ + pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx, disabling XSAVE.\n", + fpu_kernel_cfg.max_features); + goto out_disable; + } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; From 50c5b071e2833d2b61e3774cd792620311df157c Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:54 -0700 Subject: [PATCH 17/32] x86/fpu/apx: Enable APX state support With securing APX against conflicting MPX, it is now ready to be enabled. Include APX in the enabled xfeature set. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-5-chang.seok.bae@intel.com --- arch/x86/include/asm/fpu/xstate.h | 3 ++- arch/x86/kernel/fpu/xstate.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 7f39fe7980c5..b308a76afbb7 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -32,7 +32,8 @@ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_XTILE) + XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_APX) /* * Features which are restored when returning to user space. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 14f5c1bb2080..2ac1fc182273 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -371,7 +371,8 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ - XFEATURE_MASK_XTILE) + XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_APX) /* * setup the xstate image representing the init state From ab6f87ddd0c6d3fb114cdf897eb9839cbd429439 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:55 -0700 Subject: [PATCH 18/32] selftests/x86/apx: Add APX test The extended general-purpose registers for APX may contain random data, which is currently assumed by the xstate testing framework. This allows the testing of the new userspace feature using the common test code. Invoke the test entry function from apx.c after enumerating the state component and adding it to the support list Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Reviewed-by: Sohil Mehta Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-6-chang.seok.bae@intel.com --- tools/testing/selftests/x86/Makefile | 3 ++- tools/testing/selftests/x86/apx.c | 10 ++++++++++ tools/testing/selftests/x86/xstate.c | 3 ++- tools/testing/selftests/x86/xstate.h | 2 ++ 4 files changed, 16 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/x86/apx.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 28422c32cc8f..f703fcfe9f7c 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -19,7 +19,7 @@ TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering \ - corrupt_xstate_header amx lam test_shadow_stack avx + corrupt_xstate_header amx lam test_shadow_stack avx apx # Some selftests require 32bit support enabled also on 64bit systems TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall @@ -136,3 +136,4 @@ $(OUTPUT)/nx_stack_64: CFLAGS += -Wl,-z,noexecstack $(OUTPUT)/avx_64: CFLAGS += -mno-avx -mno-avx512f $(OUTPUT)/amx_64: EXTRA_FILES += xstate.c $(OUTPUT)/avx_64: EXTRA_FILES += xstate.c +$(OUTPUT)/apx_64: EXTRA_FILES += xstate.c diff --git a/tools/testing/selftests/x86/apx.c b/tools/testing/selftests/x86/apx.c new file mode 100644 index 000000000000..d9c8d41b8c5a --- /dev/null +++ b/tools/testing/selftests/x86/apx.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include "xstate.h" + +int main(void) +{ + test_xstate(XFEATURE_APX); +} diff --git a/tools/testing/selftests/x86/xstate.c b/tools/testing/selftests/x86/xstate.c index 23c1d6c964ea..97fe4bd8bc77 100644 --- a/tools/testing/selftests/x86/xstate.c +++ b/tools/testing/selftests/x86/xstate.c @@ -31,7 +31,8 @@ (1 << XFEATURE_OPMASK) | \ (1 << XFEATURE_ZMM_Hi256) | \ (1 << XFEATURE_Hi16_ZMM) | \ - (1 << XFEATURE_XTILEDATA)) + (1 << XFEATURE_XTILEDATA) | \ + (1 << XFEATURE_APX)) static inline uint64_t xgetbv(uint32_t index) { diff --git a/tools/testing/selftests/x86/xstate.h b/tools/testing/selftests/x86/xstate.h index 42af36ec852f..e91e3092b5d2 100644 --- a/tools/testing/selftests/x86/xstate.h +++ b/tools/testing/selftests/x86/xstate.h @@ -33,6 +33,7 @@ enum xfeature { XFEATURE_RSRVD_COMP_16, XFEATURE_XTILECFG, XFEATURE_XTILEDATA, + XFEATURE_APX, XFEATURE_MAX, }; @@ -59,6 +60,7 @@ static const char *xfeature_names[] = "unknown xstate feature", "AMX Tile config", "AMX Tile data", + "APX registers", "unknown xstate feature", }; From 39cd7fad39ce2ffbeb21939c805ee55f3ec808d4 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:56 -0700 Subject: [PATCH 19/32] x86/fpu: Log XSAVE disablement consistently Not all paths that lead to fpu__init_disable_system_xstate() currently emit a message indicating that XSAVE has been disabled. Move the print statement into the function to ensure the message in all cases. Suggested-by: Dave Hansen Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250416021720.12305-7-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 2ac1fc182273..8b14c9d3a1df 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -751,6 +751,8 @@ static int __init init_xstate_size(void) */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { + pr_info("x86/fpu: XSAVE disabled\n"); + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); @@ -821,7 +823,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * This is a problematic CPU configuration where two * conflicting state components are both enumerated. */ - pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx, disabling XSAVE.\n", + pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } @@ -900,7 +902,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { - pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } @@ -912,7 +914,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { - pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } From 64e54461ab6e8524a8de4e63b7d1a3e4481b5cf3 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:57 -0700 Subject: [PATCH 20/32] x86/fpu: Refactor xfeature bitmask update code for sigframe XSAVE Currently, saving register states in the signal frame, the legacy feature bits are always set in xregs_state->header->xfeatures. This code sequence can be generalized for reuse in similar cases. Refactor the logic to ensure a consistent approach across similar usages. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-8-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/signal.c | 11 +---------- arch/x86/kernel/fpu/xstate.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index b8b4fa9c2d04..c3ec2512f2bb 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -114,7 +114,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, { struct xregs_state __user *x = buf; struct _fpx_sw_bytes sw_bytes = {}; - u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ @@ -127,12 +126,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *)(buf + fpstate->user_size)); - /* - * Read the xfeatures which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. This will @@ -144,9 +137,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ - xfeatures |= XFEATURE_MASK_FPSSE; - - err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE); return !err; } diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 9a3a8ccf13bf..4231e449f0b4 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -69,6 +69,19 @@ static inline u64 xfeatures_mask_independent(void) return fpu_kernel_cfg.independent_features; } +static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask) +{ + u64 xfeatures; + int err; + + /* Read the xfeatures value already saved in the user buffer */ + err = __get_user(xfeatures, &xbuf->header.xfeatures); + xfeatures |= mask; + err |= __put_user(xfeatures, &xbuf->header.xfeatures); + + return err; +} + /* * Update the value of PKRU register that was already pushed onto the signal frame. */ From d1e420772cd1eb0afe5858619c73ce36f3e781a1 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:58 -0700 Subject: [PATCH 21/32] x86/pkeys: Simplify PKRU update in signal frame The signal delivery logic was modified to always set the PKRU bit in xregs_state->header->xfeatures by this commit: ae6012d72fa6 ("x86/pkeys: Ensure updated PKRU value is XRSTOR'd") However, the change derives the bitmask value using XGETBV(1), rather than simply updating the buffer that already holds the value. Thus, this approach induces an unnecessary dependency on XGETBV1 for PKRU handling. Eliminate the dependency by using the established helper function. Subsequently, remove the now-unused 'mask' argument. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Aruna Ramakrishna Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Tony W Wang-oc Cc: Dave Hansen Link: https://lore.kernel.org/r/20250416021720.12305-9-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/xstate.h | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 4231e449f0b4..a0256ef34ecb 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -85,18 +85,15 @@ static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 /* * Update the value of PKRU register that was already pushed onto the signal frame. */ -static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru) +static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) { - u64 xstate_bv; int err; if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) return 0; /* Mark PKRU as in-use so that it is restored correctly. */ - xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU; - - err = __put_user(xstate_bv, &buf->header.xfeatures); + err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU); if (err) return err; @@ -320,7 +317,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr clac(); if (!err) - err = update_pkru_in_sigframe(buf, mask, pkru); + err = update_pkru_in_sigframe(buf, pkru); return err; } From 70fe4a0266ef156f3a49071da0d9ea6af0f49c44 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:16:59 -0700 Subject: [PATCH 22/32] x86/fpu: Remove export of mxcsr_feature_mask The variable was previously referenced in KVM code but the last usage was removed by: ea4d6938d4c0 ("x86/fpu: Replace KVMs home brewed FPU copy from user") Remove its export symbol. Reviewed-by: Nikolay Borisov Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-10-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 16b6611634c3..2d9b5e677559 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -100,7 +100,6 @@ static void __init fpu__init_system_early_generic(void) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; -EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { From de8304c319bc020ef79d109909ad40e944d82c82 Mon Sep 17 00:00:00 2001 From: "Chang S. Bae" Date: Tue, 15 Apr 2025 19:17:00 -0700 Subject: [PATCH 23/32] x86/fpu: Rename fpu_reset_fpregs() to fpu_reset_fpstate_regs() The original function name came from an overly compressed form of 'fpstate_regs' by commit: e61d6310a0f8 ("x86/fpu: Reset permission and fpstate on exec()") However, the term 'fpregs' typically refers to physical FPU registers. In contrast, this function copies the init values to fpu->fpstate->regs, not hardware registers. Rename the function to better reflect what it actually does. No functional change. Signed-off-by: Chang S. Bae Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Link: https://lore.kernel.org/r/20250416021720.12305-11-chang.seok.bae@intel.com --- arch/x86/kernel/fpu/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 3a19877a314e..8d674435f173 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -733,7 +733,7 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpregs(void) +static void fpu_reset_fpstate_regs(void) { struct fpu *fpu = x86_task_fpu(current); @@ -768,7 +768,7 @@ void fpu__clear_user_states(struct fpu *fpu) fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpregs(); + fpu_reset_fpstate_regs(); fpregs_unlock(); return; } @@ -798,7 +798,7 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { fpstate_reset(x86_task_fpu(current)); - fpu_reset_fpregs(); + fpu_reset_fpstate_regs(); } /* * Load FPU context before returning to userspace. From 730faa15a069f4025a0f8c2a5244c3067da7ecbe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 3 May 2025 16:38:30 +0200 Subject: [PATCH 24/32] x86/fpu: Simplify the switch_fpu_prepare() + switch_fpu_finish() logic Now that switch_fpu_finish() doesn't load the FPU state, it makes more sense to fold it into switch_fpu_prepare() renamed to switch_fpu(), and more importantly, use the "prev_p" task as a target for TIF_NEED_FPU_LOAD. It doesn't make any sense to delay set_tsk_thread_flag(TIF_NEED_FPU_LOAD) until "prev_p" is scheduled again. There is no worry about the very first context switch, fpu_clone() must always set TIF_NEED_FPU_LOAD. Also, shift the test_tsk_thread_flag(TIF_NEED_FPU_LOAD) from the callers to switch_fpu(). Note that the "PF_KTHREAD | PF_USER_WORKER" check can be removed but this deserves a separate patch which can change more functions, say, kernel_fpu_begin_mask(). Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Chang S . Bae Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250503143830.GA8982@redhat.com --- arch/x86/include/asm/fpu/sched.h | 34 +++++++++----------------------- arch/x86/kernel/process_32.c | 5 +---- arch/x86/kernel/process_64.c | 5 +---- 3 files changed, 11 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 5fd12634bcc4..c060549c6c94 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -18,31 +18,25 @@ extern void fpu_flush_thread(void); /* * FPU state switching for scheduling. * - * This is a two-stage process: + * switch_fpu() saves the old state and sets TIF_NEED_FPU_LOAD if + * TIF_NEED_FPU_LOAD is not set. This is done within the context + * of the old process. * - * - switch_fpu_prepare() saves the old state. - * This is done within the context of the old process. - * - * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state - * will get loaded on return to userspace, or when the kernel needs it. - * - * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers - * are saved in the current thread's FPU register state. - * - * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not - * hold current()'s FPU registers. It is required to load the + * Once TIF_NEED_FPU_LOAD is set, it is required to load the * registers before returning to userland or using the content * otherwise. * * The FPU context is only stored/restored for a user task and * PF_KTHREAD is used to distinguish between kernel and user threads. */ -static inline void switch_fpu_prepare(struct task_struct *old, int cpu) +static inline void switch_fpu(struct task_struct *old, int cpu) { - if (cpu_feature_enabled(X86_FEATURE_FPU) && + if (!test_tsk_thread_flag(old, TIF_NEED_FPU_LOAD) && + cpu_feature_enabled(X86_FEATURE_FPU) && !(old->flags & (PF_KTHREAD | PF_USER_WORKER))) { struct fpu *old_fpu = x86_task_fpu(old); + set_tsk_thread_flag(old, TIF_NEED_FPU_LOAD); save_fpregs_to_fpstate(old_fpu); /* * The save operation preserved register state, so the @@ -50,7 +44,7 @@ static inline void switch_fpu_prepare(struct task_struct *old, int cpu) * current CPU number in @old_fpu, so the next return * to user space can avoid the FPU register restore * when is returns on the same CPU and still owns the - * context. + * context. See fpregs_restore_userregs(). */ old_fpu->last_cpu = cpu; @@ -58,14 +52,4 @@ static inline void switch_fpu_prepare(struct task_struct *old, int cpu) } } -/* - * Delay loading of the complete FPU state until the return to userland. - * PKRU is handled separately. - */ -static inline void switch_fpu_finish(struct task_struct *new) -{ - if (cpu_feature_enabled(X86_FEATURE_FPU)) - set_tsk_thread_flag(new, TIF_NEED_FPU_LOAD); -} - #endif /* _ASM_X86_FPU_SCHED_H */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4636ef359973..9bd4fa694da5 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -160,8 +160,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -208,8 +207,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); - switch_fpu_finish(next_p); - /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7196ca7048be..d55310d3133c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -616,8 +616,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(hardirq_stack_inuse)); - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -671,8 +670,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_p); - /* Reload sp0. */ update_task_stack(next_p); From 392bbe11c7cf90e65cba32e90af3b969a981c4fe Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 3 May 2025 16:38:37 +0200 Subject: [PATCH 25/32] x86/fpu: Remove x86_init_fpu It is not actually used after: 55bc30f2e34d ("x86/fpu: Remove the thread::fpu pointer") Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Chang S . Bae Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250503143837.GA8985@redhat.com --- arch/x86/kernel/fpu/init.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2d9b5e677559..6bb3e35c40e2 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -71,13 +71,9 @@ static bool __init fpu__probe_without_cpuid(void) return fsw == 0 && (fcw & 0x103f) == 0x003f; } -static struct fpu x86_init_fpu __attribute__ ((aligned (64))) __read_mostly; - static void __init fpu__init_system_early_generic(void) { - fpstate_reset(&x86_init_fpu); set_thread_flag(TIF_NEED_FPU_LOAD); - x86_init_fpu.last_cpu = -1; if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { From 8e269c030ecafbfebf4f55e24fb336fd7b489708 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 3 May 2025 16:38:43 +0200 Subject: [PATCH 26/32] x86/fpu: Remove DEFINE_EVENT(x86_fpu, x86_fpu_copy_src) trace_x86_fpu_copy_src() has no users after: 22aafe3bcb67 ("x86/fpu: Remove init_task FPU state dependencies, add debugging warning for PF_KTHREAD tasks") Remove the event. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Chang S . Bae Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250503143843.GA8989@redhat.com --- arch/x86/include/asm/trace/fpu.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 4645a6334063..0454d5e60e5d 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -74,11 +74,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_copy_src, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_copy_dst, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) From 2d299e3d773d519ee93e5aaa3ffddd4a6276b005 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 3 May 2025 16:38:50 +0200 Subject: [PATCH 27/32] x86/fpu: Always use memcpy_and_pad() in arch_dup_task_struct() It makes no sense to copy the bytes after sizeof(struct task_struct), FPU state will be initialized in fpu_clone(). A plain memcpy(dst, src, sizeof(struct task_struct)) should work too, but "_and_pad" looks safer. [ mingo: Simplify it a bit more. ] Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Brian Gerst Cc: Chang S . Bae Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250503143850.GA8997@redhat.com --- arch/x86/kernel/process.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7a1bfb61d86f..9e6180777565 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -93,11 +93,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - /* init_task is not dynamically sized (incomplete FPU state) */ - if (unlikely(src == &init_task)) - memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(init_task), 0); - else - memcpy(dst, src, arch_task_struct_size); + /* fpu_clone() will initialize the "dst_fpu" memory */ + memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(*dst), 0); #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; From 016a2e6f8ae5ed544ba8fb2b6d78f64ddfd9d01b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 3 May 2025 16:38:56 +0200 Subject: [PATCH 28/32] x86/fpu: Check TIF_NEED_FPU_LOAD instead of PF_KTHREAD|PF_USER_WORKER in fpu__drop() PF_KTHREAD|PF_USER_WORKER tasks should never clear TIF_NEED_FPU_LOAD, so the TIF_NEED_FPU_LOAD check should equally filter them out. And this way an exiting userspace task can avoid the unnecessary "fwait" if it does context_switch() at least once on its way to exit_thread(). Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Chang S . Bae Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250503143856.GA9009@redhat.com --- arch/x86/kernel/fpu/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 8d674435f173..fa131299c7da 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -693,8 +693,7 @@ void fpu__drop(struct task_struct *tsk) { struct fpu *fpu; - /* PF_KTHREAD tasks do not use the FPU context area: */ - if (tsk->flags & (PF_KTHREAD | PF_USER_WORKER)) + if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD)) return; fpu = x86_task_fpu(tsk); From 46c158e3ad0fc633007802c338c409c188ec0a12 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 3 May 2025 16:39:02 +0200 Subject: [PATCH 29/32] x86/fpu: Shift fpregs_assert_state_consistent() from arch_exit_work() to its caller If CONFIG_X86_DEBUG_FPU=Y, arch_exit_to_user_mode_prepare() calls arch_exit_work() even if ti_work == 0. There only reason is that we want to call fpregs_assert_state_consistent() if TIF_NEED_FPU_LOAD is not set. This looks confusing. arch_exit_to_user_mode_prepare() can just call fpregs_assert_state_consistent() unconditionally, it depends on CONFIG_X86_DEBUG_FPU and checks TIF_NEED_FPU_LOAD itself. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar Cc: Chang S . Bae Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250503143902.GA9012@redhat.com --- arch/x86/include/asm/entry-common.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 77d20555e04d..d535a97c7284 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -53,7 +53,6 @@ static inline void arch_exit_work(unsigned long ti_work) if (unlikely(ti_work & _TIF_IO_BITMAP)) tss_update_io_bitmap(); - fpregs_assert_state_consistent(); if (unlikely(ti_work & _TIF_NEED_FPU_LOAD)) switch_fpu_return(); } @@ -61,7 +60,9 @@ static inline void arch_exit_work(unsigned long ti_work) static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) { - if (IS_ENABLED(CONFIG_X86_DEBUG_FPU) || unlikely(ti_work)) + fpregs_assert_state_consistent(); + + if (unlikely(ti_work)) arch_exit_work(ti_work); fred_update_rsp0(); From 960bc2bcba5987a82530b9756e1f602a894cffa4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 4 May 2025 15:30:38 -0700 Subject: [PATCH 30/32] x86/fpu: Restore fpu_thread_struct_whitelist() to fix CONFIG_HARDENED_USERCOPY=y crash Borislav Petkov reported the following boot crash on x86-32, with CONFIG_HARDENED_USERCOPY=y: | usercopy: Kernel memory overwrite attempt detected to SLUB object 'task_struct' (offset 2112, size 160)! | ... | kernel BUG at mm/usercopy.c:102! So the useroffset and usersize arguments are what control the allowed window of copying in/out of the "task_struct" kmem cache: /* create a slab on which task_structs can be allocated */ task_struct_whitelist(&useroffset, &usersize); task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); task_struct_whitelist() positions this window based on the location of the thread_struct within task_struct, and gets the arch-specific details via arch_thread_struct_whitelist(offset, size): static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ arch_thread_struct_whitelist(offset, size); /* * Handle zero-sized whitelist or empty thread_struct, otherwise * adjust offset to position of thread_struct in task_struct. */ if (unlikely(*size == 0)) *offset = 0; else *offset += offsetof(struct task_struct, thread); } Commit cb7ca40a3882 ("x86/fpu: Make task_struct::thread constant size") removed the logic for the window, leaving: static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; *size = 0; } So now there is no window that usercopy hardening will allow to be copied in/out of task_struct. But as reported above, there *is* a copy in copy_uabi_to_xstate(). (It seems there are several, actually.) int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } This appears to be writing into x86_task_fpu(tsk)->fpstate. With or without CONFIG_X86_DEBUG_FPU, this resolves to: ((struct fpu *)((void *)(task) + sizeof(*(task)))) i.e. the memory "after task_struct" is cast to "struct fpu", and the uses the "fpstate" pointer. How that pointer gets set looks to be variable, but I think the one we care about here is: fpu->fpstate = &fpu->__fpstate; And struct fpu::__fpstate says: struct fpstate __fpstate; /* * WARNING: '__fpstate' is dynamically-sized. Do not put * anything after it here. */ So we're still dealing with a dynamically sized thing, even if it's not within the literal struct task_struct -- it's still in the kmem cache, though. Looking at the kmem cache size, it has allocated "arch_task_struct_size" bytes, which is calculated in fpu__init_task_struct_size(): int task_size = sizeof(struct task_struct); task_size += sizeof(struct fpu); /* * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ task_size -= sizeof(union fpregs_state); /* * Add back the dynamically-calculated register state * size. */ task_size += fpu_kernel_cfg.default_size; /* * We dynamically size 'struct fpu', so we require that * 'state' be at the end of 'it: */ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); arch_task_struct_size = task_size; So, this is still copying out of the kmem cache for task_struct, and the window seems unchanged (still fpu regs). This is what the window was before: void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); *size = fpu_kernel_cfg.default_size; } And the same commit I mentioned above removed it. I think the misunderstanding is here: | The fpu_thread_struct_whitelist() quirk to hardened usercopy can be removed, | now that the FPU structure is not embedded in the task struct anymore, which | reduces text footprint a bit. Yes, FPU is no longer in task_struct, but it IS in the kmem cache named "task_struct", since the fpstate is still being allocated there. Partially revert the earlier mentioned commit, along with a recalculation of the fpstate regs location. Fixes: cb7ca40a3882 ("x86/fpu: Make task_struct::thread constant size") Reported-by: Borislav Petkov (AMD) Tested-by: Borislav Petkov (AMD) Signed-off-by: Kees Cook Signed-off-by: Ingo Molnar Cc: Chang S. Bae Cc: Gustavo A. R. Silva Cc: H. Peter Anvin Cc: Andy Lutomirski Cc: Brian Gerst Cc: Linus Torvalds Cc: linux-hardening@vger.kernel.org Link: https://lore.kernel.org/all/20250409211127.3544993-1-mingo@kernel.org/ # Discussion #1 Link: https://lore.kernel.org/r/202505041418.F47130C4C8@keescook # Discussion #2 --- arch/x86/include/asm/processor.h | 12 +++++------- arch/x86/kernel/fpu/core.c | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index eaa7214d6953..ad33903dc92a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -522,14 +522,12 @@ extern struct fpu *x86_task_fpu(struct task_struct *task); # define x86_task_fpu(task) ((struct fpu *)((void *)(task) + sizeof(*(task)))) #endif -/* - * X86 doesn't need any embedded-FPU-struct quirks: - */ -static inline void -arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +extern void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size); + +static inline void arch_thread_struct_whitelist(unsigned long *offset, + unsigned long *size) { - *offset = 0; - *size = 0; + fpu_thread_struct_whitelist(offset, size); } static inline void diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index fa131299c7da..105b1b80d88d 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -680,6 +680,20 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, return 0; } +/* + * While struct fpu is no longer part of struct thread_struct, it is still + * allocated after struct task_struct in the "task_struct" kmem cache. But + * since FPU is expected to be part of struct thread_struct, we have to + * adjust for it here. + */ +void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + /* The allocation follows struct task_struct. */ + *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread); + *offset += offsetof(struct fpu, __fpstate.regs); + *size = fpu_kernel_cfg.default_size; +} + /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents From d8414603b29f25191fcceafb72e74b67ac2e92b1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 6 May 2025 17:36:06 +0800 Subject: [PATCH 31/32] x86/fpu/xstate: Always preserve non-user xfeatures/flags in __state_perm When granting userspace or a KVM guest access to an xfeature, preserve the entity's existing supervisor and software-defined permissions as tracked by __state_perm, i.e. use __state_perm to track *all* permissions even though all supported supervisor xfeatures are granted to all FPUs and FPU_GUEST_PERM_LOCKED disallows changing permissions. Effectively clobbering supervisor permissions results in inconsistent behavior, as xstate_get_group_perm() will report supervisor features for process that do NOT request access to dynamic user xfeatures, whereas any and all supervisor features will be absent from the set of permissions for any process that is granted access to one or more dynamic xfeatures (which right now means AMX). The inconsistency isn't problematic because fpu_xstate_prctl() already strips out everything except user xfeatures: case ARCH_GET_XCOMP_PERM: /* * Lockless snapshot as it can also change right after the * dropping the lock. */ permitted = xstate_get_host_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_GET_XCOMP_GUEST_PERM: permitted = xstate_get_guest_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); and similarly KVM doesn't apply the __state_perm to supervisor states (kvm_get_filtered_xcr0() incorporates xstate_get_guest_group_perm()): case 0xd: { u64 permitted_xcr0 = kvm_get_filtered_xcr0(); u64 permitted_xss = kvm_caps.supported_xss; But if KVM in particular were to ever change, dropping supervisor permissions would result in subtle bugs in KVM's reporting of supported CPUID settings. And the above behavior also means that having supervisor xfeatures in __state_perm is correctly handled by all users. Dropping supervisor permissions also creates another landmine for KVM. If more dynamic user xfeatures are ever added, requesting access to multiple xfeatures in separate ARCH_REQ_XCOMP_GUEST_PERM calls will result in the second invocation of __xstate_request_perm() computing the wrong ksize, as as the mask passed to xstate_calculate_size() would not contain *any* supervisor features. Commit 781c64bfcb73 ("x86/fpu/xstate: Handle supervisor states in XSTATE permissions") fudged around the size issue for userspace FPUs, but for reasons unknown skipped guest FPUs. Lack of a fix for KVM "works" only because KVM doesn't yet support virtualizing features that have supervisor xfeatures, i.e. as of today, KVM guest FPUs will never need the relevant xfeatures. Simply extending the hack-a-fix for guests would temporarily solve the ksize issue, but wouldn't address the inconsistency issue and would leave another lurking pitfall for KVM. KVM support for virtualizing CET will likely add CET_KERNEL as a guest-only xfeature, i.e. CET_KERNEL will not be set in xfeatures_mask_supervisor() and would again be dropped when granting access to dynamic xfeatures. Note, the existing clobbering behavior is rather subtle. The @permitted parameter to __xstate_request_perm() comes from: permitted = xstate_get_group_perm(guest); which is either fpu->guest_perm.__state_perm or fpu->perm.__state_perm, where __state_perm is initialized to: fpu->perm.__state_perm = fpu_kernel_cfg.default_features; and copied to the guest side of things: /* Same defaults for guests */ fpu->guest_perm = fpu->perm; fpu_kernel_cfg.default_features contains everything except the dynamic xfeatures, i.e. everything except XFEATURE_MASK_XTILE_DATA: fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features; fpu_kernel_cfg.default_features &= ~XFEATURE_MASK_USER_DYNAMIC; When __xstate_request_perm() restricts the local "mask" variable to compute the user state size: mask &= XFEATURE_MASK_USER_SUPPORTED; usize = xstate_calculate_size(mask, false); it subtly overwrites the target __state_perm with "mask" containing only user xfeatures: perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ WRITE_ONCE(perm->__state_perm, mask); Signed-off-by: Sean Christopherson Signed-off-by: Yang Weijiang Signed-off-by: Chao Gao Signed-off-by: Ingo Molnar Reviewed-by: Maxim Levitsky Reviewed-by: Rick Edgecombe Reviewed-by: Chang S. Bae Acked-by: Dave Hansen Cc: Andy Lutomirski Cc: David Woodhouse Cc: H. Peter Anvin Cc: John Allen Cc: Linus Torvalds Cc: Mitchell Levy Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Samuel Holland Cc: Sohil Mehta Cc: Vignesh Balasubramanian Cc: Vitaly Kuznetsov Cc: Xin Li Cc: kvm@vger.kernel.org Link: https://lore.kernel.org/all/ZTqgzZl-reO1m01I@google.com Link: https://lore.kernel.org/r/20250506093740.2864458-2-chao.gao@intel.com --- arch/x86/include/asm/fpu/types.h | 8 +++++--- arch/x86/kernel/fpu/xstate.c | 18 +++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 97310df3ea13..e64db0eb9d27 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -416,9 +416,11 @@ struct fpu_state_perm { /* * @__state_perm: * - * This bitmap indicates the permission for state components, which - * are available to a thread group. The permission prctl() sets the - * enabled state bits in thread_group_leader()->thread.fpu. + * This bitmap indicates the permission for state components + * available to a thread group, including both user and supervisor + * components and software-defined bits like FPU_GUEST_PERM_LOCKED. + * The permission prctl() sets the enabled state bits in + * thread_group_leader()->thread.fpu. * * All run time operations use the per thread information in the * currently active fpu.fpstate which contains the xfeature masks diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8b14c9d3a1df..1c8410b68108 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1656,16 +1656,20 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) if ((permitted & requested) == requested) return 0; - /* Calculate the resulting kernel state size */ + /* + * Calculate the resulting kernel state size. Note, @permitted also + * contains supervisor xfeatures even though supervisor are always + * permitted for kernel and guest FPUs, and never permitted for user + * FPUs. + */ mask = permitted | requested; - /* Take supervisor states into account on the host */ - if (!guest) - mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); - /* Calculate the resulting user state size */ - mask &= XFEATURE_MASK_USER_SUPPORTED; - usize = xstate_calculate_size(mask, false); + /* + * Calculate the resulting user state size. Take care not to clobber + * the supervisor xfeatures in the new mask! + */ + usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); From 32d5fa804dc9bd7cf6651a1378ba616d332e7444 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Tue, 6 May 2025 17:36:07 +0800 Subject: [PATCH 32/32] x86/fpu: Drop @perm from guest pseudo FPU container Remove @perm from the guest pseudo FPU container. The field is initialized during allocation and never used later. Rename fpu_init_guest_permissions() to show that its sole purpose is to lock down guest permissions. Suggested-by: Maxim Levitsky Signed-off-by: Chao Gao Signed-off-by: Ingo Molnar Reviewed-by: Chang S. Bae Cc: Andy Lutomirski Cc: David Woodhouse Cc: Eric Biggers Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Mitchell Levy Cc: Oleg Nesterov Cc: Paolo Bonzini Cc: Samuel Holland Cc: Sean Christopherson Cc: Vitaly Kuznetsov Link: https://lore.kernel.org/kvm/af972fe5981b9e7101b64de43c7be0a8cc165323.camel@redhat.com/ Link: https://lore.kernel.org/r/20250506093740.2864458-3-chao.gao@intel.com --- arch/x86/include/asm/fpu/types.h | 7 ------- arch/x86/kernel/fpu/core.c | 7 ++----- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index e64db0eb9d27..1c94121acd3d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -535,13 +535,6 @@ struct fpu_guest { */ u64 xfeatures; - /* - * @perm: xfeature bitmap of features which are - * permitted to be enabled for the guest - * vCPU. - */ - u64 perm; - /* * @xfd_err: Save the guest value. */ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 105b1b80d88d..1cda5b78540b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -212,7 +212,7 @@ void fpu_reset_from_exception_fixup(void) #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); -static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +static void fpu_lock_guest_permissions(void) { struct fpu_state_perm *fpuperm; u64 perm; @@ -228,8 +228,6 @@ static void fpu_init_guest_permissions(struct fpu_guest *gfpu) WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(¤t->sighand->siglock); - - gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) @@ -250,7 +248,6 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_kernel_cfg.default_features; - gfpu->perm = fpu_kernel_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state @@ -265,7 +262,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; - fpu_init_guest_permissions(gfpu); + fpu_lock_guest_permissions(); return true; }