From 4db7384ce55c4d7bfb9876fabd8d8778b2ff90ff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 19 May 2025 14:03:01 -0400 Subject: [PATCH 001/284] btrfs: don't drop a reference if btrfs_check_write_meta_pointer() fails In the zoned mode there's a bug in the extent buffer tree conversion to xarray. The reference for eb is dropped and code continues but the references get dropped by releasing the batch. Reported-by: Johannes Thumshirn Fixes: 19d7f65f032f ("btrfs: convert the buffer_radix to an xarray") Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e43f6280f954..849199768664 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2189,7 +2189,6 @@ int btree_write_cache_pages(struct address_space *mapping, done = 1; break; } - free_extent_buffer(eb); continue; } From 4f6fc782128355931527cefe3eb45338abd8ab39 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2025 12:31:45 +0200 Subject: [PATCH 002/284] perf: Fix sample vs do_exit() Baisheng Gao reported an ARM64 crash, which Mark decoded as being a synchronous external abort -- most likely due to trying to access MMIO in bad ways. The crash further shows perf trying to do a user stack sample while in exit_mmap()'s tlb_finish_mmu() -- i.e. while tearing down the address space it is trying to access. It turns out that we stop perf after we tear down the userspace mm; a receipie for disaster, since perf likes to access userspace for various reasons. Flip this order by moving up where we stop perf in do_exit(). Additionally, harden PERF_SAMPLE_CALLCHAIN and PERF_SAMPLE_STACK_USER to abort when the current task does not have an mm (exit_mm() makes sure to set current->mm = NULL; before commencing with the actual teardown). Such that CPU wide events don't trip on this same problem. Fixes: c5ebcedb566e ("perf: Add ability to attach user stack dump to sample") Reported-by: Baisheng Gao Suggested-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250605110815.GQ39944@noisy.programming.kicks-ass.net --- kernel/events/core.c | 7 +++++++ kernel/exit.c | 17 +++++++++-------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index f34c99f8ce8f..26dec1d8a5ab 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7439,6 +7439,10 @@ perf_sample_ustack_size(u16 stack_size, u16 header_size, if (!regs) return 0; + /* No mm, no stack, no dump. */ + if (!current->mm) + return 0; + /* * Check if we fit in with the requested stack size into the: * - TASK_SIZE @@ -8150,6 +8154,9 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + if (!current->mm) + user = false; + if (!kernel && !user) return &__empty_callchain; diff --git a/kernel/exit.c b/kernel/exit.c index 38645039dd8f..1883150c9422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -944,6 +944,15 @@ void __noreturn do_exit(long code) taskstats_exit(tsk, group_dead); trace_sched_process_exit(tsk, group_dead); + /* + * Since sampling can touch ->mm, make sure to stop everything before we + * tear it down. + * + * Also flushes inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_event_exit_task(tsk); + exit_mm(); if (group_dead) @@ -959,14 +968,6 @@ void __noreturn do_exit(long code) exit_task_work(tsk); exit_thread(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - * - * because of cgroup mode, must be called before cgroup_exit() - */ - perf_event_exit_task(tsk); - sched_autogroup_exit_task(tsk); cgroup_exit(tsk); From 61988e36dc5457cdff7ae7927e8d9ad1419ee998 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2025 12:37:11 +0200 Subject: [PATCH 003/284] perf: Fix cgroup state vs ERROR While chasing down a missing perf_cgroup_event_disable() elsewhere, Leo Yan found that both perf_put_aux_event() and perf_remove_sibling_event() were also missing one. Specifically, the rule is that events that switch to OFF,ERROR need to call perf_cgroup_event_disable(). Unify the disable paths to ensure this. Fixes: ab43762ef010 ("perf: Allow normal events to output AUX data") Fixes: 9f0c4fa111dc ("perf/core: Add a new PERF_EV_CAP_SIBLING event capability") Reported-by: Leo Yan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250605123343.GD35970@noisy.programming.kicks-ass.net --- kernel/events/core.c | 51 ++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 26dec1d8a5ab..1cc98b9b3c0b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2149,8 +2149,9 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) } static void put_event(struct perf_event *event); -static void event_sched_out(struct perf_event *event, - struct perf_event_context *ctx); +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state); static void perf_put_aux_event(struct perf_event *event) { @@ -2183,8 +2184,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + __event_disable(iter, ctx, PERF_EVENT_STATE_ERROR); } } @@ -2242,18 +2242,6 @@ static inline struct list_head *get_event_list(struct perf_event *event) &event->pmu_ctx->flexible_active; } -/* - * Events that have PERF_EV_CAP_SIBLING require being part of a group and - * cannot exist on their own, schedule them out and move them into the ERROR - * state. Also see _perf_event_enable(), it will not be able to recover - * this ERROR state. - */ -static inline void perf_remove_sibling_event(struct perf_event *event) -{ - event_sched_out(event, event->ctx); - perf_event_set_state(event, PERF_EVENT_STATE_ERROR); -} - static void perf_group_detach(struct perf_event *event) { struct perf_event *leader = event->group_leader; @@ -2289,8 +2277,15 @@ static void perf_group_detach(struct perf_event *event) */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) { + /* + * Events that have PERF_EV_CAP_SIBLING require being part of + * a group and cannot exist on their own, schedule them out + * and move them into the ERROR state. Also see + * _perf_event_enable(), it will not be able to recover this + * ERROR state. + */ if (sibling->event_caps & PERF_EV_CAP_SIBLING) - perf_remove_sibling_event(sibling); + __event_disable(sibling, ctx, PERF_EVENT_STATE_ERROR); sibling->group_leader = sibling; list_del_init(&sibling->sibling_list); @@ -2562,6 +2557,15 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla event_function_call(event, __perf_remove_from_context, (void *)flags); } +static void __event_disable(struct perf_event *event, + struct perf_event_context *ctx, + enum perf_event_state state) +{ + event_sched_out(event, ctx); + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, state); +} + /* * Cross CPU call to disable a performance event */ @@ -2576,13 +2580,18 @@ static void __perf_event_disable(struct perf_event *event, perf_pmu_disable(event->pmu_ctx->pmu); ctx_time_update_event(ctx, event); + /* + * When disabling a group leader, the whole group becomes ineligible + * to run, so schedule out the full group. + */ if (event == event->group_leader) group_sched_out(event, ctx); - else - event_sched_out(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - perf_cgroup_event_disable(event, ctx); + /* + * But only mark the leader OFF; the siblings will remain + * INACTIVE. + */ + __event_disable(event, ctx, PERF_EVENT_STATE_OFF); perf_pmu_enable(event->pmu_ctx->pmu); } From 3b7a34aebbdf2a4b7295205bf0c654294283ec82 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Mon, 2 Jun 2025 19:40:49 +0100 Subject: [PATCH 004/284] perf: Fix dangling cgroup pointer in cpuctx Commit a3c3c6667("perf/core: Fix child_total_time_enabled accounting bug at task exit") moves the event->state update to before list_del_event(). This makes the event->state test in list_del_event() always false; never calling perf_cgroup_event_disable(). As a result, cpuctx->cgrp won't be cleared properly; causing havoc. Fixes: a3c3c6667("perf/core: Fix child_total_time_enabled accounting bug at task exit") Signed-off-by: Yeoreum Yun Signed-off-by: Peter Zijlstra (Intel) Tested-by: David Wang <00107082@163.com> Link: https://lore.kernel.org/all/aD2TspKH%2F7yvfYoO@e129823.arm.com/ --- kernel/events/core.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1cc98b9b3c0b..d78608323916 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2120,18 +2120,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) del_event_from_groups(event, ctx); - /* - * If event was in error state, then keep it - * that way, otherwise bogus counts will be - * returned on read(). The only way to get out - * of error state is by explicit re-enabling - * of the event - */ - if (event->state > PERF_EVENT_STATE_OFF) { - perf_cgroup_event_disable(event, ctx); - perf_event_set_state(event, PERF_EVENT_STATE_OFF); - } - ctx->generation++; event->pmu_ctx->nr_events--; } @@ -2488,11 +2476,14 @@ __perf_remove_from_context(struct perf_event *event, state = PERF_EVENT_STATE_EXIT; if (flags & DETACH_REVOKE) state = PERF_EVENT_STATE_REVOKED; - if (flags & DETACH_DEAD) { - event->pending_disable = 1; + if (flags & DETACH_DEAD) state = PERF_EVENT_STATE_DEAD; - } + event_sched_out(event, ctx); + + if (event->state > PERF_EVENT_STATE_OFF) + perf_cgroup_event_disable(event, ctx); + perf_event_set_state(event, min(event->state, state)); if (flags & DETACH_GROUP) From 3172fb986666dfb71bf483b6d3539e1e587fa197 Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Wed, 4 Jun 2025 03:39:24 +0000 Subject: [PATCH 005/284] perf/core: Fix WARN in perf_cgroup_switch() There may be concurrency between perf_cgroup_switch and perf_cgroup_event_disable. Consider the following scenario: after a new perf cgroup event is created on CPU0, the new event may not trigger a reprogramming, causing ctx->is_active to be 0. In this case, when CPU1 disables this perf event, it executes __perf_remove_from_context-> list _del_event->perf_cgroup_event_disable on CPU1, which causes a race with perf_cgroup_switch running on CPU0. The following describes the details of this concurrency scenario: CPU0 CPU1 perf_cgroup_switch: ... # cpuctx->cgrp is not NULL here if (READ_ONCE(cpuctx->cgrp) == NULL) return; perf_remove_from_context: ... raw_spin_lock_irq(&ctx->lock); ... # ctx->is_active == 0 because reprogramm is not # tigger, so CPU1 can do __perf_remove_from_context # for CPU0 __perf_remove_from_context: perf_cgroup_event_disable: ... if (--ctx->nr_cgroups) ... # this warning will happened because CPU1 changed # ctx.nr_cgroups to 0. WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); [peterz: use guard instead of goto unlock] Fixes: db4a835601b7 ("perf/core: Set cgroup in CPU contexts for new cgroup events") Signed-off-by: Luo Gengkun Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250604033924.3914647-3-luogengkun@huaweicloud.com --- kernel/events/core.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index d78608323916..d7cf008f3d75 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -207,6 +207,19 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, __perf_ctx_unlock(&cpuctx->ctx); } +typedef struct { + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; +} class_perf_ctx_lock_t; + +static inline void class_perf_ctx_lock_destructor(class_perf_ctx_lock_t *_T) +{ perf_ctx_unlock(_T->cpuctx, _T->ctx); } + +static inline class_perf_ctx_lock_t +class_perf_ctx_lock_constructor(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ perf_ctx_lock(cpuctx, ctx); return (class_perf_ctx_lock_t){ cpuctx, ctx }; } + #define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) @@ -944,7 +957,13 @@ static void perf_cgroup_switch(struct task_struct *task) if (READ_ONCE(cpuctx->cgrp) == cgrp) return; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); + /* + * Re-check, could've raced vs perf_remove_from_context(). + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; + perf_ctx_disable(&cpuctx->ctx, true); ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); @@ -962,7 +981,6 @@ static void perf_cgroup_switch(struct task_struct *task) ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); perf_ctx_enable(&cpuctx->ctx, true); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, From 49b393af3130c7712c7e8f215f4126c9a8060fa6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Jun 2025 10:21:38 +0200 Subject: [PATCH 006/284] perf: Add comment to enum perf_event_state Better describe the event states. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Leo Yan Link: https://lkml.kernel.org/r/20250604135801.GK38114@noisy.programming.kicks-ass.net --- include/linux/perf_event.h | 42 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 52dc7cfab0e0..ec9d96025683 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -635,8 +635,46 @@ struct perf_addr_filter_range { unsigned long size; }; -/** - * enum perf_event_state - the states of an event: +/* + * The normal states are: + * + * ACTIVE --. + * ^ | + * | | + * sched_{in,out}() | + * | | + * v | + * ,---> INACTIVE --+ <-. + * | | | + * | {dis,en}able() + * sched_in() | | + * | OFF <--' --+ + * | | + * `---> ERROR ------' + * + * That is: + * + * sched_in: INACTIVE -> {ACTIVE,ERROR} + * sched_out: ACTIVE -> INACTIVE + * disable: {ACTIVE,INACTIVE} -> OFF + * enable: {OFF,ERROR} -> INACTIVE + * + * Where {OFF,ERROR} are disabled states. + * + * Then we have the {EXIT,REVOKED,DEAD} states which are various shades of + * defunct events: + * + * - EXIT means task that the even was assigned to died, but child events + * still live, and further children can still be created. But the event + * itself will never be active again. It can only transition to + * {REVOKED,DEAD}; + * + * - REVOKED means the PMU the event was associated with is gone; all + * functionality is stopped but the event is still alive. Can only + * transition to DEAD; + * + * - DEAD event really is DYING tearing down state and freeing bits. + * */ enum perf_event_state { PERF_EVENT_STATE_DEAD = -5, From 1a9dcf69c7a97e733aa2fc026db22f22928ca7b7 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 28 May 2025 10:55:19 +0200 Subject: [PATCH 007/284] selftests/futex: getopt() requires int as return value. Mark reported that futex_priv_hash fails on ARM64. It turns out that the command line parsing does not terminate properly and ends in the default case assuming an invalid option was passed. Use an int as the return type for getopt(). Closes: https://lore.kernel.org/all/31869a69-063f-44a3-a079-ba71b2506cce@sirena.org.uk/ Fixes: 3163369407baf ("selftests/futex: Add futex_numa_mpol") Fixes: cda95faef7bcf ("selftests/futex: Add futex_priv_hash") Reported-by: Mark Brown Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250528085521.1938355-2-bigeasy@linutronix.de --- tools/testing/selftests/futex/functional/futex_numa_mpol.c | 2 +- tools/testing/selftests/futex/functional/futex_priv_hash.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index 20a9d3ecf743..564dbd02d2f4 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -144,7 +144,7 @@ int main(int argc, char *argv[]) struct futex32_numa *futex_numa; int mem_size, i; void *futex_ptr; - char c; + int c; while ((c = getopt(argc, argv, "chv:")) != -1) { switch (c) { diff --git a/tools/testing/selftests/futex/functional/futex_priv_hash.c b/tools/testing/selftests/futex/functional/futex_priv_hash.c index 2dca18fefedc..24a92dc94eb8 100644 --- a/tools/testing/selftests/futex/functional/futex_priv_hash.c +++ b/tools/testing/selftests/futex/functional/futex_priv_hash.c @@ -130,7 +130,7 @@ int main(int argc, char *argv[]) pthread_mutexattr_t mutex_attr_pi; int use_global_hash = 0; int ret; - char c; + int c; while ((c = getopt(argc, argv, "cghv:")) != -1) { switch (c) { From 0ecb4232fc65e659ca7020f8bb2e0fc347acfb7d Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 28 May 2025 10:55:20 +0200 Subject: [PATCH 008/284] selftests/futex: Set the home_node in futex_numa_mpol The test fails at the MPOL step if multiple nodes are available. The reason is that mbind() sets the policy but the home_node, which is retrieved by the futex code, is not set. This causes to retrieve the current node and with multiple nodes it fails on one of the iterations. Use numa_set_mempolicy_home_node() to set the expected node. Use ksft_exit_fail_msg() to fail and exit in order not to confuse ktap. Fixes: 3163369407baf ("selftests/futex: Add futex_numa_mpol") Suggested-by: Vlastimil Babka Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250528085521.1938355-3-bigeasy@linutronix.de --- .../testing/selftests/futex/functional/futex_numa_mpol.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/futex/functional/futex_numa_mpol.c b/tools/testing/selftests/futex/functional/futex_numa_mpol.c index 564dbd02d2f4..a9ecfb2d3932 100644 --- a/tools/testing/selftests/futex/functional/futex_numa_mpol.c +++ b/tools/testing/selftests/futex/functional/futex_numa_mpol.c @@ -210,6 +210,10 @@ int main(int argc, char *argv[]) ret = mbind(futex_ptr, mem_size, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0); if (ret == 0) { + ret = numa_set_mempolicy_home_node(futex_ptr, mem_size, i, 0); + if (ret != 0) + ksft_exit_fail_msg("Failed to set home node: %m, %d\n", errno); + ksft_print_msg("Node %d test\n", i); futex_numa->futex = 0; futex_numa->numa = FUTEX_NO_NODE; @@ -220,8 +224,8 @@ int main(int argc, char *argv[]) if (0) test_futex_mpol(futex_numa, 0); if (futex_numa->numa != i) { - ksft_test_result_fail("Returned NUMA node is %d expected %d\n", - futex_numa->numa, i); + ksft_exit_fail_msg("Returned NUMA node is %d expected %d\n", + futex_numa->numa, i); } } } From 8337204c58899fe422db765b481711eb2d95eb0b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 28 May 2025 10:55:21 +0200 Subject: [PATCH 009/284] futex: Handle invalid node numbers supplied by user syzbot used a negative node number which was not rejected early and led to invalid memory access in node_possible(). Reject negative node numbers except for FUTEX_NO_NODE. [bigeasy: Keep the FUTEX_NO_NODE check] Closes: https://lore.kernel.org/all/6835bfe3.a70a0220.253bc2.00b5.GAE@google.com/ Fixes: cec199c5e39bd ("futex: Implement FUTEX2_NUMA") Signed-off-by: Peter Zijlstra (Intel) Reported-by: syzbot+9afaf6749e3a7aa1bdf3@syzkaller.appspotmail.com Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250528085521.1938355-4-bigeasy@linutronix.de --- kernel/futex/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 565f9717c6ca..b652d2f60c40 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -583,8 +583,8 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, if (futex_get_value(&node, naddr)) return -EFAULT; - if (node != FUTEX_NO_NODE && - (node >= MAX_NUMNODES || !node_possible(node))) + if ((node != FUTEX_NO_NODE) && + ((unsigned int)node >= MAX_NUMNODES || !node_possible(node))) return -EINVAL; } From 22188b9df60dde48eaba276da22062aaf8e12dfe Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Tue, 3 Jun 2025 14:48:13 -0700 Subject: [PATCH 010/284] ASoC: cs48l32: Fix a signedness bug in cs48l32_hw_params() There is a type promotion that can happen when freq(u32) variable is comapared with sclk_target(integer), when sclk_target is a negative value it promotes to a large postive integer which might not be a problem in this particular case as the condition evaluates to false when that happens, but bail out early when sclk_target has negative error codes. cs48l32_sclk_rates[i].freq >= sclk_target Fix this by adding a negative error check when snd_soc_tdm_params_to_bclk() fails Fixes: e2bcbf99d045 ("ASoC: cs48l32: Add driver for Cirrus Logic CS48L32 audio DSP") Signed-off-by: Harshit Mogalapalli Link: https://patch.msgid.link/20250603214813.197346-1-harshit.m.mogalapalli@oracle.com Reviewed-by: Richard Fitzgerald Signed-off-by: Mark Brown --- sound/soc/codecs/cs48l32.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/soc/codecs/cs48l32.c b/sound/soc/codecs/cs48l32.c index 90a795230d27..9bdd48aab42a 100644 --- a/sound/soc/codecs/cs48l32.c +++ b/sound/soc/codecs/cs48l32.c @@ -2162,6 +2162,10 @@ static int cs48l32_hw_params(struct snd_pcm_substream *substream, n_slots_multiple = 1; sclk_target = snd_soc_tdm_params_to_bclk(params, slotw, n_slots, n_slots_multiple); + if (sclk_target < 0) { + cs48l32_asp_err(dai, "Invalid parameters\n"); + return sclk_target; + } for (i = 0; i < ARRAY_SIZE(cs48l32_sclk_rates); i++) { if ((cs48l32_sclk_rates[i].freq >= sclk_target) && From b44b2694005b501b2eaeb56aa778f3ca22de4659 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 4 Jun 2025 14:18:21 +0800 Subject: [PATCH 011/284] ASoC: codecs: ES8326: Modify initialization configuration Modify the value of ES8326_SDINOUT1_IO in the initialization Signed-off-by: Zhang Yi Link: https://patch.msgid.link/20250604061821.2678-1-zhangyi@everest-semi.com Signed-off-by: Mark Brown --- sound/soc/codecs/es8326.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/soc/codecs/es8326.c b/sound/soc/codecs/es8326.c index 066d92b54312..78c4e68f6002 100644 --- a/sound/soc/codecs/es8326.c +++ b/sound/soc/codecs/es8326.c @@ -1079,8 +1079,7 @@ static void es8326_init(struct snd_soc_component *component) regmap_update_bits(es8326->regmap, ES8326_HPDET_TYPE, 0x03, 0x00); regmap_write(es8326->regmap, ES8326_INTOUT_IO, es8326->interrupt_clk); - regmap_write(es8326->regmap, ES8326_SDINOUT1_IO, - (ES8326_IO_DMIC_CLK << ES8326_SDINOUT1_SHIFT)); + regmap_write(es8326->regmap, ES8326_SDINOUT1_IO, ES8326_IO_INPUT); regmap_write(es8326->regmap, ES8326_SDINOUT23_IO, ES8326_IO_INPUT); regmap_write(es8326->regmap, ES8326_ANA_PDN, 0x00); From a4e469c1e2e0ec7f08fff9ed29f5500f187ba9f2 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Sun, 8 Jun 2025 22:27:37 +0800 Subject: [PATCH 012/284] ASoC: loongson: Fix build warnings about export.h After commit 25704938c86d7b2 ("scripts/misc-check: check missing #include when W=1") and 1239f681359926101e ("scripts/misc-check: check unnecessary #include when W=1"), we get some build warnings with W=1: sound/soc/loongson/loongson_i2s.c: warning: EXPORT_SYMBOL() is used, but #include is missing So fix these build warnings for ASoC/Loongson. Signed-off-by: Huacai Chen Link: https://patch.msgid.link/20250608142737.168829-1-chenhuacai@loongson.cn Signed-off-by: Mark Brown --- sound/soc/loongson/loongson_i2s.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/loongson/loongson_i2s.c b/sound/soc/loongson/loongson_i2s.c index e8852a30f213..e336656e13eb 100644 --- a/sound/soc/loongson/loongson_i2s.c +++ b/sound/soc/loongson/loongson_i2s.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include From 83c4c67076c209787515e06fffd41dd0bdab09b9 Mon Sep 17 00:00:00 2001 From: "James A. MacInnes" Date: Wed, 12 Feb 2025 15:03:46 -0800 Subject: [PATCH 013/284] drm/msm/dp: Disable wide bus support for SDM845 When widebus was enabled for DisplayPort in commit c7c412202623 ("drm/msm/dp: enable widebus on all relevant chipsets") it was clarified that it is only supported on DPU 5.0.0 onwards which includes SC7180 on DPU revision 6.2. However, this patch missed that the description structure for SC7180 is also reused for SDM845 (because of identical io_start address) which is only DPU 4.0.0, leading to a wrongly enbled widebus feature and corruption on that platform. Create a separate msm_dp_desc_sdm845 structure for this SoC compatible, with the wide_bus_supported flag turned off. Fixes: c7c412202623 ("drm/msm/dp: enable widebus on all relevant chipsets") Signed-off-by: James A. MacInnes [DB: reworded commit text following Marijn's suggestion] Reviewed-by: Abhinav Kumar Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/636944/ Link: https://lore.kernel.org/r/20250212-sdm845_dp-v2-1-4954e51458f4@gmail.com Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/dp/dp_display.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/dp/dp_display.c b/drivers/gpu/drm/msm/dp/dp_display.c index 386c4669c831..a48e6db4f156 100644 --- a/drivers/gpu/drm/msm/dp/dp_display.c +++ b/drivers/gpu/drm/msm/dp/dp_display.c @@ -128,6 +128,11 @@ static const struct msm_dp_desc msm_dp_desc_sa8775p[] = { {} }; +static const struct msm_dp_desc msm_dp_desc_sdm845[] = { + { .io_start = 0x0ae90000, .id = MSM_DP_CONTROLLER_0 }, + {} +}; + static const struct msm_dp_desc msm_dp_desc_sc7180[] = { { .io_start = 0x0ae90000, .id = MSM_DP_CONTROLLER_0, .wide_bus_supported = true }, {} @@ -180,7 +185,7 @@ static const struct of_device_id msm_dp_dt_match[] = { { .compatible = "qcom,sc8180x-edp", .data = &msm_dp_desc_sc8180x }, { .compatible = "qcom,sc8280xp-dp", .data = &msm_dp_desc_sc8280xp }, { .compatible = "qcom,sc8280xp-edp", .data = &msm_dp_desc_sc8280xp }, - { .compatible = "qcom,sdm845-dp", .data = &msm_dp_desc_sc7180 }, + { .compatible = "qcom,sdm845-dp", .data = &msm_dp_desc_sdm845 }, { .compatible = "qcom,sm8350-dp", .data = &msm_dp_desc_sc7180 }, { .compatible = "qcom,sm8650-dp", .data = &msm_dp_desc_sm8650 }, { .compatible = "qcom,x1e80100-dp", .data = &msm_dp_desc_x1e80100 }, From 146e87f3e11de0dfa091ff87e34b4bc6eec761a4 Mon Sep 17 00:00:00 2001 From: "James A. MacInnes" Date: Wed, 12 Feb 2025 15:03:47 -0800 Subject: [PATCH 014/284] drm/msm/disp: Correct porch timing for SDM845 Type-C DisplayPort inoperable due to incorrect porch settings. - Re-used wide_bus_en as flag to prevent porch shifting Fixes: c943b4948b58 ("drm/msm/dp: add displayPort driver support") Signed-off-by: James A. MacInnes Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/636945/ Link: https://lore.kernel.org/r/20250212-sdm845_dp-v2-2-4954e51458f4@gmail.com Signed-off-by: Dmitry Baryshkov --- .../gpu/drm/msm/disp/dpu1/dpu_encoder_phys_vid.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_vid.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_vid.c index 8a618841e3ea..1c468ca5d692 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_vid.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder_phys_vid.c @@ -94,17 +94,21 @@ static void drm_mode_to_intf_timing_params( timing->vsync_polarity = 0; } - /* for DP/EDP, Shift timings to align it to bottom right */ - if (phys_enc->hw_intf->cap->type == INTF_DP) { + timing->wide_bus_en = dpu_encoder_is_widebus_enabled(phys_enc->parent); + timing->compression_en = dpu_encoder_is_dsc_enabled(phys_enc->parent); + + /* + * For DP/EDP, Shift timings to align it to bottom right. + * wide_bus_en is set for everything excluding SDM845 & + * porch changes cause DisplayPort failure and HDMI tearing. + */ + if (phys_enc->hw_intf->cap->type == INTF_DP && timing->wide_bus_en) { timing->h_back_porch += timing->h_front_porch; timing->h_front_porch = 0; timing->v_back_porch += timing->v_front_porch; timing->v_front_porch = 0; } - timing->wide_bus_en = dpu_encoder_is_widebus_enabled(phys_enc->parent); - timing->compression_en = dpu_encoder_is_dsc_enabled(phys_enc->parent); - /* * for DP, divide the horizonal parameters by 2 when * widebus is enabled From 8a48e35becb214743214f5504e726c3ec131cd6d Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 20 May 2025 13:13:26 +0200 Subject: [PATCH 015/284] drm/msm/dsi/dsi_phy_10nm: Fix missing initial VCO rate Driver unconditionally saves current state on first init in dsi_pll_10nm_init(), but does not save the VCO rate, only some of the divider registers. The state is then restored during probe/enable via msm_dsi_phy_enable() -> msm_dsi_phy_pll_restore_state() -> dsi_10nm_pll_restore_state(). Restoring calls dsi_pll_10nm_vco_set_rate() with pll_10nm->vco_current_rate=0, which basically overwrites existing rate of VCO and messes with clock hierarchy, by setting frequency to 0 to clock tree. This makes anyway little sense - VCO rate was not saved, so should not be restored. If PLL was not configured configure it to minimum rate to avoid glitches and configuring entire in clock hierarchy to 0 Hz. Suggested-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/sz4kbwy5nwsebgf64ia7uq4ee7wbsa5uy3xmlqwcstsbntzcov@ew3dcyjdzmi2/ Signed-off-by: Krzysztof Kozlowski Fixes: a4ccc37693a2 ("drm/msm/dsi_pll_10nm: restore VCO rate during Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/654796/ Link: https://lore.kernel.org/r/20250520111325.92352-2-krzysztof.kozlowski@linaro.org Signed-off-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/dsi/phy/dsi_phy_10nm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_10nm.c b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_10nm.c index 9812b4d69197..af2e30f3f842 100644 --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy_10nm.c +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy_10nm.c @@ -704,6 +704,13 @@ static int dsi_pll_10nm_init(struct msm_dsi_phy *phy) /* TODO: Remove this when we have proper display handover support */ msm_dsi_phy_pll_save_state(phy); + /* + * Store also proper vco_current_rate, because its value will be used in + * dsi_10nm_pll_restore_state(). + */ + if (!dsi_pll_10nm_vco_recalc_rate(&pll_10nm->clk_hw, VCO_REF_CLK_RATE)) + pll_10nm->vco_current_rate = pll_10nm->phy->cfg->min_pll_rate; + return 0; } From c987a390f1b3b8bdac11031d7004e3410fe259bd Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 8 Jun 2025 11:14:14 +0200 Subject: [PATCH 016/284] ALSA: hda/intel: Add Thinkpad E15 to PM deny list Lenovo Thinkpad E15 with Conexant CX8070 codec seems causing ugly noises after runtime-PM suspend. Disable the codec runtime PM as a workaround. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220210 Cc: Link: https://patch.msgid.link/20250608091415.21170-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_intel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index e5210ed48ddf..439cf1bda6e6 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2283,6 +2283,8 @@ static const struct snd_pci_quirk power_save_denylist[] = { SND_PCI_QUIRK(0x1734, 0x1232, "KONTRON SinglePC", 0), /* Dell ALC3271 */ SND_PCI_QUIRK(0x1028, 0x0962, "Dell ALC3271", 0), + /* https://bugzilla.kernel.org/show_bug.cgi?id=220210 */ + SND_PCI_QUIRK(0x17aa, 0x5079, "Lenovo Thinkpad E15", 0), {} }; From f00de8e82cab24ef2e2cae0b297b4dc46c3e37ca Mon Sep 17 00:00:00 2001 From: Brahmajit Das Date: Sat, 7 Jun 2025 02:10:00 +0530 Subject: [PATCH 017/284] ALSA: ctxfi: Replace deprecated strcpy() with strscpy() strcpy() is deprecated; use strscpy() instead. Use strscpy() to copy the long name because there's no string to format with sprintf(). No functional changes intended. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Brahmajit Das Link: https://patch.msgid.link/20250606204000.8156-1-listout@listout.xyz Signed-off-by: Takashi Iwai --- sound/pci/ctxfi/xfi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/ctxfi/xfi.c b/sound/pci/ctxfi/xfi.c index 713d36ea40cb..d8dd84d41c87 100644 --- a/sound/pci/ctxfi/xfi.c +++ b/sound/pci/ctxfi/xfi.c @@ -98,8 +98,8 @@ ct_card_probe(struct pci_dev *pci, const struct pci_device_id *pci_id) if (err < 0) goto error; - strcpy(card->driver, "SB-XFi"); - strcpy(card->shortname, "Creative X-Fi"); + strscpy(card->driver, "SB-XFi"); + strscpy(card->shortname, "Creative X-Fi"); snprintf(card->longname, sizeof(card->longname), "%s %s %s", card->shortname, atc->chip_name, atc->model_name); From a0914bf56e26d2cf457690602883f9cd2ec2c646 Mon Sep 17 00:00:00 2001 From: Edip Hazuri Date: Mon, 9 Jun 2025 10:59:44 +0300 Subject: [PATCH 018/284] ALSA: hda/realtek - Add mute LED support for HP Victus 16-s1xxx and HP Victus 15-fa1xxx The mute led on those laptops is using ALC245 but requires a quirk to work This patch enables the existing quirk for the devices. Tested on my Victus 16-s1011nt Laptop and my friend's Victus 15-fa1xxx. The LED behaviour works as intended. Cc: Signed-off-by: Edip Hazuri Link: https://patch.msgid.link/20250609075943.13934-2-edip@medip.dev Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index cd0d7ba7320e..c70bee62653e 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10787,6 +10787,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8b97, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8bb3, "HP Slim OMEN", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8bb4, "HP Slim OMEN", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x103c, 0x8bc8, "HP Victus 15-fa1xxx", ALC245_FIXUP_HP_MUTE_LED_COEFBIT), SND_PCI_QUIRK(0x103c, 0x8bcd, "HP Omen 16-xd0xxx", ALC245_FIXUP_HP_MUTE_LED_V1_COEFBIT), SND_PCI_QUIRK(0x103c, 0x8bdd, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8bde, "HP Envy 17", ALC287_FIXUP_CS35L41_I2C_2), @@ -10840,6 +10841,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8c91, "HP EliteBook 660", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8c96, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8c97, "HP ZBook", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8c9c, "HP Victus 16-s1xxx (MB 8C9C)", ALC245_FIXUP_HP_MUTE_LED_COEFBIT), SND_PCI_QUIRK(0x103c, 0x8ca1, "HP ZBook Power", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ca2, "HP ZBook Power", ALC236_FIXUP_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8ca4, "HP ZBook Fury", ALC245_FIXUP_CS35L41_SPI_2_HP_GPIO_LED), From 5d319f75ccf7f0927425a7545aa1a22b3eedc189 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 14 May 2025 09:33:32 -0700 Subject: [PATCH 019/284] drm/msm: Fix a fence leak in submit error path In error paths, we could unref the submit without calling drm_sched_entity_push_job(), so msm_job_free() will never get called. Since drm_sched_job_cleanup() will NULL out the s_fence, we can use that to detect this case. Signed-off-by: Rob Clark Patchwork: https://patchwork.freedesktop.org/patch/653584/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem_submit.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index 3e9aa2cc38ef..b2aeaecaa39b 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -85,6 +85,15 @@ void __msm_gem_submit_destroy(struct kref *kref) container_of(kref, struct msm_gem_submit, ref); unsigned i; + /* + * In error paths, we could unref the submit without calling + * drm_sched_entity_push_job(), so msm_job_free() will never + * get called. Since drm_sched_job_cleanup() will NULL out + * s_fence, we can use that to detect this case. + */ + if (submit->base.s_fence) + drm_sched_job_cleanup(&submit->base); + if (submit->fence_id) { spin_lock(&submit->queue->idr_lock); idr_remove(&submit->queue->fence_idr, submit->fence_id); From f681c2aa8676a890eacc84044717ab0fd26e058f Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 14 May 2025 09:33:33 -0700 Subject: [PATCH 020/284] drm/msm: Fix another leak in the submit error path put_unused_fd() doesn't free the installed file, if we've already done fd_install(). So we need to also free the sync_file. Signed-off-by: Rob Clark Patchwork: https://patchwork.freedesktop.org/patch/653583/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/msm_gem_submit.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_gem_submit.c b/drivers/gpu/drm/msm/msm_gem_submit.c index b2aeaecaa39b..d4f71bb54e84 100644 --- a/drivers/gpu/drm/msm/msm_gem_submit.c +++ b/drivers/gpu/drm/msm/msm_gem_submit.c @@ -658,6 +658,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, struct msm_ringbuffer *ring; struct msm_submit_post_dep *post_deps = NULL; struct drm_syncobj **syncobjs_to_reset = NULL; + struct sync_file *sync_file = NULL; int out_fence_fd = -1; unsigned i; int ret; @@ -867,7 +868,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, } if (ret == 0 && args->flags & MSM_SUBMIT_FENCE_FD_OUT) { - struct sync_file *sync_file = sync_file_create(submit->user_fence); + sync_file = sync_file_create(submit->user_fence); if (!sync_file) { ret = -ENOMEM; } else { @@ -901,8 +902,11 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, out_unlock: mutex_unlock(&queue->lock); out_post_unlock: - if (ret && (out_fence_fd >= 0)) + if (ret && (out_fence_fd >= 0)) { put_unused_fd(out_fence_fd); + if (sync_file) + fput(sync_file->file); + } if (!IS_ERR_OR_NULL(submit)) { msm_gem_submit_put(submit); From 0c5fea1eb0dc234cd321bcd19a6bd0f51273b4c6 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 20 May 2025 15:08:57 -0400 Subject: [PATCH 021/284] drm/msm: Don't use a worker to capture fault devcoredump Now that we use a threaded IRQ, it should be safe to do this in the fault handler. We can also remove fault_info from struct msm_gpu and just pass it directly. Signed-off-by: Connor Abbott Patchwork: https://patchwork.freedesktop.org/patch/654889/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 22 ++++++++-------------- drivers/gpu/drm/msm/msm_gpu.c | 20 +++++++++----------- drivers/gpu/drm/msm/msm_gpu.h | 8 ++------ 3 files changed, 19 insertions(+), 31 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 2348ffb35f7e..5ebd5a6ea143 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -270,14 +270,6 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags, const char *type = "UNKNOWN"; bool do_devcoredump = info && !READ_ONCE(gpu->crashstate); - /* - * If we aren't going to be resuming later from fault_worker, then do - * it now. - */ - if (!do_devcoredump) { - gpu->aspace->mmu->funcs->resume_translation(gpu->aspace->mmu); - } - /* * Print a default message if we couldn't get the data from the * adreno-smmu-priv @@ -304,16 +296,18 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags, scratch[0], scratch[1], scratch[2], scratch[3]); if (do_devcoredump) { + struct msm_gpu_fault_info fault_info = {}; + /* Turn off the hangcheck timer to keep it from bothering us */ timer_delete(&gpu->hangcheck_timer); - gpu->fault_info.ttbr0 = info->ttbr0; - gpu->fault_info.iova = iova; - gpu->fault_info.flags = flags; - gpu->fault_info.type = type; - gpu->fault_info.block = block; + fault_info.ttbr0 = info->ttbr0; + fault_info.iova = iova; + fault_info.flags = flags; + fault_info.type = type; + fault_info.block = block; - kthread_queue_work(gpu->worker, &gpu->fault_work); + msm_gpu_fault_crashstate_capture(gpu, &fault_info); } return 0; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index c380d9d9f5af..457f019d507e 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -257,7 +257,8 @@ static void msm_gpu_crashstate_get_bo(struct msm_gpu_state *state, } static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, - struct msm_gem_submit *submit, char *comm, char *cmd) + struct msm_gem_submit *submit, struct msm_gpu_fault_info *fault_info, + char *comm, char *cmd) { struct msm_gpu_state *state; @@ -276,7 +277,8 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, /* Fill in the additional crash state information */ state->comm = kstrdup(comm, GFP_KERNEL); state->cmd = kstrdup(cmd, GFP_KERNEL); - state->fault_info = gpu->fault_info; + if (fault_info) + state->fault_info = *fault_info; if (submit) { int i; @@ -308,7 +310,8 @@ static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, } #else static void msm_gpu_crashstate_capture(struct msm_gpu *gpu, - struct msm_gem_submit *submit, char *comm, char *cmd) + struct msm_gem_submit *submit, struct msm_gpu_fault_info *fault_info, + char *comm, char *cmd) { } #endif @@ -405,7 +408,7 @@ static void recover_worker(struct kthread_work *work) /* Record the crash state */ pm_runtime_get_sync(&gpu->pdev->dev); - msm_gpu_crashstate_capture(gpu, submit, comm, cmd); + msm_gpu_crashstate_capture(gpu, submit, NULL, comm, cmd); kfree(cmd); kfree(comm); @@ -459,9 +462,8 @@ static void recover_worker(struct kthread_work *work) msm_gpu_retire(gpu); } -static void fault_worker(struct kthread_work *work) +void msm_gpu_fault_crashstate_capture(struct msm_gpu *gpu, struct msm_gpu_fault_info *fault_info) { - struct msm_gpu *gpu = container_of(work, struct msm_gpu, fault_work); struct msm_gem_submit *submit; struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); char *comm = NULL, *cmd = NULL; @@ -484,16 +486,13 @@ static void fault_worker(struct kthread_work *work) /* Record the crash state */ pm_runtime_get_sync(&gpu->pdev->dev); - msm_gpu_crashstate_capture(gpu, submit, comm, cmd); + msm_gpu_crashstate_capture(gpu, submit, fault_info, comm, cmd); pm_runtime_put_sync(&gpu->pdev->dev); kfree(cmd); kfree(comm); resume_smmu: - memset(&gpu->fault_info, 0, sizeof(gpu->fault_info)); - gpu->aspace->mmu->funcs->resume_translation(gpu->aspace->mmu); - mutex_unlock(&gpu->lock); } @@ -882,7 +881,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, init_waitqueue_head(&gpu->retire_event); kthread_init_work(&gpu->retire_work, retire_worker); kthread_init_work(&gpu->recover_work, recover_worker); - kthread_init_work(&gpu->fault_work, fault_worker); priv->hangcheck_period = DRM_MSM_HANGCHECK_DEFAULT_PERIOD; diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index e25009150579..bed0692f5adb 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -253,12 +253,6 @@ struct msm_gpu { #define DRM_MSM_HANGCHECK_PROGRESS_RETRIES 3 struct timer_list hangcheck_timer; - /* Fault info for most recent iova fault: */ - struct msm_gpu_fault_info fault_info; - - /* work for handling GPU ioval faults: */ - struct kthread_work fault_work; - /* work for handling GPU recovery: */ struct kthread_work recover_work; @@ -705,6 +699,8 @@ static inline void msm_gpu_crashstate_put(struct msm_gpu *gpu) mutex_unlock(&gpu->lock); } +void msm_gpu_fault_crashstate_capture(struct msm_gpu *gpu, struct msm_gpu_fault_info *fault_info); + /* * Simple macro to semi-cleanly add the MAP_PRIV flag for targets that can * support expanded privileges From dedf404be8cf97e6fabbed7ad97000ab816897eb Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 20 May 2025 15:08:58 -0400 Subject: [PATCH 022/284] drm/msm: Delete resume_translation() Unused since the previous commit. Signed-off-by: Connor Abbott Patchwork: https://patchwork.freedesktop.org/patch/654890/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a2xx_gpummu.c | 5 ----- drivers/gpu/drm/msm/msm_iommu.c | 13 ------------- drivers/gpu/drm/msm/msm_mmu.h | 1 - 3 files changed, 19 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a2xx_gpummu.c b/drivers/gpu/drm/msm/adreno/a2xx_gpummu.c index 39641551eeb6..4280f71e472a 100644 --- a/drivers/gpu/drm/msm/adreno/a2xx_gpummu.c +++ b/drivers/gpu/drm/msm/adreno/a2xx_gpummu.c @@ -71,10 +71,6 @@ static int a2xx_gpummu_unmap(struct msm_mmu *mmu, uint64_t iova, size_t len) return 0; } -static void a2xx_gpummu_resume_translation(struct msm_mmu *mmu) -{ -} - static void a2xx_gpummu_destroy(struct msm_mmu *mmu) { struct a2xx_gpummu *gpummu = to_a2xx_gpummu(mmu); @@ -90,7 +86,6 @@ static const struct msm_mmu_funcs funcs = { .map = a2xx_gpummu_map, .unmap = a2xx_gpummu_unmap, .destroy = a2xx_gpummu_destroy, - .resume_translation = a2xx_gpummu_resume_translation, }; struct msm_mmu *a2xx_gpummu_new(struct device *dev, struct msm_gpu *gpu) diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index fd73dcd3f30e..aae885d048d0 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -345,7 +345,6 @@ static int msm_gpu_fault_handler(struct iommu_domain *domain, struct device *dev unsigned long iova, int flags, void *arg) { struct msm_iommu *iommu = arg; - struct msm_mmu *mmu = &iommu->base; struct adreno_smmu_priv *adreno_smmu = dev_get_drvdata(iommu->base.dev); struct adreno_smmu_fault_info info, *ptr = NULL; @@ -359,9 +358,6 @@ static int msm_gpu_fault_handler(struct iommu_domain *domain, struct device *dev pr_warn_ratelimited("*** fault: iova=%16lx, flags=%d\n", iova, flags); - if (mmu->funcs->resume_translation) - mmu->funcs->resume_translation(mmu); - return 0; } @@ -376,14 +372,6 @@ static int msm_disp_fault_handler(struct iommu_domain *domain, struct device *de return -ENOSYS; } -static void msm_iommu_resume_translation(struct msm_mmu *mmu) -{ - struct adreno_smmu_priv *adreno_smmu = dev_get_drvdata(mmu->dev); - - if (adreno_smmu->resume_translation) - adreno_smmu->resume_translation(adreno_smmu->cookie, true); -} - static void msm_iommu_detach(struct msm_mmu *mmu) { struct msm_iommu *iommu = to_msm_iommu(mmu); @@ -431,7 +419,6 @@ static const struct msm_mmu_funcs funcs = { .map = msm_iommu_map, .unmap = msm_iommu_unmap, .destroy = msm_iommu_destroy, - .resume_translation = msm_iommu_resume_translation, }; struct msm_mmu *msm_iommu_new(struct device *dev, unsigned long quirks) diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h index daf91529e02b..c3d17aae88b0 100644 --- a/drivers/gpu/drm/msm/msm_mmu.h +++ b/drivers/gpu/drm/msm/msm_mmu.h @@ -15,7 +15,6 @@ struct msm_mmu_funcs { size_t len, int prot); int (*unmap)(struct msm_mmu *mmu, uint64_t iova, size_t len); void (*destroy)(struct msm_mmu *mmu); - void (*resume_translation)(struct msm_mmu *mmu); }; enum msm_mmu_type { From b13044092c1e30453d2f7e9be596d3a2616582a0 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 20 May 2025 15:08:59 -0400 Subject: [PATCH 023/284] drm/msm: Temporarily disable stall-on-fault after a page fault When things go wrong, the GPU is capable of quickly generating millions of faulting translation requests per second. When that happens, in the stall-on-fault model each access will stall until it wins the race to signal the fault and then the RESUME register is written. This slows processing page faults to a crawl as the GPU can generate faults much faster than the CPU can acknowledge them. It also means that all available resources in the SMMU are saturated waiting for the stalled transactions, so that other transactions such as transactions generated by the GMU, which shares translation resources with the GPU, cannot proceed. This causes a GMU watchdog timeout, which leads to a failed reset because GX cannot collapse when there is a transaction pending and a permanently hung GPU. On older platforms with qcom,smmu-v2, it seems that when one transaction is stalled subsequent faulting transactions are terminated, which avoids this problem, but the MMU-500 follows the spec here. To work around these problems, disable stall-on-fault as soon as we get a page fault until a cooldown period after pagefaults stop. This allows the GMU some guaranteed time to continue working. We only use stall-on-fault to halt the GPU while we collect a devcoredump and we always terminate the transaction afterward, so it's fine to miss some subsequent page faults. We also keep it disabled so long as the current devcoredump hasn't been deleted, because in that case we likely won't capture another one if there's a fault. After this commit HFI messages still occasionally time out, because the crashdump handler doesn't run fast enough to let the GMU resume, but the driver seems to recover from it. This will probably go away after the HFI timeout is increased. Signed-off-by: Connor Abbott Reviewed-by: Rob Clark Patchwork: https://patchwork.freedesktop.org/patch/654891/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 2 ++ drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 4 +++ drivers/gpu/drm/msm/adreno/adreno_gpu.c | 40 ++++++++++++++++++++++++- drivers/gpu/drm/msm/adreno/adreno_gpu.h | 2 ++ drivers/gpu/drm/msm/msm_debugfs.c | 32 ++++++++++++++++++++ drivers/gpu/drm/msm/msm_drv.c | 4 +++ drivers/gpu/drm/msm/msm_drv.h | 23 ++++++++++++++ drivers/gpu/drm/msm/msm_iommu.c | 9 ++++++ drivers/gpu/drm/msm/msm_mmu.h | 1 + 9 files changed, 116 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 650e5bac225f..60aef0796236 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -131,6 +131,8 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_ringbuffer *ring = submit->ring; unsigned int i, ibs = 0; + adreno_check_and_reenable_stall(adreno_gpu); + if (IS_ENABLED(CONFIG_DRM_MSM_GPU_SUDO) && submit->in_rb) { ring->cur_ctx_seqno = 0; a5xx_submit_in_rb(gpu, submit); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index bf3758f010f4..a7ffd92de3bb 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -212,6 +212,8 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_ringbuffer *ring = submit->ring; unsigned int i, ibs = 0; + adreno_check_and_reenable_stall(adreno_gpu); + a6xx_set_pagetable(a6xx_gpu, ring, submit); get_stats_counter(ring, REG_A6XX_RBBM_PERFCTR_CP(0), @@ -335,6 +337,8 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) struct msm_ringbuffer *ring = submit->ring; unsigned int i, ibs = 0; + adreno_check_and_reenable_stall(adreno_gpu); + /* * Toggle concurrent binning for pagetable switch and set the thread to * BR since only it can execute the pagetable switch packets. diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index 5ebd5a6ea143..86bff915c3e7 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -259,16 +259,54 @@ u64 adreno_private_address_space_size(struct msm_gpu *gpu) return BIT(ttbr1_cfg->ias) - ADRENO_VM_START; } +void adreno_check_and_reenable_stall(struct adreno_gpu *adreno_gpu) +{ + struct msm_gpu *gpu = &adreno_gpu->base; + struct msm_drm_private *priv = gpu->dev->dev_private; + unsigned long flags; + + /* + * Wait until the cooldown period has passed and we would actually + * collect a crashdump to re-enable stall-on-fault. + */ + spin_lock_irqsave(&priv->fault_stall_lock, flags); + if (!priv->stall_enabled && + ktime_after(ktime_get(), priv->stall_reenable_time) && + !READ_ONCE(gpu->crashstate)) { + priv->stall_enabled = true; + + gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, true); + } + spin_unlock_irqrestore(&priv->fault_stall_lock, flags); +} + #define ARM_SMMU_FSR_TF BIT(1) #define ARM_SMMU_FSR_PF BIT(3) #define ARM_SMMU_FSR_EF BIT(4) +#define ARM_SMMU_FSR_SS BIT(30) int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags, struct adreno_smmu_fault_info *info, const char *block, u32 scratch[4]) { + struct msm_drm_private *priv = gpu->dev->dev_private; const char *type = "UNKNOWN"; - bool do_devcoredump = info && !READ_ONCE(gpu->crashstate); + bool do_devcoredump = info && (info->fsr & ARM_SMMU_FSR_SS) && + !READ_ONCE(gpu->crashstate); + unsigned long irq_flags; + + /* + * In case there is a subsequent storm of pagefaults, disable + * stall-on-fault for at least half a second. + */ + spin_lock_irqsave(&priv->fault_stall_lock, irq_flags); + if (priv->stall_enabled) { + priv->stall_enabled = false; + + gpu->aspace->mmu->funcs->set_stall(gpu->aspace->mmu, false); + } + priv->stall_reenable_time = ktime_add_ms(ktime_get(), 500); + spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags); /* * Print a default message if we couldn't get the data from the diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h b/drivers/gpu/drm/msm/adreno/adreno_gpu.h index a8f4bf416e64..bc063594a359 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h @@ -636,6 +636,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags, struct adreno_smmu_fault_info *info, const char *block, u32 scratch[4]); +void adreno_check_and_reenable_stall(struct adreno_gpu *gpu); + int adreno_read_speedbin(struct device *dev, u32 *speedbin); /* diff --git a/drivers/gpu/drm/msm/msm_debugfs.c b/drivers/gpu/drm/msm/msm_debugfs.c index 7ab607252d18..6af72162cda4 100644 --- a/drivers/gpu/drm/msm/msm_debugfs.c +++ b/drivers/gpu/drm/msm/msm_debugfs.c @@ -208,6 +208,35 @@ DEFINE_DEBUGFS_ATTRIBUTE(shrink_fops, shrink_get, shrink_set, "0x%08llx\n"); +/* + * Return the number of microseconds to wait until stall-on-fault is + * re-enabled. If 0 then it is already enabled or will be re-enabled on the + * next submit (unless there's a leftover devcoredump). This is useful for + * kernel tests that intentionally produce a fault and check the devcoredump to + * wait until the cooldown period is over. + */ + +static int +stall_reenable_time_get(void *data, u64 *val) +{ + struct msm_drm_private *priv = data; + unsigned long irq_flags; + + spin_lock_irqsave(&priv->fault_stall_lock, irq_flags); + + if (priv->stall_enabled) + *val = 0; + else + *val = max(ktime_us_delta(priv->stall_reenable_time, ktime_get()), 0); + + spin_unlock_irqrestore(&priv->fault_stall_lock, irq_flags); + + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(stall_reenable_time_fops, + stall_reenable_time_get, NULL, + "%lld\n"); static int msm_gem_show(struct seq_file *m, void *arg) { @@ -319,6 +348,9 @@ static void msm_debugfs_gpu_init(struct drm_minor *minor) debugfs_create_bool("disable_err_irq", 0600, minor->debugfs_root, &priv->disable_err_irq); + debugfs_create_file("stall_reenable_time_us", 0400, minor->debugfs_root, + priv, &stall_reenable_time_fops); + gpu_devfreq = debugfs_create_dir("devfreq", minor->debugfs_root); debugfs_create_bool("idle_clamp",0600, gpu_devfreq, diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index f316e6776f67..132d4cac3587 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -245,6 +245,10 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) drm_gem_lru_init(&priv->lru.willneed, &priv->lru.lock); drm_gem_lru_init(&priv->lru.dontneed, &priv->lru.lock); + /* Initialize stall-on-fault */ + spin_lock_init(&priv->fault_stall_lock); + priv->stall_enabled = true; + /* Teach lockdep about lock ordering wrt. shrinker: */ fs_reclaim_acquire(GFP_KERNEL); might_lock(&priv->lru.lock); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index a65077855201..c8afb1ea6040 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -222,6 +222,29 @@ struct msm_drm_private { * the sw hangcheck mechanism. */ bool disable_err_irq; + + /** + * @fault_stall_lock: + * + * Serialize changes to stall-on-fault state. + */ + spinlock_t fault_stall_lock; + + /** + * @fault_stall_reenable_time: + * + * If stall_enabled is false, when to reenable stall-on-fault. + * Protected by @fault_stall_lock. + */ + ktime_t stall_reenable_time; + + /** + * @stall_enabled: + * + * Whether stall-on-fault is currently enabled. Protected by + * @fault_stall_lock. + */ + bool stall_enabled; }; const struct msm_format *mdp_get_format(struct msm_kms *kms, uint32_t format, uint64_t modifier); diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c index aae885d048d0..739ce2c283a4 100644 --- a/drivers/gpu/drm/msm/msm_iommu.c +++ b/drivers/gpu/drm/msm/msm_iommu.c @@ -372,6 +372,14 @@ static int msm_disp_fault_handler(struct iommu_domain *domain, struct device *de return -ENOSYS; } +static void msm_iommu_set_stall(struct msm_mmu *mmu, bool enable) +{ + struct adreno_smmu_priv *adreno_smmu = dev_get_drvdata(mmu->dev); + + if (adreno_smmu->set_stall) + adreno_smmu->set_stall(adreno_smmu->cookie, enable); +} + static void msm_iommu_detach(struct msm_mmu *mmu) { struct msm_iommu *iommu = to_msm_iommu(mmu); @@ -419,6 +427,7 @@ static const struct msm_mmu_funcs funcs = { .map = msm_iommu_map, .unmap = msm_iommu_unmap, .destroy = msm_iommu_destroy, + .set_stall = msm_iommu_set_stall, }; struct msm_mmu *msm_iommu_new(struct device *dev, unsigned long quirks) diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h index c3d17aae88b0..0c694907140d 100644 --- a/drivers/gpu/drm/msm/msm_mmu.h +++ b/drivers/gpu/drm/msm/msm_mmu.h @@ -15,6 +15,7 @@ struct msm_mmu_funcs { size_t len, int prot); int (*unmap)(struct msm_mmu *mmu, uint64_t iova, size_t len); void (*destroy)(struct msm_mmu *mmu); + void (*set_stall)(struct msm_mmu *mmu, bool enable); }; enum msm_mmu_type { From b1c9e797ad37d188675505b66a3a4bbeea5d9560 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 20 May 2025 18:28:05 -0400 Subject: [PATCH 024/284] drm/msm: Fix CP_RESET_CONTEXT_STATE bitfield names Based on kgsl. Fixes: af66706accdf ("drm/msm/a6xx: Add skeleton A7xx support") Signed-off-by: Connor Abbott Patchwork: https://patchwork.freedesktop.org/patch/654922/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml b/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml index 5a6ae9fc3194..462713401622 100644 --- a/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml +++ b/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml @@ -2255,7 +2255,8 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) - + + From 2b520c6104f34e3a548525173c38ebca4402cac3 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 20 May 2025 18:28:06 -0400 Subject: [PATCH 025/284] drm/msm/a7xx: Call CP_RESET_CONTEXT_STATE Calling this packet is necessary when we switch contexts because there are various pieces of state used by userspace to synchronize between BR and BV that are persistent across submits and we need to make sure that they are in a "safe" state when switching contexts. Otherwise a userspace submission in one context could cause another context to function incorrectly and hang, effectively a denial of service (although without leaking data). This was missed during initial a7xx bringup. Fixes: af66706accdf ("drm/msm/a6xx: Add skeleton A7xx support") Signed-off-by: Connor Abbott Patchwork: https://patchwork.freedesktop.org/patch/654924/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index a7ffd92de3bb..491fde0083a2 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -130,6 +130,20 @@ static void a6xx_set_pagetable(struct a6xx_gpu *a6xx_gpu, OUT_RING(ring, lower_32_bits(rbmemptr(ring, fence))); OUT_RING(ring, upper_32_bits(rbmemptr(ring, fence))); OUT_RING(ring, submit->seqno - 1); + + OUT_PKT7(ring, CP_THREAD_CONTROL, 1); + OUT_RING(ring, CP_SET_THREAD_BOTH); + + /* Reset state used to synchronize BR and BV */ + OUT_PKT7(ring, CP_RESET_CONTEXT_STATE, 1); + OUT_RING(ring, + CP_RESET_CONTEXT_STATE_0_CLEAR_ON_CHIP_TS | + CP_RESET_CONTEXT_STATE_0_CLEAR_RESOURCE_TABLE | + CP_RESET_CONTEXT_STATE_0_CLEAR_BV_BR_COUNTER | + CP_RESET_CONTEXT_STATE_0_RESET_GLOBAL_LOCAL_TS); + + OUT_PKT7(ring, CP_THREAD_CONTROL, 1); + OUT_RING(ring, CP_SET_THREAD_BR); } if (!sysprof) { From ba64c6737f8656aa86e9fbcc1ff56e56316e62f6 Mon Sep 17 00:00:00 2001 From: Ryan Eatmon Date: Sat, 24 May 2025 21:25:37 +0530 Subject: [PATCH 026/284] drivers: gpu: drm: msm: registers: improve reproducibility The files generated by gen_header.py capture the source path to the input files and the date. While that can be informative, it varies based on where and when the kernel was built as the full path is captured. Since all of the files that this tool is run on is under the drivers directory, this modifies the application to strip all of the path before drivers. Additionally it prints instead of the date. Signed-off-by: Ryan Eatmon Signed-off-by: Bruce Ashfield Signed-off-by: Viswanath Kraleti Acked-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/655599/ Signed-off-by: Rob Clark --- drivers/gpu/drm/msm/registers/gen_header.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/msm/registers/gen_header.py b/drivers/gpu/drm/msm/registers/gen_header.py index 3926485bb197..a409404627c7 100644 --- a/drivers/gpu/drm/msm/registers/gen_header.py +++ b/drivers/gpu/drm/msm/registers/gen_header.py @@ -11,6 +11,7 @@ import collections import argparse import time import datetime +import re class Error(Exception): def __init__(self, message): @@ -877,13 +878,14 @@ The rules-ng-ng source files this header was generated from are: """) maxlen = 0 for filepath in p.xml_files: - maxlen = max(maxlen, len(filepath)) + new_filepath = re.sub("^.+drivers","drivers",filepath) + maxlen = max(maxlen, len(new_filepath)) for filepath in p.xml_files: - pad = " " * (maxlen - len(filepath)) + pad = " " * (maxlen - len(new_filepath)) filesize = str(os.path.getsize(filepath)) filesize = " " * (7 - len(filesize)) + filesize filetime = time.ctime(os.path.getmtime(filepath)) - print("- " + filepath + pad + " (" + filesize + " bytes, from " + filetime + ")") + print("- " + new_filepath + pad + " (" + filesize + " bytes, from )") if p.copyright_year: current_year = str(datetime.date.today().year) print() From 1453b532d193a0b1b94429c657365e14dd4f361e Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 9 Jun 2025 11:24:35 -0700 Subject: [PATCH 027/284] drm/msm: Rename add_components_mdp() To better match add_gpu_components(). Signed-off-by: Rob Clark Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/657700/ --- drivers/gpu/drm/msm/msm_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 132d4cac3587..8eeb54a367f0 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -930,7 +930,7 @@ static const struct drm_driver msm_driver = { * is no external component that we need to add since LVDS is within MDP4 * itself. */ -static int add_components_mdp(struct device *master_dev, +static int add_mdp_components(struct device *master_dev, struct component_match **matchptr) { struct device_node *np = master_dev->of_node; @@ -1075,7 +1075,7 @@ int msm_drv_probe(struct device *master_dev, /* Add mdp components if we have KMS. */ if (kms_init) { - ret = add_components_mdp(master_dev, &match); + ret = add_mdp_components(master_dev, &match); if (ret) return ret; } From 1efb73791c828f3d1ac68ebbc5eddc35a562f9d0 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 9 Jun 2025 11:24:36 -0700 Subject: [PATCH 028/284] drm/msm/adreno: Pass device_node to find_chipid() We are going to want to re-use this before the component is bound, when we don't yet have the device pointer (but we do have the of node). v2: use %pOF Signed-off-by: Rob Clark Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/657705/ --- drivers/gpu/drm/msm/adreno/adreno_device.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index f5e1490d07c1..f176796e238d 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -137,9 +137,8 @@ struct msm_gpu *adreno_load_gpu(struct drm_device *dev) return NULL; } -static int find_chipid(struct device *dev, uint32_t *chipid) +static int find_chipid(struct device_node *node, uint32_t *chipid) { - struct device_node *node = dev->of_node; const char *compat; int ret; @@ -173,11 +172,12 @@ static int find_chipid(struct device *dev, uint32_t *chipid) /* and if that fails, fall back to legacy "qcom,chipid" property: */ ret = of_property_read_u32(node, "qcom,chipid", chipid); if (ret) { - DRM_DEV_ERROR(dev, "could not parse qcom,chipid: %d\n", ret); + DRM_ERROR("%pOF: could not parse qcom,chipid: %d\n", + node, ret); return ret; } - dev_warn(dev, "Using legacy qcom,chipid binding!\n"); + pr_warn("%pOF: Using legacy qcom,chipid binding!\n", node); return 0; } @@ -191,7 +191,7 @@ static int adreno_bind(struct device *dev, struct device *master, void *data) struct msm_gpu *gpu; int ret; - ret = find_chipid(dev, &config.chip_id); + ret = find_chipid(dev->of_node, &config.chip_id); if (ret) return ret; From 0838fc3e6718743d595bf26e0e69ae55aa51fad1 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 9 Jun 2025 11:24:37 -0700 Subject: [PATCH 029/284] drm/msm/adreno: Check for recognized GPU before bind If we have a newer dtb than kernel, we could end up in a situation where the GPU device is present in the dtb, but not in the drivers device table. We don't want this to prevent the display from probing. So check that we recognize the GPU before adding the GPU component. v2: use %pOF Signed-off-by: Rob Clark Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/657701/ --- drivers/gpu/drm/msm/adreno/adreno_device.c | 29 ++++++++++++++++++---- drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_gpu.h | 1 + 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index f176796e238d..5e7307567239 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -182,6 +182,26 @@ static int find_chipid(struct device_node *node, uint32_t *chipid) return 0; } +bool adreno_has_gpu(struct device_node *node) +{ + const struct adreno_info *info; + uint32_t chip_id; + int ret; + + ret = find_chipid(node, &chip_id); + if (ret) + return false; + + info = adreno_info(chip_id); + if (!info) { + pr_warn("%pOF: Unknown GPU revision: %"ADRENO_CHIPID_FMT"\n", + node, ADRENO_CHIPID_ARGS(chip_id)); + return false; + } + + return true; +} + static int adreno_bind(struct device *dev, struct device *master, void *data) { static struct adreno_platform_config config = {}; @@ -192,18 +212,17 @@ static int adreno_bind(struct device *dev, struct device *master, void *data) int ret; ret = find_chipid(dev->of_node, &config.chip_id); - if (ret) + /* We shouldn't have gotten this far if we can't parse the chip_id */ + if (WARN_ON(ret)) return ret; dev->platform_data = &config; priv->gpu_pdev = to_platform_device(dev); info = adreno_info(config.chip_id); - if (!info) { - dev_warn(drm->dev, "Unknown GPU revision: %"ADRENO_CHIPID_FMT"\n", - ADRENO_CHIPID_ARGS(config.chip_id)); + /* We shouldn't have gotten this far if we don't recognize the GPU: */ + if (!WARN_ON(info)) return -ENXIO; - } config.info = info; diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 8eeb54a367f0..d007687c2446 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -1034,7 +1034,7 @@ static int add_gpu_components(struct device *dev, if (!np) return 0; - if (of_device_is_available(np)) + if (of_device_is_available(np) && adreno_has_gpu(np)) drm_of_component_match_add(dev, matchptr, component_compare_of, np); of_node_put(np); diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index bed0692f5adb..5bf7cd985b9c 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -662,6 +662,7 @@ msm_gpu_create_private_address_space(struct msm_gpu *gpu, struct task_struct *ta void msm_gpu_cleanup(struct msm_gpu *gpu); struct msm_gpu *adreno_load_gpu(struct drm_device *dev); +bool adreno_has_gpu(struct device_node *node); void __init adreno_register(void); void __exit adreno_unregister(void); From c6451a7325874c119def1d4094f6815c0c8fdc23 Mon Sep 17 00:00:00 2001 From: Chris Chiu Date: Tue, 10 Jun 2025 11:56:07 +0800 Subject: [PATCH 030/284] ALSA: hda/realtek: Fix built-in mic on ASUS VivoBook X513EA The built-in mic of ASUS VivoBook X513EA is broken recently by the fix of the pin sort. The fixup ALC256_FIXUP_ASUS_MIC_NO_PRESENCE is working for addressing the regression, too. Fixes: 3b4309546b48 ("ALSA: hda: Fix headset detection failure due to unstable sort") Signed-off-by: Chris Chiu Cc: Link: https://patch.msgid.link/20250610035607.690771-1-chris.chiu@canonical.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index c70bee62653e..e296429c67d5 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10906,6 +10906,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8e60, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e61, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e62, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1043, 0x1032, "ASUS VivoBook X513EA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x1054, "ASUS G614FH/FM/FP", ALC287_FIXUP_CS35L41_I2C_2), From 7b23887a0c70d15459f02c51651a111e9e5cab86 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 9 Jun 2025 11:21:25 +0100 Subject: [PATCH 031/284] ALSA: hda/realtek: Add quirk for Asus GU605C The GU605C has similar audio hardware to the GU605M so apply the same quirk. Note that in the linked bugzilla there are two separate problems with the GU605C. This patch fixes one of the problems, so I haven't added a Closes: tag. Signed-off-by: Richard Fitzgerald Reported-by: Nick Karaolidis Link: https://bugzilla.kernel.org/show_bug.cgi?id=220152 Cc: Link: https://patch.msgid.link/20250609102125.63196-1-rf@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index e296429c67d5..bca725bb8281 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10907,6 +10907,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8e61, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x103c, 0x8e62, "HP Trekker ", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x1043, 0x1032, "ASUS VivoBook X513EA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x1034, "ASUS GU605C", ALC285_FIXUP_ASUS_GU605_SPI_SPEAKER2_TO_DAC1), SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), SND_PCI_QUIRK(0x1043, 0x1054, "ASUS G614FH/FM/FP", ALC287_FIXUP_CS35L41_I2C_2), From ed29e073ba93f2d52832804cabdd831d5d357d33 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Jun 2025 08:43:19 +0200 Subject: [PATCH 032/284] ALSA: sb: Don't allow changing the DMA mode during operations When a PCM stream is already running, one shouldn't change the DMA mode via kcontrol, which may screw up the hardware. Return -EBUSY instead. Link: https://bugzilla.kernel.org/show_bug.cgi?id=218185 Link: https://patch.msgid.link/20250610064322.26787-1-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/isa/sb/sb16_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/isa/sb/sb16_main.c b/sound/isa/sb/sb16_main.c index 74db11525003..c4930efd44e3 100644 --- a/sound/isa/sb/sb16_main.c +++ b/sound/isa/sb/sb16_main.c @@ -703,6 +703,9 @@ static int snd_sb16_dma_control_put(struct snd_kcontrol *kcontrol, struct snd_ct unsigned char nval, oval; int change; + if (chip->mode & (SB_MODE_PLAYBACK | SB_MODE_CAPTURE)) + return -EBUSY; + nval = ucontrol->value.enumerated.item[0]; if (nval > 2) return -EINVAL; From 4c267ae2ef349639b4d9ebf00dd28586a82fdbe6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Jun 2025 08:43:20 +0200 Subject: [PATCH 033/284] ALSA: sb: Force to disable DMAs once when DMA mode is changed When the DMA mode is changed on the (still real!) SB AWE32 after playing a stream and closing, the previous DMA setup was still silently kept, and it can confuse the hardware, resulting in the unexpected noises. As a workaround, enforce the disablement of DMA setups when the DMA setup is changed by the kcontrol. https://bugzilla.kernel.org/show_bug.cgi?id=218185 Link: https://patch.msgid.link/20250610064322.26787-2-tiwai@suse.de Signed-off-by: Takashi Iwai --- sound/isa/sb/sb16_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sound/isa/sb/sb16_main.c b/sound/isa/sb/sb16_main.c index c4930efd44e3..5a083eecaa6b 100644 --- a/sound/isa/sb/sb16_main.c +++ b/sound/isa/sb/sb16_main.c @@ -714,6 +714,10 @@ static int snd_sb16_dma_control_put(struct snd_kcontrol *kcontrol, struct snd_ct change = nval != oval; snd_sb16_set_dma_mode(chip, nval); spin_unlock_irqrestore(&chip->reg_lock, flags); + if (change) { + snd_dma_disable(chip->dma8); + snd_dma_disable(chip->dma16); + } return change; } From e0d4a0f1d066f14522049e827107a577444d9183 Mon Sep 17 00:00:00 2001 From: Yang Shen Date: Thu, 29 May 2025 11:40:23 +0800 Subject: [PATCH 034/284] MAINTAINERS: Update HiSilicon GPIO driver maintainer Add Yang Shen as the maintainer of the HiSilicon GPIO driver, replacing Jay Fang. Signed-off-by: Yang Shen Reviewed-by: Jay Fang Link: https://lore.kernel.org/r/20250529034023.3780376-1-shenyang39@huawei.com Signed-off-by: Bartosz Golaszewski --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..04734120b522 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10839,7 +10839,7 @@ S: Maintained F: drivers/dma/hisi_dma.c HISILICON GPIO DRIVER -M: Jay Fang +M: Yang Shen L: linux-gpio@vger.kernel.org S: Maintained F: Documentation/devicetree/bindings/gpio/hisilicon,ascend910-gpio.yaml From 6325766d69900d1aa9733fc7572456fc4427b708 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 10 Jun 2025 12:32:16 +0200 Subject: [PATCH 035/284] ASoC: sdw_utils: Fix potential NULL pointer deref in is_sdca_endpoint_present() Check the return value of kzalloc() and exit early to avoid a potential NULL pointer dereference. Cc: stable@vger.kernel.org Fixes: 4f8ef33dd44a ("ASoC: soc_sdw_utils: skip the endpoint that doesn't present") Signed-off-by: Thorsten Blum Link: https://patch.msgid.link/20250610103225.1475-2-thorsten.blum@linux.dev Signed-off-by: Mark Brown --- sound/soc/sdw_utils/soc_sdw_utils.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/soc/sdw_utils/soc_sdw_utils.c b/sound/soc/sdw_utils/soc_sdw_utils.c index b7060b746356..d75e7292240b 100644 --- a/sound/soc/sdw_utils/soc_sdw_utils.c +++ b/sound/soc/sdw_utils/soc_sdw_utils.c @@ -1205,6 +1205,8 @@ static int is_sdca_endpoint_present(struct device *dev, int i; dlc = kzalloc(sizeof(*dlc), GFP_KERNEL); + if (!dlc) + return -ENOMEM; adr_end = &adr_dev->endpoints[end_index]; dai_info = &codec_info->dais[adr_end->num]; From 6dea74e454c260cd757b53a2f6861fffe6d83308 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 1 Jun 2025 01:26:54 +0100 Subject: [PATCH 036/284] f2fs: Fix __write_node_folio() conversion This conversion moved the folio_unlock() to inside __write_node_folio(), but missed one caller so we had a double-unlock on this path. Cc: Christoph Hellwig Cc: Chao Yu Cc: Jaegeuk Kim Reported-by: syzbot+c0dc46208750f063d0e0@syzkaller.appspotmail.com Fixes: 80f31d2a7e5f (f2fs: return bool from __write_node_folio) Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 1cb4cba7f961..bfe104db284e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2078,7 +2078,6 @@ int f2fs_sync_node_pages(struct f2fs_sb_info *sbi, if (!__write_node_folio(folio, false, &submitted, wbc, do_balance, io_type, NULL)) { - folio_unlock(folio); folio_batch_release(&fbatch); ret = -EIO; goto out; From bc4394e5e79cdda1b0997e0be1d65e242f523f02 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 6 Jun 2025 12:25:46 -0700 Subject: [PATCH 037/284] perf: Fix the throttle error of some clock events Both ARM and IBM CI reports RCU stall, which can be reproduced by the below perf command. perf record -a -e cpu-clock -- sleep 2 The issue is introduced by the generic throttle patch set, which unconditionally invoke the event_stop() when throttle is triggered. The cpu-clock and task-clock are two special SW events, which rely on the hrtimer. The throttle is invoked in the hrtimer handler. The event_stop()->hrtimer_cancel() waits for the handler to finish, which is a deadlock. Instead of invoking the stop(), the HRTIMER_NORESTART should be used to stop the timer. There may be two ways to fix it: - Introduce a PMU flag to track the case. Avoid the event_stop in perf_event_throttle() if the flag is detected. It has been implemented in the https://lore.kernel.org/lkml/20250528175832.2999139-1-kan.liang@linux.intel.com/ The new flag was thought to be an overkill for the issue. - Add a check in the event_stop. Return immediately if the throttle is invoked in the hrtimer handler. Rely on the existing HRTIMER_NORESTART method to stop the timer. The latter is implemented here. Move event->hw.interrupts = MAX_INTERRUPTS before the stop(). It makes the order the same as perf_event_unthrottle(). Except the patch, no one checks the hw.interrupts in the stop(). There is no impact from the order change. When stops in the throttle, the event should not be updated, stop(event, 0). But the cpu_clock_event_stop() doesn't handle the flag. In logic, it's wrong. But it didn't bring any problems with the old code, because the stop() was not invoked when handling the throttle. Checking the flag before updating the event. Fixes: 9734e25fbf5a ("perf: Fix the throttle logic for a group") Closes: https://lore.kernel.org/lkml/20250527161656.GJ2566836@e132581.arm.com/ Closes: https://lore.kernel.org/lkml/djxlh5fx326gcenwrr52ry3pk4wxmugu4jccdjysza7tlc5fef@ktp4rffawgcw/ Closes: https://lore.kernel.org/lkml/8e8f51d8-af64-4d9e-934b-c0ee9f131293@linux.ibm.com/ Closes: https://lore.kernel.org/lkml/4ce106d0-950c-aadc-0b6a-f0215cd39913@maine.edu/ Reported-by: Leo Yan Reported-by: Aishwarya TCV Reported-by: Alexei Starovoitov Reported-by: Venkat Rao Bagalkote Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ian Rogers Link: https://lkml.kernel.org/r/20250606192546.915765-1-kan.liang@linux.intel.com --- kernel/events/core.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index d7cf008f3d75..1f746469fda5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2674,8 +2674,8 @@ static void perf_event_unthrottle(struct perf_event *event, bool start) static void perf_event_throttle(struct perf_event *event) { - event->pmu->stop(event, 0); event->hw.interrupts = MAX_INTERRUPTS; + event->pmu->stop(event, 0); if (event == event->group_leader) perf_log_throttle(event, 0); } @@ -11774,7 +11774,12 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - if (is_sampling_event(event)) { + /* + * The throttle can be triggered in the hrtimer handler. + * The HRTIMER_NORESTART should be used to stop the timer, + * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + */ + if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); @@ -11829,7 +11834,8 @@ static void cpu_clock_event_start(struct perf_event *event, int flags) static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - cpu_clock_event_update(event); + if (flags & PERF_EF_UPDATE) + cpu_clock_event_update(event); } static int cpu_clock_event_add(struct perf_event *event, int flags) @@ -11907,7 +11913,8 @@ static void task_clock_event_start(struct perf_event *event, int flags) static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); - task_clock_event_update(event, event->ctx->time); + if (flags & PERF_EF_UPDATE) + task_clock_event_update(event, event->ctx->time); } static int task_clock_event_add(struct perf_event *event, int flags) From 72f37957007d25f4290e3ba40d40aaec1dd0b0cf Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Tue, 10 Jun 2025 19:59:26 +0800 Subject: [PATCH 038/284] gpio: loongson-64bit: Correct Loongson-7A2000 ACPI GPIO access mode According to the description of the Loongson-7A2000 ACPI GPIO register in the manual, its access mode should be BIT_CTRL_MODE, otherwise there maybe some unpredictable behavior. Cc: stable@vger.kernel.org Fixes: 44fe79020b91 ("gpio: loongson-64bit: Add more gpio chip support") Signed-off-by: Binbin Zhou Reviewed-by: Huacai Chen Link: https://lore.kernel.org/r/20250610115926.347845-1-zhoubinbin@loongson.cn Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-loongson-64bit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-loongson-64bit.c b/drivers/gpio/gpio-loongson-64bit.c index 26227669f026..70a01c5b8ad1 100644 --- a/drivers/gpio/gpio-loongson-64bit.c +++ b/drivers/gpio/gpio-loongson-64bit.c @@ -268,7 +268,7 @@ static const struct loongson_gpio_chip_data loongson_gpio_ls7a2000_data0 = { /* LS7A2000 ACPI GPIO */ static const struct loongson_gpio_chip_data loongson_gpio_ls7a2000_data1 = { .label = "ls7a2000_gpio", - .mode = BYTE_CTRL_MODE, + .mode = BIT_CTRL_MODE, .conf_offset = 0x4, .in_offset = 0x8, .out_offset = 0x0, From 69a14d146f3b87819f3fb73ed5d1de3e1fa680c1 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 2 Jun 2025 13:00:27 +0200 Subject: [PATCH 039/284] futex: Verify under the lock if hash can be replaced Once the global hash is requested there is no way back to switch back to the per-task private hash. This is checked at the begin of the function. It is possible that two threads simultaneously request the global hash and both pass the initial check and block later on the mm::futex_hash_lock. In this case the first thread performs the switch to the global hash. The second thread will also attempt to switch to the global hash and while doing so, accessing the nonexisting slot 1 of the struct futex_private_hash. The same applies if the hash is made immutable: There is no reference counting and the hash must not be replaced. Verify under mm_struct::futex_phash that neither the global hash nor an immutable hash in use. Tested-by: "Lai, Yi" Reported-by: "Lai, Yi" Closes: https://lore.kernel.org/all/aDwDw9Aygqo6oAx+@ly-workstation/ Fixes: bd54df5ea7cad ("futex: Allow to resize the private local hash") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/20250610104400.1077266-5-bigeasy@linutronix.de/ --- kernel/futex/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index b652d2f60c40..90d53fb0ee9e 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1629,6 +1629,16 @@ static int futex_hash_allocate(unsigned int hash_slots, unsigned int flags) mm->futex_phash_new = NULL; if (fph) { + if (cur && (!cur->hash_mask || cur->immutable)) { + /* + * If two threads simultaneously request the global + * hash then the first one performs the switch, + * the second one returns here. + */ + free = fph; + mm->futex_phash_new = new; + return -EBUSY; + } if (cur && !new) { /* * If we have an existing hash, but do not yet have From 903cc7096db22f889d48e2cee8840709ce04fdac Mon Sep 17 00:00:00 2001 From: Akhil R Date: Tue, 3 Jun 2025 21:00:20 +0530 Subject: [PATCH 040/284] dt-bindings: i2c: nvidia,tegra20-i2c: Specify the required properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Specify the properties which are essential and which are not for the Tegra I2C driver to function correctly. This was not added correctly when the TXT binding was converted to yaml. All the existing DT nodes have these properties already and hence this does not break the ABI. dmas and dma-names which were specified as a must in the TXT binding is now made optional since the driver can work in PIO mode if dmas are missing. Fixes: f10a9b722f80 ("dt-bindings: i2c: tegra: Convert to json-schema”) Signed-off-by: Akhil R Cc: # v5.17+ Reviewed-by: Krzysztof Kozlowski Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250603153022.39434-1-akhilrajeev@nvidia.com --- .../bindings/i2c/nvidia,tegra20-i2c.yaml | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml b/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml index b57ae6963e62..6b6f6762d122 100644 --- a/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml +++ b/Documentation/devicetree/bindings/i2c/nvidia,tegra20-i2c.yaml @@ -97,7 +97,10 @@ properties: resets: items: - - description: module reset + - description: + Module reset. This property is optional for controllers in Tegra194, + Tegra234 etc where an internal software reset is available as an + alternative. reset-names: items: @@ -116,6 +119,13 @@ properties: - const: rx - const: tx +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + allOf: - $ref: /schemas/i2c/i2c-controller.yaml - if: @@ -169,6 +179,18 @@ allOf: properties: power-domains: false + - if: + not: + properties: + compatible: + contains: + enum: + - nvidia,tegra194-i2c + then: + required: + - resets + - reset-names + unevaluatedProperties: false examples: From 614b1c3cbfb0ecbafd40284d2f8e67c865818714 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 12 Jun 2025 09:27:22 +0200 Subject: [PATCH 041/284] i2c: use inclusive callbacks in struct i2c_algorithm Convert the I2C subsystem to drop using the 'master_'-prefixed callbacks in favor of the simplified ones. Fix alignment of '=' while here. Signed-off-by: Wolfram Sang --- drivers/i2c/algos/i2c-algo-bit.c | 4 ++-- drivers/i2c/algos/i2c-algo-pca.c | 4 ++-- drivers/i2c/algos/i2c-algo-pcf.c | 4 ++-- drivers/i2c/busses/i2c-amd-mp2-plat.c | 2 +- drivers/i2c/busses/i2c-aspeed.c | 8 ++++---- drivers/i2c/busses/i2c-at91-master.c | 4 ++-- drivers/i2c/busses/i2c-axxia.c | 2 +- drivers/i2c/busses/i2c-bcm-iproc.c | 2 +- drivers/i2c/busses/i2c-cadence.c | 10 +++++----- drivers/i2c/busses/i2c-cgbc.c | 4 ++-- drivers/i2c/busses/i2c-eg20t.c | 2 +- drivers/i2c/busses/i2c-emev2.c | 6 +++--- drivers/i2c/busses/i2c-exynos5.c | 6 +++--- drivers/i2c/busses/i2c-gxp.c | 6 +++--- drivers/i2c/busses/i2c-img-scb.c | 2 +- drivers/i2c/busses/i2c-imx-lpi2c.c | 8 ++++---- drivers/i2c/busses/i2c-imx.c | 8 ++++---- drivers/i2c/busses/i2c-keba.c | 2 +- drivers/i2c/busses/i2c-mchp-pci1xxxx.c | 2 +- drivers/i2c/busses/i2c-meson.c | 4 ++-- drivers/i2c/busses/i2c-microchip-corei2c.c | 2 +- drivers/i2c/busses/i2c-mt65xx.c | 2 +- drivers/i2c/busses/i2c-mxs.c | 2 +- drivers/i2c/busses/i2c-nomadik.c | 4 ++-- drivers/i2c/busses/i2c-npcm7xx.c | 6 +++--- drivers/i2c/busses/i2c-omap.c | 6 +++--- drivers/i2c/busses/i2c-pnx.c | 2 +- drivers/i2c/busses/i2c-pxa.c | 16 ++++++++-------- drivers/i2c/busses/i2c-qcom-cci.c | 4 ++-- drivers/i2c/busses/i2c-qcom-geni.c | 4 ++-- drivers/i2c/busses/i2c-qup.c | 8 ++++---- drivers/i2c/busses/i2c-rcar.c | 10 +++++----- drivers/i2c/busses/i2c-s3c2410.c | 6 +++--- drivers/i2c/busses/i2c-sh7760.c | 4 ++-- drivers/i2c/busses/i2c-sh_mobile.c | 4 ++-- drivers/i2c/busses/i2c-stm32f7.c | 4 ++-- drivers/i2c/busses/i2c-synquacer.c | 4 ++-- drivers/i2c/busses/i2c-tegra.c | 6 +++--- drivers/i2c/busses/i2c-xiic.c | 4 ++-- drivers/i2c/busses/i2c-xlp9xx.c | 2 +- drivers/i2c/i2c-atr.c | 2 +- drivers/i2c/i2c-mux.c | 6 +++--- drivers/i2c/muxes/i2c-demux-pinctrl.c | 4 ++-- 43 files changed, 101 insertions(+), 101 deletions(-) diff --git a/drivers/i2c/algos/i2c-algo-bit.c b/drivers/i2c/algos/i2c-algo-bit.c index eddf25b90ca8..6544d27e4419 100644 --- a/drivers/i2c/algos/i2c-algo-bit.c +++ b/drivers/i2c/algos/i2c-algo-bit.c @@ -619,8 +619,8 @@ static u32 bit_func(struct i2c_adapter *adap) /* -----exported algorithm data: ------------------------------------- */ const struct i2c_algorithm i2c_bit_algo = { - .master_xfer = bit_xfer, - .master_xfer_atomic = bit_xfer_atomic, + .xfer = bit_xfer, + .xfer_atomic = bit_xfer_atomic, .functionality = bit_func, }; EXPORT_SYMBOL(i2c_bit_algo); diff --git a/drivers/i2c/algos/i2c-algo-pca.c b/drivers/i2c/algos/i2c-algo-pca.c index 384af88e58ad..74b66aec33d4 100644 --- a/drivers/i2c/algos/i2c-algo-pca.c +++ b/drivers/i2c/algos/i2c-algo-pca.c @@ -361,8 +361,8 @@ static u32 pca_func(struct i2c_adapter *adap) } static const struct i2c_algorithm pca_algo = { - .master_xfer = pca_xfer, - .functionality = pca_func, + .xfer = pca_xfer, + .functionality = pca_func, }; static unsigned int pca_probe_chip(struct i2c_adapter *adap) diff --git a/drivers/i2c/algos/i2c-algo-pcf.c b/drivers/i2c/algos/i2c-algo-pcf.c index 740066ceaea3..fd563e845d4b 100644 --- a/drivers/i2c/algos/i2c-algo-pcf.c +++ b/drivers/i2c/algos/i2c-algo-pcf.c @@ -389,8 +389,8 @@ static u32 pcf_func(struct i2c_adapter *adap) /* exported algorithm data: */ static const struct i2c_algorithm pcf_algo = { - .master_xfer = pcf_xfer, - .functionality = pcf_func, + .xfer = pcf_xfer, + .functionality = pcf_func, }; /* diff --git a/drivers/i2c/busses/i2c-amd-mp2-plat.c b/drivers/i2c/busses/i2c-amd-mp2-plat.c index d9dd0e475d1a..188e24cc4d35 100644 --- a/drivers/i2c/busses/i2c-amd-mp2-plat.c +++ b/drivers/i2c/busses/i2c-amd-mp2-plat.c @@ -179,7 +179,7 @@ static u32 i2c_amd_func(struct i2c_adapter *a) } static const struct i2c_algorithm i2c_amd_algorithm = { - .master_xfer = i2c_amd_xfer, + .xfer = i2c_amd_xfer, .functionality = i2c_amd_func, }; diff --git a/drivers/i2c/busses/i2c-aspeed.c b/drivers/i2c/busses/i2c-aspeed.c index 1550d3d552ae..a26b74c71206 100644 --- a/drivers/i2c/busses/i2c-aspeed.c +++ b/drivers/i2c/busses/i2c-aspeed.c @@ -814,11 +814,11 @@ static int aspeed_i2c_unreg_slave(struct i2c_client *client) #endif /* CONFIG_I2C_SLAVE */ static const struct i2c_algorithm aspeed_i2c_algo = { - .master_xfer = aspeed_i2c_master_xfer, - .functionality = aspeed_i2c_functionality, + .xfer = aspeed_i2c_master_xfer, + .functionality = aspeed_i2c_functionality, #if IS_ENABLED(CONFIG_I2C_SLAVE) - .reg_slave = aspeed_i2c_reg_slave, - .unreg_slave = aspeed_i2c_unreg_slave, + .reg_slave = aspeed_i2c_reg_slave, + .unreg_slave = aspeed_i2c_unreg_slave, #endif /* CONFIG_I2C_SLAVE */ }; diff --git a/drivers/i2c/busses/i2c-at91-master.c b/drivers/i2c/busses/i2c-at91-master.c index 374fc50bb205..59795c1c24ff 100644 --- a/drivers/i2c/busses/i2c-at91-master.c +++ b/drivers/i2c/busses/i2c-at91-master.c @@ -739,8 +739,8 @@ static u32 at91_twi_func(struct i2c_adapter *adapter) } static const struct i2c_algorithm at91_twi_algorithm = { - .master_xfer = at91_twi_xfer, - .functionality = at91_twi_func, + .xfer = at91_twi_xfer, + .functionality = at91_twi_func, }; static int at91_twi_configure_dma(struct at91_twi_dev *dev, u32 phy_addr) diff --git a/drivers/i2c/busses/i2c-axxia.c b/drivers/i2c/busses/i2c-axxia.c index 50030256cd85..0555eeb6903a 100644 --- a/drivers/i2c/busses/i2c-axxia.c +++ b/drivers/i2c/busses/i2c-axxia.c @@ -706,7 +706,7 @@ static int axxia_i2c_unreg_slave(struct i2c_client *slave) } static const struct i2c_algorithm axxia_i2c_algo = { - .master_xfer = axxia_i2c_xfer, + .xfer = axxia_i2c_xfer, .functionality = axxia_i2c_func, .reg_slave = axxia_i2c_reg_slave, .unreg_slave = axxia_i2c_unreg_slave, diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c index 63bc3c8f49d3..e418a4f23f15 100644 --- a/drivers/i2c/busses/i2c-bcm-iproc.c +++ b/drivers/i2c/busses/i2c-bcm-iproc.c @@ -1041,7 +1041,7 @@ static int bcm_iproc_i2c_unreg_slave(struct i2c_client *slave) } static struct i2c_algorithm bcm_iproc_algo = { - .master_xfer = bcm_iproc_i2c_xfer, + .xfer = bcm_iproc_i2c_xfer, .functionality = bcm_iproc_i2c_functionality, .reg_slave = bcm_iproc_i2c_reg_slave, .unreg_slave = bcm_iproc_i2c_unreg_slave, diff --git a/drivers/i2c/busses/i2c-cadence.c b/drivers/i2c/busses/i2c-cadence.c index 8df63aaf2a80..697d095afbe4 100644 --- a/drivers/i2c/busses/i2c-cadence.c +++ b/drivers/i2c/busses/i2c-cadence.c @@ -1231,12 +1231,12 @@ static int cdns_unreg_slave(struct i2c_client *slave) #endif static const struct i2c_algorithm cdns_i2c_algo = { - .master_xfer = cdns_i2c_master_xfer, - .master_xfer_atomic = cdns_i2c_master_xfer_atomic, - .functionality = cdns_i2c_func, + .xfer = cdns_i2c_master_xfer, + .xfer_atomic = cdns_i2c_master_xfer_atomic, + .functionality = cdns_i2c_func, #if IS_ENABLED(CONFIG_I2C_SLAVE) - .reg_slave = cdns_reg_slave, - .unreg_slave = cdns_unreg_slave, + .reg_slave = cdns_reg_slave, + .unreg_slave = cdns_unreg_slave, #endif }; diff --git a/drivers/i2c/busses/i2c-cgbc.c b/drivers/i2c/busses/i2c-cgbc.c index f054d167ac47..25a74fa51aa0 100644 --- a/drivers/i2c/busses/i2c-cgbc.c +++ b/drivers/i2c/busses/i2c-cgbc.c @@ -331,8 +331,8 @@ static u32 cgbc_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm cgbc_i2c_algorithm = { - .master_xfer = cgbc_i2c_xfer, - .functionality = cgbc_i2c_func, + .xfer = cgbc_i2c_xfer, + .functionality = cgbc_i2c_func, }; static struct i2c_algo_cgbc_data cgbc_i2c_algo_data[] = { diff --git a/drivers/i2c/busses/i2c-eg20t.c b/drivers/i2c/busses/i2c-eg20t.c index efdaddf99f9e..27ea3c130a16 100644 --- a/drivers/i2c/busses/i2c-eg20t.c +++ b/drivers/i2c/busses/i2c-eg20t.c @@ -690,7 +690,7 @@ static u32 pch_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm pch_algorithm = { - .master_xfer = pch_i2c_xfer, + .xfer = pch_i2c_xfer, .functionality = pch_i2c_func }; diff --git a/drivers/i2c/busses/i2c-emev2.c b/drivers/i2c/busses/i2c-emev2.c index 2512cef8e2a2..ece019b3d066 100644 --- a/drivers/i2c/busses/i2c-emev2.c +++ b/drivers/i2c/busses/i2c-emev2.c @@ -351,10 +351,10 @@ static int em_i2c_unreg_slave(struct i2c_client *slave) } static const struct i2c_algorithm em_i2c_algo = { - .master_xfer = em_i2c_xfer, + .xfer = em_i2c_xfer, .functionality = em_i2c_func, - .reg_slave = em_i2c_reg_slave, - .unreg_slave = em_i2c_unreg_slave, + .reg_slave = em_i2c_reg_slave, + .unreg_slave = em_i2c_unreg_slave, }; static int em_i2c_probe(struct platform_device *pdev) diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c index 02f24479aa07..9c1c5f3c09f6 100644 --- a/drivers/i2c/busses/i2c-exynos5.c +++ b/drivers/i2c/busses/i2c-exynos5.c @@ -879,9 +879,9 @@ static u32 exynos5_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm exynos5_i2c_algorithm = { - .master_xfer = exynos5_i2c_xfer, - .master_xfer_atomic = exynos5_i2c_xfer_atomic, - .functionality = exynos5_i2c_func, + .xfer = exynos5_i2c_xfer, + .xfer_atomic = exynos5_i2c_xfer_atomic, + .functionality = exynos5_i2c_func, }; static int exynos5_i2c_probe(struct platform_device *pdev) diff --git a/drivers/i2c/busses/i2c-gxp.c b/drivers/i2c/busses/i2c-gxp.c index 0fc39caa6c87..2d117e7e3cb6 100644 --- a/drivers/i2c/busses/i2c-gxp.c +++ b/drivers/i2c/busses/i2c-gxp.c @@ -184,11 +184,11 @@ static int gxp_i2c_unreg_slave(struct i2c_client *slave) #endif static const struct i2c_algorithm gxp_i2c_algo = { - .master_xfer = gxp_i2c_master_xfer, + .xfer = gxp_i2c_master_xfer, .functionality = gxp_i2c_func, #if IS_ENABLED(CONFIG_I2C_SLAVE) - .reg_slave = gxp_i2c_reg_slave, - .unreg_slave = gxp_i2c_unreg_slave, + .reg_slave = gxp_i2c_reg_slave, + .unreg_slave = gxp_i2c_unreg_slave, #endif }; diff --git a/drivers/i2c/busses/i2c-img-scb.c b/drivers/i2c/busses/i2c-img-scb.c index 3278707bb885..a454f9f25146 100644 --- a/drivers/i2c/busses/i2c-img-scb.c +++ b/drivers/i2c/busses/i2c-img-scb.c @@ -1143,7 +1143,7 @@ static u32 img_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm img_i2c_algo = { - .master_xfer = img_i2c_xfer, + .xfer = img_i2c_xfer, .functionality = img_i2c_func, }; diff --git a/drivers/i2c/busses/i2c-imx-lpi2c.c b/drivers/i2c/busses/i2c-imx-lpi2c.c index 342d47e67586..064bc83840a6 100644 --- a/drivers/i2c/busses/i2c-imx-lpi2c.c +++ b/drivers/i2c/busses/i2c-imx-lpi2c.c @@ -1268,10 +1268,10 @@ static u32 lpi2c_imx_func(struct i2c_adapter *adapter) } static const struct i2c_algorithm lpi2c_imx_algo = { - .master_xfer = lpi2c_imx_xfer, - .functionality = lpi2c_imx_func, - .reg_target = lpi2c_imx_register_target, - .unreg_target = lpi2c_imx_unregister_target, + .xfer = lpi2c_imx_xfer, + .functionality = lpi2c_imx_func, + .reg_target = lpi2c_imx_register_target, + .unreg_target = lpi2c_imx_unregister_target, }; static const struct of_device_id lpi2c_imx_of_match[] = { diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c index de01dfecb16e..e5732b0557fb 100644 --- a/drivers/i2c/busses/i2c-imx.c +++ b/drivers/i2c/busses/i2c-imx.c @@ -1692,11 +1692,11 @@ static u32 i2c_imx_func(struct i2c_adapter *adapter) } static const struct i2c_algorithm i2c_imx_algo = { - .master_xfer = i2c_imx_xfer, - .master_xfer_atomic = i2c_imx_xfer_atomic, + .xfer = i2c_imx_xfer, + .xfer_atomic = i2c_imx_xfer_atomic, .functionality = i2c_imx_func, - .reg_slave = i2c_imx_reg_slave, - .unreg_slave = i2c_imx_unreg_slave, + .reg_slave = i2c_imx_reg_slave, + .unreg_slave = i2c_imx_unreg_slave, }; static int i2c_imx_probe(struct platform_device *pdev) diff --git a/drivers/i2c/busses/i2c-keba.c b/drivers/i2c/busses/i2c-keba.c index 7b9ed2592f5b..9420c8b342b5 100644 --- a/drivers/i2c/busses/i2c-keba.c +++ b/drivers/i2c/busses/i2c-keba.c @@ -500,7 +500,7 @@ static u32 ki2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm ki2c_algo = { - .master_xfer = ki2c_xfer, + .xfer = ki2c_xfer, .functionality = ki2c_func, }; diff --git a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c index 5ef136c3ecb1..bc0f1a0c8ee1 100644 --- a/drivers/i2c/busses/i2c-mchp-pci1xxxx.c +++ b/drivers/i2c/busses/i2c-mchp-pci1xxxx.c @@ -1048,7 +1048,7 @@ static u32 pci1xxxx_i2c_get_funcs(struct i2c_adapter *adap) } static const struct i2c_algorithm pci1xxxx_i2c_algo = { - .master_xfer = pci1xxxx_i2c_xfer, + .xfer = pci1xxxx_i2c_xfer, .functionality = pci1xxxx_i2c_get_funcs, }; diff --git a/drivers/i2c/busses/i2c-meson.c b/drivers/i2c/busses/i2c-meson.c index e1d69537353b..0d9032953e48 100644 --- a/drivers/i2c/busses/i2c-meson.c +++ b/drivers/i2c/busses/i2c-meson.c @@ -448,8 +448,8 @@ static u32 meson_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm meson_i2c_algorithm = { - .master_xfer = meson_i2c_xfer, - .master_xfer_atomic = meson_i2c_xfer_atomic, + .xfer = meson_i2c_xfer, + .xfer_atomic = meson_i2c_xfer_atomic, .functionality = meson_i2c_func, }; diff --git a/drivers/i2c/busses/i2c-microchip-corei2c.c b/drivers/i2c/busses/i2c-microchip-corei2c.c index 492bf4c34722..f173bda1c98c 100644 --- a/drivers/i2c/busses/i2c-microchip-corei2c.c +++ b/drivers/i2c/busses/i2c-microchip-corei2c.c @@ -526,7 +526,7 @@ static int mchp_corei2c_smbus_xfer(struct i2c_adapter *adap, u16 addr, unsigned } static const struct i2c_algorithm mchp_corei2c_algo = { - .master_xfer = mchp_corei2c_xfer, + .xfer = mchp_corei2c_xfer, .functionality = mchp_corei2c_func, .smbus_xfer = mchp_corei2c_smbus_xfer, }; diff --git a/drivers/i2c/busses/i2c-mt65xx.c b/drivers/i2c/busses/i2c-mt65xx.c index 5bd342047d59..ab456c3717db 100644 --- a/drivers/i2c/busses/i2c-mt65xx.c +++ b/drivers/i2c/busses/i2c-mt65xx.c @@ -1342,7 +1342,7 @@ static u32 mtk_i2c_functionality(struct i2c_adapter *adap) } static const struct i2c_algorithm mtk_i2c_algorithm = { - .master_xfer = mtk_i2c_transfer, + .xfer = mtk_i2c_transfer, .functionality = mtk_i2c_functionality, }; diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c index ad62d56b2186..08c9091a1e35 100644 --- a/drivers/i2c/busses/i2c-mxs.c +++ b/drivers/i2c/busses/i2c-mxs.c @@ -687,7 +687,7 @@ static irqreturn_t mxs_i2c_isr(int this_irq, void *dev_id) } static const struct i2c_algorithm mxs_i2c_algo = { - .master_xfer = mxs_i2c_xfer, + .xfer = mxs_i2c_xfer, .functionality = mxs_i2c_func, }; diff --git a/drivers/i2c/busses/i2c-nomadik.c b/drivers/i2c/busses/i2c-nomadik.c index d2877e4cc28d..19b648fc094d 100644 --- a/drivers/i2c/busses/i2c-nomadik.c +++ b/drivers/i2c/busses/i2c-nomadik.c @@ -996,8 +996,8 @@ static unsigned int nmk_i2c_functionality(struct i2c_adapter *adap) } static const struct i2c_algorithm nmk_i2c_algo = { - .master_xfer = nmk_i2c_xfer, - .functionality = nmk_i2c_functionality + .xfer = nmk_i2c_xfer, + .functionality = nmk_i2c_functionality }; static void nmk_i2c_of_probe(struct device_node *np, diff --git a/drivers/i2c/busses/i2c-npcm7xx.c b/drivers/i2c/busses/i2c-npcm7xx.c index 892e2d2988a7..8b7e15240fb0 100644 --- a/drivers/i2c/busses/i2c-npcm7xx.c +++ b/drivers/i2c/busses/i2c-npcm7xx.c @@ -2470,11 +2470,11 @@ static const struct i2c_adapter_quirks npcm_i2c_quirks = { }; static const struct i2c_algorithm npcm_i2c_algo = { - .master_xfer = npcm_i2c_master_xfer, + .xfer = npcm_i2c_master_xfer, .functionality = npcm_i2c_functionality, #if IS_ENABLED(CONFIG_I2C_SLAVE) - .reg_slave = npcm_i2c_reg_slave, - .unreg_slave = npcm_i2c_unreg_slave, + .reg_slave = npcm_i2c_reg_slave, + .unreg_slave = npcm_i2c_unreg_slave, #endif }; diff --git a/drivers/i2c/busses/i2c-omap.c b/drivers/i2c/busses/i2c-omap.c index 876791d20ed5..f1cc26ac5b80 100644 --- a/drivers/i2c/busses/i2c-omap.c +++ b/drivers/i2c/busses/i2c-omap.c @@ -1201,9 +1201,9 @@ omap_i2c_isr_thread(int this_irq, void *dev_id) } static const struct i2c_algorithm omap_i2c_algo = { - .master_xfer = omap_i2c_xfer_irq, - .master_xfer_atomic = omap_i2c_xfer_polling, - .functionality = omap_i2c_func, + .xfer = omap_i2c_xfer_irq, + .xfer_atomic = omap_i2c_xfer_polling, + .functionality = omap_i2c_func, }; static const struct i2c_adapter_quirks omap_i2c_quirks = { diff --git a/drivers/i2c/busses/i2c-pnx.c b/drivers/i2c/busses/i2c-pnx.c index 9a1af5bbd604..8daa0008bd05 100644 --- a/drivers/i2c/busses/i2c-pnx.c +++ b/drivers/i2c/busses/i2c-pnx.c @@ -580,7 +580,7 @@ static u32 i2c_pnx_func(struct i2c_adapter *adapter) } static const struct i2c_algorithm pnx_algorithm = { - .master_xfer = i2c_pnx_xfer, + .xfer = i2c_pnx_xfer, .functionality = i2c_pnx_func, }; diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index 4415a29f749b..968a8b8794da 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -1154,11 +1154,11 @@ static u32 i2c_pxa_functionality(struct i2c_adapter *adap) } static const struct i2c_algorithm i2c_pxa_algorithm = { - .master_xfer = i2c_pxa_xfer, - .functionality = i2c_pxa_functionality, + .xfer = i2c_pxa_xfer, + .functionality = i2c_pxa_functionality, #ifdef CONFIG_I2C_PXA_SLAVE - .reg_slave = i2c_pxa_slave_reg, - .unreg_slave = i2c_pxa_slave_unreg, + .reg_slave = i2c_pxa_slave_reg, + .unreg_slave = i2c_pxa_slave_unreg, #endif }; @@ -1244,11 +1244,11 @@ static int i2c_pxa_pio_xfer(struct i2c_adapter *adap, } static const struct i2c_algorithm i2c_pxa_pio_algorithm = { - .master_xfer = i2c_pxa_pio_xfer, - .functionality = i2c_pxa_functionality, + .xfer = i2c_pxa_pio_xfer, + .functionality = i2c_pxa_functionality, #ifdef CONFIG_I2C_PXA_SLAVE - .reg_slave = i2c_pxa_slave_reg, - .unreg_slave = i2c_pxa_slave_unreg, + .reg_slave = i2c_pxa_slave_reg, + .unreg_slave = i2c_pxa_slave_unreg, #endif }; diff --git a/drivers/i2c/busses/i2c-qcom-cci.c b/drivers/i2c/busses/i2c-qcom-cci.c index 05b73326afd4..a3afa11a71a1 100644 --- a/drivers/i2c/busses/i2c-qcom-cci.c +++ b/drivers/i2c/busses/i2c-qcom-cci.c @@ -462,8 +462,8 @@ static u32 cci_func(struct i2c_adapter *adap) } static const struct i2c_algorithm cci_algo = { - .master_xfer = cci_xfer, - .functionality = cci_func, + .xfer = cci_xfer, + .functionality = cci_func, }; static int cci_enable_clocks(struct cci *cci) diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index ccea575fb783..13889f52b6f7 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -727,8 +727,8 @@ static u32 geni_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm geni_i2c_algo = { - .master_xfer = geni_i2c_xfer, - .functionality = geni_i2c_func, + .xfer = geni_i2c_xfer, + .functionality = geni_i2c_func, }; #ifdef CONFIG_ACPI diff --git a/drivers/i2c/busses/i2c-qup.c b/drivers/i2c/busses/i2c-qup.c index 3a36d682ed57..6059f585843e 100644 --- a/drivers/i2c/busses/i2c-qup.c +++ b/drivers/i2c/busses/i2c-qup.c @@ -1634,13 +1634,13 @@ static u32 qup_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm qup_i2c_algo = { - .master_xfer = qup_i2c_xfer, - .functionality = qup_i2c_func, + .xfer = qup_i2c_xfer, + .functionality = qup_i2c_func, }; static const struct i2c_algorithm qup_i2c_algo_v2 = { - .master_xfer = qup_i2c_xfer_v2, - .functionality = qup_i2c_func, + .xfer = qup_i2c_xfer_v2, + .functionality = qup_i2c_func, }; /* diff --git a/drivers/i2c/busses/i2c-rcar.c b/drivers/i2c/busses/i2c-rcar.c index 5693a38da7b5..d51884ab99f4 100644 --- a/drivers/i2c/busses/i2c-rcar.c +++ b/drivers/i2c/busses/i2c-rcar.c @@ -1084,11 +1084,11 @@ static u32 rcar_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm rcar_i2c_algo = { - .master_xfer = rcar_i2c_master_xfer, - .master_xfer_atomic = rcar_i2c_master_xfer_atomic, - .functionality = rcar_i2c_func, - .reg_slave = rcar_reg_slave, - .unreg_slave = rcar_unreg_slave, + .xfer = rcar_i2c_master_xfer, + .xfer_atomic = rcar_i2c_master_xfer_atomic, + .functionality = rcar_i2c_func, + .reg_slave = rcar_reg_slave, + .unreg_slave = rcar_unreg_slave, }; static const struct i2c_adapter_quirks rcar_i2c_quirks = { diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c index 0f3cf500df68..f4fa4703acbd 100644 --- a/drivers/i2c/busses/i2c-s3c2410.c +++ b/drivers/i2c/busses/i2c-s3c2410.c @@ -800,9 +800,9 @@ static u32 s3c24xx_i2c_func(struct i2c_adapter *adap) /* i2c bus registration info */ static const struct i2c_algorithm s3c24xx_i2c_algorithm = { - .master_xfer = s3c24xx_i2c_xfer, - .master_xfer_atomic = s3c24xx_i2c_xfer_atomic, - .functionality = s3c24xx_i2c_func, + .xfer = s3c24xx_i2c_xfer, + .xfer_atomic = s3c24xx_i2c_xfer_atomic, + .functionality = s3c24xx_i2c_func, }; /* diff --git a/drivers/i2c/busses/i2c-sh7760.c b/drivers/i2c/busses/i2c-sh7760.c index 620f12596763..43f33988b98f 100644 --- a/drivers/i2c/busses/i2c-sh7760.c +++ b/drivers/i2c/busses/i2c-sh7760.c @@ -379,8 +379,8 @@ static u32 sh7760_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm sh7760_i2c_algo = { - .master_xfer = sh7760_i2c_master_xfer, - .functionality = sh7760_i2c_func, + .xfer = sh7760_i2c_master_xfer, + .functionality = sh7760_i2c_func, }; /* calculate CCR register setting for a desired scl clock. SCL clock is diff --git a/drivers/i2c/busses/i2c-sh_mobile.c b/drivers/i2c/busses/i2c-sh_mobile.c index adfcee6c9fdc..dae8967f8749 100644 --- a/drivers/i2c/busses/i2c-sh_mobile.c +++ b/drivers/i2c/busses/i2c-sh_mobile.c @@ -740,8 +740,8 @@ static u32 sh_mobile_i2c_func(struct i2c_adapter *adapter) static const struct i2c_algorithm sh_mobile_i2c_algorithm = { .functionality = sh_mobile_i2c_func, - .master_xfer = sh_mobile_i2c_xfer, - .master_xfer_atomic = sh_mobile_i2c_xfer_atomic, + .xfer = sh_mobile_i2c_xfer, + .xfer_atomic = sh_mobile_i2c_xfer_atomic, }; static const struct i2c_adapter_quirks sh_mobile_i2c_quirks = { diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 973a3a8c6d4a..e4aaeb2262d0 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -2151,8 +2151,8 @@ static u32 stm32f7_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm stm32f7_i2c_algo = { - .master_xfer = stm32f7_i2c_xfer, - .master_xfer_atomic = stm32f7_i2c_xfer_atomic, + .xfer = stm32f7_i2c_xfer, + .xfer_atomic = stm32f7_i2c_xfer_atomic, .smbus_xfer = stm32f7_i2c_smbus_xfer, .functionality = stm32f7_i2c_func, .reg_slave = stm32f7_i2c_reg_slave, diff --git a/drivers/i2c/busses/i2c-synquacer.c b/drivers/i2c/busses/i2c-synquacer.c index 31f8d08e32a4..1230f51e1624 100644 --- a/drivers/i2c/busses/i2c-synquacer.c +++ b/drivers/i2c/busses/i2c-synquacer.c @@ -520,8 +520,8 @@ static u32 synquacer_i2c_functionality(struct i2c_adapter *adap) } static const struct i2c_algorithm synquacer_i2c_algo = { - .master_xfer = synquacer_i2c_xfer, - .functionality = synquacer_i2c_functionality, + .xfer = synquacer_i2c_xfer, + .functionality = synquacer_i2c_functionality, }; static const struct i2c_adapter synquacer_i2c_ops = { diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index 049b4d154c23..0862b98007f5 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -1440,9 +1440,9 @@ static u32 tegra_i2c_func(struct i2c_adapter *adap) } static const struct i2c_algorithm tegra_i2c_algo = { - .master_xfer = tegra_i2c_xfer, - .master_xfer_atomic = tegra_i2c_xfer_atomic, - .functionality = tegra_i2c_func, + .xfer = tegra_i2c_xfer, + .xfer_atomic = tegra_i2c_xfer_atomic, + .functionality = tegra_i2c_func, }; /* payload size is only 12 bit */ diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c index 6bc1575cea6c..607026c921d6 100644 --- a/drivers/i2c/busses/i2c-xiic.c +++ b/drivers/i2c/busses/i2c-xiic.c @@ -1398,8 +1398,8 @@ static u32 xiic_func(struct i2c_adapter *adap) } static const struct i2c_algorithm xiic_algorithm = { - .master_xfer = xiic_xfer, - .master_xfer_atomic = xiic_xfer_atomic, + .xfer = xiic_xfer, + .xfer_atomic = xiic_xfer_atomic, .functionality = xiic_func, }; diff --git a/drivers/i2c/busses/i2c-xlp9xx.c b/drivers/i2c/busses/i2c-xlp9xx.c index 4d5e49b6321b..ddb1c3e8bc9d 100644 --- a/drivers/i2c/busses/i2c-xlp9xx.c +++ b/drivers/i2c/busses/i2c-xlp9xx.c @@ -452,7 +452,7 @@ static u32 xlp9xx_i2c_functionality(struct i2c_adapter *adapter) } static const struct i2c_algorithm xlp9xx_i2c_algo = { - .master_xfer = xlp9xx_i2c_xfer, + .xfer = xlp9xx_i2c_xfer, .functionality = xlp9xx_i2c_functionality, }; diff --git a/drivers/i2c/i2c-atr.c b/drivers/i2c/i2c-atr.c index be7d6d41e0b2..dd194476b118 100644 --- a/drivers/i2c/i2c-atr.c +++ b/drivers/i2c/i2c-atr.c @@ -738,7 +738,7 @@ struct i2c_atr *i2c_atr_new(struct i2c_adapter *parent, struct device *dev, atr->flags = flags; if (parent->algo->master_xfer) - atr->algo.master_xfer = i2c_atr_master_xfer; + atr->algo.xfer = i2c_atr_master_xfer; if (parent->algo->smbus_xfer) atr->algo.smbus_xfer = i2c_atr_smbus_xfer; atr->algo.functionality = i2c_atr_functionality; diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index fda72e8be885..4d8690981a55 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -293,12 +293,12 @@ int i2c_mux_add_adapter(struct i2c_mux_core *muxc, */ if (parent->algo->master_xfer) { if (muxc->mux_locked) - priv->algo.master_xfer = i2c_mux_master_xfer; + priv->algo.xfer = i2c_mux_master_xfer; else - priv->algo.master_xfer = __i2c_mux_master_xfer; + priv->algo.xfer = __i2c_mux_master_xfer; } if (parent->algo->master_xfer_atomic) - priv->algo.master_xfer_atomic = priv->algo.master_xfer; + priv->algo.xfer_atomic = priv->algo.master_xfer; if (parent->algo->smbus_xfer) { if (muxc->mux_locked) diff --git a/drivers/i2c/muxes/i2c-demux-pinctrl.c b/drivers/i2c/muxes/i2c-demux-pinctrl.c index 77a740561fd7..f2a1f4744978 100644 --- a/drivers/i2c/muxes/i2c-demux-pinctrl.c +++ b/drivers/i2c/muxes/i2c-demux-pinctrl.c @@ -95,9 +95,9 @@ static int i2c_demux_activate_master(struct i2c_demux_pinctrl_priv *priv, u32 ne priv->cur_chan = new_chan; /* Now fill out current adapter structure. cur_chan must be up to date */ - priv->algo.master_xfer = i2c_demux_master_xfer; + priv->algo.xfer = i2c_demux_master_xfer; if (adap->algo->master_xfer_atomic) - priv->algo.master_xfer_atomic = i2c_demux_master_xfer; + priv->algo.xfer_atomic = i2c_demux_master_xfer; priv->algo.functionality = i2c_demux_functionality; snprintf(priv->cur_adap.name, sizeof(priv->cur_adap.name), From e479da4054875c4cc53a7fb956ebff03d2dac939 Mon Sep 17 00:00:00 2001 From: John Keeping Date: Wed, 11 Jun 2025 12:13:06 +0100 Subject: [PATCH 042/284] drm/ssd130x: fix ssd132x_clear_screen() columns The number of columns relates to the width, not the height. Use the correct variable. Signed-off-by: John Keeping Reviewed-by: Javier Martinez Canillas Fixes: fdd591e00a9c ("drm/ssd130x: Add support for the SSD132x OLED controller family") Link: https://lore.kernel.org/r/20250611111307.1814876-1-jkeeping@inmusicbrands.com Signed-off-by: Javier Martinez Canillas --- drivers/gpu/drm/solomon/ssd130x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/solomon/ssd130x.c b/drivers/gpu/drm/solomon/ssd130x.c index dd2006d51c7a..eec43d1a5595 100644 --- a/drivers/gpu/drm/solomon/ssd130x.c +++ b/drivers/gpu/drm/solomon/ssd130x.c @@ -974,7 +974,7 @@ static void ssd130x_clear_screen(struct ssd130x_device *ssd130x, u8 *data_array) static void ssd132x_clear_screen(struct ssd130x_device *ssd130x, u8 *data_array) { - unsigned int columns = DIV_ROUND_UP(ssd130x->height, SSD132X_SEGMENT_WIDTH); + unsigned int columns = DIV_ROUND_UP(ssd130x->width, SSD132X_SEGMENT_WIDTH); unsigned int height = ssd130x->height; memset(data_array, 0, columns * height); From 8ecbad4853f83d9853d7bb0c2d8373afab8851d3 Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Fri, 23 May 2025 08:40:41 +0200 Subject: [PATCH 043/284] drm/arm/malidp: Silence informational message When checking for unsupported expect an error is printed every time. This spams the log for platforms where this is expected, e.g. ls1028a having a Vivante (etnaviv) GPU and Mali display processor. Signed-off-by: Alexander Stein Signed-off-by: Liviu Dudau Link: https://lore.kernel.org/r/20250523064042.3275926-1-alexander.stein@ew.tq-group.com --- drivers/gpu/drm/arm/malidp_planes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/arm/malidp_planes.c b/drivers/gpu/drm/arm/malidp_planes.c index 34547edf1ee3..87f2e5ee8790 100644 --- a/drivers/gpu/drm/arm/malidp_planes.c +++ b/drivers/gpu/drm/arm/malidp_planes.c @@ -159,7 +159,7 @@ bool malidp_format_mod_supported(struct drm_device *drm, } if (!fourcc_mod_is_vendor(modifier, ARM)) { - DRM_ERROR("Unknown modifier (not Arm)\n"); + DRM_DEBUG_KMS("Unknown modifier (not Arm)\n"); return false; } From d1fc7687959b0fbf335815a8d808e6076b969309 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Jun 2025 13:14:26 +0100 Subject: [PATCH 044/284] ASoC: cs35l56: Use SoundWire address as firmware name suffix for new silicon Use the SoundWire link number and device unique ID as the firmware file qualifier suffix on CS35L57, CS35L63 and revisions of CS35L56 after B0. The change in wm_adsp needed to support this has been included in this patch because it is fairly trivial. Originally, the firmware file names indicated which amplifier instance they were for by appending the ALSA prefix string. This is the standard ASoC way of distinguishing different instances of the same device. However, on SoundWire systems the SoundWire physical unique address is available as a unique identifier for each amp, and this address is hardwired by a pin on the amp. The firmware files are specific for each physical amp so they must be applied to that amp. Using the ALSA prefix for the filename qualifier means that to name a firmware file it must be determined what prefix string the machine driver will assign to each device and then use that to name the firmware file correctly. This is straightforward in traditional ASoC systems where the machine driver is specific to a particular piece of hardware. But on SoundWire the machine driver is generic and can handle a very wide range of hardware. It is more difficult to determine exactly what the prefix will be on any particular production device, and more prone to mistakes. Also, when the machine driver switches to generating this automatically from SDCA properties in ACPI, there is an additional layer of complexity in determining the mapping. This uncertainty is unnecessary because the firmware is built for a specific amp. with known address, so we can use that directly instead of introducing the redundant intermediate alias. This ensures the firmware is applied to the amp it was intended for. There have not been any firmwares published for CS35L57 or CS35L63, so these can safely be switched to using the SoundWire unique address as the suffix string. Also note that the machine driver in older kernel version only has match entries for the CS35L56 Soundwire identity so any future product with a cs35L57 or CS35L63 would require a new kernel anyway. There are already many published firmware for CS35L56 B0 silicon so this keeps the original naming scheme on those, to preserve backward compatibility. Note that although sdw_slave.id contains a unique_id field, this cannot be trusted because the SoundWire core code also puts magic values into it that it uses as a flag. So the unique ID is read from the chip register. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20250612121428.1667-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56-sdw.c | 18 ++++++++++-------- sound/soc/codecs/cs35l56.c | 29 +++++++++++++++++++++++++++++ sound/soc/codecs/cs35l56.h | 2 ++ sound/soc/codecs/wm_adsp.c | 21 ++++++++++++--------- sound/soc/codecs/wm_adsp.h | 1 + 5 files changed, 54 insertions(+), 17 deletions(-) diff --git a/sound/soc/codecs/cs35l56-sdw.c b/sound/soc/codecs/cs35l56-sdw.c index 13f602f51bf3..fa9693af3722 100644 --- a/sound/soc/codecs/cs35l56-sdw.c +++ b/sound/soc/codecs/cs35l56-sdw.c @@ -238,16 +238,15 @@ static const struct regmap_bus cs35l56_regmap_bus_sdw = { .val_format_endian_default = REGMAP_ENDIAN_BIG, }; -static int cs35l56_sdw_set_cal_index(struct cs35l56_private *cs35l56) +static int cs35l56_sdw_get_unique_id(struct cs35l56_private *cs35l56) { int ret; - /* SoundWire UniqueId is used to index the calibration array */ ret = sdw_read_no_pm(cs35l56->sdw_peripheral, SDW_SCP_DEVID_0); if (ret < 0) return ret; - cs35l56->base.cal_index = ret & 0xf; + cs35l56->sdw_unique_id = ret & 0xf; return 0; } @@ -259,11 +258,13 @@ static void cs35l56_sdw_init(struct sdw_slave *peripheral) pm_runtime_get_noresume(cs35l56->base.dev); - if (cs35l56->base.cal_index < 0) { - ret = cs35l56_sdw_set_cal_index(cs35l56); - if (ret < 0) - goto out; - } + ret = cs35l56_sdw_get_unique_id(cs35l56); + if (ret) + goto out; + + /* SoundWire UniqueId is used to index the calibration array */ + if (cs35l56->base.cal_index < 0) + cs35l56->base.cal_index = cs35l56->sdw_unique_id; ret = cs35l56_init(cs35l56); if (ret < 0) { @@ -587,6 +588,7 @@ static int cs35l56_sdw_probe(struct sdw_slave *peripheral, const struct sdw_devi cs35l56->base.dev = dev; cs35l56->sdw_peripheral = peripheral; + cs35l56->sdw_link_num = peripheral->bus->link_id; INIT_WORK(&cs35l56->sdw_irq_work, cs35l56_sdw_irq_work); dev_set_drvdata(dev, cs35l56); diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c index c78e4746e428..6e6120c39965 100644 --- a/sound/soc/codecs/cs35l56.c +++ b/sound/soc/codecs/cs35l56.c @@ -853,6 +853,31 @@ static void cs35l56_dsp_work(struct work_struct *work) pm_runtime_put_autosuspend(cs35l56->base.dev); } +static int cs35l56_set_fw_suffix(struct cs35l56_private *cs35l56) +{ + if (cs35l56->dsp.fwf_suffix) + return 0; + + if (!cs35l56->sdw_peripheral) + return 0; + + /* + * There are published firmware files for L56 B0 silicon using + * the default wm_adsp name suffixing so don't change those. + */ + if ((cs35l56->base.type == 0x56) && (cs35l56->base.rev == 0xb0)) + return 0; + + cs35l56->dsp.fwf_suffix = devm_kasprintf(cs35l56->base.dev, GFP_KERNEL, + "l%uu%u", + cs35l56->sdw_link_num, + cs35l56->sdw_unique_id); + if (!cs35l56->dsp.fwf_suffix) + return -ENOMEM; + + return 0; +} + static int cs35l56_component_probe(struct snd_soc_component *component) { struct cs35l56_private *cs35l56 = snd_soc_component_get_drvdata(component); @@ -891,6 +916,10 @@ static int cs35l56_component_probe(struct snd_soc_component *component) if (!cs35l56->dsp.part) return -ENOMEM; + ret = cs35l56_set_fw_suffix(cs35l56); + if (ret) + return ret; + cs35l56->component = component; wm_adsp2_component_probe(&cs35l56->dsp, component); diff --git a/sound/soc/codecs/cs35l56.h b/sound/soc/codecs/cs35l56.h index 200f695efca3..a84c83eb2d7c 100644 --- a/sound/soc/codecs/cs35l56.h +++ b/sound/soc/codecs/cs35l56.h @@ -52,6 +52,8 @@ struct cs35l56_private { bool tdm_mode; bool sysclk_set; u8 old_sdw_clock_scale; + u8 sdw_link_num; + u8 sdw_unique_id; }; extern const struct dev_pm_ops cs35l56_pm_ops_i2c_spi; diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 3c580faab3b7..13db9c6650d5 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -783,16 +783,19 @@ static int wm_adsp_request_firmware_files(struct wm_adsp *dsp, char **coeff_filename) { const char *system_name = dsp->system_name; - const char *asoc_component_prefix = dsp->component->name_prefix; + const char *suffix = dsp->component->name_prefix; int ret = 0; - if (system_name && asoc_component_prefix) { + if (dsp->fwf_suffix) + suffix = dsp->fwf_suffix; + + if (system_name && suffix) { if (!wm_adsp_request_firmware_file(dsp, wmfw_firmware, wmfw_filename, cirrus_dir, system_name, - asoc_component_prefix, "wmfw")) { + suffix, "wmfw")) { wm_adsp_request_firmware_file(dsp, coeff_firmware, coeff_filename, cirrus_dir, system_name, - asoc_component_prefix, "bin"); + suffix, "bin"); return 0; } } @@ -801,10 +804,10 @@ static int wm_adsp_request_firmware_files(struct wm_adsp *dsp, if (!wm_adsp_request_firmware_file(dsp, wmfw_firmware, wmfw_filename, cirrus_dir, system_name, NULL, "wmfw")) { - if (asoc_component_prefix) + if (suffix) wm_adsp_request_firmware_file(dsp, coeff_firmware, coeff_filename, cirrus_dir, system_name, - asoc_component_prefix, "bin"); + suffix, "bin"); if (!*coeff_firmware) wm_adsp_request_firmware_file(dsp, coeff_firmware, coeff_filename, @@ -816,10 +819,10 @@ static int wm_adsp_request_firmware_files(struct wm_adsp *dsp, /* Check system-specific bin without wmfw before falling back to generic */ if (dsp->wmfw_optional && system_name) { - if (asoc_component_prefix) + if (suffix) wm_adsp_request_firmware_file(dsp, coeff_firmware, coeff_filename, cirrus_dir, system_name, - asoc_component_prefix, "bin"); + suffix, "bin"); if (!*coeff_firmware) wm_adsp_request_firmware_file(dsp, coeff_firmware, coeff_filename, @@ -850,7 +853,7 @@ static int wm_adsp_request_firmware_files(struct wm_adsp *dsp, adsp_err(dsp, "Failed to request firmware <%s>%s-%s-%s<-%s<%s>>.wmfw\n", cirrus_dir, dsp->part, dsp->fwf_name ? dsp->fwf_name : dsp->cs_dsp.name, - wm_adsp_fw[dsp->fw].file, system_name, asoc_component_prefix); + wm_adsp_fw[dsp->fw].file, system_name, suffix); return -ENOENT; } diff --git a/sound/soc/codecs/wm_adsp.h b/sound/soc/codecs/wm_adsp.h index edc5b02ae765..075ea0bd06eb 100644 --- a/sound/soc/codecs/wm_adsp.h +++ b/sound/soc/codecs/wm_adsp.h @@ -29,6 +29,7 @@ struct wm_adsp { const char *part; const char *fwf_name; const char *system_name; + const char *fwf_suffix; struct snd_soc_component *component; unsigned int sys_config_size; From e5d5b3aebdc8acf9f52d1369a7744a2ab9ca591c Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Jun 2025 13:14:27 +0100 Subject: [PATCH 045/284] ASoC: cs35l56: Use SoundWire address as alternate firmware suffix on L56 B0 Use the SoundWire link number and device unique ID as the firmware file qualifier suffix on CS35L56 B0 if .bin files are not found with the older suffix. Some changes in wm_adsp needed to support this have been included in this patch because they are trivial. The allows future products with CS35L56 B0 silicon to use the same firmware file naming as CS35L57 and cs35L63, while retaining backward compatibility for firmware that has already been published with the old naming scheme. The old suffix is searched first, partly because there are already many files using that naming scheme, but also because they are a smaller subset of all the possible fallback name options offered by wm_adsp so we know that it will either find the qualified files or fail. All the firmware files already published have the wmfw qualified with only the ACPI SSID and the bin files qualified with both SSID and the suffix. Originally, the firmware file names indicated which amplifier instance they were for by appending the ALSA prefix string. This is the standard ASoC way of distinguishing different instances of the same device. However, on SoundWire systems the SoundWire physical unique address is available as a unique identifier for each amp, and this address is hardwired by the address pin on the amp. The firmware files are specific for each physical amp so they must be applied to that amp. Using the ALSA prefix for the filename qualifier means that to name a firmware file it must be determined what prefix string the machine driver will assign to each device and then use that to name the firmware file correctly. This is straightforward in traditional ASoC systems where the machine driver is specific to a particular piece of hardware. But on SoundWire the machine driver is generic and can handle a very wide range of hardware. It is more difficult to determine exactly what the prefix will be on any particular production device, and more prone to mistakes. Also, when the machine driver switches to generating this automatically from SDCA properties in ACPI, there is an additional layer of complexity in determining the mapping. This uncertainty is unnecessary because the firmware is built for a specific amp. with known address, so we can use that directly instead of introducing a redundant intermediate alias. This ensures the firmware is applied to the amp it was intended for. There are already many published firmware for CS35L56 B0 silicon so this first looks for the original name suffix, to keep backward compatibility. If this doesn't find .bin files it will switch to using the new name suffix so that future products using CS35L56 B0 can start to use the new suffix. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20250612121428.1667-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- sound/soc/codecs/cs35l56.c | 59 +++++++++++++++++++++++++++----------- sound/soc/codecs/cs35l56.h | 1 + sound/soc/codecs/wm_adsp.c | 6 ++++ sound/soc/codecs/wm_adsp.h | 1 + 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/sound/soc/codecs/cs35l56.c b/sound/soc/codecs/cs35l56.c index 6e6120c39965..1b42586794ad 100644 --- a/sound/soc/codecs/cs35l56.c +++ b/sound/soc/codecs/cs35l56.c @@ -706,16 +706,40 @@ static int cs35l56_write_cal(struct cs35l56_private *cs35l56) return ret; } +static int cs35l56_dsp_download_and_power_up(struct cs35l56_private *cs35l56, + bool load_firmware) +{ + int ret; + + /* + * Abort the first load if it didn't find the suffixed bins and + * we have an alternate fallback suffix. + */ + cs35l56->dsp.bin_mandatory = (load_firmware && cs35l56->fallback_fw_suffix); + + ret = wm_adsp_power_up(&cs35l56->dsp, load_firmware); + if ((ret == -ENOENT) && cs35l56->dsp.bin_mandatory) { + cs35l56->dsp.fwf_suffix = cs35l56->fallback_fw_suffix; + cs35l56->fallback_fw_suffix = NULL; + cs35l56->dsp.bin_mandatory = false; + ret = wm_adsp_power_up(&cs35l56->dsp, load_firmware); + } + + if (ret) { + dev_dbg(cs35l56->base.dev, "wm_adsp_power_up ret %d\n", ret); + return ret; + } + + return 0; +} + static void cs35l56_reinit_patch(struct cs35l56_private *cs35l56) { int ret; - /* Use wm_adsp to load and apply the firmware patch and coefficient files */ - ret = wm_adsp_power_up(&cs35l56->dsp, true); - if (ret) { - dev_dbg(cs35l56->base.dev, "%s: wm_adsp_power_up ret %d\n", __func__, ret); + ret = cs35l56_dsp_download_and_power_up(cs35l56, true); + if (ret) return; - } cs35l56_write_cal(cs35l56); @@ -750,11 +774,9 @@ static void cs35l56_patch(struct cs35l56_private *cs35l56, bool firmware_missing * but only if firmware is missing. If firmware is already patched just * power-up wm_adsp without downloading firmware. */ - ret = wm_adsp_power_up(&cs35l56->dsp, !!firmware_missing); - if (ret) { - dev_dbg(cs35l56->base.dev, "%s: wm_adsp_power_up ret %d\n", __func__, ret); + ret = cs35l56_dsp_download_and_power_up(cs35l56, firmware_missing); + if (ret) goto err; - } mutex_lock(&cs35l56->base.irq_lock); @@ -861,13 +883,6 @@ static int cs35l56_set_fw_suffix(struct cs35l56_private *cs35l56) if (!cs35l56->sdw_peripheral) return 0; - /* - * There are published firmware files for L56 B0 silicon using - * the default wm_adsp name suffixing so don't change those. - */ - if ((cs35l56->base.type == 0x56) && (cs35l56->base.rev == 0xb0)) - return 0; - cs35l56->dsp.fwf_suffix = devm_kasprintf(cs35l56->base.dev, GFP_KERNEL, "l%uu%u", cs35l56->sdw_link_num, @@ -875,6 +890,16 @@ static int cs35l56_set_fw_suffix(struct cs35l56_private *cs35l56) if (!cs35l56->dsp.fwf_suffix) return -ENOMEM; + /* + * There are published firmware files for L56 B0 silicon using + * the ALSA prefix as the filename suffix. Default to trying these + * first, with the new name as an alternate. + */ + if ((cs35l56->base.type == 0x56) && (cs35l56->base.rev == 0xb0)) { + cs35l56->fallback_fw_suffix = cs35l56->dsp.fwf_suffix; + cs35l56->dsp.fwf_suffix = cs35l56->component->name_prefix; + } + return 0; } @@ -916,11 +941,11 @@ static int cs35l56_component_probe(struct snd_soc_component *component) if (!cs35l56->dsp.part) return -ENOMEM; + cs35l56->component = component; ret = cs35l56_set_fw_suffix(cs35l56); if (ret) return ret; - cs35l56->component = component; wm_adsp2_component_probe(&cs35l56->dsp, component); debugfs_create_bool("init_done", 0444, debugfs_root, &cs35l56->base.init_done); diff --git a/sound/soc/codecs/cs35l56.h b/sound/soc/codecs/cs35l56.h index a84c83eb2d7c..bd77a57249d7 100644 --- a/sound/soc/codecs/cs35l56.h +++ b/sound/soc/codecs/cs35l56.h @@ -38,6 +38,7 @@ struct cs35l56_private { struct snd_soc_component *component; struct regulator_bulk_data supplies[CS35L56_NUM_BULK_SUPPLIES]; struct sdw_slave *sdw_peripheral; + const char *fallback_fw_suffix; struct work_struct sdw_irq_work; bool sdw_irq_no_unmask; bool soft_resetting; diff --git a/sound/soc/codecs/wm_adsp.c b/sound/soc/codecs/wm_adsp.c index 13db9c6650d5..8a1d5cc75d6c 100644 --- a/sound/soc/codecs/wm_adsp.c +++ b/sound/soc/codecs/wm_adsp.c @@ -1000,11 +1000,17 @@ int wm_adsp_power_up(struct wm_adsp *dsp, bool load_firmware) return ret; } + if (dsp->bin_mandatory && !coeff_firmware) { + ret = -ENOENT; + goto err; + } + ret = cs_dsp_power_up(&dsp->cs_dsp, wmfw_firmware, wmfw_filename, coeff_firmware, coeff_filename, wm_adsp_fw_text[dsp->fw]); +err: wm_adsp_release_firmware_files(dsp, wmfw_firmware, wmfw_filename, coeff_firmware, coeff_filename); diff --git a/sound/soc/codecs/wm_adsp.h b/sound/soc/codecs/wm_adsp.h index 075ea0bd06eb..25210d404bf1 100644 --- a/sound/soc/codecs/wm_adsp.h +++ b/sound/soc/codecs/wm_adsp.h @@ -36,6 +36,7 @@ struct wm_adsp { int fw; bool wmfw_optional; + bool bin_mandatory; struct work_struct boot_work; int (*control_add)(struct wm_adsp *dsp, struct cs_dsp_coeff_ctl *cs_ctl); From fa8fae5f82e48db1a06ba570a2a3fdc087fc93c0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 12 Jun 2025 13:14:28 +0100 Subject: [PATCH 046/284] ASoC: doc: cs35l56: Update to add new SoundWire firmware filename suffix Add the new firmware filename suffix used for SoundWire systems with CS35L57, CS35L63 or CS35L56 later than B0 silicon. This uses the SoundWire physical address of the amp to identify which firmware file to load on that amp. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20250612121428.1667-4-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- Documentation/sound/codecs/cs35l56.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Documentation/sound/codecs/cs35l56.rst b/Documentation/sound/codecs/cs35l56.rst index 98c6f6c74394..7db9f93d9be5 100644 --- a/Documentation/sound/codecs/cs35l56.rst +++ b/Documentation/sound/codecs/cs35l56.rst @@ -104,6 +104,13 @@ In this example the SSID is 10280c63. The format of the firmware file names is: +SoundWire (except CS35L56 Rev B0): + cs35lxx-b0-dsp1-misc-SSID[-spkidX]-l?u? + +SoundWire CS35L56 Rev B0: + cs35lxx-b0-dsp1-misc-SSID[-spkidX]-ampN + +Non-SoundWire (HDA and I2S): cs35lxx-b0-dsp1-misc-SSID[-spkidX]-ampN Where: @@ -111,12 +118,18 @@ Where: * cs35lxx-b0 is the amplifier model and silicon revision. This information is logged by the driver during initialization. * SSID is the 8-digit hexadecimal SSID value. + * l?u? is the physical address on the SoundWire bus of the amp this + file applies to. * ampN is the amplifier number (for example amp1). This is the same as the prefix on the ALSA control names except that it is always lower-case in the file name. * spkidX is an optional part, used for laptops that have firmware configurations for different makes and models of internal speakers. +The CS35L56 Rev B0 continues to use the old filename scheme because a +large number of firmware files have already been published with these +names. + Sound Open Firmware and ALSA topology files ------------------------------------------- From ba06528ad5a31923efc24324706116ccd17e12d8 Mon Sep 17 00:00:00 2001 From: Gabriel Santese Date: Fri, 30 May 2025 02:52:32 +0200 Subject: [PATCH 047/284] ASoC: amd: yc: Add quirk for MSI Bravo 17 D7VF internal mic MSI Bravo 17 (D7VF), like other laptops from the family, has broken ACPI tables and needs a quirk for internal mic to work properly. Signed-off-by: Gabriel Santese Link: https://patch.msgid.link/20250530005444.23398-1-santesegabriel@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index 7e62445e02c1..c9387d6fbf18 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -451,6 +451,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Bravo 17 D7VEK"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Micro-Star International Co., Ltd."), + DMI_MATCH(DMI_PRODUCT_NAME, "Bravo 17 D7VF"), + } + }, { .driver_data = &acp6x_card, .matches = { From c0c7fa4e7a512006710c8e4d6b6f7b40c9f786cd Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Tue, 10 Jun 2025 14:09:35 +0200 Subject: [PATCH 048/284] docs: arm64: Fix ICC_SRE_EL2 register typo in booting.rst Fix trivial ICC_SRE_EL2 register spelling typo in booting.rst. Signed-off-by: Lorenzo Pieralisi Cc: Jonathan Corbet Cc: Will Deacon CC: Catalin Marinas Link: https://lore.kernel.org/r/20250610120935.852034-1-lpieralisi@kernel.org Signed-off-by: Will Deacon --- Documentation/arch/arm64/booting.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arch/arm64/booting.rst b/Documentation/arch/arm64/booting.rst index dee7b6de864f..ee9b790c0d72 100644 --- a/Documentation/arch/arm64/booting.rst +++ b/Documentation/arch/arm64/booting.rst @@ -234,7 +234,7 @@ Before jumping into the kernel, the following conditions must be met: - If the kernel is entered at EL1: - - ICC.SRE_EL2.Enable (bit 3) must be initialised to 0b1 + - ICC_SRE_EL2.Enable (bit 3) must be initialised to 0b1 - ICC_SRE_EL2.SRE (bit 0) must be initialised to 0b1. - The DT or ACPI tables must describe a GICv3 interrupt controller. From 650768c512faba8070bf4cfbb28c95eb5cd203f3 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 27 May 2025 13:56:33 +0530 Subject: [PATCH 049/284] arm64: Restrict pagetable teardown to avoid false warning Commit 9c006972c3fe ("arm64: mmu: drop pXd_present() checks from pXd_free_pYd_table()") removes the pxd_present() checks because the caller checks pxd_present(). But, in case of vmap_try_huge_pud(), the caller only checks pud_present(); pud_free_pmd_page() recurses on each pmd through pmd_free_pte_page(), wherein the pmd may be none. Thus it is possible to hit a warning in the latter, since pmd_none => !pmd_table(). Thus, add a pmd_present() check in pud_free_pmd_page(). This problem was found by code inspection. Fixes: 9c006972c3fe ("arm64: mmu: drop pXd_present() checks from pXd_free_pYd_table()") Cc: stable@vger.kernel.org Reported-by: Ryan Roberts Acked-by: David Hildenbrand Signed-off-by: Dev Jain Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20250527082633.61073-1-dev.jain@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8fcf59ba39db..00ab1d648db6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1305,7 +1305,8 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) next = addr; end = addr + PUD_SIZE; do { - pmd_free_pte_page(pmdp, next); + if (pmd_present(pmdp_get(pmdp))) + pmd_free_pte_page(pmdp, next); } while (pmdp++, next += PMD_SIZE, next != end); pud_clear(pudp); From d2be3270f40b1330f8c1c9512c59dd085a758ac0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 11 Jun 2025 17:28:13 +0100 Subject: [PATCH 050/284] arm64/gcs: Don't call gcs_free() during flush_gcs() Currently we call gcs_free() during flush_gcs() to reset the thread state for GCS. This includes unmapping any kernel allocated GCS, but this is redundant when doing a flush_thread() since we are reinitialising the thread memory too. Inline the reinitialisation of the thread struct. Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20250611-arm64-gcs-flush-thread-v1-1-cc26feeddabd@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a5ca15daeb8a..5954cec19660 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -288,7 +288,9 @@ static void flush_gcs(void) if (!system_supports_gcs()) return; - gcs_free(current); + current->thread.gcspr_el0 = 0; + current->thread.gcs_base = 0; + current->thread.gcs_size = 0; current->thread.gcs_el0_mode = 0; write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); write_sysreg_s(0, SYS_GCSPR_EL0); From 39dfc971e42d886e7df01371cd1bef505076d84c Mon Sep 17 00:00:00 2001 From: Tengda Wu Date: Wed, 4 Jun 2025 00:55:33 +0000 Subject: [PATCH 051/284] arm64/ptrace: Fix stack-out-of-bounds read in regs_get_kernel_stack_nth() KASAN reports a stack-out-of-bounds read in regs_get_kernel_stack_nth(). Call Trace: [ 97.283505] BUG: KASAN: stack-out-of-bounds in regs_get_kernel_stack_nth+0xa8/0xc8 [ 97.284677] Read of size 8 at addr ffff800089277c10 by task 1.sh/2550 [ 97.285732] [ 97.286067] CPU: 7 PID: 2550 Comm: 1.sh Not tainted 6.6.0+ #11 [ 97.287032] Hardware name: linux,dummy-virt (DT) [ 97.287815] Call trace: [ 97.288279] dump_backtrace+0xa0/0x128 [ 97.288946] show_stack+0x20/0x38 [ 97.289551] dump_stack_lvl+0x78/0xc8 [ 97.290203] print_address_description.constprop.0+0x84/0x3c8 [ 97.291159] print_report+0xb0/0x280 [ 97.291792] kasan_report+0x84/0xd0 [ 97.292421] __asan_load8+0x9c/0xc0 [ 97.293042] regs_get_kernel_stack_nth+0xa8/0xc8 [ 97.293835] process_fetch_insn+0x770/0xa30 [ 97.294562] kprobe_trace_func+0x254/0x3b0 [ 97.295271] kprobe_dispatcher+0x98/0xe0 [ 97.295955] kprobe_breakpoint_handler+0x1b0/0x210 [ 97.296774] call_break_hook+0xc4/0x100 [ 97.297451] brk_handler+0x24/0x78 [ 97.298073] do_debug_exception+0xac/0x178 [ 97.298785] el1_dbg+0x70/0x90 [ 97.299344] el1h_64_sync_handler+0xcc/0xe8 [ 97.300066] el1h_64_sync+0x78/0x80 [ 97.300699] kernel_clone+0x0/0x500 [ 97.301331] __arm64_sys_clone+0x70/0x90 [ 97.302084] invoke_syscall+0x68/0x198 [ 97.302746] el0_svc_common.constprop.0+0x11c/0x150 [ 97.303569] do_el0_svc+0x38/0x50 [ 97.304164] el0_svc+0x44/0x1d8 [ 97.304749] el0t_64_sync_handler+0x100/0x130 [ 97.305500] el0t_64_sync+0x188/0x190 [ 97.306151] [ 97.306475] The buggy address belongs to stack of task 1.sh/2550 [ 97.307461] and is located at offset 0 in frame: [ 97.308257] __se_sys_clone+0x0/0x138 [ 97.308910] [ 97.309241] This frame has 1 object: [ 97.309873] [48, 184) 'args' [ 97.309876] [ 97.310749] The buggy address belongs to the virtual mapping at [ 97.310749] [ffff800089270000, ffff800089279000) created by: [ 97.310749] dup_task_struct+0xc0/0x2e8 [ 97.313347] [ 97.313674] The buggy address belongs to the physical page: [ 97.314604] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14f69a [ 97.315885] flags: 0x15ffffe00000000(node=1|zone=2|lastcpupid=0xfffff) [ 97.316957] raw: 015ffffe00000000 0000000000000000 dead000000000122 0000000000000000 [ 97.318207] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 [ 97.319445] page dumped because: kasan: bad access detected [ 97.320371] [ 97.320694] Memory state around the buggy address: [ 97.321511] ffff800089277b00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 97.322681] ffff800089277b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 97.323846] >ffff800089277c00: 00 00 f1 f1 f1 f1 f1 f1 00 00 00 00 00 00 00 00 [ 97.325023] ^ [ 97.325683] ffff800089277c80: 00 00 00 00 00 00 00 00 00 f3 f3 f3 f3 f3 f3 f3 [ 97.326856] ffff800089277d00: f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00 This issue seems to be related to the behavior of some gcc compilers and was also fixed on the s390 architecture before: commit d93a855c31b7 ("s390/ptrace: Avoid KASAN false positives in regs_get_kernel_stack_nth()") As described in that commit, regs_get_kernel_stack_nth() has confirmed that `addr` is on the stack, so reading the value at `*addr` should be allowed. Use READ_ONCE_NOCHECK() helper to silence the KASAN check for this case. Fixes: 0a8ea52c3eb1 ("arm64: Add HAVE_REGS_AND_STACK_ACCESS_API feature") Signed-off-by: Tengda Wu Link: https://lore.kernel.org/r/20250604005533.1278992-1-wutengda@huaweicloud.com [will: Use '*addr' as the argument to READ_ONCE_NOCHECK()] Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index a360e52db02f..ee94b72bf8fb 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -141,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; + return READ_ONCE_NOCHECK(*addr); else return 0; } From 32ce6b3a83b71d8abf0c0837dc78775f16c9902f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 8 Jun 2025 18:08:51 -0400 Subject: [PATCH 052/284] NFSD: Avoid corruption of a referring call list The new code neglects to remove a freshly-allocated RCL from the callback's referring call list when no matching referring call is found. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202505171002.cE46sdj5-lkp@intel.com/ Fixes: 4f3c8d8c9e10 ("NFSD: Implement CB_SEQUENCE referring call lists") Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4callback.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ccb00aa93be0..e00b2aea8da2 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1409,6 +1409,7 @@ void nfsd41_cb_referring_call(struct nfsd4_callback *cb, out: if (!rcl->__nr_referring_calls) { cb->cb_nr_referring_call_list--; + list_del(&rcl->__list); kfree(rcl); } } From 8b3ac9fabaa825b7bae850ee7b4580c5cba32699 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 9 Jun 2025 13:21:56 -0400 Subject: [PATCH 053/284] SUNRPC: Cleanup/fix initial rq_pages allocation While investigating some reports of memory-constrained NUMA machines failing to mount v3 and v4.0 nfs mounts, we found that svc_init_buffer() was not attempting to retry allocations from the bulk page allocator. Typically, this results in a single page allocation being returned and the mount attempt fails with -ENOMEM. A retry would have allowed the mount to succeed. Additionally, it seems that the bulk allocation in svc_init_buffer() is redundant because svc_alloc_arg() will perform the required allocation and does the correct thing to retry the allocations. The call to allocate memory in svc_alloc_arg() drops the preferred node argument, but I expect we'll still allocate on the preferred node because the allocation call happens within the svc thread context, which chooses the node with memory closest to the current thread's execution. This patch cleans out the bulk allocation in svc_init_buffer() to allow svc_alloc_arg() to handle the allocation/retry logic for rq_pages. Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Fixes: ed603bcf4fea ("sunrpc: Replace the rq_pages array with dynamically-allocated memory") Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 939b6239df8a..ef8a05aac87f 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -638,8 +638,6 @@ EXPORT_SYMBOL_GPL(svc_destroy); static bool svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { - unsigned long ret; - rqstp->rq_maxpages = svc_serv_maxpages(serv); /* rq_pages' last entry is NULL for historical reasons. */ @@ -649,9 +647,7 @@ svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) if (!rqstp->rq_pages) return false; - ret = alloc_pages_bulk_node(GFP_KERNEL, node, rqstp->rq_maxpages, - rqstp->rq_pages); - return ret == rqstp->rq_maxpages; + return true; } /* From d742f3ec1cb91c4dd86b981f55cd291e07f03094 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 12 Jun 2025 10:42:55 +0200 Subject: [PATCH 054/284] drm/ast: Do not include Fix the compile-time warning drivers/gpu/drm/ast/ast_mode.c: warning: EXPORT_SYMBOL() is not used, but #include is present Signed-off-by: Thomas Zimmermann Reviewed-by: Jocelyn Falempe Link: https://lore.kernel.org/r/20250612084257.200907-1-tzimmermann@suse.de --- drivers/gpu/drm/ast/ast_mode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c index 1de832964e92..031980d8f3ab 100644 --- a/drivers/gpu/drm/ast/ast_mode.c +++ b/drivers/gpu/drm/ast/ast_mode.c @@ -29,7 +29,6 @@ */ #include -#include #include #include From 2e3395ab2a358ba8d1c80d8df35dcfb882cfc28e Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Thu, 12 Jun 2025 10:53:02 +0200 Subject: [PATCH 055/284] drm/mgag200: Do not include Fix the compile-time warning drivers/gpu/drm/mgag200/mgag200_ddc.c: warning: EXPORT_SYMBOL() is not used, but #include is present Signed-off-by: Thomas Zimmermann Reviewed-by: Jocelyn Falempe Link: https://lore.kernel.org/r/20250612085308.203861-1-tzimmermann@suse.de --- drivers/gpu/drm/mgag200/mgag200_ddc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/mgag200/mgag200_ddc.c b/drivers/gpu/drm/mgag200/mgag200_ddc.c index 6d81ea8931e8..c31673eaa554 100644 --- a/drivers/gpu/drm/mgag200/mgag200_ddc.c +++ b/drivers/gpu/drm/mgag200/mgag200_ddc.c @@ -26,7 +26,6 @@ * Authors: Dave Airlie */ -#include #include #include #include From b0823d5fbacb1c551d793cbfe7af24e0d1fa45ed Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 12 Jun 2025 07:38:18 -0700 Subject: [PATCH 056/284] perf/x86/intel: Fix crash in icl_update_topdown_event() The perf_fuzzer found a hard-lockup crash on a RaptorLake machine: Oops: general protection fault, maybe for address 0xffff89aeceab400: 0000 CPU: 23 UID: 0 PID: 0 Comm: swapper/23 Tainted: [W]=WARN Hardware name: Dell Inc. Precision 9660/0VJ762 RIP: 0010:native_read_pmc+0x7/0x40 Code: cc e8 8d a9 01 00 48 89 03 5b cd cc cc cc cc 0f 1f ... RSP: 000:fffb03100273de8 EFLAGS: 00010046 .... Call Trace: icl_update_topdown_event+0x165/0x190 ? ktime_get+0x38/0xd0 intel_pmu_read_event+0xf9/0x210 __perf_event_read+0xf9/0x210 CPUs 16-23 are E-core CPUs that don't support the perf metrics feature. The icl_update_topdown_event() should not be invoked on these CPUs. It's a regression of commit: f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read") The bug introduced by that commit is that the is_topdown_event() function is mistakenly used to replace the is_topdown_count() call to check if the topdown functions for the perf metrics feature should be invoked. Fix it. Fixes: f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read") Closes: https://lore.kernel.org/lkml/352f0709-f026-cd45-e60c-60dfd97f73f3@maine.edu/ Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Vince Weaver Cc: stable@vger.kernel.org # v6.15+ Link: https://lore.kernel.org/r/20250612143818.2889040-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 741b229f0718..c2fb729c270e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event) * If the PEBS counters snapshotting is enabled, * the topdown event is available in PEBS records. */ - if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + if (is_topdown_count(event) && !is_pebs_counter_event_group(event)) static_call(intel_pmu_update_topdown_event)(event, NULL); else intel_pmu_drain_pebs_buffer(); From 93adf20ff4d6e865e0b974110d3cf2f07c057177 Mon Sep 17 00:00:00 2001 From: wangdicheng Date: Fri, 13 Jun 2025 14:36:36 +0800 Subject: [PATCH 057/284] ALSA: usb-audio: Rename ALSA kcontrol PCM and PCM1 for the KTMicro sound card PCM1 not in Pulseaudio's control list; standardize control to "Speaker" and "Headphone". Signed-off-by: wangdicheng Cc: Link: https://patch.msgid.link/20250613063636.239683-1-wangdich9700@163.com Signed-off-by: Takashi Iwai --- sound/usb/mixer_maps.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sound/usb/mixer_maps.c b/sound/usb/mixer_maps.c index 0e9b5431a47f..faac7df1fbcf 100644 --- a/sound/usb/mixer_maps.c +++ b/sound/usb/mixer_maps.c @@ -383,6 +383,13 @@ static const struct usbmix_name_map ms_usb_link_map[] = { { 0 } /* terminator */ }; +/* KTMicro USB */ +static struct usbmix_name_map s31b2_0022_map[] = { + { 23, "Speaker Playback" }, + { 18, "Headphone Playback" }, + { 0 } +}; + /* ASUS ROG Zenith II with Realtek ALC1220-VB */ static const struct usbmix_name_map asus_zenith_ii_map[] = { { 19, NULL, 12 }, /* FU, Input Gain Pad - broken response, disabled */ @@ -692,6 +699,11 @@ static const struct usbmix_ctl_map usbmix_ctl_maps[] = { .id = USB_ID(0x045e, 0x083c), .map = ms_usb_link_map, }, + { + /* KTMicro USB */ + .id = USB_ID(0X31b2, 0x0022), + .map = s31b2_0022_map, + }, { 0 } /* terminator */ }; From ac90aad0e9bf7c37e706fdc08ce763a553890bdf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Jun 2025 10:47:09 -0700 Subject: [PATCH 058/284] crypto: testmgr - reinstate kconfig control over full self-tests Commit 698de822780f ("crypto: testmgr - make it easier to enable the full set of tests") removed support for building kernels that run only the "fast" set of crypto self-tests by default. This assumed that nearly everyone actually wanted the full set of tests, *if* they had already chosen to enable the tests at all. Unfortunately, it turns out that both Debian and Fedora intentionally have the crypto self-tests enabled in their production kernels. And for production kernels we do need to keep the testing time down, which implies just running the "fast" tests, not the full set of tests. For Fedora, a reason for enabling the tests in production is that they are being (mis)used to meet the FIPS 140-3 pre-operational testing requirement. However, the other reason for enabling the tests in production, which applies to both distros, is that they provide some value in protecting users from buggy drivers. Unfortunately, the crypto/ subsystem has many buggy and untested drivers for off-CPU hardware accelerators on rare platforms. These broken drivers get shipped to users, and there have been multiple examples of the tests preventing these buggy drivers from being used. So effectively, the tests are being relied on in production kernels. I think this is kind of crazy (untested drivers should just not be enabled at all), but that seems to be how things work currently. Thus, reintroduce a kconfig option that controls the level of testing. Call it CRYPTO_SELFTESTS_FULL instead of the original name CRYPTO_MANAGER_EXTRA_TESTS, which was slightly misleading. Moreover, given the "production kernel" use case, make CRYPTO_SELFTESTS depend on EXPERT instead of DEBUG_KERNEL. I also haven't reinstated all the #ifdefs in crypto/testmgr.c. Instead, just rely on the compiler to optimize out unused code. Fixes: 40b9969796bf ("crypto: testmgr - replace CRYPTO_MANAGER_DISABLE_TESTS with CRYPTO_SELFTESTS") Fixes: 698de822780f ("crypto: testmgr - make it easier to enable the full set of tests") Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- crypto/Kconfig | 25 +++++++++++++++++++++---- crypto/testmgr.c | 15 ++++++++++++--- include/crypto/internal/simd.h | 6 ++++-- lib/crypto/Makefile | 2 +- 4 files changed, 38 insertions(+), 10 deletions(-) diff --git a/crypto/Kconfig b/crypto/Kconfig index e9fee7818e27..e1cfd0d4cc8f 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -176,16 +176,33 @@ config CRYPTO_USER config CRYPTO_SELFTESTS bool "Enable cryptographic self-tests" - depends on DEBUG_KERNEL + depends on EXPERT help Enable the cryptographic self-tests. The cryptographic self-tests run at boot time, or at algorithm registration time if algorithms are dynamically loaded later. - This is primarily intended for developer use. It should not be - enabled in production kernels, unless you are trying to use these - tests to fulfill a FIPS testing requirement. + There are two main use cases for these tests: + + - Development and pre-release testing. In this case, also enable + CRYPTO_SELFTESTS_FULL to get the full set of tests. All crypto code + in the kernel is expected to pass the full set of tests. + + - Production kernels, to help prevent buggy drivers from being used + and/or meet FIPS 140-3 pre-operational testing requirements. In + this case, enable CRYPTO_SELFTESTS but not CRYPTO_SELFTESTS_FULL. + +config CRYPTO_SELFTESTS_FULL + bool "Enable the full set of cryptographic self-tests" + depends on CRYPTO_SELFTESTS + help + Enable the full set of cryptographic self-tests for each algorithm. + + The full set of tests should be enabled for development and + pre-release testing, but not in production kernels. + + All crypto code in the kernel is expected to pass the full tests. config CRYPTO_NULL tristate "Null algorithms" diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 72005074a5c2..32f753d6c430 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -45,6 +45,7 @@ static bool notests; module_param(notests, bool, 0644); MODULE_PARM_DESC(notests, "disable all crypto self-tests"); +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL static bool noslowtests; module_param(noslowtests, bool, 0644); MODULE_PARM_DESC(noslowtests, "disable slow crypto self-tests"); @@ -52,6 +53,10 @@ MODULE_PARM_DESC(noslowtests, "disable slow crypto self-tests"); static unsigned int fuzz_iterations = 100; module_param(fuzz_iterations, uint, 0644); MODULE_PARM_DESC(fuzz_iterations, "number of fuzz test iterations"); +#else +#define noslowtests 1 +#define fuzz_iterations 0 +#endif #ifndef CONFIG_CRYPTO_SELFTESTS @@ -319,9 +324,9 @@ struct testvec_config { /* * The following are the lists of testvec_configs to test for each algorithm - * type when the fast crypto self-tests are enabled. They aim to provide good - * test coverage, while keeping the test time much shorter than the full tests - * so that the fast tests can be used to fulfill FIPS 140 testing requirements. + * type when the "fast" crypto self-tests are enabled. They aim to provide good + * test coverage, while keeping the test time much shorter than the "full" tests + * so that the "fast" tests can be enabled in a wider range of circumstances. */ /* Configs for skciphers and aeads */ @@ -1183,14 +1188,18 @@ static void generate_random_testvec_config(struct rnd_state *rng, static void crypto_disable_simd_for_test(void) { +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL migrate_disable(); __this_cpu_write(crypto_simd_disabled_for_test, true); +#endif } static void crypto_reenable_simd_for_test(void) { +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL __this_cpu_write(crypto_simd_disabled_for_test, false); migrate_enable(); +#endif } /* diff --git a/include/crypto/internal/simd.h b/include/crypto/internal/simd.h index 7e7f1ac3b7fd..9e338e7aafbd 100644 --- a/include/crypto/internal/simd.h +++ b/include/crypto/internal/simd.h @@ -44,9 +44,11 @@ void simd_unregister_aeads(struct aead_alg *algs, int count, * * This delegates to may_use_simd(), except that this also returns false if SIMD * in crypto code has been temporarily disabled on this CPU by the crypto - * self-tests, in order to test the no-SIMD fallback code. + * self-tests, in order to test the no-SIMD fallback code. This override is + * currently limited to configurations where the "full" self-tests are enabled, + * because it might be a bit too invasive to be part of the "fast" self-tests. */ -#ifdef CONFIG_CRYPTO_SELFTESTS +#ifdef CONFIG_CRYPTO_SELFTESTS_FULL DECLARE_PER_CPU(bool, crypto_simd_disabled_for_test); #define crypto_simd_usable() \ (may_use_simd() && !this_cpu_read(crypto_simd_disabled_for_test)) diff --git a/lib/crypto/Makefile b/lib/crypto/Makefile index 3e79283b617d..f9e44aac6619 100644 --- a/lib/crypto/Makefile +++ b/lib/crypto/Makefile @@ -62,7 +62,7 @@ libsha256-generic-y := sha256-generic.o obj-$(CONFIG_MPILIB) += mpi/ -obj-$(CONFIG_CRYPTO_SELFTESTS) += simd.o +obj-$(CONFIG_CRYPTO_SELFTESTS_FULL) += simd.o obj-$(CONFIG_CRYPTO_LIB_SM3) += libsm3.o libsm3-y := sm3.o From f59427932885f9b47b22b532b079478905b9ad08 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Thu, 12 Jun 2025 21:11:34 +0000 Subject: [PATCH 059/284] ASoC: apple: mca: Drop default ARCH_APPLE in Kconfig When the first driver for Apple Silicon was upstreamed we accidentally included `default ARCH_APPLE` in its Kconfig which then spread to almost every subsequent driver. As soon as ARCH_APPLE is set to y this will pull in many drivers as built-ins which is not what we want. Thus, drop `default ARCH_APPLE` from Kconfig. Signed-off-by: Sven Peter Reviewed-by: Janne Grunau Link: https://patch.msgid.link/20250612-apple-kconfig-defconfig-v1-10-0e6f9cb512c1@kernel.org Signed-off-by: Mark Brown --- sound/soc/apple/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/apple/Kconfig b/sound/soc/apple/Kconfig index 793f7782e0d7..e9c777cdb6e3 100644 --- a/sound/soc/apple/Kconfig +++ b/sound/soc/apple/Kconfig @@ -2,7 +2,6 @@ config SND_SOC_APPLE_MCA tristate "Apple Silicon MCA driver" depends on ARCH_APPLE || COMPILE_TEST select SND_DMAENGINE_PCM - default ARCH_APPLE help This option enables an ASoC platform driver for MCA peripherals found on Apple Silicon SoCs. From 2b32fc8ff08deac3aa509f321a28e21b1eea5525 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 12 Jun 2025 11:32:51 -0700 Subject: [PATCH 060/284] genirq/cpuhotplug: Rebalance managed interrupts across multi-CPU hotplug Commit 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") intended to only decrement the disable depth once per managed shutdown, but instead it decrements for each CPU hotplug in the affinity mask, until its depth reaches a point where it finally gets re-started. For example, consider: 1. Interrupt is affine to CPU {M,N} 2. disable_irq() -> depth is 1 3. CPU M goes offline -> interrupt migrates to CPU N / depth is still 1 4. CPU N goes offline -> irq_shutdown() / depth is 2 5. CPU N goes online -> irq_restore_affinity_of_irq() -> irqd_is_managed_and_shutdown()==true -> irq_startup_managed() -> depth is 1 6. CPU M goes online -> irq_restore_affinity_of_irq() -> irqd_is_managed_and_shutdown()==true -> irq_startup_managed() -> depth is 0 *** BUG: driver expects the interrupt is still disabled *** -> irq_startup() -> irqd_clr_managed_shutdown() 7. enable_irq() -> depth underflow / unbalanced enable_irq() warning This should clear the managed-shutdown flag at step 6, so that further hotplugs don't cause further imbalance. Note: It might be cleaner to also remove the irqd_clr_managed_shutdown() invocation from __irq_startup_managed(). But this is currently not possible because of irq_update_affinity_desc() as it sets IRQD_MANAGED_SHUTDOWN and expects irq_startup() to clear it. Fixes: 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") Reported-by: Aleksandrs Vinarskis Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Aleksandrs Vinarskis Link: https://lore.kernel.org/all/20250612183303.3433234-2-briannorris@chromium.org --- kernel/irq/chip.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b0e0a7332993..2b274007e8ba 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -205,6 +205,14 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff, void irq_startup_managed(struct irq_desc *desc) { + struct irq_data *d = irq_desc_get_irq_data(desc); + + /* + * Clear managed-shutdown flag, so we don't repeat managed-startup for + * multiple hotplugs, and cause imbalanced disable depth. + */ + irqd_clr_managed_shutdown(d); + /* * Only start it up when the disable depth is 1, so that a disable, * hotunplug, hotplug sequence does not end up enabling it during From 72218d74c9c57b8ea36c2a58875dff406fc10462 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 12 Jun 2025 11:32:52 -0700 Subject: [PATCH 061/284] genirq/cpuhotplug: Restore affinity even for suspended IRQ Commit 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") tried to make managed shutdown/startup properly reference counted, but it missed the fact that the unplug and hotplug code has an intentional imbalance by skipping IRQS_SUSPENDED interrupts on the "restore" path. This means that if a managed-affinity interrupt was both suspended and managed-shutdown (such as may happen during system suspend / S3), resume skips calling irq_startup_managed(), and would again have an unbalanced depth this time, with a positive value (i.e., remaining unexpectedly masked). This IRQS_SUSPENDED check was introduced in commit a60dd06af674 ("genirq/cpuhotplug: Skip suspended interrupts when restoring affinity") for essentially the same reason as commit 788019eb559f, to prevent that irq_startup() would unconditionally re-enable an interrupt too early. Because irq_startup_managed() now respsects the disable-depth count, the IRQS_SUSPENDED check is not longer needed, and instead, it causes harm. Thus, drop the IRQS_SUSPENDED check, and restore balance. This effectively reverts commit a60dd06af674 ("genirq/cpuhotplug: Skip suspended interrupts when restoring affinity"), because it is replaced by commit 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug"). Fixes: 788019eb559f ("genirq: Retain disable depth for managed interrupts across CPU hotplug") Reported-by: Aleksandrs Vinarskis Signed-off-by: Brian Norris Signed-off-by: Thomas Gleixner Tested-by: Aleksandrs Vinarskis Link: https://lore.kernel.org/all/20250612183303.3433234-3-briannorris@chromium.org Closes: https://lore.kernel.org/lkml/24ec4adc-7c80-49e9-93ee-19908a97ab84@gmail.com/ --- kernel/irq/cpuhotplug.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index f07529ae4895..755346ea9819 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -210,13 +210,6 @@ static void irq_restore_affinity_of_irq(struct irq_desc *desc, unsigned int cpu) !irq_data_get_irq_chip(data) || !cpumask_test_cpu(cpu, affinity)) return; - /* - * Don't restore suspended interrupts here when a system comes back - * from S3. They are reenabled via resume_device_irqs(). - */ - if (desc->istate & IRQS_SUSPENDED) - return; - if (irqd_is_managed_and_shutdown(data)) irq_startup_managed(desc); From 8a2277a3c9e4cc5398f80821afe7ecbe9bdf2819 Mon Sep 17 00:00:00 2001 From: Gyeyoung Baek Date: Thu, 12 Jun 2025 21:48:27 +0900 Subject: [PATCH 062/284] genirq/irq_sim: Initialize work context pointers properly Initialize `ops` member's pointers properly by using kzalloc() instead of kmalloc() when allocating the simulation work context. Otherwise the pointers contain random content leading to invalid dereferencing. Signed-off-by: Gyeyoung Baek Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250612124827.63259-1-gye976@gmail.com --- kernel/irq/irq_sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c index 1a3d483548e2..ae4c9cbd1b4b 100644 --- a/kernel/irq/irq_sim.c +++ b/kernel/irq/irq_sim.c @@ -202,7 +202,7 @@ struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode, void *data) { struct irq_sim_work_ctx *work_ctx __free(kfree) = - kmalloc(sizeof(*work_ctx), GFP_KERNEL); + kzalloc(sizeof(*work_ctx), GFP_KERNEL); if (!work_ctx) return ERR_PTR(-ENOMEM); From 80626ae6ffe57917915c6e6d8ea1e908689954fd Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 May 2025 14:15:12 +0100 Subject: [PATCH 063/284] drm/nouveau/gsp: Fix potential integer overflow on integer shifts The left shift int 32 bit integer constants 1 is evaluated using 32 bit arithmetic and then assigned to a 64 bit unsigned integer. In the case where the shift is 32 or more this can lead to an overflow. Avoid this by shifting using the BIT_ULL macro instead. Fixes: 6c3ac7bcfcff ("drm/nouveau/gsp: support deeper page tables in COPY_SERVER_RESERVED_PDES") Signed-off-by: Colin Ian King Signed-off-by: Danilo Krummrich Link: https://lore.kernel.org/r/20250522131512.2768310-1-colin.i.king@gmail.com --- drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/vmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/vmm.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/vmm.c index 52f2e5f14517..f25ea610cd99 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/vmm.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/vmm.c @@ -121,7 +121,7 @@ r535_mmu_vaspace_new(struct nvkm_vmm *vmm, u32 handle, bool external) page_shift -= desc->bits; ctrl->levels[i].physAddress = pd->pt[0]->addr; - ctrl->levels[i].size = (1 << desc->bits) * desc->size; + ctrl->levels[i].size = BIT_ULL(desc->bits) * desc->size; ctrl->levels[i].aperture = 1; ctrl->levels[i].pageShift = page_shift; From 9802f0a63b641f4cddb2139c814c2e95cb825099 Mon Sep 17 00:00:00 2001 From: Zhi Wang Date: Tue, 27 May 2025 16:37:12 +0000 Subject: [PATCH 064/284] drm/nouveau: fix a use-after-free in r535_gsp_rpc_push() The RPC container is released after being passed to r535_gsp_rpc_send(). When sending the initial fragment of a large RPC and passing the caller's RPC container, the container will be freed prematurely. Subsequent attempts to send remaining fragments will therefore result in a use-after-free. Allocate a temporary RPC container for holding the initial fragment of a large RPC when sending. Free the caller's container when all fragments are successfully sent. Fixes: 176fdcbddfd2 ("drm/nouveau/gsp/r535: add support for booting GSP-RM") Signed-off-by: Zhi Wang Link: https://lore.kernel.org/r/20250527163712.3444-1-zhiw@nvidia.com [ Rebase onto Blackwell changes. - Danilo ] Signed-off-by: Danilo Krummrich --- .../drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c index 5acb98d137bd..9d06ff722fea 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c @@ -637,12 +637,18 @@ r535_gsp_rpc_push(struct nvkm_gsp *gsp, void *payload, if (payload_size > max_payload_size) { const u32 fn = rpc->function; u32 remain_payload_size = payload_size; + void *next; - /* Adjust length, and send initial RPC. */ - rpc->length = sizeof(*rpc) + max_payload_size; - msg->checksum = rpc->length; + /* Send initial RPC. */ + next = r535_gsp_rpc_get(gsp, fn, max_payload_size); + if (IS_ERR(next)) { + repv = next; + goto done; + } - repv = r535_gsp_rpc_send(gsp, payload, NVKM_GSP_RPC_REPLY_NOWAIT, 0); + memcpy(next, payload, max_payload_size); + + repv = r535_gsp_rpc_send(gsp, next, NVKM_GSP_RPC_REPLY_NOWAIT, 0); if (IS_ERR(repv)) goto done; @@ -653,7 +659,6 @@ r535_gsp_rpc_push(struct nvkm_gsp *gsp, void *payload, while (remain_payload_size) { u32 size = min(remain_payload_size, max_payload_size); - void *next; next = r535_gsp_rpc_get(gsp, NV_VGPU_MSG_FUNCTION_CONTINUATION_RECORD, size); if (IS_ERR(next)) { @@ -674,6 +679,8 @@ r535_gsp_rpc_push(struct nvkm_gsp *gsp, void *payload, /* Wait for reply. */ repv = r535_gsp_rpc_handle_reply(gsp, fn, policy, payload_size + sizeof(*rpc)); + if (!IS_ERR(repv)) + kvfree(msg); } else { repv = r535_gsp_rpc_send(gsp, payload, policy, gsp_rpc_len); } From 61b2b3737499f1fb361a54a16828db24a8345e85 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Tue, 10 Jun 2025 14:54:51 -0700 Subject: [PATCH 065/284] drm/nouveau/bl: increase buffer size to avoid truncate warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The nouveau_get_backlight_name() function generates a unique name for the backlight interface, appending an id from 1 to 99 for all backlight devices after the first. GCC 15 (and likely other compilers) produce the following -Wformat-truncation warning: nouveau_backlight.c: In function ‘nouveau_backlight_init’: nouveau_backlight.c:56:69: error: ‘%d’ directive output may be truncated writing between 1 and 10 bytes into a region of size 3 [-Werror=format-truncation=] 56 | snprintf(backlight_name, BL_NAME_SIZE, "nv_backlight%d", nb); | ^~ In function ‘nouveau_get_backlight_name’, inlined from ‘nouveau_backlight_init’ at nouveau_backlight.c:351:7: nouveau_backlight.c:56:56: note: directive argument in the range [1, 2147483647] 56 | snprintf(backlight_name, BL_NAME_SIZE, "nv_backlight%d", nb); | ^~~~~~~~~~~~~~~~ nouveau_backlight.c:56:17: note: ‘snprintf’ output between 14 and 23 bytes into a destination of size 15 56 | snprintf(backlight_name, BL_NAME_SIZE, "nv_backlight%d", nb); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The warning started appearing after commit ab244be47a8f ("drm/nouveau: Fix a potential theorical leak in nouveau_get_backlight_name()") This fix for the ida usage removed the explicit value check for ids larger than 99. The compiler is unable to intuit that the ida_alloc_max() limits the returned value range between 0 and 99. Because the compiler can no longer infer that the number ranges from 0 to 99, it thinks that it could use as many as 11 digits (10 + the potential - sign for negative numbers). The warning has gone unfixed for some time, with at least one kernel test robot report. The code breaks W=1 builds, which is especially frustrating with the introduction of CONFIG_WERROR. The string is stored temporarily on the stack and then copied into the device name. Its not a big deal to use 11 more bytes of stack rounding out to an even 24 bytes. Increase BL_NAME_SIZE to 24 to avoid the truncation warning. This fixes the W=1 builds that include this driver. Compile tested only. Fixes: ab244be47a8f ("drm/nouveau: Fix a potential theorical leak in nouveau_get_backlight_name()") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202312050324.0kv4PnfZ-lkp@intel.com/ Suggested-by: Timur Tabi Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20250610-jk-nouveua-drm-bl-snprintf-fix-v2-1-7fdd4b84b48e@intel.com Signed-off-by: Danilo Krummrich --- drivers/gpu/drm/nouveau/nouveau_backlight.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_backlight.c b/drivers/gpu/drm/nouveau/nouveau_backlight.c index d47442125fa1..9aae26eb7d8f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_backlight.c +++ b/drivers/gpu/drm/nouveau/nouveau_backlight.c @@ -42,7 +42,7 @@ #include "nouveau_acpi.h" static struct ida bl_ida; -#define BL_NAME_SIZE 15 // 12 for name + 2 for digits + 1 for '\0' +#define BL_NAME_SIZE 24 // 12 for name + 11 for digits + 1 for '\0' static bool nouveau_get_backlight_name(char backlight_name[BL_NAME_SIZE], From 553ab30a181043f01b99a493ba13f9c011eb75ae Mon Sep 17 00:00:00 2001 From: Bagas Sanjaya Date: Wed, 11 Jun 2025 09:08:06 +0700 Subject: [PATCH 066/284] Documentation: nouveau: Update GSP message queue kernel-doc reference GSP message queue docs has been moved following RPC handling split in commit 8a8b1ec5261f20 ("drm/nouveau/gsp: split rpc handling out on its own"), before GSP-RM implementation is versioned in commit c472d828348caf ("drm/nouveau/gsp: move subdev/engine impls to subdev/gsp/rm/r535/"). However, the kernel-doc reference in nouveau docs is left behind, which triggers htmldocs warnings: ERROR: Cannot find file ./drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c WARNING: No kernel-doc for file ./drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c Update the reference. Fixes: c472d828348c ("drm/nouveau/gsp: move subdev/engine impls to subdev/gsp/rm/r535/") Fixes: 8a8b1ec5261f ("drm/nouveau/gsp: split rpc handling out on its own") Signed-off-by: Bagas Sanjaya Acked-by: Randy Dunlap Tested-by: Randy Dunlap Link: https://lore.kernel.org/r/20250611020805.22418-2-bagasdotme@gmail.com Signed-off-by: Danilo Krummrich --- Documentation/gpu/nouveau.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/gpu/nouveau.rst b/Documentation/gpu/nouveau.rst index b8c801e0068c..cab2e81013bc 100644 --- a/Documentation/gpu/nouveau.rst +++ b/Documentation/gpu/nouveau.rst @@ -25,7 +25,7 @@ providing a consistent API to upper layers of the driver stack. GSP Support ------------------------ -.. kernel-doc:: drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c +.. kernel-doc:: drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c :doc: GSP message queue element .. kernel-doc:: drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h From d57e92dd660014ccac884eda616cafc7b04601e0 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Fri, 13 Jun 2025 14:30:37 +0200 Subject: [PATCH 067/284] spi: tegra210-qspi: Remove cache operations The DMA memory for this driver is allocated using dma_alloc_coherent(), which ends up mapping the allocated memory as uncached. Performing the various dma_sync_*() operations on this memory causes issues during SPI flashing: [ 7.818017] pc : dcache_inval_poc+0x40/0x58 [ 7.822128] lr : arch_sync_dma_for_cpu+0x2c/0x4c [ 7.826854] sp : ffff80008193bcf0 [ 7.830267] x29: ffff80008193bcf0 x28: ffffa3fe5ff1e908 x27: ffffa3fe627bb130 [ 7.837528] x26: ffff000086952180 x25: ffff00008015c8ac x24: ffff000086c9b480 [ 7.844878] x23: ffff00008015c800 x22: 0000000000000002 x21: 0000000000010000 [ 7.852229] x20: 0000000106dae000 x19: ffff000080112410 x18: 0000000000000001 [ 7.859580] x17: ffff000080159400 x16: ffffa3fe607a9bd8 x15: ffff0000eac1b180 [ 7.866753] x14: 000000000000000c x13: 0000000000000001 x12: 000000000000025a [ 7.874104] x11: 0000000000000000 x10: 7f73e96357f6a07f x9 : db1fc8072a7f5e3a [ 7.881365] x8 : ffff000086c9c588 x7 : ffffa3fe607a9bd8 x6 : ffff80008193bc28 [ 7.888630] x5 : 000000000000ffff x4 : 0000000000000009 x3 : 000000000000003f [ 7.895892] x2 : 0000000000000040 x1 : ffff000086dbe000 x0 : ffff000086db0000 [ 7.903155] Call trace: [ 7.905606] dcache_inval_poc+0x40/0x58 (P) [ 7.909804] iommu_dma_sync_single_for_cpu+0xb4/0xb8 [ 7.914617] __dma_sync_single_for_cpu+0x158/0x194 [ 7.919428] __this_module+0x5b020/0x5baf8 [spi_tegra210_quad] [ 7.925291] irq_thread_fn+0x2c/0xc0 [ 7.928966] irq_thread+0x16c/0x318 [ 7.932467] kthread+0x12c/0x214 Fix this by removing all calls to the dma_sync_*() functions. This isn't ideal because DMA is used only for relatively large (> 64 words or 256 bytes) and using uncached memory for this might be slow. Reworking this to use cached memory for faster access and reintroducing the cache maintenance calls is probably worth a follow-up patch. Reported-by: Brad Griffis Fixes: 017f1b0bae08 ("spi: tegra210-quad: Add support for internal DMA") Signed-off-by: Thierry Reding Link: https://patch.msgid.link/20250613123037.2082788-1-thierry.reding@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-tegra210-quad.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/spi/spi-tegra210-quad.c b/drivers/spi/spi-tegra210-quad.c index 3581757a269b..3be7499db21e 100644 --- a/drivers/spi/spi-tegra210-quad.c +++ b/drivers/spi/spi-tegra210-quad.c @@ -407,9 +407,6 @@ tegra_qspi_read_rx_fifo_to_client_rxbuf(struct tegra_qspi *tqspi, struct spi_tra static void tegra_qspi_copy_client_txbuf_to_qspi_txbuf(struct tegra_qspi *tqspi, struct spi_transfer *t) { - dma_sync_single_for_cpu(tqspi->dev, tqspi->tx_dma_phys, - tqspi->dma_buf_size, DMA_TO_DEVICE); - /* * In packed mode, each word in FIFO may contain multiple packets * based on bits per word. So all bytes in each FIFO word are valid. @@ -442,17 +439,11 @@ tegra_qspi_copy_client_txbuf_to_qspi_txbuf(struct tegra_qspi *tqspi, struct spi_ tqspi->cur_tx_pos += write_bytes; } - - dma_sync_single_for_device(tqspi->dev, tqspi->tx_dma_phys, - tqspi->dma_buf_size, DMA_TO_DEVICE); } static void tegra_qspi_copy_qspi_rxbuf_to_client_rxbuf(struct tegra_qspi *tqspi, struct spi_transfer *t) { - dma_sync_single_for_cpu(tqspi->dev, tqspi->rx_dma_phys, - tqspi->dma_buf_size, DMA_FROM_DEVICE); - if (tqspi->is_packed) { tqspi->cur_rx_pos += tqspi->curr_dma_words * tqspi->bytes_per_word; } else { @@ -478,9 +469,6 @@ tegra_qspi_copy_qspi_rxbuf_to_client_rxbuf(struct tegra_qspi *tqspi, struct spi_ tqspi->cur_rx_pos += read_bytes; } - - dma_sync_single_for_device(tqspi->dev, tqspi->rx_dma_phys, - tqspi->dma_buf_size, DMA_FROM_DEVICE); } static void tegra_qspi_dma_complete(void *args) @@ -701,8 +689,6 @@ static int tegra_qspi_start_dma_based_transfer(struct tegra_qspi *tqspi, struct return ret; } - dma_sync_single_for_device(tqspi->dev, tqspi->rx_dma_phys, - tqspi->dma_buf_size, DMA_FROM_DEVICE); ret = tegra_qspi_start_rx_dma(tqspi, t, len); if (ret < 0) { dev_err(tqspi->dev, "failed to start RX DMA: %d\n", ret); From 13b86ea92ebf0fa587fbadfb8a60ca2e9993203f Mon Sep 17 00:00:00 2001 From: Raven Black Date: Fri, 13 Jun 2025 07:51:25 -0400 Subject: [PATCH 068/284] ASoC: amd: yc: update quirk data for HP Victus Make the internal microphone work on HP Victus laptops. Signed-off-by: Raven Black Link: https://patch.msgid.link/20250613-support-hp-victus-microphone-v1-1-bebc4c3a2041@gmail.com Signed-off-by: Mark Brown --- sound/soc/amd/yc/acp6x-mach.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sound/soc/amd/yc/acp6x-mach.c b/sound/soc/amd/yc/acp6x-mach.c index c9387d6fbf18..98022e5fd428 100644 --- a/sound/soc/amd/yc/acp6x-mach.c +++ b/sound/soc/amd/yc/acp6x-mach.c @@ -521,6 +521,13 @@ static const struct dmi_system_id yc_acp_quirk_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OMEN by HP Gaming Laptop 16z-n000"), } }, + { + .driver_data = &acp6x_card, + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "HP"), + DMI_MATCH(DMI_PRODUCT_NAME, "Victus by HP Gaming Laptop 15-fb2xxx"), + } + }, { .driver_data = &acp6x_card, .matches = { From 907a7a2e5bf40c6a359b2f6cc53d6fdca04009e0 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 11 Jun 2025 18:31:16 -0500 Subject: [PATCH 069/284] PCI/PM: Set up runtime PM even for devices without PCI PM 4d4c10f763d7 ("PCI: Explicitly put devices into D0 when initializing") intended to put PCI devices into D0, but in doing so unintentionally changed runtime PM initialization not to occur on devices that don't support PCI PM. This caused a regression in vfio-pci due to an imbalance with its use. Adjust the logic in pci_pm_init() so that even if PCI PM isn't supported runtime PM is still initialized. Fixes: 4d4c10f763d7 ("PCI: Explicitly put devices into D0 when initializing") Reported-by: Giovanni Cabiddu Closes: https://lore.kernel.org/linux-pci/20250424043232.1848107-1-superm1@kernel.org/T/#m7e8929d6421690dc8bd6dc639d86c2b4db27cbc4 Reported-by: Nicolas Dichtel Closes: https://lore.kernel.org/linux-pci/20250424043232.1848107-1-superm1@kernel.org/T/#m40d277dcdb9be64a1609a82412d1aa906263e201 Signed-off-by: Mario Limonciello Signed-off-by: Bjorn Helgaas Tested-by: Giovanni Cabiddu Tested-by: Nicolas Dichtel Reviewed-by: Kuppuswamy Sathyanarayanan Acked-by: Rafael J. Wysocki Cc: Alex Williamson Link: https://patch.msgid.link/20250611233117.61810-1-superm1@kernel.org --- drivers/pci/pci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index e9448d55113b..9e42090fb108 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -3217,14 +3217,14 @@ void pci_pm_init(struct pci_dev *dev) /* find PCI PM capability in list */ pm = pci_find_capability(dev, PCI_CAP_ID_PM); if (!pm) - return; + goto poweron; /* Check device's ability to generate PME# */ pci_read_config_word(dev, pm + PCI_PM_PMC, &pmc); if ((pmc & PCI_PM_CAP_VER_MASK) > 3) { pci_err(dev, "unsupported PM cap regs version (%u)\n", pmc & PCI_PM_CAP_VER_MASK); - return; + goto poweron; } dev->pm_cap = pm; @@ -3269,6 +3269,7 @@ void pci_pm_init(struct pci_dev *dev) pci_read_config_word(dev, PCI_STATUS, &status); if (status & PCI_STATUS_IMM_READY) dev->imm_ready = 1; +poweron: pci_pm_power_up_and_verify_state(dev); pm_runtime_forbid(&dev->dev); pm_runtime_set_active(&dev->dev); From efa6bdf1bc75e26cafaa5f1d775e8bb7c5b0c431 Mon Sep 17 00:00:00 2001 From: Jonathan Lane Date: Wed, 11 Jun 2025 12:31:25 -0700 Subject: [PATCH 070/284] ALSA: hda/realtek: enable headset mic on Latitude 5420 Rugged Like many Dell laptops, the 3.5mm port by default can not detect a combined headphones+mic headset or even a pure microphone. This change enables the port's functionality. Signed-off-by: Jonathan Lane Cc: Link: https://patch.msgid.link/20250611193124.26141-2-jon@borg.moe Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bca725bb8281..02a424b7a992 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10509,6 +10509,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x0871, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0872, "Dell Precision 3630", ALC255_FIXUP_DELL_HEADSET_MIC), SND_PCI_QUIRK(0x1028, 0x0873, "Dell Precision 3930", ALC255_FIXUP_DUMMY_LINEOUT_VERB), + SND_PCI_QUIRK(0x1028, 0x0879, "Dell Latitude 5420 Rugged", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x08ad, "Dell WYSE AIO", ALC225_FIXUP_DELL_WYSE_AIO_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x08ae, "Dell WYSE NB", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x0935, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), From 60599e4f8a041d1186afe4bcf12ba522afe4c5bc Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Fri, 13 Jun 2025 14:52:51 +0000 Subject: [PATCH 071/284] ALSA: hda/realtek: Add quirk for Asus GA605K The GA605K has similar audio hardware to the GA403U so apply the same quirk. Signed-off-by: Simon Trimmer Tested-by: Timofey Titovets Link: https://github.com/alsa-project/alsa-ucm-conf/issues/578 Link: https://patch.msgid.link/20250613145251.397500-1-simont@opensource.cirrus.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 02a424b7a992..76cc908d4957 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8030,6 +8030,8 @@ enum { ALC294_FIXUP_ASUS_CS35L41_SPI_2, ALC274_FIXUP_HP_AIO_BIND_DACS, ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2, + ALC285_FIXUP_ASUS_GA605K_HEADSET_MIC, + ALC285_FIXUP_ASUS_GA605K_I2C_SPEAKER2_TO_DAC1, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -10414,6 +10416,20 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc274_fixup_hp_aio_bind_dacs, }, + [ALC285_FIXUP_ASUS_GA605K_HEADSET_MIC] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x03a11050 }, + { 0x1b, 0x03a11c30 }, + { } + }, + .chained = true, + .chain_id = ALC285_FIXUP_ASUS_GA605K_I2C_SPEAKER2_TO_DAC1 + }, + [ALC285_FIXUP_ASUS_GA605K_I2C_SPEAKER2_TO_DAC1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_speaker2_to_dac1, + }, }; static const struct hda_quirk alc269_fixup_tbl[] = { @@ -10937,6 +10953,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1043, 0x12e0, "ASUS X541SA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x12f0, "ASUS X541UV", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1313, "Asus K42JZ", ALC269VB_FIXUP_ASUS_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1043, 0x1314, "ASUS GA605K", ALC285_FIXUP_ASUS_GA605K_HEADSET_MIC), SND_PCI_QUIRK(0x1043, 0x13b0, "ASUS Z550SA", ALC256_FIXUP_ASUS_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1043, 0x1427, "Asus Zenbook UX31E", ALC269VB_FIXUP_ASUS_ZENBOOK), SND_PCI_QUIRK(0x1043, 0x1433, "ASUS GX650PY/PZ/PV/PU/PYV/PZV/PIV/PVV", ALC285_FIXUP_ASUS_I2C_HEADSET_MIC), From d3deabe4c619875714b9a844b1a3d9752dbae1dd Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 13 Jun 2025 07:41:44 -0700 Subject: [PATCH 072/284] drm/msm: Fix inverted WARN_ON() logic We want to WARN_ON() if info is NULL. Suggested-by: Konrad Dybcio Fixes: 0838fc3e6718 ("drm/msm/adreno: Check for recognized GPU before bind") Tested-by: Neil Armstrong Signed-off-by: Rob Clark Reported-by: Alexey Klimov Reviewed-by: Konrad Dybcio Patchwork: https://patchwork.freedesktop.org/patch/658631/ --- drivers/gpu/drm/msm/adreno/adreno_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 5e7307567239..16e7ac444efd 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -221,7 +221,7 @@ static int adreno_bind(struct device *dev, struct device *master, void *data) info = adreno_info(config.chip_id); /* We shouldn't have gotten this far if we don't recognize the GPU: */ - if (!WARN_ON(info)) + if (WARN_ON(!info)) return -ENXIO; config.info = info; From 1d27f11bf02b38c431e49a17dee5c10a2b4c2e28 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 15 Jun 2025 08:09:14 -0600 Subject: [PATCH 073/284] io_uring/rsrc: validate buffer count with offset for cloning syzbot reports that it can trigger a WARN_ON() for kmalloc() attempt that's too big: WARNING: CPU: 0 PID: 6488 at mm/slub.c:5024 __kvmalloc_node_noprof+0x520/0x640 mm/slub.c:5024 Modules linked in: CPU: 0 UID: 0 PID: 6488 Comm: syz-executor312 Not tainted 6.15.0-rc7-syzkaller-gd7fa1af5b33e #0 PREEMPT Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 pstate: 20400005 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __kvmalloc_node_noprof+0x520/0x640 mm/slub.c:5024 lr : __do_kmalloc_node mm/slub.c:-1 [inline] lr : __kvmalloc_node_noprof+0x3b4/0x640 mm/slub.c:5012 sp : ffff80009cfd7a90 x29: ffff80009cfd7ac0 x28: ffff0000dd52a120 x27: 0000000000412dc0 x26: 0000000000000178 x25: ffff7000139faf70 x24: 0000000000000000 x23: ffff800082f4cea8 x22: 00000000ffffffff x21: 000000010cd004a8 x20: ffff0000d75816c0 x19: ffff0000dd52a000 x18: 00000000ffffffff x17: ffff800092f39000 x16: ffff80008adbe9e4 x15: 0000000000000005 x14: 1ffff000139faf1c x13: 0000000000000000 x12: 0000000000000000 x11: ffff7000139faf21 x10: 0000000000000003 x9 : ffff80008f27b938 x8 : 0000000000000002 x7 : 0000000000000000 x6 : 0000000000000000 x5 : 00000000ffffffff x4 : 0000000000400dc0 x3 : 0000000200000000 x2 : 000000010cd004a8 x1 : ffff80008b3ebc40 x0 : 0000000000000001 Call trace: __kvmalloc_node_noprof+0x520/0x640 mm/slub.c:5024 (P) kvmalloc_array_node_noprof include/linux/slab.h:1065 [inline] io_rsrc_data_alloc io_uring/rsrc.c:206 [inline] io_clone_buffers io_uring/rsrc.c:1178 [inline] io_register_clone_buffers+0x484/0xa14 io_uring/rsrc.c:1287 __io_uring_register io_uring/register.c:815 [inline] __do_sys_io_uring_register io_uring/register.c:926 [inline] __se_sys_io_uring_register io_uring/register.c:903 [inline] __arm64_sys_io_uring_register+0x42c/0xea8 io_uring/register.c:903 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x58/0x17c arch/arm64/kernel/entry-common.c:767 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 which is due to offset + buffer_count being too large. The registration code checks only the total count of buffers, but given that the indexing is an array, it should also check offset + count. That can't exceed IORING_MAX_REG_BUFFERS either, as there's no way to reach buffers beyond that limit. There's no issue with registrering a table this large, outside of the fact that it's pointless to register buffers that cannot be reached, and that it can trigger this kmalloc() warning for attempting an allocation that is too large. Cc: stable@vger.kernel.org Fixes: b16e920a1909 ("io_uring/rsrc: allow cloning at an offset") Reported-by: syzbot+cb4bf3cb653be0d25de8@syzkaller.appspotmail.com Link: https://lore.kernel.org/io-uring/684e77bd.a00a0220.279073.0029.GAE@google.com/ Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index c592ceace97d..94a9db030e0e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1177,6 +1177,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx return -EINVAL; if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) return -EOVERFLOW; + if (nbufs > IORING_MAX_REG_BUFFERS) + return -EINVAL; ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); if (ret) From 89465d923bda180299e69ee2800aab84ad0ba689 Mon Sep 17 00:00:00 2001 From: Penglei Jiang Date: Sun, 15 Jun 2025 09:39:06 -0700 Subject: [PATCH 074/284] io_uring: fix task leak issue in io_wq_create() Add missing put_task_struct() in the error path Cc: stable@vger.kernel.org Fixes: 0f8baa3c9802 ("io-wq: fully initialize wqe before calling cpuhp_state_add_instance_nocalls()") Signed-off-by: Penglei Jiang Link: https://lore.kernel.org/r/20250615163906.2367-1-superman.xpt@gmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index cd1fcb115739..be91edf34f01 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -1259,8 +1259,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) + if (ret) { + put_task_struct(wq->task); goto err; + } return wq; err: From a7137b1825b535eb7258b25beeb0d5425e0037d2 Mon Sep 17 00:00:00 2001 From: Tzung-Bi Shih Date: Thu, 12 Jun 2025 08:30:23 +0000 Subject: [PATCH 075/284] drm/i915/pmu: Fix build error with GCOV and AutoFDO enabled i915_pmu.c may fail to build with GCOV and AutoFDO enabled. ../drivers/gpu/drm/i915/i915_pmu.c:116:3: error: call to '__compiletime_assert_487' declared with 'error' attribute: BUILD_BUG_ON failed: bit > BITS_PER_TYPE(typeof_member(struct i915_pmu, enable)) - 1 116 | BUILD_BUG_ON(bit > | ^ Here is a way to reproduce the issue: $ git checkout v6.15 $ mkdir build $ ./scripts/kconfig/merge_config.sh -O build -n -m <(cat < Signed-off-by: Tzung-Bi Shih Signed-off-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20250612083023.562585-1-tzungbi@kernel.org (cherry picked from commit 686d773186bf72b739bab7e12eb8665d914676ee) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_pmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index e5a188ce3185..990bfaba3ce4 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -112,7 +112,7 @@ static u32 config_mask(const u64 config) { unsigned int bit = config_bit(config); - if (__builtin_constant_p(config)) + if (__builtin_constant_p(bit)) BUILD_BUG_ON(bit > BITS_PER_TYPE(typeof_member(struct i915_pmu, enable)) - 1); @@ -121,7 +121,7 @@ static u32 config_mask(const u64 config) BITS_PER_TYPE(typeof_member(struct i915_pmu, enable)) - 1); - return BIT(config_bit(config)); + return BIT(bit); } static bool is_engine_event(struct perf_event *event) From c464ce6af332e7c802c36cd337cacf81db05400c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Fri, 14 Mar 2025 17:01:34 +0200 Subject: [PATCH 076/284] drm/i915/dsi: Fix off by one in BXT_MIPI_TRANS_VTOTAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BXT_MIPI_TRANS_VTOTAL must be programmed with vtotal-1 instead of vtotal. Make it so. Cc: stable@vger.kernel.org Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20250314150136.22564-1-ville.syrjala@linux.intel.com Reviewed-by: Jani Nikula (cherry picked from commit 7b3685c9b38c3097f465efec8b24dbed63258cf6) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/display/vlv_dsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/vlv_dsi.c b/drivers/gpu/drm/i915/display/vlv_dsi.c index 346737f15fa9..21c1e10caf68 100644 --- a/drivers/gpu/drm/i915/display/vlv_dsi.c +++ b/drivers/gpu/drm/i915/display/vlv_dsi.c @@ -1056,7 +1056,7 @@ static void bxt_dsi_get_pipe_config(struct intel_encoder *encoder, BXT_MIPI_TRANS_VACTIVE(port)); adjusted_mode->crtc_vtotal = intel_de_read(display, - BXT_MIPI_TRANS_VTOTAL(port)); + BXT_MIPI_TRANS_VTOTAL(port)) + 1; hactive = adjusted_mode->crtc_hdisplay; hfp = intel_de_read(display, MIPI_HFP_COUNT(display, port)); @@ -1260,7 +1260,7 @@ static void set_dsi_timings(struct intel_encoder *encoder, intel_de_write(display, BXT_MIPI_TRANS_VACTIVE(port), adjusted_mode->crtc_vdisplay); intel_de_write(display, BXT_MIPI_TRANS_VTOTAL(port), - adjusted_mode->crtc_vtotal); + adjusted_mode->crtc_vtotal - 1); } intel_de_write(display, MIPI_HACTIVE_AREA_COUNT(display, port), From e6382fcf989074566bb9a54bbd3c514d7bb99397 Mon Sep 17 00:00:00 2001 From: Vivian Wang Date: Fri, 13 Jun 2025 17:25:33 +0800 Subject: [PATCH 077/284] gpio: spacemit: Add missing MODULE_DEVICE_TABLE The gpio-spacemit-k1 driver can be compiled as a module. Add missing MODULE_DEVICE_TABLE so it can be matched by modalias and automatically loaded by udev. Fixes: d00553240ef8 ("gpio: spacemit: add support for K1 SoC") Signed-off-by: Vivian Wang Reviewed-by: Yixun Lan Link: https://lore.kernel.org/r/20250613-k1-gpio-of-table-v1-1-9015da8fdfdb@iscas.ac.cn Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-spacemit-k1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-spacemit-k1.c b/drivers/gpio/gpio-spacemit-k1.c index f027066365ff..3cc75c701ec4 100644 --- a/drivers/gpio/gpio-spacemit-k1.c +++ b/drivers/gpio/gpio-spacemit-k1.c @@ -278,6 +278,7 @@ static const struct of_device_id spacemit_gpio_dt_ids[] = { { .compatible = "spacemit,k1-gpio" }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, spacemit_gpio_dt_ids); static struct platform_driver spacemit_gpio_driver = { .probe = spacemit_gpio_probe, From b081d8564e4a396843a78788476fdcbf70efed06 Mon Sep 17 00:00:00 2001 From: Laurentiu Mihalcea Date: Fri, 13 Jun 2025 15:43:10 -0400 Subject: [PATCH 078/284] ASoC: SOF: imx8: add core shutdown operation for imx8/imx8x Currently, the DSP core from i.MX8QM/i.MX8QXP is able to operate while the firmware image is being loaded. Because of this, the DSP may change the content of the firmware data just after it was loaded, thus leading to the data having unexpected values when the DSP is reset (via run()). Fix this by implementing the core_shutdown() operation that will put the DSP in stall during suspend(). The stall will be removed during the run() opertion, thus guaranteeing that the DSP core will not be able to run while the firmware image is being loaded. Signed-off-by: Laurentiu Mihalcea Reviewed-by: Daniel Baluta Link: https://patch.msgid.link/20250613194310.1128733-1-laurentiumihalcea111@gmail.com Signed-off-by: Mark Brown --- sound/soc/sof/imx/imx8.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sound/soc/sof/imx/imx8.c b/sound/soc/sof/imx/imx8.c index a40a8047873e..b73dd91bd529 100644 --- a/sound/soc/sof/imx/imx8.c +++ b/sound/soc/sof/imx/imx8.c @@ -40,6 +40,19 @@ struct imx8m_chip_data { struct reset_control *run_stall; }; +static int imx8_shutdown(struct snd_sof_dev *sdev) +{ + /* + * Force the DSP to stall. After the firmware image is loaded, + * the stall will be removed during run() by a matching + * imx_sc_pm_cpu_start() call. + */ + imx_sc_pm_cpu_start(get_chip_pdata(sdev), IMX_SC_R_DSP, false, + RESET_VECTOR_VADDR); + + return 0; +} + /* * DSP control. */ @@ -281,11 +294,13 @@ static int imx8_ops_init(struct snd_sof_dev *sdev) static const struct imx_chip_ops imx8_chip_ops = { .probe = imx8_probe, .core_kick = imx8_run, + .core_shutdown = imx8_shutdown, }; static const struct imx_chip_ops imx8x_chip_ops = { .probe = imx8_probe, .core_kick = imx8x_run, + .core_shutdown = imx8_shutdown, }; static const struct imx_chip_ops imx8m_chip_ops = { From 8acfb165a492251a08a22a4fa6497a131e8c2609 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Fri, 6 Jun 2025 21:04:18 +0200 Subject: [PATCH 079/284] regulator: fan53555: add enable_time support and soft-start times The datasheets for all the fan53555 variants (and clones using the same interface) define so called soft start times, from enabling the regulator until at least some percentage of the output (i.e. 92% for the rk860x types) are available. The regulator framework supports this with the enable_time property but currently the fan53555 driver does not define enable_times for any variant. I ran into a problem with this while testing the new driver for the Rockchip NPUs (rocket), which does runtime-pm including disabling and enabling a rk8602 as needed. When reenabling the regulator while running a load, fatal hangs could be observed while enabling the associated power-domain, which the regulator supplies. Experimentally setting the regulator to always-on, made the issue disappear, leading to the missing delay to let power stabilize. And as expected, setting the enable-time to a non-zero value according to the datasheet also resolved the regulator-issue. The datasheets in nearly all cases only specify "typical" values, except for the fan53555 type 08. There both a typical and maximum value are listed - 40uS apart. For all typical values I've added 100uS to be on the safe side. Individual details for the relevant regulators below: - fan53526: The datasheet for all variants lists a typical value of 150uS, so make that 250uS with safety margin. - fan53555: types 08 and 18 (unsupported) are given a typical enable time of 135uS but also a maximum of 175uS so use that value. All the other types only have a typical time in the datasheet of 300uS, so give a bit margin by setting it to 400uS. - rk8600 + rk8602: Datasheet reports a typical value of 260us, so use 360uS to be safe. - syr82x + syr83x: All datasheets report typical soft-start values of 300uS for these regulators, so use 400uS. - tcs452x: Datasheet sadly does not report a soft-start time, so I've not set an enable-time Signed-off-by: Heiko Stuebner Link: https://patch.msgid.link/20250606190418.478633-1-heiko@sntech.de Signed-off-by: Mark Brown --- drivers/regulator/fan53555.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/regulator/fan53555.c b/drivers/regulator/fan53555.c index bd9447dac596..c282236959b1 100644 --- a/drivers/regulator/fan53555.c +++ b/drivers/regulator/fan53555.c @@ -147,6 +147,7 @@ struct fan53555_device_info { unsigned int slew_mask; const unsigned int *ramp_delay_table; unsigned int n_ramp_values; + unsigned int enable_time; unsigned int slew_rate; }; @@ -282,6 +283,7 @@ static int fan53526_voltages_setup_fairchild(struct fan53555_device_info *di) di->slew_mask = CTL_SLEW_MASK; di->ramp_delay_table = slew_rates; di->n_ramp_values = ARRAY_SIZE(slew_rates); + di->enable_time = 250; di->vsel_count = FAN53526_NVOLTAGES; return 0; @@ -296,10 +298,12 @@ static int fan53555_voltages_setup_fairchild(struct fan53555_device_info *di) case FAN53555_CHIP_REV_00: di->vsel_min = 600000; di->vsel_step = 10000; + di->enable_time = 400; break; case FAN53555_CHIP_REV_13: di->vsel_min = 800000; di->vsel_step = 10000; + di->enable_time = 400; break; default: dev_err(di->dev, @@ -311,13 +315,19 @@ static int fan53555_voltages_setup_fairchild(struct fan53555_device_info *di) case FAN53555_CHIP_ID_01: case FAN53555_CHIP_ID_03: case FAN53555_CHIP_ID_05: + di->vsel_min = 600000; + di->vsel_step = 10000; + di->enable_time = 400; + break; case FAN53555_CHIP_ID_08: di->vsel_min = 600000; di->vsel_step = 10000; + di->enable_time = 175; break; case FAN53555_CHIP_ID_04: di->vsel_min = 603000; di->vsel_step = 12826; + di->enable_time = 400; break; default: dev_err(di->dev, @@ -350,6 +360,7 @@ static int fan53555_voltages_setup_rockchip(struct fan53555_device_info *di) di->slew_mask = CTL_SLEW_MASK; di->ramp_delay_table = slew_rates; di->n_ramp_values = ARRAY_SIZE(slew_rates); + di->enable_time = 360; di->vsel_count = FAN53555_NVOLTAGES; return 0; @@ -372,6 +383,7 @@ static int rk8602_voltages_setup_rockchip(struct fan53555_device_info *di) di->slew_mask = CTL_SLEW_MASK; di->ramp_delay_table = slew_rates; di->n_ramp_values = ARRAY_SIZE(slew_rates); + di->enable_time = 360; di->vsel_count = RK8602_NVOLTAGES; return 0; @@ -395,6 +407,7 @@ static int fan53555_voltages_setup_silergy(struct fan53555_device_info *di) di->slew_mask = CTL_SLEW_MASK; di->ramp_delay_table = slew_rates; di->n_ramp_values = ARRAY_SIZE(slew_rates); + di->enable_time = 400; di->vsel_count = FAN53555_NVOLTAGES; return 0; @@ -594,6 +607,7 @@ static int fan53555_regulator_register(struct fan53555_device_info *di, rdesc->ramp_mask = di->slew_mask; rdesc->ramp_delay_table = di->ramp_delay_table; rdesc->n_ramp_values = di->n_ramp_values; + rdesc->enable_time = di->enable_time; rdesc->owner = THIS_MODULE; rdev = devm_regulator_register(di->dev, &di->desc, config); From 070b315333ee942f2cc69d02a864945c9bc27ef2 Mon Sep 17 00:00:00 2001 From: Chun-Tse Shao Date: Wed, 21 May 2025 15:44:44 -0700 Subject: [PATCH 080/284] perf test: Restrict uniquifying test to machines with 'uncore_imc' The test would fail if target machine does not have 'uncore_imc' devices. Since event uniquifying behavior is similar among different architectures, we are restricting the test to only run on machines with `uncore_imc` devices. Suggested-by: Ian Rogers Signed-off-by: Chun-Tse Shao Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250521224513.1104129-1-ctshao@google.com [ Skip the test, i.e. return 2, instead of returning 0 as if the test had succeed ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+event_uniquifying.sh | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/shell/stat+event_uniquifying.sh b/tools/perf/tests/shell/stat+event_uniquifying.sh index 5ec35c52b7d9..bf54bd6c3e2e 100755 --- a/tools/perf/tests/shell/stat+event_uniquifying.sh +++ b/tools/perf/tests/shell/stat+event_uniquifying.sh @@ -9,7 +9,8 @@ perf_tool=perf err=0 test_event_uniquifying() { - # We use `clockticks` to verify the uniquify behavior. + # We use `clockticks` in `uncore_imc` to verify the uniquify behavior. + pmu="uncore_imc" event="clockticks" # If the `-A` option is added, the event should be uniquified. @@ -43,11 +44,18 @@ test_event_uniquifying() { echo "stat event uniquifying test" uniquified_event_array=() + # Skip if the machine does not have `uncore_imc` device. + if ! ${perf_tool} list pmu | grep -q ${pmu}; then + echo "Target does not support PMU ${pmu} [Skipped]" + err=2 + return + fi + # Check how many uniquified events. while IFS= read -r line; do uniquified_event=$(echo "$line" | awk '{print $1}') uniquified_event_array+=("${uniquified_event}") - done < <(${perf_tool} list -v ${event} | grep "\[Kernel PMU event\]") + done < <(${perf_tool} list -v ${event} | grep ${pmu}) perf_command="${perf_tool} stat -e $event -A -o ${stat_output} -- true" $perf_command From 11cfaf37d6a79447bde8192e2e3bb2877932b82b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Jun 2025 12:32:06 -0300 Subject: [PATCH 081/284] tools headers: Update the fs headers with the kernel sources To pick up changes from: 5d894321c49e6137 ("fs: add atomic write unit max opt to statx") a516403787e08119 ("fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions") c07d3aede2b26830 ("fscrypt: add support for hardware-wrapped keys") These are used to beautify fs syscall arguments, albeit the changes in this update are not affecting those beautifiers. This addresses these tools/ build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/fscrypt.h include/uapi/linux/fscrypt.h diff -u tools/include/uapi/linux/stat.h include/uapi/linux/stat.h diff -u tools/perf/trace/beauty/include/uapi/linux/fs.h include/uapi/linux/fs.h diff -u tools/perf/trace/beauty/include/uapi/linux/stat.h include/uapi/linux/stat.h Please see tools/include/uapi/README for details (it's in the first patch of this series). Cc: Adrian Hunter Cc: Andrei Vagin Cc: Andrew Morton Cc: Andrii Nakryiko Cc: Darrick J. Wong Cc: Eric Biggers Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kan Liang Cc: Liam R. Howlett Cc: Namhyung Kim Link: https://lore.kernel.org/r/aEce1keWdO-vGeqe@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fscrypt.h | 6 ++++-- tools/include/uapi/linux/stat.h | 8 ++++++-- tools/perf/trace/beauty/include/uapi/linux/fs.h | 1 + tools/perf/trace/beauty/include/uapi/linux/stat.h | 8 ++++++-- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tools/include/uapi/linux/fscrypt.h b/tools/include/uapi/linux/fscrypt.h index 7a8f4c290187..3aff99f2696a 100644 --- a/tools/include/uapi/linux/fscrypt.h +++ b/tools/include/uapi/linux/fscrypt.h @@ -119,7 +119,7 @@ struct fscrypt_key_specifier { */ struct fscrypt_provisioning_key_payload { __u32 type; - __u32 __reserved; + __u32 flags; __u8 raw[]; }; @@ -128,7 +128,9 @@ struct fscrypt_add_key_arg { struct fscrypt_key_specifier key_spec; __u32 raw_size; __u32 key_id; - __u32 __reserved[8]; +#define FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED 0x00000001 + __u32 flags; + __u32 __reserved[7]; __u8 raw[]; }; diff --git a/tools/include/uapi/linux/stat.h b/tools/include/uapi/linux/stat.h index f78ee3670dd5..1686861aae20 100644 --- a/tools/include/uapi/linux/stat.h +++ b/tools/include/uapi/linux/stat.h @@ -182,8 +182,12 @@ struct statx { /* File offset alignment for direct I/O reads */ __u32 stx_dio_read_offset_align; - /* 0xb8 */ - __u64 __spare3[9]; /* Spare space for future expansion */ + /* Optimised max atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max_opt; + __u32 __spare2[1]; + + /* 0xc0 */ + __u64 __spare3[8]; /* Spare space for future expansion */ /* 0x100 */ }; diff --git a/tools/perf/trace/beauty/include/uapi/linux/fs.h b/tools/perf/trace/beauty/include/uapi/linux/fs.h index e762e1af650c..0098b0ce8ccb 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/fs.h +++ b/tools/perf/trace/beauty/include/uapi/linux/fs.h @@ -361,6 +361,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags diff --git a/tools/perf/trace/beauty/include/uapi/linux/stat.h b/tools/perf/trace/beauty/include/uapi/linux/stat.h index f78ee3670dd5..1686861aae20 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/stat.h +++ b/tools/perf/trace/beauty/include/uapi/linux/stat.h @@ -182,8 +182,12 @@ struct statx { /* File offset alignment for direct I/O reads */ __u32 stx_dio_read_offset_align; - /* 0xb8 */ - __u64 __spare3[9]; /* Spare space for future expansion */ + /* Optimised max atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max_opt; + __u32 __spare2[1]; + + /* 0xc0 */ + __u64 __spare3[8]; /* Spare space for future expansion */ /* 0x100 */ }; From bbeb1088a40b0da8b832d72e05e1c1cb71535712 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 9 Jun 2025 17:57:42 -0700 Subject: [PATCH 082/284] perf mem: Document new output fields (op, cache, mem, dtlb, snoop) Update the documentation of the new fields with examples and caveats. Also update the related documentation for AMD IBS. Reviewed-by: Ravi Bangoria Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250610005742.2173050-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-amd-ibs.txt | 57 ++++++++++++++++------- tools/perf/Documentation/perf-mem.txt | 50 ++++++++++++++++++++ 2 files changed, 91 insertions(+), 16 deletions(-) diff --git a/tools/perf/Documentation/perf-amd-ibs.txt b/tools/perf/Documentation/perf-amd-ibs.txt index 55f80beae037..548549935760 100644 --- a/tools/perf/Documentation/perf-amd-ibs.txt +++ b/tools/perf/Documentation/perf-amd-ibs.txt @@ -171,23 +171,48 @@ Below is a simple example of the perf mem tool. # perf mem report A normal perf mem report output will provide detailed memory access profile. -However, it can also be aggregated based on output fields. For example: +New output fields will show related access info together. For example: - # perf mem report -F mem,sample,snoop - Samples: 3M of event 'ibs_op//', Event count (approx.): 23524876 - Memory access Samples Snoop - N/A 1903343 N/A - L1 hit 1056754 N/A - L2 hit 75231 N/A - L3 hit 9496 HitM - L3 hit 2270 N/A - RAM hit 8710 N/A - Remote node, same socket RAM hit 3241 N/A - Remote core, same node Any cache hit 1572 HitM - Remote core, same node Any cache hit 514 N/A - Remote node, same socket Any cache hit 1216 HitM - Remote node, same socket Any cache hit 350 N/A - Uncached hit 18 N/A + # perf mem report -F overhead,cache,snoop,comm + ... + # Samples: 92K of event 'ibs_op//' + # Total weight : 531104 + # + # ---------- Cache ----------- --- Snoop ---- + # Overhead L1 L2 L1-buf Other HitM Other Command + # ........ ............................ .............. .......... + # + 76.07% 5.8% 35.7% 0.0% 34.6% 23.3% 52.8% cc1 + 5.79% 0.2% 0.0% 0.0% 5.6% 0.1% 5.7% make + 5.78% 0.1% 4.4% 0.0% 1.2% 0.5% 5.3% gcc + 5.33% 0.3% 3.9% 0.0% 1.1% 0.2% 5.2% as + 5.00% 0.1% 3.8% 0.0% 1.0% 0.3% 4.7% sh + 1.56% 0.1% 0.1% 0.0% 1.4% 0.6% 0.9% ld + 0.28% 0.1% 0.0% 0.0% 0.2% 0.1% 0.2% pkg-config + 0.09% 0.0% 0.0% 0.0% 0.1% 0.0% 0.1% git + 0.03% 0.0% 0.0% 0.0% 0.0% 0.0% 0.0% rm + ... + +Also, it can be aggregated based on various memory access info using the +sort keys. For example: + + # perf mem report -s mem,snoop + ... + # Samples: 92K of event 'ibs_op//' + # Total weight : 531104 + # Sort order : mem,snoop + # + # Overhead Samples Memory access Snoop + # ........ ............ ....................................... ............ + # + 47.99% 1509 L2 hit N/A + 25.08% 338 core, same node Any cache hit HitM + 10.24% 54374 N/A N/A + 6.77% 35938 L1 hit N/A + 6.39% 101 core, same node Any cache hit N/A + 3.50% 69 RAM hit N/A + 0.03% 158 LFB/MAB hit N/A + 0.00% 2 Uncached hit N/A Please refer to their man page for more detail. diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt index 965e73d37772..4d164836d094 100644 --- a/tools/perf/Documentation/perf-mem.txt +++ b/tools/perf/Documentation/perf-mem.txt @@ -119,6 +119,22 @@ REPORT OPTIONS And the default sort keys are changed to local_weight, mem, sym, dso, symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat. +-F:: +--fields=:: + Specify output field - multiple keys can be specified in CSV format. + Please see linkperf:perf-report[1] for details. + + In addition to the default fields, 'perf mem report' will provide the + following fields to break down sample periods. + + - op: operation in the sample instruction (load, store, prefetch, ...) + - cache: location in CPU cache (L1, L2, ...) where the sample hit + - mem: location in memory or other places the sample hit + - dtlb: location in Data TLB (L1, L2) where the sample hit + - snoop: snoop result for the sampled data access + + Please take a look at the OUTPUT FIELD SELECTION section for caveats. + -T:: --type-profile:: Show data-type profile result instead of code symbols. This requires @@ -156,6 +172,40 @@ but one sample with weight 180 and the other with weight 20: 90% [k] memcpy 10% [.] strcmp +OUTPUT FIELD SELECTION +---------------------- +"perf mem report" adds a number of new output fields specific to data source +information in the sample. Some of them have the same name with the existing +sort keys ("mem" and "snoop"). So unlike other fields and sort keys, they'll +behave differently when it's used by -F/--fields or -s/--sort. + +Using those two as output fields will aggregate samples altogether and show +breakdown. + + $ perf mem report -F mem,snoop + ... + # ------ Memory ------- --- Snoop ---- + # RAM Uncach Other HitM Other + # ..................... .............. + # + 3.5% 0.0% 96.5% 25.1% 74.9% + +But using the same name for sort keys will aggregate samples for each type +separately. + + $ perf mem report -s mem,snoop + # Overhead Samples Memory access Snoop + # ........ ............ ....................................... ............ + # + 47.99% 1509 L2 hit N/A + 25.08% 338 core, same node Any cache hit HitM + 10.24% 54374 N/A N/A + 6.77% 35938 L1 hit N/A + 6.39% 101 core, same node Any cache hit N/A + 3.50% 69 RAM hit N/A + 0.03% 158 LFB/MAB hit N/A + 0.00% 2 Uncached hit N/A + SEE ALSO -------- linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-arm-spe[1] From 3ce66a48a685f27a931e5bdffb637751c58df7d9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 9 Jun 2025 15:02:17 -0300 Subject: [PATCH 083/284] tools headers UAPI: Sync linux/prctl.h with the kernel sources to pick FUTEX knob To pick the changes in: 63e8595c060a1fef ("futex: Allow to make the private hash immutable") 80367ad01d93ac78 ("futex: Add basic infrastructure for local task local hash") That adds a FUTEX knob: $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/perf/trace/beauty/include/uapi/linux/prctl.h $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after --- before 2025-06-09 14:50:45.162579336 -0300 +++ after 2025-06-09 14:50:52.797660024 -0300 @@ -72,6 +72,7 @@ [75] = "SET_SHADOW_STACK_STATUS", [76] = "LOCK_SHADOW_STACK_STATUS", [77] = "TIMER_CREATE_RESTORE_IDS", + [78] = "FUTEX_HASH", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", $ That now will be used to decode the syscall option and also to compose filters, for instance: [root@five ~]# perf trace -e syscalls:sys_enter_prctl --filter option==SET_NAME 0.000 Isolated Servi/3474327 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23f13b7aee) 0.032 DOM Worker/3474327 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23deb25670) 7.920 :3474328/3474328 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24fbb10) 7.935 StreamT~s #374/3474328 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24fb970) 8.400 Isolated Servi/3474329 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24bab10) 8.418 StreamT~s #374/3474329 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24ba970) ^C[root@five ~]# This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Acked-by: Sebastian Andrzej Siewior Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/aEiYOtKkrVDT03hZ@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/uapi/linux/prctl.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h index 15c18ef4eb11..43dec6eed559 100644 --- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h +++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h @@ -364,4 +364,11 @@ struct prctl_mm_map { # define PR_TIMER_CREATE_RESTORE_IDS_ON 1 # define PR_TIMER_CREATE_RESTORE_IDS_GET 2 +/* FUTEX hash management */ +#define PR_FUTEX_HASH 78 +# define PR_FUTEX_HASH_SET_SLOTS 1 +# define FH_FLAG_IMMUTABLE (1ULL << 0) +# define PR_FUTEX_HASH_GET_SLOTS 2 +# define PR_FUTEX_HASH_GET_IMMUTABLE 3 + #endif /* _LINUX_PRCTL_H */ From f1e30a4269f9636ffe6b0556f336633fac524ba7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Jun 2025 17:46:03 -0300 Subject: [PATCH 084/284] tools kvm headers arm64: Update KVM header from the kernel sources To pick the changes from: b7628c7973765c85 ("KVM: arm64: Allow userspace to limit the number of PMU counters for EL2 VMs") That doesn't result in any changes in tooling (built on a Raspberry PI 5 running Debian GNU/Linux 12 (bookworm)), only addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Marc Zyngier Cc: Namhyung Kim Link: https://lore.kernel.org/r/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/uapi/asm/kvm.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/arch/arm64/include/uapi/asm/kvm.h b/tools/arch/arm64/include/uapi/asm/kvm.h index af9d9acaf997..ed5f3892674c 100644 --- a/tools/arch/arm64/include/uapi/asm/kvm.h +++ b/tools/arch/arm64/include/uapi/asm/kvm.h @@ -431,10 +431,11 @@ enum { /* Device Control API on vcpu fd */ #define KVM_ARM_VCPU_PMU_V3_CTRL 0 -#define KVM_ARM_VCPU_PMU_V3_IRQ 0 -#define KVM_ARM_VCPU_PMU_V3_INIT 1 -#define KVM_ARM_VCPU_PMU_V3_FILTER 2 -#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3 +#define KVM_ARM_VCPU_PMU_V3_IRQ 0 +#define KVM_ARM_VCPU_PMU_V3_INIT 1 +#define KVM_ARM_VCPU_PMU_V3_FILTER 2 +#define KVM_ARM_VCPU_PMU_V3_SET_PMU 3 +#define KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS 4 #define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 From c30c18709587bba7753f8ee2419bacc9100d6b40 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 10:44:07 -0300 Subject: [PATCH 085/284] tools headers UAPI: Sync KVM's vmx.h header with the kernel sources To pick the changes in: 6c441e4d6e729616 ("KVM: TDX: Handle EXIT_REASON_OTHER_SMI") c42856af8f70d983 ("KVM: TDX: Add a place holder for handler of TDX hypercalls (TDG.VP.VMCALL)") That makes 'perf kvm-stat' aware of this new TDCALL exit reason, thus addressing the following perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/uapi/asm/vmx.h arch/x86/include/uapi/asm/vmx.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Isaku Yamahata Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Paolo Bonzini Link: https://lore.kernel.org/r/aErcVn_4plQyODR1@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/vmx.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/include/uapi/asm/vmx.h b/tools/arch/x86/include/uapi/asm/vmx.h index a5faf6d88f1b..f0f4a4cf84a7 100644 --- a/tools/arch/x86/include/uapi/asm/vmx.h +++ b/tools/arch/x86/include/uapi/asm/vmx.h @@ -34,6 +34,7 @@ #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_INIT_SIGNAL 3 #define EXIT_REASON_SIPI_SIGNAL 4 +#define EXIT_REASON_OTHER_SMI 6 #define EXIT_REASON_INTERRUPT_WINDOW 7 #define EXIT_REASON_NMI_WINDOW 8 @@ -92,6 +93,7 @@ #define EXIT_REASON_TPAUSE 68 #define EXIT_REASON_BUS_LOCK 74 #define EXIT_REASON_NOTIFY 75 +#define EXIT_REASON_TDCALL 77 #define VMX_EXIT_REASONS \ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ @@ -155,7 +157,8 @@ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" }, \ { EXIT_REASON_BUS_LOCK, "BUS_LOCK" }, \ - { EXIT_REASON_NOTIFY, "NOTIFY" } + { EXIT_REASON_NOTIFY, "NOTIFY" }, \ + { EXIT_REASON_TDCALL, "TDCALL" } #define VMX_EXIT_REASON_FLAGS \ { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } From da845e4bf4587cf4a212b607b6d508b00eab5217 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 10:49:47 -0300 Subject: [PATCH 086/284] tools headers x86 svm: Sync svm headers with the kernel sources To pick the changes in: 827547bc3a2a2af6 ("KVM: SVM: Add architectural definitions/assets for Bus Lock Threshold") That triggers: CC /tmp/build/perf-tools/arch/x86/util/kvm-stat.o LD /tmp/build/perf-tools/arch/x86/util/perf-util-in.o LD /tmp/build/perf-tools/arch/x86/perf-util-in.o LD /tmp/build/perf-tools/arch/perf-util-in.o LD /tmp/build/perf-tools/perf-util-in.o AR /tmp/build/perf-tools/libperf-util.a LINK /tmp/build/perf-tools/perf The SVM_EXIT_BUS_LOCK exit reason was added to SVM_EXIT_REASONS, used in kvm-stat.c. This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/uapi/asm/svm.h arch/x86/include/uapi/asm/svm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Nikunj A Dadhania Cc: Sean Christopherson Link: https://lore.kernel.org/r/aErcjuTTCVEZ-8Nb@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/svm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/arch/x86/include/uapi/asm/svm.h b/tools/arch/x86/include/uapi/asm/svm.h index ec1321248dac..9c640a521a67 100644 --- a/tools/arch/x86/include/uapi/asm/svm.h +++ b/tools/arch/x86/include/uapi/asm/svm.h @@ -95,6 +95,7 @@ #define SVM_EXIT_CR14_WRITE_TRAP 0x09e #define SVM_EXIT_CR15_WRITE_TRAP 0x09f #define SVM_EXIT_INVPCID 0x0a2 +#define SVM_EXIT_BUS_LOCK 0x0a5 #define SVM_EXIT_IDLE_HLT 0x0a6 #define SVM_EXIT_NPF 0x400 #define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 @@ -225,6 +226,7 @@ { SVM_EXIT_CR4_WRITE_TRAP, "write_cr4_trap" }, \ { SVM_EXIT_CR8_WRITE_TRAP, "write_cr8_trap" }, \ { SVM_EXIT_INVPCID, "invpcid" }, \ + { SVM_EXIT_BUS_LOCK, "buslock" }, \ { SVM_EXIT_IDLE_HLT, "idle-halt" }, \ { SVM_EXIT_NPF, "npf" }, \ { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ From 8fc50bec411e9cc7bba441dc6f7de9af30ea9cbb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 11:31:41 -0300 Subject: [PATCH 087/284] tools headers UAPI: Sync kvm header with the kernel sources To pick the changes in: c9c1e20b4c7d60fa ("KVM: x86: Introduce Intel specific quirk KVM_X86_QUIRK_IGNORE_GUEST_PAT") 012426d6f59cab21 ("KVM: TDX: Finalize VM initialization") c846b451d3c5d4ba ("KVM: TDX: Add an ioctl to create initial guest memory") 488808e682e72bdb ("KVM: x86: Introduce KVM_TDX_GET_CPUID") a50f673f25e0ba2b ("KVM: TDX: Do TDX specific vcpu initialization") 0186dd29a251866d ("KVM: TDX: add ioctl to initialize VM with TDX specific parameters") 61bb28279623b636 ("KVM: TDX: Get system-wide info about TDX module on initialization") b2aaf38ced6905b8 ("KVM: TDX: Add place holder for TDX VM specific mem_enc_op ioctl") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Please see tools/include/uapi/README for further details. Cc: Adrian Hunter Cc: Ian Rogers Cc: Isaku Yamahata Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Xiaoyao Li Cc: Yan Zhao Link: https://lore.kernel.org/r/aErqLPktXIzGyS-m@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/uapi/asm/kvm.h | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index b663d916f162..6f3499507c5e 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -441,6 +441,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS (1 << 6) #define KVM_X86_QUIRK_SLOT_ZAP_ALL (1 << 7) #define KVM_X86_QUIRK_STUFF_FEATURE_MSRS (1 << 8) +#define KVM_X86_QUIRK_IGNORE_GUEST_PAT (1 << 9) #define KVM_STATE_NESTED_FORMAT_VMX 0 #define KVM_STATE_NESTED_FORMAT_SVM 1 @@ -931,4 +932,74 @@ struct kvm_hyperv_eventfd { #define KVM_X86_SNP_VM 4 #define KVM_X86_TDX_VM 5 +/* Trust Domain eXtension sub-ioctl() commands. */ +enum kvm_tdx_cmd_id { + KVM_TDX_CAPABILITIES = 0, + KVM_TDX_INIT_VM, + KVM_TDX_INIT_VCPU, + KVM_TDX_INIT_MEM_REGION, + KVM_TDX_FINALIZE_VM, + KVM_TDX_GET_CPUID, + + KVM_TDX_CMD_NR_MAX, +}; + +struct kvm_tdx_cmd { + /* enum kvm_tdx_cmd_id */ + __u32 id; + /* flags for sub-commend. If sub-command doesn't use this, set zero. */ + __u32 flags; + /* + * data for each sub-command. An immediate or a pointer to the actual + * data in process virtual address. If sub-command doesn't use it, + * set zero. + */ + __u64 data; + /* + * Auxiliary error code. The sub-command may return TDX SEAMCALL + * status code in addition to -Exxx. + */ + __u64 hw_error; +}; + +struct kvm_tdx_capabilities { + __u64 supported_attrs; + __u64 supported_xfam; + __u64 reserved[254]; + + /* Configurable CPUID bits for userspace */ + struct kvm_cpuid2 cpuid; +}; + +struct kvm_tdx_init_vm { + __u64 attributes; + __u64 xfam; + __u64 mrconfigid[6]; /* sha384 digest */ + __u64 mrowner[6]; /* sha384 digest */ + __u64 mrownerconfig[6]; /* sha384 digest */ + + /* The total space for TD_PARAMS before the CPUIDs is 256 bytes */ + __u64 reserved[12]; + + /* + * Call KVM_TDX_INIT_VM before vcpu creation, thus before + * KVM_SET_CPUID2. + * This configuration supersedes KVM_SET_CPUID2s for VCPUs because the + * TDX module directly virtualizes those CPUIDs without VMM. The user + * space VMM, e.g. qemu, should make KVM_SET_CPUID2 consistent with + * those values. If it doesn't, KVM may have wrong idea of vCPUIDs of + * the guest, and KVM may wrongly emulate CPUIDs or MSRs that the TDX + * module doesn't virtualize. + */ + struct kvm_cpuid2 cpuid; +}; + +#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0) + +struct kvm_tdx_init_mem_region { + __u64 source_addr; + __u64 gpa; + __u64 nr_pages; +}; + #endif /* _ASM_X86_KVM_H */ From 47684bfb7c3405aa8f7bfb31f2aa642ffd2b640f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 11:55:54 -0300 Subject: [PATCH 088/284] perf beauty: Update copy of linux/socket.h with the kernel sources To pick the changes in: b1e904999542ad67 ("net: pass const to msg_data_left()") That don't result in any changes in the tables generated from that header. This silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Please see tools/include/uapi/README for details. Cc: Adrian Hunter Cc: Breno Leitao Cc: Ian Rogers Cc: Jakub Kicinski Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/aErrK24XLUILFH_P@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/linux/socket.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index c3322eb3d686..3b262487ec06 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -168,7 +168,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } -static inline size_t msg_data_left(struct msghdr *msg) +static inline size_t msg_data_left(const struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); } From c71bc5995409395d5c9094144534912886fd09a7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 12:03:28 -0300 Subject: [PATCH 089/284] tools headers UAPI: Sync the drm/drm.h with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Picking the changes from: c2d3a730069545f2 ("drm/syncobj: Extend EXPORT_SYNC_FILE for timeline syncobjs") Silencing these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/drm.h include/uapi/drm/drm.h No changes in tooling as these are just C comment documentation changes: $ tools/perf/trace/beauty/drm_ioctl.sh > before $ cp include/uapi/drm/drm.h tools/include/uapi/drm/drm.h $ tools/perf/trace/beauty/drm_ioctl.sh > after $ diff -u before after $ Cc: Adrian Hunter Cc: Christian König Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Rob Clark Link: https://lore.kernel.org/r/aErtHs3T2hdPjjHx@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/drm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/uapi/drm/drm.h b/tools/include/uapi/drm/drm.h index 7fba37b94401..e63a71d3c607 100644 --- a/tools/include/uapi/drm/drm.h +++ b/tools/include/uapi/drm/drm.h @@ -905,13 +905,17 @@ struct drm_syncobj_destroy { }; #define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_IMPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_FD_TO_HANDLE_FLAGS_TIMELINE (1 << 1) #define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_EXPORT_SYNC_FILE (1 << 0) +#define DRM_SYNCOBJ_HANDLE_TO_FD_FLAGS_TIMELINE (1 << 1) struct drm_syncobj_handle { __u32 handle; __u32 flags; __s32 fd; __u32 pad; + + __u64 point; }; struct drm_syncobj_transfer { From aa69783a5920d5924d95a198917fa3dd9c2c84a3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 12:08:45 -0300 Subject: [PATCH 090/284] tools headers UAPI: Sync linux/kvm.h with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To pick the changes in: 5b9db9c16f428ada ("RISC-V: KVM: add KVM_CAP_RISCV_MP_STATE_RESET") a7484c80e5ca1ae0 ("KVM: arm64: Allow userspace to request KVM_ARM_VCPU_EL2*") 79462faa2b2aa89d ("KVM: TDX: Handle TDG.VP.VMCALL") That just rebuilds perf, as these patches don't add any new KVM ioctl to be harvested for the the 'perf trace' ioctl syscall argument beautifiers. This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: Anup Patel Cc: Binbin Wu Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Marc Zyngier Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Radim Krčmář Link: https://lore.kernel.org/r/aEruUUJvR0bfCg7_@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index b6ae8ad8934b..d00b85cb168c 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -375,6 +375,7 @@ struct kvm_run { #define KVM_SYSTEM_EVENT_WAKEUP 4 #define KVM_SYSTEM_EVENT_SUSPEND 5 #define KVM_SYSTEM_EVENT_SEV_TERM 6 +#define KVM_SYSTEM_EVENT_TDX_FATAL 7 __u32 type; __u32 ndata; union { @@ -930,6 +931,9 @@ struct kvm_enable_cap { #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 #define KVM_CAP_X86_GUEST_MODE 238 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 +#define KVM_CAP_ARM_EL2 240 +#define KVM_CAP_ARM_EL2_E2H0 241 +#define KVM_CAP_RISCV_MP_STATE_RESET 242 struct kvm_irq_routing_irqchip { __u32 irqchip; From 00c8fde72fe2cdbd3892fbdc338c2d3a66ec8db3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 12:29:23 -0300 Subject: [PATCH 091/284] tools headers: Update the copy of x86's mem{cpy,set}_64.S used in 'perf bench' Also add SYM_PIC_ALIAS() to tools/perf/util/include/linux/linkage.h. This is to get the changes from: 419cbaf6a56a6e4b ("x86/boot: Add a bunch of PIC aliases") That addresses these perf tools build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S Cc: Adrian Hunter Cc: Ard Biesheuvel Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/aEry7L3fibwIG5au@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/lib/memcpy_64.S | 1 + tools/arch/x86/lib/memset_64.S | 1 + tools/perf/util/include/linux/linkage.h | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/tools/arch/x86/lib/memcpy_64.S b/tools/arch/x86/lib/memcpy_64.S index 59cf6f9065aa..ccc3d923fc1e 100644 --- a/tools/arch/x86/lib/memcpy_64.S +++ b/tools/arch/x86/lib/memcpy_64.S @@ -40,6 +40,7 @@ SYM_FUNC_END(__memcpy) EXPORT_SYMBOL(__memcpy) SYM_FUNC_ALIAS_MEMFUNC(memcpy, __memcpy) +SYM_PIC_ALIAS(memcpy) EXPORT_SYMBOL(memcpy) SYM_FUNC_START_LOCAL(memcpy_orig) diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S index d66b710d628f..fb5a03cf5ab7 100644 --- a/tools/arch/x86/lib/memset_64.S +++ b/tools/arch/x86/lib/memset_64.S @@ -42,6 +42,7 @@ SYM_FUNC_END(__memset) EXPORT_SYMBOL(__memset) SYM_FUNC_ALIAS_MEMFUNC(memset, __memset) +SYM_PIC_ALIAS(memset) EXPORT_SYMBOL(memset) SYM_FUNC_START_LOCAL(memset_orig) diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h index 178b00205fe6..89979ca23c3f 100644 --- a/tools/perf/util/include/linux/linkage.h +++ b/tools/perf/util/include/linux/linkage.h @@ -132,4 +132,8 @@ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) #endif +#ifndef SYM_PIC_ALIAS +#define SYM_PIC_ALIAS(sym) SYM_ALIAS(__pi_ ## sym, sym, SYM_T_FUNC, SYM_L_GLOBAL) +#endif + #endif /* PERF_LINUX_LINKAGE_H_ */ From 3417404c6f83c5d72d69ae81ded914bdc3841a49 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 12:53:37 -0300 Subject: [PATCH 092/284] tools headers: Syncronize linux/build_bug.h with the kernel sources To pick up the changes in: 243c90e917f5cfc9 ("build_bug.h: more user friendly error messages in BUILD_BUG_ON_ZERO()") This also needed to pick the __BUILD_BUG_ON_ZERO_MSG() in linux/compiler.h, that needed to be polished to avoid hitting old clang problems with _Static_assert on arrays of structs: Debian clang version 11.0.1-2~deb10u1 Debian clang version 11.0.1-2~deb10u1 $ make NO_LIBTRACEEVENT=1 ARCH= CROSS_COMPILE= EXTRA_CFLAGS= -C tools/perf O=/tmp/build/perf CC=clang btf_dump.c:895:18: error: type name does not allow storage class to be specified for (i = 0; i < ARRAY_SIZE(pads); i++) { ^ /git/perf-6.16.0-rc1/tools/include/linux/kernel.h:91:59: note: expanded from macro 'ARRAY_SIZE' #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) ^ /git/perf-6.16.0-rc1/tools/include/linux/compiler-gcc.h:26:28: note: expanded from macro '__must_be_array' #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) ^ /git/perf-6.16.0-rc1/tools/include/linux/build_bug.h:17:2: note: expanded from macro 'BUILD_BUG_ON_ZERO' __BUILD_BUG_ON_ZERO_MSG(e, ##__VA_ARGS__, #e " is true") ^ /git/perf-6.16.0-rc1/tools/include/linux/compiler.h:248:67: note: expanded from macro '__BUILD_BUG_ON_ZERO_MSG' #define __BUILD_BUG_ON_ZERO_MSG(e, msg, ...) ((int)sizeof(struct {_Static_assert(!(e), msg);})) ^ /usr/include/x86_64-linux-gnu/sys/cdefs.h:438:5: note: expanded from macro '_Static_assert' extern int (*__Static_assert_function (void)) \ ^ These also failed: toolsbuilder@five:~$ grep FAIL dm.log/summary | grep clang 1 72.87 almalinux:8 : FAIL clang version 19.1.7 ( 19.1.7-2.module_el8.10.0+3990+33d0d926) 15 73.39 centos:stream : FAIL clang version 17.0.6 (Red Hat 17.0.6-1.module_el8+767+9fa966b8) 36 87.14 opensuse:15.4 : FAIL clang version 15.0.7 37 80.08 opensuse:15.5 : FAIL clang version 15.0.7 40 72.12 oraclelinux:8 : FAIL clang version 16.0.6 (Red Hat 16.0.6-2.0.1.module+el8.9.0+90129+d3ee8717) 42 74.12 rockylinux:8 : FAIL clang version 16.0.6 (Red Hat 16.0.6-2.module+el8.9.0+1651+e10a8f6d) toolsbuilder@five:~$ This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/linux/build_bug.h include/linux/build_bug.h Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Vincent Mailhol Cc: Yury Norov Link: https://lore.kernel.org/r/aEszb7SSIJB6Lp6f@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/build_bug.h | 10 +++++----- tools/include/linux/compiler.h | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/include/linux/build_bug.h b/tools/include/linux/build_bug.h index b4898ff085de..ab2aa97bd8ce 100644 --- a/tools/include/linux/build_bug.h +++ b/tools/include/linux/build_bug.h @@ -4,17 +4,17 @@ #include -#ifdef __CHECKER__ -#define BUILD_BUG_ON_ZERO(e) (0) -#else /* __CHECKER__ */ /* * Force a compilation error if condition is true, but also produce a * result (of value 0 and type int), so the expression can be used * e.g. in a structure initializer (or where-ever else comma expressions * aren't permitted). + * + * Take an error message as an optional second argument. If omitted, + * default to the stringification of the tested expression. */ -#define BUILD_BUG_ON_ZERO(e) ((int)(sizeof(struct { int:(-!!(e)); }))) -#endif /* __CHECKER__ */ +#define BUILD_BUG_ON_ZERO(e, ...) \ + __BUILD_BUG_ON_ZERO_MSG(e, ##__VA_ARGS__, #e " is true") /* Force a compilation error if a constant expression is not a power of 2 */ #define __BUILD_BUG_ON_NOT_POWER_OF_2(n) \ diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h index d627e66a04a6..33411ca0cc90 100644 --- a/tools/include/linux/compiler.h +++ b/tools/include/linux/compiler.h @@ -244,6 +244,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s __asm__ ("" : "=r" (var) : "0" (var)) #endif +#ifndef __BUILD_BUG_ON_ZERO_MSG +#if defined(__clang__) +#define __BUILD_BUG_ON_ZERO_MSG(e, msg, ...) ((int)(sizeof(struct { int:(-!!(e)); }))) +#else +#define __BUILD_BUG_ON_ZERO_MSG(e, msg, ...) ((int)sizeof(struct {_Static_assert(!(e), msg);})) +#endif +#endif + #endif /* __ASSEMBLY__ */ #endif /* _TOOLS_LINUX_COMPILER_H */ From 6143374c6dc874d301dc16612aed144bf4544bae Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 17:58:05 -0300 Subject: [PATCH 093/284] tools arch x86: Sync the msr-index.h copy with the kernel sources To pick up the changes from these csets: 159013a7ca18c271 ("x86/its: Enumerate Indirect Target Selection (ITS) bug") f4138de5e41fae1a ("x86/msr: Standardize on u64 in ") ec980e4facef8110 ("perf/x86/intel: Support auto counter reload") That cause no changes to tooling as it doesn't include a new MSR to be captured by the tools/perf/trace/beauty/tracepoints/x86_msr.sh script. Just silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/msr-index.h arch/x86/include/asm/msr-index.h Cc: Adrian Hunter Cc: Dave Hansen Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Pawan Gupta Cc: Peter Zijlstra Link: https://lore.kernel.org/r/aEtAUg83OQGx8Kay@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/msr-index.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index e7d2f460fcc6..b7dded3c8113 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -533,7 +533,7 @@ #define MSR_HWP_CAPABILITIES 0x00000771 #define MSR_HWP_REQUEST_PKG 0x00000772 #define MSR_HWP_INTERRUPT 0x00000773 -#define MSR_HWP_REQUEST 0x00000774 +#define MSR_HWP_REQUEST 0x00000774 #define MSR_HWP_STATUS 0x00000777 /* CPUID.6.EAX */ @@ -550,16 +550,16 @@ #define HWP_LOWEST_PERF(x) (((x) >> 24) & 0xff) /* IA32_HWP_REQUEST */ -#define HWP_MIN_PERF(x) (x & 0xff) -#define HWP_MAX_PERF(x) ((x & 0xff) << 8) +#define HWP_MIN_PERF(x) (x & 0xff) +#define HWP_MAX_PERF(x) ((x & 0xff) << 8) #define HWP_DESIRED_PERF(x) ((x & 0xff) << 16) -#define HWP_ENERGY_PERF_PREFERENCE(x) (((unsigned long long) x & 0xff) << 24) +#define HWP_ENERGY_PERF_PREFERENCE(x) (((u64)x & 0xff) << 24) #define HWP_EPP_PERFORMANCE 0x00 #define HWP_EPP_BALANCE_PERFORMANCE 0x80 #define HWP_EPP_BALANCE_POWERSAVE 0xC0 #define HWP_EPP_POWERSAVE 0xFF -#define HWP_ACTIVITY_WINDOW(x) ((unsigned long long)(x & 0xff3) << 32) -#define HWP_PACKAGE_CONTROL(x) ((unsigned long long)(x & 0x1) << 42) +#define HWP_ACTIVITY_WINDOW(x) ((u64)(x & 0xff3) << 32) +#define HWP_PACKAGE_CONTROL(x) ((u64)(x & 0x1) << 42) /* IA32_HWP_STATUS */ #define HWP_GUARANTEED_CHANGE(x) (x & 0x1) @@ -602,7 +602,11 @@ /* V6 PMON MSR range */ #define MSR_IA32_PMC_V6_GP0_CTR 0x1900 #define MSR_IA32_PMC_V6_GP0_CFG_A 0x1901 +#define MSR_IA32_PMC_V6_GP0_CFG_B 0x1902 +#define MSR_IA32_PMC_V6_GP0_CFG_C 0x1903 #define MSR_IA32_PMC_V6_FX0_CTR 0x1980 +#define MSR_IA32_PMC_V6_FX0_CFG_B 0x1982 +#define MSR_IA32_PMC_V6_FX0_CFG_C 0x1983 #define MSR_IA32_PMC_V6_STEP 4 /* KeyID partitioning between MKTME and TDX */ From d8ab68bdb294b09a761e967dad374f2965e1913f Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Thu, 12 Jun 2025 12:15:56 +0200 Subject: [PATCH 094/284] scsi: target: Fix NULL pointer dereference in core_scsi3_decode_spec_i_port() The function core_scsi3_decode_spec_i_port(), in its error code path, unconditionally calls core_scsi3_lunacl_undepend_item() passing the dest_se_deve pointer, which may be NULL. This can lead to a NULL pointer dereference if dest_se_deve remains unset. SPC-3 PR SPEC_I_PT: Unable to locate dest_tpg Unable to handle kernel paging request at virtual address dfff800000000012 Call trace: core_scsi3_lunacl_undepend_item+0x2c/0xf0 [target_core_mod] (P) core_scsi3_decode_spec_i_port+0x120c/0x1c30 [target_core_mod] core_scsi3_emulate_pro_register+0x6b8/0xcd8 [target_core_mod] target_scsi3_emulate_pr_out+0x56c/0x840 [target_core_mod] Fix this by adding a NULL check before calling core_scsi3_lunacl_undepend_item() Signed-off-by: Maurizio Lombardi Link: https://lore.kernel.org/r/20250612101556.24829-1-mlombard@redhat.com Reviewed-by: Mike Christie Reviewed-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/target/target_core_pr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 34cf2c399b39..70905805cb17 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -1842,7 +1842,9 @@ core_scsi3_decode_spec_i_port( } kmem_cache_free(t10_pr_reg_cache, dest_pr_reg); - core_scsi3_lunacl_undepend_item(dest_se_deve); + + if (dest_se_deve) + core_scsi3_lunacl_undepend_item(dest_se_deve); if (is_local) continue; From 594902c986e269660302f09df9ec4bf1cf017b77 Mon Sep 17 00:00:00 2001 From: Qinyun Tan Date: Sat, 31 May 2025 02:20:53 +0800 Subject: [PATCH 095/284] x86,fs/resctrl: Remove inappropriate references to cacheinfo in the resctrl subsystem In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain structure representing a NUMA node relies on the cacheinfo interface (rdt_mon_domain::ci) to store L3 cache information (e.g., shared_cpu_map) for monitoring. The L3 cache information of a SNC NUMA node determines which domains are summed for the "top level" L3-scoped events. rdt_mon_domain::ci is initialized using the first online CPU of a NUMA node. When this CPU goes offline, its shared_cpu_map is cleared to contain only the offline CPU itself. Subsequently, attempting to read counters via smp_call_on_cpu(offline_cpu) fails (and error ignored), returning zero values for "top-level events" without any error indication. Replace the cacheinfo references in struct rdt_mon_domain and struct rmid_read with the cacheinfo ID (a unique identifier for the L3 cache). rdt_domain_hdr::cpu_mask contains the online CPUs associated with that domain. When reading "top-level events", select a CPU from rdt_domain_hdr::cpu_mask and utilize its L3 shared_cpu_map to determine valid CPUs for reading RMID counter via the MSR interface. Considering all CPUs associated with the L3 cache improves the chances of picking a housekeeping CPU on which the counter reading work can be queued, avoiding an unnecessary IPI. Fixes: 328ea68874642 ("x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files") Signed-off-by: Qinyun Tan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Tony Luck Link: https://lore.kernel.org/20250530182053.37502-2-qinyuntan@linux.alibaba.com --- arch/x86/kernel/cpu/resctrl/core.c | 6 ++++-- fs/resctrl/ctrlmondata.c | 13 +++++++++---- fs/resctrl/internal.h | 4 ++-- fs/resctrl/monitor.c | 6 ++++-- fs/resctrl/rdtgroup.c | 6 +++--- include/linux/resctrl.h | 4 ++-- 6 files changed, 24 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7109cbfcad4f..187d527ef73b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -498,6 +498,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; struct rdt_mon_domain *d; + struct cacheinfo *ci; int err; lockdep_assert_held(&domain_list_lock); @@ -525,12 +526,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; - d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!d->ci) { + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); mon_domain_free(hw_dom); return; } + d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); arch_mon_domain_online(r, d); diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c index 6ed2dfd4dbbd..d98e0d2de09f 100644 --- a/fs/resctrl/ctrlmondata.c +++ b/fs/resctrl/ctrlmondata.c @@ -594,9 +594,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) struct rmid_read rr = {0}; struct rdt_mon_domain *d; struct rdtgroup *rdtgrp; + int domid, cpu, ret = 0; struct rdt_resource *r; + struct cacheinfo *ci; struct mon_data *md; - int domid, ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -623,10 +624,14 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) * one that matches this cache id. */ list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; + if (d->ci_id == domid) { + rr.ci_id = d->ci_id; + cpu = cpumask_any(&d->hdr.cpu_mask); + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) + continue; mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); + &ci->shared_cpu_map, evtid, false); goto checkresult; } } diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h index 9a8cf6f11151..0a1eedba2b03 100644 --- a/fs/resctrl/internal.h +++ b/fs/resctrl/internal.h @@ -98,7 +98,7 @@ struct mon_data { * domains in @r sharing L3 @ci.id * @evtid: Which monitor event to read. * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains. * @err: Error encountered when reading counter. * @val: Returned value of event counter. If @rgrp is a parent resource group, * @val includes the sum of event counts from its child resource groups. @@ -112,7 +112,7 @@ struct rmid_read { struct rdt_mon_domain *d; enum resctrl_event_id evtid; bool first; - struct cacheinfo *ci; + unsigned int ci_id; int err; u64 val; void *arch_mon_ctx; diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c index bde2801289d3..f5637855c3ac 100644 --- a/fs/resctrl/monitor.c +++ b/fs/resctrl/monitor.c @@ -361,6 +361,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) { int cpu = smp_processor_id(); struct rdt_mon_domain *d; + struct cacheinfo *ci; struct mbm_state *m; int err, ret; u64 tval = 0; @@ -388,7 +389,8 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) } /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci || ci->id != rr->ci_id) return -EINVAL; /* @@ -400,7 +402,7 @@ static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) */ ret = -EINVAL; list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) + if (d->ci_id != rr->ci_id) continue; err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, rr->evtid, &tval, rr->arch_mon_ctx); diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c index 1beb124e25f6..77d08229d855 100644 --- a/fs/resctrl/rdtgroup.c +++ b/fs/resctrl/rdtgroup.c @@ -3036,7 +3036,7 @@ static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, char name[32]; snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); if (snc_mode) sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); @@ -3061,7 +3061,7 @@ static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, return -EPERM; list_for_each_entry(mevt, &r->evt_list, list) { - domid = do_sum ? d->ci->id : d->hdr.id; + domid = do_sum ? d->ci_id : d->hdr.id; priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); if (WARN_ON_ONCE(!priv)) return -EINVAL; @@ -3089,7 +3089,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, lockdep_assert_held(&rdtgroup_mutex); snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id); kn = kernfs_find_and_get(parent_kn, name); if (kn) { /* diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 9ba771f2ddea..6fb4894b8cfd 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -159,7 +159,7 @@ struct rdt_ctrl_domain { /** * struct rdt_mon_domain - group of CPUs sharing a resctrl monitor resource * @hdr: common header for different domain types - * @ci: cache info for this domain + * @ci_id: cache info id for this domain * @rmid_busy_llc: bitmap of which limbo RMIDs are above threshold * @mbm_total: saved state for MBM total bandwidth * @mbm_local: saved state for MBM local bandwidth @@ -170,7 +170,7 @@ struct rdt_ctrl_domain { */ struct rdt_mon_domain { struct rdt_domain_hdr hdr; - struct cacheinfo *ci; + unsigned int ci_id; unsigned long *rmid_busy_llc; struct mbm_state *mbm_total; struct mbm_state *mbm_local; From b2e673ae53ef4b943f68585207a5f21cfc9a0714 Mon Sep 17 00:00:00 2001 From: Avadhut Naik Date: Fri, 13 Jun 2025 00:51:35 +0000 Subject: [PATCH 096/284] EDAC/amd64: Correct number of UMCs for family 19h models 70h-7fh AMD's Family 19h-based Models 70h-7fh support 4 unified memory controllers (UMC) per processor die. The amd64_edac driver, however, assumes only 2 UMCs are supported since max_mcs variable for the models has not been explicitly set to 4. The same results in incomplete or incorrect memory information being logged to dmesg by the module during initialization in some instances. Fixes: 6c79e42169fe ("EDAC/amd64: Add support for ECC on family 19h model 60h-7Fh") Closes: https://lore.kernel.org/all/27dc093f-ce27-4c71-9e81-786150a040b6@reox.at/ Reported-by: reox Signed-off-by: Avadhut Naik Signed-off-by: Borislav Petkov (AMD) Cc: stable@kernel.org Link: https://lore.kernel.org/20250613005233.2330627-1-avadhut.naik@amd.com --- drivers/edac/amd64_edac.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 58b1482a0fbb..b681c0663203 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3879,6 +3879,7 @@ static int per_family_init(struct amd64_pvt *pvt) break; case 0x70 ... 0x7f: pvt->ctl_name = "F19h_M70h"; + pvt->max_mcs = 4; pvt->flags.zn_regs_v2 = 1; break; case 0x90 ... 0x9f: From 2a8a5a5dd06eef580f9818567773fd75057cb875 Mon Sep 17 00:00:00 2001 From: Vitaliy Shevtsov Date: Thu, 12 Jun 2025 21:35:18 +0500 Subject: [PATCH 097/284] scsi: elx: efct: Fix memory leak in efct_hw_parse_filter() strsep() modifies the address of the pointer passed to it so that it no longer points to the original address. This means kfree() gets the wrong pointer. Fix this by passing unmodified pointer returned from kstrdup() to kfree(). Found by Linux Verification Center (linuxtesting.org) with Svace. Fixes: 4df84e846624 ("scsi: elx: efct: Driver initialization routines") Signed-off-by: Vitaliy Shevtsov Link: https://lore.kernel.org/r/20250612163616.24298-1-v.shevtsov@mt-integration.ru Reviewed-by: Daniel Wagner Signed-off-by: Martin K. Petersen --- drivers/scsi/elx/efct/efct_hw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/elx/efct/efct_hw.c b/drivers/scsi/elx/efct/efct_hw.c index 5a5525054d71..5b079b8b7a08 100644 --- a/drivers/scsi/elx/efct/efct_hw.c +++ b/drivers/scsi/elx/efct/efct_hw.c @@ -1120,7 +1120,7 @@ int efct_hw_parse_filter(struct efct_hw *hw, void *value) { int rc = 0; - char *p = NULL; + char *p = NULL, *pp = NULL; char *token; u32 idx = 0; @@ -1132,6 +1132,7 @@ efct_hw_parse_filter(struct efct_hw *hw, void *value) efc_log_err(hw->os, "p is NULL\n"); return -ENOMEM; } + pp = p; idx = 0; while ((token = strsep(&p, ",")) && *token) { @@ -1144,7 +1145,7 @@ efct_hw_parse_filter(struct efct_hw *hw, void *value) if (idx == ARRAY_SIZE(hw->config.filter_def)) break; } - kfree(p); + kfree(pp); return rc; } From e1bc3a13bd775791cca0bb144d977b00f3598042 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Mon, 2 Jun 2025 12:14:02 -0300 Subject: [PATCH 098/284] drm/v3d: Avoid NULL pointer dereference in `v3d_job_update_stats()` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The following kernel Oops was recently reported by Mesa CI: [ 800.139824] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000588 [ 800.148619] Mem abort info: [ 800.151402] ESR = 0x0000000096000005 [ 800.155141] EC = 0x25: DABT (current EL), IL = 32 bits [ 800.160444] SET = 0, FnV = 0 [ 800.163488] EA = 0, S1PTW = 0 [ 800.166619] FSC = 0x05: level 1 translation fault [ 800.171487] Data abort info: [ 800.174357] ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 [ 800.179832] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 800.184873] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 800.190176] user pgtable: 4k pages, 39-bit VAs, pgdp=00000001014c2000 [ 800.196607] [0000000000000588] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 [ 800.205305] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP [ 800.211564] Modules linked in: vc4 snd_soc_hdmi_codec drm_display_helper v3d cec gpu_sched drm_dma_helper drm_shmem_helper drm_kms_helper drm drm_panel_orientation_quirks snd_soc_core snd_compress snd_pcm_dmaengine snd_pcm i2c_brcmstb snd_timer snd backlight [ 800.234448] CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.25+rpt-rpi-v8 #1 Debian 1:6.12.25-1+rpt1 [ 800.244182] Hardware name: Raspberry Pi 4 Model B Rev 1.4 (DT) [ 800.250005] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 800.256959] pc : v3d_job_update_stats+0x60/0x130 [v3d] [ 800.262112] lr : v3d_job_update_stats+0x48/0x130 [v3d] [ 800.267251] sp : ffffffc080003e60 [ 800.270555] x29: ffffffc080003e60 x28: ffffffd842784980 x27: 0224012000000000 [ 800.277687] x26: ffffffd84277f630 x25: ffffff81012fd800 x24: 0000000000000020 [ 800.284818] x23: ffffff8040238b08 x22: 0000000000000570 x21: 0000000000000158 [ 800.291948] x20: 0000000000000000 x19: ffffff8040238000 x18: 0000000000000000 [ 800.299078] x17: ffffffa8c1bd2000 x16: ffffffc080000000 x15: 0000000000000000 [ 800.306208] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 [ 800.313338] x11: 0000000000000040 x10: 0000000000001a40 x9 : ffffffd83b39757c [ 800.320468] x8 : ffffffd842786420 x7 : 7fffffffffffffff x6 : 0000000000ef32b0 [ 800.327598] x5 : 00ffffffffffffff x4 : 0000000000000015 x3 : ffffffd842784980 [ 800.334728] x2 : 0000000000000004 x1 : 0000000000010002 x0 : 000000ba4c0ca382 [ 800.341859] Call trace: [ 800.344294] v3d_job_update_stats+0x60/0x130 [v3d] [ 800.349086] v3d_irq+0x124/0x2e0 [v3d] [ 800.352835] __handle_irq_event_percpu+0x58/0x218 [ 800.357539] handle_irq_event+0x54/0xb8 [ 800.361369] handle_fasteoi_irq+0xac/0x240 [ 800.365458] handle_irq_desc+0x48/0x68 [ 800.369200] generic_handle_domain_irq+0x24/0x38 [ 800.373810] gic_handle_irq+0x48/0xd8 [ 800.377464] call_on_irq_stack+0x24/0x58 [ 800.381379] do_interrupt_handler+0x88/0x98 [ 800.385554] el1_interrupt+0x34/0x68 [ 800.389123] el1h_64_irq_handler+0x18/0x28 [ 800.393211] el1h_64_irq+0x64/0x68 [ 800.396603] default_idle_call+0x3c/0x168 [ 800.400606] do_idle+0x1fc/0x230 [ 800.403827] cpu_startup_entry+0x40/0x50 [ 800.407742] rest_init+0xe4/0xf0 [ 800.410962] start_kernel+0x5e8/0x790 [ 800.414616] __primary_switched+0x80/0x90 [ 800.418622] Code: 8b170277 8b160296 11000421 b9000861 (b9401ac1) [ 800.424707] ---[ end trace 0000000000000000 ]--- [ 800.457313] ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- This issue happens when the file descriptor is closed before the jobs submitted by it are completed. When the job completes, we update the global GPU stats and the per-fd GPU stats, which are exposed through fdinfo. If the file descriptor was closed, then the struct `v3d_file_priv` and its stats were already freed and we can't update the per-fd stats. Therefore, if the file descriptor was already closed, don't update the per-fd GPU stats, only update the global ones. Cc: stable@vger.kernel.org # v6.12+ Reviewed-by: Jose Maria Casanova Crespo Link: https://lore.kernel.org/r/20250602151451.10161-1-mcanal@igalia.com Signed-off-by: Maíra Canal --- drivers/gpu/drm/v3d/v3d_sched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 35f131a46d07..42df9d3567e7 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -199,7 +199,6 @@ v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue) struct v3d_dev *v3d = job->v3d; struct v3d_file_priv *file = job->file->driver_priv; struct v3d_stats *global_stats = &v3d->queue[queue].stats; - struct v3d_stats *local_stats = &file->stats[queue]; u64 now = local_clock(); unsigned long flags; @@ -209,7 +208,12 @@ v3d_job_update_stats(struct v3d_job *job, enum v3d_queue queue) else preempt_disable(); - v3d_stats_update(local_stats, now); + /* Don't update the local stats if the file context has already closed */ + if (file) + v3d_stats_update(&file->stats[queue], now); + else + drm_dbg(&v3d->drm, "The file descriptor was closed before job completion\n"); + v3d_stats_update(global_stats, now); if (IS_ENABLED(CONFIG_LOCKDEP)) From 61ee19dedb8d753249e20308782bf4e9e2fb7344 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Mon, 2 Jun 2025 10:22:16 -0300 Subject: [PATCH 099/284] drm/etnaviv: Protect the scheduler's pending list with its lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 704d3d60fec4 ("drm/etnaviv: don't block scheduler when GPU is still active") ensured that active jobs are returned to the pending list when extending the timeout. However, it didn't use the pending list's lock to manipulate the list, which causes a race condition as the scheduler's workqueues are running. Hold the lock while manipulating the scheduler's pending list to prevent a race. Cc: stable@vger.kernel.org Fixes: 704d3d60fec4 ("drm/etnaviv: don't block scheduler when GPU is still active") Reported-by: Philipp Stanner Closes: https://lore.kernel.org/dri-devel/964e59ba1539083ef29b06d3c78f5e2e9b138ab8.camel@mailbox.org/ Reviewed-by: Lucas Stach Reviewed-by: Philipp Stanner Link: https://lore.kernel.org/r/20250602132240.93314-1-mcanal@igalia.com Signed-off-by: Maíra Canal --- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index 76a3a3e517d8..71e2e6b9d713 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -35,6 +35,7 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) { struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job); + struct drm_gpu_scheduler *sched = sched_job->sched; struct etnaviv_gpu *gpu = submit->gpu; u32 dma_addr, primid = 0; int change; @@ -89,7 +90,9 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job return DRM_GPU_SCHED_STAT_NOMINAL; out_no_timeout: - list_add(&sched_job->list, &sched_job->sched->pending_list); + spin_lock(&sched->job_list_lock); + list_add(&sched_job->list, &sched->pending_list); + spin_unlock(&sched->job_list_lock); return DRM_GPU_SCHED_STAT_NOMINAL; } From 6aba0cb5bba6141158d5449f2cf53187b7f755f9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 5 Jun 2025 11:44:46 +0530 Subject: [PATCH 100/284] RISC-V: KVM: Fix the size parameter check in SBI SFENCE calls As-per the SBI specification, an SBI remote fence operation applies to the entire address space if either: 1) start_addr and size are both 0 2) size is equal to 2^XLEN-1 >From the above, only #1 is checked by SBI SFENCE calls so fix the size parameter check in SBI SFENCE calls to cover #2 as well. Fixes: 13acfec2dbcc ("RISC-V: KVM: Add remote HFENCE functions based on VCPU requests") Reviewed-by: Atish Patra Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250605061458.196003-2-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 5fbf3f94f1e8..9752d2ffff68 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -103,7 +103,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - if (cp->a2 == 0 && cp->a3 == 0) + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask); else kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, @@ -111,7 +111,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - if (cp->a2 == 0 && cp->a3 == 0) + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, hbase, hmask, cp->a4); else From 2e7be162996640bbe3b6da694cc064c511b8a5d9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 5 Jun 2025 11:44:47 +0530 Subject: [PATCH 101/284] RISC-V: KVM: Don't treat SBI HFENCE calls as NOPs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SBI specification clearly states that SBI HFENCE calls should return SBI_ERR_NOT_SUPPORTED when one of the target hart doesn’t support hypervisor extension (aka nested virtualization in-case of KVM RISC-V). Fixes: c7fa3c48de86 ("RISC-V: KVM: Treat SBI HFENCE calls as NOPs") Reviewed-by: Atish Patra Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250605061458.196003-3-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 9752d2ffff68..b17fad091bab 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -127,9 +127,9 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: /* * Until nested virtualization is implemented, the - * SBI HFENCE calls should be treated as NOPs + * SBI HFENCE calls should return not supported + * hence fallthrough. */ - break; default: retdata->err_val = SBI_ERR_NOT_SUPPORTED; } From 0a1db19f66c0960eb00e1f2ccd40708b6747f5b1 Mon Sep 17 00:00:00 2001 From: Sascha Hauer Date: Mon, 16 Jun 2025 15:45:03 +0200 Subject: [PATCH 102/284] gpio: pca953x: fix wrong error probe return value The second argument to dev_err_probe() is the error value. Pass the return value of devm_request_threaded_irq() there instead of the irq number. Signed-off-by: Sascha Hauer Fixes: c47f7ff0fe61 ("gpio: pca953x: Utilise dev_err_probe() where it makes sense") Link: https://lore.kernel.org/r/20250616134503.1201138-1-s.hauer@pengutronix.de Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-pca953x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index b852e4997629..e80a96f39788 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -974,7 +974,7 @@ static int pca953x_irq_setup(struct pca953x_chip *chip, int irq_base) IRQF_ONESHOT | IRQF_SHARED, dev_name(dev), chip); if (ret) - return dev_err_probe(dev, client->irq, "failed to request irq\n"); + return dev_err_probe(dev, ret, "failed to request irq\n"); return 0; } From 7f90d45e57cb2ef1f0adcaf925ddffdfc5e680ca Mon Sep 17 00:00:00 2001 From: Justin Sanders Date: Tue, 10 Jun 2025 17:05:59 +0000 Subject: [PATCH 103/284] aoe: clean device rq_list in aoedev_downdev() An aoe device's rq_list contains accepted block requests that are waiting to be transmitted to the aoe target. This queue was added as part of the conversion to blk_mq. However, the queue was not cleaned out when an aoe device is downed which caused blk_mq_freeze_queue() to sleep indefinitely waiting for those requests to complete, causing a hang. This fix cleans out the queue before calling blk_mq_freeze_queue(). Link: https://bugzilla.kernel.org/show_bug.cgi?id=212665 Fixes: 3582dd291788 ("aoe: convert aoeblk to blk-mq") Signed-off-by: Justin Sanders Link: https://lore.kernel.org/r/20250610170600.869-1-jsanders.devel@gmail.com Tested-By: Valentin Kleibel Signed-off-by: Jens Axboe --- drivers/block/aoe/aoedev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 141b2a0e03f2..8c18034cb3d6 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -198,6 +198,7 @@ aoedev_downdev(struct aoedev *d) { struct aoetgt *t, **tt, **te; struct list_head *head, *pos, *nx; + struct request *rq, *rqnext; int i; d->flags &= ~DEVFL_UP; @@ -223,6 +224,13 @@ aoedev_downdev(struct aoedev *d) /* clean out the in-process request (if any) */ aoe_failip(d); + /* clean out any queued block requests */ + list_for_each_entry_safe(rq, rqnext, &d->rq_list, queuelist) { + list_del_init(&rq->queuelist); + blk_mq_start_request(rq); + blk_mq_end_request(rq, BLK_STS_IOERR); + } + /* fast fail all pending I/O */ if (d->blkq) { /* UP is cleared, freeze+quiesce to insure all are errored */ From cffc873d68ab09a0432b8212008c5613f8a70a2c Mon Sep 17 00:00:00 2001 From: Justin Sanders Date: Tue, 10 Jun 2025 17:06:00 +0000 Subject: [PATCH 104/284] aoe: defer rexmit timer downdev work to workqueue When aoe's rexmit_timer() notices that an aoe target fails to respond to commands for more than aoe_deadsecs, it calls aoedev_downdev() which cleans the outstanding aoe and block queues. This can involve sleeping, such as in blk_mq_freeze_queue(), which should not occur in irq context. This patch defers that aoedev_downdev() call to the aoe device's workqueue. Link: https://bugzilla.kernel.org/show_bug.cgi?id=212665 Signed-off-by: Justin Sanders Link: https://lore.kernel.org/r/20250610170600.869-2-jsanders.devel@gmail.com Tested-By: Valentin Kleibel Signed-off-by: Jens Axboe --- drivers/block/aoe/aoe.h | 1 + drivers/block/aoe/aoecmd.c | 8 ++++++-- drivers/block/aoe/aoedev.c | 5 ++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 749ae1246f4c..d35caa3c69e1 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -80,6 +80,7 @@ enum { DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */ DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */ DEVFL_FREED = (1<<8), /* device has been cleaned up */ + DEVFL_DEAD = (1<<9), /* device has timed out of aoe_deadsecs */ }; enum { diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 92b06d1de4cc..6c94cfd1c480 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -754,7 +754,7 @@ rexmit_timer(struct timer_list *timer) utgts = count_targets(d, NULL); - if (d->flags & DEVFL_TKILL) { + if (d->flags & (DEVFL_TKILL | DEVFL_DEAD)) { spin_unlock_irqrestore(&d->lock, flags); return; } @@ -786,7 +786,8 @@ rexmit_timer(struct timer_list *timer) * to clean up. */ list_splice(&flist, &d->factive[0]); - aoedev_downdev(d); + d->flags |= DEVFL_DEAD; + queue_work(aoe_wq, &d->work); goto out; } @@ -898,6 +899,9 @@ aoecmd_sleepwork(struct work_struct *work) { struct aoedev *d = container_of(work, struct aoedev, work); + if (d->flags & DEVFL_DEAD) + aoedev_downdev(d); + if (d->flags & DEVFL_GDALLOC) aoeblk_gdalloc(d); diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 8c18034cb3d6..9b66f8ebce50 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -200,8 +200,11 @@ aoedev_downdev(struct aoedev *d) struct list_head *head, *pos, *nx; struct request *rq, *rqnext; int i; + unsigned long flags; - d->flags &= ~DEVFL_UP; + spin_lock_irqsave(&d->lock, flags); + d->flags &= ~(DEVFL_UP | DEVFL_DEAD); + spin_unlock_irqrestore(&d->lock, flags); /* clean out active and to-be-retransmitted buffers */ for (i = 0; i < NFACTIVE; i++) { From 91a7703a036b146481b8a0bd6efa6200d296ca5d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jun 2025 06:41:48 -0600 Subject: [PATCH 105/284] io_uring: remove duplicate io_uring_alloc_task_context() definition This function exists in both tctx.h (where it belongs) and in io_uring.h as a remnant of before the tctx handling code got split out. Remove the io_uring.h definition and ensure that sqpoll.c includes the tctx.h header to get the definition. Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 -- io_uring/sqpoll.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index d59c12277d58..66c1ca73f55e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -98,8 +98,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *coun struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 268d2fbe6160..fa5a6750ee52 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -16,6 +16,7 @@ #include #include "io_uring.h" +#include "tctx.h" #include "napi.h" #include "sqpoll.h" From f2320f1dd6f6f82cb2c7aff23a12bab537bdea89 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Jun 2025 06:43:18 -0600 Subject: [PATCH 106/284] io_uring/sqpoll: don't put task_struct on tctx setup failure A recent commit moved the error handling of sqpoll thread and tctx failures into the thread itself, as part of fixing an issue. However, it missed that tctx allocation may also fail, and that io_sq_offload_create() does its own error handling for the task_struct in that case. Remove the manual task putting in io_sq_offload_create(), as io_sq_thread() will notice that the tctx did not get setup and hence it should put itself and exit. Reported-by: syzbot+763e12bbf004fb1062e4@syzkaller.appspotmail.com Fixes: ac0b8b327a56 ("io_uring: fix use-after-free of sq->thread in __io_uring_show_fdinfo()") Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index fa5a6750ee52..a3f11349ce06 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -420,7 +420,6 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct task_struct *task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ @@ -499,7 +498,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, rcu_assign_pointer(sqd->thread, tsk); mutex_unlock(&sqd->lock); - task_to_put = get_task_struct(tsk); + get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -514,8 +513,6 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); - if (task_to_put) - put_task_struct(task_to_put); return ret; } From a766cfbbeb3a74397965a8fa2e9a402026d3e1d8 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 12 Jun 2025 22:28:56 -0700 Subject: [PATCH 107/284] bpf: Mark dentry->d_inode as trusted_or_null LSM hooks such as security_path_mknod() and security_inode_rename() have access to newly allocated negative dentry, which has NULL d_inode. Therefore, it is necessary to do the NULL pointer check for d_inode. Also add selftests that checks the verifier enforces the NULL pointer check. Signed-off-by: Song Liu Reviewed-by: Matt Bobrowski Link: https://lore.kernel.org/r/20250613052857.1992233-1-song@kernel.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 5 ++--- .../selftests/bpf/progs/verifier_vfs_accept.c | 18 ++++++++++++++++++ .../selftests/bpf/progs/verifier_vfs_reject.c | 15 +++++++++++++++ 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7d6e0c5928b..169845710c7e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7027,8 +7027,7 @@ BTF_TYPE_SAFE_TRUSTED(struct file) { struct inode *f_inode; }; -BTF_TYPE_SAFE_TRUSTED(struct dentry) { - /* no negative dentry-s in places where bpf can see it */ +BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry) { struct inode *d_inode; }; @@ -7066,7 +7065,6 @@ static bool type_is_trusted(struct bpf_verifier_env *env, BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm)); BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file)); - BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted"); } @@ -7076,6 +7074,7 @@ static bool type_is_trusted_or_null(struct bpf_verifier_env *env, const char *field_name, u32 btf_id) { BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct socket)); + BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED_OR_NULL(struct dentry)); return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted_or_null"); diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c index a7c0a553aa50..3e2d76ee8050 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_accept.c @@ -2,6 +2,7 @@ /* Copyright (c) 2024 Google LLC. */ #include +#include #include #include @@ -82,4 +83,21 @@ int BPF_PROG(path_d_path_from_file_argument, struct file *file) return 0; } +SEC("lsm.s/inode_rename") +__success +int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *inode = new_dentry->d_inode; + ino_t ino; + + if (!inode) + return 0; + ino = inode->i_ino; + if (ino == 0) + return -EACCES; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c index d6d3f4fcb24c..4b392c6c8fc4 100644 --- a/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c +++ b/tools/testing/selftests/bpf/progs/verifier_vfs_reject.c @@ -2,6 +2,7 @@ /* Copyright (c) 2024 Google LLC. */ #include +#include #include #include #include @@ -158,4 +159,18 @@ int BPF_PROG(path_d_path_kfunc_non_lsm, struct path *path, struct file *f) return 0; } +SEC("lsm.s/inode_rename") +__failure __msg("invalid mem access 'trusted_ptr_or_null_'") +int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct inode *inode = new_dentry->d_inode; + ino_t ino; + + ino = inode->i_ino; + if (ino == 0) + return -EACCES; + return 0; +} char _license[] SEC("license") = "GPL"; From bb6b4143503750318a3f85975186db899a3caaf2 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 18:08:40 -0300 Subject: [PATCH 108/284] tools arch amd ibs: Sync ibs.h with the kernel sources To pick up the changes from: 861c6b1185fbb2e3 ("x86/platform/amd: Add standard header guards to ") A small change to tools/perf/check-headers.sh was made to cope with the move of this header done in: 3846389c03a85188 ("x86/platform/amd: Move the header to ") That don't result in any changes in the tools, just address this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/amd/ibs.h arch/x86/include/asm/amd/ibs.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Link: https://lore.kernel.org/r/aEtCi0pup5FEwnzn@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/amd/ibs.h | 5 +++++ tools/perf/check-headers.sh | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/arch/x86/include/asm/amd/ibs.h b/tools/arch/x86/include/asm/amd/ibs.h index 300b6e0765b2..cbce54fec7b9 100644 --- a/tools/arch/x86/include/asm/amd/ibs.h +++ b/tools/arch/x86/include/asm/amd/ibs.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_AMD_IBS_H +#define _ASM_X86_AMD_IBS_H + /* * From PPR Vol 1 for AMD Family 19h Model 01h B1 * 55898 Rev 0.35 - Feb 5, 2021 @@ -151,3 +154,5 @@ struct perf_ibs_data { }; u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; }; + +#endif /* _ASM_X86_AMD_IBS_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index e9fab20e9330..8085e4d1d8af 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -186,7 +186,7 @@ done # diff with extra ignore lines check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memcpy_\(erms\|orig\))" -I"^#include "' check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include " -I"^SYM_FUNC_START\(_LOCAL\)*(memset_\(erms\|orig\))"' -check arch/x86/include/asm/amd/ibs.h '-I "^#include [<\"]\(asm/\)*msr-index.h"' +check arch/x86/include/asm/amd/ibs.h '-I "^#include .*/msr-index.h"' check arch/arm64/include/asm/cputype.h '-I "^#include [<\"]\(asm/\)*sysreg.h"' check include/linux/unaligned.h '-I "^#include " -I "^#include " -I "^#pragma GCC diagnostic"' check include/uapi/asm-generic/mman.h '-I "^#include <\(uapi/\)*asm-generic/mman-common\(-tools\)*.h>"' From fc92099902fbf21000554678a47654b029c15a4d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 12 Jun 2025 12:36:06 -0300 Subject: [PATCH 109/284] tools headers: Synchronize linux/bits.h with the kernel sources To pick up the changes in this cset: 1e7933a575ed8af4 ("uapi: Revert "bitops: avoid integer overflow in GENMASK(_ULL)"") 5b572e8a9f3dcd6e ("bits: introduce fixed-type BIT_U*()") 19408200c094858d ("bits: introduce fixed-type GENMASK_U*()") 31299a5e02112411 ("bits: add comments and newlines to #if, #else and #endif directives") This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/linux/bits.h include/linux/bits.h Please see tools/include/uapi/README for further details. Acked-by: Vincent Mailhol Cc: I Hsin Cheng Cc: Yury Norov Cc: Adrian Hunter Cc: Ian Rogers Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Lucas De Marchi Cc: Namhyung Kim Cc: Yury Norov Link: https://lore.kernel.org/r/aEr0ZJ60EbshEy6p@x1 Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/bits.h | 4 +-- tools/include/linux/bits.h | 57 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/include/uapi/linux/bits.h b/include/uapi/linux/bits.h index a04afef9efca..682b406e1067 100644 --- a/include/uapi/linux/bits.h +++ b/include/uapi/linux/bits.h @@ -4,9 +4,9 @@ #ifndef _UAPI_LINUX_BITS_H #define _UAPI_LINUX_BITS_H -#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (__BITS_PER_LONG - 1 - (h)))) +#define __GENMASK(h, l) (((~_UL(0)) << (l)) & (~_UL(0) >> (BITS_PER_LONG - 1 - (h)))) -#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (__BITS_PER_LONG_LONG - 1 - (h)))) +#define __GENMASK_ULL(h, l) (((~_ULL(0)) << (l)) & (~_ULL(0) >> (BITS_PER_LONG_LONG - 1 - (h)))) #define __GENMASK_U128(h, l) \ ((_BIT128((h)) << 1) - (_BIT128(l))) diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h index 14fd0ca9a6cd..7ad056219115 100644 --- a/tools/include/linux/bits.h +++ b/tools/include/linux/bits.h @@ -12,6 +12,7 @@ #define BIT_ULL_MASK(nr) (ULL(1) << ((nr) % BITS_PER_LONG_LONG)) #define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) #define BITS_PER_BYTE 8 +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) /* * Create a contiguous bitmask starting at bit position @l and ending at @@ -19,16 +20,68 @@ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. */ #if !defined(__ASSEMBLY__) + +/* + * Missing asm support + * + * GENMASK_U*() and BIT_U*() depend on BITS_PER_TYPE() which relies on sizeof(), + * something not available in asm. Nevertheless, fixed width integers is a C + * concept. Assembly code can rely on the long and long long versions instead. + */ + #include #include +#include + #define GENMASK_INPUT_CHECK(h, l) BUILD_BUG_ON_ZERO(const_true((l) > (h))) -#else + +/* + * Generate a mask for the specified type @t. Additional checks are made to + * guarantee the value returned fits in that type, relying on + * -Wshift-count-overflow compiler check to detect incompatible arguments. + * For example, all these create build errors or warnings: + * + * - GENMASK(15, 20): wrong argument order + * - GENMASK(72, 15): doesn't fit unsigned long + * - GENMASK_U32(33, 15): doesn't fit in a u32 + */ +#define GENMASK_TYPE(t, h, l) \ + ((t)(GENMASK_INPUT_CHECK(h, l) + \ + (type_max(t) << (l) & \ + type_max(t) >> (BITS_PER_TYPE(t) - 1 - (h))))) + +#define GENMASK_U8(h, l) GENMASK_TYPE(u8, h, l) +#define GENMASK_U16(h, l) GENMASK_TYPE(u16, h, l) +#define GENMASK_U32(h, l) GENMASK_TYPE(u32, h, l) +#define GENMASK_U64(h, l) GENMASK_TYPE(u64, h, l) + +/* + * Fixed-type variants of BIT(), with additional checks like GENMASK_TYPE(). The + * following examples generate compiler warnings due to -Wshift-count-overflow: + * + * - BIT_U8(8) + * - BIT_U32(-1) + * - BIT_U32(40) + */ +#define BIT_INPUT_CHECK(type, nr) \ + BUILD_BUG_ON_ZERO(const_true((nr) >= BITS_PER_TYPE(type))) + +#define BIT_TYPE(type, nr) ((type)(BIT_INPUT_CHECK(type, nr) + BIT_ULL(nr))) + +#define BIT_U8(nr) BIT_TYPE(u8, nr) +#define BIT_U16(nr) BIT_TYPE(u16, nr) +#define BIT_U32(nr) BIT_TYPE(u32, nr) +#define BIT_U64(nr) BIT_TYPE(u64, nr) + +#else /* defined(__ASSEMBLY__) */ + /* * BUILD_BUG_ON_ZERO is not available in h files included from asm files, * disable the input check if that is the case. */ #define GENMASK_INPUT_CHECK(h, l) 0 -#endif + +#endif /* !defined(__ASSEMBLY__) */ #define GENMASK(h, l) \ (GENMASK_INPUT_CHECK(h, l) + __GENMASK(h, l)) From ebec62bc7ec435b475722a5467d67c720a1ad79f Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 13 Jun 2025 17:41:05 -0700 Subject: [PATCH 110/284] perf evsel: Missed close() when probing hybrid core PMUs Add missing close() to avoid leaking perf events. In past perfs this mattered little as the function was just used by 'perf list'. As the function is now used to detect hybrid PMUs leaking the perf event is somewhat more painful. Fixes: b41f1cec91c37eee ("perf list: Skip unsupported events") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiapeng Chong Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tiezhu Yang Link: https://lore.kernel.org/r/20250614004108.1650988-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/print-events.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/util/print-events.c b/tools/perf/util/print-events.c index a786cbfb0ff5..83aaf7cda635 100644 --- a/tools/perf/util/print-events.c +++ b/tools/perf/util/print-events.c @@ -268,6 +268,7 @@ bool is_event_supported(u8 type, u64 config) ret = evsel__open(evsel, NULL, tmap) >= 0; } + evsel__close(evsel); evsel__delete(evsel); } From 19f4422d485b2d0a935117a1a16015328f99be25 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 13 Jun 2025 17:41:04 -0700 Subject: [PATCH 111/284] perf test: Directory file descriptor leak Add missed close when iterating over the script directories. Fixes: f3295f5b067d3c26 ("perf tests: Use scandirat for shell script finding") Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiapeng Chong Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Michael Petlan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Tiezhu Yang Link: https://lore.kernel.org/r/20250614004108.1650988-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/tests-scripts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/perf/tests/tests-scripts.c b/tools/perf/tests/tests-scripts.c index 1d5759d08141..3a2a8438f9af 100644 --- a/tools/perf/tests/tests-scripts.c +++ b/tools/perf/tests/tests-scripts.c @@ -260,6 +260,7 @@ static void append_scripts_in_dir(int dir_fd, continue; /* Skip scripts that have a separate driver. */ fd = openat(dir_fd, ent->d_name, O_PATH); append_scripts_in_dir(fd, result, result_sz); + close(fd); } for (i = 0; i < n_dirs; i++) /* Clean up */ zfree(&entlist[i]); From 1c85c94b3767895d70b7a5a49b111f974f5660ec Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 16 Jun 2025 14:33:17 -0300 Subject: [PATCH 112/284] perf bench futex: Fix prctl include in musl libc Namhyung Kim reported: I've updated the perf-tools-next to v6.16-rc1 and found a build error like below on alpine linux 3.18. In file included from bench/futex.c:6: /usr/include/sys/prctl.h:88:8: error: redefinition of 'struct prctl_mm_map' 88 | struct prctl_mm_map { | ^~~~~~~~~~~~ In file included from bench/futex.c:5: /linux/tools/include/uapi/linux/prctl.h:134:8: note: originally defined here 134 | struct prctl_mm_map { | ^~~~~~~~~~~~ make[4]: *** [/linux/tools/build/Makefile.build:86: /build/bench/futex.o] Error 1 git bisect says it's the first commit introduced the failure. So both /usr/include/sys/prctl.h and /linux/tools/include/uapi/linux/prctl.h provide struct prctl_mm_map but their include guard must be different. /usr/include/sys/prctl.h provided by glibc contains the prctl() declaration. It includes also linux/prctl.h. The /usr/include/sys/prctl.h on alpine linux is different. This is probably coming from musl. It contains the PR_* definition and the prctl() declaration. So it clashes here because now the one struct is available twice. The man page for prctl(2) says: | #include /* Definition of PR_* constants */ | #include so musl doesn't follow this. So don't include linux/prctl.h explicitely and add some new defines needed if they aren't available. Acked-by: Sebastian Andrzej Siewior Reported-by: Namhyung Kim Closes: https://lore.kernel.org/r/20250611092542.F4ooE2FL@linutronix.de Link: https://www.openwall.com/lists/musl/2025/06/12/11 Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/futex-hash.c | 1 - tools/perf/bench/futex.c | 9 ++++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c index fdf133c9520f..d2d6d7f3ea33 100644 --- a/tools/perf/bench/futex-hash.c +++ b/tools/perf/bench/futex-hash.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/tools/perf/bench/futex.c b/tools/perf/bench/futex.c index 26382e4d8d4c..4c4fee107e59 100644 --- a/tools/perf/bench/futex.c +++ b/tools/perf/bench/futex.c @@ -2,11 +2,18 @@ #include #include #include -#include #include #include "futex.h" +#ifndef PR_FUTEX_HASH +#define PR_FUTEX_HASH 78 +# define PR_FUTEX_HASH_SET_SLOTS 1 +# define FH_FLAG_IMMUTABLE (1ULL << 0) +# define PR_FUTEX_HASH_GET_SLOTS 2 +# define PR_FUTEX_HASH_GET_IMMUTABLE 3 +#endif // PR_FUTEX_HASH + void futex_set_nbuckets_param(struct bench_futex_parameters *params) { unsigned long flags; From d222b6e6fb31e320eca506e665694d8ddf459157 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 16 Jun 2025 14:18:55 -0300 Subject: [PATCH 113/284] tools headers x86 cpufeatures: Sync with the kernel sources To pick the changes from: faad6645e1128ec2 ("x86/cpufeatures: Add CPUID feature bit for the Bus Lock Threshold") 159013a7ca18c271 ("x86/its: Enumerate Indirect Target Selection (ITS) bug") f9f27c4a377a8b45 ("x86/cpufeatures: Add "Allowed SEV Features" Feature") b02dc185ee86836c ("x86/cpufeatures: Add X86_FEATURE_APX") d88bb2ded2efdc38 ("KVM: x86: Advertise support for AMD's PREFETCHI") This causes these perf files to be rebuilt and brings some X86_FEATURE that may be used by: CC /tmp/build/perf/bench/mem-memcpy-x86-64-asm.o CC /tmp/build/perf/bench/mem-memset-x86-64-asm.o And addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/x86/include/asm/cpufeatures.h arch/x86/include/asm/cpufeatures.h Please see tools/include/uapi/README for further details. Cc: Adrian Hunter Cc: Babu Moger Cc: Chang S. Bae Cc: Dave Hansen Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Kan Liang Cc: Kishon Vijay Abraham I Cc: Manali Shukla Cc: Namhyung Kim Cc: Pawan Gupta Cc: Sean Christopherson Link: https://lore.kernel.org/r/aFBWAI3kHYX5aL9G@x1 Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/x86/include/asm/cpufeatures.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index e02be2962a01..ee176236c2be 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -336,7 +336,7 @@ #define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_AMD_STIBP_ALWAYS_ON (13*32+17) /* Single Thread Indirect Branch Predictors always-on preferred */ -#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ +#define X86_FEATURE_AMD_IBRS_SAME_MODE (13*32+19) /* Indirect Branch Restricted Speculation same mode protection*/ #define X86_FEATURE_AMD_PPIN (13*32+23) /* "amd_ppin" Protected Processor Inventory Number */ #define X86_FEATURE_AMD_SSBD (13*32+24) /* Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* "virt_ssbd" Virtualized Speculative Store Bypass Disable */ @@ -379,6 +379,7 @@ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* "v_spec_ctrl" Virtual SPEC_CTRL */ #define X86_FEATURE_VNMI (15*32+25) /* "vnmi" Virtual NMI */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* SVME addr check */ +#define X86_FEATURE_BUS_LOCK_THRESHOLD (15*32+29) /* Bus lock threshold */ #define X86_FEATURE_IDLE_HLT (15*32+30) /* IDLE HLT intercept */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ @@ -447,6 +448,7 @@ #define X86_FEATURE_DEBUG_SWAP (19*32+14) /* "debug_swap" SEV-ES full debug state swap support */ #define X86_FEATURE_RMPREAD (19*32+21) /* RMPREAD instruction */ #define X86_FEATURE_SEGMENTED_RMP (19*32+23) /* Segmented RMP support */ +#define X86_FEATURE_ALLOWED_SEV_FEATURES (19*32+27) /* Allowed SEV Features */ #define X86_FEATURE_SVSM (19*32+28) /* "svsm" SVSM present */ #define X86_FEATURE_HV_INUSE_WR_ALLOWED (19*32+30) /* Allow Write to in-use hypervisor-owned pages */ @@ -458,6 +460,7 @@ #define X86_FEATURE_AUTOIBRS (20*32+ 8) /* Automatic IBRS */ #define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* SMM_CTL MSR is not present */ +#define X86_FEATURE_PREFETCHI (20*32+20) /* Prefetch Data/Instruction to Cache Level */ #define X86_FEATURE_SBPB (20*32+27) /* Selective Branch Prediction Barrier */ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* CPU is not affected by SRSO */ @@ -482,7 +485,8 @@ #define X86_FEATURE_AMD_HTR_CORES (21*32+ 6) /* Heterogeneous Core Topology */ #define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32+ 7) /* Workload Classification */ #define X86_FEATURE_PREFER_YMM (21*32+ 8) /* Avoid ZMM registers due to downclocking */ -#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+ 9) /* Use thunk for indirect branches in lower half of cacheline */ +#define X86_FEATURE_APX (21*32+ 9) /* Advanced Performance Extensions */ +#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32+10) /* Use thunk for indirect branches in lower half of cacheline */ /* * BUG word(s) @@ -535,6 +539,8 @@ #define X86_BUG_BHI X86_BUG( 1*32+ 3) /* "bhi" CPU is affected by Branch History Injection */ #define X86_BUG_IBPB_NO_RET X86_BUG( 1*32+ 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #define X86_BUG_SPECTRE_V2_USER X86_BUG( 1*32+ 5) /* "spectre_v2_user" CPU is affected by Spectre variant 2 attack between user processes */ -#define X86_BUG_ITS X86_BUG( 1*32+ 6) /* "its" CPU is affected by Indirect Target Selection */ -#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 7) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ +#define X86_BUG_OLD_MICROCODE X86_BUG( 1*32+ 6) /* "old_microcode" CPU has old microcode, it is surely vulnerable to something */ +#define X86_BUG_ITS X86_BUG( 1*32+ 7) /* "its" CPU is affected by Indirect Target Selection */ +#define X86_BUG_ITS_NATIVE_ONLY X86_BUG( 1*32+ 8) /* "its_native_only" CPU is affected by ITS, VMX is not affected */ + #endif /* _ASM_X86_CPUFEATURES_H */ From 94a17f2dc90bc7eae36c0f478515d4bd1c23e877 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 10 Jun 2025 15:24:20 -0700 Subject: [PATCH 114/284] x86/mm: Disable INVLPGB when PTI is enabled PTI uses separate ASIDs (aka. PCIDs) for kernel and user address spaces. When the kernel needs to flush the user address space, it just sets a bit in a bitmap and then flushes the entire PCID on the next switch to userspace. This bitmap is a single 'unsigned long' which is plenty for all 6 dynamic ASIDs. But, unfortunately, the INVLPGB support brings along a bunch more user ASIDs, as many as ~2k more. The bitmap can't address that many. Fortunately, the bitmap is only needed for PTI and all the CPUs with INVLPGB are AMD CPUs that aren't vulnerable to Meltdown and don't need PTI. The only way someone can run into an issue in practice is by booting with pti=on on a newer AMD CPU. Disable INVLPGB if PTI is enabled. Avoid overrunning the small bitmap. Note: this will be fixed up properly by making the bitmap bigger. For now, just avoid the mostly theoretical bug. Fixes: 4afeb0ed1753 ("x86/mm: Enable broadcast TLB invalidation for multi-threaded processes") Signed-off-by: Dave Hansen Acked-by: Rik van Riel Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250610222420.E8CBF472%40davehans-spike.ostc.intel.com --- arch/x86/mm/pti.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 190299834011..c0c40b67524e 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -98,6 +98,11 @@ void __init pti_check_boottime_disable(void) return; setup_force_cpu_cap(X86_FEATURE_PTI); + + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + pr_debug("PTI enabled, disabling INVLPGB\n"); + setup_clear_cpu_cap(X86_FEATURE_INVLPGB); + } } static int __init pti_parse_cmdline(char *arg) From 3c902383f2da91cba3821b73aa6edd49f4db6023 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 16 Jun 2025 12:04:32 +0200 Subject: [PATCH 115/284] x86/its: Fix an ifdef typo in its_alloc() Commit a82b26451de1 ("x86/its: explicitly manage permissions for ITS pages") reworks its_alloc() and introduces a typo in an ifdef conditional, referring to CONFIG_MODULE instead of CONFIG_MODULES. Fix this typo in its_alloc(). Fixes: a82b26451de1 ("x86/its: explicitly manage permissions for ITS pages") Signed-off-by: Lukas Bulwahn Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250616100432.22941-1-lukas.bulwahn%40redhat.com --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6455f7f751b3..9ae80fa904a2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -228,7 +228,7 @@ static void *its_alloc(void) struct its_array *pages = &its_pages; void *page; -#ifdef CONFIG_MODULE +#ifdef CONFIG_MODULES if (its_mod) pages = &its_mod->arch.its_pages; #endif From cb6075bc62dc6a9cd7ab3572758685fdf78e3e20 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 6 Jun 2025 13:10:34 -0400 Subject: [PATCH 116/284] x86/mm: Fix early boot use of INVPLGB The INVLPGB instruction has limits on how many pages it can invalidate at once. That limit is enumerated in CPUID, read by the kernel, and stored in 'invpgb_count_max'. Ranged invalidation, like invlpgb_kernel_range_flush() break up their invalidations so that they do not exceed the limit. However, early boot code currently attempts to do ranged invalidation before populating 'invlpgb_count_max'. There is a for loop which is basically: for (...; addr < end; addr += invlpgb_count_max*PAGE_SIZE) If invlpgb_kernel_range_flush is called before the kernel has read the value of invlpgb_count_max from the hardware, the normally bounded loop can become an infinite loop if invlpgb_count_max is initialized to zero. Fix that issue by initializing invlpgb_count_max to 1. This way INVPLGB at early boot time will be a little bit slower than normal (with initialized invplgb_count_max), and not an instant hang at bootup time. Fixes: b7aa05cbdc52 ("x86/mm: Add INVLPGB support code") Signed-off-by: Rik van Riel Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250606171112.4013261-3-riel%40surriel.com --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 93da466dfe2c..b2ad8d13211a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -31,7 +31,7 @@ #include "cpu.h" -u16 invlpgb_count_max __ro_after_init; +u16 invlpgb_count_max __ro_after_init = 1; static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { From 905eeb2b7c33adda23a966aeb811ab4cb9e62031 Mon Sep 17 00:00:00 2001 From: Tatsuyuki Ishi Date: Thu, 12 Jun 2025 19:18:25 +0900 Subject: [PATCH 117/284] erofs: impersonate the opener's credentials when accessing backing file Previously, file operations on a file-backed mount used the current process' credentials to access the backing FD. Attempting to do so on Android lead to SELinux denials, as ACL rules on the backing file (e.g. /system/apex/foo.apex) is restricted to a small set of process. Arguably, this error is redundant and leaking implementation details, as access to files on a mount is already ACL'ed by path. Instead, override to use the opener's cred when accessing the backing file. This makes the behavior similar to a loop-backed mount, which uses kworker cred when accessing the backing file and does not cause SELinux denials. Signed-off-by: Tatsuyuki Ishi Reviewed-by: Gao Xiang Reviewed-by: Hongbo Li Link: https://lore.kernel.org/r/20250612-b4-erofs-impersonate-v1-1-8ea7d6f65171@google.com Signed-off-by: Gao Xiang --- fs/erofs/fileio.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 7d81f504bff0..df5cc63f2c01 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -47,6 +47,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) { + const struct cred *old_cred; struct iov_iter iter; int ret; @@ -60,7 +61,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) rq->iocb.ki_flags = IOCB_DIRECT; iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, rq->bio.bi_iter.bi_size); + old_cred = override_creds(rq->iocb.ki_filp->f_cred); ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); + revert_creds(old_cred); if (ret != -EIOCBQUEUED) erofs_fileio_ki_complete(&rq->iocb, ret); } From 30b58444807c93bffeaba7d776110f2a909d2f9a Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 17 Jun 2025 13:40:56 +0800 Subject: [PATCH 118/284] erofs: remove unused trace event erofs_destroy_inode The trace event `erofs_destroy_inode` was added but remains unused. This unused event contributes approximately 5KB to the kernel module size. Reported-by: Steven Rostedt Closes: https://lore.kernel.org/r/20250612224906.15000244@batman.local.home Fixes: 13f06f48f7bf ("staging: erofs: support tracepoint") Cc: stable@vger.kernel.org Reviewed-by: Hongbo Li Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250617054056.3232365-1-hsiangkao@linux.alibaba.com --- include/trace/events/erofs.h | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h index a5f4b9234f46..dad7360f42f9 100644 --- a/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -211,24 +211,6 @@ TRACE_EVENT(erofs_map_blocks_exit, show_mflags(__entry->mflags), __entry->ret) ); -TRACE_EVENT(erofs_destroy_inode, - TP_PROTO(struct inode *inode), - - TP_ARGS(inode), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( erofs_nid_t, nid ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->nid = EROFS_I(inode)->nid; - ), - - TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry)) -); - #endif /* _TRACE_EROFS_H */ /* This part must be outside protection */ From df29f60369ccec0aa17d7eed7e2ae1fcdc9be6d4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 16 Jun 2025 12:09:17 +0800 Subject: [PATCH 119/284] crypto: ahash - Fix infinite recursion in ahash_def_finup Invoke the final function directly in the default finup implementation since crypto_ahash_final is now just a wrapper around finup. Reported-by: Eric Biggers Fixes: 9d7a0ab1c753 ("crypto: ahash - Handle partial blocks in API") Signed-off-by: Herbert Xu --- crypto/ahash.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/ahash.c b/crypto/ahash.c index e10bc2659ae4..bc84a07c924c 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -600,12 +600,14 @@ static void ahash_def_finup_done2(void *data, int err) static int ahash_def_finup_finish1(struct ahash_request *req, int err) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + if (err) goto out; req->base.complete = ahash_def_finup_done2; - err = crypto_ahash_final(req); + err = crypto_ahash_alg(tfm)->final(req); if (err == -EINPROGRESS || err == -EBUSY) return err; From 635e118317ffa773f6d25ec6a71b7927d7e8886a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 18 Jun 2025 10:54:44 +0200 Subject: [PATCH 120/284] Revert "mtd: core: always create master device" The idea behind this patch was to always let a "master" mtd device available to anchor runtime PM. Historically, there was no mtd device representing the whole storage as soon as partitions were coming into play. The introduction of CONFIG_MTD_PARTITIONED_MASTER allowed to keep this "master" device, but was not enabled by default to avoid breaking existing users (otherwise the mtd device numbering would be totally messed up with an off by 1, at least). The approach of adding an mtd_master class on top of partitioned mtd devices is breaking the mtd core in many creative ways, so better think again this approach and revert the faulty changes for now. This reverts commit 0aa7b390fc40a871267a2328bbbefca8b37ad307. Fixes: 0aa7b390fc40 ("mtd: core: always create master device") Tested-by: Guenter Roeck Acked-by: Guenter Roeck Signed-off-by: Miquel Raynal --- drivers/mtd/mtdchar.c | 2 +- drivers/mtd/mtdcore.c | 152 +++++++++------------------------ drivers/mtd/mtdcore.h | 2 +- drivers/mtd/mtdpart.c | 16 ++-- include/linux/mtd/partitions.h | 2 +- 5 files changed, 51 insertions(+), 123 deletions(-) diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index 391d81ad960c..8dc4f5c493fc 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -559,7 +559,7 @@ static int mtdchar_blkpg_ioctl(struct mtd_info *mtd, /* Sanitize user input */ p.devname[BLKPG_DEVNAMELTH - 1] = '\0'; - return mtd_add_partition(mtd, p.devname, p.start, p.length, NULL); + return mtd_add_partition(mtd, p.devname, p.start, p.length); case BLKPG_DEL_PARTITION: diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 429d8c16baf0..5ba9a741f5ac 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -68,13 +68,7 @@ static struct class mtd_class = { .pm = MTD_CLS_PM_OPS, }; -static struct class mtd_master_class = { - .name = "mtd_master", - .pm = MTD_CLS_PM_OPS, -}; - static DEFINE_IDR(mtd_idr); -static DEFINE_IDR(mtd_master_idr); /* These are exported solely for the purpose of mtd_blkdevs.c. You should not use them for _anything_ else */ @@ -89,9 +83,8 @@ EXPORT_SYMBOL_GPL(__mtd_next_device); static LIST_HEAD(mtd_notifiers); -#define MTD_MASTER_DEVS 255 + #define MTD_DEVT(index) MKDEV(MTD_CHAR_MAJOR, (index)*2) -static dev_t mtd_master_devt; /* REVISIT once MTD uses the driver model better, whoever allocates * the mtd_info will probably want to use the release() hook... @@ -111,17 +104,6 @@ static void mtd_release(struct device *dev) device_destroy(&mtd_class, index + 1); } -static void mtd_master_release(struct device *dev) -{ - struct mtd_info *mtd = dev_get_drvdata(dev); - - idr_remove(&mtd_master_idr, mtd->index); - of_node_put(mtd_get_of_node(mtd)); - - if (mtd_is_partition(mtd)) - release_mtd_partition(mtd); -} - static void mtd_device_release(struct kref *kref) { struct mtd_info *mtd = container_of(kref, struct mtd_info, refcnt); @@ -385,11 +367,6 @@ static const struct device_type mtd_devtype = { .release = mtd_release, }; -static const struct device_type mtd_master_devtype = { - .name = "mtd_master", - .release = mtd_master_release, -}; - static bool mtd_expert_analysis_mode; #ifdef CONFIG_DEBUG_FS @@ -657,13 +634,13 @@ static void mtd_check_of_node(struct mtd_info *mtd) /** * add_mtd_device - register an MTD device * @mtd: pointer to new MTD device info structure - * @partitioned: create partitioned device * * Add a device to the list of MTD devices present in the system, and * notify each currently active MTD 'user' of its arrival. Returns * zero on success or non-zero on failure. */ -int add_mtd_device(struct mtd_info *mtd, bool partitioned) + +int add_mtd_device(struct mtd_info *mtd) { struct device_node *np = mtd_get_of_node(mtd); struct mtd_info *master = mtd_get_master(mtd); @@ -710,17 +687,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) ofidx = -1; if (np) ofidx = of_alias_get_id(np, "mtd"); - if (partitioned) { - if (ofidx >= 0) - i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); - else - i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); - } else { - if (ofidx >= 0) - i = idr_alloc(&mtd_master_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); - else - i = idr_alloc(&mtd_master_idr, mtd, 0, 0, GFP_KERNEL); - } + if (ofidx >= 0) + i = idr_alloc(&mtd_idr, mtd, ofidx, ofidx + 1, GFP_KERNEL); + else + i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL); if (i < 0) { error = i; goto fail_locked; @@ -768,18 +738,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) /* Caller should have set dev.parent to match the * physical device, if appropriate. */ - if (partitioned) { - mtd->dev.type = &mtd_devtype; - mtd->dev.class = &mtd_class; - mtd->dev.devt = MTD_DEVT(i); - dev_set_name(&mtd->dev, "mtd%d", i); - error = dev_set_name(&mtd->dev, "mtd%d", i); - } else { - mtd->dev.type = &mtd_master_devtype; - mtd->dev.class = &mtd_master_class; - mtd->dev.devt = MKDEV(MAJOR(mtd_master_devt), i); - error = dev_set_name(&mtd->dev, "mtd_master%d", i); - } + mtd->dev.type = &mtd_devtype; + mtd->dev.class = &mtd_class; + mtd->dev.devt = MTD_DEVT(i); + error = dev_set_name(&mtd->dev, "mtd%d", i); if (error) goto fail_devname; dev_set_drvdata(&mtd->dev, mtd); @@ -787,7 +749,6 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) of_node_get(mtd_get_of_node(mtd)); error = device_register(&mtd->dev); if (error) { - pr_err("mtd: %s device_register fail %d\n", mtd->name, error); put_device(&mtd->dev); goto fail_added; } @@ -799,13 +760,10 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) mtd_debugfs_populate(mtd); - if (partitioned) { - device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, - "mtd%dro", i); - } + device_create(&mtd_class, mtd->dev.parent, MTD_DEVT(i) + 1, NULL, + "mtd%dro", i); - pr_debug("mtd: Giving out %spartitioned device %d to %s\n", - partitioned ? "" : "un-", i, mtd->name); + pr_debug("mtd: Giving out device %d to %s\n", i, mtd->name); /* No need to get a refcount on the module containing the notifier, since we hold the mtd_table_mutex */ list_for_each_entry(not, &mtd_notifiers, list) @@ -813,16 +771,13 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) mutex_unlock(&mtd_table_mutex); - if (partitioned) { - if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) { - if (IS_BUILTIN(CONFIG_MTD)) { - pr_info("mtd: setting mtd%d (%s) as root device\n", - mtd->index, mtd->name); - ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index); - } else { - pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n", - mtd->index, mtd->name); - } + if (of_property_read_bool(mtd_get_of_node(mtd), "linux,rootfs")) { + if (IS_BUILTIN(CONFIG_MTD)) { + pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name); + ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index); + } else { + pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n", + mtd->index, mtd->name); } } @@ -838,10 +793,7 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) fail_added: of_node_put(mtd_get_of_node(mtd)); fail_devname: - if (partitioned) - idr_remove(&mtd_idr, i); - else - idr_remove(&mtd_master_idr, i); + idr_remove(&mtd_idr, i); fail_locked: mutex_unlock(&mtd_table_mutex); return error; @@ -859,14 +811,12 @@ int add_mtd_device(struct mtd_info *mtd, bool partitioned) int del_mtd_device(struct mtd_info *mtd) { - struct mtd_notifier *not; - struct idr *idr; int ret; + struct mtd_notifier *not; mutex_lock(&mtd_table_mutex); - idr = mtd->dev.class == &mtd_class ? &mtd_idr : &mtd_master_idr; - if (idr_find(idr, mtd->index) != mtd) { + if (idr_find(&mtd_idr, mtd->index) != mtd) { ret = -ENODEV; goto out_error; } @@ -1106,7 +1056,6 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, const struct mtd_partition *parts, int nr_parts) { - struct mtd_info *parent; int ret, err; mtd_set_dev_defaults(mtd); @@ -1115,30 +1064,25 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, if (ret) goto out; - ret = add_mtd_device(mtd, false); - if (ret) - goto out; - if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) { - ret = mtd_add_partition(mtd, mtd->name, 0, MTDPART_SIZ_FULL, &parent); + ret = add_mtd_device(mtd); if (ret) goto out; - - } else { - parent = mtd; } /* Prefer parsed partitions over driver-provided fallback */ - ret = parse_mtd_partitions(parent, types, parser_data); + ret = parse_mtd_partitions(mtd, types, parser_data); if (ret == -EPROBE_DEFER) goto out; if (ret > 0) ret = 0; else if (nr_parts) - ret = add_mtd_partitions(parent, parts, nr_parts); - else if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) - ret = mtd_add_partition(parent, mtd->name, 0, MTDPART_SIZ_FULL, NULL); + ret = add_mtd_partitions(mtd, parts, nr_parts); + else if (!device_is_registered(&mtd->dev)) + ret = add_mtd_device(mtd); + else + ret = 0; if (ret) goto out; @@ -1158,14 +1102,13 @@ int mtd_device_parse_register(struct mtd_info *mtd, const char * const *types, register_reboot_notifier(&mtd->reboot_notifier); } - return 0; out: - nvmem_unregister(mtd->otp_user_nvmem); - nvmem_unregister(mtd->otp_factory_nvmem); + if (ret) { + nvmem_unregister(mtd->otp_user_nvmem); + nvmem_unregister(mtd->otp_factory_nvmem); + } - del_mtd_partitions(mtd); - - if (device_is_registered(&mtd->dev)) { + if (ret && device_is_registered(&mtd->dev)) { err = del_mtd_device(mtd); if (err) pr_err("Error when deleting MTD device (%d)\n", err); @@ -1324,7 +1267,8 @@ int __get_mtd_device(struct mtd_info *mtd) mtd = mtd->parent; } - kref_get(&master->refcnt); + if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) + kref_get(&master->refcnt); return 0; } @@ -1418,7 +1362,8 @@ void __put_mtd_device(struct mtd_info *mtd) mtd = parent; } - kref_put(&master->refcnt, mtd_device_release); + if (IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER)) + kref_put(&master->refcnt, mtd_device_release); module_put(master->owner); @@ -2585,16 +2530,6 @@ static int __init init_mtd(void) if (ret) goto err_reg; - ret = class_register(&mtd_master_class); - if (ret) - goto err_reg2; - - ret = alloc_chrdev_region(&mtd_master_devt, 0, MTD_MASTER_DEVS, "mtd_master"); - if (ret < 0) { - pr_err("unable to allocate char dev region\n"); - goto err_chrdev; - } - mtd_bdi = mtd_bdi_init("mtd"); if (IS_ERR(mtd_bdi)) { ret = PTR_ERR(mtd_bdi); @@ -2619,10 +2554,6 @@ static int __init init_mtd(void) bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); err_bdi: - unregister_chrdev_region(mtd_master_devt, MTD_MASTER_DEVS); -err_chrdev: - class_unregister(&mtd_master_class); -err_reg2: class_unregister(&mtd_class); err_reg: pr_err("Error registering mtd class or bdi: %d\n", ret); @@ -2636,12 +2567,9 @@ static void __exit cleanup_mtd(void) if (proc_mtd) remove_proc_entry("mtd", NULL); class_unregister(&mtd_class); - class_unregister(&mtd_master_class); - unregister_chrdev_region(mtd_master_devt, MTD_MASTER_DEVS); bdi_unregister(mtd_bdi); bdi_put(mtd_bdi); idr_destroy(&mtd_idr); - idr_destroy(&mtd_master_idr); } module_init(init_mtd); diff --git a/drivers/mtd/mtdcore.h b/drivers/mtd/mtdcore.h index 2258d31c5aa6..b014861a06a6 100644 --- a/drivers/mtd/mtdcore.h +++ b/drivers/mtd/mtdcore.h @@ -8,7 +8,7 @@ extern struct mutex mtd_table_mutex; extern struct backing_dev_info *mtd_bdi; struct mtd_info *__mtd_next_device(int i); -int __must_check add_mtd_device(struct mtd_info *mtd, bool partitioned); +int __must_check add_mtd_device(struct mtd_info *mtd); int del_mtd_device(struct mtd_info *mtd); int add_mtd_partitions(struct mtd_info *, const struct mtd_partition *, int); int del_mtd_partitions(struct mtd_info *); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 5a3db36d734e..994e8c51e674 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -86,7 +86,8 @@ static struct mtd_info *allocate_partition(struct mtd_info *parent, * parent conditional on that option. Note, this is a way to * distinguish between the parent and its partitions in sysfs. */ - child->dev.parent = &parent->dev; + child->dev.parent = IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) || mtd_is_partition(parent) ? + &parent->dev : parent->dev.parent; child->dev.of_node = part->of_node; child->parent = parent; child->part.offset = part->offset; @@ -242,7 +243,7 @@ static int mtd_add_partition_attrs(struct mtd_info *new) } int mtd_add_partition(struct mtd_info *parent, const char *name, - long long offset, long long length, struct mtd_info **out) + long long offset, long long length) { struct mtd_info *master = mtd_get_master(parent); u64 parent_size = mtd_is_partition(parent) ? @@ -275,15 +276,12 @@ int mtd_add_partition(struct mtd_info *parent, const char *name, list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); - ret = add_mtd_device(child, true); + ret = add_mtd_device(child); if (ret) goto err_remove_part; mtd_add_partition_attrs(child); - if (out) - *out = child; - return 0; err_remove_part: @@ -415,7 +413,7 @@ int add_mtd_partitions(struct mtd_info *parent, list_add_tail(&child->part.node, &parent->partitions); mutex_unlock(&master->master.partitions_lock); - ret = add_mtd_device(child, true); + ret = add_mtd_device(child); if (ret) { mutex_lock(&master->master.partitions_lock); list_del(&child->part.node); @@ -592,6 +590,9 @@ static int mtd_part_of_parse(struct mtd_info *master, int ret, err = 0; dev = &master->dev; + /* Use parent device (controller) if the top level MTD is not registered */ + if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) && !mtd_is_partition(master)) + dev = master->dev.parent; np = mtd_get_of_node(master); if (mtd_is_partition(master)) @@ -710,7 +711,6 @@ int parse_mtd_partitions(struct mtd_info *master, const char *const *types, if (ret < 0 && !err) err = ret; } - return err; } diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index 5daf80df9e89..b74a539ec581 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -108,7 +108,7 @@ extern void deregister_mtd_parser(struct mtd_part_parser *parser); deregister_mtd_parser) int mtd_add_partition(struct mtd_info *master, const char *name, - long long offset, long long length, struct mtd_info **part); + long long offset, long long length); int mtd_del_partition(struct mtd_info *master, int partno); uint64_t mtd_get_device_size(const struct mtd_info *mtd); From 60dffe96fab00cee7ef7f1b151da485d42ccb33a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 18 Jun 2025 10:47:58 +0200 Subject: [PATCH 121/284] mtd: spinand: winbond: Fix W35N number of planes/LUN There's been a mistake when extracting the geometry of the W35N02 and W35N04 chips from the datasheet. There is a single plane, however there are respectively 2 and 4 LUNs. They are actually referred in the datasheet as dies (equivalent of target), but as there is no die select operation and the chips only feature a single configuration register for the entire chip (instead of one per die), we can reasonably assume we are talking about LUNs and not dies. Reported-by: Andreas Dannenberg Suggested-by: Vignesh Raghavendra Fixes: 25e08bf66660 ("mtd: spinand: winbond: Add support for W35N02JW and W35N04JW chips") Cc: stable@vger.kernel.org Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/winbond.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 19f8dd4a6370..2808bbd7a16e 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -289,7 +289,7 @@ static const struct spinand_info winbond_spinand_table[] = { SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL)), SPINAND_INFO("W35N02JW", /* 1.8V */ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xdf, 0x22), - NAND_MEMORG(1, 4096, 128, 64, 512, 10, 2, 1, 1), + NAND_MEMORG(1, 4096, 128, 64, 512, 10, 1, 2, 1), NAND_ECCREQ(1, 512), SPINAND_INFO_OP_VARIANTS(&read_cache_octal_variants, &write_cache_octal_variants, @@ -298,7 +298,7 @@ static const struct spinand_info winbond_spinand_table[] = { SPINAND_ECCINFO(&w35n01jw_ooblayout, NULL)), SPINAND_INFO("W35N04JW", /* 1.8V */ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0xdf, 0x23), - NAND_MEMORG(1, 4096, 128, 64, 512, 10, 4, 1, 1), + NAND_MEMORG(1, 4096, 128, 64, 512, 10, 1, 4, 1), NAND_ECCREQ(1, 512), SPINAND_INFO_OP_VARIANTS(&read_cache_octal_variants, &write_cache_octal_variants, From 29384bbb1a2a5926f97b8cbe469081e7a1770dea Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 18 Jun 2025 10:47:59 +0200 Subject: [PATCH 122/284] mtd: spinand: winbond: Increase maximum frequency on an octal operation The default number of dummy cycles is 16 in octal I/O mode (1S-8S-8S), and with this default configuration the maximum frequency is higher than what is being advertised. There are higher and lower frequency possibilities, which involve making changes in the number of dummy cycles through the VCR register. At this stage, let's just describe the default configuration correctly. There should be no functional change. Fixes: 1ac5ff2f2ad6 ("mtd: spinand: winbond: Add octal support") Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/winbond.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 2808bbd7a16e..0756966b4e3c 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -25,7 +25,7 @@ static SPINAND_OP_VARIANTS(read_cache_octal_variants, SPINAND_PAGE_READ_FROM_CACHE_1S_1D_8D_OP(0, 2, NULL, 0, 105 * HZ_PER_MHZ), - SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(0, 16, NULL, 0, 86 * HZ_PER_MHZ), + SPINAND_PAGE_READ_FROM_CACHE_1S_8S_8S_OP(0, 16, NULL, 0, 162 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1S_8S_OP(0, 1, NULL, 0, 133 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0), SPINAND_PAGE_READ_FROM_CACHE_1S_1S_1S_OP(0, 1, NULL, 0)); From dba90f5a79c13936de4273a19e67908a0c296afe Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 18 Jun 2025 10:48:00 +0200 Subject: [PATCH 123/284] mtd: spinand: winbond: Prevent unsupported frequencies on dual/quad I/O variants Dual and quad capable chips natively support dual and quad I/O variants at up to 104MHz (1-2-2 and 1-4-4 operations). Reaching the maximum speed of 166MHz is theoretically possible (while still unsupported in the field) by adding a few more dummy cycles. Let's be accurate and clearly state this limit. Setting a maximum frequency implies adding the frequency parameter to the macro, which is done using a variadic argument to avoid impacting all the other drivers which already make use of this macro. Fixes: 1ea808b4d15b ("mtd: spinand: winbond: Update the *JW chip definitions") Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/winbond.c | 4 ++-- include/linux/mtd/spinand.h | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/nand/spi/winbond.c b/drivers/mtd/nand/spi/winbond.c index 0756966b4e3c..b7a28f001a38 100644 --- a/drivers/mtd/nand/spi/winbond.c +++ b/drivers/mtd/nand/spi/winbond.c @@ -42,11 +42,11 @@ static SPINAND_OP_VARIANTS(update_cache_octal_variants, static SPINAND_OP_VARIANTS(read_cache_dual_quad_dtr_variants, SPINAND_PAGE_READ_FROM_CACHE_1S_4D_4D_OP(0, 8, NULL, 0, 80 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1D_4D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ), - SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(0, 2, NULL, 0, 104 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1S_4S_OP(0, 1, NULL, 0), SPINAND_PAGE_READ_FROM_CACHE_1S_2D_2D_OP(0, 4, NULL, 0, 80 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1D_2D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ), - SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0), + SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(0, 1, NULL, 0, 104 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_1S_1S_2S_OP(0, 1, NULL, 0), SPINAND_PAGE_READ_FROM_CACHE_1S_1D_1D_OP(0, 2, NULL, 0, 80 * HZ_PER_MHZ), SPINAND_PAGE_READ_FROM_CACHE_FAST_1S_1S_1S_OP(0, 1, NULL, 0), diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 811a0f356315..15eaa09da998 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -113,11 +113,12 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 2), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_2S_2S_OP(addr, ndummy, buf, len, ...) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ SPI_MEM_OP_ADDR(2, addr, 2), \ SPI_MEM_OP_DUMMY(ndummy, 2), \ - SPI_MEM_OP_DATA_IN(len, buf, 2)) + SPI_MEM_OP_DATA_IN(len, buf, 2), \ + SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0)) #define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_2S_2S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xbb, 1), \ @@ -151,11 +152,12 @@ SPI_MEM_DTR_OP_DATA_IN(len, buf, 4), \ SPI_MEM_OP_MAX_FREQ(freq)) -#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len) \ +#define SPINAND_PAGE_READ_FROM_CACHE_1S_4S_4S_OP(addr, ndummy, buf, len, ...) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ SPI_MEM_OP_ADDR(2, addr, 4), \ SPI_MEM_OP_DUMMY(ndummy, 4), \ - SPI_MEM_OP_DATA_IN(len, buf, 4)) + SPI_MEM_OP_DATA_IN(len, buf, 4), \ + SPI_MEM_OP_MAX_FREQ(__VA_ARGS__ + 0)) #define SPINAND_PAGE_READ_FROM_CACHE_3A_1S_4S_4S_OP(addr, ndummy, buf, len) \ SPI_MEM_OP(SPI_MEM_OP_CMD(0xeb, 1), \ From 10af0273a35ab4513ca1546644b8c853044da134 Mon Sep 17 00:00:00 2001 From: David Thompson Date: Fri, 13 Jun 2025 16:34:43 +0000 Subject: [PATCH 124/284] gpio: mlxbf3: only get IRQ for device instance 0 The gpio-mlxbf3 driver interfaces with two GPIO controllers, device instance 0 and 1. There is a single IRQ resource shared between the two controllers, and it is found in the ACPI table for device instance 0. The driver should not attempt to get an IRQ resource when probing device instance 1, otherwise the following error is logged: mlxbf3_gpio MLNXBF33:01: error -ENXIO: IRQ index 0 not found Signed-off-by: David Thompson Reviewed-by: Shravan Kumar Ramani Fixes: cd33f216d241 ("gpio: mlxbf3: Add gpio driver support") Link: https://lore.kernel.org/r/20250613163443.1065217-1-davthompson@nvidia.com Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpio-mlxbf3.c | 52 +++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/drivers/gpio/gpio-mlxbf3.c b/drivers/gpio/gpio-mlxbf3.c index 10ea71273c89..9875e34bde72 100644 --- a/drivers/gpio/gpio-mlxbf3.c +++ b/drivers/gpio/gpio-mlxbf3.c @@ -190,7 +190,9 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) struct mlxbf3_gpio_context *gs; struct gpio_irq_chip *girq; struct gpio_chip *gc; + char *colon_ptr; int ret, irq; + long num; gs = devm_kzalloc(dev, sizeof(*gs), GFP_KERNEL); if (!gs) @@ -227,25 +229,39 @@ static int mlxbf3_gpio_probe(struct platform_device *pdev) gc->owner = THIS_MODULE; gc->add_pin_ranges = mlxbf3_gpio_add_pin_ranges; - irq = platform_get_irq(pdev, 0); - if (irq >= 0) { - girq = &gs->gc.irq; - gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip); - girq->default_type = IRQ_TYPE_NONE; - /* This will let us handle the parent IRQ in the driver */ - girq->num_parents = 0; - girq->parents = NULL; - girq->parent_handler = NULL; - girq->handler = handle_bad_irq; + colon_ptr = strchr(dev_name(dev), ':'); + if (!colon_ptr) { + dev_err(dev, "invalid device name format\n"); + return -EINVAL; + } - /* - * Directly request the irq here instead of passing - * a flow-handler because the irq is shared. - */ - ret = devm_request_irq(dev, irq, mlxbf3_gpio_irq_handler, - IRQF_SHARED, dev_name(dev), gs); - if (ret) - return dev_err_probe(dev, ret, "failed to request IRQ"); + ret = kstrtol(++colon_ptr, 16, &num); + if (ret) { + dev_err(dev, "invalid device instance\n"); + return ret; + } + + if (!num) { + irq = platform_get_irq(pdev, 0); + if (irq >= 0) { + girq = &gs->gc.irq; + gpio_irq_chip_set_chip(girq, &gpio_mlxbf3_irqchip); + girq->default_type = IRQ_TYPE_NONE; + /* This will let us handle the parent IRQ in the driver */ + girq->num_parents = 0; + girq->parents = NULL; + girq->parent_handler = NULL; + girq->handler = handle_bad_irq; + + /* + * Directly request the irq here instead of passing + * a flow-handler because the irq is shared. + */ + ret = devm_request_irq(dev, irq, mlxbf3_gpio_irq_handler, + IRQF_SHARED, dev_name(dev), gs); + if (ret) + return dev_err_probe(dev, ret, "failed to request IRQ"); + } } platform_set_drvdata(pdev, gs); From e1c75831f682eef0f68b35723437146ed86070b1 Mon Sep 17 00:00:00 2001 From: Penglei Jiang Date: Tue, 17 Jun 2025 09:56:44 -0700 Subject: [PATCH 125/284] io_uring: fix potential page leak in io_sqe_buffer_register() If allocation of the 'imu' fails, then the existing pages aren't unpinned in the error path. This is mostly a theoretical issue, requiring fault injection to hit. Move unpin_user_pages() to unified error handling to fix the page leak issue. Fixes: d8c2237d0aa9 ("io_uring: add io_pin_pages() helper") Signed-off-by: Penglei Jiang Link: https://lore.kernel.org/r/20250617165644.79165-1-superman.xpt@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 94a9db030e0e..d724602697e7 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -809,10 +809,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); + if (ret) goto done; - } size = iov->iov_len; /* store original address for later verification */ @@ -842,6 +840,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (ret) { if (imu) io_free_imu(ctx, imu); + if (pages) + unpin_user_pages(pages, nr_pages); io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } From 2aebf5ee43bf0ed225a09a30cf515d9f2813b759 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 18 Jun 2025 09:05:23 +0900 Subject: [PATCH 126/284] x86/alternatives: Fix int3 handling failure from broken text_poke array Since smp_text_poke_single() does not expect there is another text_poke request is queued, it can make text_poke_array not sorted or cause a buffer overflow on the text_poke_array.vec[]. This will cause an Oops in int3 because of bsearch failing; CPU 0 CPU 1 CPU 2 ----- ----- ----- smp_text_poke_batch_add() smp_text_poke_single() <<-- Adds out of order [Fails o find address in text_poke_array ] OOPS! Or unhandled page fault because of a buffer overflow; CPU 0 CPU 1 ----- ----- smp_text_poke_batch_add() <<+ ... | smp_text_poke_batch_add() <<-- Adds TEXT_POKE_ARRAY_MAX times. smp_text_poke_single() { __smp_text_poke_batch_add() <<-- Adds entry at TEXT_POKE_ARRAY_MAX + 1 smp_text_poke_batch_finish() [Unhandled page fault because text_poke_array.nr_entries is overwritten] BUG! } Use smp_text_poke_batch_add() instead of __smp_text_poke_batch_add() so that it correctly flush the queue if needed. Closes: https://lore.kernel.org/all/CA+G9fYsLu0roY3DV=tKyqP7FEKbOEETRvTDhnpPxJGbA=Cg+4w@mail.gmail.com/ Fixes: c8976ade0c1b ("x86/alternatives: Simplify smp_text_poke_single() by using tp_vec and existing APIs") Reported-by: Linux Kernel Functional Testing Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Tested-by: Linux Kernel Functional Testing Link: https://lkml.kernel.org/r/\ 175020512308.3582717.13631440385506146631.stgit@mhiramat.tok.corp.google.com --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9ae80fa904a2..ea1d984166cd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3138,6 +3138,6 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - __smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_add(addr, opcode, len, emulate); smp_text_poke_batch_finish(); } From bbf10cd686835d5a4b8566dc73a3b00b4cd7932a Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 18 Jun 2025 16:38:25 +0200 Subject: [PATCH 127/284] PCI: pciehp: Ignore belated Presence Detect Changed caused by DPC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c3be50f7547c ("PCI: pciehp: Ignore Presence Detect Changed caused by DPC") sought to ignore Presence Detect Changed events occurring as a side effect of Downstream Port Containment. The commit awaits recovery from DPC and then clears events which occurred in the meantime. However if the first event seen after DPC is Data Link Layer State Changed, only that event is cleared and not Presence Detect Changed. The object of the commit is thus defeated. That's because pciehp_ist() computes the events to clear based on the local "events" variable instead of "ctrl->pending_events". The former contains the events that had occurred when pciehp_ist() was entered, whereas the latter also contains events that have accumulated while awaiting DPC recovery. In practice, the order of PDC and DLLSC events is arbitrary and the delay in-between can be several milliseconds. So change the logic to always clear PDC events, even if they come after an initial DLLSC event. Fixes: c3be50f7547c ("PCI: pciehp: Ignore Presence Detect Changed caused by DPC") Reported-by: Lương Việt Hoàng Reported-by: Joel Mathew Thomas Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219765#c165 Signed-off-by: Lukas Wunner Signed-off-by: Bjorn Helgaas Tested-by: Lương Việt Hoàng Tested-by: Joel Mathew Thomas Link: https://patch.msgid.link/d9c4286a16253af7e93eaf12e076e3ef3546367a.1750257164.git.lukas@wunner.de --- drivers/pci/hotplug/pciehp_hpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index ebd342bda235..91d2d92717d9 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -771,7 +771,7 @@ static irqreturn_t pciehp_ist(int irq, void *dev_id) u16 ignored_events = PCI_EXP_SLTSTA_DLLSC; if (!ctrl->inband_presence_disabled) - ignored_events |= events & PCI_EXP_SLTSTA_PDC; + ignored_events |= PCI_EXP_SLTSTA_PDC; events &= ~ignored_events; pciehp_ignore_link_change(ctrl, pdev, irq, ignored_events); From 6c5b8895c8cab4fdb496b38513ec417588b58596 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Wed, 18 Jun 2025 15:55:47 +0100 Subject: [PATCH 128/284] ASoC: doc: cs35l56: Add CS35L63 to the list of supported devices Add CS35L63 to the list of parts supported by the cs35l56 driver and mention the CS35L63 where the text refers to the supported part numbers. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20250618145547.152814-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- Documentation/sound/codecs/cs35l56.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/sound/codecs/cs35l56.rst b/Documentation/sound/codecs/cs35l56.rst index 7db9f93d9be5..57d1964453e1 100644 --- a/Documentation/sound/codecs/cs35l56.rst +++ b/Documentation/sound/codecs/cs35l56.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0-only -===================================================================== -Audio drivers for Cirrus Logic CS35L54/56/57 Boosted Smart Amplifiers -===================================================================== +======================================================================== +Audio drivers for Cirrus Logic CS35L54/56/57/63 Boosted Smart Amplifiers +======================================================================== :Copyright: 2025 Cirrus Logic, Inc. and Cirrus Logic International Semiconductor Ltd. @@ -13,11 +13,11 @@ Summary The high-level summary of this document is: -**If you have a laptop that uses CS35L54/56/57 amplifiers but audio is not +**If you have a laptop that uses CS35L54/56/57/63 amplifiers but audio is not working, DO NOT ATTEMPT TO USE FIRMWARE AND SETTINGS FROM ANOTHER LAPTOP, EVEN IF THAT LAPTOP SEEMS SIMILAR.** -The CS35L54/56/57 amplifiers must be correctly configured for the power +The CS35L54/56/57/63 amplifiers must be correctly configured for the power supply voltage, speaker impedance, maximum speaker voltage/current, and other external hardware connections. @@ -34,6 +34,7 @@ The cs35l56 drivers support: * CS35L54 * CS35L56 * CS35L57 +* CS35L63 There are two drivers in the kernel From a55737dab6ba63eb4241e9c6547629058af31e12 Mon Sep 17 00:00:00 2001 From: "Jesse.Zhang" Date: Thu, 29 May 2025 11:27:37 +0800 Subject: [PATCH 129/284] drm/amdkfd: move SDMA queue reset capability check to node_show Relocate the per-SDMA queue reset capability check from kfd_topology_set_capabilities() to node_show() to ensure we read the latest value of sdma.supported_reset after all IP blocks are initialized. Fixes: ceb7114c961b ("drm/amdkfd: flag per-sdma queue reset supported to user space") Reviewed-by: Jonathan Kim Signed-off-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit e17df7b086cf908cedd919f448da9e00028419bb) --- drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index baa2374acdeb..4ec73f33535e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -510,6 +510,10 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.capability |= HSA_CAP_AQL_QUEUE_DOUBLE_MAP; + if (KFD_GC_VERSION(dev->gpu) < IP_VERSION(10, 0, 0) && + (dev->gpu->adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)) + dev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED; + sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_fcompute", dev->node_props.max_engine_clk_fcompute); @@ -2008,8 +2012,6 @@ static void kfd_topology_set_capabilities(struct kfd_topology_device *dev) if (!amdgpu_sriov_vf(dev->gpu->adev)) dev->node_props.capability |= HSA_CAP_PER_QUEUE_RESET_SUPPORTED; - if (dev->gpu->adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) - dev->node_props.capability2 |= HSA_CAP2_PER_SDMA_QUEUE_RESET_SUPPORTED; } else { dev->node_props.debug_prop |= HSA_DBG_WATCH_ADDR_MASK_LO_BIT_GFX10 | HSA_DBG_WATCH_ADDR_MASK_HI_BIT; From 3251b69b7efb82eba24c9e168a6142a3078de72f Mon Sep 17 00:00:00 2001 From: Peichen Huang Date: Mon, 26 May 2025 16:04:10 +0800 Subject: [PATCH 130/284] drm/amd/display: Add dc cap for dp tunneling [WHAT] 1. add dc cap for dp tunneling 2. add function to get index of host router Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Cruise Hung Signed-off-by: Peichen Huang Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 29e178d13979cf6fdb42c5fe2dfec2da2306c4ad) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/core/dc.c | 33 +++++++++++++++++++ drivers/gpu/drm/amd/display/dc/dc.h | 8 ++++- .../dc/resource/dcn31/dcn31_resource.c | 3 ++ .../dc/resource/dcn314/dcn314_resource.c | 3 ++ .../dc/resource/dcn35/dcn35_resource.c | 3 ++ .../dc/resource/dcn351/dcn351_resource.c | 3 ++ .../dc/resource/dcn36/dcn36_resource.c | 3 ++ 7 files changed, 55 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 56d011a1323c..b34b5b52236d 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -241,6 +241,7 @@ static bool create_links( DC_LOG_DC("BIOS object table - end"); /* Create a link for each usb4 dpia port */ + dc->lowest_dpia_link_index = MAX_LINKS; for (i = 0; i < dc->res_pool->usb4_dpia_count; i++) { struct link_init_data link_init_params = {0}; struct dc_link *link; @@ -253,6 +254,9 @@ static bool create_links( link = dc->link_srv->create_link(&link_init_params); if (link) { + if (dc->lowest_dpia_link_index > dc->link_count) + dc->lowest_dpia_link_index = dc->link_count; + dc->links[dc->link_count] = link; link->dc = dc; ++dc->link_count; @@ -6376,6 +6380,35 @@ unsigned int dc_get_det_buffer_size_from_state(const struct dc_state *context) else return 0; } +/** + *********************************************************************************************** + * dc_get_host_router_index: Get index of host router from a dpia link + * + * This function return a host router index of the target link. If the target link is dpia link. + * + * @param [in] link: target link + * @param [out] host_router_index: host router index of the target link + * + * @return: true if the host router index is found and valid. + * + *********************************************************************************************** + */ +bool dc_get_host_router_index(const struct dc_link *link, unsigned int *host_router_index) +{ + struct dc *dc = link->ctx->dc; + + if (link->ep_type != DISPLAY_ENDPOINT_USB4_DPIA) + return false; + + if (link->link_index < dc->lowest_dpia_link_index) + return false; + + *host_router_index = (link->link_index - dc->lowest_dpia_link_index) / dc->caps.num_of_dpias_per_host_router; + if (*host_router_index < dc->caps.num_of_host_routers) + return true; + else + return false; +} bool dc_is_cursor_limit_pending(struct dc *dc) { diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h index 1d917be36fc4..f41073c0147e 100644 --- a/drivers/gpu/drm/amd/display/dc/dc.h +++ b/drivers/gpu/drm/amd/display/dc/dc.h @@ -66,7 +66,8 @@ struct dmub_notification; #define MAX_STREAMS 6 #define MIN_VIEWPORT_SIZE 12 #define MAX_NUM_EDP 2 -#define MAX_HOST_ROUTERS_NUM 2 +#define MAX_HOST_ROUTERS_NUM 3 +#define MAX_DPIA_PER_HOST_ROUTER 2 /* Display Core Interfaces */ struct dc_versions { @@ -305,6 +306,8 @@ struct dc_caps { /* Conservative limit for DCC cases which require ODM4:1 to support*/ uint32_t dcc_plane_width_limit; struct dc_scl_caps scl_caps; + uint8_t num_of_host_routers; + uint8_t num_of_dpias_per_host_router; }; struct dc_bug_wa { @@ -1603,6 +1606,7 @@ struct dc { uint8_t link_count; struct dc_link *links[MAX_LINKS]; + uint8_t lowest_dpia_link_index; struct link_service *link_srv; struct dc_state *current_state; @@ -2595,6 +2599,8 @@ struct dc_power_profile dc_get_power_profile_for_dc_state(const struct dc_state unsigned int dc_get_det_buffer_size_from_state(const struct dc_state *context); +bool dc_get_host_router_index(const struct dc_link *link, unsigned int *host_router_index); + /* DSC Interfaces */ #include "dc_dsc.h" diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn31/dcn31_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn31/dcn31_resource.c index 7e0af5297dc4..51ca0b2959fc 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn31/dcn31_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn31/dcn31_resource.c @@ -1954,6 +1954,9 @@ static bool dcn31_resource_construct( dc->caps.color.mpc.ogam_rom_caps.hlg = 0; dc->caps.color.mpc.ocsc = 1; + dc->caps.num_of_host_routers = 2; + dc->caps.num_of_dpias_per_host_router = 2; + /* Use pipe context based otg sync logic */ dc->config.use_pipe_ctx_sync_logic = true; dc->config.disable_hbr_audio_dp2 = true; diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn314/dcn314_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn314/dcn314_resource.c index d96bc6cb73ad..8383e2e59be5 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn314/dcn314_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn314/dcn314_resource.c @@ -1885,6 +1885,9 @@ static bool dcn314_resource_construct( dc->caps.max_disp_clock_khz_at_vmin = 650000; + dc->caps.num_of_host_routers = 2; + dc->caps.num_of_dpias_per_host_router = 2; + /* Use pipe context based otg sync logic */ dc->config.use_pipe_ctx_sync_logic = true; diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c index 72c6cf047db0..e01aa2f2e13e 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn35/dcn35_resource.c @@ -1894,6 +1894,9 @@ static bool dcn35_resource_construct( dc->caps.color.mpc.ogam_rom_caps.hlg = 0; dc->caps.color.mpc.ocsc = 1; + dc->caps.num_of_host_routers = 2; + dc->caps.num_of_dpias_per_host_router = 2; + /* max_disp_clock_khz_at_vmin is slightly lower than the STA value in order * to provide some margin. * It's expected for furture ASIC to have equal or higher value, in order to diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c index 989a270f7dea..4ebe4e00a4f8 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn351/dcn351_resource.c @@ -1866,6 +1866,9 @@ static bool dcn351_resource_construct( dc->caps.color.mpc.ogam_rom_caps.hlg = 0; dc->caps.color.mpc.ocsc = 1; + dc->caps.num_of_host_routers = 2; + dc->caps.num_of_dpias_per_host_router = 2; + /* max_disp_clock_khz_at_vmin is slightly lower than the STA value in order * to provide some margin. * It's expected for furture ASIC to have equal or higher value, in order to diff --git a/drivers/gpu/drm/amd/display/dc/resource/dcn36/dcn36_resource.c b/drivers/gpu/drm/amd/display/dc/resource/dcn36/dcn36_resource.c index 48e1f234185f..db36b8f9ce65 100644 --- a/drivers/gpu/drm/amd/display/dc/resource/dcn36/dcn36_resource.c +++ b/drivers/gpu/drm/amd/display/dc/resource/dcn36/dcn36_resource.c @@ -1867,6 +1867,9 @@ static bool dcn36_resource_construct( dc->caps.color.mpc.ogam_rom_caps.hlg = 0; dc->caps.color.mpc.ocsc = 1; + dc->caps.num_of_host_routers = 2; + dc->caps.num_of_dpias_per_host_router = 2; + /* max_disp_clock_khz_at_vmin is slightly lower than the STA value in order * to provide some margin. * It's expected for furture ASIC to have equal or higher value, in order to From d358a51444c88bcc995e471dc8cc840f19e4b374 Mon Sep 17 00:00:00 2001 From: Michael Strauss Date: Wed, 26 Feb 2025 10:03:48 -0500 Subject: [PATCH 131/284] drm/amd/display: Get LTTPR IEEE OUI/Device ID From Closest LTTPR To Host [WHY] These fields are read for the explicit purpose of detecting embedded LTTPRs (i.e. between host ASIC and the user-facing port), and thus need to calculate the correct DPCD address offset based on LTTPR count to target the appropriate LTTPR's DPCD register space with these queries. [HOW] Cascaded LTTPRs in a link each snoop and increment LTTPR count when queried via DPCD read, so an LTTPR embedded in a source device (e.g. USB4 port on a laptop) will always be addressible using the max LTTPR count seen by the host. Therefore we simply need to use a recently added helper function to calculate the correct DPCD address to target potentially embedded LTTPRs based on the received LTTPR count. Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Wenjing Liu Signed-off-by: Michael Strauss Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 791897f5c77a2a65d0e500be4743af2ddf6eb061) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dc_dp_types.h | 4 +- .../dc/link/protocols/link_dp_capability.c | 38 +++++++++++++++---- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dc_dp_types.h b/drivers/gpu/drm/amd/display/dc/dc_dp_types.h index 0bad8304ccf6..d346f8ae1634 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_dp_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_dp_types.h @@ -1172,8 +1172,8 @@ struct dc_lttpr_caps { union dp_128b_132b_supported_lttpr_link_rates supported_128b_132b_rates; union dp_alpm_lttpr_cap alpm; uint8_t aux_rd_interval[MAX_REPEATER_CNT - 1]; - uint8_t lttpr_ieee_oui[3]; - uint8_t lttpr_device_id[6]; + uint8_t lttpr_ieee_oui[3]; // Always read from closest LTTPR to host + uint8_t lttpr_device_id[6]; // Always read from closest LTTPR to host }; struct dc_dongle_dfp_cap_ext { diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c index a5127c2d47ef..0f965380a9b4 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_capability.c @@ -385,9 +385,15 @@ bool dp_is_128b_132b_signal(struct pipe_ctx *pipe_ctx) bool dp_is_lttpr_present(struct dc_link *link) { /* Some sink devices report invalid LTTPR revision, so don't validate against that cap */ - return (dp_parse_lttpr_repeater_count(link->dpcd_caps.lttpr_caps.phy_repeater_cnt) != 0 && + uint32_t lttpr_count = dp_parse_lttpr_repeater_count(link->dpcd_caps.lttpr_caps.phy_repeater_cnt); + bool is_lttpr_present = (lttpr_count > 0 && link->dpcd_caps.lttpr_caps.max_lane_count > 0 && link->dpcd_caps.lttpr_caps.max_lane_count <= 4); + + if (lttpr_count > 0 && !is_lttpr_present) + DC_LOG_ERROR("LTTPR count is nonzero but invalid lane count reported. Assuming no LTTPR present.\n"); + + return is_lttpr_present; } /* in DP compliance test, DPR-120 may have @@ -1551,6 +1557,8 @@ enum dc_status dp_retrieve_lttpr_cap(struct dc_link *link) uint8_t lttpr_dpcd_data[10] = {0}; enum dc_status status; bool is_lttpr_present; + uint32_t lttpr_count; + uint32_t closest_lttpr_offset; /* Logic to determine LTTPR support*/ bool vbios_lttpr_interop = link->dc->caps.vbios_lttpr_aware; @@ -1602,20 +1610,22 @@ enum dc_status dp_retrieve_lttpr_cap(struct dc_link *link) lttpr_dpcd_data[DP_LTTPR_ALPM_CAPABILITIES - DP_LT_TUNABLE_PHY_REPEATER_FIELD_DATA_STRUCTURE_REV]; + lttpr_count = dp_parse_lttpr_repeater_count(link->dpcd_caps.lttpr_caps.phy_repeater_cnt); + /* If this chip cap is set, at least one retimer must exist in the chain * Override count to 1 if we receive a known bad count (0 or an invalid value) */ if (((link->chip_caps & AMD_EXT_DISPLAY_PATH_CAPS__EXT_CHIP_MASK) == AMD_EXT_DISPLAY_PATH_CAPS__DP_FIXED_VS_EN) && - (dp_parse_lttpr_repeater_count(link->dpcd_caps.lttpr_caps.phy_repeater_cnt) == 0)) { + lttpr_count == 0) { /* If you see this message consistently, either the host platform has FIXED_VS flag * incorrectly configured or the sink device is returning an invalid count. */ DC_LOG_ERROR("lttpr_caps phy_repeater_cnt is 0x%x, forcing it to 0x80.", link->dpcd_caps.lttpr_caps.phy_repeater_cnt); link->dpcd_caps.lttpr_caps.phy_repeater_cnt = 0x80; + lttpr_count = 1; DC_LOG_DC("lttpr_caps forced phy_repeater_cnt = %d\n", link->dpcd_caps.lttpr_caps.phy_repeater_cnt); } - /* Attempt to train in LTTPR transparent mode if repeater count exceeds 8. */ is_lttpr_present = dp_is_lttpr_present(link); DC_LOG_DC("is_lttpr_present = %d\n", is_lttpr_present); @@ -1623,11 +1633,25 @@ enum dc_status dp_retrieve_lttpr_cap(struct dc_link *link) if (is_lttpr_present) { CONN_DATA_DETECT(link, lttpr_dpcd_data, sizeof(lttpr_dpcd_data), "LTTPR Caps: "); - core_link_read_dpcd(link, DP_LTTPR_IEEE_OUI, link->dpcd_caps.lttpr_caps.lttpr_ieee_oui, sizeof(link->dpcd_caps.lttpr_caps.lttpr_ieee_oui)); - CONN_DATA_DETECT(link, link->dpcd_caps.lttpr_caps.lttpr_ieee_oui, sizeof(link->dpcd_caps.lttpr_caps.lttpr_ieee_oui), "LTTPR IEEE OUI: "); + // Identify closest LTTPR to determine if workarounds required for known embedded LTTPR + closest_lttpr_offset = dp_get_closest_lttpr_offset(lttpr_count); - core_link_read_dpcd(link, DP_LTTPR_DEVICE_ID, link->dpcd_caps.lttpr_caps.lttpr_device_id, sizeof(link->dpcd_caps.lttpr_caps.lttpr_device_id)); - CONN_DATA_DETECT(link, link->dpcd_caps.lttpr_caps.lttpr_device_id, sizeof(link->dpcd_caps.lttpr_caps.lttpr_device_id), "LTTPR Device ID: "); + core_link_read_dpcd(link, (DP_LTTPR_IEEE_OUI + closest_lttpr_offset), + link->dpcd_caps.lttpr_caps.lttpr_ieee_oui, sizeof(link->dpcd_caps.lttpr_caps.lttpr_ieee_oui)); + core_link_read_dpcd(link, (DP_LTTPR_DEVICE_ID + closest_lttpr_offset), + link->dpcd_caps.lttpr_caps.lttpr_device_id, sizeof(link->dpcd_caps.lttpr_caps.lttpr_device_id)); + + if (lttpr_count > 1) { + CONN_DATA_DETECT(link, link->dpcd_caps.lttpr_caps.lttpr_ieee_oui, sizeof(link->dpcd_caps.lttpr_caps.lttpr_ieee_oui), + "Closest LTTPR To Host's IEEE OUI: "); + CONN_DATA_DETECT(link, link->dpcd_caps.lttpr_caps.lttpr_device_id, sizeof(link->dpcd_caps.lttpr_caps.lttpr_device_id), + "Closest LTTPR To Host's LTTPR Device ID: "); + } else { + CONN_DATA_DETECT(link, link->dpcd_caps.lttpr_caps.lttpr_ieee_oui, sizeof(link->dpcd_caps.lttpr_caps.lttpr_ieee_oui), + "LTTPR IEEE OUI: "); + CONN_DATA_DETECT(link, link->dpcd_caps.lttpr_caps.lttpr_device_id, sizeof(link->dpcd_caps.lttpr_caps.lttpr_device_id), + "LTTPR Device ID: "); + } } return status; From 0d57dd1765d311111d9885346108c4deeae1deb4 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Wed, 21 May 2025 16:40:25 -0400 Subject: [PATCH 132/284] drm/amd/display: Add more checks for DSC / HUBP ONO guarantees [WHY] For non-zero DSC instances it's possible that the HUBP domain required to drive it for sequential ONO ASICs isn't met, potentially causing the logic to the tile to enter an undefined state leading to a system hang. [HOW] Add more checks to ensure that the HUBP domain matching the DSC instance is appropriately powered. Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Duncan Ma Signed-off-by: Nicholas Kazlauskas Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit da63df07112e5a9857a8d2aaa04255c4206754ec) Cc: stable@vger.kernel.org --- .../amd/display/dc/hwss/dcn35/dcn35_hwseq.c | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c index c814d957305a..a267f574b619 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dcn35/dcn35_hwseq.c @@ -1047,6 +1047,15 @@ void dcn35_calc_blocks_to_gate(struct dc *dc, struct dc_state *context, if (dc->caps.sequential_ono) { update_state->pg_pipe_res_update[PG_HUBP][pipe_ctx->stream_res.dsc->inst] = false; update_state->pg_pipe_res_update[PG_DPP][pipe_ctx->stream_res.dsc->inst] = false; + + /* All HUBP/DPP instances must be powered if the DSC inst != HUBP inst */ + if (!pipe_ctx->top_pipe && pipe_ctx->plane_res.hubp && + pipe_ctx->plane_res.hubp->inst != pipe_ctx->stream_res.dsc->inst) { + for (j = 0; j < dc->res_pool->pipe_count; ++j) { + update_state->pg_pipe_res_update[PG_HUBP][j] = false; + update_state->pg_pipe_res_update[PG_DPP][j] = false; + } + } } } @@ -1193,6 +1202,25 @@ void dcn35_calc_blocks_to_ungate(struct dc *dc, struct dc_state *context, update_state->pg_pipe_res_update[PG_HDMISTREAM][0] = true; if (dc->caps.sequential_ono) { + for (i = 0; i < dc->res_pool->pipe_count; i++) { + struct pipe_ctx *new_pipe = &context->res_ctx.pipe_ctx[i]; + + if (new_pipe->stream_res.dsc && !new_pipe->top_pipe && + update_state->pg_pipe_res_update[PG_DSC][new_pipe->stream_res.dsc->inst]) { + update_state->pg_pipe_res_update[PG_HUBP][new_pipe->stream_res.dsc->inst] = true; + update_state->pg_pipe_res_update[PG_DPP][new_pipe->stream_res.dsc->inst] = true; + + /* All HUBP/DPP instances must be powered if the DSC inst != HUBP inst */ + if (new_pipe->plane_res.hubp && + new_pipe->plane_res.hubp->inst != new_pipe->stream_res.dsc->inst) { + for (j = 0; j < dc->res_pool->pipe_count; ++j) { + update_state->pg_pipe_res_update[PG_HUBP][j] = true; + update_state->pg_pipe_res_update[PG_DPP][j] = true; + } + } + } + } + for (i = dc->res_pool->pipe_count - 1; i >= 0; i--) { if (update_state->pg_pipe_res_update[PG_HUBP][i] && update_state->pg_pipe_res_update[PG_DPP][i]) { From 8724a5380c4390eed81e271d22f34ff06453ded9 Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Thu, 29 May 2025 10:59:19 -0600 Subject: [PATCH 133/284] drm/amd/display: Fix mpv playback corruption on weston [WHAT] Severe video playback corruption is observed in the following setup: weston 14.0.90 (built from source) + mpv v0.40.0 with command: mpv bbb_sunflower_1080p_60fps_normal.mp4 --vo=gpu [HOW] ABGR16161616 needs to be included in dml2/2.1 translation. Cc: Mario Limonciello Cc: Alex Deucher Acked-by: Aurabindo Pillai Reviewed-by: Harry Wentland Reviewed-by: Austin Zheng Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit d023de809f85307ca819a9dbbceee6ae1f50e2ad) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c | 1 + drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c index d47cacfdb695..2aa6d44bb359 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_translation_helper.c @@ -788,6 +788,7 @@ static void populate_dml21_plane_config_from_plane_state(struct dml2_context *dm plane->pixel_format = dml2_420_10; break; case SURFACE_PIXEL_FORMAT_GRPH_ARGB16161616: + case SURFACE_PIXEL_FORMAT_GRPH_ABGR16161616: case SURFACE_PIXEL_FORMAT_GRPH_ARGB16161616F: case SURFACE_PIXEL_FORMAT_GRPH_ABGR16161616F: plane->pixel_format = dml2_444_64; diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c index 5de775fd8fce..208630754c8a 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_translation_helper.c @@ -953,6 +953,7 @@ static void populate_dml_surface_cfg_from_plane_state(enum dml_project_id dml2_p out->SourcePixelFormat[location] = dml_420_10; break; case SURFACE_PIXEL_FORMAT_GRPH_ARGB16161616: + case SURFACE_PIXEL_FORMAT_GRPH_ABGR16161616: case SURFACE_PIXEL_FORMAT_GRPH_ARGB16161616F: case SURFACE_PIXEL_FORMAT_GRPH_ABGR16161616F: out->SourcePixelFormat[location] = dml_444_64; From 158f9944ac05dafd2d3a23d0688e6cf40ef68b90 Mon Sep 17 00:00:00 2001 From: Yihan Zhu Date: Tue, 27 May 2025 16:47:40 -0400 Subject: [PATCH 134/284] drm/amd/display: Fix RMCM programming seq errors [WHY & HOW] Fix RMCM programming sequence errors and mapping issues to pass the RMCM test. Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Dmytro Laktyushkin Signed-off-by: Yihan Zhu Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 11baa4975025033547f45f5894087a0dda6efbb8) Cc: stable@vger.kernel.org --- .../dc/dml2/dml21/src/dml2_core/dml2_core_dcn4_calcs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_core/dml2_core_dcn4_calcs.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_core/dml2_core_dcn4_calcs.c index c4dad7164d31..5b62cd19d979 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_core/dml2_core_dcn4_calcs.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/src/dml2_core/dml2_core_dcn4_calcs.c @@ -4685,7 +4685,10 @@ static void calculate_tdlut_setting( //the tdlut is fetched during the 2 row times of prefetch. if (p->setup_for_tdlut) { *p->tdlut_groups_per_2row_ub = (unsigned int)math_ceil2((double) *p->tdlut_bytes_per_frame / *p->tdlut_bytes_per_group, 1); - *p->tdlut_opt_time = (*p->tdlut_bytes_per_frame - p->cursor_buffer_size * 1024) / tdlut_drain_rate; + if (*p->tdlut_bytes_per_frame > p->cursor_buffer_size * 1024) + *p->tdlut_opt_time = (*p->tdlut_bytes_per_frame - p->cursor_buffer_size * 1024) / tdlut_drain_rate; + else + *p->tdlut_opt_time = 0; *p->tdlut_drain_time = p->cursor_buffer_size * 1024 / tdlut_drain_rate; *p->tdlut_bytes_to_deliver = (unsigned int) (p->cursor_buffer_size * 1024.0); } From ffcaed1d7ecef31198000dfbbea791f30f7ca437 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 29 May 2025 11:33:44 -0500 Subject: [PATCH 135/284] drm/amd/display: Only read ACPI backlight caps once [WHY] Backlight caps are read already in amdgpu_dm_update_backlight_caps(). They may be updated by update_connector_ext_caps(). Reading again when registering backlight device may cause wrong values to be used. [HOW] Use backlight caps already registered to the dm. Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Roman Li Signed-off-by: Mario Limonciello Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 148144f6d2f14b02eaaa39b86bbe023cbff350bd) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index d3100f641ac6..0b5d5ab14a69 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4908,7 +4908,7 @@ amdgpu_dm_register_backlight_device(struct amdgpu_dm_connector *aconnector) struct drm_device *drm = aconnector->base.dev; struct amdgpu_display_manager *dm = &drm_to_adev(drm)->dm; struct backlight_properties props = { 0 }; - struct amdgpu_dm_backlight_caps caps = { 0 }; + struct amdgpu_dm_backlight_caps *caps; char bl_name[16]; int min, max; @@ -4922,20 +4922,20 @@ amdgpu_dm_register_backlight_device(struct amdgpu_dm_connector *aconnector) return; } - amdgpu_acpi_get_backlight_caps(&caps); - if (caps.caps_valid && get_brightness_range(&caps, &min, &max)) { + caps = &dm->backlight_caps[aconnector->bl_idx]; + if (get_brightness_range(caps, &min, &max)) { if (power_supply_is_system_supplied() > 0) - props.brightness = (max - min) * DIV_ROUND_CLOSEST(caps.ac_level, 100); + props.brightness = (max - min) * DIV_ROUND_CLOSEST(caps->ac_level, 100); else - props.brightness = (max - min) * DIV_ROUND_CLOSEST(caps.dc_level, 100); + props.brightness = (max - min) * DIV_ROUND_CLOSEST(caps->dc_level, 100); /* min is zero, so max needs to be adjusted */ props.max_brightness = max - min; drm_dbg(drm, "Backlight caps: min: %d, max: %d, ac %d, dc %d\n", min, max, - caps.ac_level, caps.dc_level); + caps->ac_level, caps->dc_level); } else props.brightness = AMDGPU_MAX_BL_LEVEL; - if (caps.data_points && !(amdgpu_dc_debug_mask & DC_DISABLE_CUSTOM_BRIGHTNESS_CURVE)) + if (caps->data_points && !(amdgpu_dc_debug_mask & DC_DISABLE_CUSTOM_BRIGHTNESS_CURVE)) drm_info(drm, "Using custom brightness curve\n"); props.max_brightness = AMDGPU_MAX_BL_LEVEL; props.type = BACKLIGHT_RAW; From 16dc8bc27c2aa3c93905d3e885e27f1e3535f09a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 29 May 2025 09:46:32 -0500 Subject: [PATCH 136/284] drm/amd/display: Export full brightness range to userspace [WHY] Userspace currently is offered a range from 0-0xFF but the PWM is programmed from 0-0xFFFF. This can be limiting to some software that wants to apply greater granularity. [HOW] Convert internally to firmware values only when mapping custom brightness curves because these are in 0-0xFF range. Advertise full PWM range to userspace. Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Roman Li Signed-off-by: Mario Limonciello Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 8dbd72cb790058ce52279af38a43c2b302fdd3e5) Cc: stable@vger.kernel.org --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 43 ++++++++++++------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 0b5d5ab14a69..bc4cd11bfc79 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -4718,9 +4718,23 @@ static int get_brightness_range(const struct amdgpu_dm_backlight_caps *caps, return 1; } -static void convert_custom_brightness(const struct amdgpu_dm_backlight_caps *caps, - uint32_t *brightness) +/* Rescale from [min..max] to [0..AMDGPU_MAX_BL_LEVEL] */ +static inline u32 scale_input_to_fw(int min, int max, u64 input) { + return DIV_ROUND_CLOSEST_ULL(input * AMDGPU_MAX_BL_LEVEL, max - min); +} + +/* Rescale from [0..AMDGPU_MAX_BL_LEVEL] to [min..max] */ +static inline u32 scale_fw_to_input(int min, int max, u64 input) +{ + return min + DIV_ROUND_CLOSEST_ULL(input * (max - min), AMDGPU_MAX_BL_LEVEL); +} + +static void convert_custom_brightness(const struct amdgpu_dm_backlight_caps *caps, + unsigned int min, unsigned int max, + uint32_t *user_brightness) +{ + u32 brightness = scale_input_to_fw(min, max, *user_brightness); u8 prev_signal = 0, prev_lum = 0; int i = 0; @@ -4731,7 +4745,7 @@ static void convert_custom_brightness(const struct amdgpu_dm_backlight_caps *cap return; /* choose start to run less interpolation steps */ - if (caps->luminance_data[caps->data_points/2].input_signal > *brightness) + if (caps->luminance_data[caps->data_points/2].input_signal > brightness) i = caps->data_points/2; do { u8 signal = caps->luminance_data[i].input_signal; @@ -4742,17 +4756,18 @@ static void convert_custom_brightness(const struct amdgpu_dm_backlight_caps *cap * brightness < signal: interpolate between previous and current luminance numerator * brightness > signal: find next data point */ - if (*brightness > signal) { + if (brightness > signal) { prev_signal = signal; prev_lum = lum; i++; continue; } - if (*brightness < signal) + if (brightness < signal) lum = prev_lum + DIV_ROUND_CLOSEST((lum - prev_lum) * - (*brightness - prev_signal), + (brightness - prev_signal), signal - prev_signal); - *brightness = DIV_ROUND_CLOSEST(lum * *brightness, 101); + *user_brightness = scale_fw_to_input(min, max, + DIV_ROUND_CLOSEST(lum * brightness, 101)); return; } while (i < caps->data_points); } @@ -4765,11 +4780,10 @@ static u32 convert_brightness_from_user(const struct amdgpu_dm_backlight_caps *c if (!get_brightness_range(caps, &min, &max)) return brightness; - convert_custom_brightness(caps, &brightness); + convert_custom_brightness(caps, min, max, &brightness); - // Rescale 0..255 to min..max - return min + DIV_ROUND_CLOSEST((max - min) * brightness, - AMDGPU_MAX_BL_LEVEL); + // Rescale 0..max to min..max + return min + DIV_ROUND_CLOSEST_ULL((u64)(max - min) * brightness, max); } static u32 convert_brightness_to_user(const struct amdgpu_dm_backlight_caps *caps, @@ -4782,8 +4796,8 @@ static u32 convert_brightness_to_user(const struct amdgpu_dm_backlight_caps *cap if (brightness < min) return 0; - // Rescale min..max to 0..255 - return DIV_ROUND_CLOSEST(AMDGPU_MAX_BL_LEVEL * (brightness - min), + // Rescale min..max to 0..max + return DIV_ROUND_CLOSEST_ULL((u64)max * (brightness - min), max - min); } @@ -4933,11 +4947,10 @@ amdgpu_dm_register_backlight_device(struct amdgpu_dm_connector *aconnector) drm_dbg(drm, "Backlight caps: min: %d, max: %d, ac %d, dc %d\n", min, max, caps->ac_level, caps->dc_level); } else - props.brightness = AMDGPU_MAX_BL_LEVEL; + props.brightness = props.max_brightness = AMDGPU_MAX_BL_LEVEL; if (caps->data_points && !(amdgpu_dc_debug_mask & DC_DISABLE_CUSTOM_BRIGHTNESS_CURVE)) drm_info(drm, "Using custom brightness curve\n"); - props.max_brightness = AMDGPU_MAX_BL_LEVEL; props.type = BACKLIGHT_RAW; snprintf(bl_name, sizeof(bl_name), "amdgpu_bl%d", From 0bbf5fd86c585d437b75003f11365b324360a5d6 Mon Sep 17 00:00:00 2001 From: Frank Min Date: Wed, 4 Jun 2025 21:00:44 +0800 Subject: [PATCH 137/284] drm/amdgpu: Add kicker device detection 1. add kicker device list 2. add kicker device checking helper function Signed-off-by: Frank Min Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit 09aa2b408f4ab689c3541d22b0968de0392ee406) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 17 +++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h | 6 ++++++ 2 files changed, 23 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c index 2505c46a9c3d..eaddc441c51a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c @@ -30,6 +30,10 @@ #define AMDGPU_UCODE_NAME_MAX (128) +static const struct kicker_device kicker_device_list[] = { + {0x744B, 0x00}, +}; + static void amdgpu_ucode_print_common_hdr(const struct common_firmware_header *hdr) { DRM_DEBUG("size_bytes: %u\n", le32_to_cpu(hdr->size_bytes)); @@ -1387,6 +1391,19 @@ static const char *amdgpu_ucode_legacy_naming(struct amdgpu_device *adev, int bl return NULL; } +bool amdgpu_is_kicker_fw(struct amdgpu_device *adev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(kicker_device_list); i++) { + if (adev->pdev->device == kicker_device_list[i].device && + adev->pdev->revision == kicker_device_list[i].revision) + return true; + } + + return false; +} + void amdgpu_ucode_ip_version_decode(struct amdgpu_device *adev, int block_type, char *ucode_prefix, int len) { int maj, min, rev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h index 9e89c3487be5..6349aad6da35 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h @@ -605,6 +605,11 @@ struct amdgpu_firmware { uint32_t pldm_version; }; +struct kicker_device{ + unsigned short device; + u8 revision; +}; + void amdgpu_ucode_print_mc_hdr(const struct common_firmware_header *hdr); void amdgpu_ucode_print_smc_hdr(const struct common_firmware_header *hdr); void amdgpu_ucode_print_imu_hdr(const struct common_firmware_header *hdr); @@ -632,5 +637,6 @@ amdgpu_ucode_get_load_type(struct amdgpu_device *adev, int load_type); const char *amdgpu_ucode_name(enum AMDGPU_UCODE_ID ucode_id); void amdgpu_ucode_ip_version_decode(struct amdgpu_device *adev, int block_type, char *ucode_prefix, int len); +bool amdgpu_is_kicker_fw(struct amdgpu_device *adev); #endif From 854171405e7f093532b33d8ed0875b9e34fc55b4 Mon Sep 17 00:00:00 2001 From: Frank Min Date: Wed, 4 Jun 2025 21:17:05 +0800 Subject: [PATCH 138/284] drm/amdgpu: add kicker fws loading for gfx11/smu13/psp13 1. Add kicker firmwares loading for gfx11/smu13/psp13 2. Register additional MODULE_FIRMWARE entries for kicker fws - gc_11_0_0_rlc_kicker.bin - gc_11_0_0_imu_kicker.bin - psp_13_0_0_sos_kicker.bin - psp_13_0_0_ta_kicker.bin - smu_13_0_0_kicker.bin Signed-off-by: Frank Min Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher (cherry picked from commit fb5ec2174d70a8989bc207d257db90ffeca3b163) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 16 ++++++++++++---- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 5 +++++ drivers/gpu/drm/amd/amdgpu/imu_v11_0.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 2 ++ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 12 +++++++++--- 5 files changed, 35 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index e6f0b035e20b..c14f63cefe67 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -3522,8 +3522,12 @@ int psp_init_sos_microcode(struct psp_context *psp, const char *chip_name) uint8_t *ucode_array_start_addr; int err = 0; - err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, AMDGPU_UCODE_REQUIRED, - "amdgpu/%s_sos.bin", chip_name); + if (amdgpu_is_kicker_fw(adev)) + err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_sos_kicker.bin", chip_name); + else + err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_sos.bin", chip_name); if (err) goto out; @@ -3799,8 +3803,12 @@ int psp_init_ta_microcode(struct psp_context *psp, const char *chip_name) struct amdgpu_device *adev = psp->adev; int err; - err = amdgpu_ucode_request(adev, &adev->psp.ta_fw, AMDGPU_UCODE_REQUIRED, - "amdgpu/%s_ta.bin", chip_name); + if (amdgpu_is_kicker_fw(adev)) + err = amdgpu_ucode_request(adev, &adev->psp.ta_fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_ta_kicker.bin", chip_name); + else + err = amdgpu_ucode_request(adev, &adev->psp.ta_fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_ta.bin", chip_name); if (err) return err; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index afd6d59164bf..ec9b84f92d46 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -85,6 +85,7 @@ MODULE_FIRMWARE("amdgpu/gc_11_0_0_pfp.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_0_me.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_0_mec.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_0_rlc.bin"); +MODULE_FIRMWARE("amdgpu/gc_11_0_0_rlc_kicker.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_0_rlc_1.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_0_toc.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_1_pfp.bin"); @@ -759,6 +760,10 @@ static int gfx_v11_0_init_microcode(struct amdgpu_device *adev) err = amdgpu_ucode_request(adev, &adev->gfx.rlc_fw, AMDGPU_UCODE_REQUIRED, "amdgpu/gc_11_0_0_rlc_1.bin"); + else if (amdgpu_is_kicker_fw(adev)) + err = amdgpu_ucode_request(adev, &adev->gfx.rlc_fw, + AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_rlc_kicker.bin", ucode_prefix); else err = amdgpu_ucode_request(adev, &adev->gfx.rlc_fw, AMDGPU_UCODE_REQUIRED, diff --git a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c index cfa91d709d49..cc626036ed9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/imu_v11_0.c @@ -32,6 +32,7 @@ #include "gc/gc_11_0_0_sh_mask.h" MODULE_FIRMWARE("amdgpu/gc_11_0_0_imu.bin"); +MODULE_FIRMWARE("amdgpu/gc_11_0_0_imu_kicker.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_1_imu.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_2_imu.bin"); MODULE_FIRMWARE("amdgpu/gc_11_0_3_imu.bin"); @@ -51,8 +52,12 @@ static int imu_v11_0_init_microcode(struct amdgpu_device *adev) DRM_DEBUG("\n"); amdgpu_ucode_ip_version_decode(adev, GC_HWIP, ucode_prefix, sizeof(ucode_prefix)); - err = amdgpu_ucode_request(adev, &adev->gfx.imu_fw, AMDGPU_UCODE_REQUIRED, - "amdgpu/%s_imu.bin", ucode_prefix); + if (amdgpu_is_kicker_fw(adev)) + err = amdgpu_ucode_request(adev, &adev->gfx.imu_fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_imu_kicker.bin", ucode_prefix); + else + err = amdgpu_ucode_request(adev, &adev->gfx.imu_fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_imu.bin", ucode_prefix); if (err) goto out; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index df612fd9cc50..ead616c11705 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -42,7 +42,9 @@ MODULE_FIRMWARE("amdgpu/psp_13_0_5_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_8_toc.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_8_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_0_sos.bin"); +MODULE_FIRMWARE("amdgpu/psp_13_0_0_sos_kicker.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_0_ta.bin"); +MODULE_FIRMWARE("amdgpu/psp_13_0_0_ta_kicker.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_7_sos.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_7_ta.bin"); MODULE_FIRMWARE("amdgpu/psp_13_0_10_sos.bin"); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index a7167668d189..1c7235935d14 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -58,6 +58,7 @@ MODULE_FIRMWARE("amdgpu/aldebaran_smc.bin"); MODULE_FIRMWARE("amdgpu/smu_13_0_0.bin"); +MODULE_FIRMWARE("amdgpu/smu_13_0_0_kicker.bin"); MODULE_FIRMWARE("amdgpu/smu_13_0_7.bin"); MODULE_FIRMWARE("amdgpu/smu_13_0_10.bin"); @@ -92,7 +93,7 @@ const int pmfw_decoded_link_width[7] = {0, 1, 2, 4, 8, 12, 16}; int smu_v13_0_init_microcode(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - char ucode_prefix[15]; + char ucode_prefix[30]; int err = 0; const struct smc_firmware_header_v1_0 *hdr; const struct common_firmware_header *header; @@ -103,8 +104,13 @@ int smu_v13_0_init_microcode(struct smu_context *smu) return 0; amdgpu_ucode_ip_version_decode(adev, MP1_HWIP, ucode_prefix, sizeof(ucode_prefix)); - err = amdgpu_ucode_request(adev, &adev->pm.fw, AMDGPU_UCODE_REQUIRED, - "amdgpu/%s.bin", ucode_prefix); + + if (amdgpu_is_kicker_fw(adev)) + err = amdgpu_ucode_request(adev, &adev->pm.fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s_kicker.bin", ucode_prefix); + else + err = amdgpu_ucode_request(adev, &adev->pm.fw, AMDGPU_UCODE_REQUIRED, + "amdgpu/%s.bin", ucode_prefix); if (err) goto out; From 09b585592fa481384597c81388733aed4a04dd05 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Wed, 11 Jun 2025 15:02:09 +0800 Subject: [PATCH 139/284] drm/amdgpu: Fix SDMA engine reset with logical instance ID This commit makes the following improvements to SDMA engine reset handling: 1. Clarifies in the function documentation that instance_id refers to a logical ID 2. Adds conversion from logical to physical instance ID before performing reset using GET_INST(SDMA0, instance_id) macro 3. Improves error messaging to indicate when a logical instance reset fails 4. Adds better code organization with blank lines for readability The change ensures proper SDMA engine reset by using the correct physical instance ID while maintaining the logical ID interface for callers. V2: Remove harvest_config check and convert directly to physical instance (Lijo) Suggested-by: Jonathan Kim Signed-off-by: Jesse Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 5efa6217c239ed1ceec0f0414f9b6f6927387dfc) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 6716ac281c49..9b54a1ece447 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -540,8 +540,10 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id) case IP_VERSION(4, 4, 2): case IP_VERSION(4, 4, 4): case IP_VERSION(4, 4, 5): - /* For SDMA 4.x, use the existing DPM interface for backward compatibility */ - r = amdgpu_dpm_reset_sdma(adev, 1 << instance_id); + /* For SDMA 4.x, use the existing DPM interface for backward compatibility, + * we need to convert the logical instance ID to physical instance ID before reset. + */ + r = amdgpu_dpm_reset_sdma(adev, 1 << GET_INST(SDMA0, instance_id)); break; case IP_VERSION(5, 0, 0): case IP_VERSION(5, 0, 1): @@ -568,7 +570,7 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id) /** * amdgpu_sdma_reset_engine - Reset a specific SDMA engine * @adev: Pointer to the AMDGPU device - * @instance_id: ID of the SDMA engine instance to reset + * @instance_id: Logical ID of the SDMA engine instance to reset * * Returns: 0 on success, or a negative error code on failure. */ @@ -601,7 +603,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id) /* Perform the SDMA reset for the specified instance */ ret = amdgpu_sdma_soft_reset(adev, instance_id); if (ret) { - dev_err(adev->dev, "Failed to reset SDMA instance %u\n", instance_id); + dev_err(adev->dev, "Failed to reset SDMA logical instance %u\n", instance_id); goto exit; } From caade9d69f2e2b76d2e2d47089736a99135b8772 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Wed, 11 Jun 2025 15:07:11 +0800 Subject: [PATCH 140/284] drm/amdgpu: Use logical instance ID for SDMA v4_4_2 queue operations Simplify SDMA v4_4_2 queue reset and stop operations by: 1. Removing GET_INST(SDMA0) conversion for ring->me 2. Using the logical instance ID (ring->me) directly 3. Maintaining consistent behavior with other SDMA queue operations This change aligns with the existing queue handling logic where ring->me already represents the correct instance identifier. Signed-off-by: Lijo Lazar Signed-off-by: Jesse Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 3bab282dfe25dff7a55add432f56898505a6cc6c) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 9c169112a5e7..3de125062ee3 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1670,7 +1670,7 @@ static bool sdma_v4_4_2_page_ring_is_guilty(struct amdgpu_ring *ring) static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid) { struct amdgpu_device *adev = ring->adev; - u32 id = GET_INST(SDMA0, ring->me); + u32 id = ring->me; int r; if (!(adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE)) @@ -1686,7 +1686,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid) static int sdma_v4_4_2_stop_queue(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; - u32 instance_id = GET_INST(SDMA0, ring->me); + u32 instance_id = ring->me; u32 inst_mask; uint64_t rptr; From 46e15197b513e60786a44107759d6ca293d6288c Mon Sep 17 00:00:00 2001 From: Sonny Jiang Date: Thu, 12 Jun 2025 11:01:08 -0400 Subject: [PATCH 141/284] drm/amdgpu: VCN v5_0_1 to prevent FW checking RB during DPG pause Add a protection to ensure programming are all complete prior VCPU starting. This is a WA for an unintended VCPU running. Signed-off-by: Sonny Jiang Acked-by: Leo Liu Reviewed-by: Ruijing Dong Signed-off-by: Alex Deucher (cherry picked from commit c29521b529fa5e225feaf709d863a636ca0cbbfa) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c index 338cf43c45fe..cdefd7fcb0da 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v5_0_1.c @@ -669,6 +669,9 @@ static int vcn_v5_0_1_start_dpg_mode(struct amdgpu_vcn_inst *vinst, if (indirect) amdgpu_vcn_psp_update_sram(adev, inst_idx, AMDGPU_UCODE_ID_VCN0_RAM); + /* resetting ring, fw should not check RB ring */ + fw_shared->sq.queue_mode |= FW_QUEUE_RING_RESET; + /* Pause dpg */ vcn_v5_0_1_pause_dpg_mode(vinst, &state); @@ -681,7 +684,7 @@ static int vcn_v5_0_1_start_dpg_mode(struct amdgpu_vcn_inst *vinst, tmp = RREG32_SOC15(VCN, vcn_inst, regVCN_RB_ENABLE); tmp &= ~(VCN_RB_ENABLE__RB1_EN_MASK); WREG32_SOC15(VCN, vcn_inst, regVCN_RB_ENABLE, tmp); - fw_shared->sq.queue_mode |= FW_QUEUE_RING_RESET; + WREG32_SOC15(VCN, vcn_inst, regUVD_RB_RPTR, 0); WREG32_SOC15(VCN, vcn_inst, regUVD_RB_WPTR, 0); @@ -692,6 +695,7 @@ static int vcn_v5_0_1_start_dpg_mode(struct amdgpu_vcn_inst *vinst, tmp = RREG32_SOC15(VCN, vcn_inst, regVCN_RB_ENABLE); tmp |= VCN_RB_ENABLE__RB1_EN_MASK; WREG32_SOC15(VCN, vcn_inst, regVCN_RB_ENABLE, tmp); + /* resetting done, fw can check RB ring */ fw_shared->sq.queue_mode &= ~(FW_QUEUE_RING_RESET | FW_QUEUE_DPG_HOLD_OFF); WREG32_SOC15(VCN, vcn_inst, regVCN_RB1_DB_CTRL, From b669507b637eb6b1aaecf347f193efccc65d756e Mon Sep 17 00:00:00 2001 From: Alex Hung Date: Tue, 3 Jun 2025 18:30:55 -0600 Subject: [PATCH 142/284] drm/amd/display: Check dce_hwseq before dereferencing it [WHAT] hws was checked for null earlier in dce110_blank_stream, indicating hws can be null, and should be checked whenever it is used. Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Aurabindo Pillai Signed-off-by: Alex Hung Signed-off-by: Aurabindo Pillai Signed-off-by: Alex Deucher (cherry picked from commit 79db43611ff61280b6de58ce1305e0b2ecf675ad) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c index e8730cc40edb..38e17b1796e1 100644 --- a/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/hwss/dce110/dce110_hwseq.c @@ -1225,7 +1225,7 @@ void dce110_blank_stream(struct pipe_ctx *pipe_ctx) return; if (link->local_sink && link->local_sink->sink_signal == SIGNAL_TYPE_EDP) { - if (!link->skip_implict_edp_power_control) + if (!link->skip_implict_edp_power_control && hws) hws->funcs.edp_backlight_control(link, false); link->dc->hwss.set_abm_immediate_disable(pipe_ctx); } From 785c536c31c0bb057252fddedb30d79ae4979f4b Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 6 Jun 2025 10:29:28 +0530 Subject: [PATCH 143/284] drm/amdgpu: Release reset locks during failures Make sure to release reset domain lock in case of failures. Signed-off-by: Lijo Lazar Signed-off-by: Ce Sun Reviewed-by: Hawking Zhang Fixes: 11bb33766f66 ("drm/amdgpu: refactor amdgpu_device_gpu_recover") Signed-off-by: Alex Deucher (cherry picked from commit 1ab11a82681eb33a66f423216cb063e7f40c6f85) --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 80 +++++++++++++++------- 1 file changed, 55 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e1bab6a96cb6..500bf85b3ee8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6019,16 +6019,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle) return ret; } -static int amdgpu_device_halt_activities(struct amdgpu_device *adev, - struct amdgpu_job *job, - struct amdgpu_reset_context *reset_context, - struct list_head *device_list, - struct amdgpu_hive_info *hive, - bool need_emergency_restart) +static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev, + struct list_head *device_list, + struct amdgpu_hive_info *hive) { - struct list_head *device_list_handle = NULL; struct amdgpu_device *tmp_adev = NULL; - int i, r = 0; + int r; /* * Build list of devices to reset. @@ -6045,26 +6041,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev, } if (!list_is_first(&adev->reset_list, device_list)) list_rotate_to_front(&adev->reset_list, device_list); - device_list_handle = device_list; } else { list_add_tail(&adev->reset_list, device_list); - device_list_handle = device_list; } if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) { - r = amdgpu_device_health_check(device_list_handle); + r = amdgpu_device_health_check(device_list); if (r) return r; } - /* We need to lock reset domain only once both for XGMI and single device */ - tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, - reset_list); + return 0; +} + +static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + if (list_empty(device_list)) + return; + tmp_adev = + list_first_entry(device_list, struct amdgpu_device, reset_list); amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); +} + +static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev, + struct list_head *device_list) +{ + struct amdgpu_device *tmp_adev = NULL; + + if (list_empty(device_list)) + return; + tmp_adev = + list_first_entry(device_list, struct amdgpu_device, reset_list); + amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); +} + +static int amdgpu_device_halt_activities( + struct amdgpu_device *adev, struct amdgpu_job *job, + struct amdgpu_reset_context *reset_context, + struct list_head *device_list, struct amdgpu_hive_info *hive, + bool need_emergency_restart) +{ + struct amdgpu_device *tmp_adev = NULL; + int i, r = 0; /* block all schedulers and reset given job's ring */ - list_for_each_entry(tmp_adev, device_list_handle, reset_list) { - + list_for_each_entry(tmp_adev, device_list, reset_list) { amdgpu_device_set_mp1_state(tmp_adev); /* @@ -6252,11 +6276,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev, amdgpu_ras_set_error_query_ready(tmp_adev, true); } - - tmp_adev = list_first_entry(device_list, struct amdgpu_device, - reset_list); - amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); - } @@ -6324,10 +6343,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, reset_context->hive = hive; INIT_LIST_HEAD(&device_list); + if (amdgpu_device_recovery_prepare(adev, &device_list, hive)) + goto end_reset; + + /* We need to lock reset domain only once both for XGMI and single device */ + amdgpu_device_recovery_get_reset_lock(adev, &device_list); + r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list, hive, need_emergency_restart); if (r) - goto end_reset; + goto reset_unlock; if (need_emergency_restart) goto skip_sched_resume; @@ -6345,13 +6370,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, r = amdgpu_device_asic_reset(adev, &device_list, reset_context); if (r) - goto end_reset; + goto reset_unlock; skip_hw_reset: r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled); if (r) - goto end_reset; + goto reset_unlock; skip_sched_resume: amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart); +reset_unlock: + amdgpu_device_recovery_put_reset_lock(adev, &device_list); end_reset: if (hive) { mutex_unlock(&hive->hive_lock); @@ -6763,6 +6790,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta memset(&reset_context, 0, sizeof(reset_context)); INIT_LIST_HEAD(&device_list); + amdgpu_device_recovery_prepare(adev, &device_list, hive); + amdgpu_device_recovery_get_reset_lock(adev, &device_list); r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list, hive, false); if (hive) { @@ -6880,8 +6909,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) if (hive) { list_for_each_entry(tmp_adev, &device_list, reset_list) amdgpu_device_unset_mp1_state(tmp_adev); - amdgpu_device_unlock_reset_domain(adev->reset_domain); } + amdgpu_device_recovery_put_reset_lock(adev, &device_list); } if (hive) { @@ -6927,6 +6956,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) amdgpu_device_sched_resume(&device_list, NULL, NULL); amdgpu_device_gpu_resume(adev, &device_list, false); + amdgpu_device_recovery_put_reset_lock(adev, &device_list); adev->pcie_reset_ctx.occurs_dpc = false; if (hive) { From 7f3b16f3f229e37cc3e02e9e4e7106c523b119e9 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Mon, 16 Jun 2025 19:21:41 +0800 Subject: [PATCH 144/284] drm/amdgpu: Fix SDMA UTC_L1 handling during start/stop sequences This commit makes two key fixes to SDMA v4.4.2 handling: 1. disable UTC_L1 in sdma_cntl register when stopping SDMA engines by reading the current value before modifying UTC_L1_ENABLE bit. 2. Ensure UTC_L1_ENABLE is consistently managed by: - Adding the missing register write when enabling UTC_L1 during start - Keeping UTC_L1 enabled by default as per hardware requirements v2: Correct SDMA_CNTL setting (Philip) Suggested-by: Jonathan Kim Signed-off-by: Jesse Zhang Acked-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 375bf564654e85a7b1b0657b191645b3edca1bda) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 3de125062ee3..cef68df4c663 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -490,7 +490,7 @@ static void sdma_v4_4_2_inst_gfx_stop(struct amdgpu_device *adev, { struct amdgpu_ring *sdma[AMDGPU_MAX_SDMA_INSTANCES]; u32 doorbell_offset, doorbell; - u32 rb_cntl, ib_cntl; + u32 rb_cntl, ib_cntl, sdma_cntl; int i; for_each_inst(i, inst_mask) { @@ -502,6 +502,9 @@ static void sdma_v4_4_2_inst_gfx_stop(struct amdgpu_device *adev, ib_cntl = RREG32_SDMA(i, regSDMA_GFX_IB_CNTL); ib_cntl = REG_SET_FIELD(ib_cntl, SDMA_GFX_IB_CNTL, IB_ENABLE, 0); WREG32_SDMA(i, regSDMA_GFX_IB_CNTL, ib_cntl); + sdma_cntl = RREG32_SDMA(i, regSDMA_CNTL); + sdma_cntl = REG_SET_FIELD(sdma_cntl, SDMA_CNTL, UTC_L1_ENABLE, 0); + WREG32_SDMA(i, regSDMA_CNTL, sdma_cntl); if (sdma[i]->use_doorbell) { doorbell = RREG32_SDMA(i, regSDMA_GFX_DOORBELL); @@ -995,6 +998,7 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev, /* set utc l1 enable flag always to 1 */ temp = RREG32_SDMA(i, regSDMA_CNTL); temp = REG_SET_FIELD(temp, SDMA_CNTL, UTC_L1_ENABLE, 1); + WREG32_SDMA(i, regSDMA_CNTL, temp); if (amdgpu_ip_version(adev, SDMA0_HWIP, 0) < IP_VERSION(4, 4, 5)) { /* enable context empty interrupt during initialization */ From ebe43542702c3d15d1a1d95e8e13b1b54076f05a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 2 Jun 2025 11:31:52 -0400 Subject: [PATCH 145/284] drm/amdgpu: switch job hw_fence to amdgpu_fence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the amdgpu fence container so we can store additional data in the fence. This also fixes the start_time handling for MCBP since we were casting the fence to an amdgpu_fence and it wasn't. Fixes: 3f4c175d62d8 ("drm/amdgpu: MCBP based on DRM scheduler (v9)") Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit bf1cd14f9e2e1fdf981eed273ddd595863f5288c) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 30 +++++---------------- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 16 +++++++++++ 6 files changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 8e626f50b362..f81608330a3d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1902,7 +1902,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring) continue; } job = to_amdgpu_job(s_job); - if (preempted && (&job->hw_fence) == fence) + if (preempted && (&job->hw_fence.base) == fence) /* mark the job as preempted */ job->preemption_status |= AMDGPU_IB_PREEMPTED; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 500bf85b3ee8..78f8755996f0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6362,7 +6362,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * * job->base holds a reference to parent fence */ - if (job && dma_fence_is_signaled(&job->hw_fence)) { + if (job && dma_fence_is_signaled(&job->hw_fence.base)) { job_signaled = true; dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); goto skip_hw_reset; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 8cecf25996ed..5fec808d7f54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -41,22 +41,6 @@ #include "amdgpu_trace.h" #include "amdgpu_reset.h" -/* - * Fences mark an event in the GPUs pipeline and are used - * for GPU/CPU synchronization. When the fence is written, - * it is expected that all buffers associated with that fence - * are no longer in use by the associated ring on the GPU and - * that the relevant GPU caches have been flushed. - */ - -struct amdgpu_fence { - struct dma_fence base; - - /* RB, DMA, etc. */ - struct amdgpu_ring *ring; - ktime_t start_timestamp; -}; - static struct kmem_cache *amdgpu_fence_slab; int amdgpu_fence_slab_init(void) @@ -151,12 +135,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC); if (am_fence == NULL) return -ENOMEM; - fence = &am_fence->base; - am_fence->ring = ring; } else { /* take use of job-embedded fence */ - fence = &job->hw_fence; + am_fence = &job->hw_fence; } + fence = &am_fence->base; + am_fence->ring = ring; seq = ++ring->fence_drv.sync_seq; if (job && job->job_run_counter) { @@ -718,7 +702,7 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring) * it right here or we won't be able to track them in fence_drv * and they will remain unsignaled during sa_bo free. */ - job = container_of(old, struct amdgpu_job, hw_fence); + job = container_of(old, struct amdgpu_job, hw_fence.base); if (!job->base.s_fence && !dma_fence_is_signaled(old)) dma_fence_signal(old); RCU_INIT_POINTER(*ptr, NULL); @@ -780,7 +764,7 @@ static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f) static const char *amdgpu_job_fence_get_timeline_name(struct dma_fence *f) { - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence); + struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base); return (const char *)to_amdgpu_ring(job->base.sched)->name; } @@ -810,7 +794,7 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f) */ static bool amdgpu_job_fence_enable_signaling(struct dma_fence *f) { - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence); + struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base); if (!timer_pending(&to_amdgpu_ring(job->base.sched)->fence_drv.fallback_timer)) amdgpu_fence_schedule_fallback(to_amdgpu_ring(job->base.sched)); @@ -845,7 +829,7 @@ static void amdgpu_job_fence_free(struct rcu_head *rcu) struct dma_fence *f = container_of(rcu, struct dma_fence, rcu); /* free job if fence has a parent job */ - kfree(container_of(f, struct amdgpu_job, hw_fence)); + kfree(container_of(f, struct amdgpu_job, hw_fence.base)); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index acb21fc8b3ce..ddb9d3269357 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -272,8 +272,8 @@ void amdgpu_job_free_resources(struct amdgpu_job *job) /* Check if any fences where initialized */ if (job->base.s_fence && job->base.s_fence->finished.ops) f = &job->base.s_fence->finished; - else if (job->hw_fence.ops) - f = &job->hw_fence; + else if (job->hw_fence.base.ops) + f = &job->hw_fence.base; else f = NULL; @@ -290,10 +290,10 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) amdgpu_sync_free(&job->explicit_sync); /* only put the hw fence if has embedded fence */ - if (!job->hw_fence.ops) + if (!job->hw_fence.base.ops) kfree(job); else - dma_fence_put(&job->hw_fence); + dma_fence_put(&job->hw_fence.base); } void amdgpu_job_set_gang_leader(struct amdgpu_job *job, @@ -322,10 +322,10 @@ void amdgpu_job_free(struct amdgpu_job *job) if (job->gang_submit != &job->base.s_fence->scheduled) dma_fence_put(job->gang_submit); - if (!job->hw_fence.ops) + if (!job->hw_fence.base.ops) kfree(job); else - dma_fence_put(&job->hw_fence); + dma_fence_put(&job->hw_fence.base); } struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index f2c049129661..931fed8892cc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -48,7 +48,7 @@ struct amdgpu_job { struct drm_sched_job base; struct amdgpu_vm *vm; struct amdgpu_sync explicit_sync; - struct dma_fence hw_fence; + struct amdgpu_fence hw_fence; struct dma_fence *gang_submit; uint32_t preamble_status; uint32_t preemption_status; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index b95b47110769..e1f25218943a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -127,6 +127,22 @@ struct amdgpu_fence_driver { struct dma_fence **fences; }; +/* + * Fences mark an event in the GPUs pipeline and are used + * for GPU/CPU synchronization. When the fence is written, + * it is expected that all buffers associated with that fence + * are no longer in use by the associated ring on the GPU and + * that the relevant GPU caches have been flushed. + */ + +struct amdgpu_fence { + struct dma_fence base; + + /* RB, DMA, etc. */ + struct amdgpu_ring *ring; + ktime_t start_timestamp; +}; + extern const struct drm_sched_backend_ops amdgpu_sched_ops; void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring); From 49cc5beeabd5b9b6a6660be8ea2e95de30ec0a07 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 16 Jun 2025 15:43:55 -0400 Subject: [PATCH 146/284] drm/amdgpu/sdma5: init engine reset mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Missing the mutex init. Fixes: e56d4bf57fab ("drm/amdgpu/: drm/amdgpu: Register the new sdma function pointers for sdma_v5_0") Reviewed-by: Jesse Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 3f4caf092f02f0de169c6122639af481c7edc8f9) --- drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c index 9505ae96fbec..1813c3ed0aa6 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c @@ -1399,6 +1399,7 @@ static int sdma_v5_0_sw_init(struct amdgpu_ip_block *ip_block) return r; for (i = 0; i < adev->sdma.num_instances; i++) { + mutex_init(&adev->sdma.instance[i].engine_reset_mutex); adev->sdma.instance[i].funcs = &sdma_v5_0_sdma_funcs; ring = &adev->sdma.instance[i].ring; ring->ring_obj = NULL; From cfb05257ae168a0496c7637e1d9e3ab8a25cbffe Mon Sep 17 00:00:00 2001 From: Jay Cornwall Date: Wed, 11 Jun 2025 09:52:14 -0500 Subject: [PATCH 147/284] drm/amdkfd: Fix race in GWS queue scheduling q->gws is not updated atomically with qpd->mapped_gws_queue. If a runlist is created between pqm_set_gws and update_queue it will contain a queue which uses GWS in a process with no GWS allocated. This will result in a scheduler hang. Use q->properties.is_gws which is changed while holding the DQM lock. Signed-off-by: Jay Cornwall Reviewed-by: Harish Kasiviswanathan Signed-off-by: Alex Deucher (cherry picked from commit b98370220eb3110e82248e3354e16a489a492cfb) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index 8fa6489b6f5d..505036968a77 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -240,7 +240,7 @@ static int pm_map_queues_v9(struct packet_manager *pm, uint32_t *buffer, packet->bitfields2.engine_sel = engine_sel__mes_map_queues__compute_vi; - packet->bitfields2.gws_control_queue = q->gws ? 1 : 0; + packet->bitfields2.gws_control_queue = q->properties.is_gws ? 1 : 0; packet->bitfields2.extended_engine_sel = extended_engine_sel__mes_map_queues__legacy_engine_sel; packet->bitfields2.queue_type = From fe79ef3530d3c681ef2d5644a9d1d1a67b21426a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 16 Jun 2025 15:47:13 -0400 Subject: [PATCH 148/284] drm/amdgpu/sdma5.2: init engine reset mutex Missing the mutex init. Fixes: 47454f2dc0bf ("drm/amdgpu: Register the new sdma function pointers for sdma_v5_2") Reviewed-by: Jesse Zhang Signed-off-by: Alex Deucher (cherry picked from commit ea685ff30a51a25dd9be90786933ada49a088f65) --- drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c index a6e612b4a892..23f97da62808 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c @@ -1318,6 +1318,7 @@ static int sdma_v5_2_sw_init(struct amdgpu_ip_block *ip_block) } for (i = 0; i < adev->sdma.num_instances; i++) { + mutex_init(&adev->sdma.instance[i].engine_reset_mutex); adev->sdma.instance[i].funcs = &sdma_v5_2_sdma_funcs; ring = &adev->sdma.instance[i].ring; ring->ring_obj = NULL; From 88efa0de3285be66969b71ec137d9dab1ee19e52 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 19 Jun 2025 00:23:06 +0800 Subject: [PATCH 149/284] EDAC/igen6: Fix NULL pointer dereference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A kernel panic was reported with the following kernel log: EDAC igen6: Expected 2 mcs, but only 1 detected. BUG: unable to handle page fault for address: 000000000000d570 ... Hardware name: Notebook V54x_6x_TU/V54x_6x_TU, BIOS Dasharo (coreboot+UEFI) v0.9.0 07/17/2024 RIP: e030:ecclog_handler+0x7e/0xf0 [igen6_edac] ... igen6_probe+0x2a0/0x343 [igen6_edac] ... igen6_init+0xc5/0xff0 [igen6_edac] ... This issue occurred because one memory controller was disabled by the BIOS but the igen6_edac driver still checked all the memory controllers, including this absent one, to identify the source of the error. Accessing the null MMIO for the absent memory controller resulted in the oops above. Fix this issue by reverting the configuration structure to non-const and updating the field 'res_cfg->num_imc' to reflect the number of detected memory controllers. Fixes: 20e190b1c1fd ("EDAC/igen6: Skip absent memory controllers") Reported-by: Marek Marczykowski-Górecki Closes: https://lore.kernel.org/all/aFFN7RlXkaK_loQb@mail-itl/ Suggested-by: Borislav Petkov Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov (AMD) Tested-by: Marek Marczykowski-Górecki Link: https://lore.kernel.org/r/20250618162307.1523736-1-qiuxu.zhuo@intel.com --- drivers/edac/igen6_edac.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 1930dc00c791..1cb5c67e78ae 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -125,7 +125,7 @@ #define MEM_SLICE_HASH_MASK(v) (GET_BITFIELD(v, 6, 19) << 6) #define MEM_SLICE_HASH_LSB_MASK_BIT(v) GET_BITFIELD(v, 24, 26) -static const struct res_config { +static struct res_config { bool machine_check; /* The number of present memory controllers. */ int num_imc; @@ -479,7 +479,7 @@ static u64 rpl_p_err_addr(u64 ecclog) return ECC_ERROR_LOG_ADDR45(ecclog); } -static const struct res_config ehl_cfg = { +static struct res_config ehl_cfg = { .num_imc = 1, .imc_base = 0x5000, .ibecc_base = 0xdc00, @@ -489,7 +489,7 @@ static const struct res_config ehl_cfg = { .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, }; -static const struct res_config icl_cfg = { +static struct res_config icl_cfg = { .num_imc = 1, .imc_base = 0x5000, .ibecc_base = 0xd800, @@ -499,7 +499,7 @@ static const struct res_config icl_cfg = { .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, }; -static const struct res_config tgl_cfg = { +static struct res_config tgl_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0x5000, @@ -513,7 +513,7 @@ static const struct res_config tgl_cfg = { .err_addr_to_imc_addr = tgl_err_addr_to_imc_addr, }; -static const struct res_config adl_cfg = { +static struct res_config adl_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -524,7 +524,7 @@ static const struct res_config adl_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static const struct res_config adl_n_cfg = { +static struct res_config adl_n_cfg = { .machine_check = true, .num_imc = 1, .imc_base = 0xd800, @@ -535,7 +535,7 @@ static const struct res_config adl_n_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static const struct res_config rpl_p_cfg = { +static struct res_config rpl_p_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -547,7 +547,7 @@ static const struct res_config rpl_p_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static const struct res_config mtl_ps_cfg = { +static struct res_config mtl_ps_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -558,7 +558,7 @@ static const struct res_config mtl_ps_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static const struct res_config mtl_p_cfg = { +static struct res_config mtl_p_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -569,7 +569,7 @@ static const struct res_config mtl_p_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static const struct pci_device_id igen6_pci_tbl[] = { +static struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU7), (kernel_ulong_t)&ehl_cfg }, @@ -1350,9 +1350,11 @@ static int igen6_register_mcis(struct pci_dev *pdev, u64 mchbar) return -ENODEV; } - if (lmc < res_cfg->num_imc) + if (lmc < res_cfg->num_imc) { igen6_printk(KERN_WARNING, "Expected %d mcs, but only %d detected.", res_cfg->num_imc, lmc); + res_cfg->num_imc = lmc; + } return 0; From 306cb65bb0cb243389fcbd0a66907d5bdea07d1e Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Mon, 17 Mar 2025 15:57:27 +0530 Subject: [PATCH 150/284] smb: fix secondary channel creation issue with kerberos by populating hostname when adding channels When mounting a share with kerberos authentication with multichannel support, share mounts correctly, but fails to create secondary channels. This occurs because the hostname is not populated when adding the channels. The hostname is necessary for the userspace cifs.upcall program to retrieve the required credentials and pass it back to kernel, without hostname secondary channels fails establish. Cc: stable@vger.kernel.org Reviewed-by: Shyam Prasad N Signed-off-by: Bharath SM Reported-by: xfuren Link: https://bugzilla.samba.org/show_bug.cgi?id=15824 Signed-off-by: Steve French --- fs/smb/client/sess.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index ec0db32c7d98..330bc3d25bad 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -498,8 +498,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, ctx->domainauto = ses->domainAuto; ctx->domainname = ses->domainName; - /* no hostname for extra channels */ - ctx->server_hostname = ""; + ctx->server_hostname = ses->server->hostname; ctx->username = ses->user_name; ctx->password = ses->password; From 840738eae94864993a735ab677b9795bb8f3b961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sun, 8 Jun 2025 16:10:33 +0200 Subject: [PATCH 151/284] cifs: Remove duplicate fattr->cf_dtype assignment from wsl_to_fattr() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8bd25b61c5a5 ("smb: client: set correct d_type for reparse DFS/DFSR and mount point") deduplicated assignment of fattr->cf_dtype member from all places to end of the function cifs_reparse_point_to_fattr(). The only one missing place which was not deduplicated is wsl_to_fattr(). Fix it. Fixes: 8bd25b61c5a5 ("smb: client: set correct d_type for reparse DFS/DFSR and mount point") Signed-off-by: Pali Rohár Signed-off-by: Steve French --- fs/smb/client/reparse.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index bb25e77c5540..511611206dab 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -1172,7 +1172,6 @@ static bool wsl_to_fattr(struct cifs_open_info_data *data, if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK)) return false; - fattr->cf_dtype = S_DT(fattr->cf_mode); return true; } From 6fcab2791543924d438e7fa49276d0998b0a069f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 18 Jun 2025 14:17:45 +0200 Subject: [PATCH 152/284] ACPICA: Refuse to evaluate a method if arguments are missing As reported in [1], a platform firmware update that increased the number of method parameters and forgot to update a least one of its callers, caused ACPICA to crash due to use-after-free. Since this a result of a clear AML issue that arguably cannot be fixed up by the interpreter (it cannot produce missing data out of thin air), address it by making ACPICA refuse to evaluate a method if the caller attempts to pass fewer arguments than expected to it. Closes: https://github.com/acpica/acpica/issues/1027 [1] Reported-by: Peter Williams Signed-off-by: Rafael J. Wysocki Reviewed-by: Hans de Goede Tested-by: Hans de Goede # Dell XPS 9640 with BIOS 1.12.0 Link: https://patch.msgid.link/5909446.DvuYhMxLoT@rjwysocki.net Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/dsmethod.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/acpi/acpica/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index c8f37f4e6626..fef6fb29ece4 100644 --- a/drivers/acpi/acpica/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -483,6 +483,13 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, return_ACPI_STATUS(AE_NULL_OBJECT); } + if (this_walk_state->num_operands < obj_desc->method.param_count) { + ACPI_ERROR((AE_INFO, "Missing argument for method [%4.4s]", + acpi_ut_get_node_name(method_node))); + + return_ACPI_STATUS(AE_AML_UNINITIALIZED_ARG); + } + /* Init for new method, possibly wait on method mutex */ status = From ba8dac350faf16afc129ce6303ca4feaf083ccb1 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 5 Jun 2025 11:26:33 +0800 Subject: [PATCH 153/284] f2fs: fix to zero post-eof page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fstest reports a f2fs bug: generic/363 42s ... [failed, exit status 1]- output mismatch (see /share/git/fstests/results//generic/363.out.bad) --- tests/generic/363.out 2025-01-12 21:57:40.271440542 +0800 +++ /share/git/fstests/results//generic/363.out.bad 2025-05-19 19:55:58.000000000 +0800 @@ -1,2 +1,78 @@ QA output created by 363 fsx -q -S 0 -e 1 -N 100000 +READ BAD DATA: offset = 0xd6fb, size = 0xf044, fname = /mnt/f2fs/junk +OFFSET GOOD BAD RANGE +0x1540d 0x0000 0x2a25 0x0 +operation# (mod 256) for the bad data may be 37 +0x1540e 0x0000 0x2527 0x1 ... (Run 'diff -u /share/git/fstests/tests/generic/363.out /share/git/fstests/results//generic/363.out.bad' to see the entire diff) Ran: generic/363 Failures: generic/363 Failed 1 of 1 tests The root cause is user can update post-eof page via mmap [1], however, f2fs missed to zero post-eof page in below operations, so, once it expands i_size, then it will include dummy data locates previous post-eof page, so during below operations, we need to zero post-eof page. Operations which can include dummy data after previous i_size after expanding i_size: - write - mapwrite [1] - truncate - fallocate * preallocate * zero_range * insert_range * collapse_range - clone_range (doesn’t support in f2fs) - copy_range (doesn’t support in f2fs) [1] https://man7.org/linux/man-pages/man2/mmap.2.html 'BUG section' Cc: stable@kernel.org Signed-off-by: Chao Yu Reviewed-by: Zhiguo Niu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6bd3de64f2a8..696131e655ed 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,6 +35,17 @@ #include #include +static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +{ + loff_t old_size = i_size_read(inode); + + if (old_size >= new_size) + return; + + /* zero or drop pages only in range of [old_size, new_size] */ + truncate_pagecache(inode, old_size); +} + static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); + filemap_invalidate_unlock(inode->i_mapping); + file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); + folio_lock(folio); if (unlikely(folio->mapping != inode->i_mapping || folio_pos(folio) > i_size_read(inode) || @@ -1109,6 +1125,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, f2fs_down_write(&fi->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + if (attr->ia_size > old_size) + f2fs_zero_post_eof_page(inode, attr->ia_size); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1227,6 +1245,10 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1510,6 +1532,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); truncate_pagecache(inode, offset); @@ -1631,6 +1655,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + filemap_invalidate_lock(mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(mapping); + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1762,6 +1790,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); + + f2fs_zero_post_eof_page(inode, offset + len); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1819,6 +1849,10 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len); + filemap_invalidate_unlock(inode->i_mapping); + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; @@ -4860,6 +4894,10 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) err = file_modified(file); if (err) return err; + + filemap_invalidate_lock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); + filemap_invalidate_unlock(inode->i_mapping); return count; } From d4adf1c9ee7722545450608bcb095fb31512f0c6 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 18 Jun 2025 17:57:40 -0400 Subject: [PATCH 154/284] bpf: Adjust free target to avoid global starvation of LRU map BPF_MAP_TYPE_LRU_HASH can recycle most recent elements well before the map is full, due to percpu reservations and force shrink before neighbor stealing. Once a CPU is unable to borrow from the global map, it will once steal one elem from a neighbor and after that each time flush this one element to the global list and immediately recycle it. Batch value LOCAL_FREE_TARGET (128) will exhaust a 10K element map with 79 CPUs. CPU 79 will observe this behavior even while its neighbors hold 78 * 127 + 1 * 15 == 9921 free elements (99%). CPUs need not be active concurrently. The issue can appear with affinity migration, e.g., irqbalance. Each CPU can reserve and then hold onto its 128 elements indefinitely. Avoid global list exhaustion by limiting aggregate percpu caches to half of map size, by adjusting LOCAL_FREE_TARGET based on cpu count. This change has no effect on sufficiently large tables. Similar to LOCAL_NR_SCANS and lru->nr_scans, introduce a map variable lru->free_target. The extra field fits in a hole in struct bpf_lru. The cacheline is already warm where read in the hot path. The field is only accessed with the lru lock held. Tested-by: Anton Protopopov Signed-off-by: Willem de Bruijn Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20250618215803.3587312-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/map_hash.rst | 8 ++- Documentation/bpf/map_lru_hash_update.dot | 6 +- kernel/bpf/bpf_lru_list.c | 9 ++- kernel/bpf/bpf_lru_list.h | 1 + tools/testing/selftests/bpf/test_lru_map.c | 72 +++++++++++----------- 5 files changed, 52 insertions(+), 44 deletions(-) diff --git a/Documentation/bpf/map_hash.rst b/Documentation/bpf/map_hash.rst index d2343952f2cb..8606bf958a8c 100644 --- a/Documentation/bpf/map_hash.rst +++ b/Documentation/bpf/map_hash.rst @@ -233,10 +233,16 @@ attempts in order to enforce the LRU property which have increasing impacts on other CPUs involved in the following operation attempts: - Attempt to use CPU-local state to batch operations -- Attempt to fetch free nodes from global lists +- Attempt to fetch ``target_free`` free nodes from global lists - Attempt to pull any node from a global list and remove it from the hashmap - Attempt to pull any node from any CPU's list and remove it from the hashmap +The number of nodes to borrow from the global list in a batch, ``target_free``, +depends on the size of the map. Larger batch size reduces lock contention, but +may also exhaust the global structure. The value is computed at map init to +avoid exhaustion, by limiting aggregate reservation by all CPUs to half the map +size. With a minimum of a single element and maximum budget of 128 at a time. + This algorithm is described visually in the following diagram. See the description in commit 3a08c2fd7634 ("bpf: LRU List") for a full explanation of the corresponding operations: diff --git a/Documentation/bpf/map_lru_hash_update.dot b/Documentation/bpf/map_lru_hash_update.dot index a0fee349d29c..ab10058f5b79 100644 --- a/Documentation/bpf/map_lru_hash_update.dot +++ b/Documentation/bpf/map_lru_hash_update.dot @@ -35,18 +35,18 @@ digraph { fn_bpf_lru_list_pop_free_to_local [shape=rectangle,fillcolor=2, label="Flush local pending, Rotate Global list, move - LOCAL_FREE_TARGET + target_free from global -> local"] // Also corresponds to: // fn__local_list_flush() // fn_bpf_lru_list_rotate() fn___bpf_lru_node_move_to_free[shape=diamond,fillcolor=2, - label="Able to free\nLOCAL_FREE_TARGET\nnodes?"] + label="Able to free\ntarget_free\nnodes?"] fn___bpf_lru_list_shrink_inactive [shape=rectangle,fillcolor=3, label="Shrink inactive list up to remaining - LOCAL_FREE_TARGET + target_free (global LRU -> local)"] fn___bpf_lru_list_shrink [shape=diamond,fillcolor=2, label="> 0 entries in\nlocal free list?"] diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 3dabdd137d10..2d6e1c98d8ad 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -337,12 +337,12 @@ static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, list) { __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); - if (++nfree == LOCAL_FREE_TARGET) + if (++nfree == lru->target_free) break; } - if (nfree < LOCAL_FREE_TARGET) - __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, + if (nfree < lru->target_free) + __bpf_lru_list_shrink(lru, l, lru->target_free - nfree, local_free_list(loc_l), BPF_LRU_LOCAL_LIST_T_FREE); @@ -577,6 +577,9 @@ static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); buf += elem_size; } + + lru->target_free = clamp((nr_elems / num_possible_cpus()) / 2, + 1, LOCAL_FREE_TARGET); } static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index cbd8d3720c2b..fe2661a58ea9 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -58,6 +58,7 @@ struct bpf_lru { del_from_htab_func del_from_htab; void *del_arg; unsigned int hash_offset; + unsigned int target_free; unsigned int nr_scans; bool percpu; }; diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index fda7589c5023..4ae83f4b7fc7 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try) return ret; } +/* Inverse of how bpf_common_lru_populate derives target_free from map_size. */ +static unsigned int __map_size(unsigned int tgt_free) +{ + return tgt_free * nr_cpus * 2; +} + /* Size of the LRU map is 2 * Add key=1 (+1 key) * Add key=2 (+1 key) @@ -231,11 +237,11 @@ static void test_lru_sanity0(int map_type, int map_flags) printf("Pass\n"); } -/* Size of the LRU map is 1.5*tgt_free - * Insert 1 to tgt_free (+tgt_free keys) - * Lookup 1 to tgt_free/2 - * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys) - * => 1+tgt_free/2 to LOCALFREE_TARGET will be removed by LRU +/* Verify that unreferenced elements are recycled before referenced ones. + * Insert elements. + * Reference a subset of these. + * Insert more, enough to trigger recycling. + * Verify that unreferenced are recycled. */ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) { @@ -257,7 +263,7 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) batch_size = tgt_free / 2; assert(batch_size * 2 == tgt_free); - map_size = tgt_free + batch_size; + map_size = __map_size(tgt_free) + batch_size; lru_map_fd = create_map(map_type, map_flags, map_size); assert(lru_map_fd != -1); @@ -266,13 +272,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to tgt_free (+tgt_free keys) */ - end_key = 1 + tgt_free; + /* Insert map_size - batch_size keys */ + end_key = 1 + __map_size(tgt_free); for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Lookup 1 to tgt_free/2 */ + /* Lookup 1 to batch_size */ end_key = 1 + batch_size; for (key = 1; key < end_key; key++) { assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value)); @@ -280,12 +286,13 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) BPF_NOEXIST)); } - /* Insert 1+tgt_free to 2*tgt_free - * => 1+tgt_free/2 to LOCALFREE_TARGET will be + /* Insert another map_size - batch_size keys + * Map will contain 1 to batch_size plus these latest, i.e., + * => previous 1+batch_size to map_size - batch_size will have been * removed by LRU */ - key = 1 + tgt_free; - end_key = key + tgt_free; + key = 1 + __map_size(tgt_free); + end_key = key + __map_size(tgt_free); for (; key < end_key; key++) { assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -301,17 +308,8 @@ static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free) printf("Pass\n"); } -/* Size of the LRU map 1.5 * tgt_free - * Insert 1 to tgt_free (+tgt_free keys) - * Update 1 to tgt_free/2 - * => The original 1 to tgt_free/2 will be removed due to - * the LRU shrink process - * Re-insert 1 to tgt_free/2 again and do a lookup immeidately - * Insert 1+tgt_free to tgt_free*3/2 - * Insert 1+tgt_free*3/2 to tgt_free*5/2 - * => Key 1+tgt_free to tgt_free*3/2 - * will be removed from LRU because it has never - * been lookup and ref bit is not set +/* Verify that insertions exceeding map size will recycle the oldest. + * Verify that unreferenced elements are recycled before referenced. */ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) { @@ -334,7 +332,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) batch_size = tgt_free / 2; assert(batch_size * 2 == tgt_free); - map_size = tgt_free + batch_size; + map_size = __map_size(tgt_free) + batch_size; lru_map_fd = create_map(map_type, map_flags, map_size); assert(lru_map_fd != -1); @@ -343,8 +341,8 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to tgt_free (+tgt_free keys) */ - end_key = 1 + tgt_free; + /* Insert map_size - batch_size keys */ + end_key = 1 + __map_size(tgt_free); for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -357,8 +355,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) * shrink the inactive list to get tgt_free * number of free nodes. * - * Hence, the oldest key 1 to tgt_free/2 - * are removed from the LRU list. + * Hence, the oldest key is removed from the LRU list. */ key = 1; if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { @@ -370,8 +367,7 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) BPF_EXIST)); } - /* Re-insert 1 to tgt_free/2 again and do a lookup - * immeidately. + /* Re-insert 1 to batch_size again and do a lookup immediately. */ end_key = 1 + batch_size; value[0] = 4321; @@ -387,17 +383,18 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1+tgt_free to tgt_free*3/2 */ - end_key = 1 + tgt_free + batch_size; - for (key = 1 + tgt_free; key < end_key; key++) + /* Insert batch_size new elements */ + key = 1 + __map_size(tgt_free); + end_key = key + batch_size; + for (; key < end_key; key++) /* These newly added but not referenced keys will be * gone during the next LRU shrink. */ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Insert 1+tgt_free*3/2 to tgt_free*5/2 */ - end_key = key + tgt_free; + /* Insert map_size - batch_size elements */ + end_key += __map_size(tgt_free); for (; key < end_key; key++) { assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); @@ -500,7 +497,8 @@ static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free) lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free * nr_cpus); else - lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free); + lru_map_fd = create_map(map_type, map_flags, + 3 * __map_size(tgt_free)); assert(lru_map_fd != -1); expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, From 7869738b6908eec7755818aaf6f3aa068b2f0e1b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Thu, 19 Jun 2025 11:28:39 +0800 Subject: [PATCH 155/284] erofs: refuse crafted out-of-file-range encoded extents Crafted encoded extents could record out-of-range `lstart`, which should not happen in normal cases. It caused an iomap_iter_done() complaint [1] reported by syzbot. [1] https://lore.kernel.org/r/684cb499.a00a0220.c6bd7.0010.GAE@google.com Fixes: 1d191b4ca51d ("erofs: implement encoded extent metadata") Reported-and-tested-by: syzbot+d8f000c609f05f52d9b5@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d8f000c609f05f52d9b5 Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250619032839.2642193-1-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 14ea47f954f5..6afcb054780d 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -597,6 +597,10 @@ static int z_erofs_map_blocks_ext(struct inode *inode, if (la > map->m_la) { r = mid; + if (la > lend) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } lend = la; } else { l = mid + 1; From 8a8ff069c7ad9a359c54683329883e2432cff191 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 15 Jun 2025 16:11:38 +0100 Subject: [PATCH 156/284] KVM: arm64: nv: Fix tracking of shadow list registers Wei-Lin reports that the tracking of shadow list registers is majorly broken when resync'ing the L2 state after a run, as we confuse the guest's LR index with the host's, potentially losing the interrupt state. While this could be fixed by adding yet another side index to track it (Wei-Lin's fix), it may be better to refactor this code to avoid having a side index altogether, limiting the risk to introduce this class of bugs. A key observation is that the shadow index is always the number of bits in the lr_map bitmap. With that, the parallel indexing scheme can be completely dropped. While doing this, introduce a couple of helpers that abstract the index conversion and some of the LR repainting, making the whole exercise much simpler. Reported-by: Wei-Lin Chang Reviewed-by: Wei-Lin Chang Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250614145721.2504524-1-r09922117@csie.ntu.edu.tw Link: https://lore.kernel.org/r/86qzzkc5xa.wl-maz@kernel.org --- arch/arm64/kvm/vgic/vgic-v3-nested.c | 81 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index d22a8ad7bcc5..a50fb7e6841f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -36,6 +36,11 @@ struct shadow_if { static DEFINE_PER_CPU(struct shadow_if, shadow_if); +static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) +{ + return hweight16(shadow_if->lr_map & (BIT(idx) - 1)); +} + /* * Nesting GICv3 support * @@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) return reg; } +static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr) +{ + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW)) + return lr; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + /* If there was no real mapping, nuke the HW bit */ + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) + lr &= ~ICH_LR_HW; + + /* Translate the virtual mapping to the real one, even if invalid */ + if (irq) { + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + vgic_put_irq(vcpu->kvm, irq); + } + + return lr; +} + /* * For LRs which have HW bit set such as timer interrupts, we modify them to * have the host hardware interrupt number instead of the virtual one programmed @@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { - unsigned long lr_map = 0; - int index = 0; + struct shadow_if *shadow_if; + + shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif); + shadow_if->lr_map = 0; for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); - struct vgic_irq *irq; if (!(lr & ICH_LR_STATE)) - lr = 0; + continue; - if (!(lr & ICH_LR_HW)) - goto next; + lr = translate_lr_pintid(vcpu, lr); - /* We have the HW bit set, check for validity of pINTID */ - irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) { - /* There was no real mapping, so nuke the HW bit */ - lr &= ~ICH_LR_HW; - if (irq) - vgic_put_irq(vcpu->kvm, irq); - goto next; - } - - /* Translate the virtual mapping to the real one */ - lr &= ~ICH_LR_PHYS_ID_MASK; - lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); - - vgic_put_irq(vcpu->kvm, irq); - -next: - s_cpu_if->vgic_lr[index] = lr; - if (lr) { - lr_map |= BIT(i); - index++; - } + s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr; + shadow_if->lr_map |= BIT(i); } - container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map; - s_cpu_if->used_lrs = index; + s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); } void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); - int i, index = 0; + int i; for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); struct vgic_irq *irq; if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) - goto next; + continue; /* * If we had a HW lr programmed by the guest hypervisor, we @@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) */ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ - goto next; + continue; - lr = __gic_v3_get_lr(index); + lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); if (!(lr & ICH_LR_STATE)) irq->active = false; vgic_put_irq(vcpu->kvm, irq); - next: - index++; } } @@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); val &= ~ICH_LR_STATE; - val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; + val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE; __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - s_cpu_if->vgic_lr[i] = 0; } - shadow_if->lr_map = 0; vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } From 1fbe6861a6d9a942fb8ab8677ddf1ecb86b1af60 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:04 -0700 Subject: [PATCH 157/284] KVM: arm64: Explicitly treat routing entry type changes as changes Explicitly treat type differences as GSI routing changes, as comparing MSI data between two entries could get a false negative, e.g. if userspace changed the type but left the type-specific data as- Note, the same bug was fixed in x86 by commit bcda70c56f3e ("KVM: x86: Explicitly treat routing entry type changes as changes"). Fixes: 4bf3693d36af ("KVM: arm64: Unmap vLPIs affected by changes to GSI routing information") Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250611224604.313496-3-seanjc@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index de2b4e9c9f9f..38a91bb5d4c7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2764,7 +2764,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return memcmp(&old->msi, &new->msi, sizeof(new->msi)); From 56a14984505b11674df5e01407748236bc4bc8f8 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Sun, 8 Jun 2025 17:54:02 +0800 Subject: [PATCH 158/284] KVM: arm64: selftests: Close the GIC FD in arch_timer_edge_cases Close the GIC FD to free the reference it holds to the VM so that we can correctly clean up the VM. This also gets rid of the "KVM: debugfs: duplicate directory 395722-4" warning when running arch_timer_edge_cases. Signed-off-by: Zenghui Yu Reviewed-by: Miguel Luis Reviewed-by: Sebastian Ott Link: https://lore.kernel.org/r/20250608095402.1131-1-yuzenghui@huawei.com Signed-off-by: Marc Zyngier --- .../selftests/kvm/arm64/arch_timer_edge_cases.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c index b4d22b3ab7cc..4e71740a098b 100644 --- a/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c +++ b/tools/testing/selftests/kvm/arm64/arch_timer_edge_cases.c @@ -954,6 +954,8 @@ static void test_init_timer_irq(struct kvm_vm *vm, struct kvm_vcpu *vcpu) pr_debug("ptimer_irq: %d; vtimer_irq: %d\n", ptimer_irq, vtimer_irq); } +static int gic_fd; + static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, enum arch_timer timer) { @@ -968,12 +970,20 @@ static void test_vm_create(struct kvm_vm **vm, struct kvm_vcpu **vcpu, vcpu_args_set(*vcpu, 1, timer); test_init_timer_irq(*vm, *vcpu); - vgic_v3_setup(*vm, 1, 64); + gic_fd = vgic_v3_setup(*vm, 1, 64); + __TEST_REQUIRE(gic_fd >= 0, "Failed to create vgic-v3"); + sync_global_to_guest(*vm, test_args); sync_global_to_guest(*vm, CVAL_MAX); sync_global_to_guest(*vm, DEF_CNT); } +static void test_vm_cleanup(struct kvm_vm *vm) +{ + close(gic_fd); + kvm_vm_free(vm); +} + static void test_print_help(char *name) { pr_info("Usage: %s [-h] [-b] [-i iterations] [-l long_wait_ms] [-p] [-v]\n" @@ -1060,13 +1070,13 @@ int main(int argc, char *argv[]) if (test_args.test_virtual) { test_vm_create(&vm, &vcpu, VIRTUAL); test_run(vm, vcpu); - kvm_vm_free(vm); + test_vm_cleanup(vm); } if (test_args.test_physical) { test_vm_create(&vm, &vcpu, PHYSICAL); test_run(vm, vcpu); - kvm_vm_free(vm); + test_vm_cleanup(vm); } return 0; From cade3d57e456e69f67aa9894bf89dc8678796bb7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:12 +0100 Subject: [PATCH 159/284] KVM: arm64: VHE: Synchronize restore of host debug registers When KVM runs in non-protected VHE mode, there's no context synchronization event between __debug_switch_to_host() restoring the host debug registers and __kvm_vcpu_run() unmasking debug exceptions. Due to this, it's theoretically possible for the host to take an unexpected debug exception due to the stale guest configuration. This cannot happen in NVHE/HVHE mode as debug exceptions are masked in the hyp code, and the exception return to the host will provide the necessary context synchronization before debug exceptions can be taken. For now, avoid the problem by adding an ISB after VHE hyp code restores the host debug registers. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250617133718.4014181-2-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 502a5b73ee70..73881e1dc267 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -167,6 +167,9 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); + + if (has_vhe()) + isb(); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ From 257d0aa8e2502754bc758faceceb6ff59318af60 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:13 +0100 Subject: [PATCH 160/284] KVM: arm64: VHE: Synchronize CPTR trap deactivation Currently there is no ISB between __deactivate_cptr_traps() disabling traps that affect EL2 and fpsimd_lazy_switch_to_host() manipulating registers potentially affected by CPTR traps. When NV is not in use, this is safe because the relevant registers are only accessed when guest_owns_fp_regs() && vcpu_has_sve(vcpu), and this also implies that SVE traps affecting EL2 have been deactivated prior to __guest_entry(). When NV is in use, a guest hypervisor may have configured SVE traps for a nested context, and so it is necessary to have an ISB between __deactivate_cptr_traps() and fpsimd_lazy_switch_to_host(). Due to the current lack of an ISB, when a guest hypervisor enables SVE traps in CPTR, the host can take an unexpected SVE trap from within fpsimd_lazy_switch_to_host(), e.g. | Unhandled 64-bit el1h sync exception on CPU1, ESR 0x0000000066000000 -- SVE | CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT | Hardware name: FVP Base RevC (DT) | pstate: 604023c9 (nZCv DAIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : __kvm_vcpu_run+0x6f4/0x844 | lr : __kvm_vcpu_run+0x150/0x844 | sp : ffff800083903a60 | x29: ffff800083903a90 x28: ffff000801f4a300 x27: 0000000000000000 | x26: 0000000000000000 x25: ffff000801f90000 x24: ffff000801f900f0 | x23: ffff800081ff7720 x22: 0002433c807d623f x21: ffff000801f90000 | x20: ffff00087f730730 x19: 0000000000000000 x18: 0000000000000000 | x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 | x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 | x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 | x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff000801f90d70 | x5 : 0000000000001000 x4 : ffff8007fd739000 x3 : ffff000801f90000 | x2 : 0000000000000000 x1 : 00000000000003cc x0 : ffff800082f9d000 | Kernel panic - not syncing: Unhandled exception | CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT | Hardware name: FVP Base RevC (DT) | Call trace: | show_stack+0x18/0x24 (C) | dump_stack_lvl+0x60/0x80 | dump_stack+0x18/0x24 | panic+0x168/0x360 | __panic_unhandled+0x68/0x74 | el1h_64_irq_handler+0x0/0x24 | el1h_64_sync+0x6c/0x70 | __kvm_vcpu_run+0x6f4/0x844 (P) | kvm_arm_vcpu_enter_exit+0x64/0xa0 | kvm_arch_vcpu_ioctl_run+0x21c/0x870 | kvm_vcpu_ioctl+0x1a8/0x9d0 | __arm64_sys_ioctl+0xb4/0xf4 | invoke_syscall+0x48/0x104 | el0_svc_common.constprop.0+0x40/0xe0 | do_el0_svc+0x1c/0x28 | el0_svc+0x30/0xcc | el0t_64_sync_handler+0x10c/0x138 | el0t_64_sync+0x198/0x19c | SMP: stopping secondary CPUs | Kernel Offset: disabled | CPU features: 0x0000,000002c0,02df4fb9,97ee773f | Memory Limit: none | ---[ end Kernel panic - not syncing: Unhandled exception ]--- Fix this by adding an ISB between __deactivate_traps() and fpsimd_lazy_switch_to_host(). Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-3-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 09df2b42bc1b..20df44b49ceb 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -667,6 +667,9 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); + /* Ensure CPTR trap deactivation has taken effect */ + isb(); + fpsimd_lazy_switch_to_host(vcpu); sysreg_restore_host_state_vhe(host_ctxt); From e62dd507844fa47f0fdc29f3be5a90a83f297820 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:14 +0100 Subject: [PATCH 161/284] KVM: arm64: Reorganise CPTR trap manipulation The NVHE/HVHE and VHE modes have separate implementations of __activate_cptr_traps() and __deactivate_cptr_traps() in their respective switch.c files. There's some duplication of logic, and it's not currently possible to reuse this logic elsewhere. Move the logic into the common switch.h header so that it can be reused, and de-duplicate the common logic. This rework changes the way SVE traps are deactivated in VHE mode, aligning it with NVHE/HVHE modes: * Before this patch, VHE's __deactivate_cptr_traps() would unconditionally enable SVE for host EL2 (but not EL0), regardless of whether the ARM64_SVE cpucap was set. * After this patch, VHE's __deactivate_cptr_traps() will take the ARM64_SVE cpucap into account. When ARM64_SVE is not set, SVE will be trapped from EL2 and below. The old and new behaviour are both benign: * When ARM64_SVE is not set, the host will not touch SVE state, and will not reconfigure SVE traps. Host EL0 access to SVE will be trapped as expected. * When ARM64_SVE is set, the host will configure EL0 SVE traps before returning to EL0 as part of reloading the EL0 FPSIMD/SVE/SME state. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-4-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 130 ++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 59 ----------- arch/arm64/kvm/hyp/vhe/switch.c | 81 --------------- 3 files changed, 130 insertions(+), 140 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 76dfda116e56..8a77fcccbcf6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); +} + +static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA; + u64 cptr; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + +static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + + if (has_vhe() || has_hvhe()) + __activate_cptr_traps_vhe(vcpu); + else + __activate_cptr_traps_nvhe(vcpu); +} + +static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); +} + +static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); +} + +static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_vhe() || has_hvhe()) + __deactivate_cptr_traps_vhe(vcpu); + else + __deactivate_cptr_traps_nvhe(vcpu); +} + #define reg_to_fgt_masks(reg) \ ({ \ struct fgt_masks *m; \ diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 73affe1333a4..0e752b515d0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks; extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); - - if (has_hvhe()) { - val |= CPACR_EL1_TTA; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } - - write_sysreg(val, cpacr_el1); - } else { - val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; - - /* - * Always trap SME since it's not supported in KVM. - * TSM is RES1 if SME isn't implemented. - */ - val |= CPTR_EL2_TSM; - - if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - - if (!guest_owns_fp_regs()) - val |= CPTR_EL2_TFP; - - write_sysreg(val, cptr_el2); - } -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - if (has_hvhe()) { - u64 val = CPACR_EL1_FPEN; - - if (cpus_have_final_cap(ARM64_SVE)) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - - write_sysreg(val, cpacr_el1); - } else { - u64 val = CPTR_NVHE_EL2_RES1; - - if (!cpus_have_final_cap(ARM64_SVE)) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - - write_sysreg(val, cptr_el2); - } -} - static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 20df44b49ceb..32b4814eb2b7 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu) return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 cptr; - - /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. - */ - u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } else { - __activate_traps_fpsimd32(vcpu); - } - - if (!vcpu_has_nv(vcpu)) - goto write; - - /* - * The architecture is a bit crap (what a surprise): an EL2 guest - * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, - * as they are RES0 in the guest's view. To work around it, trap the - * sucker using the very same bit it can't set... - */ - if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) - val |= CPTR_EL2_TCPAC; - - /* - * Layer the guest hypervisor's trap configuration on top of our own if - * we're in a nested context. - */ - if (is_hyp_ctxt(vcpu)) - goto write; - - cptr = vcpu_sanitised_cptr_el2(vcpu); - - /* - * Pay attention, there's some interesting detail here. - * - * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two - * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): - * - * - CPTR_EL2.xEN = x0, traps are enabled - * - CPTR_EL2.xEN = x1, traps are disabled - * - * In other words, bit[0] determines if guest accesses trap or not. In - * the interest of simplicity, clear the entire field if the guest - * hypervisor has traps enabled to dispel any illusion of something more - * complicated taking place. - */ - if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_FPEN; - if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_ZEN; - - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - val |= cptr & CPACR_EL1_E0POE; - - val |= cptr & CPTR_EL2_TCPAC; - -write: - write_sysreg(val, cpacr_el1); -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; - - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; - - write_sysreg(val, cpacr_el1); -} - static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; From 59e6e101a6fa542a365dd5858affd18ba3e84cb8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:15 +0100 Subject: [PATCH 162/284] KVM: arm64: Remove ad-hoc CPTR manipulation from fpsimd_sve_sync() There's no need for fpsimd_sve_sync() to write to CPTR/CPACR. All relevant traps are always disabled earlier within __kvm_vcpu_run(), when __deactivate_cptr_traps() configures CPTR/CPACR. With irrelevant details elided, the flow is: handle___kvm_vcpu_run(...) { flush_hyp_vcpu(...) { fpsimd_sve_flush(...); } __kvm_vcpu_run(...) { __activate_traps(...) { __activate_cptr_traps(...); } do { __guest_enter(...); } while (...); __deactivate_traps(....) { __deactivate_cptr_traps(...); } } sync_hyp_vcpu(...) { fpsimd_sve_sync(...); } } Remove the unnecessary write to CPTR/CPACR. An ISB is still necessary, so a comment is added to describe this requirement. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-5-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e9198e56e784..3206b2c07f82 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) return; - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); + /* + * Traps have been disabled by __deactivate_cptr_traps(), but there + * hasn't necessarily been a context synchronization event yet. + */ isb(); if (vcpu_has_sve(vcpu)) From 186b58bacd74d9b7892869f7c7d20cf865a3c237 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:16 +0100 Subject: [PATCH 163/284] KVM: arm64: Remove ad-hoc CPTR manipulation from kvm_hyp_handle_fpsimd() The hyp code FPSIMD/SVE/SME trap handling logic has some rather messy open-coded manipulation of CPTR/CPACR. This is benign for non-nested guests, but broken for nested guests, as the guest hypervisor's CPTR configuration is not taken into account. Consider the case where L0 provides FPSIMD+SVE to an L1 guest hypervisor, and the L1 guest hypervisor only provides FPSIMD to an L2 guest (with L1 configuring CPTR/CPACR to trap SVE usage from L2). If the L2 guest triggers an FPSIMD trap to the L0 hypervisor, kvm_hyp_handle_fpsimd() will see that the vCPU supports FPSIMD+SVE, and will configure CPTR/CPACR to NOT trap FPSIMD+SVE before returning to the L2 guest. Consequently the L2 guest would be able to manipulate SVE state even though the L1 hypervisor had configured CPTR/CPACR to forbid this. Clean this up, and fix the nested virt issue by always using __deactivate_cptr_traps() and __activate_cptr_traps() to manage the CPTR traps. This removes the need for the ad-hoc fixup in kvm_hyp_save_fpsimd_host(), and ensures that any guest hypervisor configuration of CPTR/CPACR is taken into account. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-6-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 8a77fcccbcf6..2ad57b117385 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -616,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) */ if (system_supports_sve()) { __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - } else { __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); } @@ -671,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); - else - cpacr_clear_set(0, CPACR_EL1_FPEN); + __deactivate_cptr_traps(vcpu); isb(); /* Write out the host state if it's in the registers */ @@ -696,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; + /* + * Re-enable traps necessary for the current state of the guest, e.g. + * those enabled by a guest hypervisor. The ERET to the guest will + * provide the necessary context synchronization. + */ + __activate_cptr_traps(vcpu); + return true; } From 3a300a33e4063fb44c7887bec3aecd2fd6966df8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:17 +0100 Subject: [PATCH 164/284] KVM: arm64: Remove cpacr_clear_set() We no longer use cpacr_clear_set(). Remove cpacr_clear_set() and its helper functions. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-7-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 62 ---------------------------- 1 file changed, 62 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index bd020fc28aa9..0720898f563e 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu_set_flag((v), e); \ } while (0) -#define __build_check_all_or_none(r, bits) \ - BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) - -#define __cpacr_to_cptr_clr(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((set) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((set) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((set) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((clr) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((clr) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((clr) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define __cpacr_to_cptr_set(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((clr) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((clr) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((clr) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((set) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((set) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((set) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define cpacr_clear_set(clr, set) \ - do { \ - BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ - BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \ - __build_check_all_or_none((clr), CPACR_EL1_FPEN); \ - __build_check_all_or_none((set), CPACR_EL1_FPEN); \ - __build_check_all_or_none((clr), CPACR_EL1_ZEN); \ - __build_check_all_or_none((set), CPACR_EL1_ZEN); \ - __build_check_all_or_none((clr), CPACR_EL1_SMEN); \ - __build_check_all_or_none((set), CPACR_EL1_SMEN); \ - \ - if (has_vhe() || has_hvhe()) \ - sysreg_clear_set(cpacr_el1, clr, set); \ - else \ - sysreg_clear_set(cptr_el2, \ - __cpacr_to_cptr_clr(clr, set), \ - __cpacr_to_cptr_set(clr, set));\ - } while (0) - /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. From 04c5355b2a94ff3191ce63ab035fb7f04d036869 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:18 +0100 Subject: [PATCH 165/284] KVM: arm64: VHE: Centralize ISBs when returning to host The VHE hyp code has recently gained a few ISBs. Simplify this to one unconditional ISB in __kvm_vcpu_run_vhe(), and remove the unnecessary ISB from the kvm_call_hyp_ret() macro. While kvm_call_hyp_ret() is also used to invoke __vgic_v3_get_gic_config(), but no ISB is necessary in that case either. For the moment, an ISB is left in kvm_call_hyp(), as there are many more users, and removing the ISB would require a more thorough audit. Suggested-by: Marc Zyngier Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-8-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 6 ++---- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 3 --- arch/arm64/kvm/hyp/vhe/switch.c | 25 +++++++++++------------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5ccca509dff1..d27079968341 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm); }) /* - * The couple of isb() below are there to guarantee the same behaviour - * on VHE as on !VHE, where the eret to EL1 acts as a context - * synchronization event. + * The isb() below is there to guarantee the same behaviour on VHE as on !VHE, + * where the eret to EL1 acts as a context synchronization event. */ #define kvm_call_hyp(f, ...) \ do { \ @@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm); \ if (has_vhe()) { \ ret = f(__VA_ARGS__); \ - isb(); \ } else { \ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 73881e1dc267..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -167,9 +167,6 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - if (has_vhe()) - isb(); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 32b4814eb2b7..477f1580ffea 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -558,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - sysreg_save_host_state_vhe(host_ctxt); - fpsimd_lazy_switch_to_guest(vcpu); + sysreg_save_host_state_vhe(host_ctxt); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -586,18 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); - /* Ensure CPTR trap deactivation has taken effect */ + sysreg_restore_host_state_vhe(host_ctxt); + + __debug_switch_to_host(vcpu); + + /* + * Ensure that all system register writes above have taken effect + * before returning to the host. In VHE mode, CPTR traps for + * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be + * manipulated after the ISB. + */ isb(); fpsimd_lazy_switch_to_host(vcpu); - sysreg_restore_host_state_vhe(host_ctxt); - if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); - __debug_switch_to_host(vcpu); - return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); @@ -627,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ local_daif_restore(DAIF_PROCCTX_NOIRQ); - /* - * When we exit from the guest we change a number of CPU configuration - * parameters, such as traps. We rely on the isb() in kvm_call_hyp*() - * to make sure these changes take effect before running the host or - * additional guests. - */ return ret; } From c769be2d3dbb165a3d432f4fcca2c80fabb35877 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 22 May 2025 16:10:00 +0100 Subject: [PATCH 166/284] btrfs: include root in error message when unlinking inode To help debugging include the root number in the error message, and since this is a critical error that implies a metadata inconsistency and results in a transaction abort change the log message level from "info" to "critical", which is a much better fit. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c0c778243bf1..a62f92ab5977 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4250,9 +4250,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { - btrfs_info(fs_info, - "failed to delete reference to %.*s, inode %llu parent %llu", - name->len, name->name, ino, dir_ino); + btrfs_crit(fs_info, + "failed to delete reference to %.*s, root %llu inode %llu parent %llu", + name->len, name->name, btrfs_root_id(root), ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } From dd276214e439db08f444fd3e07e9fe4c9e0ca210 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 27 May 2025 17:04:21 -0700 Subject: [PATCH 167/284] btrfs: fix delayed ref refcount leak in debug assertion If the delayed_root is not empty we are increasing the number of references to a delayed_node without decreasing it, causing a leak. Fix by decrementing the delayed_node reference count. Reviewed-by: Filipe Manana Signed-off-by: Leo Martins Reviewed-by: Qu Wenruo [ Remove the changelog from the commit message. ] Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c7cc24a5dd5e..8c597fa60523 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1377,7 +1377,10 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); + struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root); + + if (WARN_ON(node)) + refcount_dec(&node->refs); } static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) From 186b9dc3c302ad706b3f23c857eb128165f6b484 Mon Sep 17 00:00:00 2001 From: Leo Martins Date: Tue, 27 May 2025 17:04:22 -0700 Subject: [PATCH 168/284] btrfs: warn if leaking delayed_nodes in btrfs_put_root() Add a warning for leaked delayed_nodes when putting a root. We currently do this for inodes, but not delayed_nodes. Signed-off-by: Leo Martins Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo [ Remove the changelog from the commit message. ] Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1beb9458f622..3def93016963 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1835,6 +1835,8 @@ void btrfs_put_root(struct btrfs_root *root) if (refcount_dec_and_test(&root->refs)) { if (WARN_ON(!xa_empty(&root->inodes))) xa_destroy(&root->inodes); + if (WARN_ON(!xa_empty(&root->delayed_nodes))) + xa_destroy(&root->delayed_nodes); WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); if (root->anon_dev) free_anon_bdev(root->anon_dev); From 65d5112b4d7cf019ccb62cf40077038aae66239b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Tue, 20 May 2025 19:42:23 +0800 Subject: [PATCH 169/284] btrfs: scrub: add prefix for the error messages Add a "scrub: " prefix to all messages logged by scrub so that it's easy to filter them from dmesg for analysis. Reviewed-by: Filipe Manana Reviewed-by: Qu Wenruo Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- fs/btrfs/scrub.c | 53 ++++++++++++++++++++++++------------------------ 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a498fe524c90..1e8f7082239c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3142,7 +3142,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) return -EPERM; if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { - btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet"); + btrfs_err(fs_info, "scrub: extent tree v2 not yet supported"); return -EINVAL; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ed120621021b..94618add3b44 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -557,7 +557,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, */ for (i = 0; i < ipath->fspath->elem_cnt; ++i) btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", +"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -571,7 +571,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, err: btrfs_warn_in_rcu(fs_info, - "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", + "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, btrfs_dev_name(swarn->dev), swarn->physical, @@ -596,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * /* Super block error, no need to search extent tree. */ if (is_super) { - btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", + btrfs_warn_in_rcu(fs_info, "scrub: %s on device %s, physical %llu", errstr, btrfs_dev_name(dev), physical); return; } @@ -631,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device * &ref_level); if (ret < 0) { btrfs_warn(fs_info, - "failed to resolve tree backref for logical %llu: %d", - swarn.logical, ret); + "scrub: failed to resolve tree backref for logical %llu: %d", + swarn.logical, ret); break; } if (ret > 0) break; btrfs_warn_in_rcu(fs_info, -"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", +"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, btrfs_dev_name(dev), swarn.physical, (ref_level ? "node" : "leaf"), ref_level, ref_root); @@ -718,7 +718,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad bytenr, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_bytenr(header), logical); return; @@ -728,7 +728,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad fsid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, header->fsid, fs_info->fs_devices->fsid); return; @@ -738,7 +738,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", + "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, header->chunk_tree_uuid, fs_info->chunk_tree_uuid); return; @@ -760,7 +760,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); @@ -771,7 +771,7 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, - "tree block %llu mirror %u has bad generation, has %llu want %llu", + "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, btrfs_stack_header_generation(header), stripe->sectors[sector_nr].generation); @@ -814,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) */ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { btrfs_warn_rl(fs_info, - "tree block at %llu crosses stripe boundary %llu", + "scrub: tree block at %llu crosses stripe boundary %llu", stripe->logical + (sector_nr << fs_info->sectorsize_bits), stripe->logical); @@ -1046,12 +1046,12 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, if (repaired) { if (dev) { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on dev %s physical %llu", + "scrub: fixed up error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "fixed up error at logical %llu on mirror %u", + "scrub: fixed up error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } continue; @@ -1060,12 +1060,12 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* The remaining are all for unrepaired. */ if (dev) { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on dev %s physical %llu", +"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu", stripe->logical, btrfs_dev_name(dev), physical); } else { btrfs_err_rl_in_rcu(fs_info, - "unable to fixup (regular) error at logical %llu on mirror %u", + "scrub: unable to fixup (regular) error at logical %llu on mirror %u", stripe->logical, stripe->mirror_num); } @@ -1593,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, physical, sctx->write_pointer); if (ret) - btrfs_err(fs_info, - "zoned: failed to recover write pointer"); + btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer"); } mutex_unlock(&sctx->wr_lock); btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); @@ -1658,7 +1657,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, int ret; if (unlikely(!extent_root || !csum_root)) { - btrfs_err(fs_info, "no valid extent or csum root for scrub"); + btrfs_err(fs_info, "scrub: no valid extent or csum root found"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1907,7 +1906,7 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, - "stripe %llu has unrepaired metadata sector at %llu", + "scrub: stripe %llu has unrepaired metadata sector at logical %llu", stripe->logical, stripe->logical + (i << fs_info->sectorsize_bits)); return true; @@ -2167,7 +2166,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, -"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", +"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", full_stripe_start, i, stripe->nr_sectors, &error); ret = -EIO; @@ -2789,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ro_set = 0; } else if (ret == -ETXTBSY) { btrfs_warn(fs_info, - "skipping scrub of block group %llu due to active swapfile", + "scrub: skipping scrub of block group %llu due to active swapfile", cache->start); scrub_pause_off(fs_info); ret = 0; goto skip_unfreeze; } else { - btrfs_warn(fs_info, - "failed setting block group ro: %d", ret); + btrfs_warn(fs_info, "scrub: failed setting block group ro: %d", + ret); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); scrub_pause_off(fs_info); @@ -2898,13 +2897,13 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, ret = btrfs_check_super_csum(fs_info, sb); if (ret != 0) { btrfs_err_rl(fs_info, - "super block at physical %llu devid %llu has bad csum", + "scrub: super block at physical %llu devid %llu has bad csum", physical, dev->devid); return -EIO; } if (btrfs_super_generation(sb) != generation) { btrfs_err_rl(fs_info, -"super block at physical %llu devid %llu has bad generation %llu expect %llu", +"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu", physical, dev->devid, btrfs_super_generation(sb), generation); return -EUCLEAN; @@ -3065,7 +3064,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, - "scrub on devid %llu: filesystem on %s is not writable", + "scrub: devid %llu: filesystem on %s is not writable", devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; From 3ca864de852bc91007b32d2a0d48993724f4abad Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 28 May 2025 12:28:27 +0100 Subject: [PATCH 170/284] btrfs: fix a race between renames and directory logging We have a race between a rename and directory inode logging that if it happens and we crash/power fail before the rename completes, the next time the filesystem is mounted, the log replay code will end up deleting the file that was being renamed. This is best explained following a step by step analysis of an interleaving of steps that lead into this situation. Consider the initial conditions: 1) We are at transaction N; 2) We have directories A and B created in a past transaction (< N); 3) We have inode X corresponding to a file that has 2 hardlinks, one in directory A and the other in directory B, so we'll name them as "A/foo_link1" and "B/foo_link2". Both hard links were persisted in a past transaction (< N); 4) We have inode Y corresponding to a file that as a single hard link and is located in directory A, we'll name it as "A/bar". This file was also persisted in a past transaction (< N). The steps leading to a file loss are the following and for all of them we are under transaction N: 1) Link "A/foo_link1" is removed, so inode's X last_unlink_trans field is updated to N, through btrfs_unlink() -> btrfs_record_unlink_dir(); 2) Task A starts a rename for inode Y, with the goal of renaming from "A/bar" to "A/baz", so we enter btrfs_rename(); 3) Task A inserts the new BTRFS_INODE_REF_KEY for inode Y by calling btrfs_insert_inode_ref(); 4) Because the rename happens in the same directory, we don't set the last_unlink_trans field of directoty A's inode to the current transaction id, that is, we don't cal btrfs_record_unlink_dir(); 5) Task A then removes the entries from directory A (BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY items) when calling __btrfs_unlink_inode() (actually the dir index item is added as a delayed item, but the effect is the same); 6) Now before task A adds the new entry "A/baz" to directory A by calling btrfs_add_link(), another task, task B is logging inode X; 7) Task B starts a fsync of inode X and after logging inode X, at btrfs_log_inode_parent() it calls btrfs_log_all_parents(), since inode X has a last_unlink_trans value of N, set at in step 1; 8) At btrfs_log_all_parents() we search for all parent directories of inode X using the commit root, so we find directories A and B and log them. Bu when logging direct A, we don't have a dir index item for inode Y anymore, neither the old name "A/bar" nor for the new name "A/baz" since the rename has deleted the old name but has not yet inserted the new name - task A hasn't called yet btrfs_add_link() to do that. Note that logging directory A doesn't fallback to a transaction commit because its last_unlink_trans has a lower value than the current transaction's id (see step 4); 9) Task B finishes logging directories A and B and gets back to btrfs_sync_file() where it calls btrfs_sync_log() to persist the log tree; 10) Task B successfully persisted the log tree, btrfs_sync_log() completed with success, and a power failure happened. We have a log tree without any directory entry for inode Y, so the log replay code deletes the entry for inode Y, name "A/bar", from the subvolume tree since it doesn't exist in the log tree and the log tree is authorative for its index (we logged a BTRFS_DIR_LOG_INDEX_KEY item that covers the index range for the dentry that corresponds to "A/bar"). Since there's no other hard link for inode Y and the log replay code deletes the name "A/bar", the file is lost. The issue wouldn't happen if task B synced the log only after task A called btrfs_log_new_name(), which would update the log with the new name for inode Y ("A/bar"). Fix this by pinning the log root during renames before removing the old directory entry, and unpinning after btrfs_log_new_name() is called. Fixes: 259c4b96d78d ("btrfs: stop doing unnecessary log updates during a rename") CC: stable@vger.kernel.org # 5.18+ Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 83 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a62f92ab5977..26d6ed170a19 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8059,6 +8059,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + bool logs_pinned = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; @@ -8182,6 +8183,31 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && + new_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not for + * root entries) pin the log early to prevent any concurrent + * task from logging the directory after we removed the old + * entries and before we add the new entries, otherwise that + * task can sync a log without any entry for the inodes we are + * renaming and therefore replaying that log, if a power failure + * happens after syncing the log, would result in deleting the + * inodes. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8253,30 +8279,23 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; /* - * Now pin the logs of the roots. We do it to ensure that no other task - * can sync the logs while we are in progress with the rename, because - * that could result in an inconsistency in case any of the inodes that - * are part of this rename operation were logged before. + * Do the log updates for all inodes. + * + * If either entry is for a root we don't need to update the logs since + * we've called btrfs_set_log_full_commit() before. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_pin_log_trans(dest); - - /* Do the log updates for all inodes. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) { btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); + } - /* Now unpin the logs. */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_end_log_trans(root); - if (new_ino != BTRFS_FIRST_FREE_OBJECTID) - btrfs_end_log_trans(dest); out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: @@ -8326,6 +8345,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; + bool logs_pinned = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -8460,6 +8480,29 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + /* + * If we are renaming in the same directory (and it's not a + * root entry) pin the log to prevent any concurrent task from + * logging the directory after we removed the old entry and + * before we add the new entry, otherwise that task can sync + * a log without any entry for the inode we are renaming and + * therefore replaying that log, if a power failure happens + * after syncing the log, would result in deleting the inode. + * + * If the rename affects two different directories, we want to + * make sure the that there's no log commit that contains + * updates for only one of the directories but not for the + * other. + * + * If we are renaming an entry for a root, we don't care about + * log updates since we called btrfs_set_log_full_commit(). + */ + btrfs_pin_log_trans(root); + btrfs_pin_log_trans(dest); + logs_pinned = true; + } + if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); @@ -8524,7 +8567,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; - if (old_ino != BTRFS_FIRST_FREE_OBJECTID) + if (logs_pinned) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); @@ -8540,6 +8583,10 @@ static int btrfs_rename(struct mnt_idmap *idmap, } } out_fail: + if (logs_pinned) { + btrfs_end_log_trans(root); + btrfs_end_log_trans(dest); + } ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: From ae4477f937569d097ca5dbce92a89ba384b49bc6 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 29 May 2025 10:37:44 +0100 Subject: [PATCH 171/284] btrfs: update superblock's device bytes_used when dropping chunk Each superblock contains a copy of the device item for that device. In a transaction which drops a chunk but doesn't create any new ones, we were correctly updating the device item in the chunk tree but not copying over the new bytes_used value to the superblock. This can be seen by doing the following: # dd if=/dev/zero of=test bs=4096 count=2621440 # mkfs.btrfs test # mount test /root/temp # cd /root/temp # for i in {00..10}; do dd if=/dev/zero of=$i bs=4096 count=32768; done # sync # rm * # sync # btrfs balance start -dusage=0 . # sync # cd # umount /root/temp # btrfs check test For btrfs-check to detect this, you will also need my patch at https://github.com/kdave/btrfs-progs/pull/991. Change btrfs_remove_dev_extents() so that it adds the devices to the fs_info->post_commit_list if they're not there already. This causes btrfs_commit_device_sizes() to be called, which updates the bytes_used value in the superblock. Fixes: bbbf7243d62d ("btrfs: combine device update operations during transaction commit") CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 89835071cfea..f475b4b7c457 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3282,6 +3282,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) device->bytes_used - dev_extent_len); atomic64_add(dev_extent_len, &fs_info->free_chunk_space); btrfs_clear_space_info_full(fs_info); + + if (list_empty(&device->post_commit_list)) { + list_add_tail(&device->post_commit_list, + &trans->transaction->dev_update_list); + } + mutex_unlock(&fs_info->chunk_mutex); } } From e5b5596011773a38e035e9633ed928ef13c720b1 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 Jun 2025 19:45:35 +0100 Subject: [PATCH 172/284] btrfs: fix double unlock of buffer_tree xarray when releasing subpage eb If we break out of the loop because an extent buffer doesn't have the bit EXTENT_BUFFER_TREE_REF set, we end up unlocking the xarray twice, once before we tested for the bit and break out of the loop, and once again after the loop. Fix this by testing the bit and exiting before unlocking the xarray. The time spent testing the bit is negligible and it's not worth trying to do that outside the critical section delimited by the xarray lock due to the code complexity required to avoid it (like using a local boolean variable to track whether the xarray is locked or not). The xarray unlock only needs to be done before calling release_extent_buffer(), as that needs to lock the xarray (through xa_cmpxchg_irq()) and does a more significant amount of work. Fixes: 19d7f65f032f ("btrfs: convert the buffer_radix to an xarray") Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-btrfs/aDRNDU0GM1_D4Xnw@stanley.mountain/ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 849199768664..1dc931c4937f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4312,7 +4312,6 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_unlock(&eb->refs_lock); continue; } - xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4329,6 +4328,7 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ + xa_unlock_irq(&fs_info->buffer_tree); release_extent_buffer(eb); xa_lock_irq(&fs_info->buffer_tree); } From 2dcf838cf5c2f0f4501edaa1680fcad03618d760 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 Jun 2025 19:29:01 +0100 Subject: [PATCH 173/284] btrfs: fix invalid inode pointer dereferences during log replay In a few places where we call read_one_inode(), if we get a NULL pointer we end up jumping into an error path, or fallthrough in case of __add_inode_ref(), where we then do something like this: iput(&inode->vfs_inode); which results in an invalid inode pointer that triggers an invalid memory access, resulting in a crash. Fix this by making sure we don't do such dereferences. Fixes: b4c50cbb01a1 ("btrfs: return a btrfs_inode from read_one_inode()") CC: stable@vger.kernel.org # 6.15+ Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 97e933113b82..21d2f3dded51 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -668,15 +668,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - ret = 0; - goto out; + return 0; } inode = read_one_inode(root, key->objectid); - if (!inode) { - ret = -EIO; - goto out; - } + if (!inode) + return -EIO; /* * first check to see if we already have this extent in the @@ -961,7 +958,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(&inode->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1176,8 +1174,8 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, ret = unlink_inode_for_log_replay(trans, victim_parent, inode, &victim_name); + iput(&victim_parent->vfs_inode); } - iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; From 16edae52f60658caeaa0702e7cb6b738a09d4ad4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 3 Jun 2025 22:06:17 +0100 Subject: [PATCH 174/284] btrfs: don't silently ignore unexpected extent type when replaying log If there's an unexpected (invalid) extent type, we just silently ignore it. This means a corruption or some bug somewhere, so instead return -EUCLEAN to the caller, making log replay fail, and print an error message with relevant information. Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 21d2f3dded51..858b609e292c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -668,7 +668,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, extent_end = ALIGN(start + size, fs_info->sectorsize); } else { - return 0; + btrfs_err(fs_info, + "unexpected extent type=%d root=%llu inode=%llu offset=%llu", + found_type, btrfs_root_id(root), key->objectid, key->offset); + return -EUCLEAN; } inode = read_one_inode(root, key->objectid); From 1961d20f6fa8903266ed9bd77c691924c22c8f02 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 5 Jun 2025 20:51:03 +0100 Subject: [PATCH 175/284] btrfs: fix assertion when building free space tree When building the free space tree with the block group tree feature enabled, we can hit an assertion failure like this: BTRFS info (device loop0 state M): rebuilding free space tree assertion failed: ret == 0, in fs/btrfs/free-space-tree.c:1102 ------------[ cut here ]------------ kernel BUG at fs/btrfs/free-space-tree.c:1102! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP Modules linked in: CPU: 1 UID: 0 PID: 6592 Comm: syz-executor322 Not tainted 6.15.0-rc7-syzkaller-gd7fa1af5b33e #0 PREEMPT Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : populate_free_space_tree+0x514/0x518 fs/btrfs/free-space-tree.c:1102 lr : populate_free_space_tree+0x514/0x518 fs/btrfs/free-space-tree.c:1102 sp : ffff8000a4ce7600 x29: ffff8000a4ce76e0 x28: ffff0000c9bc6000 x27: ffff0000ddfff3d8 x26: ffff0000ddfff378 x25: dfff800000000000 x24: 0000000000000001 x23: ffff8000a4ce7660 x22: ffff70001499cecc x21: ffff0000e1d8c160 x20: ffff0000e1cb7800 x19: ffff0000e1d8c0b0 x18: 00000000ffffffff x17: ffff800092f39000 x16: ffff80008ad27e48 x15: ffff700011e740c0 x14: 1ffff00011e740c0 x13: 0000000000000004 x12: ffffffffffffffff x11: ffff700011e740c0 x10: 0000000000ff0100 x9 : 94ef24f55d2dbc00 x8 : 94ef24f55d2dbc00 x7 : 0000000000000001 x6 : 0000000000000001 x5 : ffff8000a4ce6f98 x4 : ffff80008f415ba0 x3 : ffff800080548ef0 x2 : 0000000000000000 x1 : 0000000100000000 x0 : 000000000000003e Call trace: populate_free_space_tree+0x514/0x518 fs/btrfs/free-space-tree.c:1102 (P) btrfs_rebuild_free_space_tree+0x14c/0x54c fs/btrfs/free-space-tree.c:1337 btrfs_start_pre_rw_mount+0xa78/0xe10 fs/btrfs/disk-io.c:3074 btrfs_remount_rw fs/btrfs/super.c:1319 [inline] btrfs_reconfigure+0x828/0x2418 fs/btrfs/super.c:1543 reconfigure_super+0x1d4/0x6f0 fs/super.c:1083 do_remount fs/namespace.c:3365 [inline] path_mount+0xb34/0xde0 fs/namespace.c:4200 do_mount fs/namespace.c:4221 [inline] __do_sys_mount fs/namespace.c:4432 [inline] __se_sys_mount fs/namespace.c:4409 [inline] __arm64_sys_mount+0x3e8/0x468 fs/namespace.c:4409 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x58/0x17c arch/arm64/kernel/entry-common.c:767 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 Code: f0047182 91178042 528089c3 9771d47b (d4210000) ---[ end trace 0000000000000000 ]--- This happens because we are processing an empty block group, which has no extents allocated from it, there are no items for this block group, including the block group item since block group items are stored in a dedicated tree when using the block group tree feature. It also means this is the block group with the highest start offset, so there are no higher keys in the extent root, hence btrfs_search_slot_for_read() returns 1 (no higher key found). Fix this by asserting 'ret' is 0 only if the block group tree feature is not enabled, in which case we should find a block group item for the block group since it's stored in the extent root and block group item keys are greater than extent item keys (the value for BTRFS_BLOCK_GROUP_ITEM_KEY is 192 and for BTRFS_EXTENT_ITEM_KEY and BTRFS_METADATA_ITEM_KEY the values are 168 and 169 respectively). In case 'ret' is 1, we just need to add a record to the free space tree which spans the whole block group, and we can achieve this by making 'ret == 0' as the while loop's condition. Reported-by: syzbot+36fae25c35159a763a2a@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6841dca8.a00a0220.d4325.0020.GAE@google.com/ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-tree.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 0c573d46639a..a3e2a2a81461 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1115,11 +1115,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); if (ret < 0) goto out_locked; - ASSERT(ret == 0); + /* + * If ret is 1 (no key found), it means this is an empty block group, + * without any extents allocated from it and there's no block group + * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree + * because we are using the block group tree feature, so block group + * items are stored in the block group tree. It also means there are no + * extents allocated for block groups with a start offset beyond this + * block group's end offset (this is the last, highest, block group). + */ + if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) + ASSERT(ret == 0); start = block_group->start; end = block_group->start + block_group->length; - while (1) { + while (ret == 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.type == BTRFS_EXTENT_ITEM_KEY || @@ -1149,8 +1159,6 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = btrfs_next_item(extent_root, path); if (ret < 0) goto out_locked; - if (ret) - break; } if (start < end) { ret = __add_to_free_space_tree(trans, block_group, path2, From a26bf338cdad3643a6e7c3d78a172baadba15c1a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 4 Jun 2025 16:54:44 +0100 Subject: [PATCH 176/284] btrfs: fix race between async reclaim worker and close_ctree() Syzbot reported an assertion failure due to an attempt to add a delayed iput after we have set BTRFS_FS_STATE_NO_DELAYED_IPUT in the fs_info state: WARNING: CPU: 0 PID: 65 at fs/btrfs/inode.c:3420 btrfs_add_delayed_iput+0x2f8/0x370 fs/btrfs/inode.c:3420 Modules linked in: CPU: 0 UID: 0 PID: 65 Comm: kworker/u8:4 Not tainted 6.15.0-next-20250530-syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Workqueue: btrfs-endio-write btrfs_work_helper RIP: 0010:btrfs_add_delayed_iput+0x2f8/0x370 fs/btrfs/inode.c:3420 Code: 4e ad 5d (...) RSP: 0018:ffffc9000213f780 EFLAGS: 00010293 RAX: ffffffff83c635b7 RBX: ffff888058920000 RCX: ffff88801c769e00 RDX: 0000000000000000 RSI: 0000000000000100 RDI: 0000000000000000 RBP: 0000000000000001 R08: ffff888058921b67 R09: 1ffff1100b12436c R10: dffffc0000000000 R11: ffffed100b12436d R12: 0000000000000001 R13: dffffc0000000000 R14: ffff88807d748000 R15: 0000000000000100 FS: 0000000000000000(0000) GS:ffff888125c53000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00002000000bd038 CR3: 000000006a142000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: btrfs_put_ordered_extent+0x19f/0x470 fs/btrfs/ordered-data.c:635 btrfs_finish_one_ordered+0x11d8/0x1b10 fs/btrfs/inode.c:3312 btrfs_work_helper+0x399/0xc20 fs/btrfs/async-thread.c:312 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xae1/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x70e/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 This can happen due to a race with the async reclaim worker like this: 1) The async metadata reclaim worker enters shrink_delalloc(), which calls btrfs_start_delalloc_roots() with an nr_pages argument that has a value less than LONG_MAX, and that in turn enters start_delalloc_inodes(), which sets the local variable 'full_flush' to false because wbc->nr_to_write is less than LONG_MAX; 2) There it finds inode X in a root's delalloc list, grabs a reference for inode X (with igrab()), and triggers writeback for it with filemap_fdatawrite_wbc(), which creates an ordered extent for inode X; 3) The unmount sequence starts from another task, we enter close_ctree() and we flush the workqueue fs_info->endio_write_workers, which waits for the ordered extent for inode X to complete and when dropping the last reference of the ordered extent, with btrfs_put_ordered_extent(), when we call btrfs_add_delayed_iput() we don't add the inode to the list of delayed iputs because it has a refcount of 2, so we decrement it to 1 and return; 4) Shortly after at close_ctree() we call btrfs_run_delayed_iputs() which runs all delayed iputs, and then we set BTRFS_FS_STATE_NO_DELAYED_IPUT in the fs_info state; 5) The async reclaim worker, after calling filemap_fdatawrite_wbc(), now calls btrfs_add_delayed_iput() for inode X and there we trigger an assertion failure since the fs_info state has the flag BTRFS_FS_STATE_NO_DELAYED_IPUT set. Fix this by setting BTRFS_FS_STATE_NO_DELAYED_IPUT only after we wait for the async reclaim workers to finish, after we call cancel_work_sync() for them at close_ctree(), and by running delayed iputs after wait for the reclaim workers to finish and before setting the bit. This race was recently introduced by commit 19e60b2a95f5 ("btrfs: add extra warning if delayed iput is added when it's not allowed"). Without the new validation at btrfs_add_delayed_iput(), this described scenario was safe because close_ctree() later calls btrfs_commit_super(). That will run any final delayed iputs added by reclaim workers in the window between the btrfs_run_delayed_iputs() and the the reclaim workers being shut down. Reported-by: syzbot+0ed30ad435bf6f5b7a42@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/6840481c.a00a0220.d4325.000c.GAE@google.com/T/#u Fixes: 19e60b2a95f5 ("btrfs: add extra warning if delayed iput is added when it's not allowed") Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3def93016963..f48f9d924a62 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4312,8 +4312,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) * * So wait for all ongoing ordered extents to complete and then run * delayed iputs. This works because once we reach this point no one - * can either create new ordered extents nor create delayed iputs - * through some other means. + * can create new ordered extents, but delayed iputs can still be added + * by a reclaim worker (see comments further below). * * Also note that btrfs_wait_ordered_roots() is not safe here, because * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent, @@ -4324,15 +4324,29 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->endio_write_workers); /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); + /* + * Run delayed iputs in case an async reclaim worker is waiting for them + * to be run as mentioned above. + */ btrfs_run_delayed_iputs(fs_info); - /* There should be no more workload to generate new delayed iputs. */ - set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); cancel_work_sync(&fs_info->em_shrinker_work); + /* + * Run delayed iputs again because an async reclaim worker may have + * added new ones if it was flushing delalloc: + * + * shrink_delalloc() -> btrfs_start_delalloc_roots() -> + * start_delalloc_inodes() -> btrfs_add_delayed_iput() + */ + btrfs_run_delayed_iputs(fs_info); + + /* There should be no more workload to generate new delayed iputs. */ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); From 547e836661554dcfa15c212a3821664e85b4191a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 7 Jun 2025 09:18:43 +0930 Subject: [PATCH 177/284] btrfs: handle csum tree error with rescue=ibadroots correctly [BUG] There is syzbot based reproducer that can crash the kernel, with the following call trace: (With some debug output added) DEBUG: rescue=ibadroots parsed BTRFS: device fsid 14d642db-7b15-43e4-81e6-4b8fac6a25f8 devid 1 transid 8 /dev/loop0 (7:0) scanned by repro (1010) BTRFS info (device loop0): first mount of filesystem 14d642db-7b15-43e4-81e6-4b8fac6a25f8 BTRFS info (device loop0): using blake2b (blake2b-256-generic) checksum algorithm BTRFS info (device loop0): using free-space-tree BTRFS warning (device loop0): checksum verify failed on logical 5312512 mirror 1 wanted 0xb043382657aede36608fd3386d6b001692ff406164733d94e2d9a180412c6003 found 0x810ceb2bacb7f0f9eb2bf3b2b15c02af867cb35ad450898169f3b1f0bd818651 level 0 DEBUG: read tree root path failed for tree csum, ret=-5 BTRFS warning (device loop0): checksum verify failed on logical 5328896 mirror 1 wanted 0x51be4e8b303da58e6340226815b70e3a93592dac3f30dd510c7517454de8567a found 0x51be4e8b303da58e634022a315b70e3a93592dac3f30dd510c7517454de8567a level 0 BTRFS warning (device loop0): checksum verify failed on logical 5292032 mirror 1 wanted 0x1924ccd683be9efc2fa98582ef58760e3848e9043db8649ee382681e220cdee4 found 0x0cb6184f6e8799d9f8cb335dccd1d1832da1071d12290dab3b85b587ecacca6e level 0 process 'repro' launched './file2' with NULL argv: empty string added DEBUG: no csum root, idatacsums=0 ibadroots=134217728 Oops: general protection fault, probably for non-canonical address 0xdffffc0000000041: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000208-0x000000000000020f] CPU: 5 UID: 0 PID: 1010 Comm: repro Tainted: G OE 6.15.0-custom+ #249 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS unknown 02/02/2022 RIP: 0010:btrfs_lookup_csum+0x93/0x3d0 [btrfs] Call Trace: btrfs_lookup_bio_sums+0x47a/0xdf0 [btrfs] btrfs_submit_bbio+0x43e/0x1a80 [btrfs] submit_one_bio+0xde/0x160 [btrfs] btrfs_readahead+0x498/0x6a0 [btrfs] read_pages+0x1c3/0xb20 page_cache_ra_order+0x4b5/0xc20 filemap_get_pages+0x2d3/0x19e0 filemap_read+0x314/0xde0 __kernel_read+0x35b/0x900 bprm_execve+0x62e/0x1140 do_execveat_common.isra.0+0x3fc/0x520 __x64_sys_execveat+0xdc/0x130 do_syscall_64+0x54/0x1d0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ---[ end trace 0000000000000000 ]--- [CAUSE] Firstly the fs has a corrupted csum tree root, thus to mount the fs we have to go "ro,rescue=ibadroots" mount option. Normally with that mount option, a bad csum tree root should set BTRFS_FS_STATE_NO_DATA_CSUMS flag, so that any future data read will ignore csum search. But in this particular case, we have the following call trace that caused NULL csum root, but not setting BTRFS_FS_STATE_NO_DATA_CSUMS: load_global_roots_objectid(): ret = btrfs_search_slot(); /* Succeeded */ btrfs_item_key_to_cpu() found = true; /* We found the root item for csum tree. */ root = read_tree_root_path(); if (IS_ERR(root)) { if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) /* * Since we have rescue=ibadroots mount option, * @ret is still 0. */ break; if (!found || ret) { /* @found is true, @ret is 0, error handling for csum * tree is skipped. */ } This means we completely skipped to set BTRFS_FS_STATE_NO_DATA_CSUMS if the csum tree is corrupted, which results unexpected later csum lookup. [FIX] If read_tree_root_path() failed, always populate @ret to the error number. As at the end of the function, we need @ret to determine if we need to do the extra error handling for csum tree. Fixes: abed4aaae4f7 ("btrfs: track the csum, extent, and free space trees in a rb tree") Reported-by: Zhiyu Zhang Reported-by: Longxing Li Reviewed-by: David Sterba Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f48f9d924a62..0d6ad7512f21 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2158,8 +2158,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) - ret = PTR_ERR(root); + ret = PTR_ERR(root); break; } set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); From c0d90a79e8e65b89037508276b2b31f41a1b3783 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Fri, 6 Jun 2025 09:17:41 +0200 Subject: [PATCH 178/284] btrfs: zoned: fix alloc_offset calculation for partly conventional block groups When one of two zones composing a DUP block group is a conventional zone, we have the zone_info[i]->alloc_offset = WP_CONVENTIONAL. That will, of course, not match the write pointer of the other zone, and fails that block group. This commit solves that issue by properly recovering the emulated write pointer from the last allocated extent. The offset for the SINGLE, DUP, and RAID1 are straight-forward: it is same as the end of last allocated extent. The RAID0 and RAID10 are a bit tricky that we need to do the math of striping. This is the kernel equivalent of Naohiro's user-space commit: "btrfs-progs: zoned: fix alloc_offset calculation for partly conventional block groups". Reviewed-by: Naohiro Aota Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 86 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index b5b0156d5b95..9430b34d3cbb 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1403,7 +1403,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1426,6 +1427,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, zone_info[1].physical); return -EIO; } + + if (zone_info[0].alloc_offset == WP_CONVENTIONAL) + zone_info[0].alloc_offset = last_alloc; + + if (zone_info[1].alloc_offset == WP_CONVENTIONAL) + zone_info[1].alloc_offset = last_alloc; + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { btrfs_err(bg->fs_info, "zoned: write pointer offset mismatch of zones in DUP profile"); @@ -1446,7 +1454,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; int i; @@ -1461,10 +1470,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); for (i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + zone_info[i].alloc_offset = last_alloc; + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && !btrfs_test_opt(fs_info, DEGRADED)) { btrfs_err(fs_info, @@ -1494,7 +1505,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1505,10 +1517,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, map->num_stripes); + div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > i) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == i) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if (test_bit(0, active) != test_bit(i, active)) { if (!btrfs_zone_activate(bg)) return -EIO; @@ -1526,7 +1557,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, struct btrfs_chunk_map *map, struct zone_info *zone_info, - unsigned long *active) + unsigned long *active, + u64 last_alloc) { struct btrfs_fs_info *fs_info = bg->fs_info; @@ -1537,8 +1569,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, } for (int i = 0; i < map->num_stripes; i++) { - if (zone_info[i].alloc_offset == WP_MISSING_DEV || - zone_info[i].alloc_offset == WP_CONVENTIONAL) + if (zone_info[i].alloc_offset == WP_MISSING_DEV) continue; if (test_bit(0, active) != test_bit(i, active)) { @@ -1549,6 +1580,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { + u64 stripe_nr, full_stripe_nr; + u64 stripe_offset; + int stripe_index; + + stripe_nr = div64_u64(last_alloc, map->stripe_size); + stripe_offset = stripe_nr * map->stripe_size; + full_stripe_nr = div_u64(stripe_nr, + map->num_stripes / map->sub_stripes); + div_u64_rem(stripe_nr, + (map->num_stripes / map->sub_stripes), + &stripe_index); + + zone_info[i].alloc_offset = + full_stripe_nr * map->stripe_size; + + if (stripe_index > (i / map->sub_stripes)) + zone_info[i].alloc_offset += map->stripe_size; + else if (stripe_index == (i / map->sub_stripes)) + zone_info[i].alloc_offset += + (last_alloc - stripe_offset); + } + if ((i % map->sub_stripes) == 0) { bg->zone_capacity += zone_info[i].capacity; bg->alloc_offset += zone_info[i].alloc_offset; @@ -1637,18 +1691,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - ret = btrfs_load_block_group_dup(cache, map, zone_info, active); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active, + last_alloc); break; case BTRFS_BLOCK_GROUP_RAID1: case BTRFS_BLOCK_GROUP_RAID1C3: case BTRFS_BLOCK_GROUP_RAID1C4: - ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid1(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID0: - ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid0(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID10: - ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + ret = btrfs_load_block_group_raid10(cache, map, zone_info, + active, last_alloc); break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: From 8ea688a3372e8369dc04395b39b4e71a6d91d4d5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 27 May 2025 20:12:47 -0400 Subject: [PATCH 179/284] nfsd: use threads array as-is in netlink interface The old nfsdfs interface for starting a server with multiple pools handles the special case of a single entry array passed down from userland by distributing the threads over every NUMA node. The netlink control interface however constructs an array of length nfsd_nrpools() and fills any unprovided slots with 0's. This behavior defeats the special casing that the old interface relies on. Change nfsd_nl_threads_set_doit() to pass down the array from userland as-is. Fixes: 7f5c330b2620 ("nfsd: allow passing in array of thread counts via netlink") Cc: stable@vger.kernel.org Reported-by: Mike Snitzer Closes: https://lore.kernel.org/linux-nfs/aDC-ftnzhJAlwqwh@kernel.org/ Signed-off-by: Jeff Layton Reviewed-by: Simon Horman Tested-by: Mike Snitzer Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3f3e9f6c4250..6a42cc7a845a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1611,7 +1611,7 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, */ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) { - int *nthreads, count = 0, nrpools, i, ret = -EOPNOTSUPP, rem; + int *nthreads, nrpools = 0, i, ret = -EOPNOTSUPP, rem; struct net *net = genl_info_net(info); struct nfsd_net *nn = net_generic(net, nfsd_net_id); const struct nlattr *attr; @@ -1623,12 +1623,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) /* count number of SERVER_THREADS values */ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { if (nla_type(attr) == NFSD_A_SERVER_THREADS) - count++; + nrpools++; } mutex_lock(&nfsd_mutex); - nrpools = max(count, nfsd_nrpools(net)); nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL); if (!nthreads) { ret = -ENOMEM; From 94d10a4dba0bc482f2b01e39f06d5513d0f75742 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Jun 2025 06:01:55 -0400 Subject: [PATCH 180/284] sunrpc: handle SVC_GARBAGE during svc auth processing as auth error tianshuo han reported a remotely-triggerable crash if the client sends a kernel RPC server a specially crafted packet. If decoding the RPC reply fails in such a way that SVC_GARBAGE is returned without setting the rq_accept_statp pointer, then that pointer can be dereferenced and a value stored there. If it's the first time the thread has processed an RPC, then that pointer will be set to NULL and the kernel will crash. In other cases, it could create a memory scribble. The server sunrpc code treats a SVC_GARBAGE return from svc_authenticate or pg_authenticate as if it should send a GARBAGE_ARGS reply. RFC 5531 says that if authentication fails that the RPC should be rejected instead with a status of AUTH_ERR. Handle a SVC_GARBAGE return as an AUTH_ERROR, with a reason of AUTH_BADCRED instead of returning GARBAGE_ARGS in that case. This sidesteps the whole problem of touching the rpc_accept_statp pointer in this situation and avoids the crash. Cc: stable@kernel.org Fixes: 29cd2927fb91 ("SUNRPC: Fix encoding of accepted but unsuccessful RPC replies") Reported-by: tianshuo han Reviewed-by: Chuck Lever Signed-off-by: Jeff Layton Signed-off-by: Chuck Lever --- net/sunrpc/svc.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index ef8a05aac87f..9c93b854e809 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1371,7 +1371,8 @@ svc_process_common(struct svc_rqst *rqstp) case SVC_OK: break; case SVC_GARBAGE: - goto err_garbage_args; + rqstp->rq_auth_stat = rpc_autherr_badcred; + goto err_bad_auth; case SVC_SYSERR: goto err_system_err; case SVC_DENIED: @@ -1512,14 +1513,6 @@ svc_process_common(struct svc_rqst *rqstp) *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; -err_garbage_args: - svc_printk(rqstp, "failed to decode RPC header\n"); - - if (serv->sv_stats) - serv->sv_stats->rpcbadfmt++; - *rqstp->rq_accept_statp = rpc_garbage_args; - goto sendit; - err_system_err: if (serv->sv_stats) serv->sv_stats->rpcbadfmt++; From 8c8472855884355caf3d8e0c50adf825f83454b2 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Thu, 19 Jun 2025 12:10:31 +1000 Subject: [PATCH 181/284] ublk: santizize the arguments from userspace when adding a device Sanity check the values for queue depth and number of queues we get from userspace when adding a device. Signed-off-by: Ronnie Sahlberg Reviewed-by: Ming Lei Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Fixes: 62fe99cef94a ("ublk: add read()/write() support for ublk char device") Link: https://lore.kernel.org/r/20250619021031.181340-1-ronniesahlberg@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c637ea010d34..d36f44f5ee80 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2825,6 +2825,9 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) if (copy_from_user(&info, argp, sizeof(info))) return -EFAULT; + if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || info.nr_hw_queues > UBLK_MAX_NR_QUEUES) + return -EINVAL; + if (capable(CAP_SYS_ADMIN)) info.flags &= ~UBLK_F_UNPRIVILEGED_DEV; else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV)) From 16c1241b08751a67cd7a0221ea9f82b0b02806f4 Mon Sep 17 00:00:00 2001 From: Vinay Belgaumkar Date: Thu, 12 Jun 2025 00:09:01 -0700 Subject: [PATCH 182/284] drm/xe/bmg: Update Wa_16023588340 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows for additional L2 caching modes. Fixes: 01570b446939 ("drm/xe/bmg: implement Wa_16023588340") Cc: Matthew Auld Reviewed-by: Matthew Auld Signed-off-by: Vinay Belgaumkar Link: https://lore.kernel.org/r/20250612-wa-14022085890-v4-2-94ba5dcc1e30@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 6ab42fa03d4c88a0ddf5e56e62794853b198e7bf) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_gt.c b/drivers/gpu/drm/xe/xe_gt.c index 0e5d243c9451..6c4cb9576fb6 100644 --- a/drivers/gpu/drm/xe/xe_gt.c +++ b/drivers/gpu/drm/xe/xe_gt.c @@ -118,7 +118,7 @@ static void xe_gt_enable_host_l2_vram(struct xe_gt *gt) xe_gt_mcr_multicast_write(gt, XE2_GAMREQSTRM_CTRL, reg); } - xe_gt_mcr_multicast_write(gt, XEHPC_L3CLOS_MASK(3), 0x3); + xe_gt_mcr_multicast_write(gt, XEHPC_L3CLOS_MASK(3), 0xF); xe_force_wake_put(gt_to_fw(gt), fw_ref); } From 87a15c89d8c7b00b0fc94e0d4f554f7ee2fe6961 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Thu, 12 Jun 2025 15:14:12 -0700 Subject: [PATCH 183/284] drm/xe: Fix memset on iomem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It should rather use xe_map_memset() as the BO is created with XE_BO_FLAG_VRAM_IF_DGFX in xe_guc_pc_init(). Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: stable@vger.kernel.org Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250612-vmap-vaddr-v1-1-26238ed443eb@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 21cf47d89fba353b2d5915ba4718040c4cb955d3) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_guc_pc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/xe/xe_guc_pc.c b/drivers/gpu/drm/xe/xe_guc_pc.c index 18c623992035..3beaaa7b25c1 100644 --- a/drivers/gpu/drm/xe/xe_guc_pc.c +++ b/drivers/gpu/drm/xe/xe_guc_pc.c @@ -1068,7 +1068,7 @@ int xe_guc_pc_start(struct xe_guc_pc *pc) goto out; } - memset(pc->bo->vmap.vaddr, 0, size); + xe_map_memset(xe, &pc->bo->vmap, 0, 0, size); slpc_shared_data_write(pc, header.size, size); earlier = ktime_get(); From 16ea4666bbb7f5bd1130fa2d75631ccf8b62362e Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Thu, 19 Jun 2025 13:47:05 +0300 Subject: [PATCH 184/284] ASoC: Intel: sof-function-topology-lib: Print out the unsupported dmic count It is better to print out the non supported num_dmics than printing that it is not matching with 2 or 4. Fixes: 2fbeff33381c ("ASoC: Intel: add sof_sdw_get_tplg_files ops") Cc: stable@vger.kernel.org Signed-off-by: Peter Ujfalusi Reviewed-by: Bard Liao Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20250619104705.26057-1-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- sound/soc/intel/common/sof-function-topology-lib.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/intel/common/sof-function-topology-lib.c b/sound/soc/intel/common/sof-function-topology-lib.c index 90fe7aa3df1c..3cc81dcf047e 100644 --- a/sound/soc/intel/common/sof-function-topology-lib.c +++ b/sound/soc/intel/common/sof-function-topology-lib.c @@ -73,7 +73,8 @@ int sof_sdw_get_tplg_files(struct snd_soc_card *card, const struct snd_soc_acpi_ break; default: dev_warn(card->dev, - "only -2ch and -4ch are supported for dmic\n"); + "unsupported number of dmics: %d\n", + mach_params.dmic_num); continue; } tplg_dev = TPLG_DEVICE_INTEL_PCH_DMIC; From a39d082c3553d35b4fe5585e1e2fb221c130cae8 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Wed, 11 Jun 2025 14:44:54 -0700 Subject: [PATCH 185/284] drm/xe: Fix early wedge on GuC load failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the GuC fails to load we declare the device wedged. However, the very first GuC load attempt on GT0 (from xe_gt_init_hwconfig) is done before the GT1 GuC objects are initialized, so things go bad when the wedge code attempts to cleanup GT1. To fix this, check the initialization status in the functions called during wedge. Fixes: 7dbe8af13c18 ("drm/xe: Wedge the entire device") Signed-off-by: Daniele Ceraolo Spurio Cc: Rodrigo Vivi Cc: Matthew Brost Cc: Jonathan Cavitt Cc: Lucas De Marchi Cc: Zhanjun Dong Cc: stable@vger.kernel.org # v6.12+: 1e1981b16bb1: drm/xe: Fix taking invalid lock on wedge Cc: stable@vger.kernel.org # v6.12+ Reviewed-by: Jonathan Cavitt Reviewed-by: Lucas De Marchi Link: https://lore.kernel.org/r/20250611214453.1159846-2-daniele.ceraolospurio@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit 0b93b7dcd9eb888a6ac7546560877705d4ad61bf) Signed-off-by: Thomas Hellström --- drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c | 8 ++++++++ drivers/gpu/drm/xe/xe_guc_ct.c | 7 +++++-- drivers/gpu/drm/xe/xe_guc_ct.h | 5 +++++ drivers/gpu/drm/xe/xe_guc_submit.c | 3 +++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c index 084cbdeba8ea..e1362e608146 100644 --- a/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c +++ b/drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c @@ -137,6 +137,14 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt) struct xe_gt_tlb_invalidation_fence *fence, *next; int pending_seqno; + /* + * we can get here before the CTs are even initialized if we're wedging + * very early, in which case there are not going to be any pending + * fences so we can bail immediately. + */ + if (!xe_guc_ct_initialized(>->uc.guc.ct)) + return; + /* * CT channel is already disabled at this point. No new TLB requests can * appear. diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c index 2447de0ebedf..d0ac48d8f4f7 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.c +++ b/drivers/gpu/drm/xe/xe_guc_ct.c @@ -514,6 +514,9 @@ void xe_guc_ct_disable(struct xe_guc_ct *ct) */ void xe_guc_ct_stop(struct xe_guc_ct *ct) { + if (!xe_guc_ct_initialized(ct)) + return; + xe_guc_ct_set_state(ct, XE_GUC_CT_STATE_STOPPED); stop_g2h_handler(ct); } @@ -760,7 +763,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u16 seqno; int ret; - xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED); + xe_gt_assert(gt, xe_guc_ct_initialized(ct)); xe_gt_assert(gt, !g2h_len || !g2h_fence); xe_gt_assert(gt, !num_g2h || !g2h_fence); xe_gt_assert(gt, !g2h_len || num_g2h); @@ -1344,7 +1347,7 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path) u32 action; u32 *hxg; - xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED); + xe_gt_assert(gt, xe_guc_ct_initialized(ct)); lockdep_assert_held(&ct->fast_lock); if (ct->state == XE_GUC_CT_STATE_DISABLED) diff --git a/drivers/gpu/drm/xe/xe_guc_ct.h b/drivers/gpu/drm/xe/xe_guc_ct.h index 82c4ae458dda..582aac106469 100644 --- a/drivers/gpu/drm/xe/xe_guc_ct.h +++ b/drivers/gpu/drm/xe/xe_guc_ct.h @@ -22,6 +22,11 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, struct drm_pr void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot); void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb); +static inline bool xe_guc_ct_initialized(struct xe_guc_ct *ct) +{ + return ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED; +} + static inline bool xe_guc_ct_enabled(struct xe_guc_ct *ct) { return ct->state == XE_GUC_CT_STATE_ENABLED; diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index 6d84a52b660a..9567f6700cf2 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -1762,6 +1762,9 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc) { int ret; + if (!guc->submission_state.initialized) + return 0; + /* * Using an atomic here rather than submission_state.lock as this * function can be called while holding the CT lock (engine reset From 6463cbe08b0cbf9bba8763306764f5fd643023e1 Mon Sep 17 00:00:00 2001 From: Pablo Martin-Gomez Date: Wed, 18 Jun 2025 13:35:16 +0200 Subject: [PATCH 186/284] mtd: spinand: fix memory leak of ECC engine conf Memory allocated for the ECC engine conf is not released during spinand cleanup. Below kmemleak trace is seen for this memory leak: unreferenced object 0xffffff80064f00e0 (size 8): comm "swapper/0", pid 1, jiffies 4294937458 hex dump (first 8 bytes): 00 00 00 00 00 00 00 00 ........ backtrace (crc 0): kmemleak_alloc+0x30/0x40 __kmalloc_cache_noprof+0x208/0x3c0 spinand_ondie_ecc_init_ctx+0x114/0x200 nand_ecc_init_ctx+0x70/0xa8 nanddev_ecc_engine_init+0xec/0x27c spinand_probe+0xa2c/0x1620 spi_mem_probe+0x130/0x21c spi_probe+0xf0/0x170 really_probe+0x17c/0x6e8 __driver_probe_device+0x17c/0x21c driver_probe_device+0x58/0x180 __device_attach_driver+0x15c/0x1f8 bus_for_each_drv+0xec/0x150 __device_attach+0x188/0x24c device_initial_probe+0x10/0x20 bus_probe_device+0x11c/0x160 Fix the leak by calling nanddev_ecc_engine_cleanup() inside spinand_cleanup(). Signed-off-by: Pablo Martin-Gomez Signed-off-by: Miquel Raynal --- drivers/mtd/nand/spi/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/nand/spi/core.c b/drivers/mtd/nand/spi/core.c index 7099db7a62be..c411fe9be3ef 100644 --- a/drivers/mtd/nand/spi/core.c +++ b/drivers/mtd/nand/spi/core.c @@ -1585,6 +1585,7 @@ static void spinand_cleanup(struct spinand_device *spinand) { struct nand_device *nand = spinand_to_nand(spinand); + nanddev_ecc_engine_cleanup(nand); nanddev_cleanup(nand); spinand_manufacturer_cleanup(spinand); kfree(spinand->databuf); From fde46f60f6c5138ee422087addbc5bf5b4968bf1 Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 10 Jun 2025 15:48:27 -0400 Subject: [PATCH 187/284] selinux: change security_compute_sid to return the ssid or tsid on match If the end result of a security_compute_sid() computation matches the ssid or tsid, return that SID rather than looking it up again. This avoids the problem of multiple initial SIDs that map to the same context. Cc: stable@vger.kernel.org Reported-by: Guido Trentalancia Fixes: ae254858ce07 ("selinux: introduce an initial SID for early boot processes") Signed-off-by: Stephen Smalley Tested-by: Guido Trentalancia Signed-off-by: Paul Moore --- security/selinux/ss/services.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index 7becf3808818..d185754c2786 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -1909,11 +1909,17 @@ static int security_compute_sid(u32 ssid, goto out_unlock; } /* Obtain the sid for the context. */ - rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); - if (rc == -ESTALE) { - rcu_read_unlock(); - context_destroy(&newcontext); - goto retry; + if (context_equal(scontext, &newcontext)) + *out_sid = ssid; + else if (context_equal(tcontext, &newcontext)) + *out_sid = tsid; + else { + rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); + if (rc == -ESTALE) { + rcu_read_unlock(); + context_destroy(&newcontext); + goto retry; + } } out_unlock: rcu_read_unlock(); From e0fca6f2cebff539e9317a15a37dcf432e3b851a Mon Sep 17 00:00:00 2001 From: Long Li Date: Tue, 17 Jun 2025 18:36:46 -0700 Subject: [PATCH 188/284] net: mana: Record doorbell physical address in PF mode MANA supports RDMA in PF mode. The driver should record the doorbell physical address when in PF mode. The doorbell physical address is used by the RDMA driver to map doorbell pages of the device to user-mode applications through RDMA verbs interface. In the past, they have been mapped to user-mode while the device is in VF mode. With the support for PF mode implemented, also expose those pages in PF mode. Support for PF mode is implemented in 290e5d3c49f6 ("net: mana: Add support for Multi Vports on Bare metal") Signed-off-by: Long Li Reviewed-by: Simon Horman Link: https://patch.msgid.link/1750210606-12167-1-git-send-email-longli@linuxonhyperv.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 3504507477c6..52cf7112762c 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -31,6 +31,9 @@ static void mana_gd_init_pf_regs(struct pci_dev *pdev) gc->db_page_base = gc->bar0_va + mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + gc->phys_db_page_base = gc->bar0_pa + + mana_gd_r64(gc, GDMA_PF_REG_DB_PAGE_OFF); + sriov_base_off = mana_gd_r64(gc, GDMA_SRIOV_REG_CFG_BASE_OFF); sriov_base_va = gc->bar0_va + sriov_base_off; From 752eb816b55adb0673727ba0ed96609a17895654 Mon Sep 17 00:00:00 2001 From: Chen Yu Date: Wed, 4 Jun 2025 12:25:56 +0800 Subject: [PATCH 189/284] scsi: megaraid_sas: Fix invalid node index On a system with DRAM interleave enabled, out-of-bound access is detected: megaraid_sas 0000:3f:00.0: requested/available msix 128/128 poll_queue 0 ------------[ cut here ]------------ UBSAN: array-index-out-of-bounds in ./arch/x86/include/asm/topology.h:72:28 index -1 is out of range for type 'cpumask *[1024]' dump_stack_lvl+0x5d/0x80 ubsan_epilogue+0x5/0x2b __ubsan_handle_out_of_bounds.cold+0x46/0x4b megasas_alloc_irq_vectors+0x149/0x190 [megaraid_sas] megasas_probe_one.cold+0xa4d/0x189c [megaraid_sas] local_pci_probe+0x42/0x90 pci_device_probe+0xdc/0x290 really_probe+0xdb/0x340 __driver_probe_device+0x78/0x110 driver_probe_device+0x1f/0xa0 __driver_attach+0xba/0x1c0 bus_for_each_dev+0x8b/0xe0 bus_add_driver+0x142/0x220 driver_register+0x72/0xd0 megasas_init+0xdf/0xff0 [megaraid_sas] do_one_initcall+0x57/0x310 do_init_module+0x90/0x250 init_module_from_file+0x85/0xc0 idempotent_init_module+0x114/0x310 __x64_sys_finit_module+0x65/0xc0 do_syscall_64+0x82/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fix it accordingly. Signed-off-by: Chen Yu Link: https://lore.kernel.org/r/20250604042556.3731059-1-yu.c.chen@intel.com Fixes: 8049da6f3943 ("scsi: megaraid_sas: Use irq_set_affinity_and_hint()") Cc: stable@vger.kernel.org Signed-off-by: Martin K. Petersen --- drivers/scsi/megaraid/megaraid_sas_base.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 3aac0e17cb00..9179f8aee964 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5910,7 +5910,11 @@ megasas_set_high_iops_queue_affinity_and_hint(struct megasas_instance *instance) const struct cpumask *mask; if (instance->perf_mode == MR_BALANCED_PERF_MODE) { - mask = cpumask_of_node(dev_to_node(&instance->pdev->dev)); + int nid = dev_to_node(&instance->pdev->dev); + + if (nid == NUMA_NO_NODE) + nid = 0; + mask = cpumask_of_node(nid); for (i = 0; i < instance->low_latency_index_start; i++) { irq = pci_irq_vector(instance->pdev, i); From 2e083cd802294693a5414e4557a183dd7e442e71 Mon Sep 17 00:00:00 2001 From: anvithdosapati Date: Mon, 16 Jun 2025 08:57:34 +0000 Subject: [PATCH 190/284] scsi: ufs: core: Fix clk scaling to be conditional in reset and restore In ufshcd_host_reset_and_restore(), scale up clocks only when clock scaling is supported. Without this change CPU latency is voted for 0 (ufshcd_pm_qos_update) during resume unconditionally. Signed-off-by: anvithdosapati Link: https://lore.kernel.org/r/20250616085734.2133581-1-anvithdosapati@google.com Fixes: a3cd5ec55f6c ("scsi: ufs: add load based scaling of UFS gear") Cc: stable@vger.kernel.org Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index f62d89c8e580..50adfb8b335b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -7807,7 +7807,8 @@ static int ufshcd_host_reset_and_restore(struct ufs_hba *hba) hba->silence_err_logs = false; /* scale up clocks to max frequency before full reinitialization */ - ufshcd_scale_clks(hba, ULONG_MAX, true); + if (ufshcd_is_clkscaling_supported(hba)) + ufshcd_scale_clks(hba, ULONG_MAX, true); err = ufshcd_hba_enable(hba); From a35b29bdedb4d2ae3160d4d6684a6f1ecd9ca7c2 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:28 -0700 Subject: [PATCH 191/284] scsi: fnic: Fix crash in fnic_wq_cmpl_handler when FDMI times out When both the RHBA and RPA FDMI requests time out, fnic reuses a frame to send ABTS for each of them. On send completion, this causes an attempt to free the same frame twice that leads to a crash. Fix crash by allocating separate frames for RHBA and RPA, and modify ABTS logic accordingly. Tested by checking MDS for FDMI information. Tested by using instrumented driver to: - Drop PLOGI response - Drop RHBA response - Drop RPA response - Drop RHBA and RPA response - Drop PLOGI response + ABTS response - Drop RHBA response + ABTS response - Drop RPA response + ABTS response - Drop RHBA and RPA response + ABTS response for both of them Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Tested-by: Arun Easi Co-developed-by: Arun Easi Signed-off-by: Arun Easi Tested-by: Karan Tilak Kumar Cc: stable@vger.kernel.org Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/r/20250618003431.6314-1-kartilak@cisco.com Reviewed-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fdls_disc.c | 113 +++++++++++++++++++++++++--------- drivers/scsi/fnic/fnic.h | 2 +- drivers/scsi/fnic/fnic_fdls.h | 1 + 3 files changed, 87 insertions(+), 29 deletions(-) diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c index f8ab69c51dab..36b498ad55b4 100644 --- a/drivers/scsi/fnic/fdls_disc.c +++ b/drivers/scsi/fnic/fdls_disc.c @@ -763,47 +763,69 @@ static void fdls_send_fabric_abts(struct fnic_iport_s *iport) iport->fabric.timer_pending = 1; } -static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) +static uint8_t *fdls_alloc_init_fdmi_abts_frame(struct fnic_iport_s *iport, + uint16_t oxid) { - uint8_t *frame; + struct fc_frame_header *pfdmi_abts; uint8_t d_id[3]; + uint8_t *frame; struct fnic *fnic = iport->fnic; - struct fc_frame_header *pfabric_abts; - unsigned long fdmi_tov; - uint16_t oxid; - uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + - sizeof(struct fc_frame_header); frame = fdls_alloc_frame(iport); if (frame == NULL) { FNIC_FCS_DBG(KERN_ERR, fnic->host, fnic->fnic_num, "Failed to allocate frame to send FDMI ABTS"); - return; + return NULL; } - pfabric_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET); + pfdmi_abts = (struct fc_frame_header *) (frame + FNIC_ETH_FCOE_HDRS_OFFSET); fdls_init_fabric_abts_frame(frame, iport); hton24(d_id, FC_FID_MGMT_SERV); - FNIC_STD_SET_D_ID(*pfabric_abts, d_id); + FNIC_STD_SET_D_ID(*pfdmi_abts, d_id); + FNIC_STD_SET_OX_ID(*pfdmi_abts, oxid); + + return frame; +} + +static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) +{ + uint8_t *frame; + unsigned long fdmi_tov; + uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + + sizeof(struct fc_frame_header); if (iport->fabric.fdmi_pending & FDLS_FDMI_PLOGI_PENDING) { - oxid = iport->active_oxid_fdmi_plogi; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_plogi); + if (frame == NULL) + return; + fnic_send_fcoe_frame(iport, frame, frame_size); } else { if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) { - oxid = iport->active_oxid_fdmi_rhba; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_rhba); + if (frame == NULL) + return; + fnic_send_fcoe_frame(iport, frame, frame_size); } if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) { - oxid = iport->active_oxid_fdmi_rpa; - FNIC_STD_SET_OX_ID(*pfabric_abts, oxid); + frame = fdls_alloc_init_fdmi_abts_frame(iport, + iport->active_oxid_fdmi_rpa); + if (frame == NULL) { + if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) + goto arm_timer; + else + return; + } + fnic_send_fcoe_frame(iport, frame, frame_size); } } +arm_timer: fdmi_tov = jiffies + msecs_to_jiffies(2 * iport->e_d_tov); mod_timer(&iport->fabric.fdmi_timer, round_jiffies(fdmi_tov)); iport->fabric.fdmi_pending |= FDLS_FDMI_ABORT_PENDING; @@ -2245,6 +2267,21 @@ void fdls_fabric_timer_callback(struct timer_list *t) spin_unlock_irqrestore(&fnic->fnic_lock, flags); } +void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport) +{ + struct fnic *fnic = iport->fnic; + + iport->fabric.fdmi_pending = 0; + /* If max retries not exhausted, start over from fdmi plogi */ + if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) { + iport->fabric.fdmi_retry++; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Retry FDMI PLOGI. FDMI retry: %d", + iport->fabric.fdmi_retry); + fdls_send_fdmi_plogi(iport); + } +} + void fdls_fdmi_timer_callback(struct timer_list *t) { struct fnic_fdls_fabric_s *fabric = timer_container_of(fabric, t, @@ -2291,14 +2328,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t) FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); - iport->fabric.fdmi_pending = 0; - /* If max retries not exhaused, start over from fdmi plogi */ - if (iport->fabric.fdmi_retry < FDLS_FDMI_MAX_RETRY) { - iport->fabric.fdmi_retry++; - FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "retry fdmi timer %d", iport->fabric.fdmi_retry); - fdls_send_fdmi_plogi(iport); - } + fdls_fdmi_retry_plogi(iport); FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); spin_unlock_irqrestore(&fnic->fnic_lock, flags); @@ -3716,11 +3746,32 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, switch (FNIC_FRAME_TYPE(oxid)) { case FNIC_FRAME_TYPE_FDMI_PLOGI: fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_plogi); + + iport->fabric.fdmi_pending &= ~FDLS_FDMI_PLOGI_PENDING; + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; break; case FNIC_FRAME_TYPE_FDMI_RHBA: + iport->fabric.fdmi_pending &= ~FDLS_FDMI_REG_HBA_PENDING; + + /* If RPA is still pending, don't turn off ABORT PENDING. + * We count on the timer to detect the ABTS timeout and take + * corrective action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING)) + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rhba); break; case FNIC_FRAME_TYPE_FDMI_RPA: + iport->fabric.fdmi_pending &= ~FDLS_FDMI_RPA_PENDING; + + /* If RHBA is still pending, don't turn off ABORT PENDING. + * We count on the timer to detect the ABTS timeout and take + * corrective action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING)) + iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rpa); break; default: @@ -3730,10 +3781,16 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, break; } - timer_delete_sync(&iport->fabric.fdmi_timer); - iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; - - fdls_send_fdmi_plogi(iport); + /* + * Only if ABORT PENDING is off, delete the timer, and if no other + * operations are pending, retry FDMI. + * Otherwise, let the timer pop and take the appropriate action. + */ + if (!(iport->fabric.fdmi_pending & FDLS_FDMI_ABORT_PENDING)) { + timer_delete_sync(&iport->fabric.fdmi_timer); + if (!iport->fabric.fdmi_pending) + fdls_fdmi_retry_plogi(iport); + } } static void diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h index 6c5f6046b1f5..86e293ce530d 100644 --- a/drivers/scsi/fnic/fnic.h +++ b/drivers/scsi/fnic/fnic.h @@ -30,7 +30,7 @@ #define DRV_NAME "fnic" #define DRV_DESCRIPTION "Cisco FCoE HBA Driver" -#define DRV_VERSION "1.8.0.0" +#define DRV_VERSION "1.8.0.1" #define PFX DRV_NAME ": " #define DFX DRV_NAME "%d: " diff --git a/drivers/scsi/fnic/fnic_fdls.h b/drivers/scsi/fnic/fnic_fdls.h index 8e610b65ad57..531d0b37e450 100644 --- a/drivers/scsi/fnic/fnic_fdls.h +++ b/drivers/scsi/fnic/fnic_fdls.h @@ -394,6 +394,7 @@ void fdls_send_tport_abts(struct fnic_iport_s *iport, bool fdls_delete_tport(struct fnic_iport_s *iport, struct fnic_tport_s *tport); void fdls_fdmi_timer_callback(struct timer_list *t); +void fdls_fdmi_retry_plogi(struct fnic_iport_s *iport); /* fnic_fcs.c */ void fnic_fdls_init(struct fnic *fnic, int usefip); From 74f46a0524f8d2f01dc7ca95bb5fc463a8603e72 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:29 -0700 Subject: [PATCH 192/284] scsi: fnic: Turn off FDMI ACTIVE flags on link down When the link goes down and comes up, FDMI requests are not sent out anymore. Fix bug by turning off FNIC_FDMI_ACTIVE when the link goes down. Fixes: 09c1e6ab4ab2 ("scsi: fnic: Add and integrate support for FDMI") Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Reviewed-by: Arun Easi Tested-by: Karan Tilak Kumar Cc: stable@vger.kernel.org Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/r/20250618003431.6314-2-kartilak@cisco.com Reviewed-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fdls_disc.c | 9 ++++++--- drivers/scsi/fnic/fnic.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c index 36b498ad55b4..fa9cf0b37d72 100644 --- a/drivers/scsi/fnic/fdls_disc.c +++ b/drivers/scsi/fnic/fdls_disc.c @@ -5029,9 +5029,12 @@ void fnic_fdls_link_down(struct fnic_iport_s *iport) fdls_delete_tport(iport, tport); } - if ((fnic_fdmi_support == 1) && (iport->fabric.fdmi_pending > 0)) { - timer_delete_sync(&iport->fabric.fdmi_timer); - iport->fabric.fdmi_pending = 0; + if (fnic_fdmi_support == 1) { + if (iport->fabric.fdmi_pending > 0) { + timer_delete_sync(&iport->fabric.fdmi_timer); + iport->fabric.fdmi_pending = 0; + } + iport->flags &= ~FNIC_FDMI_ACTIVE; } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, diff --git a/drivers/scsi/fnic/fnic.h b/drivers/scsi/fnic/fnic.h index 86e293ce530d..c2fdc6553e62 100644 --- a/drivers/scsi/fnic/fnic.h +++ b/drivers/scsi/fnic/fnic.h @@ -30,7 +30,7 @@ #define DRV_NAME "fnic" #define DRV_DESCRIPTION "Cisco FCoE HBA Driver" -#define DRV_VERSION "1.8.0.1" +#define DRV_VERSION "1.8.0.2" #define PFX DRV_NAME ": " #define DFX DRV_NAME "%d: " From 9b9b8594654a79e3d4166356fd86cd5397477b24 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:30 -0700 Subject: [PATCH 193/284] scsi: fnic: Add and improve logs in FDMI and FDMI ABTS paths Add logs in FDMI and FDMI ABTS paths. Modify log text in these paths. Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Reviewed-by: Arun Easi Reviewed-by: John Meneghini Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/r/20250618003431.6314-3-kartilak@cisco.com Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fdls_disc.c | 65 +++++++++++++++++++++++++++++++---- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/fnic/fdls_disc.c b/drivers/scsi/fnic/fdls_disc.c index fa9cf0b37d72..ae37f85f618b 100644 --- a/drivers/scsi/fnic/fdls_disc.c +++ b/drivers/scsi/fnic/fdls_disc.c @@ -791,6 +791,7 @@ static uint8_t *fdls_alloc_init_fdmi_abts_frame(struct fnic_iport_s *iport, static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) { uint8_t *frame; + struct fnic *fnic = iport->fnic; unsigned long fdmi_tov; uint16_t frame_size = FNIC_ETH_FCOE_HDRS_OFFSET + sizeof(struct fc_frame_header); @@ -801,6 +802,9 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) if (frame == NULL) return; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: FDLS send FDMI PLOGI abts. iport->fabric.state: %d oxid: 0x%x", + iport->fcid, iport->fabric.state, iport->active_oxid_fdmi_plogi); fnic_send_fcoe_frame(iport, frame, frame_size); } else { if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) { @@ -809,6 +813,9 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) if (frame == NULL) return; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: FDLS send FDMI RHBA abts. iport->fabric.state: %d oxid: 0x%x", + iport->fcid, iport->fabric.state, iport->active_oxid_fdmi_rhba); fnic_send_fcoe_frame(iport, frame, frame_size); } if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) { @@ -821,6 +828,9 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) return; } + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: FDLS send FDMI RPA abts. iport->fabric.state: %d oxid: 0x%x", + iport->fcid, iport->fabric.state, iport->active_oxid_fdmi_rpa); fnic_send_fcoe_frame(iport, frame, frame_size); } } @@ -829,6 +839,10 @@ static void fdls_send_fdmi_abts(struct fnic_iport_s *iport) fdmi_tov = jiffies + msecs_to_jiffies(2 * iport->e_d_tov); mod_timer(&iport->fabric.fdmi_timer, round_jiffies(fdmi_tov)); iport->fabric.fdmi_pending |= FDLS_FDMI_ABORT_PENDING; + + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); } static void fdls_send_fabric_flogi(struct fnic_iport_s *iport) @@ -2294,7 +2308,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t) spin_lock_irqsave(&fnic->fnic_lock, flags); FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); if (!iport->fabric.fdmi_pending) { /* timer expired after fdmi responses received. */ @@ -2302,7 +2316,7 @@ void fdls_fdmi_timer_callback(struct timer_list *t) return; } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); /* if not abort pending, send an abort */ if (!(iport->fabric.fdmi_pending & FDLS_FDMI_ABORT_PENDING)) { @@ -2311,26 +2325,37 @@ void fdls_fdmi_timer_callback(struct timer_list *t) return; } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); /* ABTS pending for an active fdmi request that is pending. * That means FDMI ABTS timed out * Schedule to free the OXID after 2*r_a_tov and proceed */ if (iport->fabric.fdmi_pending & FDLS_FDMI_PLOGI_PENDING) { + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "FDMI PLOGI ABTS timed out. Schedule oxid free: 0x%x\n", + iport->active_oxid_fdmi_plogi); fdls_schedule_oxid_free(iport, &iport->active_oxid_fdmi_plogi); } else { - if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) + if (iport->fabric.fdmi_pending & FDLS_FDMI_REG_HBA_PENDING) { + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "FDMI RHBA ABTS timed out. Schedule oxid free: 0x%x\n", + iport->active_oxid_fdmi_rhba); fdls_schedule_oxid_free(iport, &iport->active_oxid_fdmi_rhba); - if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) + } + if (iport->fabric.fdmi_pending & FDLS_FDMI_RPA_PENDING) { + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "FDMI RPA ABTS timed out. Schedule oxid free: 0x%x\n", + iport->active_oxid_fdmi_rpa); fdls_schedule_oxid_free(iport, &iport->active_oxid_fdmi_rpa); + } } FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); fdls_fdmi_retry_plogi(iport); FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, - "fdmi timer callback : 0x%x\n", iport->fabric.fdmi_pending); + "iport->fabric.fdmi_pending: 0x%x\n", iport->fabric.fdmi_pending); spin_unlock_irqrestore(&fnic->fnic_lock, flags); } @@ -3745,12 +3770,26 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, switch (FNIC_FRAME_TYPE(oxid)) { case FNIC_FRAME_TYPE_FDMI_PLOGI: + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Received FDMI PLOGI ABTS rsp with oxid: 0x%x", oxid); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_plogi); iport->fabric.fdmi_pending &= ~FDLS_FDMI_PLOGI_PENDING; iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); break; case FNIC_FRAME_TYPE_FDMI_RHBA: + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Received FDMI RHBA ABTS rsp with oxid: 0x%x", oxid); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); + iport->fabric.fdmi_pending &= ~FDLS_FDMI_REG_HBA_PENDING; /* If RPA is still pending, don't turn off ABORT PENDING. @@ -3761,8 +3800,17 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rhba); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); break; case FNIC_FRAME_TYPE_FDMI_RPA: + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "Received FDMI RPA ABTS rsp with oxid: 0x%x", oxid); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); + iport->fabric.fdmi_pending &= ~FDLS_FDMI_RPA_PENDING; /* If RHBA is still pending, don't turn off ABORT PENDING. @@ -3773,6 +3821,9 @@ static void fdls_process_fdmi_abts_rsp(struct fnic_iport_s *iport, iport->fabric.fdmi_pending &= ~FDLS_FDMI_ABORT_PENDING; fdls_free_oxid(iport, oxid, &iport->active_oxid_fdmi_rpa); + FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + "0x%x: iport->fabric.fdmi_pending: 0x%x", + iport->fcid, iport->fabric.fdmi_pending); break; default: FNIC_FCS_DBG(KERN_INFO, fnic->host, fnic->fnic_num, From 18b5cb6f1fdda4454f55a31f7c78d94da62be495 Mon Sep 17 00:00:00 2001 From: Karan Tilak Kumar Date: Tue, 17 Jun 2025 17:34:31 -0700 Subject: [PATCH 194/284] scsi: fnic: Set appropriate logging level for log message Replace KERN_INFO with KERN_DEBUG for a log message. Reviewed-by: Sesidhar Baddela Reviewed-by: Arulprabhu Ponnusamy Reviewed-by: Gian Carlo Boffa Reviewed-by: Arun Easi Signed-off-by: Karan Tilak Kumar Link: https://lore.kernel.org/stable/20250612002212.4144-1-kartilak%40cisco.com Link: https://lore.kernel.org/r/20250618003431.6314-4-kartilak@cisco.com Reviewed-by: John Meneghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c index 7133b254cbe4..75b29a018d1f 100644 --- a/drivers/scsi/fnic/fnic_scsi.c +++ b/drivers/scsi/fnic/fnic_scsi.c @@ -1046,7 +1046,7 @@ static void fnic_fcpio_icmnd_cmpl_handler(struct fnic *fnic, unsigned int cq_ind if (icmnd_cmpl->scsi_status == SAM_STAT_TASK_SET_FULL) atomic64_inc(&fnic_stats->misc_stats.queue_fulls); - FNIC_SCSI_DBG(KERN_INFO, fnic->host, fnic->fnic_num, + FNIC_SCSI_DBG(KERN_DEBUG, fnic->host, fnic->fnic_num, "xfer_len: %llu", xfer_len); break; From 85d6fbc47c3087c5d048e6734926b0c36af34fe9 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Wed, 18 Jun 2025 08:57:04 +0200 Subject: [PATCH 195/284] scsi: fnic: Fix missing DMA mapping error in fnic_send_frame() dma_map_XXX() can fail and should be tested for errors with dma_mapping_error(). Fixes: a63e78eb2b0f ("scsi: fnic: Add support for fabric based solicited requests and responses") Signed-off-by: Thomas Fourier Link: https://lore.kernel.org/r/20250618065715.14740-2-fourier.thomas@gmail.com Reviewed-by: Karan Tilak Kumar Reviewed-by: John Menghini Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_fcs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/fnic/fnic_fcs.c b/drivers/scsi/fnic/fnic_fcs.c index 1e8cd64f9a5c..103ab6f1f7cd 100644 --- a/drivers/scsi/fnic/fnic_fcs.c +++ b/drivers/scsi/fnic/fnic_fcs.c @@ -636,6 +636,8 @@ static int fnic_send_frame(struct fnic *fnic, void *frame, int frame_len) unsigned long flags; pa = dma_map_single(&fnic->pdev->dev, frame, frame_len, DMA_TO_DEVICE); + if (dma_mapping_error(&fnic->pdev->dev, pa)) + return -ENOMEM; if ((fnic_fc_trace_set_data(fnic->fnic_num, FNIC_FC_SEND | 0x80, (char *) frame, From a05dd8ae5cbb1cb45f349922cfea4f548a5e5d6f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 10 Jun 2025 01:17:51 +0800 Subject: [PATCH 196/284] mm/shmem, swap: fix softlockup with mTHP swapin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Following softlockup can be easily reproduced on my test machine with: echo always > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled swapon /dev/zram0 # zram0 is a 48G swap device mkdir -p /sys/fs/cgroup/memory/test echo 1G > /sys/fs/cgroup/test/memory.max echo $BASHPID > /sys/fs/cgroup/test/cgroup.procs while true; do dd if=/dev/zero of=/tmp/test.img bs=1M count=5120 cat /tmp/test.img > /dev/null rm /tmp/test.img done Then after a while: watchdog: BUG: soft lockup - CPU#0 stuck for 763s! [cat:5787] Modules linked in: zram virtiofs CPU: 0 UID: 0 PID: 5787 Comm: cat Kdump: loaded Tainted: G L 6.15.0.orig-gf3021d9246bc-dirty #118 PREEMPT(voluntary)· Tainted: [L]=SOFTLOCKUP Hardware name: Red Hat KVM/RHEL-AV, BIOS 0.0.0 02/06/2015 RIP: 0010:mpol_shared_policy_lookup+0xd/0x70 Code: e9 b8 b4 ff ff 31 c0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 66 0f 1f 00 0f 1f 44 00 00 41 54 55 53 <48> 8b 1f 48 85 db 74 41 4c 8d 67 08 48 89 fb 48 89 f5 4c 89 e7 e8 RSP: 0018:ffffc90002b1fc28 EFLAGS: 00000202 RAX: 00000000001c20ca RBX: 0000000000724e1e RCX: 0000000000000001 RDX: ffff888118e214c8 RSI: 0000000000057d42 RDI: ffff888118e21518 RBP: 000000000002bec8 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000bf4 R11: 0000000000000000 R12: 0000000000000001 R13: 00000000001c20ca R14: 00000000001c20ca R15: 0000000000000000 FS: 00007f03f995c740(0000) GS:ffff88a07ad9a000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f03f98f1000 CR3: 0000000144626004 CR4: 0000000000770eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: shmem_alloc_folio+0x31/0xc0 shmem_swapin_folio+0x309/0xcf0 ? filemap_get_entry+0x117/0x1e0 ? xas_load+0xd/0xb0 ? filemap_get_entry+0x101/0x1e0 shmem_get_folio_gfp+0x2ed/0x5b0 shmem_file_read_iter+0x7f/0x2e0 vfs_read+0x252/0x330 ksys_read+0x68/0xf0 do_syscall_64+0x4c/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f03f9a46991 Code: 00 48 8b 15 81 14 10 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd e8 20 ad 01 00 f3 0f 1e fa 80 3d 35 97 10 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007fff3c52bd28 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000040000 RCX: 00007f03f9a46991 RDX: 0000000000040000 RSI: 00007f03f98ba000 RDI: 0000000000000003 RBP: 00007fff3c52bd50 R08: 0000000000000000 R09: 00007f03f9b9a380 R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000040000 R13: 00007f03f98ba000 R14: 0000000000000003 R15: 0000000000000000 The reason is simple, readahead brought some order 0 folio in swap cache, and the swapin mTHP folio being allocated is in conflict with it, so swapcache_prepare fails and causes shmem_swap_alloc_folio to return -EEXIST, and shmem simply retries again and again causing this loop. Fix it by applying a similar fix for anon mTHP swapin. The performance change is very slight, time of swapin 10g zero folios with shmem (test for 12 times): Before: 2.47s After: 2.48s [kasong@tencent.com: add comment] Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250610181645.45922-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250609171751.36305-1-ryncsn@gmail.com Fixes: 1dd44c0af4fa ("mm: shmem: skip swapcache for swapin of synchronous swap device") Signed-off-by: Kairui Song Reviewed-by: Barry Song Acked-by: Nhat Pham Reviewed-by: Baolin Wang Cc: Baoquan He Cc: Chris Li Cc: Hugh Dickins Cc: Kemeng Shi Cc: Usama Arif Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 20 -------------------- mm/shmem.c | 6 +++++- mm/swap.h | 23 +++++++++++++++++++++++ 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 8eba595056fe..b0cda5aab398 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4315,26 +4315,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) -{ - struct swap_info_struct *si = swp_swap_info(entry); - pgoff_t offset = swp_offset(entry); - int i; - - /* - * While allocating a large folio and doing swap_read_folio, which is - * the case the being faulted pte doesn't have swapcache. We need to - * ensure all PTEs have no cache as well, otherwise, we might go to - * swap devices while the content is in swapcache. - */ - for (i = 0; i < max_nr; i++) { - if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) - return i; - } - - return i; -} - /* * Check if the PTEs within a range are contiguous swap entries * and have consistent swapcache, zeromap. diff --git a/mm/shmem.c b/mm/shmem.c index 0c5fb4ffa03a..3a5a65b1f41a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2259,6 +2259,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio = swap_cache_get_folio(swap, NULL, 0); order = xa_get_order(&mapping->i_pages, index); if (!folio) { + int nr_pages = 1 << order; bool fallback_order0 = false; /* Or update major stats only when swapin succeeds?? */ @@ -2272,9 +2273,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, * If uffd is active for the vma, we need per-page fault * fidelity to maintain the uffd semantics, then fallback * to swapin order-0 folio, as well as for zswap case. + * Any existing sub folio in the swap cache also blocks + * mTHP swapin. */ if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) || - !zswap_never_enabled())) + !zswap_never_enabled() || + non_swapcache_batch(swap, nr_pages) != nr_pages)) fallback_order0 = true; /* Skip swapcache for synchronous device. */ diff --git a/mm/swap.h b/mm/swap.h index 2269eb9df0af..9096082a915e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return find_next_bit(sis->zeromap, end, start) - start; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + struct swap_info_struct *si = swp_swap_info(entry); + pgoff_t offset = swp_offset(entry); + int i; + + /* + * While allocating a large folio and doing mTHP swapin, we need to + * ensure all entries are not cached, otherwise, the mTHP folio will + * be in conflict with the folio in swap cache. + */ + for (i = 0; i < max_nr; i++) { + if ((si->swap_map[offset + i] & SWAP_HAS_CACHE)) + return i; + } + + return i; +} + #else /* CONFIG_SWAP */ struct swap_iocb; static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) @@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, return 0; } +static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) +{ + return 0; +} #endif /* CONFIG_SWAP */ /** From 965f87700adbcc6d72430f524d88135027f5bba3 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Mon, 9 Jun 2025 12:06:07 +0000 Subject: [PATCH 197/284] selftests/mm: increase timeout from 180 to 900 seconds The mm selftests are timing out with the current 180-second limit. Testing shows that run_vmtests.sh takes approximately 11 minutes (664 seconds) to complete. Increase the timeout to 900 seconds (15 minutes) to provide sufficient buffer for the tests to complete successfully. Link: https://lkml.kernel.org/r/20250609120606.73145-2-shivankg@amd.com Signed-off-by: Shivank Garg Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/settings | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings index a953c96aa16e..e2206265f67c 100644 --- a/tools/testing/selftests/mm/settings +++ b/tools/testing/selftests/mm/settings @@ -1 +1 @@ -timeout=180 +timeout=900 From 517f496e1e61bd169d585dab4dd77e7147506322 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 11 Jun 2025 15:13:14 +0200 Subject: [PATCH 198/284] mm/gup: revert "mm: gup: fix infinite loop within __get_longterm_locked" After commit 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked") we are able to longterm pin folios that are not supposed to get longterm pinned, simply because they temporarily have the LRU flag cleared (esp. temporarily isolated). For example, two __get_longterm_locked() callers can race, or __get_longterm_locked() can race with anything else that temporarily isolates folios. The introducing commit mentions the use case of a driver that uses vm_ops->fault to insert pages allocated through cma_alloc() into the page tables, assuming they can later get longterm pinned. These pages/ folios would never have the LRU flag set and consequently cannot get isolated. There is no known in-tree user making use of that so far, fortunately. To handle that in the future -- and avoid retrying forever to isolate/migrate them -- we will need a different mechanism for the CMA area *owner* to indicate that it actually already allocated the page and is fine with longterm pinning it. The LRU flag is not suitable for that. Probably we can lookup the relevant CMA area and query the bitmap; we only have have to care about some races, probably. If already allocated, we could just allow longterm pinning) Anyhow, let's fix the "must not be longterm pinned" problem first by reverting the original commit. Link: https://lkml.kernel.org/r/20250611131314.594529-1-david@redhat.com Fixes: 1aaf8c122918 ("mm: gup: fix infinite loop within __get_longterm_locked") Signed-off-by: David Hildenbrand Closes: https://lore.kernel.org/all/20250522092755.GA3277597@tiffany/ Reported-by: Hyesoo Yu Reviewed-by: John Hubbard Cc: Jason Gunthorpe Cc: Peter Xu Cc: Zhaoyang Huang Cc: Aijun Sun Cc: Alistair Popple Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index e065a49842a8..3c39cbbeebef 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2303,13 +2303,13 @@ static void pofs_unpin(struct pages_or_folios *pofs) /* * Returns the number of collected folios. Return value is always >= 0. */ -static void collect_longterm_unpinnable_folios( +static unsigned long collect_longterm_unpinnable_folios( struct list_head *movable_folio_list, struct pages_or_folios *pofs) { + unsigned long i, collected = 0; struct folio *prev_folio = NULL; bool drain_allow = true; - unsigned long i; for (i = 0; i < pofs->nr_entries; i++) { struct folio *folio = pofs_get_folio(pofs, i); @@ -2321,6 +2321,8 @@ static void collect_longterm_unpinnable_folios( if (folio_is_longterm_pinnable(folio)) continue; + collected++; + if (folio_is_device_coherent(folio)) continue; @@ -2342,6 +2344,8 @@ static void collect_longterm_unpinnable_folios( NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } + + return collected; } /* @@ -2418,9 +2422,11 @@ static long check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs) { LIST_HEAD(movable_folio_list); + unsigned long collected; - collect_longterm_unpinnable_folios(&movable_folio_list, pofs); - if (list_empty(&movable_folio_list)) + collected = collect_longterm_unpinnable_folios(&movable_folio_list, + pofs); + if (!collected) return 0; return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs); From 0ea148a799198518d8ebab63ddd0bb6114a103bc Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 4 Jun 2025 23:10:38 +0800 Subject: [PATCH 199/284] mm: userfaultfd: fix race of userfaultfd_move and swap cache This commit fixes two kinds of races, they may have different results: Barry reported a BUG_ON in commit c50f8e6053b0, we may see the same BUG_ON if the filemap lookup returned NULL and folio is added to swap cache after that. If another kind of race is triggered (folio changed after lookup) we may see RSS counter is corrupted: [ 406.893936] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0 type:MM_ANONPAGES val:-1 [ 406.894071] BUG: Bad rss-counter state mm:ffff0000c5a9ddc0 type:MM_SHMEMPAGES val:1 Because the folio is being accounted to the wrong VMA. I'm not sure if there will be any data corruption though, seems no. The issues above are critical already. On seeing a swap entry PTE, userfaultfd_move does a lockless swap cache lookup, and tries to move the found folio to the faulting vma. Currently, it relies on checking the PTE value to ensure that the moved folio still belongs to the src swap entry and that no new folio has been added to the swap cache, which turns out to be unreliable. While working and reviewing the swap table series with Barry, following existing races are observed and reproduced [1]: In the example below, move_pages_pte is moving src_pte to dst_pte, where src_pte is a swap entry PTE holding swap entry S1, and S1 is not in the swap cache: CPU1 CPU2 userfaultfd_move move_pages_pte() entry = pte_to_swp_entry(orig_src_pte); // Here it got entry = S1 ... < interrupted> ... // folio A is a new allocated folio // and get installed into src_pte // src_pte now points to folio A, S1 // has swap count == 0, it can be freed // by folio_swap_swap or swap // allocator's reclaim. // folio B is a folio in another VMA. // S1 is freed, folio B can use it // for swap out with no problem. ... folio = filemap_get_folio(S1) // Got folio B here !!! ... < interrupted again> ... // Now S1 is free to be used again. // Now src_pte is a swap entry PTE // holding S1 again. folio_trylock(folio) move_swap_pte double_pt_lock is_pte_pages_stable // Check passed because src_pte == S1 folio_move_anon_rmap(...) // Moved invalid folio B here !!! The race window is very short and requires multiple collisions of multiple rare events, so it's very unlikely to happen, but with a deliberately constructed reproducer and increased time window, it can be reproduced easily. This can be fixed by checking if the folio returned by filemap is the valid swap cache folio after acquiring the folio lock. Another similar race is possible: filemap_get_folio may return NULL, but folio (A) could be swapped in and then swapped out again using the same swap entry after the lookup. In such a case, folio (A) may remain in the swap cache, so it must be moved too: CPU1 CPU2 userfaultfd_move move_pages_pte() entry = pte_to_swp_entry(orig_src_pte); // Here it got entry = S1, and S1 is not in swap cache folio = filemap_get_folio(S1) // Got NULL ... < interrupted again> ... move_swap_pte double_pt_lock is_pte_pages_stable // Check passed because src_pte == S1 folio_move_anon_rmap(...) // folio A is ignored !!! Fix this by checking the swap cache again after acquiring the src_pte lock. And to avoid the filemap overhead, we check swap_map directly [2]. The SWP_SYNCHRONOUS_IO path does make the problem more complex, but so far we don't need to worry about that, since folios can only be exposed to the swap cache in the swap out path, and this is covered in this patch by checking the swap cache again after acquiring the src_pte lock. Testing with a simple C program that allocates and moves several GB of memory did not show any observable performance change. Link: https://lkml.kernel.org/r/20250604151038.21968-1-ryncsn@gmail.com Fixes: adef440691ba ("userfaultfd: UFFDIO_MOVE uABI") Signed-off-by: Kairui Song Closes: https://lore.kernel.org/linux-mm/CAMgjq7B1K=6OOrK2OUZ0-tqCzi+EJt+2_K97TPGoSt=9+JwP7Q@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/CAGsJ_4yJhJBo16XhiC-nUzSheyX-V3-nFE+tAi=8Y560K8eT=A@mail.gmail.com/ [2] Reviewed-by: Lokesh Gidra Acked-by: Peter Xu Reviewed-by: Suren Baghdasaryan Reviewed-by: Barry Song Reviewed-by: Chris Li Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Kairui Song Cc: Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bc473ad21202..8253978ee0fb 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, pte_t orig_dst_pte, pte_t orig_src_pte, pmd_t *dst_pmd, pmd_t dst_pmdval, spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) + struct folio *src_folio, + struct swap_info_struct *si, swp_entry_t entry) { + /* + * Check if the folio still belongs to the target swap entry after + * acquiring the lock. Folio can be freed in the swap cache while + * not locked. + */ + if (src_folio && unlikely(!folio_test_swapcache(src_folio) || + entry.val != src_folio->swap.val)) + return -EAGAIN; + double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, if (src_folio) { folio_move_anon_rmap(src_folio, dst_vma); src_folio->index = linear_page_index(dst_vma, dst_addr); + } else { + /* + * Check if the swap entry is cached after acquiring the src_pte + * lock. Otherwise, we might miss a newly loaded swap cache folio. + * + * Check swap_map directly to minimize overhead, READ_ONCE is sufficient. + * We are trying to catch newly added swap cache, the only possible case is + * when a folio is swapped in and out again staying in swap cache, using the + * same entry before the PTE check above. The PTL is acquired and released + * twice, each time after updating the swap_map's flag. So holding + * the PTL here ensures we see the updated value. False positive is possible, + * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the + * cache, or during the tiny synchronization window between swap cache and + * swap_map, but it will be gone very quickly, worst result is retry jitters. + */ + if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) { + double_pt_unlock(dst_ptl, src_ptl); + return -EAGAIN; + } } orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); @@ -1412,7 +1441,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, } err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, - dst_ptl, src_ptl, src_folio); + dst_ptl, src_ptl, src_folio, si, entry); } out: From 417d145c2e71ad7362200ede6e8afdfc6e56c4fb Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Fri, 13 Jun 2025 15:19:14 +0200 Subject: [PATCH 200/284] MAINTAINERS: add linux-mm@ list to Kexec Handover Along with kexec, KHO also has parts dealing with memory management, like page/folio initialization, memblock, and preserving/unpreserving memory for next kernel. Copy linux-mm@ to KHO patches so the right set of eyes can look at changes to those parts. Link: https://lkml.kernel.org/r/20250613131917.4488-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0c1d245bf7b8..9a7fdc52cf9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13347,6 +13347,7 @@ M: Alexander Graf M: Mike Rapoport M: Changyuan Lyu L: kexec@lists.infradead.org +L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/kho.rst F: Documentation/core-api/kho/* From 12b9a2c05d1b474518b0f5fac4a50b7f93b16930 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Thu, 5 Jun 2025 19:11:41 +0200 Subject: [PATCH 201/284] kho: initialize tail pages for higher order folios properly Currently, when restoring higher order folios, kho_restore_folio() only calls prep_compound_page() on all the pages. That is not enough to properly initialize the folios. The managed page count does not get updated, the reserved flag does not get dropped, and page count does not get initialized properly. Restoring a higher order folio with it results in the following BUG with CONFIG_DEBUG_VM when attempting to free the folio: BUG: Bad page state in process test pfn:104e2b page: refcount:1 mapcount:0 mapping:0000000000000000 index:0xffffffffffffffff pfn:0x104e2b flags: 0x2fffff80000000(node=0|zone=2|lastcpupid=0x1fffff) raw: 002fffff80000000 0000000000000000 00000000ffffffff 0000000000000000 raw: ffffffffffffffff 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: nonzero _refcount [...] Call Trace: dump_stack_lvl+0x4b/0x70 bad_page.cold+0x97/0xb2 __free_frozen_pages+0x616/0x850 [...] Combine the path for 0-order and higher order folios, initialize the tail pages with a count of zero, and call adjust_managed_page_count() to account for all the pages instead of just missing them. In addition, since all the KHO-preserved pages get marked with MEMBLOCK_RSRV_NOINIT by deserialize_bitmap(), the reserved flag is not actually set (as can also be seen from the flags of the dumped page in the logs above). So drop the ClearPageReserved() calls. [ptyadav@amazon.de: declare i in the loop instead of at the top] Link: https://lkml.kernel.org/r/20250613125916.39272-1-pratyush@kernel.org Link: https://lkml.kernel.org/r/20250605171143.76963-1-pratyush@kernel.org Fixes: fc33e4b44b27 ("kexec: enable KHO support for memory preservation") Signed-off-by: Pratyush Yadav Reviewed-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 69b953551677..5a21dbe17950 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -164,11 +164,21 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } /* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page) +static void kho_restore_page(struct page *page, unsigned int order) { - ClearPageReserved(page); - init_page_count(page); - adjust_managed_page_count(page, 1); + unsigned int nr_pages = (1 << order); + + /* Head page gets refcount of 1. */ + set_page_count(page, 1); + + /* For higher order folios, tail pages get a page count of zero. */ + for (unsigned int i = 1; i < nr_pages; i++) + set_page_count(page + i, 0); + + if (order > 0) + prep_compound_page(page, order); + + adjust_managed_page_count(page, nr_pages); } /** @@ -186,15 +196,10 @@ struct folio *kho_restore_folio(phys_addr_t phys) return NULL; order = page->private; - if (order) { - if (order > MAX_PAGE_ORDER) - return NULL; - - prep_compound_page(page, order); - } else { - kho_restore_page(page); - } + if (order > MAX_PAGE_ORDER) + return NULL; + kho_restore_page(page, order); return page_folio(page); } EXPORT_SYMBOL_GPL(kho_restore_folio); From 223731cd63004cb07edfc6257d53565995c00cbb Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 13 Jun 2025 09:19:12 +0530 Subject: [PATCH 202/284] selftests/mm: add configs to fix testcase failure If CONFIG_UPROBES is not set, a merge subtest fails: Failure log: 7151 12:46:54.627936 # # # RUN merge.handle_uprobe_upon_merged_vma ... 7152 12:46:54.639014 # # f /sys/bus/event_source/devices/uprobe/type 7153 12:46:54.639306 # # fopen: No such file or directory 7154 12:46:54.650451 # # # merge.c:473:handle_uprobe_upon_merged_vma:Expected read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) (1) == 0 (0) 7155 12:46:54.650730 # # # handle_uprobe_upon_merged_vma: Test terminated by assertion 7156 12:46:54.661750 # # # FAIL merge.handle_uprobe_upon_merged_vma 7157 12:46:54.662030 # # not ok 8 merge.handle_uprobe_upon_merged_vma CONFIG_UPROBES is enabled by CONFIG_UPROBE_EVENTS, which gets enabled by CONFIG_FTRACE. Therefore add these configs to selftests/mm/config so that CI systems can include this config in the kernel build. To be completely safe, add CONFIG_PROFILING too, to enable the dependency chain PROFILING -> PERF_EVENTS -> UPROBE_EVENTS -> UPROBES. Link: https://lkml.kernel.org/r/20250613034912.53791-1-dev.jain@arm.com Fixes: efe99fabeb11 ("selftests/mm: add test about uprobe pte be orphan during vma merge") Signed-off-by: Dev Jain Reported-by: Aishwarya Closes: https://lore.kernel.org/all/20250610103729.72440-1-aishwarya.tcv@arm.com/ Tested-by: Aishwarya TCV Tested-by : Donet Tom Reviewed-by: Lorenzo Stoakes Reviewed-by: Anshuman Khandual Reviewed-by: Mark Brown Reviewed-by: Donet Tom Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Liam Howlett Cc: Pu Lehui Cc: Ryan Roberts Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config index a28baa536332..deba93379c80 100644 --- a/tools/testing/selftests/mm/config +++ b/tools/testing/selftests/mm/config @@ -8,3 +8,6 @@ CONFIG_GUP_TEST=y CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_MEM_SOFT_DIRTY=y CONFIG_ANON_VMA_NAME=y +CONFIG_FTRACE=y +CONFIG_PROFILING=y +CONFIG_UPROBES=y From 845f1f2d69f3f49b3d8c142265952c8257e3368c Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 15 Jun 2025 04:23:51 +0800 Subject: [PATCH 203/284] Revert "bcache: update min_heap_callbacks to use default builtin swap" Patch series "bcache: Revert min_heap migration due to performance regression". This patch series reverts the migration of bcache from its original heap implementation to the generic min_heap library. While the original change aimed to simplify the code and improve maintainability, it introduced a severe performance regression in real-world scenarios. As reported by Robert, systems using bcache now suffer from periodic latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. This degrades bcache's value as a low-latency caching layer, and leads to frequent timeouts and application stalls in production environments. The primary cause of this regression is the behavior of the generic min_heap implementation's bottom-up sift_down, which performs up to 2 * log2(n) comparisons when many elements are equal. The original top-down variant used by bcache only required O(1) comparisons in such cases. The issue was further exacerbated by commit 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions"), which introduced non-inlined versions of the min_heap API, adding function call overhead to a performance-critical hot path. This patch (of 3): This reverts commit 3d8a9a1c35227c3f1b0bd132c9f0a80dbda07b65. Although removing the custom swap function simplified the code, this change is part of a broader migration to the generic min_heap API that introduced significant performance regressions in bcache. As reported by Robert, bcache now suffers from latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. These regressions degrade bcache's effectiveness as a low-latency cache layer and lead to frequent timeouts and application stalls in production environments. This revert is part of a series of changes to restore previous performance by undoing the min_heap transition. Link: https://lkml.kernel.org/r/20250614202353.1632957-1-visitorckw@gmail.com Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com Link: https://lkml.kernel.org/r/20250614202353.1632957-2-visitorckw@gmail.com Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap") Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions") Signed-off-by: Kuan-Wei Chiu Reported-by: Robert Pang Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com Acked-by: Coly Li Cc: Ching-Chun (Jim) Huang Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- drivers/md/bcache/alloc.c | 11 +++++++++-- drivers/md/bcache/bset.c | 14 +++++++++++--- drivers/md/bcache/extents.c | 10 +++++++++- drivers/md/bcache/movinggc.c | 10 +++++++++- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8998e61efa40..da50f6661bae 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -189,16 +189,23 @@ static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); } +static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) +{ + struct bucket **lhs = l, **rhs = r; + + swap(*lhs, *rhs); +} + static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; const struct min_heap_callbacks bucket_max_cmp_callback = { .less = new_bucket_max_cmp, - .swp = NULL, + .swp = new_bucket_swap, }; const struct min_heap_callbacks bucket_min_cmp_callback = { .less = new_bucket_min_cmp, - .swp = NULL, + .swp = new_bucket_swap, }; ca->heap.nr = 0; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 68258a16e125..bd97d8626887 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -1093,6 +1093,14 @@ static inline bool new_btree_iter_cmp(const void *l, const void *r, void __alway return bkey_cmp(_l->k, _r->k) <= 0; } +static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) +{ + struct btree_iter_set *_iter1 = iter1; + struct btree_iter_set *_iter2 = iter2; + + swap(*_iter1, *_iter2); +} + static inline bool btree_iter_end(struct btree_iter *iter) { return !iter->heap.nr; @@ -1103,7 +1111,7 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, { const struct min_heap_callbacks callbacks = { .less = new_btree_iter_cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; if (k != end) @@ -1149,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *ret = NULL; const struct min_heap_callbacks callbacks = { .less = cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; if (!btree_iter_end(iter)) { @@ -1223,7 +1231,7 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, : bch_ptr_invalid; const struct min_heap_callbacks callbacks = { .less = b->ops->sort_cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; /* Heapify the iterator, using our comparison function */ diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 4b84fda1530a..a7221e5dbe81 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -266,12 +266,20 @@ static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_ return !(c ? c > 0 : _l->k < _r->k); } +static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) +{ + struct btree_iter_set *_iter1 = iter1; + struct btree_iter_set *_iter2 = iter2; + + swap(*_iter1, *_iter2); +} + static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { const struct min_heap_callbacks callbacks = { .less = new_bch_extent_sort_cmp, - .swp = NULL, + .swp = new_btree_iter_swap, }; while (iter->heap.nr > 1) { struct btree_iter_set *top = iter->heap.data, *i = top + 1; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 45ca134cbf02..d6c73dd8eb2b 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -190,6 +190,14 @@ static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *a return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); } +static void new_bucket_swap(void *l, void *r, void __always_unused *args) +{ + struct bucket **_l = l; + struct bucket **_r = r; + + swap(*_l, *_r); +} + static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; @@ -204,7 +212,7 @@ void bch_moving_gc(struct cache_set *c) unsigned long sectors_to_move, reserve_sectors; const struct min_heap_callbacks callbacks = { .less = new_bucket_cmp, - .swp = NULL, + .swp = new_bucket_swap, }; if (!c->copy_gc_enabled) From 48fd7ebe00c1cdc782b42576548b25185902f64c Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 15 Jun 2025 04:23:52 +0800 Subject: [PATCH 204/284] Revert "bcache: remove heap-related macros and switch to generic min_heap" This reverts commit 866898efbb25bb44fd42848318e46db9e785973a. The generic bottom-up min_heap implementation causes performance regression in invalidate_buckets_lru(), a hot path in bcache. Before the cache is fully populated, new_bucket_prio() often returns zero, leading to many equal comparisons. In such cases, bottom-up sift_down performs up to 2 * log2(n) comparisons, while the original top-down approach completes with just O() comparisons, resulting in a measurable performance gap. The performance degradation is further worsened by the non-inlined min_heap API functions introduced in commit 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions"), adding function call overhead to this critical path. As reported by Robert, bcache now suffers from latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. These regressions degrade bcache's effectiveness as a low-latency cache layer and lead to frequent timeouts and application stalls in production environments. This revert aims to restore bcache's original low-latency behavior. Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com Link: https://lkml.kernel.org/r/20250614202353.1632957-3-visitorckw@gmail.com Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap") Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions") Signed-off-by: Kuan-Wei Chiu Reported-by: Robert Pang Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com Acked-by: Coly Li Cc: Ching-Chun (Jim) Huang Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- drivers/md/bcache/alloc.c | 64 +++++------------- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/bset.c | 124 ++++++++++++---------------------- drivers/md/bcache/bset.h | 40 ++++++----- drivers/md/bcache/btree.c | 69 ++++++++----------- drivers/md/bcache/extents.c | 51 +++++--------- drivers/md/bcache/movinggc.c | 41 +++-------- drivers/md/bcache/super.c | 3 +- drivers/md/bcache/sysfs.c | 4 +- drivers/md/bcache/util.h | 67 +++++++++++++++++- drivers/md/bcache/writeback.c | 13 ++-- 11 files changed, 216 insertions(+), 262 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index da50f6661bae..48ce750bf70a 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -164,68 +164,40 @@ static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b) * prio is worth 1/8th of what INITIAL_PRIO is worth. */ -static inline unsigned int new_bucket_prio(struct cache *ca, struct bucket *b) -{ - unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; +#define bucket_prio(b) \ +({ \ + unsigned int min_prio = (INITIAL_PRIO - ca->set->min_prio) / 8; \ + \ + (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); \ +}) - return (b->prio - ca->set->min_prio + min_prio) * GC_SECTORS_USED(b); -} - -static inline bool new_bucket_max_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; - - return new_bucket_prio(ca, *lhs) > new_bucket_prio(ca, *rhs); -} - -static inline bool new_bucket_min_cmp(const void *l, const void *r, void *args) -{ - struct bucket **lhs = (struct bucket **)l; - struct bucket **rhs = (struct bucket **)r; - struct cache *ca = args; - - return new_bucket_prio(ca, *lhs) < new_bucket_prio(ca, *rhs); -} - -static inline void new_bucket_swap(void *l, void *r, void __always_unused *args) -{ - struct bucket **lhs = l, **rhs = r; - - swap(*lhs, *rhs); -} +#define bucket_max_cmp(l, r) (bucket_prio(l) < bucket_prio(r)) +#define bucket_min_cmp(l, r) (bucket_prio(l) > bucket_prio(r)) static void invalidate_buckets_lru(struct cache *ca) { struct bucket *b; - const struct min_heap_callbacks bucket_max_cmp_callback = { - .less = new_bucket_max_cmp, - .swp = new_bucket_swap, - }; - const struct min_heap_callbacks bucket_min_cmp_callback = { - .less = new_bucket_min_cmp, - .swp = new_bucket_swap, - }; + ssize_t i; - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (!bch_can_invalidate_bucket(ca, b)) continue; - if (!min_heap_full(&ca->heap)) - min_heap_push(&ca->heap, &b, &bucket_max_cmp_callback, ca); - else if (!new_bucket_max_cmp(&b, min_heap_peek(&ca->heap), ca)) { + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &bucket_max_cmp_callback, ca); + heap_sift(&ca->heap, 0, bucket_max_cmp); } } - min_heapify_all(&ca->heap, &bucket_min_cmp_callback, ca); + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); while (!fifo_full(&ca->free_inc)) { - if (!ca->heap.nr) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { /* * We don't want to be calling invalidate_buckets() * multiple times when it can't do anything @@ -234,8 +206,6 @@ static void invalidate_buckets_lru(struct cache *ca) wake_up_gc(ca->set); return; } - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &bucket_min_cmp_callback, ca); bch_invalidate_one_bucket(ca, b); } diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 785b0d9008fa..1d33e40d26ea 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -458,7 +458,7 @@ struct cache { /* Allocation stuff: */ struct bucket *buckets; - DEFINE_MIN_HEAP(struct bucket *, cache_heap) heap; + DECLARE_HEAP(struct bucket *, heap); /* * If nonzero, we know we aren't going to find any buckets to invalidate diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index bd97d8626887..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,11 +54,9 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - if (b->ops->is_extents) for_each_key(b, k, &iter) ret += KEY_SIZE(k); @@ -69,11 +67,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) { va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key(b, k, &iter) { if (b->ops->is_extents) { err = "Keys out of order"; @@ -114,9 +110,9 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) static void bch_btree_iter_next_check(struct btree_iter *iter) { - struct bkey *k = iter->heap.data->k, *next = bkey_next(k); + struct bkey *k = iter->data->k, *next = bkey_next(k); - if (next < iter->heap.data->end && + if (next < iter->data->end && bkey_cmp(k, iter->b->ops->is_extents ? &START_KEY(next) : next) > 0) { bch_dump_bucket(iter->b); @@ -883,14 +879,12 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; BUG_ON(b->ops->is_extents && !KEY_SIZE(k)); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* * If k has preceding key, preceding_key_p will be set to address * of k's preceding key; otherwise preceding_key_p will be set @@ -901,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1083,102 +1077,79 @@ struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, /* Btree iterator */ -typedef bool (new_btree_iter_cmp_fn)(const void *, const void *, void *); +typedef bool (btree_iter_cmp_fn)(struct btree_iter_set, + struct btree_iter_set); -static inline bool new_btree_iter_cmp(const void *l, const void *r, void __always_unused *args) +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - const struct btree_iter_set *_l = l; - const struct btree_iter_set *_r = r; - - return bkey_cmp(_l->k, _r->k) <= 0; -} - -static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) -{ - struct btree_iter_set *_iter1 = iter1; - struct btree_iter_set *_iter2 = iter2; - - swap(*_iter1, *_iter2); + return bkey_cmp(l.k, r.k) > 0; } static inline bool btree_iter_end(struct btree_iter *iter) { - return !iter->heap.nr; + return !iter->used; } void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end) { - const struct min_heap_callbacks callbacks = { - .less = new_btree_iter_cmp, - .swp = new_btree_iter_swap, - }; - if (k != end) - BUG_ON(!min_heap_push(&iter->heap, - &((struct btree_iter_set) { k, end }), - &callbacks, - NULL)); + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->heap.size = ARRAY_SIZE(iter->heap.preallocated); - iter->heap.nr = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, - new_btree_iter_cmp_fn *cmp) + btree_iter_cmp_fn *cmp) { struct btree_iter_set b __maybe_unused; struct bkey *ret = NULL; - const struct min_heap_callbacks callbacks = { - .less = cmp, - .swp = new_btree_iter_swap, - }; if (!btree_iter_end(iter)) { bch_btree_iter_next_check(iter); - ret = iter->heap.data->k; - iter->heap.data->k = bkey_next(iter->heap.data->k); + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); - if (iter->heap.data->k > iter->heap.data->end) { + if (iter->data->k > iter->data->end) { WARN_ONCE(1, "bset was corrupt!\n"); - iter->heap.data->k = iter->heap.data->end; + iter->data->k = iter->data->end; } - if (iter->heap.data->k == iter->heap.data->end) { - if (iter->heap.nr) { - b = min_heap_peek(&iter->heap)[0]; - min_heap_pop(&iter->heap, &callbacks, NULL); - } - } + if (iter->data->k == iter->data->end) + heap_pop(iter, b, cmp); else - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, cmp); } return ret; @@ -1186,7 +1157,7 @@ static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, struct bkey *bch_btree_iter_next(struct btree_iter *iter) { - return __bch_btree_iter_next(iter, new_btree_iter_cmp); + return __bch_btree_iter_next(iter, btree_iter_cmp); } @@ -1224,18 +1195,16 @@ static void btree_mergesort(struct btree_keys *b, struct bset *out, struct btree_iter *iter, bool fixup, bool remove_stale) { + int i; struct bkey *k, *last = NULL; BKEY_PADDED(k) tmp; bool (*bad)(struct btree_keys *, const struct bkey *) = remove_stale ? bch_ptr_bad : bch_ptr_invalid; - const struct min_heap_callbacks callbacks = { - .less = b->ops->sort_cmp, - .swp = new_btree_iter_swap, - }; /* Heapify the iterator, using our comparison function */ - min_heapify_all(&iter->heap, &callbacks, NULL); + for (i = iter->used / 2 - 1; i >= 0; --i) + heap_sift(iter, i, b->ops->sort_cmp); while (!btree_iter_end(iter)) { if (b->ops->sort_fixup && fixup) @@ -1324,11 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - min_heap_init(&iter.heap, NULL, MAX_BSETS); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1339,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1355,13 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); + bch_btree_iter_stack_init(b, &iter, NULL); - bch_btree_iter_init(b, &iter, NULL); - - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index f79441acd4c1..011f6062c4c0 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -187,9 +187,8 @@ struct bset_tree { }; struct btree_keys_ops { - bool (*sort_cmp)(const void *l, - const void *r, - void *args); + bool (*sort_cmp)(struct btree_iter_set l, + struct btree_iter_set r); struct bkey *(*sort_fixup)(struct btree_iter *iter, struct bkey *tmp); bool (*insert_fixup)(struct btree_keys *b, @@ -313,17 +312,23 @@ enum { BTREE_INSERT_STATUS_FRONT_MERGE, }; -struct btree_iter_set { - struct bkey *k, *end; -}; - /* Btree key iteration */ struct btree_iter { + size_t size, used; #ifdef CONFIG_BCACHE_DEBUG struct btree_keys *b; #endif - MIN_HEAP_PREALLOCATED(struct btree_iter_set, btree_iter_heap, MAX_BSETS) heap; + struct btree_iter_set { + struct bkey *k, *end; + } data[]; +}; + +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + struct btree_iter iter; + struct btree_iter_set stack_data[MAX_BSETS]; }; typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -335,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -352,13 +357,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 1d0100677357..210b59007d98 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -148,19 +148,19 @@ void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = btree_bset_first(b); - struct btree_iter iter; + struct btree_iter *iter; /* * c->fill_iter can allocate an iterator with more memory space * than static MAX_BSETS. * See the comment arount cache_set->fill_iter. */ - iter.heap.data = mempool_alloc(&b->c->fill_iter, GFP_NOIO); - iter.heap.size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; - iter.heap.nr = 0; + iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); + iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; + iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter.b = &b->keys; + iter->b = &b->keys; #endif if (!i->seq) @@ -198,7 +198,7 @@ void bch_btree_node_read_done(struct btree *b) if (i != b->keys.set[0].data && !i->keys) goto err; - bch_btree_iter_push(&iter, i->start, bset_bkey_last(i)); + bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); b->written += set_blocks(i, block_bytes(b->c->cache)); } @@ -210,7 +210,7 @@ void bch_btree_node_read_done(struct btree *b) if (i->seq == b->keys.set[0].data->seq) goto err; - bch_btree_sort_and_fix_extents(&b->keys, &iter, &b->c->sort); + bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); i = b->keys.set[0].data; err = "short btree key"; @@ -222,7 +222,7 @@ void bch_btree_node_read_done(struct btree *b) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->cache->sb)); out: - mempool_free(iter.heap.data, &b->c->fill_iter); + mempool_free(iter, &b->c->fill_iter); return; err: set_btree_node_io_error(b); @@ -1306,11 +1306,9 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - gc->nodes++; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { @@ -1569,11 +1567,9 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); @@ -1612,18 +1608,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1918,9 +1914,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1928,10 +1922,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1959,7 +1953,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1967,11 +1961,9 @@ static int bch_btree_check_thread(void *arg) cur_idx = prev_idx = 0; ret = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1989,7 +1981,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2062,11 +2054,9 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - /* check and mark root node keys */ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(c, c->root->level, k); @@ -2560,12 +2550,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2594,12 +2583,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : bcache_btree(map_keys_recurse, k, diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index a7221e5dbe81..d626ffcbecb9 100644 --- a/drivers/md/bcache/extents.c +++ b/drivers/md/bcache/extents.c @@ -33,16 +33,15 @@ static void sort_key_next(struct btree_iter *iter, i->k = bkey_next(i->k); if (i->k == i->end) - *i = iter->heap.data[--iter->heap.nr]; + *i = iter->data[--iter->used]; } -static bool new_bch_key_sort_cmp(const void *l, const void *r, void *args) +static bool bch_key_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(_l->k, _r->k); + int64_t c = bkey_cmp(l.k, r.k); - return !(c ? c > 0 : _l->k < _r->k); + return c ? c > 0 : l.k < r.k; } static bool __ptr_invalid(struct cache_set *c, const struct bkey *k) @@ -239,7 +238,7 @@ static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk, } const struct btree_keys_ops bch_btree_keys_ops = { - .sort_cmp = new_bch_key_sort_cmp, + .sort_cmp = bch_key_sort_cmp, .insert_fixup = bch_btree_ptr_insert_fixup, .key_invalid = bch_btree_ptr_invalid, .key_bad = bch_btree_ptr_bad, @@ -256,36 +255,22 @@ const struct btree_keys_ops bch_btree_keys_ops = { * Necessary for btree_sort_fixup() - if there are multiple keys that compare * equal in different sets, we have to process them newest to oldest. */ - -static bool new_bch_extent_sort_cmp(const void *l, const void *r, void __always_unused *args) +static bool bch_extent_sort_cmp(struct btree_iter_set l, + struct btree_iter_set r) { - struct btree_iter_set *_l = (struct btree_iter_set *)l; - struct btree_iter_set *_r = (struct btree_iter_set *)r; - int64_t c = bkey_cmp(&START_KEY(_l->k), &START_KEY(_r->k)); + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); - return !(c ? c > 0 : _l->k < _r->k); -} - -static inline void new_btree_iter_swap(void *iter1, void *iter2, void __always_unused *args) -{ - struct btree_iter_set *_iter1 = iter1; - struct btree_iter_set *_iter2 = iter2; - - swap(*_iter1, *_iter2); + return c ? c > 0 : l.k < r.k; } static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, struct bkey *tmp) { - const struct min_heap_callbacks callbacks = { - .less = new_bch_extent_sort_cmp, - .swp = new_btree_iter_swap, - }; - while (iter->heap.nr > 1) { - struct btree_iter_set *top = iter->heap.data, *i = top + 1; + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; - if (iter->heap.nr > 2 && - !new_bch_extent_sort_cmp(&i[0], &i[1], NULL)) + if (iter->used > 2 && + bch_extent_sort_cmp(i[0], i[1])) i++; if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0) @@ -293,7 +278,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, if (!KEY_SIZE(i->k)) { sort_key_next(iter, i); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); continue; } @@ -303,7 +288,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, else bch_cut_front(top->k, i->k); - min_heap_sift_down(&iter->heap, i - top, &callbacks, NULL); + heap_sift(iter, i - top, bch_extent_sort_cmp); } else { /* can't happen because of comparison func */ BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k))); @@ -313,7 +298,7 @@ static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter, bch_cut_back(&START_KEY(i->k), tmp); bch_cut_front(i->k, top->k); - min_heap_sift_down(&iter->heap, 0, &callbacks, NULL); + heap_sift(iter, 0, bch_extent_sort_cmp); return tmp; } else { @@ -633,7 +618,7 @@ static bool bch_extent_merge(struct btree_keys *bk, } const struct btree_keys_ops bch_extent_keys_ops = { - .sort_cmp = new_bch_extent_sort_cmp, + .sort_cmp = bch_extent_sort_cmp, .sort_fixup = bch_extent_sort_fixup, .insert_fixup = bch_extent_insert_fixup, .key_invalid = bch_extent_invalid, diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index d6c73dd8eb2b..26a6a535ec32 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -182,27 +182,16 @@ err: if (!IS_ERR_OR_NULL(w->private)) closure_sync(&cl); } -static bool new_bucket_cmp(const void *l, const void *r, void __always_unused *args) +static bool bucket_cmp(struct bucket *l, struct bucket *r) { - struct bucket **_l = (struct bucket **)l; - struct bucket **_r = (struct bucket **)r; - - return GC_SECTORS_USED(*_l) >= GC_SECTORS_USED(*_r); -} - -static void new_bucket_swap(void *l, void *r, void __always_unused *args) -{ - struct bucket **_l = l; - struct bucket **_r = r; - - swap(*_l, *_r); + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); } static unsigned int bucket_heap_top(struct cache *ca) { struct bucket *b; - return (b = min_heap_peek(&ca->heap)[0]) ? GC_SECTORS_USED(b) : 0; + return (b = heap_peek(&ca->heap)) ? GC_SECTORS_USED(b) : 0; } void bch_moving_gc(struct cache_set *c) @@ -210,10 +199,6 @@ void bch_moving_gc(struct cache_set *c) struct cache *ca = c->cache; struct bucket *b; unsigned long sectors_to_move, reserve_sectors; - const struct min_heap_callbacks callbacks = { - .less = new_bucket_cmp, - .swp = new_bucket_swap, - }; if (!c->copy_gc_enabled) return; @@ -224,7 +209,7 @@ void bch_moving_gc(struct cache_set *c) reserve_sectors = ca->sb.bucket_size * fifo_used(&ca->free[RESERVE_MOVINGGC]); - ca->heap.nr = 0; + ca->heap.used = 0; for_each_bucket(b, ca) { if (GC_MARK(b) == GC_MARK_METADATA || @@ -233,31 +218,25 @@ void bch_moving_gc(struct cache_set *c) atomic_read(&b->pin)) continue; - if (!min_heap_full(&ca->heap)) { + if (!heap_full(&ca->heap)) { sectors_to_move += GC_SECTORS_USED(b); - min_heap_push(&ca->heap, &b, &callbacks, NULL); - } else if (!new_bucket_cmp(&b, min_heap_peek(&ca->heap), ca)) { + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { sectors_to_move -= bucket_heap_top(ca); sectors_to_move += GC_SECTORS_USED(b); ca->heap.data[0] = b; - min_heap_sift_down(&ca->heap, 0, &callbacks, NULL); + heap_sift(&ca->heap, 0, bucket_cmp); } } while (sectors_to_move > reserve_sectors) { - if (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); - } + heap_pop(&ca->heap, b, bucket_cmp); sectors_to_move -= GC_SECTORS_USED(b); } - while (ca->heap.nr) { - b = min_heap_peek(&ca->heap)[0]; - min_heap_pop(&ca->heap, &callbacks, NULL); + while (heap_pop(&ca->heap, b, bucket_cmp)) SET_GC_MOVE(b, 1); - } mutex_unlock(&c->bucket_lock); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 1efb768b2890..2ea490b9d370 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1912,7 +1912,8 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index e8f696cb58c0..826b14cae4e5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -660,9 +660,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; - - min_heap_init(&iter.heap, NULL, MAX_BSETS); + struct btree_iter_stack iter; goto lock_root; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 539454d8e2d0..f61ab1bada6c 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -31,10 +30,16 @@ struct closure; #endif +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + #define init_heap(heap, _size, gfp) \ ({ \ size_t _bytes; \ - (heap)->nr = 0; \ + (heap)->used = 0; \ (heap)->size = (_size); \ _bytes = (heap)->size * sizeof(*(heap)->data); \ (heap)->data = kvmalloc(_bytes, (gfp) & GFP_KERNEL); \ @@ -47,6 +52,64 @@ do { \ (heap)->data = NULL; \ } while (0) +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->used ? (h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + #define DECLARE_FIFO(type, name) \ struct { \ size_t front, back, size, mask; \ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 453efbbdc8ee..302e75f1fc4b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -908,16 +908,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -931,7 +930,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -980,13 +979,11 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; - min_heap_init(&iter.heap, NULL, MAX_BSETS); - retry_lock: b = c->root; rw_lock(0, b, b->level); From 95b2e31e1752494d477c5da89d6789f769b0d67b Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sun, 15 Jun 2025 04:23:53 +0800 Subject: [PATCH 205/284] bcache: remove unnecessary select MIN_HEAP After reverting the transition to the generic min heap library, bcache no longer depends on MIN_HEAP. The select entry can be removed to reduce code size and shrink the kernel's attack surface. This change effectively reverts the bcache-related part of commit 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions"). This is part of a series of changes to address a performance regression caused by the use of the generic min_heap implementation. As reported by Robert, bcache now suffers from latency spikes, with P100 (max) latency increasing from 600 ms to 2.4 seconds every 5 minutes. These regressions degrade bcache's effectiveness as a low-latency cache layer and lead to frequent timeouts and application stalls in production environments. Link: https://lore.kernel.org/lkml/CAJhEC05+0S69z+3+FB2Cd0hD+pCRyWTKLEOsc8BOmH73p1m+KQ@mail.gmail.com Link: https://lkml.kernel.org/r/20250614202353.1632957-4-visitorckw@gmail.com Fixes: 866898efbb25 ("bcache: remove heap-related macros and switch to generic min_heap") Fixes: 92a8b224b833 ("lib/min_heap: introduce non-inline versions of min heap API functions") Signed-off-by: Kuan-Wei Chiu Reported-by: Robert Pang Closes: https://lore.kernel.org/linux-bcache/CAJhEC06F_AtrPgw2-7CvCqZgeStgCtitbD-ryuPpXQA-JG5XXw@mail.gmail.com Acked-by: Coly Li Cc: Ching-Chun (Jim) Huang Cc: Kent Overstreet Cc: Signed-off-by: Andrew Morton --- drivers/md/bcache/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index d4697e79d5a3..b2d10063d35f 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -5,7 +5,6 @@ config BCACHE select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 select CLOSURES - select MIN_HEAP help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. From 3333871296efd52ef6f6d5cce5a92dc7b06ba879 Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Tue, 10 Jun 2025 13:22:09 +0100 Subject: [PATCH 206/284] selftests/mm: skip uprobe vma merge test if uprobes are not enabled If uprobes are not enabled, the test currently fails with: 7151 12:46:54.627936 # # # RUN merge.handle_uprobe_upon_merged_vma ... 7152 12:46:54.639014 # # f /sys/bus/event_source/devices/uprobe/type 7153 12:46:54.639306 # # fopen: No such file or directory 7154 12:46:54.650451 # # # merge.c:473:handle_uprobe_upon_merged_vma:Expected read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) (1) == 0 (0) 7155 12:46:54.650730 # # # handle_uprobe_upon_merged_vma: Test terminated by assertion 7156 12:46:54.661750 # # # FAIL merge.handle_uprobe_upon_merged_vma 7157 12:46:54.662030 # # not ok 8 merge.handle_uprobe_upon_merged_vma Skipping is a more sane and friendly behavior here. Link: https://lkml.kernel.org/r/20250610122209.3177587-1-pfalcato@suse.de Fixes: efe99fabeb11 ("selftests/mm: add test about uprobe pte be orphan during vma merge") Signed-off-by: Pedro Falcato Reported-by: Aishwarya Closes: https://lore.kernel.org/linux-mm/20250610103729.72440-1-aishwarya.tcv@arm.com/ Reviewed-by: Lorenzo Stoakes Tested-by : Donet Tom Reviewed-by : Donet Tom Reviewed-by: Dev Jain Reviewed-by: Pu Lehui Cc: Jann Horn Cc: Liam Howlett Cc: Mark Brown Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/merge.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/merge.c b/tools/testing/selftests/mm/merge.c index bbae66fc5038..cc26480098ae 100644 --- a/tools/testing/selftests/mm/merge.c +++ b/tools/testing/selftests/mm/merge.c @@ -470,7 +470,9 @@ TEST_F(merge, handle_uprobe_upon_merged_vma) ASSERT_GE(fd, 0); ASSERT_EQ(ftruncate(fd, page_size), 0); - ASSERT_EQ(read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type), 0); + if (read_sysfs("/sys/bus/event_source/devices/uprobe/type", &type) != 0) { + SKIP(goto out, "Failed to read uprobe sysfs file, skipping"); + } memset(&attr, 0, attr_sz); attr.size = attr_sz; @@ -491,6 +493,7 @@ TEST_F(merge, handle_uprobe_upon_merged_vma) ASSERT_NE(mremap(ptr2, page_size, page_size, MREMAP_MAYMOVE | MREMAP_FIXED, ptr1), MAP_FAILED); +out: close(fd); remove(probe_file); } From 38103247777695ca2b09691b2247e66f157908c6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 16 Jun 2025 21:16:43 +0100 Subject: [PATCH 207/284] MAINTAINERS: add missing mm/workingset.c file to mm reclaim section The working set logic belongs very much to the reclaim section and is otherwise not assigned to any other MAINTAINERS section so add it here. Link: https://lkml.kernel.org/r/20250616201643.561626-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Shakeel Butt Acked-by: Qi Zheng Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 9a7fdc52cf9d..0497cbcc5d3e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15871,6 +15871,7 @@ L: linux-mm@kvack.org S: Maintained F: mm/pt_reclaim.c F: mm/vmscan.c +F: mm/workingset.c MEMORY MANAGEMENT - RMAP (REVERSE MAPPING) M: Andrew Morton From 40ffd2887635c492b758d8b02172c97f5d42f593 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 16 Jun 2025 21:08:44 +0100 Subject: [PATCH 208/284] MAINTAINERS: add missing test files to mm gup section We previously overlooked GUP test files that sensibly should belong to the GUP section, include them now. Link: https://lkml.kernel.org/r/20250616200844.560225-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Jason Gunthorpe Cc: Peter Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0497cbcc5d3e..a70454f527c3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15795,6 +15795,10 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: mm/gup.c +F: mm/gup_test.c +F: mm/gup_test.h +F: tools/testing/selftests/mm/gup_longterm.c +F: tools/testing/selftests/mm/gup_test.c MEMORY MANAGEMENT - KSM (Kernel Samepage Merging) M: Andrew Morton From fba46a5d83ca8decb338722fb4899026d8d9ead2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 16 Jun 2025 14:45:20 -0400 Subject: [PATCH 209/284] maple_tree: fix MA_STATE_PREALLOC flag in mas_preallocate() Temporarily clear the preallocation flag when explicitly requesting allocations. Pre-existing allocations are already counted against the request through mas_node_count_gfp(), but the allocations will not happen if the MA_STATE_PREALLOC flag is set. This flag is meant to avoid re-allocating in bulk allocation mode, and to detect issues with preallocation calculations. The MA_STATE_PREALLOC flag should also always be set on zero allocations so that detection of underflow allocations will print a WARN_ON() during consumption. User visible effect of this flaw is a WARN_ON() followed by a null pointer dereference when subsequent requests for larger number of nodes is ignored, such as the vma merge retry in mmap_region() caused by drivers altering the vma flags (which happens in v6.6, at least) Link: https://lkml.kernel.org/r/20250616184521.3382795-3-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Liam R. Howlett Reported-by: Zhaoyang Huang Reported-by: Hailong Liu Link: https://lore.kernel.org/all/1652f7eb-a51b-4fee-8058-c73af63bacd1@oppo.com/ Link: https://lore.kernel.org/all/20250428184058.1416274-1-Liam.Howlett@oracle.com/ Link: https://lore.kernel.org/all/20250429014754.1479118-1-Liam.Howlett@oracle.com/ Cc: Lorenzo Stoakes Cc: Suren Baghdasaryan Cc: Hailong Liu Cc: zhangpeng.00@bytedance.com Cc: Steve Kang Cc: Matthew Wilcox Cc: Sidhartha Kumar Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index affe979bd14d..00524e55a21e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5527,8 +5527,9 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(&wr_mas, entry); if (!request) - return ret; + goto set_flag; + mas->mas_flags &= ~MA_STATE_PREALLOC; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); @@ -5538,6 +5539,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) return ret; } +set_flag: mas->mas_flags |= MA_STATE_PREALLOC; return ret; } From 883cf5b0b8389610f17106c454180c7f54b8d486 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 17 Jun 2025 20:59:10 +0200 Subject: [PATCH 210/284] MAINTAINERS: update maintainers for HugeTLB Change my role to Maintainer as I am quite involved in HugeTLB development, and will be more so with the upcoming HugetLB-pagewalk unification, so I would like to help Munchun take care of the code. Besides, having two people will help in offloading some pressure. Also add David as a Reviewer since he has quite some knowledge in the field and has already provided valuable feedback. Link: https://lkml.kernel.org/r/20250617185910.471406-1-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Muchun Song Cc: Michal Hocko Cc: Peter Xu Signed-off-by: Andrew Morton --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a70454f527c3..a2c8e4546cf8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11157,7 +11157,8 @@ F: include/linux/platform_data/huawei-gaokun-ec.h HUGETLB SUBSYSTEM M: Muchun Song -R: Oscar Salvador +M: Oscar Salvador +R: David Hildenbrand L: linux-mm@kvack.org S: Maintained F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages From b6d19f3742ff3429d8eb2cdf51a8ab598c197ee5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 18:45:38 +0100 Subject: [PATCH 211/284] MAINTAINERS: add further init files to mm init block These files comprise the bootmem info logic which is initialised on startup and also memory tests that are run on startup and as such this seems the most appropriate section for them. Link: https://lkml.kernel.org/r/20250617174538.188977-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index a2c8e4546cf8..1f87cb444c83 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15680,8 +15680,11 @@ S: Maintained F: Documentation/core-api/boot-time-mm.rst F: Documentation/core-api/kho/bindings/memblock/* F: include/linux/memblock.h +F: mm/bootmem_info.c F: mm/memblock.c +F: mm/memtest.c F: mm/mm_init.c +F: mm/rodata_test.c F: tools/testing/memblock/ MEMORY ALLOCATION PROFILING From d91b00b687abf6fef13be030abd848416f320149 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 18:15:38 +0100 Subject: [PATCH 212/284] MAINTAINERS: add hugetlb_cgroup.c to hugetlb section This file is clearly specific to hugetlb so this seems the most appropriate place for it. Link: https://lkml.kernel.org/r/20250617171538.178042-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1f87cb444c83..cf9ea0425011 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11169,6 +11169,7 @@ F: fs/hugetlbfs/ F: include/linux/hugetlb.h F: include/trace/events/hugetlbfs.h F: mm/hugetlb.c +F: mm/hugetlb_cgroup.c F: mm/hugetlb_cma.c F: mm/hugetlb_cma.h F: mm/hugetlb_vmemmap.c From a1540dcbe0246c42ee271a3b7bba5ec7c933321a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 17:51:42 +0100 Subject: [PATCH 213/284] MAINTAINERS: add stray rmap file to mm rmap section page_vma_mapped_walk() is used to traverse page tables from a VMA, used by rmap logic once the reverse mapping has been traversed to the VMA level. It is also used by other users (migration, damon, etc.) but is primarily used by the reverse mapping and is a key part of its logic, so it seems appropriate to place it here. Link: https://lkml.kernel.org/r/20250617165142.173716-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index cf9ea0425011..49fcc5d02df7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15893,6 +15893,7 @@ R: Harry Yoo L: linux-mm@kvack.org S: Maintained F: include/linux/rmap.h +F: mm/page_vma_mapped.c F: mm/rmap.c MEMORY MANAGEMENT - SECRETMEM From db5921ab8aa23142b277325192a7443601dc4534 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 17:13:59 +0100 Subject: [PATCH 214/284] MAINTAINERS: add memfd, shmem quota files to shmem section These files seem best suited to shmem. Link: https://lkml.kernel.org/r/20250617161359.166955-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 49fcc5d02df7..0eec6e1d4694 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15740,7 +15740,6 @@ F: Documentation/admin-guide/mm/ F: Documentation/mm/ F: include/linux/gfp.h F: include/linux/gfp_types.h -F: include/linux/memfd.h F: include/linux/memory_hotplug.h F: include/linux/memory-tiers.h F: include/linux/mempolicy.h @@ -25042,8 +25041,11 @@ M: Hugh Dickins R: Baolin Wang L: linux-mm@kvack.org S: Maintained +F: include/linux/memfd.h F: include/linux/shmem_fs.h +F: mm/memfd.c F: mm/shmem.c +F: mm/shmem_quota.c TOMOYO SECURITY MODULE M: Kentaro Takeda From c742d127d2d831aa83ae2987a508bca2bf0c7736 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 17 Jun 2025 15:41:30 +0100 Subject: [PATCH 215/284] MAINTAINERS: add additional mmap-related files to mmap section msync and nommu are directly related to memory mapping, mincore is less so but all are roughly speaking operating on virtual memory mappings from the point of view of the user so this seems the most appropriate place for them. Link: https://lkml.kernel.org/r/20250617144130.147847-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Acked-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Jann Horn Signed-off-by: Andrew Morton --- MAINTAINERS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 0eec6e1d4694..ca8a70f7b047 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15985,11 +15985,14 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm F: include/trace/events/mmap.h +F: mm/mincore.c F: mm/mlock.c F: mm/mmap.c F: mm/mprotect.c F: mm/mremap.c F: mm/mseal.c +F: mm/msync.c +F: mm/nommu.c F: mm/vma.c F: mm/vma.h F: mm/vma_exec.c From 33bdee12822d240caaa01aeac231eac42182fd64 Mon Sep 17 00:00:00 2001 From: Edson Juliano Drosdeck Date: Thu, 19 Jun 2025 16:12:15 -0300 Subject: [PATCH 216/284] ALSA: hda/realtek: Enable headset Mic on Positivo P15X Positivo P15X is equipped with ALC269VC, and needs a fix to make the headset mic to work. Also must to limits the internal microphone boost. Signed-off-by: Edson Juliano Drosdeck Link: https://patch.msgid.link/20250619191215.17203-1-edson.drosdeck@gmail.com Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 76cc908d4957..2e1618494c20 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -8032,6 +8032,7 @@ enum { ALC287_FIXUP_PREDATOR_SPK_CS35L41_I2C_2, ALC285_FIXUP_ASUS_GA605K_HEADSET_MIC, ALC285_FIXUP_ASUS_GA605K_I2C_SPEAKER2_TO_DAC1, + ALC269_FIXUP_POSITIVO_P15X_HEADSET_MIC, }; /* A special fixup for Lenovo C940 and Yoga Duet 7; @@ -10430,6 +10431,12 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_speaker2_to_dac1, }, + [ALC269_FIXUP_POSITIVO_P15X_HEADSET_MIC] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc269_fixup_limit_int_mic_boost, + .chained = true, + .chain_id = ALC269VC_FIXUP_ACER_MIC_NO_PRESENCE, + }, }; static const struct hda_quirk alc269_fixup_tbl[] = { @@ -11406,6 +11413,7 @@ static const struct hda_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x2782, 0x0214, "VAIO VJFE-CL", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), SND_PCI_QUIRK(0x2782, 0x0228, "Infinix ZERO BOOK 13", ALC269VB_FIXUP_INFINIX_ZERO_BOOK_13), SND_PCI_QUIRK(0x2782, 0x0232, "CHUWI CoreBook XPro", ALC269VB_FIXUP_CHUWI_COREBOOK_XPRO), + SND_PCI_QUIRK(0x2782, 0x1407, "Positivo P15X", ALC269_FIXUP_POSITIVO_P15X_HEADSET_MIC), SND_PCI_QUIRK(0x2782, 0x1701, "Infinix Y4 Max", ALC269VC_FIXUP_INFINIX_Y4_MAX), SND_PCI_QUIRK(0x2782, 0x1705, "MEDION E15433", ALC269VC_FIXUP_INFINIX_Y4_MAX), SND_PCI_QUIRK(0x2782, 0x1707, "Vaio VJFE-ADL", ALC298_FIXUP_SPK_VOLUME), From 51a4598ad5d9eb6be4ec9ba65bbfdf0ac302eb2e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Jun 2025 07:41:21 -0600 Subject: [PATCH 217/284] io_uring/net: always use current transfer count for buffer put A previous fix corrected the retry condition for when to continue a current bundle, but it missed that the current (not the total) transfer count also applies to the buffer put. If not, then for incrementally consumed buffer rings repeated completions on the same request may end up over consuming. Reported-by: Roy Tang (ErgoniaTrading) Cc: stable@vger.kernel.org Fixes: 3a08988123c8 ("io_uring/net: only retry recv bundle for a full transfer") Link: https://github.com/axboe/liburing/issues/1423 Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index e16633fd6630..9550d4c8f866 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -821,7 +821,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (sr->flags & IORING_RECVSEND_BUNDLE) { size_t this_ret = *ret - sr->done_io; - cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret), + cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), issue_flags); if (sr->retry) cflags = req->cqe.flags | (cflags & CQE_F_MASK); From 417b8af2e30d7f131682a893ad79c506fd39c624 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 20 Jun 2025 23:31:08 +0800 Subject: [PATCH 218/284] erofs: remove a superfluous check for encoded extents It is possible when an inode is split into segments for multi-threaded compression, and the tail extent of a segment could also be small. Fixes: 1d191b4ca51d ("erofs: implement encoded extent metadata") Signed-off-by: Gao Xiang Link: https://lore.kernel.org/r/20250620153108.1368029-1-hsiangkao@linux.alibaba.com --- fs/erofs/zmap.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 6afcb054780d..0bebc6e3a4d7 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -639,12 +639,6 @@ static int z_erofs_map_blocks_ext(struct inode *inode, } } map->m_llen = lend - map->m_la; - if (!last && map->m_llen < sb->s_blocksize) { - erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu", - map->m_llen, map->m_la, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } return 0; } From 89a33de314945c866b155d369f224fa552af1722 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Tue, 10 Jun 2025 19:30:37 +0530 Subject: [PATCH 219/284] Bluetooth: btintel_pcie: Fix potential race condition in firmware download During firmware download, if an error occurs, interrupts must be disabled, synchronized, and re-enabled before retrying the download. This change ensures proper interrupt handling to prevent race conditions. Signed-off-by: Chandrashekar Devegowda Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel_pcie.c | 33 ++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index 563165c5efae..e1c688dd2d45 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -2033,6 +2033,28 @@ static void btintel_pcie_release_hdev(struct btintel_pcie_data *data) data->hdev = NULL; } +static void btintel_pcie_disable_interrupts(struct btintel_pcie_data *data) +{ + spin_lock(&data->irq_lock); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_FH_INT_MASK, data->fh_init_mask); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_HW_INT_MASK, data->hw_init_mask); + spin_unlock(&data->irq_lock); +} + +static void btintel_pcie_enable_interrupts(struct btintel_pcie_data *data) +{ + spin_lock(&data->irq_lock); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_FH_INT_MASK, ~data->fh_init_mask); + btintel_pcie_wr_reg32(data, BTINTEL_PCIE_CSR_MSIX_HW_INT_MASK, ~data->hw_init_mask); + spin_unlock(&data->irq_lock); +} + +static void btintel_pcie_synchronize_irqs(struct btintel_pcie_data *data) +{ + for (int i = 0; i < data->alloc_vecs; i++) + synchronize_irq(data->msix_entries[i].vector); +} + static int btintel_pcie_setup_internal(struct hci_dev *hdev) { struct btintel_pcie_data *data = hci_get_drvdata(hdev); @@ -2152,6 +2174,8 @@ static int btintel_pcie_setup(struct hci_dev *hdev) bt_dev_err(hdev, "Firmware download retry count: %d", fw_dl_retry); btintel_pcie_dump_debug_registers(hdev); + btintel_pcie_disable_interrupts(data); + btintel_pcie_synchronize_irqs(data); err = btintel_pcie_reset_bt(data); if (err) { bt_dev_err(hdev, "Failed to do shr reset: %d", err); @@ -2159,6 +2183,7 @@ static int btintel_pcie_setup(struct hci_dev *hdev) } usleep_range(10000, 12000); btintel_pcie_reset_ia(data); + btintel_pcie_enable_interrupts(data); btintel_pcie_config_msix(data); err = btintel_pcie_enable_bt(data); if (err) { @@ -2291,6 +2316,12 @@ static void btintel_pcie_remove(struct pci_dev *pdev) data = pci_get_drvdata(pdev); + btintel_pcie_disable_interrupts(data); + + btintel_pcie_synchronize_irqs(data); + + flush_work(&data->rx_work); + btintel_pcie_reset_bt(data); for (int i = 0; i < data->alloc_vecs; i++) { struct msix_entry *msix_entry; @@ -2303,8 +2334,6 @@ static void btintel_pcie_remove(struct pci_dev *pdev) btintel_pcie_release_hdev(data); - flush_work(&data->rx_work); - destroy_workqueue(data->workqueue); btintel_pcie_free(data); From 042bb9603c44620dce98717a2d23235ca57a00d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Danis?= Date: Thu, 12 Jun 2025 09:50:34 +0200 Subject: [PATCH 220/284] Bluetooth: L2CAP: Fix L2CAP MTU negotiation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OBEX download from iPhone is currently slow due to small packet size used to transfer data which doesn't follow the MTU negotiated during L2CAP connection, i.e. 672 bytes instead of 32767: < ACL Data TX: Handle 11 flags 0x00 dlen 12 L2CAP: Connection Request (0x02) ident 18 len 4 PSM: 4103 (0x1007) Source CID: 72 > ACL Data RX: Handle 11 flags 0x02 dlen 16 L2CAP: Connection Response (0x03) ident 18 len 8 Destination CID: 14608 Source CID: 72 Result: Connection successful (0x0000) Status: No further information available (0x0000) < ACL Data TX: Handle 11 flags 0x00 dlen 27 L2CAP: Configure Request (0x04) ident 20 len 19 Destination CID: 14608 Flags: 0x0000 Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 26 L2CAP: Configure Request (0x04) ident 72 len 18 Destination CID: 72 Flags: 0x0000 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 0 Monitor timeout: 0 Maximum PDU size: 65527 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) < ACL Data TX: Handle 11 flags 0x00 dlen 29 L2CAP: Configure Response (0x05) ident 72 len 21 Source CID: 14608 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 672 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 32 L2CAP: Configure Response (0x05) ident 20 len 24 Source CID: 72 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) ... > ACL Data RX: Handle 11 flags 0x02 dlen 680 Channel: 72 len 676 ctrl 0x0202 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Unsegmented TxSeq 1 ReqSeq 2 < ACL Data TX: Handle 11 flags 0x00 dlen 13 Channel: 14608 len 9 ctrl 0x0204 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Unsegmented TxSeq 2 ReqSeq 2 > ACL Data RX: Handle 11 flags 0x02 dlen 680 Channel: 72 len 676 ctrl 0x0304 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Unsegmented TxSeq 2 ReqSeq 3 The MTUs are negotiated for each direction. In this traces 32767 for iPhone->localhost and no MTU for localhost->iPhone, which based on '4.4 L2CAP_CONFIGURATION_REQ' (Core specification v5.4, Vol. 3, Part A): The only parameters that should be included in the L2CAP_CONFIGURATION_REQ packet are those that require different values than the default or previously agreed values. ... Any missing configuration parameters are assumed to have their most recently explicitly or implicitly accepted values. and '5.1 Maximum transmission unit (MTU)': If the remote device sends a positive L2CAP_CONFIGURATION_RSP packet it should include the actual MTU to be used on this channel for traffic flowing into the local device. ... The default value is 672 octets. is set by BlueZ to 672 bytes. It seems that the iPhone used the lowest negotiated value to transfer data to the localhost instead of the negotiated one for the incoming direction. This could be fixed by using the MTU negotiated for the other direction, if exists, in the L2CAP_CONFIGURATION_RSP. This allows to use segmented packets as in the following traces: < ACL Data TX: Handle 11 flags 0x00 dlen 12 L2CAP: Connection Request (0x02) ident 22 len 4 PSM: 4103 (0x1007) Source CID: 72 < ACL Data TX: Handle 11 flags 0x00 dlen 27 L2CAP: Configure Request (0x04) ident 24 len 19 Destination CID: 2832 Flags: 0x0000 Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 26 L2CAP: Configure Request (0x04) ident 15 len 18 Destination CID: 72 Flags: 0x0000 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 0 Monitor timeout: 0 Maximum PDU size: 65527 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) < ACL Data TX: Handle 11 flags 0x00 dlen 29 L2CAP: Configure Response (0x05) ident 15 len 21 Source CID: 2832 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 32 Max transmit: 255 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 > ACL Data RX: Handle 11 flags 0x02 dlen 32 L2CAP: Configure Response (0x05) ident 24 len 24 Source CID: 72 Flags: 0x0000 Result: Success (0x0000) Option: Maximum Transmission Unit (0x01) [mandatory] MTU: 32767 Option: Retransmission and Flow Control (0x04) [mandatory] Mode: Enhanced Retransmission (0x03) TX window size: 63 Max transmit: 3 Retransmission timeout: 2000 Monitor timeout: 12000 Maximum PDU size: 1009 Option: Frame Check Sequence (0x05) [mandatory] FCS: 16-bit FCS (0x01) ... > ACL Data RX: Handle 11 flags 0x02 dlen 1009 Channel: 72 len 1005 ctrl 0x4202 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Start (len 21884) TxSeq 1 ReqSeq 2 > ACL Data RX: Handle 11 flags 0x02 dlen 1009 Channel: 72 len 1005 ctrl 0xc204 [PSM 4103 mode Enhanced Retransmission (0x03)] {chan 8} I-frame: Continuation TxSeq 2 ReqSeq 2 This has been tested with kernel 5.4 and BlueZ 5.77. Cc: stable@vger.kernel.org Signed-off-by: Frédéric Danis Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/l2cap_core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a5bde5db58ef..40daa38276f3 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -3415,7 +3415,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; struct l2cap_conf_efs efs; u8 remote_efs = 0; - u16 mtu = L2CAP_DEFAULT_MTU; + u16 mtu = 0; u16 result = L2CAP_CONF_SUCCESS; u16 size; @@ -3520,6 +3520,13 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data /* Configure output options and let the other side know * which ones we don't like. */ + /* If MTU is not provided in configure request, use the most recently + * explicitly or implicitly accepted value for the other direction, + * or the default value. + */ + if (mtu == 0) + mtu = chan->imtu ? chan->imtu : L2CAP_DEFAULT_MTU; + if (mtu < L2CAP_DEFAULT_MIN_MTU) result = L2CAP_CONF_UNACCEPT; else { From db0ff7e15923ffa7067874604ca275e92343f1b1 Mon Sep 17 00:00:00 2001 From: Shuai Zhang Date: Mon, 9 Jun 2025 18:55:00 +0800 Subject: [PATCH 221/284] driver: bluetooth: hci_qca:fix unable to load the BT driver Some modules have BT_EN enabled via a hardware pull-up, meaning it is not defined in the DTS and is not controlled through the power sequence. In such cases, fall through to follow the legacy flow. Signed-off-by: Shuai Zhang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_qca.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 5fe5879881f5..3ec0be496820 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -2392,10 +2392,17 @@ static int qca_serdev_probe(struct serdev_device *serdev) */ qcadev->bt_power->pwrseq = devm_pwrseq_get(&serdev->dev, "bluetooth"); - if (IS_ERR(qcadev->bt_power->pwrseq)) - return PTR_ERR(qcadev->bt_power->pwrseq); - break; + /* + * Some modules have BT_EN enabled via a hardware pull-up, + * meaning it is not defined in the DTS and is not controlled + * through the power sequence. In such cases, fall through + * to follow the legacy flow. + */ + if (IS_ERR(qcadev->bt_power->pwrseq)) + qcadev->bt_power->pwrseq = NULL; + else + break; } fallthrough; case QCA_WCN3950: From b5aafcb4efd2bdacbc37753cf807d69faa6a7304 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:19 +0800 Subject: [PATCH 222/284] KVM: TDX: Add new TDVMCALL status code for unsupported subfuncs Add the new TDVMCALL status code TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED and return it for unimplemented TDVMCALL subfunctions. Returning TDVMCALL_STATUS_INVALID_OPERAND when a subfunction is not implemented is vague because TDX guests can't tell the error is due to the subfunction is not supported or an invalid input of the subfunction. New GHCI spec adds TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED to avoid the ambiguity. Use it instead of TDVMCALL_STATUS_INVALID_OPERAND. Before the change, for common guest implementations, when a TDX guest receives TDVMCALL_STATUS_INVALID_OPERAND, it has two cases: 1. Some operand is invalid. It could change the operand to another value retry. 2. The subfunction is not supported. For case 1, an invalid operand usually means the guest implementation bug. Since the TDX guest can't tell which case is, the best practice for handling TDVMCALL_STATUS_INVALID_OPERAND is stopping calling such leaf, treating the failure as fatal if the TDVMCALL is essential or ignoring it if the TDVMCALL is optional. With this change, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED could be sent to old TDX guest that do not know about it, but it is expected that the guest will make the same action as TDVMCALL_STATUS_INVALID_OPERAND. Currently, no known TDX guest checks TDVMCALL_STATUS_INVALID_OPERAND specifically; for example Linux just checks for success. Signed-off-by: Binbin Wu [Return it for untrapped KVM_HC_MAP_GPA_RANGE. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/shared/tdx.h | 1 + arch/x86/kvm/vmx/tdx.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 2f3820342598..d8525e6ef50a 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -80,6 +80,7 @@ #define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL #define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL #define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL +#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL /* * Bitmasks of exposed registers (with VMM). diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b952bc673271..5d100c240ab3 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1212,11 +1212,13 @@ static int tdx_map_gpa(struct kvm_vcpu *vcpu) /* * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE - * bit set. If not, the error code is not defined in GHCI for TDX, use - * TDVMCALL_STATUS_INVALID_OPERAND for this case. + * bit set. This is a base call so it should always be supported, but + * KVM has no way to ensure that userspace implements the GHCI correctly. + * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error + * to the guest. */ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { - ret = TDVMCALL_STATUS_INVALID_OPERAND; + ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; goto error; } @@ -1476,7 +1478,7 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) break; } - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); return 1; } From cf207eac06f661fb692f405d5ab8230df884ee52 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:20 +0800 Subject: [PATCH 223/284] KVM: TDX: Handle TDG.VP.VMCALL Handle TDVMCALL for GetQuote to generate a TD-Quote. GetQuote is a doorbell-like interface used by TDX guests to request VMM to generate a TD-Quote signed by a service hosting TD-Quoting Enclave operating on the host. A TDX guest passes a TD Report (TDREPORT_STRUCT) in a shared-memory area as parameter. Host VMM can access it and queue the operation for a service hosting TD-Quoting enclave. When completed, the Quote is returned via the same shared-memory area. KVM only checks the GPA from the TDX guest has the shared-bit set and drops the shared-bit before exiting to userspace to avoid bleeding the shared-bit into KVM's exit ABI. KVM forwards the request to userspace VMM (e.g. QEMU) and userspace VMM queues the operation asynchronously. KVM sets the return code according to the 'ret' field set by userspace to notify the TDX guest whether the request has been queued successfully or not. When the request has been queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. Add KVM_EXIT_TDX as a new exit reason to userspace. Userspace is required to handle the KVM exit reason as the initial support for TDX, by reentering KVM to ensure that the TDVMCALL is complete. While at it, add a note that KVM_EXIT_HYPERCALL also requires reentry with KVM_RUN. Signed-off-by: Binbin Wu Tested-by: Mikko Ylinen Acked-by: Kai Huang [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 49 +++++++++++++++++++++++++++++++++- arch/x86/kvm/vmx/tdx.c | 32 ++++++++++++++++++++++ include/uapi/linux/kvm.h | 17 ++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 1bd2d42e6424..115ec3c2b641 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6645,7 +6645,8 @@ to the byte array. .. note:: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN, - KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding + KVM_EXIT_EPR, KVM_EXIT_HYPERCALL, KVM_EXIT_TDX, + KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. @@ -7174,6 +7175,52 @@ The valid value for 'flags' is: - KVM_NOTIFY_CONTEXT_INVALID -- the VM context is corrupted and not valid in VMCS. It would run into unknown result if resume the target VM. +:: + + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + u64 ret; + u64 data[5]; + } unknown; + struct { + u64 ret; + u64 gpa; + u64 size; + } get_quote; + }; + } tdx; + +Process a TDVMCALL from the guest. KVM forwards select TDVMCALL based +on the Guest-Hypervisor Communication Interface (GHCI) specification; +KVM bridges these requests to the userspace VMM with minimal changes, +placing the inputs in the union and copying them back to the guest +on re-entry. + +Flags are currently always zero, whereas ``nr`` contains the TDVMCALL +number from register R11. The remaining field of the union provide the +inputs and outputs of the TDVMCALL. Currently the following values of +``nr`` are defined: + +* ``TDVMCALL_GET_QUOTE``: the guest has requested to generate a TD-Quote +signed by a service hosting TD-Quoting Enclave operating on the host. +Parameters and return value are in the ``get_quote`` field of the union. +The ``gpa`` field and ``size`` specify the guest physical address +(without the shared bit set) and the size of a shared-memory buffer, in +which the TDX guest passes a TD Report. The ``ret`` field represents +the return value of the GetQuote request. When the request has been +queued successfully, the TDX guest can poll the status field in the +shared-memory area to check whether the Quote generation is completed or +not. When completed, the generated Quote is returned via the same buffer. + +KVM may add support for more values in the future that may cause a userspace +exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, +it will enter with output fields already valid; in the common case, the +``unknown.ret`` field of the union will be ``TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED``. +Userspace need not do anything if it does not wish to support a TDVMCALL. :: /* Fix the size of the union. */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 5d100c240ab3..b619a3478983 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1465,6 +1465,36 @@ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) return 1; } +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + static int handle_tdvmcall(struct kvm_vcpu *vcpu) { switch (tdvmcall_leaf(vcpu)) { @@ -1474,6 +1504,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) return tdx_report_fatal_error(vcpu); case TDVMCALL_GET_TD_VM_CALL_INFO: return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); default: break; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d00b85cb168c..e23e7286ad1a 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -178,6 +178,7 @@ struct kvm_xen_exit { #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_TDX 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -447,6 +448,22 @@ struct kvm_run { __u64 gpa; __u64 size; } memory_fault; + /* KVM_EXIT_TDX */ + struct { + __u64 flags; + __u64 nr; + union { + struct { + __u64 ret; + __u64 data[5]; + } unknown; + struct { + __u64 ret; + __u64 gpa; + __u64 size; + } get_quote; + }; + } tdx; /* Fix the size of the union. */ char padding[256]; }; From 25e8b1dd4883e6c251c3db5b347f3c8ae4ade921 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:21 +0800 Subject: [PATCH 224/284] KVM: TDX: Exit to userspace for GetTdVmCallInfo Exit to userspace for TDG.VP.VMCALL via KVM_EXIT_TDX, to allow userspace to provide information about the support of TDVMCALLs when r12 is 1 for the TDVMCALLs beyond the GHCI base API. GHCI spec defines the GHCI base TDVMCALLs: , , , , <#VE.RequestMMIO>, , , and . They must be supported by VMM to support TDX guests. For GetTdVmCallInfo - When leaf (r12) to enumerate TDVMCALL functionality is set to 0, successful execution indicates all GHCI base TDVMCALLs listed above are supported. Update the KVM TDX document with the set of the GHCI base APIs. - When leaf (r12) to enumerate TDVMCALL functionality is set to 1, it indicates the TDX guest is querying the supported TDVMCALLs beyond the GHCI base TDVMCALLs. Exit to userspace to let userspace set the TDVMCALL sub-function bit(s) accordingly to the leaf outputs. KVM could set the TDVMCALL bit(s) supported by itself when the TDVMCALLs don't need support from userspace after returning from userspace and before entering guest. Currently, no such TDVMCALLs implemented, KVM just sets the values returned from userspace. Suggested-by: Paolo Bonzini Signed-off-by: Binbin Wu [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 10 ++++++++ arch/x86/kvm/vmx/tdx.c | 43 ++++++++++++++++++++++++++++++---- include/uapi/linux/kvm.h | 5 ++++ 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 115ec3c2b641..9abf93ee5f65 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -7191,6 +7191,11 @@ The valid value for 'flags' is: u64 gpa; u64 size; } get_quote; + struct { + u64 ret; + u64 leaf; + u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; @@ -7216,6 +7221,11 @@ queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. +* ``TDVMCALL_GET_TD_VM_CALL_INFO``: the guest has requested the support +status of TDVMCALLs. The output values for the given leaf should be +placed in fields from ``r11`` to ``r14`` of the ``get_tdvmcall_info`` +field of the union. + KVM may add support for more values in the future that may cause a userspace exit, even without calls to ``KVM_ENABLE_CAP`` or similar. In this case, it will enter with output fields already valid; in the common case, the diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b619a3478983..1ad20c273f3b 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1451,18 +1451,53 @@ static int tdx_emulate_mmio(struct kvm_vcpu *vcpu) return 1; } +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace. + */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) { struct vcpu_tdx *tdx = to_tdx(vcpu); - if (tdx->vp_enter_args.r12) - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); - else { + switch (tdx->vp_enter_args.r12) { + case 0: tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; tdx->vp_enter_args.r13 = 0; tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; } - return 1; } static int tdx_complete_simple(struct kvm_vcpu *vcpu) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e23e7286ad1a..37891580d05d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -462,6 +462,11 @@ struct kvm_run { __u64 gpa; __u64 size; } get_quote; + struct { + __u64 ret; + __u64 leaf; + __u64 r11, r12, r13, r14; + } get_tdvmcall_info; }; } tdx; /* Fix the size of the union. */ From 33b6a1f155d627f5bd80c7485c598ce45428f74f Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 10 Jun 2025 19:34:48 +0200 Subject: [PATCH 225/284] rcu: Return early if callback is not specified Currently the call_rcu() API does not check whether a callback pointer is NULL. If NULL is passed, rcu_core() will try to invoke it, resulting in NULL pointer dereference and a kernel crash. To prevent this and improve debuggability, this patch adds a check for NULL and emits a kernel stack trace to help identify a faulty caller. Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes Signed-off-by: Joel Fernandes --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e8a4b720d7d2..14d4499c6fc3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3072,6 +3072,10 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) /* Misaligned rcu_head! */ WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); + /* Avoid NULL dereference if callback is NULL. */ + if (WARN_ON_ONCE(!func)) + return; + if (debug_rcu_head_queue(head)) { /* * Probable double call_rcu(), so leak the callback. From 3085ef9d9e7ab5ae4cddbe809e2e3b8dc11cdc75 Mon Sep 17 00:00:00 2001 From: Shiji Yang Date: Wed, 18 Jun 2025 23:07:43 +0800 Subject: [PATCH 226/284] irqchip/ath79-misc: Fix missing prototypes warnings ath79_misc_irq_init() was defined but unused since commit 51fa4f8912c0 ("MIPS: ath79: drop legacy IRQ code"), so it's time to drop it. The build also warns about a missing prototype of get_c0_perfcount_int(). Remove the stale leftover function and add the missing include. Signed-off-by: Shiji Yang Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/OSBPR01MB167032D2017645200787AAEBBC72A@OSBPR01MB1670.jpnprd01.prod.outlook.com --- drivers/irqchip/irq-ath79-misc.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/drivers/irqchip/irq-ath79-misc.c b/drivers/irqchip/irq-ath79-misc.c index 268cc18b781f..258b8e9a2d57 100644 --- a/drivers/irqchip/irq-ath79-misc.c +++ b/drivers/irqchip/irq-ath79-misc.c @@ -15,6 +15,8 @@ #include #include +#include + #define AR71XX_RESET_REG_MISC_INT_STATUS 0 #define AR71XX_RESET_REG_MISC_INT_ENABLE 4 @@ -177,21 +179,3 @@ static int __init ar7240_misc_intc_of_init( IRQCHIP_DECLARE(ar7240_misc_intc, "qca,ar7240-misc-intc", ar7240_misc_intc_of_init); - -void __init ath79_misc_irq_init(void __iomem *regs, int irq, - int irq_base, bool is_ar71xx) -{ - struct irq_domain *domain; - - if (is_ar71xx) - ath79_misc_irq_chip.irq_mask_ack = ar71xx_misc_irq_mask; - else - ath79_misc_irq_chip.irq_ack = ar724x_misc_irq_ack; - - domain = irq_domain_create_legacy(NULL, ATH79_MISC_IRQ_COUNT, - irq_base, 0, &misc_irq_domain_ops, regs); - if (!domain) - panic("Failed to create MISC irqdomain"); - - ath79_misc_intc_domain_init(domain, irq); -} From 0b39b055b5b48cbbdf5746a1ca6e3f6b0221e537 Mon Sep 17 00:00:00 2001 From: Xiaowei Li Date: Fri, 20 Jun 2025 10:27:02 +0800 Subject: [PATCH 227/284] net: usb: qmi_wwan: add SIMCom 8230C composition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for SIMCom 8230C which is based on Qualcomm SDX35 chip. 0x9071: tty (DM) + tty (NMEA) + tty (AT) + rmnet T: Bus=01 Lev=01 Prnt=01 Port=05 Cnt=02 Dev#= 8 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1e0e ProdID=9071 Rev= 5.15 S: Manufacturer=SIMCOM S: Product=SDXBAAGHA-IDP _SN:D744C4C5 S: SerialNumber=0123456789ABCDEF C:* #Ifs= 5 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=84(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=83(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=86(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=85(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=42 Prot=01 Driver=none E: Ad=05(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=87(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Xiaowei Li Acked-by: Bjørn Mork Link: https://patch.msgid.link/tencent_21D781FAA4969FEACA6ABB460362B52C9409@qq.com Signed-off-by: Jakub Kicinski --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index b586b1c13a47..f5647ee0adde 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1426,6 +1426,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x22de, 0x9051, 2)}, /* Hucom Wireless HM-211S/K */ {QMI_FIXED_INTF(0x22de, 0x9061, 3)}, /* WeTelecom WPD-600N */ {QMI_QUIRK_SET_DTR(0x1e0e, 0x9001, 5)}, /* SIMCom 7100E, 7230E, 7600E ++ */ + {QMI_QUIRK_SET_DTR(0x1e0e, 0x9071, 3)}, /* SIMCom 8230C ++ */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)}, /* Quectel EC21 Mini PCIe */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0191, 4)}, /* Quectel EG91 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x0195, 4)}, /* Quectel EG95 */ From 714db279942b1fb9b97c4243e186825a96750239 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 19 Jun 2025 14:16:07 -0700 Subject: [PATCH 228/284] CREDITS: Add entry for Shannon Nelson I'm retiring and have already had my name removed from MAINTAINERS. A couple of folks kindly suggested I should have an entry here. Signed-off-by: Shannon Nelson Reviewed-by: Simon Horman Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250619211607.1244217-1-sln@onemain.com Signed-off-by: Jakub Kicinski --- CREDITS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CREDITS b/CREDITS index 45446ae322ec..c30b75f9a732 100644 --- a/CREDITS +++ b/CREDITS @@ -2981,6 +2981,11 @@ S: 521 Pleasant Valley Road S: Potsdam, New York 13676 S: USA +N: Shannon Nelson +E: sln@onemain.com +D: Worked on several network drivers including +D: ixgbe, i40e, ionic, pds_core, pds_vdpa, pds_fwctl + N: Dave Neuer E: dave.neuer@pobox.com D: Helped implement support for Compaq's H31xx series iPAQs From 34331d7beed7576acfc98e991c39738b96162499 Mon Sep 17 00:00:00 2001 From: zhangjian Date: Thu, 19 Jun 2025 09:18:29 +0800 Subject: [PATCH 229/284] smb: client: fix first command failure during re-negotiation after fabc4ed200f9, server_unresponsive add a condition to check whether client need to reconnect depending on server->lstrp. When client failed to reconnect for some time and abort connection, server->lstrp is updated for the last time. In the following scene, server->lstrp is too old. This cause next command failure in re-negotiation rather than waiting for re-negotiation done. 1. mount -t cifs -o username=Everyone,echo_internal=10 //$server_ip/export /mnt 2. ssh $server_ip "echo b > /proc/sysrq-trigger &" 3. ls /mnt 4. sleep 21s 5. ssh $server_ip "service firewalld stop" 6. ls # return EHOSTDOWN If the interval between 5 and 6 is too small, 6 may trigger sending negotiation request. Before backgrounding cifsd thread try to receive negotiation response from server in cifs_readv_from_socket, server_unresponsive may trigger cifs_reconnect which cause 6 to be failed: ls thread ---------------- smb2_negotiate server->tcpStatus = CifsInNegotiate compound_send_recv wait_for_compound_request cifsd thread ---------------- cifs_readv_from_socket server_unresponsive server->tcpStatus == CifsInNegotiate && jiffies > server->lstrp + 20s cifs_reconnect cifs_abort_connection: mid_state = MID_RETRY_NEEDED ls thread ---------------- cifs_sync_mid_result return EAGAIN smb2_negotiate return EHOSTDOWN Though server->lstrp means last server response time, it is updated in cifs_abort_connection and cifs_get_tcp_session. We can also update server->lstrp before switching into CifsInNegotiate state to avoid failure in 6. Fixes: 7ccc1465465d ("smb: client: fix hang in wait_for_response() for negproto") Acked-by: Paulo Alcantara (Red Hat) Acked-by: Meetakshi Setiya Signed-off-by: zhangjian Signed-off-by: Steve French --- fs/smb/client/connect.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index c4fb80b37738..c48869c29e15 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -4199,6 +4199,7 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, return 0; } + server->lstrp = jiffies; server->tcpStatus = CifsInNegotiate; spin_unlock(&server->srv_lock); From a379a8a2a0032e12e7ef397197c9c2ad011588d6 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 18 Jun 2025 18:51:40 +0200 Subject: [PATCH 230/284] smb: client: fix max_sge overflow in smb_extract_folioq_to_rdma() This fixes the following problem: [ 749.901015] [ T8673] run fstests cifs/001 at 2025-06-17 09:40:30 [ 750.346409] [ T9870] ================================================================== [ 750.346814] [ T9870] BUG: KASAN: slab-out-of-bounds in smb_set_sge+0x2cc/0x3b0 [cifs] [ 750.347330] [ T9870] Write of size 8 at addr ffff888011082890 by task xfs_io/9870 [ 750.347705] [ T9870] [ 750.348077] [ T9870] CPU: 0 UID: 0 PID: 9870 Comm: xfs_io Kdump: loaded Not tainted 6.16.0-rc2-metze.02+ #1 PREEMPT(voluntary) [ 750.348082] [ T9870] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 750.348085] [ T9870] Call Trace: [ 750.348086] [ T9870] [ 750.348088] [ T9870] dump_stack_lvl+0x76/0xa0 [ 750.348106] [ T9870] print_report+0xd1/0x640 [ 750.348116] [ T9870] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 750.348120] [ T9870] ? kasan_complete_mode_report_info+0x26/0x210 [ 750.348124] [ T9870] kasan_report+0xe7/0x130 [ 750.348128] [ T9870] ? smb_set_sge+0x2cc/0x3b0 [cifs] [ 750.348262] [ T9870] ? smb_set_sge+0x2cc/0x3b0 [cifs] [ 750.348377] [ T9870] __asan_report_store8_noabort+0x17/0x30 [ 750.348381] [ T9870] smb_set_sge+0x2cc/0x3b0 [cifs] [ 750.348496] [ T9870] smbd_post_send_iter+0x1990/0x3070 [cifs] [ 750.348625] [ T9870] ? __pfx_smbd_post_send_iter+0x10/0x10 [cifs] [ 750.348741] [ T9870] ? update_stack_state+0x2a0/0x670 [ 750.348749] [ T9870] ? cifs_flush+0x153/0x320 [cifs] [ 750.348870] [ T9870] ? cifs_flush+0x153/0x320 [cifs] [ 750.348990] [ T9870] ? update_stack_state+0x2a0/0x670 [ 750.348995] [ T9870] smbd_send+0x58c/0x9c0 [cifs] [ 750.349117] [ T9870] ? __pfx_smbd_send+0x10/0x10 [cifs] [ 750.349231] [ T9870] ? unwind_get_return_address+0x65/0xb0 [ 750.349235] [ T9870] ? __pfx_stack_trace_consume_entry+0x10/0x10 [ 750.349242] [ T9870] ? arch_stack_walk+0xa7/0x100 [ 750.349250] [ T9870] ? stack_trace_save+0x92/0xd0 [ 750.349254] [ T9870] __smb_send_rqst+0x931/0xec0 [cifs] [ 750.349374] [ T9870] ? kernel_text_address+0x173/0x190 [ 750.349379] [ T9870] ? kasan_save_stack+0x39/0x70 [ 750.349382] [ T9870] ? kasan_save_track+0x18/0x70 [ 750.349385] [ T9870] ? __kasan_slab_alloc+0x9d/0xa0 [ 750.349389] [ T9870] ? __pfx___smb_send_rqst+0x10/0x10 [cifs] [ 750.349508] [ T9870] ? smb2_mid_entry_alloc+0xb4/0x7e0 [cifs] [ 750.349626] [ T9870] ? cifs_call_async+0x277/0xb00 [cifs] [ 750.349746] [ T9870] ? cifs_issue_write+0x256/0x610 [cifs] [ 750.349867] [ T9870] ? netfs_do_issue_write+0xc2/0x340 [netfs] [ 750.349900] [ T9870] ? netfs_advance_write+0x45b/0x1270 [netfs] [ 750.349929] [ T9870] ? netfs_write_folio+0xd6c/0x1be0 [netfs] [ 750.349958] [ T9870] ? netfs_writepages+0x2e9/0xa80 [netfs] [ 750.349987] [ T9870] ? do_writepages+0x21f/0x590 [ 750.349993] [ T9870] ? filemap_fdatawrite_wbc+0xe1/0x140 [ 750.349997] [ T9870] ? entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 750.350002] [ T9870] smb_send_rqst+0x22e/0x2f0 [cifs] [ 750.350131] [ T9870] ? __pfx_smb_send_rqst+0x10/0x10 [cifs] [ 750.350255] [ T9870] ? local_clock_noinstr+0xe/0xd0 [ 750.350261] [ T9870] ? kasan_save_alloc_info+0x37/0x60 [ 750.350268] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.350271] [ T9870] ? _raw_spin_lock+0x81/0xf0 [ 750.350275] [ T9870] ? __pfx__raw_spin_lock+0x10/0x10 [ 750.350278] [ T9870] ? smb2_setup_async_request+0x293/0x580 [cifs] [ 750.350398] [ T9870] cifs_call_async+0x477/0xb00 [cifs] [ 750.350518] [ T9870] ? __pfx_smb2_writev_callback+0x10/0x10 [cifs] [ 750.350636] [ T9870] ? __pfx_cifs_call_async+0x10/0x10 [cifs] [ 750.350756] [ T9870] ? __pfx__raw_spin_lock+0x10/0x10 [ 750.350760] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.350763] [ T9870] ? __smb2_plain_req_init+0x933/0x1090 [cifs] [ 750.350891] [ T9870] smb2_async_writev+0x15ff/0x2460 [cifs] [ 750.351008] [ T9870] ? sched_clock_noinstr+0x9/0x10 [ 750.351012] [ T9870] ? local_clock_noinstr+0xe/0xd0 [ 750.351018] [ T9870] ? __pfx_smb2_async_writev+0x10/0x10 [cifs] [ 750.351144] [ T9870] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 750.351150] [ T9870] ? _raw_spin_unlock+0xe/0x40 [ 750.351154] [ T9870] ? cifs_pick_channel+0x242/0x370 [cifs] [ 750.351275] [ T9870] cifs_issue_write+0x256/0x610 [cifs] [ 750.351554] [ T9870] ? cifs_issue_write+0x256/0x610 [cifs] [ 750.351677] [ T9870] netfs_do_issue_write+0xc2/0x340 [netfs] [ 750.351710] [ T9870] netfs_advance_write+0x45b/0x1270 [netfs] [ 750.351740] [ T9870] ? rolling_buffer_append+0x12d/0x440 [netfs] [ 750.351769] [ T9870] netfs_write_folio+0xd6c/0x1be0 [netfs] [ 750.351798] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.351804] [ T9870] netfs_writepages+0x2e9/0xa80 [netfs] [ 750.351835] [ T9870] ? __pfx_netfs_writepages+0x10/0x10 [netfs] [ 750.351864] [ T9870] ? exit_files+0xab/0xe0 [ 750.351867] [ T9870] ? do_exit+0x148f/0x2980 [ 750.351871] [ T9870] ? do_group_exit+0xb5/0x250 [ 750.351874] [ T9870] ? arch_do_signal_or_restart+0x92/0x630 [ 750.351879] [ T9870] ? exit_to_user_mode_loop+0x98/0x170 [ 750.351882] [ T9870] ? do_syscall_64+0x2cf/0xd80 [ 750.351886] [ T9870] ? entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 750.351890] [ T9870] do_writepages+0x21f/0x590 [ 750.351894] [ T9870] ? __pfx_do_writepages+0x10/0x10 [ 750.351897] [ T9870] filemap_fdatawrite_wbc+0xe1/0x140 [ 750.351901] [ T9870] __filemap_fdatawrite_range+0xba/0x100 [ 750.351904] [ T9870] ? __pfx___filemap_fdatawrite_range+0x10/0x10 [ 750.351912] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.351916] [ T9870] filemap_write_and_wait_range+0x7d/0xf0 [ 750.351920] [ T9870] cifs_flush+0x153/0x320 [cifs] [ 750.352042] [ T9870] filp_flush+0x107/0x1a0 [ 750.352046] [ T9870] filp_close+0x14/0x30 [ 750.352049] [ T9870] put_files_struct.part.0+0x126/0x2a0 [ 750.352053] [ T9870] ? __pfx__raw_spin_lock+0x10/0x10 [ 750.352058] [ T9870] exit_files+0xab/0xe0 [ 750.352061] [ T9870] do_exit+0x148f/0x2980 [ 750.352065] [ T9870] ? __pfx_do_exit+0x10/0x10 [ 750.352069] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.352072] [ T9870] ? _raw_spin_lock_irq+0x8a/0xf0 [ 750.352076] [ T9870] do_group_exit+0xb5/0x250 [ 750.352080] [ T9870] get_signal+0x22d3/0x22e0 [ 750.352086] [ T9870] ? __pfx_get_signal+0x10/0x10 [ 750.352089] [ T9870] ? fpregs_assert_state_consistent+0x68/0x100 [ 750.352101] [ T9870] ? folio_add_lru+0xda/0x120 [ 750.352105] [ T9870] arch_do_signal_or_restart+0x92/0x630 [ 750.352109] [ T9870] ? __pfx_arch_do_signal_or_restart+0x10/0x10 [ 750.352115] [ T9870] exit_to_user_mode_loop+0x98/0x170 [ 750.352118] [ T9870] do_syscall_64+0x2cf/0xd80 [ 750.352123] [ T9870] ? __kasan_check_read+0x11/0x20 [ 750.352126] [ T9870] ? count_memcg_events+0x1b4/0x420 [ 750.352132] [ T9870] ? handle_mm_fault+0x148/0x690 [ 750.352136] [ T9870] ? _raw_spin_lock_irq+0x8a/0xf0 [ 750.352140] [ T9870] ? __kasan_check_read+0x11/0x20 [ 750.352143] [ T9870] ? fpregs_assert_state_consistent+0x68/0x100 [ 750.352146] [ T9870] ? irqentry_exit_to_user_mode+0x2e/0x250 [ 750.352151] [ T9870] ? irqentry_exit+0x43/0x50 [ 750.352154] [ T9870] ? exc_page_fault+0x75/0xe0 [ 750.352160] [ T9870] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 750.352163] [ T9870] RIP: 0033:0x7858c94ab6e2 [ 750.352167] [ T9870] Code: Unable to access opcode bytes at 0x7858c94ab6b8. [ 750.352175] [ T9870] RSP: 002b:00007858c9248ce8 EFLAGS: 00000246 ORIG_RAX: 0000000000000022 [ 750.352179] [ T9870] RAX: fffffffffffffdfe RBX: 00007858c92496c0 RCX: 00007858c94ab6e2 [ 750.352182] [ T9870] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 750.352184] [ T9870] RBP: 00007858c9248d10 R08: 0000000000000000 R09: 0000000000000000 [ 750.352185] [ T9870] R10: 0000000000000000 R11: 0000000000000246 R12: fffffffffffffde0 [ 750.352187] [ T9870] R13: 0000000000000020 R14: 0000000000000002 R15: 00007ffc072d2230 [ 750.352191] [ T9870] [ 750.352195] [ T9870] [ 750.395206] [ T9870] Allocated by task 9870 on cpu 0 at 750.346406s: [ 750.395523] [ T9870] kasan_save_stack+0x39/0x70 [ 750.395532] [ T9870] kasan_save_track+0x18/0x70 [ 750.395536] [ T9870] kasan_save_alloc_info+0x37/0x60 [ 750.395539] [ T9870] __kasan_slab_alloc+0x9d/0xa0 [ 750.395543] [ T9870] kmem_cache_alloc_noprof+0x13c/0x3f0 [ 750.395548] [ T9870] mempool_alloc_slab+0x15/0x20 [ 750.395553] [ T9870] mempool_alloc_noprof+0x135/0x340 [ 750.395557] [ T9870] smbd_post_send_iter+0x63e/0x3070 [cifs] [ 750.395694] [ T9870] smbd_send+0x58c/0x9c0 [cifs] [ 750.395819] [ T9870] __smb_send_rqst+0x931/0xec0 [cifs] [ 750.395950] [ T9870] smb_send_rqst+0x22e/0x2f0 [cifs] [ 750.396081] [ T9870] cifs_call_async+0x477/0xb00 [cifs] [ 750.396232] [ T9870] smb2_async_writev+0x15ff/0x2460 [cifs] [ 750.396359] [ T9870] cifs_issue_write+0x256/0x610 [cifs] [ 750.396492] [ T9870] netfs_do_issue_write+0xc2/0x340 [netfs] [ 750.396544] [ T9870] netfs_advance_write+0x45b/0x1270 [netfs] [ 750.396576] [ T9870] netfs_write_folio+0xd6c/0x1be0 [netfs] [ 750.396608] [ T9870] netfs_writepages+0x2e9/0xa80 [netfs] [ 750.396639] [ T9870] do_writepages+0x21f/0x590 [ 750.396643] [ T9870] filemap_fdatawrite_wbc+0xe1/0x140 [ 750.396647] [ T9870] __filemap_fdatawrite_range+0xba/0x100 [ 750.396651] [ T9870] filemap_write_and_wait_range+0x7d/0xf0 [ 750.396656] [ T9870] cifs_flush+0x153/0x320 [cifs] [ 750.396787] [ T9870] filp_flush+0x107/0x1a0 [ 750.396791] [ T9870] filp_close+0x14/0x30 [ 750.396795] [ T9870] put_files_struct.part.0+0x126/0x2a0 [ 750.396800] [ T9870] exit_files+0xab/0xe0 [ 750.396803] [ T9870] do_exit+0x148f/0x2980 [ 750.396808] [ T9870] do_group_exit+0xb5/0x250 [ 750.396813] [ T9870] get_signal+0x22d3/0x22e0 [ 750.396817] [ T9870] arch_do_signal_or_restart+0x92/0x630 [ 750.396822] [ T9870] exit_to_user_mode_loop+0x98/0x170 [ 750.396827] [ T9870] do_syscall_64+0x2cf/0xd80 [ 750.396832] [ T9870] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 750.396836] [ T9870] [ 750.397150] [ T9870] The buggy address belongs to the object at ffff888011082800 which belongs to the cache smbd_request_0000000008f3bd7b of size 144 [ 750.397798] [ T9870] The buggy address is located 0 bytes to the right of allocated 144-byte region [ffff888011082800, ffff888011082890) [ 750.398469] [ T9870] [ 750.398800] [ T9870] The buggy address belongs to the physical page: [ 750.399141] [ T9870] page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x11082 [ 750.399148] [ T9870] flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) [ 750.399155] [ T9870] page_type: f5(slab) [ 750.399161] [ T9870] raw: 000fffffc0000000 ffff888022d65640 dead000000000122 0000000000000000 [ 750.399165] [ T9870] raw: 0000000000000000 0000000080100010 00000000f5000000 0000000000000000 [ 750.399169] [ T9870] page dumped because: kasan: bad access detected [ 750.399172] [ T9870] [ 750.399505] [ T9870] Memory state around the buggy address: [ 750.399863] [ T9870] ffff888011082780: fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 750.400247] [ T9870] ffff888011082800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 750.400618] [ T9870] >ffff888011082880: 00 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 750.400982] [ T9870] ^ [ 750.401370] [ T9870] ffff888011082900: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 750.401774] [ T9870] ffff888011082980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 750.402171] [ T9870] ================================================================== [ 750.402696] [ T9870] Disabling lock debugging due to kernel taint [ 750.403202] [ T9870] BUG: unable to handle page fault for address: ffff8880110a2000 [ 750.403797] [ T9870] #PF: supervisor write access in kernel mode [ 750.404204] [ T9870] #PF: error_code(0x0003) - permissions violation [ 750.404581] [ T9870] PGD 5ce01067 P4D 5ce01067 PUD 5ce02067 PMD 78aa063 PTE 80000000110a2021 [ 750.404969] [ T9870] Oops: Oops: 0003 [#1] SMP KASAN PTI [ 750.405394] [ T9870] CPU: 0 UID: 0 PID: 9870 Comm: xfs_io Kdump: loaded Tainted: G B 6.16.0-rc2-metze.02+ #1 PREEMPT(voluntary) [ 750.406510] [ T9870] Tainted: [B]=BAD_PAGE [ 750.406967] [ T9870] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 750.407440] [ T9870] RIP: 0010:smb_set_sge+0x15c/0x3b0 [cifs] [ 750.408065] [ T9870] Code: 48 83 f8 ff 0f 84 b0 00 00 00 48 ba 00 00 00 00 00 fc ff df 4c 89 e1 48 c1 e9 03 80 3c 11 00 0f 85 69 01 00 00 49 8d 7c 24 08 <49> 89 04 24 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 0f [ 750.409283] [ T9870] RSP: 0018:ffffc90005e2e758 EFLAGS: 00010246 [ 750.409803] [ T9870] RAX: ffff888036c53400 RBX: ffffc90005e2e878 RCX: 1ffff11002214400 [ 750.410323] [ T9870] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff8880110a2008 [ 750.411217] [ T9870] RBP: ffffc90005e2e798 R08: 0000000000000001 R09: 0000000000000400 [ 750.411770] [ T9870] R10: ffff888011082800 R11: 0000000000000000 R12: ffff8880110a2000 [ 750.412325] [ T9870] R13: 0000000000000000 R14: ffffc90005e2e888 R15: ffff88801a4b6000 [ 750.412901] [ T9870] FS: 0000000000000000(0000) GS:ffff88812bc68000(0000) knlGS:0000000000000000 [ 750.413477] [ T9870] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 750.414077] [ T9870] CR2: ffff8880110a2000 CR3: 000000005b0a6005 CR4: 00000000000726f0 [ 750.414654] [ T9870] Call Trace: [ 750.415211] [ T9870] [ 750.415748] [ T9870] smbd_post_send_iter+0x1990/0x3070 [cifs] [ 750.416449] [ T9870] ? __pfx_smbd_post_send_iter+0x10/0x10 [cifs] [ 750.417128] [ T9870] ? update_stack_state+0x2a0/0x670 [ 750.417685] [ T9870] ? cifs_flush+0x153/0x320 [cifs] [ 750.418380] [ T9870] ? cifs_flush+0x153/0x320 [cifs] [ 750.419055] [ T9870] ? update_stack_state+0x2a0/0x670 [ 750.419624] [ T9870] smbd_send+0x58c/0x9c0 [cifs] [ 750.420297] [ T9870] ? __pfx_smbd_send+0x10/0x10 [cifs] [ 750.420936] [ T9870] ? unwind_get_return_address+0x65/0xb0 [ 750.421456] [ T9870] ? __pfx_stack_trace_consume_entry+0x10/0x10 [ 750.421954] [ T9870] ? arch_stack_walk+0xa7/0x100 [ 750.422460] [ T9870] ? stack_trace_save+0x92/0xd0 [ 750.422948] [ T9870] __smb_send_rqst+0x931/0xec0 [cifs] [ 750.423579] [ T9870] ? kernel_text_address+0x173/0x190 [ 750.424056] [ T9870] ? kasan_save_stack+0x39/0x70 [ 750.424813] [ T9870] ? kasan_save_track+0x18/0x70 [ 750.425323] [ T9870] ? __kasan_slab_alloc+0x9d/0xa0 [ 750.425831] [ T9870] ? __pfx___smb_send_rqst+0x10/0x10 [cifs] [ 750.426548] [ T9870] ? smb2_mid_entry_alloc+0xb4/0x7e0 [cifs] [ 750.427231] [ T9870] ? cifs_call_async+0x277/0xb00 [cifs] [ 750.427882] [ T9870] ? cifs_issue_write+0x256/0x610 [cifs] [ 750.428909] [ T9870] ? netfs_do_issue_write+0xc2/0x340 [netfs] [ 750.429425] [ T9870] ? netfs_advance_write+0x45b/0x1270 [netfs] [ 750.429882] [ T9870] ? netfs_write_folio+0xd6c/0x1be0 [netfs] [ 750.430345] [ T9870] ? netfs_writepages+0x2e9/0xa80 [netfs] [ 750.430809] [ T9870] ? do_writepages+0x21f/0x590 [ 750.431239] [ T9870] ? filemap_fdatawrite_wbc+0xe1/0x140 [ 750.431652] [ T9870] ? entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 750.432041] [ T9870] smb_send_rqst+0x22e/0x2f0 [cifs] [ 750.432586] [ T9870] ? __pfx_smb_send_rqst+0x10/0x10 [cifs] [ 750.433108] [ T9870] ? local_clock_noinstr+0xe/0xd0 [ 750.433482] [ T9870] ? kasan_save_alloc_info+0x37/0x60 [ 750.433855] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.434214] [ T9870] ? _raw_spin_lock+0x81/0xf0 [ 750.434561] [ T9870] ? __pfx__raw_spin_lock+0x10/0x10 [ 750.434903] [ T9870] ? smb2_setup_async_request+0x293/0x580 [cifs] [ 750.435394] [ T9870] cifs_call_async+0x477/0xb00 [cifs] [ 750.435892] [ T9870] ? __pfx_smb2_writev_callback+0x10/0x10 [cifs] [ 750.436388] [ T9870] ? __pfx_cifs_call_async+0x10/0x10 [cifs] [ 750.436881] [ T9870] ? __pfx__raw_spin_lock+0x10/0x10 [ 750.437237] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.437579] [ T9870] ? __smb2_plain_req_init+0x933/0x1090 [cifs] [ 750.438062] [ T9870] smb2_async_writev+0x15ff/0x2460 [cifs] [ 750.438557] [ T9870] ? sched_clock_noinstr+0x9/0x10 [ 750.438906] [ T9870] ? local_clock_noinstr+0xe/0xd0 [ 750.439293] [ T9870] ? __pfx_smb2_async_writev+0x10/0x10 [cifs] [ 750.439786] [ T9870] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 750.440143] [ T9870] ? _raw_spin_unlock+0xe/0x40 [ 750.440495] [ T9870] ? cifs_pick_channel+0x242/0x370 [cifs] [ 750.440989] [ T9870] cifs_issue_write+0x256/0x610 [cifs] [ 750.441492] [ T9870] ? cifs_issue_write+0x256/0x610 [cifs] [ 750.441987] [ T9870] netfs_do_issue_write+0xc2/0x340 [netfs] [ 750.442387] [ T9870] netfs_advance_write+0x45b/0x1270 [netfs] [ 750.442969] [ T9870] ? rolling_buffer_append+0x12d/0x440 [netfs] [ 750.443376] [ T9870] netfs_write_folio+0xd6c/0x1be0 [netfs] [ 750.443768] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.444145] [ T9870] netfs_writepages+0x2e9/0xa80 [netfs] [ 750.444541] [ T9870] ? __pfx_netfs_writepages+0x10/0x10 [netfs] [ 750.444936] [ T9870] ? exit_files+0xab/0xe0 [ 750.445312] [ T9870] ? do_exit+0x148f/0x2980 [ 750.445672] [ T9870] ? do_group_exit+0xb5/0x250 [ 750.446028] [ T9870] ? arch_do_signal_or_restart+0x92/0x630 [ 750.446402] [ T9870] ? exit_to_user_mode_loop+0x98/0x170 [ 750.446762] [ T9870] ? do_syscall_64+0x2cf/0xd80 [ 750.447132] [ T9870] ? entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 750.447499] [ T9870] do_writepages+0x21f/0x590 [ 750.447859] [ T9870] ? __pfx_do_writepages+0x10/0x10 [ 750.448236] [ T9870] filemap_fdatawrite_wbc+0xe1/0x140 [ 750.448595] [ T9870] __filemap_fdatawrite_range+0xba/0x100 [ 750.448953] [ T9870] ? __pfx___filemap_fdatawrite_range+0x10/0x10 [ 750.449336] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.449697] [ T9870] filemap_write_and_wait_range+0x7d/0xf0 [ 750.450062] [ T9870] cifs_flush+0x153/0x320 [cifs] [ 750.450592] [ T9870] filp_flush+0x107/0x1a0 [ 750.450952] [ T9870] filp_close+0x14/0x30 [ 750.451322] [ T9870] put_files_struct.part.0+0x126/0x2a0 [ 750.451678] [ T9870] ? __pfx__raw_spin_lock+0x10/0x10 [ 750.452033] [ T9870] exit_files+0xab/0xe0 [ 750.452401] [ T9870] do_exit+0x148f/0x2980 [ 750.452751] [ T9870] ? __pfx_do_exit+0x10/0x10 [ 750.453109] [ T9870] ? __kasan_check_write+0x14/0x30 [ 750.453459] [ T9870] ? _raw_spin_lock_irq+0x8a/0xf0 [ 750.453787] [ T9870] do_group_exit+0xb5/0x250 [ 750.454082] [ T9870] get_signal+0x22d3/0x22e0 [ 750.454406] [ T9870] ? __pfx_get_signal+0x10/0x10 [ 750.454709] [ T9870] ? fpregs_assert_state_consistent+0x68/0x100 [ 750.455031] [ T9870] ? folio_add_lru+0xda/0x120 [ 750.455347] [ T9870] arch_do_signal_or_restart+0x92/0x630 [ 750.455656] [ T9870] ? __pfx_arch_do_signal_or_restart+0x10/0x10 [ 750.455967] [ T9870] exit_to_user_mode_loop+0x98/0x170 [ 750.456282] [ T9870] do_syscall_64+0x2cf/0xd80 [ 750.456591] [ T9870] ? __kasan_check_read+0x11/0x20 [ 750.456897] [ T9870] ? count_memcg_events+0x1b4/0x420 [ 750.457280] [ T9870] ? handle_mm_fault+0x148/0x690 [ 750.457616] [ T9870] ? _raw_spin_lock_irq+0x8a/0xf0 [ 750.457925] [ T9870] ? __kasan_check_read+0x11/0x20 [ 750.458297] [ T9870] ? fpregs_assert_state_consistent+0x68/0x100 [ 750.458672] [ T9870] ? irqentry_exit_to_user_mode+0x2e/0x250 [ 750.459191] [ T9870] ? irqentry_exit+0x43/0x50 [ 750.459600] [ T9870] ? exc_page_fault+0x75/0xe0 [ 750.460130] [ T9870] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 750.460570] [ T9870] RIP: 0033:0x7858c94ab6e2 [ 750.461206] [ T9870] Code: Unable to access opcode bytes at 0x7858c94ab6b8. [ 750.461780] [ T9870] RSP: 002b:00007858c9248ce8 EFLAGS: 00000246 ORIG_RAX: 0000000000000022 [ 750.462327] [ T9870] RAX: fffffffffffffdfe RBX: 00007858c92496c0 RCX: 00007858c94ab6e2 [ 750.462653] [ T9870] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 [ 750.462969] [ T9870] RBP: 00007858c9248d10 R08: 0000000000000000 R09: 0000000000000000 [ 750.463290] [ T9870] R10: 0000000000000000 R11: 0000000000000246 R12: fffffffffffffde0 [ 750.463640] [ T9870] R13: 0000000000000020 R14: 0000000000000002 R15: 00007ffc072d2230 [ 750.463965] [ T9870] [ 750.464285] [ T9870] Modules linked in: siw ib_uverbs ccm cmac nls_utf8 cifs cifs_arc4 nls_ucs2_utils rdma_cm iw_cm ib_cm ib_core cifs_md4 netfs softdog vboxsf vboxguest cpuid intel_rapl_msr intel_rapl_common intel_uncore_frequency_common intel_pmc_core pmt_telemetry pmt_class intel_pmc_ssram_telemetry intel_vsec polyval_clmulni ghash_clmulni_intel sha1_ssse3 aesni_intel rapl i2c_piix4 i2c_smbus joydev input_leds mac_hid sunrpc binfmt_misc kvm_intel kvm irqbypass sch_fq_codel efi_pstore nfnetlink vsock_loopback vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vsock vmw_vmci dmi_sysfs ip_tables x_tables autofs4 hid_generic vboxvideo usbhid drm_vram_helper psmouse vga16fb vgastate drm_ttm_helper serio_raw hid ahci libahci ttm pata_acpi video wmi [last unloaded: vboxguest] [ 750.467127] [ T9870] CR2: ffff8880110a2000 cc: Tom Talpey cc: linux-cifs@vger.kernel.org Reviewed-by: David Howells Reviewed-by: Tom Talpey Fixes: c45ebd636c32 ("cifs: Provide the capability to extract from ITER_FOLIOQ to RDMA SGEs") Signed-off-by: Stefan Metzmacher Signed-off-by: Steve French --- fs/smb/client/smbdirect.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 5ae847919da5..cbc85bca006f 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -2589,13 +2589,14 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, size_t fsize = folioq_folio_size(folioq, slot); if (offset < fsize) { - size_t part = umin(maxsize - ret, fsize - offset); + size_t part = umin(maxsize, fsize - offset); if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) return -EIO; offset += part; ret += part; + maxsize -= part; } if (offset >= fsize) { @@ -2610,7 +2611,7 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, slot = 0; } } - } while (rdma->nr_sge < rdma->max_sge || maxsize > 0); + } while (rdma->nr_sge < rdma->max_sge && maxsize > 0); iter->folioq = folioq; iter->folioq_slot = slot; From 2c4fd3d141465ef1afc8318c95e7b916d9a8c49c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 18 Jun 2025 16:39:47 +0100 Subject: [PATCH 231/284] cifs: Fix prepare_write to negotiate wsize if needed Fix cifs_prepare_write() to negotiate the wsize if it is unset. Reviewed-by: Shyam Prasad N Reviewed-by: Bharath SM Signed-off-by: David Howells cc: Paulo Alcantara cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-cifs@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 9835672267d2..e9212da32f01 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -52,6 +52,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr]; struct TCP_Server_Info *server; struct cifsFileInfo *open_file = req->cfile; + struct cifs_sb_info *cifs_sb = CIFS_SB(wdata->rreq->inode->i_sb); size_t wsize = req->rreq.wsize; int rc; @@ -63,6 +64,10 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses); wdata->server = server; + if (cifs_sb->ctx->wsize == 0) + cifs_negotiate_wsize(server, cifs_sb->ctx, + tlink_tcon(req->cfile->tlink)); + retry: if (open_file->invalidHandle) { rc = cifs_reopen_file(open_file, false); @@ -160,10 +165,9 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; - if (cifs_sb->ctx->rsize == 0) { + if (cifs_sb->ctx->rsize == 0) cifs_negotiate_rsize(server, cifs_sb->ctx, tlink_tcon(req->cfile->tlink)); - } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &size, &rdata->credits); From a2182743a8b4969481f64aec4908ff162e8a206c Mon Sep 17 00:00:00 2001 From: Paul Aurich Date: Wed, 20 Nov 2024 08:01:54 -0800 Subject: [PATCH 232/284] smb: Log an error when close_all_cached_dirs fails Under low-memory conditions, close_all_cached_dirs() can't move the dentries to a separate list to dput() them once the locks are dropped. This will result in a "Dentry still in use" error, so add an error message that makes it clear this is what happened: [ 495.281119] CIFS: VFS: \\otters.example.com\share Out of memory while dropping dentries [ 495.281595] ------------[ cut here ]------------ [ 495.281887] BUG: Dentry ffff888115531138{i=78,n=/} still in use (2) [unmount of cifs cifs] [ 495.282391] WARNING: CPU: 1 PID: 2329 at fs/dcache.c:1536 umount_check+0xc8/0xf0 Also, bail out of looping through all tcons as soon as a single allocation fails, since we're already in trouble, and kmalloc() attempts for subseqeuent tcons are likely to fail just like the first one did. Signed-off-by: Paul Aurich Acked-by: Bharath SM Suggested-by: Ruben Devos Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/cached_dir.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 5200a0f3cafc..368e870624da 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -509,8 +509,17 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { tmp_list = kmalloc(sizeof(*tmp_list), GFP_ATOMIC); - if (tmp_list == NULL) - break; + if (tmp_list == NULL) { + /* + * If the malloc() fails, we won't drop all + * dentries, and unmounting is likely to trigger + * a 'Dentry still in use' error. + */ + cifs_tcon_dbg(VFS, "Out of memory while dropping dentries\n"); + spin_unlock(&cfids->cfid_list_lock); + spin_unlock(&cifs_sb->tlink_tree_lock); + goto done; + } spin_lock(&cfid->fid_lock); tmp_list->dentry = cfid->dentry; cfid->dentry = NULL; @@ -522,6 +531,7 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); +done: list_for_each_entry_safe(tmp_list, q, &entry, entry) { list_del(&tmp_list->entry); dput(tmp_list->dentry); From 4eb11a34b72c86d559f437fdadc47e512bba41d2 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 19 Jun 2025 21:05:32 +0530 Subject: [PATCH 233/284] smb: Use loff_t for directory position in cached_dirents Change the pos field in struct cached_dirents from int to loff_t to support large directory offsets. This avoids overflow and matches kernel conventions for directory positions. Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cached_dir.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index bc8a812ff95f..a28f7cae3caa 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -26,7 +26,7 @@ struct cached_dirents { * open file instance. */ struct mutex de_mutex; - int pos; /* Expected ctx->pos */ + loff_t pos; /* Expected ctx->pos */ struct list_head entries; }; From 4d360cfe8cbac3b62bd9e1df9889bde8a9d5b1d7 Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 19 Jun 2025 21:05:33 +0530 Subject: [PATCH 234/284] smb: minor fix to use sizeof to initialize flags_string buffer Replaced hardcoded length with sizeof(flags_string). Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifs_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index c0196be0e65f..3fdf75737d43 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -1105,7 +1105,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, if ((count < 1) || (count > 11)) return -EINVAL; - memset(flags_string, 0, 12); + memset(flags_string, 0, sizeof(flags_string)); if (copy_from_user(flags_string, buffer, count)) return -EFAULT; From 27e9d5d021dbaa1211836d07a240078bf84b284e Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Thu, 19 Jun 2025 21:05:34 +0530 Subject: [PATCH 235/284] smb: minor fix to use SMB2_NTLMV2_SESSKEY_SIZE for auth_key size Replaced hardcoded value 16 with SMB2_NTLMV2_SESSKEY_SIZE in the auth_key definition and memcpy call. Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Bharath SM Signed-off-by: Steve French --- fs/smb/client/cifs_ioctl.h | 2 +- fs/smb/client/ioctl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h index 26327442e383..b51ce64fcccf 100644 --- a/fs/smb/client/cifs_ioctl.h +++ b/fs/smb/client/cifs_ioctl.h @@ -61,7 +61,7 @@ struct smb_query_info { struct smb3_key_debug_info { __u64 Suid; __u16 cipher_type; - __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 auth_key[SMB2_NTLMV2_SESSKEY_SIZE]; __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; } __packed; diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c index 56439da4f119..0a9935ce05a5 100644 --- a/fs/smb/client/ioctl.c +++ b/fs/smb/client/ioctl.c @@ -506,7 +506,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) le16_to_cpu(tcon->ses->server->cipher_type); pkey_inf.Suid = tcon->ses->Suid; memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, - 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + SMB2_NTLMV2_SESSKEY_SIZE); memcpy(pkey_inf.smb3decryptionkey, tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); memcpy(pkey_inf.smb3encryptionkey, From a6c23dac756b9541b33aa3bcd30f464df2879209 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Mon, 16 Jun 2025 07:51:36 -0500 Subject: [PATCH 236/284] i2c: k1: check for transfer error If spacemit_i2c_xfer_msg() times out waiting for a message transfer to complete, or if the hardware reports an error, it returns a negative error code (-ETIMEDOUT, -EAGAIN, -ENXIO. or -EIO). The sole caller of spacemit_i2c_xfer_msg() is spacemit_i2c_xfer(), which is the i2c_algorithm->xfer callback function. It currently does not save the value returned by spacemit_i2c_xfer_msg(). The result is that transfer errors go unreported, and a caller has no indication anything is wrong. When this code was out for review, the return value *was* checked in early versions. But for some reason, that assignment got dropped between versions 5 and 6 of the series, perhaps related to reworking the code to merge spacemit_i2c_xfer_core() into spacemit_i2c_xfer(). Simply assigning the value returned to "ret" fixes the problem. Fixes: 5ea558473fa31 ("i2c: spacemit: add support for SpacemiT K1 SoC") Signed-off-by: Alex Elder Cc: # v6.15+ Reviewed-by: Troy Mitchell Link: https://lore.kernel.org/r/20250616125137.1555453-1-elder@riscstar.com Signed-off-by: Andi Shyti Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-k1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-k1.c b/drivers/i2c/busses/i2c-k1.c index 5965b4cf6220..b68a21fff0b5 100644 --- a/drivers/i2c/busses/i2c-k1.c +++ b/drivers/i2c/busses/i2c-k1.c @@ -477,7 +477,7 @@ static int spacemit_i2c_xfer(struct i2c_adapter *adapt, struct i2c_msg *msgs, in ret = spacemit_i2c_wait_bus_idle(i2c); if (!ret) - spacemit_i2c_xfer_msg(i2c); + ret = spacemit_i2c_xfer_msg(i2c); else if (ret < 0) dev_dbg(i2c->dev, "i2c transfer error: %d\n", ret); else From 302251f1fdfd302ce99a619aac1a5164d0bb7c4b Mon Sep 17 00:00:00 2001 From: Faisal Bukhari Date: Sat, 21 Jun 2025 16:02:04 +0530 Subject: [PATCH 237/284] Fix typo in marvell octeontx2 documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst Fixes a spelling mistake: "funcionality" → "functionality". Signed-off-by: Faisal Bukhari Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../networking/device_drivers/ethernet/marvell/octeontx2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst index af7db0e91f6b..a52850602cd8 100644 --- a/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst +++ b/Documentation/networking/device_drivers/ethernet/marvell/octeontx2.rst @@ -66,7 +66,7 @@ Admin Function driver As mentioned above RVU PF0 is called the admin function (AF), this driver supports resource provisioning and configuration of functional blocks. Doesn't handle any I/O. It sets up few basic stuff but most of the -funcionality is achieved via configuration requests from PFs and VFs. +functionality is achieved via configuration requests from PFs and VFs. PF/VFs communicates with AF via a shared memory region (mailbox). Upon receiving requests AF does resource provisioning and other HW configuration. From b993ea46b3b601915ceaaf3c802adf11e7d6bac6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 2025 14:28:44 +0000 Subject: [PATCH 238/284] atm: clip: prevent NULL deref in clip_push() Blamed commit missed that vcc_destroy_socket() calls clip_push() with a NULL skb. If clip_devs is NULL, clip_push() then crashes when reading skb->truesize. Fixes: 93a2014afbac ("atm: fix a UAF in lec_arp_clear_vccs()") Reported-by: syzbot+1316233c4c6803382a8b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68556f59.a00a0220.137b3.004e.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Cong Wang Cc: Gengming Liu Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/atm/clip.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/atm/clip.c b/net/atm/clip.c index 61b5b700817d..b234dc3bcb0d 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -193,12 +193,6 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("\n"); - if (!clip_devs) { - atm_return(vcc, skb->truesize); - kfree_skb(skb); - return; - } - if (!skb) { pr_debug("removing VCC %p\n", clip_vcc); if (clip_vcc->entry) @@ -208,6 +202,11 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb) return; } atm_return(vcc, skb->truesize); + if (!clip_devs) { + kfree_skb(skb); + return; + } + skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; /* clip_vcc->entry == NULL if we don't have an IP address yet */ if (!skb->dev) { From 86731a2a651e58953fc949573895f2fa6d456841 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 Jun 2025 13:30:08 -0700 Subject: [PATCH 239/284] Linux 6.16-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ba0827a1fccd..f884dfe10246 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* From b872f562c8cef59743993b48eb458c2d87c1651e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 23 Jun 2025 19:11:50 +0800 Subject: [PATCH 240/284] dm-crypt: Extend state buffer size in crypt_iv_lmk_one Add a macro CRYPTO_MD5_STATESIZE for the Crypto API export state size of md5 and use that in dm-crypt instead of relying on the size of struct md5_state (the latter is currently undergoing a transition and may shrink). This commit fixes a crash on 32-bit machines: Oops: Oops: 0000 [#1] SMP CPU: 1 UID: 0 PID: 12 Comm: kworker/u16:0 Not tainted 6.16.0-rc2+ #993 PREEMPT(full) Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 Workqueue: kcryptd-254:0-1 kcryptd_crypt [dm_crypt] EIP: __crypto_shash_export+0xf/0x90 Code: 4a c1 c7 40 20 a0 b4 4a c1 81 cf 0e 00 04 08 89 78 50 e9 2b ff ff ff 8d 74 26 00 55 89 e5 57 56 53 89 c3 89 d6 8b 00 8b 40 14 <8b> 50 fc f6 40 13 01 74 04 4a 2b 50 14 85 c9 74 10 89 f2 89 d8 ff EAX: 303a3435 EBX: c3007c90 ECX: 00000000 EDX: c3007c38 ESI: c3007c38 EDI: c3007c90 EBP: c3007bfc ESP: c3007bf0 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 EFLAGS: 00010216 CR0: 80050033 CR2: 303a3431 CR3: 04fbe000 CR4: 00350e90 Call Trace: crypto_shash_export+0x65/0xc0 crypt_iv_lmk_one+0x106/0x1a0 [dm_crypt] Fixes: efd62c85525e ("crypto: md5-generic - Use API partial block handling") Reported-by: Milan Broz Signed-off-by: Herbert Xu Tested-by: Milan Broz Closes: https://lore.kernel.org/linux-crypto/f1625ddc-e82e-4b77-80c2-dc8e45b54848@gmail.com/T/ Signed-off-by: Mikulas Patocka --- drivers/md/dm-crypt.c | 11 +++++++---- include/crypto/hash.h | 2 ++ include/crypto/md5.h | 4 ++++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9dfdb63220d7..17157c4216a5 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -517,7 +517,10 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, { struct iv_lmk_private *lmk = &cc->iv_gen_private.lmk; SHASH_DESC_ON_STACK(desc, lmk->hash_tfm); - struct md5_state md5state; + union { + struct md5_state md5state; + u8 state[CRYPTO_MD5_STATESIZE]; + } u; __le32 buf[4]; int i, r; @@ -548,13 +551,13 @@ static int crypt_iv_lmk_one(struct crypt_config *cc, u8 *iv, return r; /* No MD5 padding here */ - r = crypto_shash_export(desc, &md5state); + r = crypto_shash_export(desc, &u.md5state); if (r) return r; for (i = 0; i < MD5_HASH_WORDS; i++) - __cpu_to_le32s(&md5state.hash[i]); - memcpy(iv, &md5state.hash, cc->iv_size); + __cpu_to_le32s(&u.md5state.hash[i]); + memcpy(iv, &u.md5state.hash, cc->iv_size); return 0; } diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 6f6b9de12cd3..db294d452e8c 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -202,6 +202,8 @@ struct shash_desc { #define HASH_REQUEST_CLONE(name, gfp) \ hash_request_clone(name, sizeof(__##name##_req), gfp) +#define CRYPTO_HASH_STATESIZE(coresize, blocksize) (coresize + blocksize + 1) + /** * struct shash_alg - synchronous message digest definition * @init: see struct ahash_alg diff --git a/include/crypto/md5.h b/include/crypto/md5.h index 198b5d69b92f..28ee533a0507 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -2,6 +2,7 @@ #ifndef _CRYPTO_MD5_H #define _CRYPTO_MD5_H +#include #include #define MD5_DIGEST_SIZE 16 @@ -15,6 +16,9 @@ #define MD5_H2 0x98badcfeUL #define MD5_H3 0x10325476UL +#define CRYPTO_MD5_STATESIZE \ + CRYPTO_HASH_STATESIZE(MD5_STATE_SIZE, MD5_HMAC_BLOCK_SIZE) + extern const u8 md5_zero_message_hash[MD5_DIGEST_SIZE]; struct md5_state { From 95b6759a81833d0e8c7456430186c2f6d174764e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 15:09:53 +0200 Subject: [PATCH 241/284] net: qed: reduce stack usage for TLV processing clang gets a bit confused by the code in the qed_mfw_process_tlv_req and ends up spilling registers to the stack hundreds of times. When sanitizers are enabled, this can end up blowing the stack warning limit: drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c:1244:5: error: stack frame size (1824) exceeds limit (1280) in 'qed_mfw_process_tlv_req' [-Werror,-Wframe-larger-than] Apparently the problem is the complexity of qed_mfw_update_tlvs() after inlining, and marking the four main branches of that function as noinline_for_stack makes this problem completely go away, the stack usage goes down to 100 bytes. Signed-off-by: Arnd Bergmann Reviewed-by: Alexander Lobakin Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c index f55eed092f25..7d78f072b0a1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mng_tlv.c @@ -242,7 +242,7 @@ static int qed_mfw_get_tlv_group(u8 tlv_type, u8 *tlv_group) } /* Returns size of the data buffer or, -1 in case TLV data is not available. */ -static int +static noinline_for_stack int qed_mfw_get_gen_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_generic *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) @@ -304,7 +304,7 @@ qed_mfw_get_gen_tlv_value(struct qed_drv_tlv_hdr *p_tlv, return -1; } -static int +static noinline_for_stack int qed_mfw_get_eth_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_eth *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) @@ -438,7 +438,7 @@ qed_mfw_get_tlv_time_value(struct qed_mfw_tlv_time *p_time, return QED_MFW_TLV_TIME_SIZE; } -static int +static noinline_for_stack int qed_mfw_get_fcoe_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_fcoe *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) @@ -1073,7 +1073,7 @@ qed_mfw_get_fcoe_tlv_value(struct qed_drv_tlv_hdr *p_tlv, return -1; } -static int +static noinline_for_stack int qed_mfw_get_iscsi_tlv_value(struct qed_drv_tlv_hdr *p_tlv, struct qed_mfw_tlv_iscsi *p_drv_buf, struct qed_tlv_parsed_buf *p_buf) From db53805156f1e0aa6d059c0d3f9ac660d4ef3eb4 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 10 Jun 2025 20:53:30 +0200 Subject: [PATCH 242/284] dm-raid: fix variable in journal device check Replace "rdev" with correct loop variable name "r". Signed-off-by: Heinz Mauelshagen Cc: stable@vger.kernel.org Fixes: 63c32ed4afc2 ("dm raid: add raid4/5/6 journaling support") Signed-off-by: Mikulas Patocka --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index d296770478b2..e8c0a8c6fb51 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2407,7 +2407,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) */ sb_retrieve_failed_devices(sb, failed_devices); rdev_for_each(r, mddev) { - if (test_bit(Journal, &rdev->flags) || + if (test_bit(Journal, &r->flags) || !r->sb_page) continue; sb2 = page_address(r->sb_page); From 1d6123102e9fbedc8d25bf4731da6d513173e49e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 17 Jun 2025 09:58:13 -0700 Subject: [PATCH 243/284] Bluetooth: hci_core: Fix use-after-free in vhci_flush() syzbot reported use-after-free in vhci_flush() without repro. [0] From the splat, a thread close()d a vhci file descriptor while its device was being used by iotcl() on another thread. Once the last fd refcnt is released, vhci_release() calls hci_unregister_dev(), hci_free_dev(), and kfree() for struct vhci_data, which is set to hci_dev->dev->driver_data. The problem is that there is no synchronisation after unlinking hdev from hci_dev_list in hci_unregister_dev(). There might be another thread still accessing the hdev which was fetched before the unlink operation. We can use SRCU for such synchronisation. Let's run hci_dev_reset() under SRCU and wait for its completion in hci_unregister_dev(). Another option would be to restore hci_dev->destruct(), which was removed in commit 587ae086f6e4 ("Bluetooth: Remove unused hci-destruct cb"). However, this would not be a good solution, as we should not run hci_unregister_dev() while there are in-flight ioctl() requests, which could lead to another data-race KCSAN splat. Note that other drivers seem to have the same problem, for exmaple, virtbt_remove(). [0]: BUG: KASAN: slab-use-after-free in skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] BUG: KASAN: slab-use-after-free in skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 Read of size 8 at addr ffff88807cb8d858 by task syz.1.219/6718 CPU: 1 UID: 0 PID: 6718 Comm: syz.1.219 Not tainted 6.16.0-rc1-syzkaller-00196-g08207f42d3ff #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xd2/0x2b0 mm/kasan/report.c:521 kasan_report+0x118/0x150 mm/kasan/report.c:634 skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 skb_queue_purge include/linux/skbuff.h:3368 [inline] vhci_flush+0x44/0x50 drivers/bluetooth/hci_vhci.c:69 hci_dev_do_reset net/bluetooth/hci_core.c:552 [inline] hci_dev_reset+0x420/0x5c0 net/bluetooth/hci_core.c:592 sock_do_ioctl+0xd9/0x300 net/socket.c:1190 sock_ioctl+0x576/0x790 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fcf5b98e929 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fcf5c7b9038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fcf5bbb6160 RCX: 00007fcf5b98e929 RDX: 0000000000000000 RSI: 00000000400448cb RDI: 0000000000000009 RBP: 00007fcf5ba10b39 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007fcf5bbb6160 R15: 00007ffd6353d528 Allocated by task 6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] vhci_open+0x57/0x360 drivers/bluetooth/hci_vhci.c:635 misc_open+0x2bc/0x330 drivers/char/misc.c:161 chrdev_open+0x4c9/0x5e0 fs/char_dev.c:414 do_dentry_open+0xdf0/0x1970 fs/open.c:964 vfs_open+0x3b/0x340 fs/open.c:1094 do_open fs/namei.c:3887 [inline] path_openat+0x2ee5/0x3830 fs/namei.c:4046 do_filp_open+0x1fa/0x410 fs/namei.c:4073 do_sys_openat2+0x121/0x1c0 fs/open.c:1437 do_sys_open fs/open.c:1452 [inline] __do_sys_openat fs/open.c:1468 [inline] __se_sys_openat fs/open.c:1463 [inline] __x64_sys_openat+0x138/0x170 fs/open.c:1463 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 vhci_release+0xbc/0xd0 drivers/bluetooth/hci_vhci.c:671 __fput+0x44c/0xa70 fs/file_table.c:465 task_work_run+0x1d1/0x260 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0x6ad/0x22e0 kernel/exit.c:955 do_group_exit+0x21c/0x2d0 kernel/exit.c:1104 __do_sys_exit_group kernel/exit.c:1115 [inline] __se_sys_exit_group kernel/exit.c:1113 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1113 x64_sys_call+0x21ba/0x21c0 arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff88807cb8d800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 88 bytes inside of freed 1024-byte region [ffff88807cb8d800, ffff88807cb8dc00) Fixes: bf18c7118cf8 ("Bluetooth: vhci: Free driver_data on file release") Reported-by: syzbot+2faa4825e556199361f9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f62d64848fc4c7c30cd6 Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 34 ++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a760f05fa3fb..9fc8f544e20e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -347,6 +348,7 @@ struct adv_monitor { struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; struct ida unset_handle_ida; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 07a8b4281a39..14d7221b8ac0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -64,7 +64,7 @@ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. */ -struct hci_dev *hci_dev_get(int index) +static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; @@ -77,6 +77,8 @@ struct hci_dev *hci_dev_get(int index) list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); + if (srcu_index) + *srcu_index = srcu_read_lock(&d->srcu); break; } } @@ -84,6 +86,22 @@ struct hci_dev *hci_dev_get(int index) return hdev; } +struct hci_dev *hci_dev_get(int index) +{ + return __hci_dev_get(index, NULL); +} + +static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) +{ + return __hci_dev_get(index, srcu_index); +} + +static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) +{ + srcu_read_unlock(&hdev->srcu, srcu_index); + hci_dev_put(hdev); +} + /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) @@ -568,9 +586,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; - int err; + int err, srcu_index; - hdev = hci_dev_get(dev); + hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; @@ -592,7 +610,7 @@ int hci_dev_reset(__u16 dev) err = hci_dev_do_reset(hdev); done: - hci_dev_put(hdev); + hci_dev_put_srcu(hdev, srcu_index); return err; } @@ -2433,6 +2451,11 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) if (!hdev) return NULL; + if (init_srcu_struct(&hdev->srcu)) { + kfree(hdev); + return NULL; + } + hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); @@ -2678,6 +2701,9 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + synchronize_srcu(&hdev->srcu); + cleanup_srcu_struct(&hdev->srcu); + disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); From 7484e15dbb016d9d40f8c6e0475810212ae181db Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 17 Jun 2025 00:09:51 -0400 Subject: [PATCH 244/284] replace collect_mounts()/drop_collected_mounts() with a safer variant collect_mounts() has several problems - one can't iterate over the results directly, so it has to be done with callback passed to iterate_mounts(); it has an oopsable race with d_invalidate(); it creates temporary clones of mounts invisibly for sync umount (IOW, you can have non-lazy umount succeed leaving filesystem not mounted anywhere and yet still busy). A saner approach is to give caller an array of struct path that would pin every mount in a subtree, without cloning any mounts. * collect_mounts()/drop_collected_mounts()/iterate_mounts() is gone * collect_paths(where, preallocated, size) gives either ERR_PTR(-E...) or a pointer to array of struct path, one for each chunk of tree visible under 'where' (i.e. the first element is a copy of where, followed by (mount,root) for everything mounted under it - the same set collect_mounts() would give). Unlike collect_mounts(), the mounts are *not* cloned - we just get pinning references to the roots of subtrees in the caller's namespace. Array is terminated by {NULL, NULL} struct path. If it fits into preallocated array (on-stack, normally), that's where it goes; otherwise it's allocated by kmalloc_array(). Passing 0 as size means that 'preallocated' is ignored (and expected to be NULL). * drop_collected_paths(paths, preallocated) is given the array returned by an earlier call of collect_paths() and the preallocated array passed to that call. All mount/dentry references are dropped and array is kfree'd if it's not equal to 'preallocated'. * instead of iterate_mounts(), users should just iterate over array of struct path - nothing exotic is needed for that. Existing users (all in audit_tree.c) are converted. [folded a fix for braino reported by Venkat Rao Bagalkote ] Fixes: 80b5dce8c59b0 ("vfs: Add a function to lazily unmount all mounts from any dentry") Tested-by: Venkat Rao Bagalkote Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 9 +++ fs/namespace.c | 99 ++++++++++++++++----------- fs/pnode.h | 2 - include/linux/mount.h | 6 +- kernel/audit_tree.c | 63 +++++++++-------- 5 files changed, 105 insertions(+), 74 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 3616d7161dab..a5734bdd1cc7 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1249,3 +1249,12 @@ Using try_lookup_noperm() will require linux/namei.h to be included. Calling conventions for ->d_automount() have changed; we should *not* grab an extra reference to new mount - it should be returned with refcount 1. + +--- + +collect_mounts()/drop_collected_mounts()/iterate_mounts() are gone now. +Replacement is collect_paths()/drop_collected_path(), with no special +iterator needed. Instead of a cloned mount tree, the new interface returns +an array of struct path, one for each mount collect_mounts() would've +created. These struct path point to locations in the caller's namespace +that would be roots of the cloned mounts. diff --git a/fs/namespace.c b/fs/namespace.c index e13d9ab4f564..14601ec4c2c5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2310,21 +2310,62 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, return dst_mnt; } -/* Caller should check returned pointer for errors */ - -struct vfsmount *collect_mounts(const struct path *path) +static inline bool extend_array(struct path **res, struct path **to_free, + unsigned n, unsigned *count, unsigned new_count) { - struct mount *tree; - namespace_lock(); - if (!check_mnt(real_mount(path->mnt))) - tree = ERR_PTR(-EINVAL); - else - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); - namespace_unlock(); - if (IS_ERR(tree)) - return ERR_CAST(tree); - return &tree->mnt; + struct path *p; + + if (likely(n < *count)) + return true; + p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL); + if (p && *count) + memcpy(p, *res, *count * sizeof(struct path)); + *count = new_count; + kfree(*to_free); + *to_free = *res = p; + return p; +} + +struct path *collect_paths(const struct path *path, + struct path *prealloc, unsigned count) +{ + struct mount *root = real_mount(path->mnt); + struct mount *child; + struct path *res = prealloc, *to_free = NULL; + unsigned n = 0; + + guard(rwsem_read)(&namespace_sem); + + if (!check_mnt(root)) + return ERR_PTR(-EINVAL); + if (!extend_array(&res, &to_free, 0, &count, 32)) + return ERR_PTR(-ENOMEM); + res[n++] = *path; + list_for_each_entry(child, &root->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, path->dentry)) + continue; + for (struct mount *m = child; m; m = next_mnt(m, child)) { + if (!extend_array(&res, &to_free, n, &count, 2 * count)) + return ERR_PTR(-ENOMEM); + res[n].mnt = &m->mnt; + res[n].dentry = m->mnt.mnt_root; + n++; + } + } + if (!extend_array(&res, &to_free, n, &count, count + 1)) + return ERR_PTR(-ENOMEM); + memset(res + n, 0, (count - n) * sizeof(struct path)); + for (struct path *p = res; p->mnt; p++) + path_get(p); + return res; +} + +void drop_collected_paths(struct path *paths, struct path *prealloc) +{ + for (struct path *p = paths; p->mnt; p++) + path_put(p); + if (paths != prealloc) + kfree(paths); } static void free_mnt_ns(struct mnt_namespace *); @@ -2401,15 +2442,6 @@ void dissolve_on_fput(struct vfsmount *mnt) free_mnt_ns(ns); } -void drop_collected_mounts(struct vfsmount *mnt) -{ - namespace_lock(); - lock_mount_hash(); - umount_tree(real_mount(mnt), 0); - unlock_mount_hash(); - namespace_unlock(); -} - static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2511,21 +2543,6 @@ struct vfsmount *clone_private_mount(const struct path *path) } EXPORT_SYMBOL_GPL(clone_private_mount); -int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, - struct vfsmount *root) -{ - struct mount *mnt; - int res = f(root, arg); - if (res) - return res; - list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { - res = f(&mnt->mnt, arg); - if (res) - return res; - } - return 0; -} - static void lock_mnt_tree(struct mount *mnt) { struct mount *p; @@ -6262,7 +6279,11 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - drop_collected_mounts(&ns->root->mnt); + namespace_lock(); + lock_mount_hash(); + umount_tree(ns->root, 0); + unlock_mount_hash(); + namespace_unlock(); free_mnt_ns(ns); } diff --git a/fs/pnode.h b/fs/pnode.h index 34b6247af01d..2d026fb98b18 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -28,8 +28,6 @@ #define CL_SHARED_TO_SLAVE 0x20 #define CL_COPY_MNT_NS_FILE 0x40 -#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE) - static inline void set_mnt_shared(struct mount *mnt) { mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK; diff --git a/include/linux/mount.h b/include/linux/mount.h index 4880f434c021..1a508beba446 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -116,10 +116,8 @@ extern int may_umount_tree(struct vfsmount *); extern int may_umount(struct vfsmount *); int do_mount(const char *, const char __user *, const char *, unsigned long, void *); -extern struct vfsmount *collect_mounts(const struct path *); -extern void drop_collected_mounts(struct vfsmount *); -extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *, - struct vfsmount *); +extern struct path *collect_paths(const struct path *, struct path *, unsigned); +extern void drop_collected_paths(struct path *, struct path *); extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); extern int cifs_root_data(char **dev, char **opts); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f2f38903b2fe..b0eae2a3c895 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -668,12 +668,6 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } -static int compare_root(struct vfsmount *mnt, void *arg) -{ - return inode_to_key(d_backing_inode(mnt->mnt_root)) == - (unsigned long)arg; -} - void audit_trim_trees(void) { struct list_head cursor; @@ -683,8 +677,9 @@ void audit_trim_trees(void) while (cursor.next != &tree_list) { struct audit_tree *tree; struct path path; - struct vfsmount *root_mnt; struct audit_node *node; + struct path *paths; + struct path array[16]; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -696,9 +691,9 @@ void audit_trim_trees(void) if (err) goto skip_it; - root_mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(root_mnt)) + if (IS_ERR(paths)) goto skip_it; spin_lock(&hash_lock); @@ -706,14 +701,17 @@ void audit_trim_trees(void) struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... */ node->index |= 1U<<31; - if (iterate_mounts(compare_root, - (void *)(chunk->key), - root_mnt)) - node->index &= ~(1U<<31); + for (struct path *p = paths; p->dentry; p++) { + struct inode *inode = p->dentry->d_inode; + if (inode_to_key(inode) == chunk->key) { + node->index &= ~(1U<<31); + break; + } + } } spin_unlock(&hash_lock); trim_marked(tree); - drop_collected_mounts(root_mnt); + drop_collected_paths(paths, array); skip_it: put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -742,9 +740,14 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } -static int tag_mount(struct vfsmount *mnt, void *arg) +static int tag_mounts(struct path *paths, struct audit_tree *tree) { - return tag_chunk(d_backing_inode(mnt->mnt_root), arg); + for (struct path *p = paths; p->dentry; p++) { + int err = tag_chunk(p->dentry->d_inode, tree); + if (err) + return err; + } + return 0; } /* @@ -801,7 +804,8 @@ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt; + struct path array[16]; + struct path *paths; int err; rule->tree = NULL; @@ -828,16 +832,16 @@ int audit_add_tree_rule(struct audit_krule *rule) err = kern_path(tree->pathname, 0, &path); if (err) goto Err; - mnt = collect_mounts(&path); + paths = collect_paths(&path, array, 16); path_put(&path); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); + if (IS_ERR(paths)) { + err = PTR_ERR(paths); goto Err; } get_tree(tree); - err = iterate_mounts(tag_mount, tree, mnt); - drop_collected_mounts(mnt); + err = tag_mounts(paths, tree); + drop_collected_paths(paths, array); if (!err) { struct audit_node *node; @@ -872,20 +876,21 @@ int audit_tag_tree(char *old, char *new) struct list_head cursor, barrier; int failed = 0; struct path path1, path2; - struct vfsmount *tagged; + struct path array[16]; + struct path *paths; int err; err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path2); + paths = collect_paths(&path2, array, 16); path_put(&path2); - if (IS_ERR(tagged)) - return PTR_ERR(tagged); + if (IS_ERR(paths)) + return PTR_ERR(paths); err = kern_path(old, 0, &path1); if (err) { - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return err; } @@ -914,7 +919,7 @@ int audit_tag_tree(char *old, char *new) continue; } - failed = iterate_mounts(tag_mount, tree, tagged); + failed = tag_mounts(paths, tree); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -955,7 +960,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); mutex_unlock(&audit_filter_mutex); path_put(&path1); - drop_collected_mounts(tagged); + drop_collected_paths(paths, array); return failed; } From ce7df19686530920f2f6b636e71ce5eb1d9303ef Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 22 Jun 2025 18:03:29 -0400 Subject: [PATCH 245/284] attach_recursive_mnt(): do not lock the covering tree when sliding something under it If we are propagating across the userns boundary, we need to lock the mounts added there. However, in case when something has already been mounted there and we end up sliding a new tree under that, the stuff that had been there before should not get locked. IOW, lock_mnt_tree() should be called before we reparent the preexisting tree on top of what we are adding. Fixes: 3bd045cc9c4b ("separate copying and locking mount tree on cross-userns copies") Signed-off-by: Al Viro --- fs/namespace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 14601ec4c2c5..eed83254492f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2768,14 +2768,14 @@ static int attach_recursive_mnt(struct mount *source_mnt, hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); - q = __lookup_mnt(&child->mnt_parent->mnt, - child->mnt_mountpoint); - if (q) - mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); child->mnt.mnt_flags &= ~MNT_LOCKED; + q = __lookup_mnt(&child->mnt_parent->mnt, + child->mnt_mountpoint); + if (q) + mnt_change_mountpoint(child, smp, q); commit_tree(child); } put_mountpoint(smp); From aa485e8789d56a4573f7c8d000a182b749eaa64d Mon Sep 17 00:00:00 2001 From: Yuan Chen Date: Wed, 18 Jun 2025 09:19:33 +0800 Subject: [PATCH 246/284] libbpf: Fix null pointer dereference in btf_dump__free on allocation failure When btf_dump__new() fails to allocate memory for the internal hashmap (btf_dump->type_names), it returns an error code. However, the cleanup function btf_dump__free() does not check if btf_dump->type_names is NULL before attempting to free it. This leads to a null pointer dereference when btf_dump__free() is called on a btf_dump object. Fixes: 351131b51c7a ("libbpf: add btf_dump API for BTF-to-C conversion") Signed-off-by: Yuan Chen Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250618011933.11423-1-chenyuan_fl@163.com --- tools/lib/bpf/btf_dump.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 460c3e57fadb..0381f209920a 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -226,6 +226,9 @@ static void btf_dump_free_names(struct hashmap *map) size_t bkt; struct hashmap_entry *cur; + if (!map) + return; + hashmap__for_each_entry(map, cur, bkt) free((void *)cur->pkey); From f5990207026987a353d5a95204c4d9cb725637fd Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 20 Jun 2025 11:48:55 -0700 Subject: [PATCH 247/284] net: netpoll: Initialize UDP checksum field before checksumming commit f1fce08e63fe ("netpoll: Eliminate redundant assignment") removed the initialization of the UDP checksum, which was wrong and broke netpoll IPv6 transmission due to bad checksumming. udph->check needs to be set before calling csum_ipv6_magic(). Fixes: f1fce08e63fe ("netpoll: Eliminate redundant assignment") Signed-off-by: Breno Leitao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250620-netpoll_fix-v1-1-f9f0b82bc059@debian.org Signed-off-by: Jakub Kicinski --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4ddb7490df4b..6ad84d4a2b46 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -432,6 +432,7 @@ int netpoll_send_udp(struct netpoll *np, const char *msg, int len) udph->dest = htons(np->remote_port); udph->len = htons(udp_len); + udph->check = 0; if (np->ipv6) { udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, @@ -460,7 +461,6 @@ int netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb_reset_mac_header(skb); skb->protocol = eth->h_proto = htons(ETH_P_IPV6); } else { - udph->check = 0; udph->check = csum_tcpudp_magic(np->local_ip.ip, np->remote_ip.ip, udp_len, IPPROTO_UDP, From d5e3241c5a386a2425823c8c7afb77a465bd040f Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Thu, 19 Jun 2025 11:45:30 +0200 Subject: [PATCH 248/284] ethernet: ionic: Fix DMA mapping tests Change error values of `ionic_tx_map_single()` and `ionic_tx_map_frag()` from 0 to `DMA_MAPPING_ERROR` to prevent collision with 0 as a valid address. This also fixes the use of `dma_mapping_error()` to test against 0 in `ionic_xdp_post_frame()` Fixes: 0f3154e6bcb3 ("ionic: Add Tx and Rx handling") Fixes: 56e41ee12d2d ("ionic: better dma-map error handling") Fixes: ac8813c0ab7d ("ionic: convert Rx queue buffers to use page_pool") Signed-off-by: Thomas Fourier Reviewed-by: Brett Creeley Link: https://patch.msgid.link/20250619094538.283723-2-fourier.thomas@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c index 2ac59564ded1..d10b58ebf603 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c @@ -321,7 +321,7 @@ static int ionic_xdp_post_frame(struct ionic_queue *q, struct xdp_frame *frame, len, DMA_TO_DEVICE); } else /* XDP_REDIRECT */ { dma_addr = ionic_tx_map_single(q, frame->data, len); - if (!dma_addr) + if (dma_addr == DMA_MAPPING_ERROR) return -EIO; } @@ -357,7 +357,7 @@ static int ionic_xdp_post_frame(struct ionic_queue *q, struct xdp_frame *frame, } else { dma_addr = ionic_tx_map_frag(q, frag, 0, skb_frag_size(frag)); - if (dma_mapping_error(q->dev, dma_addr)) { + if (dma_addr == DMA_MAPPING_ERROR) { ionic_tx_desc_unmap_bufs(q, desc_info); return -EIO; } @@ -1083,7 +1083,7 @@ static dma_addr_t ionic_tx_map_single(struct ionic_queue *q, net_warn_ratelimited("%s: DMA single map failed on %s!\n", dev_name(dev), q->name); q_to_tx_stats(q)->dma_map_err++; - return 0; + return DMA_MAPPING_ERROR; } return dma_addr; } @@ -1100,7 +1100,7 @@ static dma_addr_t ionic_tx_map_frag(struct ionic_queue *q, net_warn_ratelimited("%s: DMA frag map failed on %s!\n", dev_name(dev), q->name); q_to_tx_stats(q)->dma_map_err++; - return 0; + return DMA_MAPPING_ERROR; } return dma_addr; } @@ -1116,7 +1116,7 @@ static int ionic_tx_map_skb(struct ionic_queue *q, struct sk_buff *skb, int frag_idx; dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb)); - if (!dma_addr) + if (dma_addr == DMA_MAPPING_ERROR) return -EIO; buf_info->dma_addr = dma_addr; buf_info->len = skb_headlen(skb); @@ -1126,7 +1126,7 @@ static int ionic_tx_map_skb(struct ionic_queue *q, struct sk_buff *skb, nfrags = skb_shinfo(skb)->nr_frags; for (frag_idx = 0; frag_idx < nfrags; frag_idx++, frag++) { dma_addr = ionic_tx_map_frag(q, frag, 0, skb_frag_size(frag)); - if (!dma_addr) + if (dma_addr == DMA_MAPPING_ERROR) goto dma_fail; buf_info->dma_addr = dma_addr; buf_info->len = skb_frag_size(frag); From 7544f3f5b0b58c396f374d060898b5939da31709 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 19 Jun 2025 21:22:28 +0300 Subject: [PATCH 249/284] bridge: mcast: Fix use-after-free during router port configuration The bridge maintains a global list of ports behind which a multicast router resides. The list is consulted during forwarding to ensure multicast packets are forwarded to these ports even if the ports are not member in the matching MDB entry. When per-VLAN multicast snooping is enabled, the per-port multicast context is disabled on each port and the port is removed from the global router port list: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 # ip link add name dummy1 up master br1 type dummy # ip link set dev dummy1 type bridge_slave mcast_router 2 $ bridge -d mdb show | grep router router ports on br1: dummy1 # ip link set dev br1 type bridge mcast_vlan_snooping 1 $ bridge -d mdb show | grep router However, the port can be re-added to the global list even when per-VLAN multicast snooping is enabled: # ip link set dev dummy1 type bridge_slave mcast_router 0 # ip link set dev dummy1 type bridge_slave mcast_router 2 $ bridge -d mdb show | grep router router ports on br1: dummy1 Since commit 4b30ae9adb04 ("net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions"), when per-VLAN multicast snooping is enabled, multicast disablement on a port will disable the per-{port, VLAN} multicast contexts and not the per-port one. As a result, a port will remain in the global router port list even after it is deleted. This will lead to a use-after-free [1] when the list is traversed (when adding a new port to the list, for example): # ip link del dev dummy1 # ip link add name dummy2 up master br1 type dummy # ip link set dev dummy2 type bridge_slave mcast_router 2 Similarly, stale entries can also be found in the per-VLAN router port list. When per-VLAN multicast snooping is disabled, the per-{port, VLAN} contexts are disabled on each port and the port is removed from the per-VLAN router port list: # ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 1 mcast_vlan_snooping 1 # ip link add name dummy1 up master br1 type dummy # bridge vlan add vid 2 dev dummy1 # bridge vlan global set vid 2 dev br1 mcast_snooping 1 # bridge vlan set vid 2 dev dummy1 mcast_router 2 $ bridge vlan global show dev br1 vid 2 | grep router router ports: dummy1 # ip link set dev br1 type bridge mcast_vlan_snooping 0 $ bridge vlan global show dev br1 vid 2 | grep router However, the port can be re-added to the per-VLAN list even when per-VLAN multicast snooping is disabled: # bridge vlan set vid 2 dev dummy1 mcast_router 0 # bridge vlan set vid 2 dev dummy1 mcast_router 2 $ bridge vlan global show dev br1 vid 2 | grep router router ports: dummy1 When the VLAN is deleted from the port, the per-{port, VLAN} multicast context will not be disabled since multicast snooping is not enabled on the VLAN. As a result, the port will remain in the per-VLAN router port list even after it is no longer member in the VLAN. This will lead to a use-after-free [2] when the list is traversed (when adding a new port to the list, for example): # ip link add name dummy2 up master br1 type dummy # bridge vlan add vid 2 dev dummy2 # bridge vlan del vid 2 dev dummy1 # bridge vlan set vid 2 dev dummy2 mcast_router 2 Fix these issues by removing the port from the relevant (global or per-VLAN) router port list in br_multicast_port_ctx_deinit(). The function is invoked during port deletion with the per-port multicast context and during VLAN deletion with the per-{port, VLAN} multicast context. Note that deleting the multicast router timer is not enough as it only takes care of the temporary multicast router states (1 or 3) and not the permanent one (2). [1] BUG: KASAN: slab-out-of-bounds in br_multicast_add_router.part.0+0x3f1/0x560 Write of size 8 at addr ffff888004a67328 by task ip/384 [...] Call Trace: dump_stack_lvl+0x6f/0xa0 print_address_description.constprop.0+0x6f/0x350 print_report+0x108/0x205 kasan_report+0xdf/0x110 br_multicast_add_router.part.0+0x3f1/0x560 br_multicast_set_port_router+0x74e/0xac0 br_setport+0xa55/0x1870 br_port_slave_changelink+0x95/0x120 __rtnl_newlink+0x5e8/0xa40 rtnl_newlink+0x627/0xb00 rtnetlink_rcv_msg+0x6fb/0xb70 netlink_rcv_skb+0x11f/0x350 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x124/0x1c0 do_syscall_64+0xbb/0x360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 [2] BUG: KASAN: slab-use-after-free in br_multicast_add_router.part.0+0x378/0x560 Read of size 8 at addr ffff888009f00840 by task bridge/391 [...] Call Trace: dump_stack_lvl+0x6f/0xa0 print_address_description.constprop.0+0x6f/0x350 print_report+0x108/0x205 kasan_report+0xdf/0x110 br_multicast_add_router.part.0+0x378/0x560 br_multicast_set_port_router+0x6f9/0xac0 br_vlan_process_options+0x8b6/0x1430 br_vlan_rtm_process_one+0x605/0xa30 br_vlan_rtm_process+0x396/0x4c0 rtnetlink_rcv_msg+0x2f7/0xb70 netlink_rcv_skb+0x11f/0x350 netlink_unicast+0x426/0x710 netlink_sendmsg+0x75a/0xc20 __sock_sendmsg+0xc1/0x150 ____sys_sendmsg+0x5aa/0x7b0 ___sys_sendmsg+0xfc/0x180 __sys_sendmsg+0x124/0x1c0 do_syscall_64+0xbb/0x360 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Fixes: 2796d846d74a ("net: bridge: vlan: convert mcast router global option to per-vlan entry") Fixes: 4b30ae9adb04 ("net: bridge: mcast: re-implement br_multicast_{enable, disable}_port functions") Reported-by: syzbot+7bfa4b72c6a5da128d32@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/684c18bd.a00a0220.279073.000b.GAE@google.com/T/ Signed-off-by: Ido Schimmel Link: https://patch.msgid.link/20250619182228.1656906-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 0224ef3dfec0..1377f31b719c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2015,10 +2015,19 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { + struct net_bridge *br = pmctx->port->br; + bool del = false; + #if IS_ENABLED(CONFIG_IPV6) timer_delete_sync(&pmctx->ip6_mc_router_timer); #endif timer_delete_sync(&pmctx->ip4_mc_router_timer); + + spin_lock_bh(&br->multicast_lock); + del |= br_ip6_multicast_rport_del(pmctx); + del |= br_ip4_multicast_rport_del(pmctx); + br_multicast_rport_del_notify(pmctx, del); + spin_unlock_bh(&br->multicast_lock); } int br_multicast_add_port(struct net_bridge_port *port) From 2eb7648558a7911f2208e8940cd22ca40e93cc76 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 19 Jun 2025 16:06:02 +0200 Subject: [PATCH 250/284] bpf: Specify access type of bpf_sysctl_get_name args The second argument of bpf_sysctl_get_name() helper is a pointer to a buffer that is being written to. However that isn't specify in the prototype. Until commit 37cce22dbd51a ("bpf: verifier: Refactor helper access type tracking"), all helper accesses were considered as a possible write access by the verifier, so no big harm was done. However, since then, the verifier might make wrong asssumption about the content of that address which might lead it to make faulty optimizations (such as removing code that was wrongly labeled dead). This is what happens in test_sysctl selftest to the tests related to sysctl_get_name. Add MEM_WRITE flag the second argument of bpf_sysctl_get_name(). Signed-off-by: Jerome Marchand Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250619140603.148942-2-jmarchan@redhat.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 9122c39870bf..f4885514f007 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -2134,7 +2134,7 @@ static const struct bpf_func_proto bpf_sysctl_get_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; From b8a205486ed5c0c5c0386e472157a81ce686af25 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 19 Jun 2025 16:06:03 +0200 Subject: [PATCH 251/284] selftests/bpf: Convert test_sysctl to prog_tests Convert test_sysctl test to prog_tests with minimal change to the tests themselves. Signed-off-by: Jerome Marchand Acked-by: Yonghong Song Link: https://lore.kernel.org/r/20250619140603.148942-3-jmarchan@redhat.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/.gitignore | 1 - tools/testing/selftests/bpf/Makefile | 5 +-- .../bpf/{ => prog_tests}/test_sysctl.c | 37 ++++--------------- 3 files changed, 10 insertions(+), 33 deletions(-) rename tools/testing/selftests/bpf/{ => prog_tests}/test_sysctl.c (98%) diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index e2a2c46c008b..3d8378972d26 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -21,7 +21,6 @@ test_lirc_mode2_user flow_dissector_load test_tcpnotify_user test_libbpf -test_sysctl xdping test_cpp *.d diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index cf5ed3bee573..910d8d6402ef 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -73,7 +73,7 @@ endif # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_progs \ test_sockmap \ - test_tcpnotify_user test_sysctl \ + test_tcpnotify_user \ test_progs-no_alu32 TEST_INST_SUBDIRS := no_alu32 @@ -220,7 +220,7 @@ ifeq ($(VMLINUX_BTF),) $(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") endif -# Define simple and short `make test_progs`, `make test_sysctl`, etc targets +# Define simple and short `make test_progs`, `make test_maps`, etc targets # to build individual tests. # NOTE: Semicolon at the end is critical to override lib.mk's default static # rule for binaries. @@ -329,7 +329,6 @@ NETWORK_HELPERS := $(OUTPUT)/network_helpers.o $(OUTPUT)/test_sockmap: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tcpnotify_user: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(TRACE_HELPERS) $(OUTPUT)/test_sock_fields: $(CGROUP_HELPERS) $(TESTING_HELPERS) -$(OUTPUT)/test_sysctl: $(CGROUP_HELPERS) $(TESTING_HELPERS) $(OUTPUT)/test_tag: $(TESTING_HELPERS) $(OUTPUT)/test_lirc_mode2_user: $(TESTING_HELPERS) $(OUTPUT)/xdping: $(TESTING_HELPERS) diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c similarity index 98% rename from tools/testing/selftests/bpf/test_sysctl.c rename to tools/testing/selftests/bpf/prog_tests/test_sysctl.c index bcdbd27f22f0..273dd41ca09e 100644 --- a/tools/testing/selftests/bpf/test_sysctl.c +++ b/tools/testing/selftests/bpf/prog_tests/test_sysctl.c @@ -1,22 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2019 Facebook -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include -#include "bpf_util.h" +#include "test_progs.h" #include "cgroup_helpers.h" -#include "testing_helpers.h" #define CG_PATH "/foo" #define MAX_INSNS 512 @@ -1608,26 +1594,19 @@ static int run_tests(int cgfd) return fails ? -1 : 0; } -int main(int argc, char **argv) +void test_sysctl(void) { - int cgfd = -1; - int err = 0; + int cgfd; cgfd = cgroup_setup_and_join(CG_PATH); - if (cgfd < 0) - goto err; + if (!ASSERT_OK_FD(cgfd < 0, "create_cgroup")) + goto out; - /* Use libbpf 1.0 API mode */ - libbpf_set_strict_mode(LIBBPF_STRICT_ALL); + if (!ASSERT_OK(run_tests(cgfd), "run_tests")) + goto out; - if (run_tests(cgfd)) - goto err; - - goto out; -err: - err = -1; out: close(cgfd); cleanup_cgroup_environment(); - return err; + return; } From 0e7facea6da2bd360361440786785752aa5b0e30 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 20 Jun 2025 13:39:42 +0200 Subject: [PATCH 252/284] wifi: iwlegacy: work around excessive stack usage on clang/kasan In some rare randconfig builds, I seem to trigger a bug in clang where it unrolls a loop but then runs out of registers, which then get spilled to the stack: net/wireless/intel/iwlegacy/4965-rs.c:2262:1: error: stack frame size (1696) exceeds limit (1280) in 'il4965_rs_rate_init' [-Werror,-Wframe-larger-than] This seems to be the same one I saw in the omapdrm driver, and there is an easy workaround by not inlining the il4965_rs_rate_scale_clear_win function. Link: https://github.com/llvm/llvm-project/issues/143908 Signed-off-by: Arnd Bergmann Acked-by: Stanislaw Gruszka Link: https://patch.msgid.link/20250620113946.3987160-1-arnd@kernel.org Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlegacy/4965-rs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/intel/iwlegacy/4965-rs.c b/drivers/net/wireless/intel/iwlegacy/4965-rs.c index 0e5130d1fccd..031d88bf6393 100644 --- a/drivers/net/wireless/intel/iwlegacy/4965-rs.c +++ b/drivers/net/wireless/intel/iwlegacy/4965-rs.c @@ -203,7 +203,8 @@ il4965_rs_extract_rate(u32 rate_n_flags) return (u8) (rate_n_flags & 0xFF); } -static void +/* noinline works around https://github.com/llvm/llvm-project/issues/143908 */ +static noinline_for_stack void il4965_rs_rate_scale_clear_win(struct il_rate_scale_data *win) { win->data = 0; From 7a3750ff0f2e8fee338a9c168f429f6c37f0e820 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Sat, 21 Jun 2025 22:32:09 +1000 Subject: [PATCH 253/284] wifi: mac80211: fix beacon interval calculation overflow As we are converting from TU to usecs, a beacon interval of 100*1024 usecs will lead to integer wrapping. To fix change to use a u32. Fixes: 057d5f4ba1e4 ("mac80211: sync dtim_count to TSF") Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250621123209.511796-1-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- net/mac80211/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 27d414efa3fd..a125995ed252 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3884,7 +3884,7 @@ void ieee80211_recalc_dtim(struct ieee80211_local *local, { u64 tsf = drv_get_tsf(local, sdata); u64 dtim_count = 0; - u16 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; + u32 beacon_int = sdata->vif.bss_conf.beacon_int * 1024; u8 dtim_period = sdata->vif.bss_conf.dtim_period; struct ps_data *ps; u8 bcns_from_dtim; From 32ca245464e1479bfea8592b9db227fdc1641705 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:55 -0700 Subject: [PATCH 254/284] af_unix: Don't leave consecutive consumed OOB skbs. Jann Horn reported a use-after-free in unix_stream_read_generic(). The following sequences reproduce the issue: $ python3 from socket import * s1, s2 = socketpair(AF_UNIX, SOCK_STREAM) s1.send(b'x', MSG_OOB) s2.recv(1, MSG_OOB) # leave a consumed OOB skb s1.send(b'y', MSG_OOB) s2.recv(1, MSG_OOB) # leave a consumed OOB skb s1.send(b'z', MSG_OOB) s2.recv(1) # recv 'z' illegally s2.recv(1, MSG_OOB) # access 'z' skb (use-after-free) Even though a user reads OOB data, the skb holding the data stays on the recv queue to mark the OOB boundary and break the next recv(). After the last send() in the scenario above, the sk2's recv queue has 2 leading consumed OOB skbs and 1 real OOB skb. Then, the following happens during the next recv() without MSG_OOB 1. unix_stream_read_generic() peeks the first consumed OOB skb 2. manage_oob() returns the next consumed OOB skb 3. unix_stream_read_generic() fetches the next not-yet-consumed OOB skb 4. unix_stream_read_generic() reads and frees the OOB skb , and the last recv(MSG_OOB) triggers KASAN splat. The 3. above occurs because of the SO_PEEK_OFF code, which does not expect unix_skb_len(skb) to be 0, but this is true for such consumed OOB skbs. while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); skb = skb_peek_next(skb, &sk->sk_receive_queue); ... } In addition to this use-after-free, there is another issue that ioctl(SIOCATMARK) does not function properly with consecutive consumed OOB skbs. So, nothing good comes out of such a situation. Instead of complicating manage_oob(), ioctl() handling, and the next ECONNRESET fix by introducing a loop for consecutive consumed OOB skbs, let's not leave such consecutive OOB unnecessarily. Now, while receiving an OOB skb in unix_stream_recv_urg(), if its previous skb is a consumed OOB skb, it is freed. [0]: BUG: KASAN: slab-use-after-free in unix_stream_read_actor (net/unix/af_unix.c:3027) Read of size 4 at addr ffff888106ef2904 by task python3/315 CPU: 2 UID: 0 PID: 315 Comm: python3 Not tainted 6.16.0-rc1-00407-gec315832f6f9 #8 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-4.fc42 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:122) print_report (mm/kasan/report.c:409 mm/kasan/report.c:521) kasan_report (mm/kasan/report.c:636) unix_stream_read_actor (net/unix/af_unix.c:3027) unix_stream_read_generic (net/unix/af_unix.c:2708 net/unix/af_unix.c:2847) unix_stream_recvmsg (net/unix/af_unix.c:3048) sock_recvmsg (net/socket.c:1063 (discriminator 20) net/socket.c:1085 (discriminator 20)) __sys_recvfrom (net/socket.c:2278) __x64_sys_recvfrom (net/socket.c:2291 (discriminator 1) net/socket.c:2287 (discriminator 1) net/socket.c:2287 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f8911fcea06 Code: 5d e8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 75 19 83 e2 39 83 fa 08 75 11 e8 26 ff ff ff 66 0f 1f 44 00 00 48 8b 45 10 0f 05 <48> 8b 5d f8 c9 c3 0f 1f 40 00 f3 0f 1e fa 55 48 89 e5 48 83 ec 08 RSP: 002b:00007fffdb0dccb0 EFLAGS: 00000202 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 00007fffdb0dcdc8 RCX: 00007f8911fcea06 RDX: 0000000000000001 RSI: 00007f8911a5e060 RDI: 0000000000000006 RBP: 00007fffdb0dccd0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000202 R12: 00007f89119a7d20 R13: ffffffffc4653600 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 315: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:60 (discriminator 1) mm/kasan/common.c:69 (discriminator 1)) __kasan_slab_alloc (mm/kasan/common.c:348) kmem_cache_alloc_node_noprof (./include/linux/kasan.h:250 mm/slub.c:4148 mm/slub.c:4197 mm/slub.c:4249) __alloc_skb (net/core/skbuff.c:660 (discriminator 4)) alloc_skb_with_frags (./include/linux/skbuff.h:1336 net/core/skbuff.c:6668) sock_alloc_send_pskb (net/core/sock.c:2993) unix_stream_sendmsg (./include/net/sock.h:1847 net/unix/af_unix.c:2256 net/unix/af_unix.c:2418) __sys_sendto (net/socket.c:712 (discriminator 20) net/socket.c:727 (discriminator 20) net/socket.c:2226 (discriminator 20)) __x64_sys_sendto (net/socket.c:2233 (discriminator 1) net/socket.c:2229 (discriminator 1) net/socket.c:2229 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Freed by task 315: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:60 (discriminator 1) mm/kasan/common.c:69 (discriminator 1)) kasan_save_free_info (mm/kasan/generic.c:579 (discriminator 1)) __kasan_slab_free (mm/kasan/common.c:271) kmem_cache_free (mm/slub.c:4643 (discriminator 3) mm/slub.c:4745 (discriminator 3)) unix_stream_read_generic (net/unix/af_unix.c:3010) unix_stream_recvmsg (net/unix/af_unix.c:3048) sock_recvmsg (net/socket.c:1063 (discriminator 20) net/socket.c:1085 (discriminator 20)) __sys_recvfrom (net/socket.c:2278) __x64_sys_recvfrom (net/socket.c:2291 (discriminator 1) net/socket.c:2287 (discriminator 1) net/socket.c:2287 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) The buggy address belongs to the object at ffff888106ef28c0 which belongs to the cache skbuff_head_cache of size 224 The buggy address is located 68 bytes inside of freed 224-byte region [ffff888106ef28c0, ffff888106ef29a0) The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff888106ef3cc0 pfn:0x106ef2 head: order:1 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x200000000000040(head|node=0|zone=2) page_type: f5(slab) raw: 0200000000000040 ffff8881001d28c0 ffffea000422fe00 0000000000000004 raw: ffff888106ef3cc0 0000000080190010 00000000f5000000 0000000000000000 head: 0200000000000040 ffff8881001d28c0 ffffea000422fe00 0000000000000004 head: ffff888106ef3cc0 0000000080190010 00000000f5000000 0000000000000000 head: 0200000000000001 ffffea00041bbc81 00000000ffffffff 00000000ffffffff head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888106ef2800: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc ffff888106ef2880: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb >ffff888106ef2900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888106ef2980: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ffff888106ef2a00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: Jann Horn Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jann Horn Link: https://patch.msgid.link/20250619041457.1132791-2-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 22e170fb5dda..5392aa53cbc8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2680,11 +2680,11 @@ struct unix_stream_read_state { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { + struct sk_buff *oob_skb, *read_skb = NULL; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; - struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); @@ -2699,9 +2699,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) oob_skb = u->oob_skb; - if (!(state->flags & MSG_PEEK)) + if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); + if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && + !unix_skb_len(oob_skb->prev)) { + read_skb = oob_skb->prev; + __skb_unlink(read_skb, &sk->sk_receive_queue); + } + } + spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); @@ -2712,6 +2719,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) mutex_unlock(&u->iolock); + consume_skb(read_skb); + if (chunk < 0) return -EFAULT; From e1ca44e85f652a6ebd657c67c394894c1fdfb403 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:56 -0700 Subject: [PATCH 255/284] af_unix: Add test for consecutive consumed OOB. Let's add a test case where consecutive concumed OOB skbs stay at the head of the queue. Without the previous patch, ioctl(SIOCATMARK) assertion fails. Before: # RUN msg_oob.no_peek.ex_oob_ex_oob_oob ... # msg_oob.c:305:ex_oob_ex_oob_oob:Expected answ[0] (0) == oob_head (1) # ex_oob_ex_oob_oob: Test terminated by assertion # FAIL msg_oob.no_peek.ex_oob_ex_oob_oob not ok 12 msg_oob.no_peek.ex_oob_ex_oob_oob After: # RUN msg_oob.no_peek.ex_oob_ex_oob_oob ... # OK msg_oob.no_peek.ex_oob_ex_oob_oob ok 12 msg_oob.no_peek.ex_oob_ex_oob_oob Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250619041457.1132791-3-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 3ed3882a93b8..918509a3f040 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -548,6 +548,29 @@ TEST_F(msg_oob, ex_oob_oob) siocatmarkpair(false); } +TEST_F(msg_oob, ex_oob_ex_oob_oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("y", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("z", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); +} + TEST_F(msg_oob, ex_oob_ahead_break) { sendpair("hello", 5, MSG_OOB); From 2a5a4841846b079b5fca5752fe94e59346fbda40 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:57 -0700 Subject: [PATCH 256/284] af_unix: Don't set -ECONNRESET for consumed OOB skb. Christian Brauner reported that even after MSG_OOB data is consumed, calling close() on the receiver socket causes the peer's recv() to return -ECONNRESET: 1. send() and recv() an OOB data. >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX, SOCK_STREAM) >>> s1.send(b'x', MSG_OOB) 1 >>> s2.recv(1, MSG_OOB) b'x' 2. close() for s2 sets ECONNRESET to s1->sk_err even though s2 consumed the OOB data >>> s2.close() >>> s1.recv(10, MSG_DONTWAIT) ... ConnectionResetError: [Errno 104] Connection reset by peer Even after being consumed, the skb holding the OOB 1-byte data stays in the recv queue to mark the OOB boundary and break recv() at that point. This must be considered while close()ing a socket. Let's skip the leading consumed OOB skb while checking the -ECONNRESET condition in unix_release_sock(). Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: Christian Brauner Closes: https://lore.kernel.org/netdev/20250529-sinkt-abfeuern-e7b08200c6b0@brauner/ Signed-off-by: Kuniyuki Iwashima Acked-by: Christian Brauner Link: https://patch.msgid.link/20250619041457.1132791-4-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5392aa53cbc8..52b155123985 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -660,6 +660,11 @@ static void unix_sock_destructor(struct sock *sk) #endif } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); @@ -694,10 +699,16 @@ static void unix_release_sock(struct sock *sk, int embrion) if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb && !unix_skb_len(skb)) + skb = skb_peek_next(skb, &sk->sk_receive_queue); +#endif unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); - if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) + if (skb || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); @@ -2661,11 +2672,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } -static unsigned int unix_skb_len(const struct sk_buff *skb) -{ - return skb->len - UNIXCB(skb).consumed; -} - struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); From 632f55fa60c481035297739ecd374d945c9b32c7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:58 -0700 Subject: [PATCH 257/284] selftest: af_unix: Add tests for -ECONNRESET. A new function resetpair() calls close() for the receiver and checks the return value from recv() on the initial sender side. Now resetpair() is added to each test case and some additional test cases. Note that TCP sets -ECONNRESET to the consumed OOB, but we have decided not to touch TCP MSG_OOB code in the past. Before: # RUN msg_oob.no_peek.ex_oob_ex_oob ... # msg_oob.c:236:ex_oob_ex_oob:AF_UNIX :Connection reset by peer # msg_oob.c:237:ex_oob_ex_oob:Expected: # msg_oob.c:239:ex_oob_ex_oob:Expected ret[0] (-1) == expected_len (0) # ex_oob_ex_oob: Test terminated by assertion # FAIL msg_oob.no_peek.ex_oob_ex_oob not ok 14 msg_oob.no_peek.ex_oob_ex_oob ... # FAILED: 36 / 48 tests passed. # Totals: pass:36 fail:12 xfail:0 xpass:0 skip:0 error:0 After: # RUN msg_oob.no_peek.ex_oob_ex_oob ... # msg_oob.c:244:ex_oob_ex_oob:AF_UNIX : # msg_oob.c:245:ex_oob_ex_oob:TCP :Connection reset by peer # OK msg_oob.no_peek.ex_oob_ex_oob ok 14 msg_oob.no_peek.ex_oob_ex_oob ... # PASSED: 48 / 48 tests passed. # Totals: pass:48 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250619041457.1132791-5-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/af_unix/msg_oob.c | 119 +++++++++++++++++- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/net/af_unix/msg_oob.c b/tools/testing/selftests/net/af_unix/msg_oob.c index 918509a3f040..b5f474969917 100644 --- a/tools/testing/selftests/net/af_unix/msg_oob.c +++ b/tools/testing/selftests/net/af_unix/msg_oob.c @@ -210,7 +210,7 @@ static void __sendpair(struct __test_metadata *_metadata, static void __recvpair(struct __test_metadata *_metadata, FIXTURE_DATA(msg_oob) *self, const char *expected_buf, int expected_len, - int buf_len, int flags) + int buf_len, int flags, bool is_sender) { int i, ret[2], recv_errno[2], expected_errno = 0; char recv_buf[2][BUF_SZ] = {}; @@ -221,7 +221,9 @@ static void __recvpair(struct __test_metadata *_metadata, errno = 0; for (i = 0; i < 2; i++) { - ret[i] = recv(self->fd[i * 2 + 1], recv_buf[i], buf_len, flags); + int index = is_sender ? i * 2 : i * 2 + 1; + + ret[i] = recv(self->fd[index], recv_buf[i], buf_len, flags); recv_errno[i] = errno; } @@ -308,6 +310,20 @@ static void __siocatmarkpair(struct __test_metadata *_metadata, ASSERT_EQ(answ[0], answ[1]); } +static void __resetpair(struct __test_metadata *_metadata, + FIXTURE_DATA(msg_oob) *self, + const FIXTURE_VARIANT(msg_oob) *variant, + bool reset) +{ + int i; + + for (i = 0; i < 2; i++) + close(self->fd[i * 2 + 1]); + + __recvpair(_metadata, self, "", reset ? -ECONNRESET : 0, 1, + variant->peek ? MSG_PEEK : 0, true); +} + #define sendpair(buf, len, flags) \ __sendpair(_metadata, self, buf, len, flags) @@ -316,9 +332,10 @@ static void __siocatmarkpair(struct __test_metadata *_metadata, if (variant->peek) \ __recvpair(_metadata, self, \ expected_buf, expected_len, \ - buf_len, (flags) | MSG_PEEK); \ + buf_len, (flags) | MSG_PEEK, false); \ __recvpair(_metadata, self, \ - expected_buf, expected_len, buf_len, flags); \ + expected_buf, expected_len, \ + buf_len, flags, false); \ } while (0) #define epollpair(oob_remaining) \ @@ -330,6 +347,9 @@ static void __siocatmarkpair(struct __test_metadata *_metadata, #define setinlinepair() \ __setinlinepair(_metadata, self) +#define resetpair(reset) \ + __resetpair(_metadata, self, variant, reset) + #define tcp_incompliant \ for (self->tcp_compliant = false; \ self->tcp_compliant == false; \ @@ -344,6 +364,21 @@ TEST_F(msg_oob, non_oob) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(true); +} + +TEST_F(msg_oob, non_oob_no_reset) +{ + sendpair("x", 1, 0); + epollpair(false); + siocatmarkpair(false); + + recvpair("x", 1, 1, 0); + epollpair(false); + siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob) @@ -355,6 +390,19 @@ TEST_F(msg_oob, oob) recvpair("x", 1, 1, MSG_OOB); epollpair(false); siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } +} + +TEST_F(msg_oob, oob_reset) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + resetpair(true); } TEST_F(msg_oob, oob_drop) @@ -370,6 +418,8 @@ TEST_F(msg_oob, oob_drop) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob_ahead) @@ -385,6 +435,10 @@ TEST_F(msg_oob, oob_ahead) recvpair("hell", 4, 4, 0); epollpair(false); siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } } TEST_F(msg_oob, oob_break) @@ -403,6 +457,8 @@ TEST_F(msg_oob, oob_break) recvpair("", -EAGAIN, 1, 0); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob_ahead_break) @@ -426,6 +482,8 @@ TEST_F(msg_oob, oob_ahead_break) recvpair("world", 5, 5, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, oob_break_drop) @@ -449,6 +507,8 @@ TEST_F(msg_oob, oob_break_drop) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, ex_oob_break) @@ -476,6 +536,8 @@ TEST_F(msg_oob, ex_oob_break) recvpair("ld", 2, 2, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, ex_oob_drop) @@ -498,6 +560,8 @@ TEST_F(msg_oob, ex_oob_drop) epollpair(false); siocatmarkpair(true); } + + resetpair(false); } TEST_F(msg_oob, ex_oob_drop_2) @@ -523,6 +587,8 @@ TEST_F(msg_oob, ex_oob_drop_2) epollpair(false); siocatmarkpair(true); } + + resetpair(false); } TEST_F(msg_oob, ex_oob_oob) @@ -546,6 +612,31 @@ TEST_F(msg_oob, ex_oob_oob) recvpair("", -EINVAL, 1, MSG_OOB); epollpair(false); siocatmarkpair(false); + + resetpair(false); +} + +TEST_F(msg_oob, ex_oob_ex_oob) +{ + sendpair("x", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("x", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + sendpair("y", 1, MSG_OOB); + epollpair(true); + siocatmarkpair(true); + + recvpair("y", 1, 1, MSG_OOB); + epollpair(false); + siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } } TEST_F(msg_oob, ex_oob_ex_oob_oob) @@ -599,6 +690,10 @@ TEST_F(msg_oob, ex_oob_ahead_break) recvpair("d", 1, 1, MSG_OOB); epollpair(false); siocatmarkpair(true); + + tcp_incompliant { + resetpair(false); /* TCP sets -ECONNRESET for ex-OOB. */ + } } TEST_F(msg_oob, ex_oob_siocatmark) @@ -618,6 +713,8 @@ TEST_F(msg_oob, ex_oob_siocatmark) recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ epollpair(true); siocatmarkpair(false); + + resetpair(true); } TEST_F(msg_oob, inline_oob) @@ -635,6 +732,8 @@ TEST_F(msg_oob, inline_oob) recvpair("x", 1, 1, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_oob_break) @@ -656,6 +755,8 @@ TEST_F(msg_oob, inline_oob_break) recvpair("o", 1, 1, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_oob_ahead_break) @@ -684,6 +785,8 @@ TEST_F(msg_oob, inline_oob_ahead_break) epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_break) @@ -709,6 +812,8 @@ TEST_F(msg_oob, inline_ex_oob_break) recvpair("rld", 3, 3, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_no_drop) @@ -730,6 +835,8 @@ TEST_F(msg_oob, inline_ex_oob_no_drop) recvpair("y", 1, 1, 0); epollpair(false); siocatmarkpair(false); + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_drop) @@ -754,6 +861,8 @@ TEST_F(msg_oob, inline_ex_oob_drop) epollpair(false); siocatmarkpair(false); } + + resetpair(false); } TEST_F(msg_oob, inline_ex_oob_siocatmark) @@ -775,6 +884,8 @@ TEST_F(msg_oob, inline_ex_oob_siocatmark) recvpair("hell", 4, 4, 0); /* Intentionally stop at ex-OOB. */ epollpair(true); siocatmarkpair(false); + + resetpair(true); } TEST_HARNESS_MAIN From 93598167dcb6351ba40449d994244696168f1094 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Tue, 24 Jun 2025 10:14:27 +0300 Subject: [PATCH 258/284] wifi: iwlwifi: mvm: assume '1' as the default mac_config_cmd version Unfortunately, FWs of some devices don't have the version of the iwl_mac_config_cmd defined in the TLVs. We send 0 as the 'def argument to iwl_fw_lookup_cmd_ver, so for such FWs, the return value will be 0, leading to a warning, and to not sending the command. Fix this by assuming that the default version is 1. Fixes: 83f3ac2848b4 ("wifi: iwlwifi: Fix incorrect logic on cmd_ver range checking") Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250624071427.2662621-1-miriam.rachel.korenblit@intel.com --- drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c index 3c255ae916c8..3f8b840871d3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mld-mac.c @@ -32,9 +32,9 @@ static void iwl_mvm_mld_mac_ctxt_cmd_common(struct iwl_mvm *mvm, unsigned int link_id; int cmd_ver = iwl_fw_lookup_cmd_ver(mvm->fw, WIDE_ID(MAC_CONF_GROUP, - MAC_CONFIG_CMD), 0); + MAC_CONFIG_CMD), 1); - if (WARN_ON(cmd_ver < 1 || cmd_ver > 3)) + if (WARN_ON(cmd_ver > 3)) return; cmd->id_and_color = cpu_to_le32(mvmvif->id); From d87c3ca0f8f1ca4c25f2ed819e954952f4d8d709 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 24 Jun 2025 13:07:49 +0200 Subject: [PATCH 259/284] wifi: mac80211: finish link init before RCU publish Since the link/conf pointers can be accessed without any protection other than RCU, make sure the data is actually set up before publishing the structures. Fixes: b2e8434f1829 ("wifi: mac80211: set up/tear down client vif links properly") Link: https://patch.msgid.link/20250624130749.9a308b713c74.I4a80f5eead112a38730939ea591d2e275c721256@changeid Signed-off-by: Johannes Berg --- net/mac80211/link.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mac80211/link.c b/net/mac80211/link.c index d40c2bd3b50b..4f7b7d0f64f2 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -93,9 +93,6 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, if (link_id < 0) link_id = 0; - rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); - rcu_assign_pointer(sdata->link[link_id], link); - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { struct ieee80211_sub_if_data *ap_bss; struct ieee80211_bss_conf *ap_bss_conf; @@ -145,6 +142,9 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_link_debugfs_add(link); } + + rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); + rcu_assign_pointer(sdata->link[link_id], link); } void ieee80211_link_stop(struct ieee80211_link_data *link) From 0748e553df0225754c316a92af3a77fdc057b358 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 24 Jun 2025 10:25:04 -0400 Subject: [PATCH 260/284] userns and mnt_idmap leak in open_tree_attr(2) Once want_mount_setattr() has returned a positive, it does require finish_mount_kattr() to release ->mnt_userns. Failing do_mount_setattr() does not change that. As the result, we can end up leaking userns and possibly mnt_idmap as well. Fixes: c4a16820d901 ("fs: add open_tree_attr()") Reviewed-by: Christian Brauner Signed-off-by: Al Viro --- fs/namespace.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index eed83254492f..54c59e091919 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -5307,16 +5307,12 @@ SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, kattr.kflags |= MOUNT_KATTR_RECURSE; ret = wants_mount_setattr(uattr, usize, &kattr); - if (ret < 0) - return ret; - - if (ret) { + if (ret > 0) { ret = do_mount_setattr(&file->f_path, &kattr); - if (ret) - return ret; - finish_mount_kattr(&kattr); } + if (ret) + return ret; } fd = get_unused_fd_flags(flags & O_CLOEXEC); From b07f349d1864abe29436f45e3047da2bdd476462 Mon Sep 17 00:00:00 2001 From: Khairul Anuar Romli Date: Mon, 16 Jun 2025 09:13:53 +0800 Subject: [PATCH 261/284] spi: spi-cadence-quadspi: Fix pm runtime unbalance Having PM put sync in remove function is causing PM underflow during remove operation. This is caused by the function, runtime_pm_get_sync, not being called anywhere during the op. Ensure that calls to pm_runtime_enable()/pm_runtime_disable() and pm_runtime_get_sync()/pm_runtime_put_sync() match. echo 108d2000.spi > /sys/bus/platform/drivers/cadence-qspi/unbind [ 49.644256] Deleting MTD partitions on "108d2000.spi.0": [ 49.649575] Deleting u-boot MTD partition [ 49.684087] Deleting root MTD partition [ 49.724188] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! Continuous bind/unbind will result in an "Unbalanced pm_runtime_enable" error. Subsequent unbind attempts will return a "No such device" error, while bind attempts will return a "Resource temporarily unavailable" error. [ 47.592434] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 49.592233] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128) [ 53.232309] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 55.828550] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128) [ 57.940627] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 59.912490] cadence-qspi 108d2000.spi: detected FIFO depth (1024) different from config (128) [ 61.876243] cadence-qspi 108d2000.spi: Runtime PM usage count underflow! [ 61.883000] platform 108d2000.spi: Unbalanced pm_runtime_enable! [ 532.012270] cadence-qspi 108d2000.spi: probe with driver cadence-qspi failed1 Also, change clk_disable_unprepare() to clk_disable() since continuous bind and unbind operations will trigger a warning indicating that the clock is already unprepared. Fixes: 4892b374c9b7 ("mtd: spi-nor: cadence-quadspi: Add runtime PM support") cc: stable@vger.kernel.org # 6.6+ Signed-off-by: Khairul Anuar Romli Reviewed-by: Matthew Gerlach Link: https://patch.msgid.link/4e7a4b8aba300e629b45a04f90bddf665fbdb335.1749601877.git.khairul.anuar.romli@altera.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence-quadspi.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index fe0f122f07b0..aa1932ba17cb 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -1958,10 +1958,10 @@ static int cqspi_probe(struct platform_device *pdev) goto probe_setup_failed; } - ret = devm_pm_runtime_enable(dev); - if (ret) { - if (cqspi->rx_chan) - dma_release_channel(cqspi->rx_chan); + pm_runtime_enable(dev); + + if (cqspi->rx_chan) { + dma_release_channel(cqspi->rx_chan); goto probe_setup_failed; } @@ -1981,6 +1981,7 @@ static int cqspi_probe(struct platform_device *pdev) return 0; probe_setup_failed: cqspi_controller_enable(cqspi, 0); + pm_runtime_disable(dev); probe_reset_failed: if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); @@ -1999,7 +2000,8 @@ static void cqspi_remove(struct platform_device *pdev) if (cqspi->rx_chan) dma_release_channel(cqspi->rx_chan); - clk_disable_unprepare(cqspi->clk); + if (pm_runtime_get_sync(&pdev->dev) >= 0) + clk_disable(cqspi->clk); if (cqspi->is_jh7110) cqspi_jh7110_disable_clk(pdev, cqspi); From 22bbc1dcd0d6785fb390c41f0dd5b5e218d23bdd Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 23 Jun 2025 12:00:53 +0200 Subject: [PATCH 262/284] vsock/uapi: fix linux/vm_sockets.h userspace compilation errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a userspace application just include will fail to build with the following errors: /usr/include/linux/vm_sockets.h:182:39: error: invalid application of ‘sizeof’ to incomplete type ‘struct sockaddr’ 182 | unsigned char svm_zero[sizeof(struct sockaddr) - | ^~~~~~ /usr/include/linux/vm_sockets.h:183:39: error: ‘sa_family_t’ undeclared here (not in a function) 183 | sizeof(sa_family_t) - | Include for userspace (guarded by ifndef __KERNEL__) where `struct sockaddr` and `sa_family_t` are defined. We already do something similar in and . Fixes: d021c344051a ("VSOCK: Introduce VM Sockets") Reported-by: Daan De Meyer Signed-off-by: Stefano Garzarella Link: https://patch.msgid.link/20250623100053.40979-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/vm_sockets.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index ed07181d4eff..e05280e41522 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -17,6 +17,10 @@ #ifndef _UAPI_VM_SOCKETS_H #define _UAPI_VM_SOCKETS_H +#ifndef __KERNEL__ +#include /* for struct sockaddr and sa_family_t */ +#endif + #include #include From 9caca6ac0e26cd20efd490d8b3b2ffb1c7c00f6f Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Mon, 23 Jun 2025 09:06:38 -0700 Subject: [PATCH 263/284] bnxt: properly flush XDP redirect lists We encountered following crash when testing a XDP_REDIRECT feature in production: [56251.579676] list_add corruption. next->prev should be prev (ffff93120dd40f30), but was ffffb301ef3a6740. (next=ffff93120dd 40f30). [56251.601413] ------------[ cut here ]------------ [56251.611357] kernel BUG at lib/list_debug.c:29! [56251.621082] Oops: invalid opcode: 0000 [#1] PREEMPT SMP NOPTI [56251.632073] CPU: 111 UID: 0 PID: 0 Comm: swapper/111 Kdump: loaded Tainted: P O 6.12.33-cloudflare-2025.6. 3 #1 [56251.653155] Tainted: [P]=PROPRIETARY_MODULE, [O]=OOT_MODULE [56251.663877] Hardware name: MiTAC GC68B-B8032-G11P6-GPU/S8032GM-HE-CFR, BIOS V7.020.B10-sig 01/22/2025 [56251.682626] RIP: 0010:__list_add_valid_or_report+0x4b/0xa0 [56251.693203] Code: 0e 48 c7 c7 68 e7 d9 97 e8 42 16 fe ff 0f 0b 48 8b 52 08 48 39 c2 74 14 48 89 f1 48 c7 c7 90 e7 d9 97 48 89 c6 e8 25 16 fe ff <0f> 0b 4c 8b 02 49 39 f0 74 14 48 89 d1 48 c7 c7 e8 e7 d9 97 4c 89 [56251.725811] RSP: 0018:ffff93120dd40b80 EFLAGS: 00010246 [56251.736094] RAX: 0000000000000075 RBX: ffffb301e6bba9d8 RCX: 0000000000000000 [56251.748260] RDX: 0000000000000000 RSI: ffff9149afda0b80 RDI: ffff9149afda0b80 [56251.760349] RBP: ffff9131e49c8000 R08: 0000000000000000 R09: ffff93120dd40a18 [56251.772382] R10: ffff9159cf2ce1a8 R11: 0000000000000003 R12: ffff911a80850000 [56251.784364] R13: ffff93120fbc7000 R14: 0000000000000010 R15: ffff9139e7510e40 [56251.796278] FS: 0000000000000000(0000) GS:ffff9149afd80000(0000) knlGS:0000000000000000 [56251.809133] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [56251.819561] CR2: 00007f5e85e6f300 CR3: 00000038b85e2006 CR4: 0000000000770ef0 [56251.831365] PKRU: 55555554 [56251.838653] Call Trace: [56251.845560] [56251.851943] cpu_map_enqueue.cold+0x5/0xa [56251.860243] xdp_do_redirect+0x2d9/0x480 [56251.868388] bnxt_rx_xdp+0x1d8/0x4c0 [bnxt_en] [56251.877028] bnxt_rx_pkt+0x5f7/0x19b0 [bnxt_en] [56251.885665] ? cpu_max_write+0x1e/0x100 [56251.893510] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.902276] __bnxt_poll_work+0x190/0x340 [bnxt_en] [56251.911058] bnxt_poll+0xab/0x1b0 [bnxt_en] [56251.919041] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.927568] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.935958] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.944250] __napi_poll+0x2b/0x160 [56251.951155] bpf_trampoline_6442548651+0x79/0x123 [56251.959262] __napi_poll+0x5/0x160 [56251.966037] net_rx_action+0x3d2/0x880 [56251.973133] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.981265] ? srso_alias_return_thunk+0x5/0xfbef5 [56251.989262] ? __hrtimer_run_queues+0x162/0x2a0 [56251.996967] ? srso_alias_return_thunk+0x5/0xfbef5 [56252.004875] ? srso_alias_return_thunk+0x5/0xfbef5 [56252.012673] ? bnxt_msix+0x62/0x70 [bnxt_en] [56252.019903] handle_softirqs+0xcf/0x270 [56252.026650] irq_exit_rcu+0x67/0x90 [56252.032933] common_interrupt+0x85/0xa0 [56252.039498] [56252.044246] [56252.048935] asm_common_interrupt+0x26/0x40 [56252.055727] RIP: 0010:cpuidle_enter_state+0xb8/0x420 [56252.063305] Code: dc 01 00 00 e8 f9 79 3b ff e8 64 f7 ff ff 49 89 c5 0f 1f 44 00 00 31 ff e8 a5 32 3a ff 45 84 ff 0f 85 ae 01 00 00 fb 45 85 f6 <0f> 88 88 01 00 00 48 8b 04 24 49 63 ce 4c 89 ea 48 6b f1 68 48 29 [56252.088911] RSP: 0018:ffff93120c97fe98 EFLAGS: 00000202 [56252.096912] RAX: ffff9149afd80000 RBX: ffff9141d3a72800 RCX: 0000000000000000 [56252.106844] RDX: 00003329176c6b98 RSI: ffffffe36db3fdc7 RDI: 0000000000000000 [56252.116733] RBP: 0000000000000002 R08: 0000000000000002 R09: 000000000000004e [56252.126652] R10: ffff9149afdb30c4 R11: 071c71c71c71c71c R12: ffffffff985ff860 [56252.136637] R13: 00003329176c6b98 R14: 0000000000000002 R15: 0000000000000000 [56252.146667] ? cpuidle_enter_state+0xab/0x420 [56252.153909] cpuidle_enter+0x2d/0x40 [56252.160360] do_idle+0x176/0x1c0 [56252.166456] cpu_startup_entry+0x29/0x30 [56252.173248] start_secondary+0xf7/0x100 [56252.179941] common_startup_64+0x13e/0x141 [56252.186886] From the crash dump, we found that the cpu_map_flush_list inside redirect info is partially corrupted: its list_head->next points to itself, but list_head->prev points to a valid list of unflushed bq entries. This turned out to be a result of missed XDP flush on redirect lists. By digging in the actual source code, we found that commit 7f0a168b0441 ("bnxt_en: Add completion ring pointer in TX and RX ring structures") incorrectly overwrites the event mask for XDP_REDIRECT in bnxt_rx_xdp. We can stably reproduce this crash by returning XDP_TX and XDP_REDIRECT randomly for incoming packets in a naive XDP program. Properly propagate the XDP_REDIRECT events back fixes the crash. Fixes: a7559bc8c17c ("bnxt: support transmit and free of aggregation buffers") Tested-by: Andrew Rzeznik Signed-off-by: Yan Zhai Acked-by: Jesper Dangaard Brouer Reviewed-by: Michael Chan Reviewed-by: Andy Gospodarek Link: https://patch.msgid.link/aFl7jpCNzscumuN2@debian.debian Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2cb3185c442c..ae89a981e052 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2989,6 +2989,7 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, { struct bnxt_napi *bnapi = cpr->bnapi; u32 raw_cons = cpr->cp_raw_cons; + bool flush_xdp = false; u32 cons; int rx_pkts = 0; u8 event = 0; @@ -3042,6 +3043,8 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, else rc = bnxt_force_rx_discard(bp, cpr, &raw_cons, &event); + if (event & BNXT_REDIRECT_EVENT) + flush_xdp = true; if (likely(rc >= 0)) rx_pkts += rc; /* Increment rx_pkts when rc is -ENOMEM to count towards @@ -3066,7 +3069,7 @@ static int __bnxt_poll_work(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, } } - if (event & BNXT_REDIRECT_EVENT) { + if (flush_xdp) { xdp_do_flush(); event &= ~BNXT_REDIRECT_EVENT; } From c55c7a85e02a7bfee20a3ffebdff7cbeb41613ef Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:25 +0800 Subject: [PATCH 264/284] um: ubd: Add missing error check in start_io_thread() The subsequent call to os_set_fd_block() overwrites the previous return value. OR the two return values together to fix it. Fixes: f88f0bdfc32f ("um: UBD Improvements") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c index c5e6545f6fcf..8e8a8bf518b6 100644 --- a/arch/um/drivers/ubd_user.c +++ b/arch/um/drivers/ubd_user.c @@ -41,7 +41,7 @@ int start_io_thread(struct os_helper_thread **td_out, int *fd_out) *fd_out = fds[1]; err = os_set_fd_block(*fd_out, 0); - err = os_set_fd_block(kernel_fd, 0); + err |= os_set_fd_block(kernel_fd, 0); if (err) { printk("start_io_thread - failed to set nonblocking I/O.\n"); goto out_close; From bc4e2ae08183d9017f43bf88aa7cfdf84e76f573 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:27 +0800 Subject: [PATCH 265/284] um: vfio: Prevent duplicate device assignments Ensure devices are assigned only once. Reject subsequent requests for duplicate assignments. Fixes: a0e2cb6a9063 ("um: Add VFIO-based virtual PCI driver") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vfio_kern.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c index b51fc9888ae1..13b971a2bd43 100644 --- a/arch/um/drivers/vfio_kern.c +++ b/arch/um/drivers/vfio_kern.c @@ -570,6 +570,17 @@ static void uml_vfio_release_device(struct uml_vfio_device *dev) kfree(dev); } +static struct uml_vfio_device *uml_vfio_find_device(const char *device) +{ + struct uml_vfio_device *dev; + + list_for_each_entry(dev, ¨_vfio_devices, list) { + if (!strcmp(dev->name, device)) + return dev; + } + return NULL; +} + static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp) { struct uml_vfio_device *dev; @@ -582,6 +593,9 @@ static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *k uml_vfio_container.fd = fd; } + if (uml_vfio_find_device(device)) + return -EEXIST; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; From 8948941276024fcfb08fdb3d678867dc1f7b1c16 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:28 +0800 Subject: [PATCH 266/284] um: Use correct data source in fpregs_legacy_set() Read from the buffer pointed to by 'from' instead of '&buf', as 'buf' contains no valid data when 'ubuf' is NULL. Fixes: b1e1bd2e6943 ("um: Add helper functions to get/set state for SECCOMP") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-5-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/x86/um/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c index 3275870330fe..fae8aabad10f 100644 --- a/arch/x86/um/ptrace.c +++ b/arch/x86/um/ptrace.c @@ -161,7 +161,7 @@ static int fpregs_legacy_set(struct task_struct *target, from = kbuf; } - return um_fxsr_from_i387(fxsave, &buf); + return um_fxsr_from_i387(fxsave, from); } #endif From 2d65fc13be85c336c56af7077f08ccd3a3a15a4a Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 23 Jun 2025 19:08:29 +0800 Subject: [PATCH 267/284] um: vector: Reduce stack usage in vector_eth_configure() When compiling with clang (19.1.7), initializing *vp using a compound literal may result in excessive stack usage. Fix it by initializing the required fields of *vp individually. Without this patch: $ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0 ... 0x0000000000000540 vector_eth_configure [vector_kern.o]:1472 ... With this patch: $ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0 ... 0x0000000000000540 vector_eth_configure [vector_kern.o]:208 ... Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506221017.WtB7Usua-lkp@intel.com/ Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250623110829.314864-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 42 +++++++++++------------------------ 1 file changed, 13 insertions(+), 29 deletions(-) diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index f292e0b4ff8b..9bbbddfe866b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1625,35 +1625,19 @@ static void vector_eth_configure( device->dev = dev; - *vp = ((struct vector_private) - { - .list = LIST_HEAD_INIT(vp->list), - .dev = dev, - .unit = n, - .options = get_transport_options(def), - .rx_irq = 0, - .tx_irq = 0, - .parsed = def, - .max_packet = get_mtu(def) + ETH_HEADER_OTHER, - /* TODO - we need to calculate headroom so that ip header - * is 16 byte aligned all the time - */ - .headroom = get_headroom(def), - .form_header = NULL, - .verify_header = NULL, - .header_rxbuffer = NULL, - .header_txbuffer = NULL, - .header_size = 0, - .rx_header_size = 0, - .rexmit_scheduled = false, - .opened = false, - .transport_data = NULL, - .in_write_poll = false, - .coalesce = 2, - .req_size = get_req_size(def), - .in_error = false, - .bpf = NULL - }); + INIT_LIST_HEAD(&vp->list); + vp->dev = dev; + vp->unit = n; + vp->options = get_transport_options(def); + vp->parsed = def; + vp->max_packet = get_mtu(def) + ETH_HEADER_OTHER; + /* + * TODO - we need to calculate headroom so that ip header + * is 16 byte aligned all the time + */ + vp->headroom = get_headroom(def); + vp->coalesce = 2; + vp->req_size = get_req_size(def); dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST); INIT_WORK(&vp->reset_tx, vector_reset_tx); From fa6f092cc0a02d0fcee37e9e8172eda372a03d33 Mon Sep 17 00:00:00 2001 From: Adin Scannell Date: Tue, 24 Jun 2025 22:02:15 -0700 Subject: [PATCH 268/284] libbpf: Fix possible use-after-free for externs The `name` field in `obj->externs` points into the BTF data at initial open time. However, some functions may invalidate this after opening and before loading (e.g. `bpf_map__set_value_size`), which results in pointers into freed memory and undefined behavior. The simplest solution is to simply `strdup` these strings, similar to the `essent_name`, and free them at the same time. In order to test this path, the `global_map_resize` BPF selftest is modified slightly to ensure the presence of an extern, which causes this test to fail prior to the fix. Given there isn't an obvious API or error to test against, I opted to add this to the existing test as an aspect of the resizing feature rather than duplicate the test. Fixes: 9d0a23313b1a ("libbpf: Add capability for resizing datasec maps") Signed-off-by: Adin Scannell Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250625050215.2777374-1-amscanne@meta.com --- tools/lib/bpf/libbpf.c | 10 +++++++--- .../selftests/bpf/progs/test_global_map_resize.c | 16 ++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e9c641a2fb20..52e353368f58 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -597,7 +597,7 @@ struct extern_desc { int sym_idx; int btf_id; int sec_btf_id; - const char *name; + char *name; char *essent_name; bool is_set; bool is_weak; @@ -4259,7 +4259,9 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return ext->btf_id; } t = btf__type_by_id(obj->btf, ext->btf_id); - ext->name = btf__name_by_offset(obj->btf, t->name_off); + ext->name = strdup(btf__name_by_offset(obj->btf, t->name_off)); + if (!ext->name) + return -ENOMEM; ext->sym_idx = i; ext->is_weak = ELF64_ST_BIND(sym->st_info) == STB_WEAK; @@ -9138,8 +9140,10 @@ void bpf_object__close(struct bpf_object *obj) zfree(&obj->btf_custom_path); zfree(&obj->kconfig); - for (i = 0; i < obj->nr_extern; i++) + for (i = 0; i < obj->nr_extern; i++) { + zfree(&obj->externs[i].name); zfree(&obj->externs[i].essent_name); + } zfree(&obj->externs); obj->nr_extern = 0; diff --git a/tools/testing/selftests/bpf/progs/test_global_map_resize.c b/tools/testing/selftests/bpf/progs/test_global_map_resize.c index a3f220ba7025..ee65bad0436d 100644 --- a/tools/testing/selftests/bpf/progs/test_global_map_resize.c +++ b/tools/testing/selftests/bpf/progs/test_global_map_resize.c @@ -32,6 +32,16 @@ int my_int_last SEC(".data.array_not_last"); int percpu_arr[1] SEC(".data.percpu_arr"); +/* at least one extern is included, to ensure that a specific + * regression is tested whereby resizing resulted in a free-after-use + * bug after type information is invalidated by the resize operation. + * + * There isn't a particularly good API to test for this specific condition, + * but by having externs for the resizing tests it will cover this path. + */ +extern int LINUX_KERNEL_VERSION __kconfig; +long version_sink; + SEC("tp/syscalls/sys_enter_getpid") int bss_array_sum(void *ctx) { @@ -44,6 +54,9 @@ int bss_array_sum(void *ctx) for (size_t i = 0; i < bss_array_len; ++i) sum += array[i]; + /* see above; ensure this is not optimized out */ + version_sink = LINUX_KERNEL_VERSION; + return 0; } @@ -59,6 +72,9 @@ int data_array_sum(void *ctx) for (size_t i = 0; i < data_array_len; ++i) sum += my_array[i]; + /* see above; ensure this is not optimized out */ + version_sink = LINUX_KERNEL_VERSION; + return 0; } From 5e9388f7984a9cc7e659a105113f6ccf0aebedd0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 25 Jun 2025 17:03:55 -0400 Subject: [PATCH 269/284] selftests/bpf: adapt one more case in test_lru_map to the new target_free The below commit that updated BPF_MAP_TYPE_LRU_HASH free target, also updated tools/testing/selftests/bpf/test_lru_map to match. But that missed one case that passes with 4 cores, but fails at higher cpu counts. Update test_lru_sanity3 to also adjust its expectation of target_free. This time tested with 1, 4, 16, 64 and 384 cpu count. Fixes: d4adf1c9ee77 ("bpf: Adjust free target to avoid global starvation of LRU map") Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20250625210412.2732970-1-willemdebruijn.kernel@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_lru_map.c | 33 ++++++++++++---------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c index 4ae83f4b7fc7..0921939532c6 100644 --- a/tools/testing/selftests/bpf/test_lru_map.c +++ b/tools/testing/selftests/bpf/test_lru_map.c @@ -138,6 +138,12 @@ static int sched_next_online(int pid, int *next_to_try) return ret; } +/* Derive target_free from map_size, same as bpf_common_lru_populate */ +static unsigned int __tgt_size(unsigned int map_size) +{ + return (map_size / nr_cpus) / 2; +} + /* Inverse of how bpf_common_lru_populate derives target_free from map_size. */ static unsigned int __map_size(unsigned int tgt_free) { @@ -410,12 +416,12 @@ static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free) printf("Pass\n"); } -/* Size of the LRU map is 2*tgt_free - * It is to test the active/inactive list rotation - * Insert 1 to 2*tgt_free (+2*tgt_free keys) - * Lookup key 1 to tgt_free*3/2 - * Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys) - * => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU +/* Test the active/inactive list rotation + * + * Fill the whole map, deplete the free list. + * Reference all except the last lru->target_free elements. + * Insert lru->target_free new elements. This triggers one shrink. + * Verify that the non-referenced elements are replaced. */ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free) { @@ -434,8 +440,7 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free) assert(sched_next_online(0, &next_cpu) != -1); - batch_size = tgt_free / 2; - assert(batch_size * 2 == tgt_free); + batch_size = __tgt_size(tgt_free); map_size = tgt_free * 2; lru_map_fd = create_map(map_type, map_flags, map_size); @@ -446,23 +451,21 @@ static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free) value[0] = 1234; - /* Insert 1 to 2*tgt_free (+2*tgt_free keys) */ - end_key = 1 + (2 * tgt_free); + /* Fill the map */ + end_key = 1 + map_size; for (key = 1; key < end_key; key++) assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); - /* Lookup key 1 to tgt_free*3/2 */ - end_key = tgt_free + batch_size; + /* Reference all but the last batch_size */ + end_key = 1 + map_size - batch_size; for (key = 1; key < end_key; key++) { assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value)); assert(!bpf_map_update_elem(expected_map_fd, &key, value, BPF_NOEXIST)); } - /* Add 1+2*tgt_free to tgt_free*5/2 - * (+tgt_free/2 keys) - */ + /* Insert new batch_size: replaces the non-referenced elements */ key = 2 * tgt_free + 1; end_key = key + batch_size; for (; key < end_key; key++) { From c4890963350dcf4e9a909bae23665921fba4ad27 Mon Sep 17 00:00:00 2001 From: Thomas Fourier Date: Tue, 24 Jun 2025 08:41:47 +0200 Subject: [PATCH 270/284] atm: idt77252: Add missing `dma_map_error()` The DMA map functions can fail and should be tested for errors. Signed-off-by: Thomas Fourier Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250624064148.12815-3-fourier.thomas@gmail.com Signed-off-by: Jakub Kicinski --- drivers/atm/idt77252.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c index 1206ab764ba9..f2e91b7d79f0 100644 --- a/drivers/atm/idt77252.c +++ b/drivers/atm/idt77252.c @@ -852,6 +852,8 @@ queue_skb(struct idt77252_dev *card, struct vc_map *vc, IDT77252_PRV_PADDR(skb) = dma_map_single(&card->pcidev->dev, skb->data, skb->len, DMA_TO_DEVICE); + if (dma_mapping_error(&card->pcidev->dev, IDT77252_PRV_PADDR(skb))) + return -ENOMEM; error = -EINVAL; @@ -1857,6 +1859,8 @@ add_rx_skb(struct idt77252_dev *card, int queue, paddr = dma_map_single(&card->pcidev->dev, skb->data, skb_end_pointer(skb) - skb->data, DMA_FROM_DEVICE); + if (dma_mapping_error(&card->pcidev->dev, paddr)) + goto outpoolrm; IDT77252_PRV_PADDR(skb) = paddr; if (push_rx_skb(card, skb, queue)) { @@ -1871,6 +1875,7 @@ add_rx_skb(struct idt77252_dev *card, int queue, dma_unmap_single(&card->pcidev->dev, IDT77252_PRV_PADDR(skb), skb_end_pointer(skb) - skb->data, DMA_FROM_DEVICE); +outpoolrm: handle = IDT77252_PRV_POOL(skb); card->sbpool[POOL_QUEUE(handle)].skb[POOL_INDEX(handle)] = NULL; From 7b515f35a911fdc31fbde6531828dcd6ae9803d3 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 24 Jun 2025 17:35:12 +0100 Subject: [PATCH 271/284] net: enetc: Correct endianness handling in _enetc_rd_reg64 enetc_hw.h provides two versions of _enetc_rd_reg64. One which simply calls ioread64() when available. And another that composes the 64-bit result from ioread32() calls. In the second case the code appears to assume that each ioread32() call returns a little-endian value. However both the shift and logical or used to compose the return value would not work correctly on big endian systems if this were the case. Moreover, this is inconsistent with the first case where the return value of ioread64() is assumed to be in host byte order. It appears that the correct approach is for both versions to treat the return value of ioread*() functions as being in host byte order. And this patch corrects the ioread32()-based version to do so. This is a bug but would only manifest on big endian systems that make use of the ioread32-based implementation of _enetc_rd_reg64. While all in-tree users of this driver are little endian and make use of the ioread64-based implementation of _enetc_rd_reg64. Thus, no in-tree user of this driver is affected by this bug. Flagged by Sparse. Compile tested only. Fixes: 16eb4c85c964 ("enetc: Add ethtool statistics") Closes: https://lore.kernel.org/all/AM9PR04MB850500D3FC24FE23DEFCEA158879A@AM9PR04MB8505.eurprd04.prod.outlook.com/ Signed-off-by: Simon Horman Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250624-etnetc-le-v1-1-a73a95d96e4e@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc_hw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_hw.h b/drivers/net/ethernet/freescale/enetc/enetc_hw.h index 4098f01479bc..53e8d18c7a34 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_hw.h +++ b/drivers/net/ethernet/freescale/enetc/enetc_hw.h @@ -507,7 +507,7 @@ static inline u64 _enetc_rd_reg64(void __iomem *reg) tmp = ioread32(reg + 4); } while (high != tmp); - return le64_to_cpu((__le64)high << 32 | low); + return (u64)high << 32 | low; } #endif From 2434ccb94dfc8e036c81e9d7c77fca6dc1dc2590 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:53 -0700 Subject: [PATCH 272/284] netlink: specs: nfsd: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 13727f85b49b ("NFSD: introduce netlink stubs") Reviewed-by: Donald Hunter Reviewed-by: Jeff Layton Link: https://patch.msgid.link/20250624211002.3475021-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/nfsd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/nfsd.yaml b/Documentation/netlink/specs/nfsd.yaml index c87658114852..8d1a3c01708f 100644 --- a/Documentation/netlink/specs/nfsd.yaml +++ b/Documentation/netlink/specs/nfsd.yaml @@ -27,7 +27,7 @@ attribute-sets: name: proc type: u32 - - name: service_time + name: service-time type: s64 - name: pad @@ -139,7 +139,7 @@ operations: - prog - version - proc - - service_time + - service-time - saddr4 - daddr4 - saddr6 From 791a9ed0a40dfa70fe793dcecf950273255cbb84 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:54 -0700 Subject: [PATCH 273/284] netlink: specs: fou: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 4eb77b4ecd3c ("netlink: add a proto specification for FOU") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/fou.yaml | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Documentation/netlink/specs/fou.yaml b/Documentation/netlink/specs/fou.yaml index 0af5ab842c04..b02ab19817d3 100644 --- a/Documentation/netlink/specs/fou.yaml +++ b/Documentation/netlink/specs/fou.yaml @@ -15,7 +15,7 @@ kernel-policy: global definitions: - type: enum - name: encap_type + name: encap-type name-prefix: fou-encap- enum-name: entries: [ unspec, direct, gue ] @@ -43,26 +43,26 @@ attribute-sets: name: type type: u8 - - name: remcsum_nopartial + name: remcsum-nopartial type: flag - - name: local_v4 + name: local-v4 type: u32 - - name: local_v6 + name: local-v6 type: binary checks: min-len: 16 - - name: peer_v4 + name: peer-v4 type: u32 - - name: peer_v6 + name: peer-v6 type: binary checks: min-len: 16 - - name: peer_port + name: peer-port type: u16 byte-order: big-endian - @@ -90,12 +90,12 @@ operations: - port - ipproto - type - - remcsum_nopartial - - local_v4 - - peer_v4 - - local_v6 - - peer_v6 - - peer_port + - remcsum-nopartial + - local-v4 + - peer-v4 + - local-v6 + - peer-v6 + - peer-port - ifindex - @@ -112,11 +112,11 @@ operations: - af - ifindex - port - - peer_port - - local_v4 - - peer_v4 - - local_v6 - - peer_v6 + - peer-port + - local-v4 + - peer-v4 + - local-v6 + - peer-v6 - name: get From 07caaf875c937e5f0a262a1dbad307d08ecc8673 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:55 -0700 Subject: [PATCH 274/284] netlink: specs: ethtool: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen replaces special chars in names) but gives more uniform naming in Python. Fixes: 13e59344fb9d ("net: ethtool: add support for symmetric-xor RSS hash") Fixes: 46fb3ba95b93 ("ethtool: Add an interface for flashing transceiver modules' firmware") Reviewed-by: Kory Maincent Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ethtool.yaml | 6 +++--- tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/ethtool.yaml b/Documentation/netlink/specs/ethtool.yaml index 72a076b0e1b5..348c6ad548f5 100644 --- a/Documentation/netlink/specs/ethtool.yaml +++ b/Documentation/netlink/specs/ethtool.yaml @@ -48,7 +48,7 @@ definitions: name: started doc: The firmware flashing process has started. - - name: in_progress + name: in-progress doc: The firmware flashing process is in progress. - name: completed @@ -1422,7 +1422,7 @@ attribute-sets: name: hkey type: binary - - name: input_xfrm + name: input-xfrm type: u32 - name: start-context @@ -2238,7 +2238,7 @@ operations: - hfunc - indir - hkey - - input_xfrm + - input-xfrm dump: request: attributes: diff --git a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py index f439c434ba36..648ff50bc1c3 100755 --- a/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py +++ b/tools/testing/selftests/drivers/net/hw/rss_input_xfrm.py @@ -38,7 +38,7 @@ def test_rss_input_xfrm(cfg, ipver): raise KsftSkipEx("socket.SO_INCOMING_CPU was added in Python 3.11") input_xfrm = cfg.ethnl.rss_get( - {'header': {'dev-name': cfg.ifname}}).get('input_xfrm') + {'header': {'dev-name': cfg.ifname}}).get('input-xfrm') # Check for symmetric xor/or-xor if not input_xfrm or (input_xfrm != 1 and input_xfrm != 2): From 354592f19c7b0ac5983d56d8594a76aa3437f0a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:56 -0700 Subject: [PATCH 275/284] netlink: specs: dpll: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 3badff3a25d8 ("dpll: spec: Add Netlink spec in YAML") Reviewed-by: Vadim Fedorenko Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/dpll.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/dpll.yaml b/Documentation/netlink/specs/dpll.yaml index 8feefeae5376..f434140b538e 100644 --- a/Documentation/netlink/specs/dpll.yaml +++ b/Documentation/netlink/specs/dpll.yaml @@ -188,7 +188,7 @@ definitions: value: 10000 - type: const - name: pin-frequency-77_5-khz + name: pin-frequency-77-5-khz value: 77500 - type: const From 9407680945145cc34fafd0e9f802b03574620d43 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:57 -0700 Subject: [PATCH 276/284] netlink: specs: devlink: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 429ac6211494 ("devlink: define enum for attr types of dynamic attributes") Fixes: f2f9dd164db0 ("netlink: specs: devlink: add the remaining command to generate complete split_ops") Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 05fee1b7fe19..38ddc04f9e6d 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -38,15 +38,15 @@ definitions: - name: dsa - - name: pci_pf + name: pci-pf - - name: pci_vf + name: pci-vf - name: virtual - name: unused - - name: pci_sf + name: pci-sf - type: enum name: port-fn-state @@ -220,7 +220,7 @@ definitions: - name: flag - - name: nul_string + name: nul-string value: 10 - name: binary From e40d3d0931d2b1e86ba2ca0e9c95f90ac6dd5525 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:58 -0700 Subject: [PATCH 277/284] netlink: specs: ovs_flow: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: 93b230b549bc ("netlink: specs: add ynl spec for ovs_flow") Reviewed-by: Donald Hunter Reviewed-by: Ilya Maximets Reviewed-by: Eelco Chaudron Link: https://patch.msgid.link/20250624211002.3475021-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/ovs_flow.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/ovs_flow.yaml b/Documentation/netlink/specs/ovs_flow.yaml index 46f5d1cd8a5f..7974aa7d8905 100644 --- a/Documentation/netlink/specs/ovs_flow.yaml +++ b/Documentation/netlink/specs/ovs_flow.yaml @@ -216,7 +216,7 @@ definitions: type: struct members: - - name: nd_target + name: nd-target type: binary len: 16 byte-order: big-endian @@ -258,12 +258,12 @@ definitions: type: struct members: - - name: vlan_tpid + name: vlan-tpid type: u16 byte-order: big-endian doc: Tag protocol identifier (TPID) to push. - - name: vlan_tci + name: vlan-tci type: u16 byte-order: big-endian doc: Tag control identifier (TCI) to push. From 9e6dd4c256d0774701637b958ba682eff4991277 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:09:59 -0700 Subject: [PATCH 278/284] netlink: specs: mptcp: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: bc8aeb2045e2 ("Documentation: netlink: add a YAML spec for mptcp") Reviewed-by: Davide Caratti Reviewed-by: Donald Hunter Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250624211002.3475021-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 8 ++++---- include/uapi/linux/mptcp_pm.h | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index dfd017780d2f..fb57860fe778 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -57,21 +57,21 @@ definitions: doc: >- A new subflow has been established. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: sub-closed doc: >- A subflow has been closed. An error (copy of sk_err) could be set if an error has been detected for this subflow. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: sub-priority value: 13 doc: >- The priority of a subflow has changed. 'error' should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - daddr6, sport, dport, backup, if_idx [, error]. + daddr6, sport, dport, backup, if-idx [, error]. - name: listener-created value: 15 @@ -255,7 +255,7 @@ attribute-sets: name: timeout type: u32 - - name: if_idx + name: if-idx type: u32 - name: reset-reason diff --git a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index 84fa8a21dfd0..6ac84b2f636c 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -27,14 +27,14 @@ * token, rem_id. * @MPTCP_EVENT_SUB_ESTABLISHED: A new subflow has been established. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_CLOSED: A subflow has been closed. An error (copy of * sk_err) could be set if an error has been detected for this subflow. * Attributes: token, family, loc_id, rem_id, saddr4 | saddr6, daddr4 | - * daddr6, sport, dport, backup, if_idx [, error]. + * daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_SUB_PRIORITY: The priority of a subflow has changed. 'error' * should not be set. Attributes: token, family, loc_id, rem_id, saddr4 | - * saddr6, daddr4 | daddr6, sport, dport, backup, if_idx [, error]. + * saddr6, daddr4 | daddr6, sport, dport, backup, if-idx [, error]. * @MPTCP_EVENT_LISTENER_CREATED: A new PM listener is created. Attributes: * family, sport, saddr4 | saddr6. * @MPTCP_EVENT_LISTENER_CLOSED: A PM listener is closed. Attributes: family, From 8d7e211ea925e050882c762ddf8cf2da78856dfb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:10:00 -0700 Subject: [PATCH 279/284] netlink: specs: rt-link: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: b2f63d904e72 ("doc/netlink: Add spec for rt link messages") Reviewed-by: Jacob Keller Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-link.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/rt-link.yaml b/Documentation/netlink/specs/rt-link.yaml index b41b31eebcae..28c4cf66517c 100644 --- a/Documentation/netlink/specs/rt-link.yaml +++ b/Documentation/netlink/specs/rt-link.yaml @@ -603,7 +603,7 @@ definitions: name: optmask type: u32 - - name: if_stats_msg + name: if-stats-msg type: struct members: - @@ -2486,7 +2486,7 @@ operations: name: getstats doc: Get / dump link stats. attribute-set: stats-attrs - fixed-header: if_stats_msg + fixed-header: if-stats-msg do: request: value: 94 From eef0eaeca7fa8e358a31e89802f564451b797718 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:10:01 -0700 Subject: [PATCH 280/284] netlink: specs: tc: replace underscores with dashes in names We're trying to add a strict regexp for the name format in the spec. Underscores will not be allowed, dashes should be used instead. This makes no difference to C (codegen, if used, replaces special chars in names) but it gives more uniform naming in Python. Fixes: a1bcfde83669 ("doc/netlink/specs: Add a spec for tc") Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-10-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/tc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/netlink/specs/tc.yaml b/Documentation/netlink/specs/tc.yaml index cb7ea7d62e56..42d74c9aeb54 100644 --- a/Documentation/netlink/specs/tc.yaml +++ b/Documentation/netlink/specs/tc.yaml @@ -232,7 +232,7 @@ definitions: type: u8 doc: log(P_max / (qth-max - qth-min)) - - name: Scell_log + name: Scell-log type: u8 doc: cell size for idle damping - @@ -253,7 +253,7 @@ definitions: name: DPs type: u32 - - name: def_DP + name: def-DP type: u32 - name: grio From af852f1f1c951d43b36881302fd10d9f898cdb54 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 14:10:02 -0700 Subject: [PATCH 281/284] netlink: specs: enforce strict naming of properties Add a regexp to make sure all names which may end up being visible to the user consist of lower case characters, numbers and dashes. Underscores keep sneaking into the specs, which is not visible in the C code but makes the Python and alike inconsistent. Note that starting with a number is okay, as in C the full name will include the family name. For legacy families we can't enforce the naming in the family name or the multicast group names, as these are part of the binary uAPI of the kernel. For classic netlink we need to allow capital letters in names of struct members. TC has some structs with capitalized members. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250624211002.3475021-11-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/genetlink-legacy.yaml | 15 +++++++++------ Documentation/netlink/genetlink.yaml | 17 ++++++++++------- Documentation/netlink/netlink-raw.yaml | 18 ++++++++++++------ 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/Documentation/netlink/genetlink-legacy.yaml b/Documentation/netlink/genetlink-legacy.yaml index 4cbfe666e6f5..b29d62eefa16 100644 --- a/Documentation/netlink/genetlink-legacy.yaml +++ b/Documentation/netlink/genetlink-legacy.yaml @@ -6,6 +6,9 @@ $schema: https://json-schema.org/draft-07/schema # Common defines $defs: + name: + type: string + pattern: ^[0-9a-z-]+$ uint: type: integer minimum: 0 @@ -76,7 +79,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' header: description: For C-compatible languages, header which already defines this value. type: string @@ -103,7 +106,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' value: type: integer doc: @@ -132,7 +135,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: description: The netlink attribute type enum: [ u8, u16, u32, u64, s8, s16, s32, s64, string, binary ] @@ -169,7 +172,7 @@ properties: name: description: | Name used when referring to this space in other definitions, not used outside of the spec. - type: string + $ref: '#/$defs/name' name-prefix: description: | Prefix for the C enum name of the attributes. Default family[name]-set[name]-a- @@ -206,7 +209,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: &attr-type description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, @@ -348,7 +351,7 @@ properties: properties: name: description: Name of the operation, also defining its C enum value in uAPI. - type: string + $ref: '#/$defs/name' doc: description: Documentation for the command. type: string diff --git a/Documentation/netlink/genetlink.yaml b/Documentation/netlink/genetlink.yaml index 40efbbad76ab..7b1ec153e834 100644 --- a/Documentation/netlink/genetlink.yaml +++ b/Documentation/netlink/genetlink.yaml @@ -6,6 +6,9 @@ $schema: https://json-schema.org/draft-07/schema # Common defines $defs: + name: + type: string + pattern: ^[0-9a-z-]+$ uint: type: integer minimum: 0 @@ -29,7 +32,7 @@ additionalProperties: False properties: name: description: Name of the genetlink family. - type: string + $ref: '#/$defs/name' doc: type: string protocol: @@ -48,7 +51,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' header: description: For C-compatible languages, header which already defines this value. type: string @@ -75,7 +78,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' value: type: integer doc: @@ -96,7 +99,7 @@ properties: name: description: | Name used when referring to this space in other definitions, not used outside of the spec. - type: string + $ref: '#/$defs/name' name-prefix: description: | Prefix for the C enum name of the attributes. Default family[name]-set[name]-a- @@ -121,7 +124,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: &attr-type enum: [ unused, pad, flag, binary, uint, sint, u8, u16, u32, u64, s8, s16, s32, s64, @@ -243,7 +246,7 @@ properties: properties: name: description: Name of the operation, also defining its C enum value in uAPI. - type: string + $ref: '#/$defs/name' doc: description: Documentation for the command. type: string @@ -327,7 +330,7 @@ properties: name: description: | The name for the group, used to form the define and the value of the define. - type: string + $ref: '#/$defs/name' flags: *cmd_flags kernel-family: diff --git a/Documentation/netlink/netlink-raw.yaml b/Documentation/netlink/netlink-raw.yaml index e34bf23897fa..246fa07bccf6 100644 --- a/Documentation/netlink/netlink-raw.yaml +++ b/Documentation/netlink/netlink-raw.yaml @@ -6,6 +6,12 @@ $schema: https://json-schema.org/draft-07/schema # Common defines $defs: + name: + type: string + pattern: ^[0-9a-z-]+$ + name-cap: + type: string + pattern: ^[0-9a-zA-Z-]+$ uint: type: integer minimum: 0 @@ -71,7 +77,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' header: description: For C-compatible languages, header which already defines this value. type: string @@ -98,7 +104,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' value: type: integer doc: @@ -124,7 +130,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name-cap' type: description: | The netlink attribute type. Members of type 'binary' or 'pad' @@ -183,7 +189,7 @@ properties: name: description: | Name used when referring to this space in other definitions, not used outside of the spec. - type: string + $ref: '#/$defs/name' name-prefix: description: | Prefix for the C enum name of the attributes. Default family[name]-set[name]-a- @@ -220,7 +226,7 @@ properties: additionalProperties: False properties: name: - type: string + $ref: '#/$defs/name' type: &attr-type description: The netlink attribute type enum: [ unused, pad, flag, binary, bitfield32, @@ -408,7 +414,7 @@ properties: properties: name: description: Name of the operation, also defining its C enum value in uAPI. - type: string + $ref: '#/$defs/name' doc: description: Documentation for the command. type: string From a433791aeaea6e84df709e0b9584b9bbe040cd1c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 24 Jun 2025 14:45:00 -0700 Subject: [PATCH 282/284] atm: Release atm_dev_mutex after removing procfs in atm_dev_deregister(). syzbot reported a warning below during atm_dev_register(). [0] Before creating a new device and procfs/sysfs for it, atm_dev_register() looks up a duplicated device by __atm_dev_lookup(). These operations are done under atm_dev_mutex. However, when removing a device in atm_dev_deregister(), it releases the mutex just after removing the device from the list that __atm_dev_lookup() iterates over. So, there will be a small race window where the device does not exist on the device list but procfs/sysfs are still not removed, triggering the splat. Let's hold the mutex until procfs/sysfs are removed in atm_dev_deregister(). [0]: proc_dir_entry 'atm/atmtcp:0' already registered WARNING: CPU: 0 PID: 5919 at fs/proc/generic.c:377 proc_register+0x455/0x5f0 fs/proc/generic.c:377 Modules linked in: CPU: 0 UID: 0 PID: 5919 Comm: syz-executor284 Not tainted 6.16.0-rc2-syzkaller-00047-g52da431bf03b #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 RIP: 0010:proc_register+0x455/0x5f0 fs/proc/generic.c:377 Code: 48 89 f9 48 c1 e9 03 80 3c 01 00 0f 85 a2 01 00 00 48 8b 44 24 10 48 c7 c7 20 c0 c2 8b 48 8b b0 d8 00 00 00 e8 0c 02 1c ff 90 <0f> 0b 90 90 48 c7 c7 80 f2 82 8e e8 0b de 23 09 48 8b 4c 24 28 48 RSP: 0018:ffffc9000466fa30 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffffffff817ae248 RDX: ffff888026280000 RSI: ffffffff817ae255 RDI: 0000000000000001 RBP: ffff8880232bed48 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: ffff888076ed2140 R13: dffffc0000000000 R14: ffff888078a61340 R15: ffffed100edda444 FS: 00007f38b3b0c6c0(0000) GS:ffff888124753000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f38b3bdf953 CR3: 0000000076d58000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_create_data+0xbe/0x110 fs/proc/generic.c:585 atm_proc_dev_register+0x112/0x1e0 net/atm/proc.c:361 atm_dev_register+0x46d/0x890 net/atm/resources.c:113 atmtcp_create+0x77/0x210 drivers/atm/atmtcp.c:369 atmtcp_attach drivers/atm/atmtcp.c:403 [inline] atmtcp_ioctl+0x2f9/0xd60 drivers/atm/atmtcp.c:464 do_vcc_ioctl+0x12c/0x930 net/atm/ioctl.c:159 sock_do_ioctl+0x115/0x280 net/socket.c:1190 sock_ioctl+0x227/0x6b0 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl fs/ioctl.c:893 [inline] __x64_sys_ioctl+0x18b/0x210 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xcd/0x4c0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f38b3b74459 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f38b3b0c198 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007f38b3bfe318 RCX: 00007f38b3b74459 RDX: 0000000000000000 RSI: 0000000000006180 RDI: 0000000000000005 RBP: 00007f38b3bfe310 R08: 65732f636f72702f R09: 65732f636f72702f R10: 65732f636f72702f R11: 0000000000000246 R12: 00007f38b3bcb0ac R13: 00007f38b3b0c1a0 R14: 0000200000000200 R15: 00007f38b3bcb03b Fixes: 64bf69ddff76 ("[ATM]: deregistration removes device from atm_devs list immediately") Reported-by: syzbot+8bd335d2ad3b93e80715@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/685316de.050a0220.216029.0087.GAE@google.com/ Tested-by: syzbot+8bd335d2ad3b93e80715@syzkaller.appspotmail.com Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250624214505.570679-1-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- net/atm/resources.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/atm/resources.c b/net/atm/resources.c index 995d29e7fb13..b19d851e1f44 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -146,11 +146,10 @@ void atm_dev_deregister(struct atm_dev *dev) */ mutex_lock(&atm_dev_mutex); list_del(&dev->dev_list); - mutex_unlock(&atm_dev_mutex); - atm_dev_release_vccs(dev); atm_unregister_sysfs(dev); atm_proc_dev_deregister(dev); + mutex_unlock(&atm_dev_mutex); atm_dev_put(dev); } From 8d89661a36dd3bb8c9902cff36dc0c144dce3faf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 24 Jun 2025 11:32:58 -0700 Subject: [PATCH 283/284] net: selftests: fix TCP packet checksum The length in the pseudo header should be the length of the L3 payload AKA the L4 header+payload. The selftest code builds the packet from the lower layers up, so all the headers are pushed already when it constructs L4. We need to subtract the lower layer headers from skb->len. Fixes: 3e1e58d64c3d ("net: add generic selftest support") Signed-off-by: Jakub Kicinski Reviewed-by: Gerhard Engleder Reported-by: Oleksij Rempel Tested-by: Oleksij Rempel Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/20250624183258.3377740-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/selftests.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/selftests.c b/net/core/selftests.c index 35f807ea9952..406faf8e5f3f 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -160,8 +160,9 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; if (attr->tcp) { - thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, - ihdr->daddr, 0); + int l4len = skb->len - skb_transport_offset(skb); + + thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { From 85720e04d9af0b77f8092b12a06661a8d459d4a0 Mon Sep 17 00:00:00 2001 From: Jiawen Wu Date: Wed, 25 Jun 2025 10:39:24 +0800 Subject: [PATCH 284/284] net: libwx: fix the creation of page_pool 'rx_ring->size' means the count of ring descriptors multiplied by the size of one descriptor. When increasing the count of ring descriptors, it may exceed the limit of pool size. [ 864.209610] page_pool_create_percpu() gave up with errno -7 [ 864.209613] txgbe 0000:11:00.0: Page pool creation failed: -7 Fix to set the pool_size to the count of ring descriptors. Fixes: 850b971110b2 ("net: libwx: Allocate Rx and Tx resources") Cc: stable@vger.kernel.org Signed-off-by: Jiawen Wu Reviewed-by: Simon Horman Reviewed-by: Mina Almasry Link: https://patch.msgid.link/434C72BFB40E350A+20250625023924.21821-1-jiawenwu@trustnetic.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/wangxun/libwx/wx_lib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_lib.c b/drivers/net/ethernet/wangxun/libwx/wx_lib.c index 7f2e6cddfeb1..c57cc4f27249 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_lib.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_lib.c @@ -2623,7 +2623,7 @@ static int wx_alloc_page_pool(struct wx_ring *rx_ring) struct page_pool_params pp_params = { .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV, .order = 0, - .pool_size = rx_ring->size, + .pool_size = rx_ring->count, .nid = dev_to_node(rx_ring->dev), .dev = rx_ring->dev, .dma_dir = DMA_FROM_DEVICE,