mirror of
https://github.com/torvalds/linux.git
synced 2026-05-13 00:28:54 +02:00
When rcu_barrier() calls rcu_rdp_cpu_online() and observes a CPU off
rnp->qsmaskinitnext, it means that all accesses from the offline CPU
preceding the CPUHP_TEARDOWN_CPU are visible to RCU barrier, including
callbacks expiration and counter updates.
However interrupts can still fire after stop_machine() re-enables
interrupts and before rcutree_report_cpu_dead(). The related accesses
happening between CPUHP_TEARDOWN_CPU and rnp->qsmaskinitnext clearing
are _NOT_ guaranteed to be seen by rcu_barrier() without proper
ordering, especially when callbacks are invoked there to the end, making
rcutree_migrate_callback() bypass barrier_lock.
The following theoretical race example can make rcu_barrier() hang:
CPU 0 CPU 1
----- -----
//cpu_down()
smpboot_park_threads()
//ksoftirqd is parked now
<IRQ>
rcu_sched_clock_irq()
invoke_rcu_core()
do_softirq()
rcu_core()
rcu_do_batch()
// callback storm
// rcu_do_batch() returns
// before completing all
// of them
// do_softirq also returns early because of
// timeout. It defers to ksoftirqd but
// it's parked
</IRQ>
stop_machine()
take_cpu_down()
rcu_barrier()
spin_lock(barrier_lock)
// observes rcu_segcblist_n_cbs(&rdp->cblist) != 0
<IRQ>
do_softirq()
rcu_core()
rcu_do_batch()
//completes all pending callbacks
//smp_mb() implied _after_ callback number dec
</IRQ>
rcutree_report_cpu_dead()
rnp->qsmaskinitnext &= ~rdp->grpmask;
rcutree_migrate_callback()
// no callback, early return without locking
// barrier_lock
//observes !rcu_rdp_cpu_online(rdp)
rcu_barrier_entrain()
rcu_segcblist_entrain()
// Observe rcu_segcblist_n_cbs(rsclp) == 0
// because no barrier between reading
// rnp->qsmaskinitnext and rsclp->len
rcu_segcblist_add_len()
smp_mb__before_atomic()
// will now observe the 0 count and empty
// list, but too late, we enqueue regardless
WRITE_ONCE(rsclp->len, rsclp->len + v);
// ignored barrier callback
// rcu barrier stall...
This could be solved with a read memory barrier, enforcing the message
passing between rnp->qsmaskinitnext and rsclp->len, matching the full
memory barrier after rsclp->len addition in rcu_segcblist_add_len()
performed at the end of rcu_do_batch().
However the rcu_barrier() is complicated enough and probably doesn't
need too many more subtleties. CPU down is a slowpath and the
barrier_lock seldom contended. Solve the issue with unconditionally
locking the barrier_lock on rcutree_migrate_callbacks(). This makes sure
that either rcu_barrier() sees the empty queue or its entrained
callback will be migrated.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
|
||
|---|---|---|
| .. | ||
| bpf | ||
| cgroup | ||
| configs | ||
| debug | ||
| dma | ||
| entry | ||
| events | ||
| futex | ||
| gcov | ||
| irq | ||
| kcsan | ||
| livepatch | ||
| locking | ||
| module | ||
| power | ||
| printk | ||
| rcu | ||
| sched | ||
| time | ||
| trace | ||
| .gitignore | ||
| acct.c | ||
| async.c | ||
| audit_fsnotify.c | ||
| audit_tree.c | ||
| audit_watch.c | ||
| audit.c | ||
| audit.h | ||
| auditfilter.c | ||
| auditsc.c | ||
| backtracetest.c | ||
| bounds.c | ||
| capability.c | ||
| cfi.c | ||
| compat.c | ||
| configs.c | ||
| context_tracking.c | ||
| cpu_pm.c | ||
| cpu.c | ||
| crash_core.c | ||
| crash_reserve.c | ||
| cred.c | ||
| delayacct.c | ||
| dma.c | ||
| elfcorehdr.c | ||
| exec_domain.c | ||
| exit.c | ||
| exit.h | ||
| extable.c | ||
| fail_function.c | ||
| fork.c | ||
| freezer.c | ||
| gen_kheaders.sh | ||
| groups.c | ||
| hung_task.c | ||
| iomem.c | ||
| irq_work.c | ||
| jump_label.c | ||
| kallsyms_internal.h | ||
| kallsyms_selftest.c | ||
| kallsyms_selftest.h | ||
| kallsyms.c | ||
| kcmp.c | ||
| Kconfig.freezer | ||
| Kconfig.hz | ||
| Kconfig.kexec | ||
| Kconfig.locks | ||
| Kconfig.preempt | ||
| kcov.c | ||
| kexec_core.c | ||
| kexec_elf.c | ||
| kexec_file.c | ||
| kexec_internal.h | ||
| kexec.c | ||
| kheaders.c | ||
| kprobes.c | ||
| ksyms_common.c | ||
| ksysfs.c | ||
| kthread.c | ||
| latencytop.c | ||
| Makefile | ||
| module_signature.c | ||
| notifier.c | ||
| nsproxy.c | ||
| numa.c | ||
| padata.c | ||
| panic.c | ||
| params.c | ||
| pid_namespace.c | ||
| pid_sysctl.h | ||
| pid.c | ||
| profile.c | ||
| ptrace.c | ||
| range.c | ||
| reboot.c | ||
| regset.c | ||
| relay.c | ||
| resource_kunit.c | ||
| resource.c | ||
| rseq.c | ||
| scftorture.c | ||
| scs.c | ||
| seccomp.c | ||
| signal.c | ||
| smp.c | ||
| smpboot.c | ||
| smpboot.h | ||
| softirq.c | ||
| stackleak.c | ||
| stacktrace.c | ||
| static_call_inline.c | ||
| static_call.c | ||
| stop_machine.c | ||
| sys_ni.c | ||
| sys.c | ||
| sysctl-test.c | ||
| sysctl.c | ||
| task_work.c | ||
| taskstats.c | ||
| torture.c | ||
| tracepoint.c | ||
| tsacct.c | ||
| ucount.c | ||
| uid16.c | ||
| uid16.h | ||
| umh.c | ||
| up.c | ||
| user_namespace.c | ||
| user-return-notifier.c | ||
| user.c | ||
| usermode_driver.c | ||
| utsname_sysctl.c | ||
| utsname.c | ||
| vhost_task.c | ||
| vmcore_info.c | ||
| watch_queue.c | ||
| watchdog_buddy.c | ||
| watchdog_perf.c | ||
| watchdog.c | ||
| workqueue_internal.h | ||
| workqueue.c | ||