From 400d033f5a599120089b5f0c54d14d198499af5a Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Sun, 2 Aug 2020 19:15:41 +0800 Subject: [PATCH 01/12] clocksource/drivers/h8300_timer8: Fix wrong return value in h8300_8timer_init() In the init function, if the call to of_iomap() fails, the return value is ENXIO instead of -ENXIO. Change to the right negative errno. Fixes: 691f8f878290f ("clocksource/drivers/h8300_timer8: Convert init function to return error") Cc: Daniel Lezcano Signed-off-by: Tianjia Zhang Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200802111541.5429-1-tianjia.zhang@linux.alibaba.com --- drivers/clocksource/h8300_timer8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/h8300_timer8.c b/drivers/clocksource/h8300_timer8.c index 1d740a8c42ab..47114c2a7cb5 100644 --- a/drivers/clocksource/h8300_timer8.c +++ b/drivers/clocksource/h8300_timer8.c @@ -169,7 +169,7 @@ static int __init h8300_8timer_init(struct device_node *node) return PTR_ERR(clk); } - ret = ENXIO; + ret = -ENXIO; base = of_iomap(node, 0); if (!base) { pr_err("failed to map registers for clockevent\n"); From 164805157f3c6834670afbaff563353c773131f1 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 17 Aug 2020 12:24:28 +0300 Subject: [PATCH 02/12] clocksource/drivers/timer-ti-dm: Do reset before enable Commit 6cfcd5563b4f ("clocksource/drivers/timer-ti-dm: Fix suspend and resume for am3 and am4") exposed a new issue for type2 dual mode timers on at least omap5 where the clockevent will stop when the SoC starts entering idle states during the boot. Turns out we are wrongly first enabling the system timer and then resetting it, while we must also re-enable it after reset. The current sequence leaves the timer module in a partially initialized state. This issue went unnoticed earlier with ti-sysc driver reconfiguring the timer module until we fixed the issue of ti-sysc reconfiguring system timers. Let's fix the issue by calling dmtimer_systimer_enable() from reset for both type1 and type2 timers, and switch the order of reset and enable in dmtimer_systimer_setup(). Let's also move dmtimer_systimer_enable() and dmtimer_systimer_disable() to do this without adding forward declarations. Fixes: 6cfcd5563b4f ("clocksource/drivers/timer-ti-dm: Fix suspend and resume for am3 and am4") Reported-by: H. Nikolaus Schaller" Signed-off-by: Tony Lindgren Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20200817092428.6176-1-tony@atomide.com --- drivers/clocksource/timer-ti-dm-systimer.c | 44 +++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c index f6fd1c1cc527..33b3e8aa2cc5 100644 --- a/drivers/clocksource/timer-ti-dm-systimer.c +++ b/drivers/clocksource/timer-ti-dm-systimer.c @@ -69,12 +69,33 @@ static bool dmtimer_systimer_revision1(struct dmtimer_systimer *t) return !(tidr >> 16); } +static void dmtimer_systimer_enable(struct dmtimer_systimer *t) +{ + u32 val; + + if (dmtimer_systimer_revision1(t)) + val = DMTIMER_TYPE1_ENABLE; + else + val = DMTIMER_TYPE2_ENABLE; + + writel_relaxed(val, t->base + t->sysc); +} + +static void dmtimer_systimer_disable(struct dmtimer_systimer *t) +{ + if (!dmtimer_systimer_revision1(t)) + return; + + writel_relaxed(DMTIMER_TYPE1_DISABLE, t->base + t->sysc); +} + static int __init dmtimer_systimer_type1_reset(struct dmtimer_systimer *t) { void __iomem *syss = t->base + OMAP_TIMER_V1_SYS_STAT_OFFSET; int ret; u32 l; + dmtimer_systimer_enable(t); writel_relaxed(BIT(1) | BIT(2), t->base + t->ifctrl); ret = readl_poll_timeout_atomic(syss, l, l & BIT(0), 100, DMTIMER_RESET_WAIT); @@ -88,6 +109,7 @@ static int __init dmtimer_systimer_type2_reset(struct dmtimer_systimer *t) void __iomem *sysc = t->base + t->sysc; u32 l; + dmtimer_systimer_enable(t); l = readl_relaxed(sysc); l |= BIT(0); writel_relaxed(l, sysc); @@ -336,26 +358,6 @@ static int __init dmtimer_systimer_init_clock(struct dmtimer_systimer *t, return 0; } -static void dmtimer_systimer_enable(struct dmtimer_systimer *t) -{ - u32 val; - - if (dmtimer_systimer_revision1(t)) - val = DMTIMER_TYPE1_ENABLE; - else - val = DMTIMER_TYPE2_ENABLE; - - writel_relaxed(val, t->base + t->sysc); -} - -static void dmtimer_systimer_disable(struct dmtimer_systimer *t) -{ - if (!dmtimer_systimer_revision1(t)) - return; - - writel_relaxed(DMTIMER_TYPE1_DISABLE, t->base + t->sysc); -} - static int __init dmtimer_systimer_setup(struct device_node *np, struct dmtimer_systimer *t) { @@ -409,8 +411,8 @@ static int __init dmtimer_systimer_setup(struct device_node *np, t->wakeup = regbase + _OMAP_TIMER_WAKEUP_EN_OFFSET; t->ifctrl = regbase + _OMAP_TIMER_IF_CTRL_OFFSET; - dmtimer_systimer_enable(t); dmtimer_systimer_reset(t); + dmtimer_systimer_enable(t); pr_debug("dmtimer rev %08x sysc %08x\n", readl_relaxed(t->base), readl_relaxed(t->base + t->sysc)); From bc6717d55d07110d8f3c6d31ec2af50c11b07091 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 18 Aug 2020 07:31:17 +0000 Subject: [PATCH 03/12] clocksource/drivers/timer-gx6605s: Fixup counter reload When the timer counts to the upper limit, an overflow interrupt is generated, and the count is reset with the value in the TIME_INI register. But the software expects to start counting from 0 when the count overflows, so it forces TIME_INI to 0 to solve the potential interrupt storm problem. Signed-off-by: Guo Ren Tested-by: Xu Kai Cc: Daniel Lezcano Cc: Thomas Gleixner Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/1597735877-71115-1-git-send-email-guoren@kernel.org --- drivers/clocksource/timer-gx6605s.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/timer-gx6605s.c b/drivers/clocksource/timer-gx6605s.c index 80d0939d040b..8d386adbe800 100644 --- a/drivers/clocksource/timer-gx6605s.c +++ b/drivers/clocksource/timer-gx6605s.c @@ -28,6 +28,7 @@ static irqreturn_t gx6605s_timer_interrupt(int irq, void *dev) void __iomem *base = timer_of_base(to_timer_of(ce)); writel_relaxed(GX6605S_STATUS_CLR, base + TIMER_STATUS); + writel_relaxed(0, base + TIMER_INI); ce->event_handler(ce); From a7b3474cbb2864d5500d5e4f48dd57c903975cab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 22 Sep 2020 09:58:52 +0200 Subject: [PATCH 04/12] x86/irq: Make run_on_irqstack_cond() typesafe Sami reported that run_on_irqstack_cond() requires the caller to cast functions to mismatching types, which trips indirect call Control-Flow Integrity (CFI) in Clang. Instead of disabling CFI on that function, provide proper helpers for the three call variants. The actual ASM code stays the same as that is out of reach. [ bp: Fix __run_on_irqstack() prototype to match. ] Fixes: 931b94145981 ("x86/entry: Provide helpers for executing on the irqstack") Reported-by: Nathan Chancellor Reported-by: Sami Tolvanen Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Tested-by: Sami Tolvanen Cc: Link: https://github.com/ClangBuiltLinux/linux/issues/1052 Link: https://lkml.kernel.org/r/87pn6eb5tv.fsf@nanos.tec.linutronix.de --- arch/x86/entry/common.c | 2 +- arch/x86/entry/entry_64.S | 2 + arch/x86/include/asm/idtentry.h | 2 +- arch/x86/include/asm/irq_stack.h | 71 ++++++++++++++++++++++++++++---- arch/x86/kernel/irq.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- 6 files changed, 68 insertions(+), 13 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 2f84c7ca74ea..870efeec8bda 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -299,7 +299,7 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) old_regs = set_irq_regs(regs); instrumentation_begin(); - run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs); + run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); instrumentation_begin(); set_irq_regs(old_regs); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 70dea9337816..d977079a7d02 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -682,6 +682,8 @@ SYM_CODE_END(.Lbad_gs) * rdx: Function argument (can be NULL if none) */ SYM_FUNC_START(asm_call_on_stack) +SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL) +SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL) /* * Save the frame pointer unconditionally. This allows the ORC * unwinder to handle the stack switch. diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index a43366191212..a0638640f1ed 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -242,7 +242,7 @@ __visible noinstr void func(struct pt_regs *regs) \ instrumentation_begin(); \ irq_enter_rcu(); \ kvm_set_cpu_l1tf_flush_l1d(); \ - run_on_irqstack_cond(__##func, regs, regs); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ irq_exit_rcu(); \ instrumentation_end(); \ irqentry_exit(regs, state); \ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 4ae66f097101..775816965c6a 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -12,20 +12,50 @@ static __always_inline bool irqstack_active(void) return __this_cpu_read(irq_count) != -1; } -void asm_call_on_stack(void *sp, void *func, void *arg); +void asm_call_on_stack(void *sp, void (*func)(void), void *arg); +void asm_call_sysvec_on_stack(void *sp, void (*func)(struct pt_regs *regs), + struct pt_regs *regs); +void asm_call_irq_on_stack(void *sp, void (*func)(struct irq_desc *desc), + struct irq_desc *desc); -static __always_inline void __run_on_irqstack(void *func, void *arg) +static __always_inline void __run_on_irqstack(void (*func)(void)) { void *tos = __this_cpu_read(hardirq_stack_ptr); __this_cpu_add(irq_count, 1); - asm_call_on_stack(tos - 8, func, arg); + asm_call_on_stack(tos - 8, func, NULL); + __this_cpu_sub(irq_count, 1); +} + +static __always_inline void +__run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + asm_call_sysvec_on_stack(tos - 8, func, regs); + __this_cpu_sub(irq_count, 1); +} + +static __always_inline void +__run_irq_on_irqstack(void (*func)(struct irq_desc *desc), + struct irq_desc *desc) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + asm_call_irq_on_stack(tos - 8, func, desc); __this_cpu_sub(irq_count, 1); } #else /* CONFIG_X86_64 */ static inline bool irqstack_active(void) { return false; } -static inline void __run_on_irqstack(void *func, void *arg) { } +static inline void __run_on_irqstack(void (*func)(void)) { } +static inline void __run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) { } +static inline void __run_irq_on_irqstack(void (*func)(struct irq_desc *desc), + struct irq_desc *desc) { } #endif /* !CONFIG_X86_64 */ static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) @@ -37,17 +67,40 @@ static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) return !user_mode(regs) && !irqstack_active(); } -static __always_inline void run_on_irqstack_cond(void *func, void *arg, + +static __always_inline void run_on_irqstack_cond(void (*func)(void), struct pt_regs *regs) { - void (*__func)(void *arg) = func; - lockdep_assert_irqs_disabled(); if (irq_needs_irq_stack(regs)) - __run_on_irqstack(__func, arg); + __run_on_irqstack(func); else - __func(arg); + func(); +} + +static __always_inline void +run_sysvec_on_irqstack_cond(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + if (irq_needs_irq_stack(regs)) + __run_sysvec_on_irqstack(func, regs); + else + func(regs); +} + +static __always_inline void +run_irq_on_irqstack_cond(void (*func)(struct irq_desc *desc), struct irq_desc *desc, + struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + if (irq_needs_irq_stack(regs)) + __run_irq_on_irqstack(func, desc); + else + func(desc); } #endif diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 181060247e3c..c5dd50369e2f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -227,7 +227,7 @@ static __always_inline void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { if (IS_ENABLED(CONFIG_X86_64)) - run_on_irqstack_cond(desc->handle_irq, desc, regs); + run_irq_on_irqstack_cond(desc->handle_irq, desc, regs); else __handle_irq(desc, regs); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1b4fe93a86c5..440eed558558 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -74,5 +74,5 @@ int irq_init_percpu_irqstack(unsigned int cpu) void do_softirq_own_stack(void) { - run_on_irqstack_cond(__do_softirq, NULL, NULL); + run_on_irqstack_cond(__do_softirq, NULL); } From 86a82ae0b5095ea24c55898a3f025791e7958b21 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Sep 2020 17:46:20 +0200 Subject: [PATCH 05/12] x86/ioapic: Unbreak check_timer() Several people reported in the kernel bugzilla that between v4.12 and v4.13 the magic which works around broken hardware and BIOSes to find the proper timer interrupt delivery mode stopped working for some older affected platforms which need to fall back to ExtINT delivery mode. The reason is that the core code changed to keep track of the masked and disabled state of an interrupt line more accurately to avoid the expensive hardware operations. That broke an assumption in i8259_make_irq() which invokes disable_irq_nosync(); irq_set_chip_and_handler(); enable_irq(); Up to v4.12 this worked because enable_irq() unconditionally unmasked the interrupt line, but after the state tracking improvements this is not longer the case because the IO/APIC uses lazy disabling. So the line state is unmasked which means that enable_irq() does not call into the new irq chip to unmask it. In principle this is a shortcoming of the core code, but it's more than unclear whether the core code should try to reset state. At least this cannot be done unconditionally as that would break other existing use cases where the chip type is changed, e.g. when changing the trigger type, but the callers expect the state to be preserved. As the way how check_timer() is switching the delivery modes is truly unique, the obvious fix is to simply unmask the i8259 manually after changing the mode to ExtINT delivery and switching the irq chip to the legacy PIC. Note, that the fixes tag is not really precise, but identifies the commit which broke the assumptions in the IO/APIC and i8259 code and that's the kernel version to which this needs to be backported. Fixes: bf22ff45bed6 ("genirq: Avoid unnecessary low level irq function calls") Reported-by: p_c_chan@hotmail.com Reported-by: ecm4@mail.com Reported-by: perdigao1@yahoo.com Reported-by: matzes@users.sourceforge.net Reported-by: rvelascog@gmail.com Signed-off-by: Thomas Gleixner Tested-by: p_c_chan@hotmail.com Tested-by: matzes@users.sourceforge.net Cc: stable@vger.kernel.org Link: https://bugzilla.kernel.org/show_bug.cgi?id=197769 --- arch/x86/kernel/apic/io_apic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 779a89e31c4c..21f9c7f11779 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2243,6 +2243,7 @@ static inline void __init check_timer(void) legacy_pic->init(0); legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); + legacy_pic->unmask(0); unlock_ExtINT_logic(); From 516d980f85415d76ae3d0d2a871eb20243f46c95 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 23 Sep 2020 02:48:56 +0900 Subject: [PATCH 06/12] scripts/kallsyms: skip ppc compiler stub *.long_branch.* / *.plt_branch.* PowerPC allmodconfig often fails to build as follows: LD .tmp_vmlinux.kallsyms1 KSYM .tmp_vmlinux.kallsyms1.o LD .tmp_vmlinux.kallsyms2 KSYM .tmp_vmlinux.kallsyms2.o LD .tmp_vmlinux.kallsyms3 KSYM .tmp_vmlinux.kallsyms3.o LD vmlinux SORTTAB vmlinux SYSMAP System.map Inconsistent kallsyms data Try make KALLSYMS_EXTRA_PASS=1 as a workaround make[2]: *** [../Makefile:1162: vmlinux] Error 1 Setting KALLSYMS_EXTRA_PASS=1 does not help. This is caused by the compiler inserting stubs such as *.long_branch.* and *.plt_branch.* $ powerpc-linux-nm -n .tmp_vmlinux.kallsyms2 [ snip ] c00000000210c010 t 00000075.plt_branch.da9:19 c00000000210c020 t 00000075.plt_branch.1677:5 c00000000210c030 t 00000075.long_branch.memmove c00000000210c034 t 00000075.plt_branch.9e0:5 c00000000210c044 t 00000075.plt_branch.free_initrd_mem ... Actually, the problem mentioned in scripts/link-vmlinux.sh comments; "In theory it's possible this results in even more stubs, but unlikely" is happening here, and ends up with another kallsyms step required. scripts/kallsyms.c already ignores various compiler stubs. Let's do similar to make kallsysms for PowerPC always succeed in 2 steps. Reported-by: Guenter Roeck Signed-off-by: Masahiro Yamada Tested-by: Guenter Roeck --- scripts/kallsyms.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 0096cd965332..7ecd2ccba531 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -82,6 +82,7 @@ static char *sym_name(const struct sym_entry *s) static bool is_ignored_symbol(const char *name, char type) { + /* Symbol names that exactly match to the following are ignored.*/ static const char * const ignored_symbols[] = { /* * Symbols which vary between passes. Passes 1 and 2 must have @@ -104,6 +105,7 @@ static bool is_ignored_symbol(const char *name, char type) NULL }; + /* Symbol names that begin with the following are ignored.*/ static const char * const ignored_prefixes[] = { "$", /* local symbols for ARM, MIPS, etc. */ ".LASANPC", /* s390 kasan local symbols */ @@ -113,6 +115,7 @@ static bool is_ignored_symbol(const char *name, char type) NULL }; + /* Symbol names that end with the following are ignored.*/ static const char * const ignored_suffixes[] = { "_from_arm", /* arm */ "_from_thumb", /* arm */ @@ -120,9 +123,15 @@ static bool is_ignored_symbol(const char *name, char type) NULL }; + /* Symbol names that contain the following are ignored.*/ + static const char * const ignored_matches[] = { + ".long_branch.", /* ppc stub */ + ".plt_branch.", /* ppc stub */ + NULL + }; + const char * const *p; - /* Exclude symbols which vary between passes. */ for (p = ignored_symbols; *p; p++) if (!strcmp(name, *p)) return true; @@ -138,6 +147,11 @@ static bool is_ignored_symbol(const char *name, char type) return true; } + for (p = ignored_matches; *p; p++) { + if (strstr(name, *p)) + return true; + } + if (type == 'U' || type == 'u') return true; /* exclude debugging symbols */ From e30d694c3381b565e043cf74b0bed059db1b4ac9 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 25 Sep 2020 08:21:14 -0700 Subject: [PATCH 07/12] Documentation/llvm: Fix clang target examples clang --target= is how we can specify a particular toolchain triple to be use, fix the two occurences in the documentation. Fixes: fcf1b6a35c16 ("Documentation/llvm: add documentation on building w/ Clang/LLVM") Signed-off-by: Florian Fainelli Reviewed-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Documentation/kbuild/llvm.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst index 334df758dce3..dae90c21aed3 100644 --- a/Documentation/kbuild/llvm.rst +++ b/Documentation/kbuild/llvm.rst @@ -39,10 +39,10 @@ which can help simplify cross compiling. :: ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make CC=clang ``CROSS_COMPILE`` is not used to prefix the Clang compiler binary, instead -``CROSS_COMPILE`` is used to set a command line flag: ``--target ``. For +``CROSS_COMPILE`` is used to set a command line flag: ``--target=``. For example: :: - clang --target aarch64-linux-gnu foo.c + clang --target=aarch64-linux-gnu foo.c LLVM Utilities -------------- From 008cfe4418b3dbda2ff820cdd7b1a5ce458ae444 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:57 -0400 Subject: [PATCH 08/12] mm: Introduce mm_struct.has_pinned (Commit message majorly collected from Jason Gunthorpe) Reduce the chance of false positive from page_maybe_dma_pinned() by keeping track if the mm_struct has ever been used with pin_user_pages(). This allows cases that might drive up the page ref_count to avoid any penalty from handling dma_pinned pages. Future work is planned, to provide a more sophisticated solution, likely to turn it into a real counter. For now, make it atomic_t but use it as a boolean for simplicity. Suggested-by: Jason Gunthorpe Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 10 ++++++++++ kernel/fork.c | 1 + mm/gup.c | 6 ++++++ 3 files changed, 17 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 496c3ff97cce..ed028af3cb19 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -436,6 +436,16 @@ struct mm_struct { */ atomic_t mm_count; + /** + * @has_pinned: Whether this mm has pinned any pages. This can + * be either replaced in the future by @pinned_vm when it + * becomes stable, or grow into a counter on its own. We're + * aggresive on this bit now - even if the pinned pages were + * unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm just for simplicity. + */ + atomic_t has_pinned; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..e65d8192d080 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1011,6 +1011,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); diff --git a/mm/gup.c b/mm/gup.c index 578bf5bd8bf8..dfe781d2ad4c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1255,6 +1255,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } + if (flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has @@ -2660,6 +2663,9 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, FOLL_FAST_ONLY))) return -EINVAL; + if (gup_flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); From 7a4830c380f3a8b3425f6383deff58e65b2557b5 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:58 -0400 Subject: [PATCH 09/12] mm/fork: Pass new vma pointer into copy_page_range() This prepares for the future work to trigger early cow on pinned pages during fork(). No functional change intended. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- kernel/fork.c | 2 +- mm/memory.c | 14 +++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 48614513eb66..16b799a0522c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1646,7 +1646,7 @@ struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *vma, struct vm_area_struct *new); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); diff --git a/kernel/fork.c b/kernel/fork.c index e65d8192d080..da8d360fb032 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -589,7 +589,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, mpnt, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); diff --git a/mm/memory.c b/mm/memory.c index f3eb55975902..d56178721452 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -819,6 +819,7 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; @@ -889,6 +890,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -915,7 +917,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; @@ -923,6 +925,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -949,7 +952,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; @@ -957,6 +960,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { p4d_t *src_p4d, *dst_p4d; @@ -971,14 +975,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) + struct vm_area_struct *vma, struct vm_area_struct *new) { pgd_t *src_pgd, *dst_pgd; unsigned long next; @@ -1033,7 +1037,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + vma, new, addr, next))) { ret = -ENOMEM; break; } From 70e806e4e645019102d0e09d4933654fb5fb58ce Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:25:59 -0400 Subject: [PATCH 10/12] mm: Do early cow for pinned pages during fork() for ptes This allows copy_pte_range() to do early cow if the pages were pinned on the source mm. Currently we don't have an accurate way to know whether a page is pinned or not. The only thing we have is page_maybe_dma_pinned(). However that's good enough for now. Especially, with the newly added mm->has_pinned flag to make sure we won't affect processes that never pinned any pages. It would be easier if we can do GFP_KERNEL allocation within copy_one_pte(). Unluckily, we can't because we're with the page table locks held for both the parent and child processes. So the page allocation needs to be done outside copy_one_pte(). Some trick is there in copy_present_pte(), majorly the wrprotect trick to block concurrent fast-gup. Comments in the function should explain better in place. Oleg Nesterov reported a (probably harmless) bug during review that we didn't reset entry.val properly in copy_pte_range() so that potentially there's chance to call add_swap_count_continuation() multiple times on the same swp entry. However that should be harmless since even if it happens, the same function (add_swap_count_continuation()) will return directly noticing that there're enough space for the swp counter. So instead of a standalone stable patch, it is touched up in this patch directly. Link: https://lore.kernel.org/lkml/20200914143829.GA1424636@nvidia.com/ Suggested-by: Linus Torvalds Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- mm/memory.c | 207 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 190 insertions(+), 17 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d56178721452..fcfc4ca36eba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return 0; } -static inline void +/* + * Copy a present and normal page if necessary. + * + * NOTE! The usual case is that this doesn't need to do + * anything, and can just return a positive value. That + * will let the caller know that it can just increase + * the page refcount and re-use the pte the traditional + * way. + * + * But _if_ we need to copy it because it needs to be + * pinned in the parent (and the child should get its own + * copy rather than just a reference to the same page), + * we'll do that here and return zero to let the caller + * know we're done. + * + * And if we need a pre-allocated page but don't yet have + * one, return a negative error to let the preallocation + * code know so that it can do so outside the page table + * lock. + */ +static inline int +copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, + struct vm_area_struct *vma, struct vm_area_struct *new, + unsigned long addr, int *rss, struct page **prealloc, + pte_t pte, struct page *page) +{ + struct page *new_page; + + if (!is_cow_mapping(vma->vm_flags)) + return 1; + + /* + * The trick starts. + * + * What we want to do is to check whether this page may + * have been pinned by the parent process. If so, + * instead of wrprotect the pte on both sides, we copy + * the page immediately so that we'll always guarantee + * the pinned page won't be randomly replaced in the + * future. + * + * To achieve this, we do the following: + * + * 1. Write-protect the pte if it's writable. This is + * to protect concurrent write fast-gup with + * FOLL_PIN, so that we'll fail the fast-gup with + * the write bit removed. + * + * 2. Check page_maybe_dma_pinned() to see whether this + * page may have been pinned. + * + * The order of these steps is important to serialize + * against the fast-gup code (gup_pte_range()) on the + * pte check and try_grab_compound_head(), so that + * we'll make sure either we'll capture that fast-gup + * so we'll copy the pinned page here, or we'll fail + * that fast-gup. + * + * NOTE! Even if we don't end up copying the page, + * we won't undo this wrprotect(), because the normal + * reference copy will need it anyway. + */ + if (pte_write(pte)) + ptep_set_wrprotect(src_mm, addr, src_pte); + + /* + * These are the "normally we can just copy by reference" + * checks. + */ + if (likely(!atomic_read(&src_mm->has_pinned))) + return 1; + if (likely(!page_maybe_dma_pinned(page))) + return 1; + + /* + * Uhhuh. It looks like the page might be a pinned page, + * and we actually need to copy it. Now we can set the + * source pte back to being writable. + */ + if (pte_write(pte)) + set_pte_at(src_mm, addr, src_pte, pte); + + new_page = *prealloc; + if (!new_page) + return -EAGAIN; + + /* + * We have a prealloc page, all good! Take it + * over and copy the page & arm it. + */ + *prealloc = NULL; + copy_user_highpage(new_page, page, addr, vma); + __SetPageUptodate(new_page); + page_add_new_anon_rmap(new_page, new, addr, false); + lru_cache_add_inactive_or_unevictable(new_page, new); + rss[mm_counter(new_page)]++; + + /* All done, just insert the new page copy in the child */ + pte = mk_pte(new_page, new->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), new); + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; +} + +/* + * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page + * is required to copy this pte. + */ +static inline int copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + struct vm_area_struct *new, + unsigned long addr, int *rss, struct page **prealloc) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; + page = vm_normal_page(vma, addr, pte); + if (page) { + int retval; + + retval = copy_present_page(dst_mm, src_mm, + dst_pte, src_pte, + vma, new, + addr, rss, prealloc, + pte, page); + if (retval <= 0) + return retval; + + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } + /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!(vm_flags & VM_UFFD_WP)) pte = pte_clear_uffd_wp(pte); - page = vm_normal_page(vma, addr, pte); - if (page) { - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; - } - set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; +} + +static inline struct page * +page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *new_page; + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); + if (!new_page) + return NULL; + + if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { + put_page(new_page); + return NULL; + } + cgroup_throttle_swaprate(new_page, GFP_KERNEL); + + return new_page; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; - int progress = 0; + int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; + struct page *prealloc = NULL; again: + progress = 0; init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) - return -ENOMEM; + if (!dst_pte) { + ret = -ENOMEM; + goto out; + } src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, progress += 8; continue; } - copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + /* copy_present_pte() will clear `*prealloc' if consumed */ + ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, new, addr, rss, &prealloc); + /* + * If we need a pre-allocated page for this pte, drop the + * locks, allocate, and try again. + */ + if (unlikely(ret == -EAGAIN)) + break; + if (unlikely(prealloc)) { + /* + * pre-alloc page cannot be reused by next time so as + * to strictly follow mempolicy (e.g., alloc_page_vma() + * will allocate page according to address). This + * could only happen if one pinned pte changed. + */ + put_page(prealloc); + prealloc = NULL; + } progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, cond_resched(); if (entry.val) { - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + ret = -ENOMEM; + goto out; + } + entry.val = 0; + } else if (ret) { + WARN_ON_ONCE(ret != -EAGAIN); + prealloc = page_copy_prealloc(src_mm, vma, addr); + if (!prealloc) return -ENOMEM; - progress = 0; + /* We've captured and resolved the error. Reset, try again. */ + ret = 0; } if (addr != end) goto again; - return 0; +out: + if (unlikely(prealloc)) + put_page(prealloc); + return ret; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, From d042035eaf5f9009ad927dc4d3ce848381ccdeed Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 25 Sep 2020 18:26:00 -0400 Subject: [PATCH 11/12] mm/thp: Split huge pmds/puds if they're pinned when fork() Pinned pages shouldn't be write-protected when fork() happens, because follow up copy-on-write on these pages could cause the pinned pages to be replaced by random newly allocated pages. For huge PMDs, we split the huge pmd if pinning is detected. So that future handling will be done by the PTE level (with our latest changes, each of the small pages will be copied). We can achieve this by let copy_huge_pmd() return -EAGAIN for pinned pages, so that we'll fallthrough in copy_pmd_range() and finally land the next copy_pte_range() call. Huge PUDs will be even more special - so far it does not support anonymous pages. But it can actually be done the same as the huge PMDs even if the split huge PUDs means to erase the PUD entries. It'll guarantee the follow up fault ins will remap the same pages in either parent/child later. This might not be the most efficient way, but it should be easy and clean enough. It should be fine, since we're tackling with a very rare case just to make sure userspaces that pinned some thps will still work even without MADV_DONTFORK and after they fork()ed. Signed-off-by: Peter Xu Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index faadc449cca5..da397779a6d4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1074,6 +1074,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + + /* + * If this page is a potentially pinned page, split and retry the fault + * with smaller page size. Normally this should not happen because the + * userspace should use MADV_DONTFORK upon pinned regions. This is a + * best effort that the pinned pages won't be replaced by another + * random page during the coming copy-on-write. + */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(src_page))) { + pte_free(dst_mm, pgtable); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pmd(vma, src_pmd, addr, false, NULL); + return -EAGAIN; + } + get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1177,6 +1195,16 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* No huge zero pud yet */ } + /* Please refer to comments in copy_huge_pmd() */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(pud_page(pud)))) { + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pud(vma, src_pud, addr); + return -EAGAIN; + } + pudp_set_wrprotect(src_mm, addr, src_pud); pud = pud_mkold(pud_wrprotect(pud)); set_pud_at(dst_mm, addr, dst_pud, pud); From a1b8638ba1320e6684aa98233c15255eb803fac7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Sep 2020 14:38:10 -0700 Subject: [PATCH 12/12] Linux 5.9-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2b66d3398878..992d24467ca0 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Kleptomaniac Octopus # *DOCUMENTATION*