diff --git a/Documentation/kbuild/llvm.rst b/Documentation/kbuild/llvm.rst index 334df758dce3..dae90c21aed3 100644 --- a/Documentation/kbuild/llvm.rst +++ b/Documentation/kbuild/llvm.rst @@ -39,10 +39,10 @@ which can help simplify cross compiling. :: ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- make CC=clang ``CROSS_COMPILE`` is not used to prefix the Clang compiler binary, instead -``CROSS_COMPILE`` is used to set a command line flag: ``--target ``. For +``CROSS_COMPILE`` is used to set a command line flag: ``--target=``. For example: :: - clang --target aarch64-linux-gnu foo.c + clang --target=aarch64-linux-gnu foo.c LLVM Utilities -------------- diff --git a/Makefile b/Makefile index 3663fe2dfb32..f364c34b49e8 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 9 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Kleptomaniac Octopus # *DOCUMENTATION* diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 2f84c7ca74ea..870efeec8bda 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -299,7 +299,7 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) old_regs = set_irq_regs(regs); instrumentation_begin(); - run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs); + run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); instrumentation_begin(); set_irq_regs(old_regs); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 70dea9337816..d977079a7d02 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -682,6 +682,8 @@ SYM_CODE_END(.Lbad_gs) * rdx: Function argument (can be NULL if none) */ SYM_FUNC_START(asm_call_on_stack) +SYM_INNER_LABEL(asm_call_sysvec_on_stack, SYM_L_GLOBAL) +SYM_INNER_LABEL(asm_call_irq_on_stack, SYM_L_GLOBAL) /* * Save the frame pointer unconditionally. This allows the ORC * unwinder to handle the stack switch. 
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index a43366191212..a0638640f1ed 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -242,7 +242,7 @@ __visible noinstr void func(struct pt_regs *regs) \ instrumentation_begin(); \ irq_enter_rcu(); \ kvm_set_cpu_l1tf_flush_l1d(); \ - run_on_irqstack_cond(__##func, regs, regs); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ irq_exit_rcu(); \ instrumentation_end(); \ irqentry_exit(regs, state); \ diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h index 4ae66f097101..775816965c6a 100644 --- a/arch/x86/include/asm/irq_stack.h +++ b/arch/x86/include/asm/irq_stack.h @@ -12,20 +12,50 @@ static __always_inline bool irqstack_active(void) return __this_cpu_read(irq_count) != -1; } -void asm_call_on_stack(void *sp, void *func, void *arg); +void asm_call_on_stack(void *sp, void (*func)(void), void *arg); +void asm_call_sysvec_on_stack(void *sp, void (*func)(struct pt_regs *regs), + struct pt_regs *regs); +void asm_call_irq_on_stack(void *sp, void (*func)(struct irq_desc *desc), + struct irq_desc *desc); -static __always_inline void __run_on_irqstack(void *func, void *arg) +static __always_inline void __run_on_irqstack(void (*func)(void)) { void *tos = __this_cpu_read(hardirq_stack_ptr); __this_cpu_add(irq_count, 1); - asm_call_on_stack(tos - 8, func, arg); + asm_call_on_stack(tos - 8, func, NULL); + __this_cpu_sub(irq_count, 1); +} + +static __always_inline void +__run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + asm_call_sysvec_on_stack(tos - 8, func, regs); + __this_cpu_sub(irq_count, 1); +} + +static __always_inline void +__run_irq_on_irqstack(void (*func)(struct irq_desc *desc), + struct irq_desc *desc) +{ + void *tos = __this_cpu_read(hardirq_stack_ptr); + + __this_cpu_add(irq_count, 1); + 
asm_call_irq_on_stack(tos - 8, func, desc); __this_cpu_sub(irq_count, 1); } #else /* CONFIG_X86_64 */ static inline bool irqstack_active(void) { return false; } -static inline void __run_on_irqstack(void *func, void *arg) { } +static inline void __run_on_irqstack(void (*func)(void)) { } +static inline void __run_sysvec_on_irqstack(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) { } +static inline void __run_irq_on_irqstack(void (*func)(struct irq_desc *desc), + struct irq_desc *desc) { } #endif /* !CONFIG_X86_64 */ static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) @@ -37,17 +67,40 @@ static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs) return !user_mode(regs) && !irqstack_active(); } -static __always_inline void run_on_irqstack_cond(void *func, void *arg, + +static __always_inline void run_on_irqstack_cond(void (*func)(void), struct pt_regs *regs) { - void (*__func)(void *arg) = func; - lockdep_assert_irqs_disabled(); if (irq_needs_irq_stack(regs)) - __run_on_irqstack(__func, arg); + __run_on_irqstack(func); else - __func(arg); + func(); +} + +static __always_inline void +run_sysvec_on_irqstack_cond(void (*func)(struct pt_regs *regs), + struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + if (irq_needs_irq_stack(regs)) + __run_sysvec_on_irqstack(func, regs); + else + func(regs); +} + +static __always_inline void +run_irq_on_irqstack_cond(void (*func)(struct irq_desc *desc), struct irq_desc *desc, + struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + if (irq_needs_irq_stack(regs)) + __run_irq_on_irqstack(func, desc); + else + func(desc); } #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 779a89e31c4c..21f9c7f11779 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2243,6 +2243,7 @@ static inline void __init check_timer(void) legacy_pic->init(0); legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); + 
legacy_pic->unmask(0); unlock_ExtINT_logic(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 181060247e3c..c5dd50369e2f 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -227,7 +227,7 @@ static __always_inline void handle_irq(struct irq_desc *desc, struct pt_regs *regs) { if (IS_ENABLED(CONFIG_X86_64)) - run_on_irqstack_cond(desc->handle_irq, desc, regs); + run_irq_on_irqstack_cond(desc->handle_irq, desc, regs); else __handle_irq(desc, regs); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1b4fe93a86c5..440eed558558 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -74,5 +74,5 @@ int irq_init_percpu_irqstack(unsigned int cpu) void do_softirq_own_stack(void) { - run_on_irqstack_cond(__do_softirq, NULL, NULL); + run_on_irqstack_cond(__do_softirq, NULL); } diff --git a/drivers/clocksource/h8300_timer8.c b/drivers/clocksource/h8300_timer8.c index 1d740a8c42ab..47114c2a7cb5 100644 --- a/drivers/clocksource/h8300_timer8.c +++ b/drivers/clocksource/h8300_timer8.c @@ -169,7 +169,7 @@ static int __init h8300_8timer_init(struct device_node *node) return PTR_ERR(clk); } - ret = ENXIO; + ret = -ENXIO; base = of_iomap(node, 0); if (!base) { pr_err("failed to map registers for clockevent\n"); diff --git a/drivers/clocksource/timer-gx6605s.c b/drivers/clocksource/timer-gx6605s.c index 80d0939d040b..8d386adbe800 100644 --- a/drivers/clocksource/timer-gx6605s.c +++ b/drivers/clocksource/timer-gx6605s.c @@ -28,6 +28,7 @@ static irqreturn_t gx6605s_timer_interrupt(int irq, void *dev) void __iomem *base = timer_of_base(to_timer_of(ce)); writel_relaxed(GX6605S_STATUS_CLR, base + TIMER_STATUS); + writel_relaxed(0, base + TIMER_INI); ce->event_handler(ce); diff --git a/drivers/clocksource/timer-ti-dm-systimer.c b/drivers/clocksource/timer-ti-dm-systimer.c index f6fd1c1cc527..33b3e8aa2cc5 100644 --- a/drivers/clocksource/timer-ti-dm-systimer.c +++ b/drivers/clocksource/timer-ti-dm-systimer.c @@ -69,12 +69,33 
@@ static bool dmtimer_systimer_revision1(struct dmtimer_systimer *t) return !(tidr >> 16); } +static void dmtimer_systimer_enable(struct dmtimer_systimer *t) +{ + u32 val; + + if (dmtimer_systimer_revision1(t)) + val = DMTIMER_TYPE1_ENABLE; + else + val = DMTIMER_TYPE2_ENABLE; + + writel_relaxed(val, t->base + t->sysc); +} + +static void dmtimer_systimer_disable(struct dmtimer_systimer *t) +{ + if (!dmtimer_systimer_revision1(t)) + return; + + writel_relaxed(DMTIMER_TYPE1_DISABLE, t->base + t->sysc); +} + static int __init dmtimer_systimer_type1_reset(struct dmtimer_systimer *t) { void __iomem *syss = t->base + OMAP_TIMER_V1_SYS_STAT_OFFSET; int ret; u32 l; + dmtimer_systimer_enable(t); writel_relaxed(BIT(1) | BIT(2), t->base + t->ifctrl); ret = readl_poll_timeout_atomic(syss, l, l & BIT(0), 100, DMTIMER_RESET_WAIT); @@ -88,6 +109,7 @@ static int __init dmtimer_systimer_type2_reset(struct dmtimer_systimer *t) void __iomem *sysc = t->base + t->sysc; u32 l; + dmtimer_systimer_enable(t); l = readl_relaxed(sysc); l |= BIT(0); writel_relaxed(l, sysc); @@ -336,26 +358,6 @@ static int __init dmtimer_systimer_init_clock(struct dmtimer_systimer *t, return 0; } -static void dmtimer_systimer_enable(struct dmtimer_systimer *t) -{ - u32 val; - - if (dmtimer_systimer_revision1(t)) - val = DMTIMER_TYPE1_ENABLE; - else - val = DMTIMER_TYPE2_ENABLE; - - writel_relaxed(val, t->base + t->sysc); -} - -static void dmtimer_systimer_disable(struct dmtimer_systimer *t) -{ - if (!dmtimer_systimer_revision1(t)) - return; - - writel_relaxed(DMTIMER_TYPE1_DISABLE, t->base + t->sysc); -} - static int __init dmtimer_systimer_setup(struct device_node *np, struct dmtimer_systimer *t) { @@ -409,8 +411,8 @@ static int __init dmtimer_systimer_setup(struct device_node *np, t->wakeup = regbase + _OMAP_TIMER_WAKEUP_EN_OFFSET; t->ifctrl = regbase + _OMAP_TIMER_IF_CTRL_OFFSET; - dmtimer_systimer_enable(t); dmtimer_systimer_reset(t); + dmtimer_systimer_enable(t); pr_debug("dmtimer rev %08x sysc %08x\n", 
readl_relaxed(t->base), readl_relaxed(t->base + t->sysc)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d0c0388480c..901cca87957f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1646,7 +1646,7 @@ struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); + struct vm_area_struct *vma, struct vm_area_struct *new); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9103b83eb1b0..2e58e45d867f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -443,6 +443,16 @@ struct mm_struct { */ atomic_t mm_count; + /** + * @has_pinned: Whether this mm has pinned any pages. This can + * be either replaced in the future by @pinned_vm when it + * becomes stable, or grow into a counter on its own. We're + * aggressive on this bit now - even if the pinned pages were + * unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm just for simplicity.
+ */ + atomic_t has_pinned; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index f4db5653594f..c9dc66faf1be 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -591,7 +591,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(mm, oldmm, mpnt, tmp); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -1013,6 +1013,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; + atomic_set(&mm->has_pinned, 0); atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); diff --git a/mm/gup.c b/mm/gup.c index 578bf5bd8bf8..dfe781d2ad4c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1255,6 +1255,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } + if (flags & FOLL_PIN) + atomic_set(&current->mm->has_pinned, 1); + /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has @@ -2660,6 +2663,9 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, FOLL_FAST_ONLY))) return -EINVAL; + if (gup_flags & FOLL_PIN) + atomic_set(&current->mm->has_pinned, 1); + if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index faadc449cca5..da397779a6d4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1074,6 +1074,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + + /* + * If this page is a potentially pinned page, split and retry the fault + * with smaller page size.
Normally this should not happen because the + * userspace should use MADV_DONTFORK upon pinned regions. This is a + * best effort that the pinned pages won't be replaced by another + * random page during the coming copy-on-write. + */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(src_page))) { + pte_free(dst_mm, pgtable); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pmd(vma, src_pmd, addr, false, NULL); + return -EAGAIN; + } + get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1177,6 +1195,16 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* No huge zero pud yet */ } + /* Please refer to comments in copy_huge_pmd() */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(pud_page(pud)))) { + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pud(vma, src_pud, addr); + return -EAGAIN; + } + pudp_set_wrprotect(src_mm, addr, src_pud); pud = pud_mkold(pud_wrprotect(pud)); set_pud_at(dst_mm, addr, dst_pud, pud); diff --git a/mm/memory.c b/mm/memory.c index 983b08f8dfb7..0fb7eba2dc6e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -786,15 +786,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return 0; } -static inline void +/* + * Copy a present and normal page if necessary. + * + * NOTE! The usual case is that this doesn't need to do + * anything, and can just return a positive value. That + * will let the caller know that it can just increase + * the page refcount and re-use the pte the traditional + * way. + * + * But _if_ we need to copy it because it needs to be + * pinned in the parent (and the child should get its own + * copy rather than just a reference to the same page), + * we'll do that here and return zero to let the caller + * know we're done. 
+ * + * And if we need a pre-allocated page but don't yet have + * one, return a negative error to let the preallocation + * code know so that it can do so outside the page table + * lock. + */ +static inline int +copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, + struct vm_area_struct *vma, struct vm_area_struct *new, + unsigned long addr, int *rss, struct page **prealloc, + pte_t pte, struct page *page) +{ + struct page *new_page; + + if (!is_cow_mapping(vma->vm_flags)) + return 1; + + /* + * The trick starts. + * + * What we want to do is to check whether this page may + * have been pinned by the parent process. If so, + * instead of wrprotect the pte on both sides, we copy + * the page immediately so that we'll always guarantee + * the pinned page won't be randomly replaced in the + * future. + * + * To achieve this, we do the following: + * + * 1. Write-protect the pte if it's writable. This is + * to protect concurrent write fast-gup with + * FOLL_PIN, so that we'll fail the fast-gup with + * the write bit removed. + * + * 2. Check page_maybe_dma_pinned() to see whether this + * page may have been pinned. + * + * The order of these steps is important to serialize + * against the fast-gup code (gup_pte_range()) on the + * pte check and try_grab_compound_head(), so that + * we'll make sure either we'll capture that fast-gup + * so we'll copy the pinned page here, or we'll fail + * that fast-gup. + * + * NOTE! Even if we don't end up copying the page, + * we won't undo this wrprotect(), because the normal + * reference copy will need it anyway. + */ + if (pte_write(pte)) + ptep_set_wrprotect(src_mm, addr, src_pte); + + /* + * These are the "normally we can just copy by reference" + * checks. + */ + if (likely(!atomic_read(&src_mm->has_pinned))) + return 1; + if (likely(!page_maybe_dma_pinned(page))) + return 1; + + /* + * Uhhuh. 
It looks like the page might be a pinned page, + * and we actually need to copy it. Now we can set the + * source pte back to being writable. + */ + if (pte_write(pte)) + set_pte_at(src_mm, addr, src_pte, pte); + + new_page = *prealloc; + if (!new_page) + return -EAGAIN; + + /* + * We have a prealloc page, all good! Take it + * over and copy the page & arm it. + */ + *prealloc = NULL; + copy_user_highpage(new_page, page, addr, vma); + __SetPageUptodate(new_page); + page_add_new_anon_rmap(new_page, new, addr, false); + lru_cache_add_inactive_or_unevictable(new_page, new); + rss[mm_counter(new_page)]++; + + /* All done, just insert the new page copy in the child */ + pte = mk_pte(new_page, new->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), new); + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; +} + +/* + * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page + * is required to copy this pte. + */ +static inline int copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) + struct vm_area_struct *new, + unsigned long addr, int *rss, struct page **prealloc) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; + page = vm_normal_page(vma, addr, pte); + if (page) { + int retval; + + retval = copy_present_page(dst_mm, src_mm, + dst_pte, src_pte, + vma, new, + addr, rss, prealloc, + pte, page); + if (retval <= 0) + return retval; + + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } + /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -820,33 +947,51 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!(vm_flags & VM_UFFD_WP)) pte = pte_clear_uffd_wp(pte); - page = vm_normal_page(vma, addr, pte); - if (page) { - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; - } - set_pte_at(dst_mm, addr, dst_pte, 
pte); + return 0; +} + +static inline struct page * +page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *new_page; + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); + if (!new_page) + return NULL; + + if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { + put_page(new_page); + return NULL; + } + cgroup_throttle_swaprate(new_page, GFP_KERNEL); + + return new_page; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; - int progress = 0; + int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; + struct page *prealloc = NULL; again: + progress = 0; init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) - return -ENOMEM; + if (!dst_pte) { + ret = -ENOMEM; + goto out; + } src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -878,8 +1023,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, progress += 8; continue; } - copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, - vma, addr, rss); + /* copy_present_pte() will clear `*prealloc' if consumed */ + ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, new, addr, rss, &prealloc); + /* + * If we need a pre-allocated page for this pte, drop the + * locks, allocate, and try again. + */ + if (unlikely(ret == -EAGAIN)) + break; + if (unlikely(prealloc)) { + /* + * pre-alloc page cannot be reused by next time so as + * to strictly follow mempolicy (e.g., alloc_page_vma() + * will allocate page according to address). This + * could only happen if one pinned pte changed. 
+ */ + put_page(prealloc); + prealloc = NULL; + } progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -891,17 +1053,30 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, cond_resched(); if (entry.val) { - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + ret = -ENOMEM; + goto out; + } + entry.val = 0; + } else if (ret) { + WARN_ON_ONCE(ret != -EAGAIN); + prealloc = page_copy_prealloc(src_mm, vma, addr); + if (!prealloc) return -ENOMEM; - progress = 0; + /* We've captured and resolved the error. Reset, try again. */ + ret = 0; } if (addr != end) goto again; - return 0; +out: + if (unlikely(prealloc)) + put_page(prealloc); + return ret; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -928,7 +1103,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; @@ -936,6 +1111,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -962,7 +1138,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr 
!= end); return 0; @@ -970,6 +1146,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { p4d_t *src_p4d, *dst_p4d; @@ -984,14 +1161,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) + struct vm_area_struct *vma, struct vm_area_struct *new) { pgd_t *src_pgd, *dst_pgd; unsigned long next; @@ -1046,7 +1223,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + vma, new, addr, next))) { ret = -ENOMEM; break; } diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 0096cd965332..7ecd2ccba531 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -82,6 +82,7 @@ static char *sym_name(const struct sym_entry *s) static bool is_ignored_symbol(const char *name, char type) { + /* Symbol names that exactly match to the following are ignored.*/ static const char * const ignored_symbols[] = { /* * Symbols which vary between passes. Passes 1 and 2 must have @@ -104,6 +105,7 @@ static bool is_ignored_symbol(const char *name, char type) NULL }; + /* Symbol names that begin with the following are ignored.*/ static const char * const ignored_prefixes[] = { "$", /* local symbols for ARM, MIPS, etc. 
*/ ".LASANPC", /* s390 kasan local symbols */ @@ -113,6 +115,7 @@ static bool is_ignored_symbol(const char *name, char type) NULL }; + /* Symbol names that end with the following are ignored.*/ static const char * const ignored_suffixes[] = { "_from_arm", /* arm */ "_from_thumb", /* arm */ @@ -120,9 +123,15 @@ static bool is_ignored_symbol(const char *name, char type) NULL }; + /* Symbol names that contain the following are ignored.*/ + static const char * const ignored_matches[] = { + ".long_branch.", /* ppc stub */ + ".plt_branch.", /* ppc stub */ + NULL + }; + const char * const *p; - /* Exclude symbols which vary between passes. */ for (p = ignored_symbols; *p; p++) if (!strcmp(name, *p)) return true; @@ -138,6 +147,11 @@ static bool is_ignored_symbol(const char *name, char type) return true; } + for (p = ignored_matches; *p; p++) { + if (strstr(name, *p)) + return true; + } + if (type == 'U' || type == 'u') return true; /* exclude debugging symbols */