diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 20ac5875d275..71be8045e4b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1316,8 +1316,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - vma->vm_flags &= ~VM_SOFTDIRTY; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, + vma->vm_flags & ~VM_SOFTDIRTY); vma_set_page_prot(vma); + vm_write_end(vma); } mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d22b9ec49d84..7c920059f179 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -635,8 +635,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vm_write_begin(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING); + WRITE_ONCE(vma->vm_flags, + vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING)); + vm_write_end(vma); return 0; } @@ -875,8 +878,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma = prev; else prev = vma; - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vm_write_end(vma); } mmap_write_unlock(mm); mmput(mm); @@ -1440,8 +1445,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; + vm_write_end(vma); skip: prev = vma; @@ -1611,8 +1618,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + vm_write_end(vma); skip: prev = vma; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4e3dff13eb70..5c27832b45b2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1122,6 +1122,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (mm_find_pmd(mm, address) != pmd) goto out; + vm_write_begin(vma); anon_vma_lock_write(vma->anon_vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, @@ -1159,6 +1160,7 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); + vm_write_end(vma); result = SCAN_FAIL; goto out; } @@ -1193,6 +1195,7 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); + vm_write_end(vma); *hpage = NULL; diff --git a/mm/madvise.c b/mm/madvise.c index c65a80eb387c..7a14642f9d4b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -168,7 +168,9 @@ static long madvise_behavior(struct vm_area_struct *vma, /* * vm_flags is protected by the mmap_lock held in write mode. */ - vma->vm_flags = new_flags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, new_flags); + vm_write_end(vma); out_convert_errno: /* @@ -489,9 +491,11 @@ static void madvise_cold_page_range(struct mmu_gather *tlb, .tlb = tlb, }; + vm_write_begin(vma); tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); + vm_write_end(vma); } static long madvise_cold(struct vm_area_struct *vma, @@ -522,9 +526,11 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb, .tlb = tlb, }; + vm_write_begin(vma); tlb_start_vma(tlb, vma); walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private); tlb_end_vma(tlb, vma); + vm_write_end(vma); } static inline bool can_do_pageout(struct vm_area_struct *vma) @@ -727,10 +733,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); + vm_write_begin(vma); tlb_start_vma(&tlb, vma); walk_page_range(vma->vm_mm, range.start, range.end, &madvise_free_walk_ops, &tlb); tlb_end_vma(&tlb, vma); + vm_write_end(vma); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, range.start, range.end); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1f763eda9c5c..c627bac1b374 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -406,8 +406,11 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) struct vm_area_struct *vma; mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for (vma = mm->mmap; vma; vma = vma->vm_next) { + vm_write_begin(vma); mpol_rebind_policy(vma->vm_policy, new); + vm_write_end(vma); + } mmap_write_unlock(mm); } @@ -654,9 +657,11 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; + vm_write_begin(vma); nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); + vm_write_end(vma); return nr_updated; } @@ -782,6 +787,7 @@ static int vma_replace_policy(struct vm_area_struct *vma, if (IS_ERR(new)) return PTR_ERR(new); + vm_write_begin(vma); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) @@ -789,11 +795,17 @@ static int vma_replace_policy(struct vm_area_struct *vma, } old = vma->vm_policy; - vma->vm_policy = new; /* protected by mmap_lock */ + /* + * The speculative page fault handler accesses this field without + * hodling the mmap_sem. + */ + WRITE_ONCE(vma->vm_policy, new); + vm_write_end(vma); mpol_put(old); return 0; err_out: + vm_write_end(vma); mpol_put(new); return err; } @@ -1780,23 +1792,28 @@ bool vma_migratable(struct vm_area_struct *vma) struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { - struct mempolicy *pol = NULL; + struct mempolicy *pol; - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); - } else if (vma->vm_policy) { - pol = vma->vm_policy; + if (!vma) + return NULL; - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } + if (vma->vm_ops && vma->vm_ops->get_policy) + return vma->vm_ops->get_policy(vma, addr); + + /* + * This could be called without holding the mmap_sem in the + * speculative page fault handler's path. + */ + pol = READ_ONCE(vma->vm_policy); + if (pol) { + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); } return pol; diff --git a/mm/mlock.c b/mm/mlock.c index 0e4f89c2b159..c15b5ef12d0b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -450,7 +450,9 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK); + vm_write_end(vma); while (start < end) { struct page *page; @@ -574,10 +576,11 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - - if (lock) - vma->vm_flags = newflags; - else + if (lock) { + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, newflags); + vm_write_end(vma); + } else munlock_vma_pages_range(vma, start, end); out: diff --git a/mm/mmap.c b/mm/mmap.c index f394b002f27e..d0fdcc3f4ffd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -899,17 +899,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { - vma->vm_start = start; + WRITE_ONCE(vma->vm_start, start); start_changed = true; } if (end != vma->vm_end) { - vma->vm_end = end; + WRITE_ONCE(vma->vm_end, end); end_changed = true; } - vma->vm_pgoff = pgoff; + WRITE_ONCE(vma->vm_pgoff, pgoff); if (adjust_next) { - next->vm_start += adjust_next; - next->vm_pgoff += adjust_next >> PAGE_SHIFT; + WRITE_ONCE(next->vm_start, + next->vm_start + adjust_next); + WRITE_ONCE(next->vm_pgoff, + next->vm_pgoff + (adjust_next >> PAGE_SHIFT)); } if (file) { @@ -1911,12 +1913,14 @@ unsigned long mmap_region(struct file *file, unsigned long addr, out: perf_event_mmap(vma); + vm_write_begin(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + WRITE_ONCE(vma->vm_flags, + vma->vm_flags & VM_LOCKED_CLEAR_MASK); else mm->locked_vm += (len >> PAGE_SHIFT); } @@ -1931,9 +1935,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * then new mapped in-place (which must be aimed as * a completely new data area). */ - vma->vm_flags |= VM_SOFTDIRTY; + WRITE_ONCE(vma->vm_flags, vma->vm_flags | VM_SOFTDIRTY); vma_set_page_prot(vma); + vm_write_end(vma); return addr; @@ -2590,8 +2595,8 @@ int expand_downwards(struct vm_area_struct *vma, mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); - vma->vm_start = address; - vma->vm_pgoff -= grow; + WRITE_ONCE(vma->vm_start, address); + WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow); anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); spin_unlock(&mm->page_table_lock); diff --git a/mm/mprotect.c b/mm/mprotect.c index a2a62d62a8b5..d95115a57283 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -480,12 +480,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ - vma->vm_flags = newflags; + vm_write_begin(vma); + WRITE_ONCE(vma->vm_flags, newflags); dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable ? MM_CP_DIRTY_ACCT : 0); + vm_write_end(vma); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major diff --git a/mm/swap_state.c b/mm/swap_state.c index a0429c4341e6..e797804dcc4d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -645,7 +645,11 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. + * This is needed to ensure the VMA will not be freed in our back. In the case + * of the speculative page fault handler, this cannot happen, even if we don't + * hold the mmap_sem. Callees are assumed to take care of reading VMA's fields + * using READ_ONCE() to read consistent values. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) @@ -742,9 +746,9 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { - *start = max3(lpfn, PFN_DOWN(vma->vm_start), + *start = max3(lpfn, PFN_DOWN(READ_ONCE(vma->vm_start)), PFN_DOWN(faddr & PMD_MASK)); - *end = min3(rpfn, PFN_DOWN(vma->vm_end), + *end = min3(rpfn, PFN_DOWN(READ_ONCE(vma->vm_end)), PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); }