From 5eca9e121fc60c890666d5b47a4b69715b940bb0 Mon Sep 17 00:00:00 2001
From: Laurent Dufour <ldufour@linux.ibm.com>
Date: Fri, 18 Jan 2019 16:19:08 +0100
Subject: [PATCH] FROMLIST: mm: protect against PTE changes done by dup_mmap()

Vinayak Menon and Ganesh Mahendran reported that the following scenario may
lead to threads being blocked due to data corruption:

    CPU 1                   CPU 2                    CPU 3
    Process 1,              Process 1,               Process 1,
    Thread A                Thread B                 Thread C

    while (1) {             while (1) {              while(1) {
    pthread_mutex_lock(l)   pthread_mutex_lock(l)    fork
    pthread_mutex_unlock(l) pthread_mutex_unlock(l)  }
    }                       }

In detail, this happens because:

    CPU 1                CPU 2                       CPU 3
    fork()
    copy_pte_range()
      set PTE rdonly
    got to next VMA...
     .                   PTE is seen rdonly          PTE still writable
     .                   thread is writing to page
     .                   -> page fault
     .                     copy the page             Thread writes to page
     .                      .                        -> no page fault
     .                     update the PTE
     .                     flush TLB for that PTE
   flush TLB                                        PTE are now rdonly

So the write done by CPU 3 interferes with the page copy operation done by
CPU 2, leading to data corruption.

To avoid this, we mark all the VMAs involved in the COW mechanism as
changing by calling vm_write_begin(). This ensures that the speculative page
fault handler will not try to handle a fault on these pages.
The marker stays set until the TLB is flushed, ensuring that all the CPUs
will now see the PTE as not writable.
Once the TLB is flushed, the marker is removed by calling vm_write_end().

The variable last is used to keep track of the latest marked VMA, in order
to handle the error path where only some of the VMAs may have been marked.

Change-Id: I3fe07109e27d8f77c9b435053567fe5c287703aa
Reported-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Reported-by: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
Link: https://www.spinics.net/lists/linux-mm/msg171207.html
Bug: 161210518
Signed-off-by: Charan Teja Reddy <charante@codeaurora.org>
Signed-off-by: Vinayak Menon <vinmenon@codeaurora.org>
---
 kernel/fork.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 5f4e4ce703c5..ab07abfe537a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -472,7 +472,7 @@ EXPORT_SYMBOL(free_task);
 static __latent_entropy int dup_mmap(struct mm_struct *mm,
 					struct mm_struct *oldmm)
 {
-	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+	struct vm_area_struct *mpnt, *tmp, *prev, **pprev, *last = NULL;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
@@ -592,8 +592,18 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		if (!(tmp->vm_flags & VM_WIPEONFORK))
+		if (!(tmp->vm_flags & VM_WIPEONFORK)) {
+			if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
+				/*
+				 * Mark this VMA as changing to prevent the
+				 * speculative page fault handler from
+				 * processing it until the TLB is flushed below.
+				 */
+				last = mpnt;
+				vm_write_begin(mpnt);
+			}
 			retval = copy_page_range(tmp, mpnt);
+		}
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
@@ -606,6 +616,22 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
 out:
 	mmap_write_unlock(mm);
 	flush_tlb_mm(oldmm);
+
+	if (IS_ENABLED(CONFIG_SPECULATIVE_PAGE_FAULT)) {
+		/*
+		 * Since the TLB has been flushed, we can safely unmark the
+		 * copied VMAs and allow the speculative page fault handler to
+		 * process them again.
+		 * Walk back the VMA list from the last marked VMA.
+		 */
+		for (; last; last = last->vm_prev) {
+			if (last->vm_flags & VM_DONTCOPY)
+				continue;
+			if (!(last->vm_flags & VM_WIPEONFORK))
+				vm_write_end(last);
+		}
+	}
+
 	mmap_write_unlock(oldmm);
 	dup_userfaultfd_complete(&uf);
 fail_uprobe_end: