mm/rmap: implement make_device_exclusive() using folio_walk instead of rmap walk

We require a writable PTE and only support anonymous folio: we can only
have exactly one PTE pointing at that page, which we can just lookup using
a folio walk, avoiding the rmap walk and the anon VMA lock.

So let's stop doing an rmap walk and perform a folio walk instead, so we
can easily just modify a single PTE and avoid relying on rmap/mapcounts.

We now effectively work on a single PTE instead of multiple PTEs of a
large folio, allowing for conversion of individual PTEs from non-exclusive
to device-exclusive -- note that the opposite direction always works on
single PTEs: restore_exclusive_pte().

With this change, device-exclusive handling is fully compatible with THPs
/ large folios.  We still require PMD-sized THPs to get PTE-mapped, and
supporting PMD-mapped THP (without the PTE-remapping) is a different
endeavour that might not be worth it at this point: it might even have
negative side-effects [1].

This gets rid of the "folio_mapcount()" usage and let's us fix ordinary
rmap walks (migration/swapout) next.  Spell out that messing with the
mapcount is wrong and must be fixed.

[1] https://lkml.kernel.org/r/Z5tI-cOSyzdLjoe_@phenom.ffwll.local

Link: https://lkml.kernel.org/r/20250210193801.781278-5-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Tested-by: Alistair Popple <apopple@nvidia.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dave Airlie <airlied@gmail.com>
Cc: Jann Horn <jannh@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Karol Herbst <kherbst@redhat.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Lyude <lyude@redhat.com>
Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Pasha Tatashin <pasha.tatashin@soleen.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Simona Vetter <simona.vetter@ffwll.ch>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yanteng Si <si.yanteng@linux.dev>
Cc: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
David Hildenbrand 2025-02-10 20:37:46 +01:00 committed by Andrew Morton
parent 599b684a78
commit 438354724f

200
mm/rmap.c
View File

@ -2375,131 +2375,6 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
}
#ifdef CONFIG_DEVICE_PRIVATE
struct make_exclusive_args {
struct mm_struct *mm;
unsigned long address;
void *owner;
bool valid;
};
static bool page_make_device_exclusive_one(struct folio *folio,
struct vm_area_struct *vma, unsigned long address, void *priv)
{
struct mm_struct *mm = vma->vm_mm;
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
struct make_exclusive_args *args = priv;
pte_t pteval;
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
swp_entry_t entry;
pte_t swp_pte;
pte_t ptent;
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
vma->vm_mm, address, min(vma->vm_end,
address + folio_size(folio)),
args->owner);
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
ptent = ptep_get(pvmw.pte);
if (!pte_present(ptent)) {
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
subpage = folio_page(folio,
pte_pfn(ptent) - folio_pfn(folio));
address = pvmw.address;
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(ptent));
pteval = ptep_clear_flush(vma, address, pvmw.pte);
/* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
folio_mark_dirty(folio);
/*
* Check that our target page is still mapped at the expected
* address.
*/
if (args->mm == mm && args->address == address &&
pte_write(pteval))
args->valid = true;
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
if (pte_write(pteval))
entry = make_writable_device_exclusive_entry(
page_to_pfn(subpage));
else
entry = make_readable_device_exclusive_entry(
page_to_pfn(subpage));
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* There is a reference on the page for the swap entry which has
* been removed, so shouldn't take another.
*/
folio_remove_rmap_pte(folio, subpage, vma);
}
mmu_notifier_invalidate_range_end(&range);
return ret;
}
/**
* folio_make_device_exclusive - Mark the folio exclusively owned by a device.
* @folio: The folio to replace page table entries for.
* @mm: The mm_struct where the folio is expected to be mapped.
* @address: Address where the folio is expected to be mapped.
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
*
* Tries to remove all the page table entries which are mapping this
* folio and replace them with special device exclusive swap entries to
* grant a device exclusive access to the folio.
*
* Context: Caller must hold the folio lock.
* Return: false if the page is still mapped, or if it could not be unmapped
* from the expected address. Otherwise returns true (success).
*/
static bool folio_make_device_exclusive(struct folio *folio,
struct mm_struct *mm, unsigned long address, void *owner)
{
struct make_exclusive_args args = {
.mm = mm,
.address = address,
.owner = owner,
.valid = false,
};
struct rmap_walk_control rwc = {
.rmap_one = page_make_device_exclusive_one,
.done = folio_not_mapped,
.anon_lock = folio_lock_anon_vma_read,
.arg = &args,
};
rmap_walk(folio, &rwc);
return args.valid && !folio_mapcount(folio);
}
/**
* make_device_exclusive() - Mark a page for exclusive use by a device
* @mm: mm_struct of associated target process
@ -2541,22 +2416,31 @@ static bool folio_make_device_exclusive(struct folio *folio,
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
void *owner, struct folio **foliop)
{
struct folio *folio;
struct mmu_notifier_range range;
struct folio *folio, *fw_folio;
struct vm_area_struct *vma;
struct folio_walk fw;
struct page *page;
long npages;
swp_entry_t entry;
pte_t swp_pte;
mmap_assert_locked(mm);
addr = PAGE_ALIGN_DOWN(addr);
/*
* Fault in the page writable and try to lock it; note that if the
* address would already be marked for exclusive use by a device,
* the GUP call would undo that first by triggering a fault.
*
* If any other device would already map this page exclusively, the
* fault will trigger a conversion to an ordinary
* (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
*/
npages = get_user_pages_remote(mm, addr, 1,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
&page, NULL);
if (npages != 1)
return ERR_PTR(npages);
page = get_user_page_vma_remote(mm, addr,
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
&vma);
if (IS_ERR(page))
return page;
folio = page_folio(page);
if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
@ -2569,11 +2453,61 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
return ERR_PTR(-EBUSY);
}
if (!folio_make_device_exclusive(folio, mm, addr, owner)) {
/*
* Inform secondary MMUs that we are going to convert this PTE to
* device-exclusive, such that they unmap it now. Note that the
* caller must filter this event out to prevent livelocks.
*/
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
mm, addr, addr + PAGE_SIZE, owner);
mmu_notifier_invalidate_range_start(&range);
/*
* Let's do a second walk and make sure we still find the same page
* mapped writable. Note that any page of an anonymous folio can
* only be mapped writable using exactly one PTE ("exclusive"), so
* there cannot be other mappings.
*/
fw_folio = folio_walk_start(&fw, vma, addr, 0);
if (fw_folio != folio || fw.page != page ||
fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
if (fw_folio)
folio_walk_end(&fw, vma);
mmu_notifier_invalidate_range_end(&range);
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-EBUSY);
}
/* Nuke the page table entry so we get the uptodate dirty bit. */
flush_cache_page(vma, addr, page_to_pfn(page));
fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
/* Set the dirty flag on the folio now the PTE is gone. */
if (pte_dirty(fw.pte))
folio_mark_dirty(folio);
/*
* Store the pfn of the page in a special device-exclusive PFN swap PTE.
* do_swap_page() will trigger the conversion back while holding the
* folio lock.
*/
entry = make_writable_device_exclusive_entry(page_to_pfn(page));
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(fw.pte))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
/* The pte is writable, uffd-wp does not apply. */
set_pte_at(mm, addr, fw.ptep, swp_pte);
/*
* TODO: The device-exclusive PFN swap PTE holds a folio reference but
* does not count as a mapping (mapcount), which is wrong and must be
* fixed, otherwise RMAP walks don't behave as expected.
*/
folio_remove_rmap_pte(folio, page, vma);
folio_walk_end(&fw, vma);
mmu_notifier_invalidate_range_end(&range);
*foliop = folio;
return page;
}