mirror of
https://github.com/torvalds/linux.git
synced 2026-05-24 23:22:31 +02:00
mm/rmap: implement make_device_exclusive() using folio_walk instead of rmap walk
We require a writable PTE and only support anonymous folio: we can only have exactly one PTE pointing at that page, which we can just lookup using a folio walk, avoiding the rmap walk and the anon VMA lock. So let's stop doing an rmap walk and perform a folio walk instead, so we can easily just modify a single PTE and avoid relying on rmap/mapcounts. We now effectively work on a single PTE instead of multiple PTEs of a large folio, allowing for conversion of individual PTEs from non-exclusive to device-exclusive -- note that the opposite direction always works on single PTEs: restore_exclusive_pte(). With this change, device-exclusive handling is fully compatible with THPs / large folios. We still require PMD-sized THPs to get PTE-mapped, and supporting PMD-mapped THP (without the PTE-remapping) is a different endeavour that might not be worth it at this point: it might even have negative side-effects [1]. This gets rid of the "folio_mapcount()" usage and let's us fix ordinary rmap walks (migration/swapout) next. Spell out that messing with the mapcount is wrong and must be fixed. [1] https://lkml.kernel.org/r/Z5tI-cOSyzdLjoe_@phenom.ffwll.local Link: https://lkml.kernel.org/r/20250210193801.781278-5-david@redhat.com Signed-off-by: David Hildenbrand <david@redhat.com> Tested-by: Alistair Popple <apopple@nvidia.com> Cc: Alex Shi <alexs@kernel.org> Cc: Danilo Krummrich <dakr@kernel.org> Cc: Dave Airlie <airlied@gmail.com> Cc: Jann Horn <jannh@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Karol Herbst <kherbst@redhat.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Lyude <lyude@redhat.com> Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: SeongJae Park <sj@kernel.org> Cc: Simona Vetter <simona.vetter@ffwll.ch> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yanteng Si <si.yanteng@linux.dev> Cc: Barry Song <v-songbaohua@oppo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
599b684a78
commit
438354724f
200
mm/rmap.c
200
mm/rmap.c
|
|
@ -2375,131 +2375,6 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_DEVICE_PRIVATE
|
||||
struct make_exclusive_args {
|
||||
struct mm_struct *mm;
|
||||
unsigned long address;
|
||||
void *owner;
|
||||
bool valid;
|
||||
};
|
||||
|
||||
static bool page_make_device_exclusive_one(struct folio *folio,
|
||||
struct vm_area_struct *vma, unsigned long address, void *priv)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
|
||||
struct make_exclusive_args *args = priv;
|
||||
pte_t pteval;
|
||||
struct page *subpage;
|
||||
bool ret = true;
|
||||
struct mmu_notifier_range range;
|
||||
swp_entry_t entry;
|
||||
pte_t swp_pte;
|
||||
pte_t ptent;
|
||||
|
||||
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
|
||||
vma->vm_mm, address, min(vma->vm_end,
|
||||
address + folio_size(folio)),
|
||||
args->owner);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
|
||||
while (page_vma_mapped_walk(&pvmw)) {
|
||||
/* Unexpected PMD-mapped THP? */
|
||||
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
|
||||
|
||||
ptent = ptep_get(pvmw.pte);
|
||||
if (!pte_present(ptent)) {
|
||||
ret = false;
|
||||
page_vma_mapped_walk_done(&pvmw);
|
||||
break;
|
||||
}
|
||||
|
||||
subpage = folio_page(folio,
|
||||
pte_pfn(ptent) - folio_pfn(folio));
|
||||
address = pvmw.address;
|
||||
|
||||
/* Nuke the page table entry. */
|
||||
flush_cache_page(vma, address, pte_pfn(ptent));
|
||||
pteval = ptep_clear_flush(vma, address, pvmw.pte);
|
||||
|
||||
/* Set the dirty flag on the folio now the pte is gone. */
|
||||
if (pte_dirty(pteval))
|
||||
folio_mark_dirty(folio);
|
||||
|
||||
/*
|
||||
* Check that our target page is still mapped at the expected
|
||||
* address.
|
||||
*/
|
||||
if (args->mm == mm && args->address == address &&
|
||||
pte_write(pteval))
|
||||
args->valid = true;
|
||||
|
||||
/*
|
||||
* Store the pfn of the page in a special migration
|
||||
* pte. do_swap_page() will wait until the migration
|
||||
* pte is removed and then restart fault handling.
|
||||
*/
|
||||
if (pte_write(pteval))
|
||||
entry = make_writable_device_exclusive_entry(
|
||||
page_to_pfn(subpage));
|
||||
else
|
||||
entry = make_readable_device_exclusive_entry(
|
||||
page_to_pfn(subpage));
|
||||
swp_pte = swp_entry_to_pte(entry);
|
||||
if (pte_soft_dirty(pteval))
|
||||
swp_pte = pte_swp_mksoft_dirty(swp_pte);
|
||||
if (pte_uffd_wp(pteval))
|
||||
swp_pte = pte_swp_mkuffd_wp(swp_pte);
|
||||
|
||||
set_pte_at(mm, address, pvmw.pte, swp_pte);
|
||||
|
||||
/*
|
||||
* There is a reference on the page for the swap entry which has
|
||||
* been removed, so shouldn't take another.
|
||||
*/
|
||||
folio_remove_rmap_pte(folio, subpage, vma);
|
||||
}
|
||||
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* folio_make_device_exclusive - Mark the folio exclusively owned by a device.
|
||||
* @folio: The folio to replace page table entries for.
|
||||
* @mm: The mm_struct where the folio is expected to be mapped.
|
||||
* @address: Address where the folio is expected to be mapped.
|
||||
* @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier callbacks
|
||||
*
|
||||
* Tries to remove all the page table entries which are mapping this
|
||||
* folio and replace them with special device exclusive swap entries to
|
||||
* grant a device exclusive access to the folio.
|
||||
*
|
||||
* Context: Caller must hold the folio lock.
|
||||
* Return: false if the page is still mapped, or if it could not be unmapped
|
||||
* from the expected address. Otherwise returns true (success).
|
||||
*/
|
||||
static bool folio_make_device_exclusive(struct folio *folio,
|
||||
struct mm_struct *mm, unsigned long address, void *owner)
|
||||
{
|
||||
struct make_exclusive_args args = {
|
||||
.mm = mm,
|
||||
.address = address,
|
||||
.owner = owner,
|
||||
.valid = false,
|
||||
};
|
||||
struct rmap_walk_control rwc = {
|
||||
.rmap_one = page_make_device_exclusive_one,
|
||||
.done = folio_not_mapped,
|
||||
.anon_lock = folio_lock_anon_vma_read,
|
||||
.arg = &args,
|
||||
};
|
||||
|
||||
rmap_walk(folio, &rwc);
|
||||
|
||||
return args.valid && !folio_mapcount(folio);
|
||||
}
|
||||
|
||||
/**
|
||||
* make_device_exclusive() - Mark a page for exclusive use by a device
|
||||
* @mm: mm_struct of associated target process
|
||||
|
|
@ -2541,22 +2416,31 @@ static bool folio_make_device_exclusive(struct folio *folio,
|
|||
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
|
||||
void *owner, struct folio **foliop)
|
||||
{
|
||||
struct folio *folio;
|
||||
struct mmu_notifier_range range;
|
||||
struct folio *folio, *fw_folio;
|
||||
struct vm_area_struct *vma;
|
||||
struct folio_walk fw;
|
||||
struct page *page;
|
||||
long npages;
|
||||
swp_entry_t entry;
|
||||
pte_t swp_pte;
|
||||
|
||||
mmap_assert_locked(mm);
|
||||
addr = PAGE_ALIGN_DOWN(addr);
|
||||
|
||||
/*
|
||||
* Fault in the page writable and try to lock it; note that if the
|
||||
* address would already be marked for exclusive use by a device,
|
||||
* the GUP call would undo that first by triggering a fault.
|
||||
*
|
||||
* If any other device would already map this page exclusively, the
|
||||
* fault will trigger a conversion to an ordinary
|
||||
* (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
|
||||
*/
|
||||
npages = get_user_pages_remote(mm, addr, 1,
|
||||
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
|
||||
&page, NULL);
|
||||
if (npages != 1)
|
||||
return ERR_PTR(npages);
|
||||
page = get_user_page_vma_remote(mm, addr,
|
||||
FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
|
||||
&vma);
|
||||
if (IS_ERR(page))
|
||||
return page;
|
||||
folio = page_folio(page);
|
||||
|
||||
if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
|
||||
|
|
@ -2569,11 +2453,61 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
|
|||
return ERR_PTR(-EBUSY);
|
||||
}
|
||||
|
||||
if (!folio_make_device_exclusive(folio, mm, addr, owner)) {
|
||||
/*
|
||||
* Inform secondary MMUs that we are going to convert this PTE to
|
||||
* device-exclusive, such that they unmap it now. Note that the
|
||||
* caller must filter this event out to prevent livelocks.
|
||||
*/
|
||||
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
|
||||
mm, addr, addr + PAGE_SIZE, owner);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
|
||||
/*
|
||||
* Let's do a second walk and make sure we still find the same page
|
||||
* mapped writable. Note that any page of an anonymous folio can
|
||||
* only be mapped writable using exactly one PTE ("exclusive"), so
|
||||
* there cannot be other mappings.
|
||||
*/
|
||||
fw_folio = folio_walk_start(&fw, vma, addr, 0);
|
||||
if (fw_folio != folio || fw.page != page ||
|
||||
fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
|
||||
if (fw_folio)
|
||||
folio_walk_end(&fw, vma);
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
return ERR_PTR(-EBUSY);
|
||||
}
|
||||
|
||||
/* Nuke the page table entry so we get the uptodate dirty bit. */
|
||||
flush_cache_page(vma, addr, page_to_pfn(page));
|
||||
fw.pte = ptep_clear_flush(vma, addr, fw.ptep);
|
||||
|
||||
/* Set the dirty flag on the folio now the PTE is gone. */
|
||||
if (pte_dirty(fw.pte))
|
||||
folio_mark_dirty(folio);
|
||||
|
||||
/*
|
||||
* Store the pfn of the page in a special device-exclusive PFN swap PTE.
|
||||
* do_swap_page() will trigger the conversion back while holding the
|
||||
* folio lock.
|
||||
*/
|
||||
entry = make_writable_device_exclusive_entry(page_to_pfn(page));
|
||||
swp_pte = swp_entry_to_pte(entry);
|
||||
if (pte_soft_dirty(fw.pte))
|
||||
swp_pte = pte_swp_mksoft_dirty(swp_pte);
|
||||
/* The pte is writable, uffd-wp does not apply. */
|
||||
set_pte_at(mm, addr, fw.ptep, swp_pte);
|
||||
|
||||
/*
|
||||
* TODO: The device-exclusive PFN swap PTE holds a folio reference but
|
||||
* does not count as a mapping (mapcount), which is wrong and must be
|
||||
* fixed, otherwise RMAP walks don't behave as expected.
|
||||
*/
|
||||
folio_remove_rmap_pte(folio, page, vma);
|
||||
|
||||
folio_walk_end(&fw, vma);
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
*foliop = folio;
|
||||
return page;
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user