mirror of
https://github.com/torvalds/linux.git
synced 2026-05-22 22:22:08 +02:00
mm/rmap: handle device-exclusive entries correctly in try_to_migrate_one()
Ever since commit b756a3b5e7 ("mm: device exclusive memory access") we can return with a device-exclusive entry from page_vma_mapped_walk(). try_to_migrate_one() is not prepared for that, so teach it about these PFN swap PTEs. We already handle device-private entries by specializing on the folio, so we can reshuffle that code to make it work on the PFN swap PTEs instead. Get rid of the folio_is_device_private() handling. Note that we never currently expect device-private folios with HWPoison flag set at that point, so add a warning in case that ever changes and we can figure out what the right thing to do is. Note that we could currently only run into this case with device-exclusive entries on THPs. We still adjust the mapcount on conversion to device-exclusive; this makes the rmap walk abort early for small folios, because we'll always have !folio_mapped() with a single device-exclusive entry. We'll adjust the mapcount logic once all page_vma_mapped_walk() users can properly handle device-exclusive entries. Further note that try_to_migrate() calls MMU notifiers and holds the folio lock, so any device-exclusive users should be properly prepared for a device-exclusive PTE to "vanish". 
Link: https://lkml.kernel.org/r/20250210193801.781278-12-david@redhat.com Fixes: b756a3b5e7 ("mm: device exclusive memory access") Signed-off-by: David Hildenbrand <david@redhat.com> Tested-by: Alistair Popple <apopple@nvidia.com> Cc: Alex Shi <alexs@kernel.org> Cc: Alistair Popple <apopple@nvidia.com> Cc: Danilo Krummrich <dakr@kernel.org> Cc: Dave Airlie <airlied@gmail.com> Cc: Jann Horn <jannh@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Karol Herbst <kherbst@redhat.com> Cc: Liam Howlett <liam.howlett@oracle.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Lyude <lyude@redhat.com> Cc: "Masami Hiramatsu (Google)" <mhiramat@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: Peter Xu <peterx@redhat.com> Cc: Peter Zijlstra (Intel) <peterz@infradead.org> Cc: SeongJae Park <sj@kernel.org> Cc: Simona Vetter <simona.vetter@ffwll.ch> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Yanteng Si <si.yanteng@linux.dev> Cc: Barry Song <v-songbaohua@oppo.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
parent
6552929560
commit
bf983108be
124
mm/rmap.c
124
mm/rmap.c
|
|
@ -2039,9 +2039,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
|
||||
bool anon_exclusive, writable, ret = true;
|
||||
pte_t pteval;
|
||||
struct page *subpage;
|
||||
bool anon_exclusive, ret = true;
|
||||
struct mmu_notifier_range range;
|
||||
enum ttu_flags flags = (enum ttu_flags)(long)arg;
|
||||
unsigned long pfn;
|
||||
|
|
@ -2108,24 +2108,19 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
/* Unexpected PMD-mapped THP? */
|
||||
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
|
||||
|
||||
pfn = pte_pfn(ptep_get(pvmw.pte));
|
||||
|
||||
if (folio_is_zone_device(folio)) {
|
||||
/*
|
||||
* Our PTE is a non-present device exclusive entry and
|
||||
* calculating the subpage as for the common case would
|
||||
* result in an invalid pointer.
|
||||
*
|
||||
* Since only PAGE_SIZE pages can currently be
|
||||
* migrated, just set it to page. This will need to be
|
||||
* changed when hugepage migrations to device private
|
||||
* memory are supported.
|
||||
*/
|
||||
VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
|
||||
subpage = &folio->page;
|
||||
/*
|
||||
* Handle PFN swap PTEs, such as device-exclusive ones, that
|
||||
* actually map pages.
|
||||
*/
|
||||
pteval = ptep_get(pvmw.pte);
|
||||
if (likely(pte_present(pteval))) {
|
||||
pfn = pte_pfn(pteval);
|
||||
} else {
|
||||
subpage = folio_page(folio, pfn - folio_pfn(folio));
|
||||
pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
|
||||
VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
|
||||
}
|
||||
|
||||
subpage = folio_page(folio, pfn - folio_pfn(folio));
|
||||
address = pvmw.address;
|
||||
anon_exclusive = folio_test_anon(folio) &&
|
||||
PageAnonExclusive(subpage);
|
||||
|
|
@ -2181,7 +2176,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
}
|
||||
/* Nuke the hugetlb page table entry */
|
||||
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
|
||||
} else {
|
||||
if (pte_dirty(pteval))
|
||||
folio_mark_dirty(folio);
|
||||
writable = pte_write(pteval);
|
||||
} else if (likely(pte_present(pteval))) {
|
||||
flush_cache_page(vma, address, pfn);
|
||||
/* Nuke the page table entry. */
|
||||
if (should_defer_flush(mm, flags)) {
|
||||
|
|
@ -2199,54 +2197,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
} else {
|
||||
pteval = ptep_clear_flush(vma, address, pvmw.pte);
|
||||
}
|
||||
if (pte_dirty(pteval))
|
||||
folio_mark_dirty(folio);
|
||||
writable = pte_write(pteval);
|
||||
} else {
|
||||
pte_clear(mm, address, pvmw.pte);
|
||||
writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
|
||||
}
|
||||
|
||||
/* Set the dirty flag on the folio now the pte is gone. */
|
||||
if (pte_dirty(pteval))
|
||||
folio_mark_dirty(folio);
|
||||
VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
|
||||
!anon_exclusive, folio);
|
||||
|
||||
/* Update high watermark before we lower rss */
|
||||
update_hiwater_rss(mm);
|
||||
|
||||
if (folio_is_device_private(folio)) {
|
||||
unsigned long pfn = folio_pfn(folio);
|
||||
swp_entry_t entry;
|
||||
pte_t swp_pte;
|
||||
if (PageHWPoison(subpage)) {
|
||||
VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);
|
||||
|
||||
if (anon_exclusive)
|
||||
WARN_ON_ONCE(folio_try_share_anon_rmap_pte(folio,
|
||||
subpage));
|
||||
|
||||
/*
|
||||
* Store the pfn of the page in a special migration
|
||||
* pte. do_swap_page() will wait until the migration
|
||||
* pte is removed and then restart fault handling.
|
||||
*/
|
||||
entry = pte_to_swp_entry(pteval);
|
||||
if (is_writable_device_private_entry(entry))
|
||||
entry = make_writable_migration_entry(pfn);
|
||||
else if (anon_exclusive)
|
||||
entry = make_readable_exclusive_migration_entry(pfn);
|
||||
else
|
||||
entry = make_readable_migration_entry(pfn);
|
||||
swp_pte = swp_entry_to_pte(entry);
|
||||
|
||||
/*
|
||||
* pteval maps a zone device page and is therefore
|
||||
* a swap pte.
|
||||
*/
|
||||
if (pte_swp_soft_dirty(pteval))
|
||||
swp_pte = pte_swp_mksoft_dirty(swp_pte);
|
||||
if (pte_swp_uffd_wp(pteval))
|
||||
swp_pte = pte_swp_mkuffd_wp(swp_pte);
|
||||
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
|
||||
trace_set_migration_pte(pvmw.address, pte_val(swp_pte),
|
||||
folio_order(folio));
|
||||
/*
|
||||
* No need to invalidate here it will synchronize on
|
||||
* against the special swap migration pte.
|
||||
*/
|
||||
} else if (PageHWPoison(subpage)) {
|
||||
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
|
||||
if (folio_test_hugetlb(folio)) {
|
||||
hugetlb_count_sub(folio_nr_pages(folio), mm);
|
||||
|
|
@ -2256,8 +2223,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
dec_mm_counter(mm, mm_counter(folio));
|
||||
set_pte_at(mm, address, pvmw.pte, pteval);
|
||||
}
|
||||
|
||||
} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
|
||||
} else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
|
||||
!userfaultfd_armed(vma)) {
|
||||
/*
|
||||
* The guest indicated that the page content is of no
|
||||
* interest anymore. Simply discard the pte, vmscan
|
||||
|
|
@ -2273,6 +2240,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
swp_entry_t entry;
|
||||
pte_t swp_pte;
|
||||
|
||||
/*
|
||||
* arch_unmap_one() is expected to be a NOP on
|
||||
* architectures where we could have PFN swap PTEs,
|
||||
* so we'll not check/care.
|
||||
*/
|
||||
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
|
||||
if (folio_test_hugetlb(folio))
|
||||
set_huge_pte_at(mm, address, pvmw.pte,
|
||||
|
|
@ -2283,8 +2255,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
page_vma_mapped_walk_done(&pvmw);
|
||||
break;
|
||||
}
|
||||
VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
|
||||
!anon_exclusive, subpage);
|
||||
|
||||
/* See folio_try_share_anon_rmap_pte(): clear PTE first. */
|
||||
if (folio_test_hugetlb(folio)) {
|
||||
|
|
@ -2309,7 +2279,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
* pte. do_swap_page() will wait until the migration
|
||||
* pte is removed and then restart fault handling.
|
||||
*/
|
||||
if (pte_write(pteval))
|
||||
if (writable)
|
||||
entry = make_writable_migration_entry(
|
||||
page_to_pfn(subpage));
|
||||
else if (anon_exclusive)
|
||||
|
|
@ -2318,15 +2288,23 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
|
|||
else
|
||||
entry = make_readable_migration_entry(
|
||||
page_to_pfn(subpage));
|
||||
if (pte_young(pteval))
|
||||
entry = make_migration_entry_young(entry);
|
||||
if (pte_dirty(pteval))
|
||||
entry = make_migration_entry_dirty(entry);
|
||||
swp_pte = swp_entry_to_pte(entry);
|
||||
if (pte_soft_dirty(pteval))
|
||||
swp_pte = pte_swp_mksoft_dirty(swp_pte);
|
||||
if (pte_uffd_wp(pteval))
|
||||
swp_pte = pte_swp_mkuffd_wp(swp_pte);
|
||||
if (likely(pte_present(pteval))) {
|
||||
if (pte_young(pteval))
|
||||
entry = make_migration_entry_young(entry);
|
||||
if (pte_dirty(pteval))
|
||||
entry = make_migration_entry_dirty(entry);
|
||||
swp_pte = swp_entry_to_pte(entry);
|
||||
if (pte_soft_dirty(pteval))
|
||||
swp_pte = pte_swp_mksoft_dirty(swp_pte);
|
||||
if (pte_uffd_wp(pteval))
|
||||
swp_pte = pte_swp_mkuffd_wp(swp_pte);
|
||||
} else {
|
||||
swp_pte = swp_entry_to_pte(entry);
|
||||
if (pte_swp_soft_dirty(pteval))
|
||||
swp_pte = pte_swp_mksoft_dirty(swp_pte);
|
||||
if (pte_swp_uffd_wp(pteval))
|
||||
swp_pte = pte_swp_mkuffd_wp(swp_pte);
|
||||
}
|
||||
if (folio_test_hugetlb(folio))
|
||||
set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
|
||||
hsz);
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user