diff --git a/mm/memory.c b/mm/memory.c
index 87cf4e1a6f86..5a6dd1606c67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4611,7 +4611,16 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
+/* Sanity check that a folio is fully exclusive */
+static void check_swap_exclusive(struct folio *folio, swp_entry_t entry,
+				 unsigned int nr_pages)
+{
+	/* Called with the PT lock and folio lock held, so the swap count is stable */
+	do {
+		VM_WARN_ON_ONCE_FOLIO(__swap_count(entry) != 1, folio);
+		entry.val++;
+	} while (--nr_pages);
+}
 
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
@@ -4624,17 +4633,14 @@ static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);
 vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	struct folio *swapcache, *folio = NULL;
-	DECLARE_WAITQUEUE(wait, current);
+	struct folio *swapcache = NULL, *folio;
 	struct page *page;
 	struct swap_info_struct *si = NULL;
 	rmap_t rmap_flags = RMAP_NONE;
-	bool need_clear_cache = false;
 	bool exclusive = false;
 	softleaf_t entry;
 	pte_t pte;
 	vm_fault_t ret = 0;
-	void *shadow = NULL;
 	int nr_pages;
 	unsigned long page_idx;
 	unsigned long address;
@@ -4705,57 +4711,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	folio = swap_cache_get_folio(entry);
 	if (folio)
 		swap_update_readahead(folio, vma, vmf->address);
-	swapcache = folio;
-
 	if (!folio) {
-		if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
-		    __swap_count(entry) == 1) {
-			/* skip swapcache */
+		if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
 			folio = alloc_swap_folio(vmf);
 			if (folio) {
-				__folio_set_locked(folio);
-				__folio_set_swapbacked(folio);
-
-				nr_pages = folio_nr_pages(folio);
-				if (folio_test_large(folio))
-					entry.val = ALIGN_DOWN(entry.val, nr_pages);
 				/*
-				 * Prevent parallel swapin from proceeding with
-				 * the cache flag. Otherwise, another thread
-				 * may finish swapin first, free the entry, and
-				 * swapout reusing the same entry. It's
-				 * undetectable as pte_same() returns true due
-				 * to entry reuse.
+				 * The folio is charged, so swapin can only fail
+				 * and return NULL due to a raced swapin.
 				 */
-				if (swapcache_prepare(entry, nr_pages)) {
-					/*
-					 * Relax a bit to prevent rapid
-					 * repeated page faults.
-					 */
-					add_wait_queue(&swapcache_wq, &wait);
-					schedule_timeout_uninterruptible(1);
-					remove_wait_queue(&swapcache_wq, &wait);
-					goto out_page;
-				}
-				need_clear_cache = true;
-
-				memcg1_swapin(entry, nr_pages);
-
-				shadow = swap_cache_get_shadow(entry);
-				if (shadow)
-					workingset_refault(folio, shadow);
-
-				folio_add_lru(folio);
-
-				/* To provide entry to swap_read_folio() */
-				folio->swap = entry;
-				swap_read_folio(folio, NULL);
-				folio->private = NULL;
+				swapcache = swapin_folio(entry, folio);
+				if (swapcache != folio)
+					folio_put(folio);
+				folio = swapcache;
 			}
 		} else {
-			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
-						vmf);
-			swapcache = folio;
+			folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
 		}
 
 		if (!folio) {
@@ -4777,6 +4747,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
 	}
 
+	swapcache = folio;
 	ret |= folio_lock_or_retry(folio, vmf);
 	if (ret & VM_FAULT_RETRY)
 		goto out_release;
@@ -4846,24 +4817,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out_nomap;
 	}
 
-	/* allocated large folios for SWP_SYNCHRONOUS_IO */
-	if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
-		unsigned long nr = folio_nr_pages(folio);
-		unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
-		unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
-		pte_t *folio_ptep = vmf->pte - idx;
-		pte_t folio_pte = ptep_get(folio_ptep);
-
-		if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
-		    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
-			goto out_nomap;
-
-		page_idx = idx;
-		address = folio_start;
-		ptep = folio_ptep;
-		goto check_folio;
-	}
-
 	nr_pages = 1;
 	page_idx = 0;
 	address = vmf->address;
@@ -4907,12 +4860,37 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
 	BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));
 
+	/*
+	 * If a large folio already belongs to an anon mapping, we can
+	 * simply go on and map it partially.
+	 * If not, the large swapin check above has failed: the page table
+	 * has changed, so sub pages might have been charged to the wrong
+	 * cgroup, or the entries may even belong to shmem now, so the folio
+	 * must be freed and we fall back. Nothing should have touched it;
+	 * both anon and shmem check that a large folio is fully usable.
+	 *
+	 * This will be removed once folio allocation is unified in the swap
+	 * cache layer, where allocating a folio stabilizes the swap entries.
+	 */
+	if (!folio_test_anon(folio) && folio_test_large(folio) &&
+	    nr_pages != folio_nr_pages(folio)) {
+		if (!WARN_ON_ONCE(folio_test_dirty(folio)))
+			swap_cache_del_folio(folio);
+		goto out_nomap;
+	}
+
 	/*
 	 * Check under PT lock (to protect against concurrent fork() sharing
 	 * the swap entry concurrently) for certainly exclusive pages.
 	 */
 	if (!folio_test_ksm(folio)) {
+		/*
+		 * The can_swapin_thp check above ensures all PTEs have
+		 * the same exclusiveness, so checking one PTE is enough.
+		 */
 		exclusive = pte_swp_exclusive(vmf->orig_pte);
+		if (exclusive)
+			check_swap_exclusive(folio, entry, nr_pages);
 		if (folio != swapcache) {
 			/*
 			 * We have a fresh page that is not exposed to the
@@ -4990,18 +4968,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	vmf->orig_pte = pte_advance_pfn(pte, page_idx);
 
 	/* ksm created a completely new copy */
-	if (unlikely(folio != swapcache && swapcache)) {
+	if (unlikely(folio != swapcache)) {
 		folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
 		folio_add_lru_vma(folio, vma);
 	} else if (!folio_test_anon(folio)) {
 		/*
-		 * We currently only expect small !anon folios which are either
-		 * fully exclusive or fully shared, or new allocated large
-		 * folios which are fully exclusive. If we ever get large
-		 * folios within swapcache here, we have to be careful.
+		 * We currently only expect !anon folios that are fully
+		 * mappable. See the comment after can_swapin_thp above.
 		 */
-		VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
-		VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
+		VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio);
+		VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
 		folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
 	} else {
 		folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
@@ -5041,12 +5017,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
-	/* Clear the swap cache pin for direct swapin after PTL unlock */
-	if (need_clear_cache) {
-		swapcache_clear(si, entry, nr_pages);
-		if (waitqueue_active(&swapcache_wq))
-			wake_up(&swapcache_wq);
-	}
 	if (si)
 		put_swap_device(si);
 	return ret;
@@ -5054,6 +5024,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
+	if (folio_test_swapcache(folio))
+		folio_free_swap(folio);
 	folio_unlock(folio);
 out_release:
 	folio_put(folio);
@@ -5061,11 +5033,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		folio_unlock(swapcache);
 		folio_put(swapcache);
 	}
-	if (need_clear_cache) {
-		swapcache_clear(si, entry, nr_pages);
-		if (waitqueue_active(&swapcache_wq))
-			wake_up(&swapcache_wq);
-	}
 	if (si)
 		put_swap_device(si);
 	return ret;
diff --git a/mm/swap.h b/mm/swap.h
index 9f8f09fb2644..5d4a33c4adda 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -268,6 +268,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 		struct mempolicy *mpol, pgoff_t ilx);
 struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
 		struct vm_fault *vmf);
+struct folio *swapin_folio(swp_entry_t entry, struct folio *folio);
 void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma,
 		unsigned long addr);
 
@@ -386,6 +387,11 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
 	return NULL;
 }
 
+static inline struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+{
+	return NULL;
+}
+
 static inline void swap_update_readahead(struct folio *folio,
 		struct vm_area_struct *vma, unsigned long addr)
 {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9a7ffa8d40f7..a82d0615f646 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -544,6 +544,33 @@ struct folio *swap_cache_alloc_folio(swp_entry_t entry, gfp_t gfp_mask,
 	return result;
 }
 
+/**
+ * swapin_folio - swap in one or multiple entries, skipping readahead.
+ * @entry: starting swap entry to swap in
+ * @folio: a newly allocated and charged folio
+ *
+ * Reads @entry into @folio; @folio will be added to the swap cache.
+ * If @folio is a large folio, @entry will be rounded down to align
+ * with the folio size.
+ *
+ * Return: pointer to @folio on success. If @folio is a large folio and
+ * this swapin raced with another one, NULL is returned to allow falling
+ * back to order 0. Otherwise, if another folio was already added to the
+ * swap cache, that swap cache folio is returned instead.
+ */
+struct folio *swapin_folio(swp_entry_t entry, struct folio *folio)
+{
+	struct folio *swapcache;
+	pgoff_t offset = swp_offset(entry);
+	unsigned long nr_pages = folio_nr_pages(folio);
+
+	entry = swp_entry(swp_type(entry), round_down(offset, nr_pages));
+	swapcache = __swap_cache_prepare_and_add(entry, folio, 0, true, false);
+	if (swapcache == folio)
+		swap_read_folio(folio, NULL);
+	return swapcache;
+}
+
 /*
  * Locate a page of swap in physical memory, reserving swap cache space
  * and reading the disk if it is not already cached.
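
A minimal caller-side sketch (not part of the patch) of how the swapin_folio() contract documented above is meant to be consumed, mirroring the SWP_SYNCHRONOUS_IO hunk in do_swap_page(); the wrapper name sync_swapin_sketch() is hypothetical and exists only for illustration:

static struct folio *sync_swapin_sketch(struct vm_fault *vmf, swp_entry_t entry)
{
	struct folio *folio, *swapcache;

	/* Allocate and charge a (possibly large) folio for this fault. */
	folio = alloc_swap_folio(vmf);
	if (!folio)
		return NULL;

	/*
	 * swapin_folio() rounds @entry down to the folio size, adds the
	 * folio to the swap cache and reads it in. It returns @folio on
	 * success, an existing swap cache folio if another swapin won the
	 * race, or NULL if a large swapin raced and the caller should fall
	 * back to order 0.
	 */
	swapcache = swapin_folio(entry, folio);
	if (swapcache != folio)
		folio_put(folio);	/* drop our copy; it was not used */

	return swapcache;
}

The point of the sketch is that the swap cache insertion inside swapin_folio() is what serializes racing swapins, which is why the old swapcache_prepare()/swapcache_wq workaround can be removed from do_swap_page().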