diff --git a/arch/s390/mm/gmap_helpers.c b/arch/s390/mm/gmap_helpers.c index d41b19925a5a..dd89fce28531 100644 --- a/arch/s390/mm/gmap_helpers.c +++ b/arch/s390/mm/gmap_helpers.c @@ -32,7 +32,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) dec_mm_counter(mm, MM_SWAPENTS); else if (softleaf_is_migration(entry)) dec_mm_counter(mm, mm_counter(softleaf_to_folio(entry))); - free_swap_and_cache(entry); + swap_put_entries_direct(entry, 1); } /** diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 666adcd681ab..b22181e1079e 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -682,7 +682,7 @@ static void ptep_zap_softleaf_entry(struct mm_struct *mm, softleaf_t entry) dec_mm_counter(mm, mm_counter(folio)); } - free_swap_and_cache(entry); + swap_put_entries_direct(entry, 1); } void ptep_zap_unused(struct mm_struct *mm, unsigned long addr, diff --git a/include/linux/swap.h b/include/linux/swap.h index 74df3004c850..aaa868f60b9c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -452,14 +452,8 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio); -bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); -extern swp_entry_t get_swap_page_of_type(int); extern int add_swap_count_continuation(swp_entry_t, gfp_t); -extern int swap_duplicate_nr(swp_entry_t entry, int nr); -extern void swap_free_nr(swp_entry_t entry, int nr_pages); -extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -471,6 +465,29 @@ struct backing_dev_info; extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); +/* + * If there is an existing swap slot reference (swap entry) and the caller + * guarantees that there is no race modification of it (e.g., PTL + * protecting the swap entry in page table; shmem's cmpxchg protects t + * he swap entry in shmem mapping), these two helpers below can be used + * to put/dup the entries directly. + * + * All entries must be allocated by folio_alloc_swap(). And they must have + * a swap count > 1. See comments of folio_*_swap helpers for more info. + */ +int swap_dup_entry_direct(swp_entry_t entry); +void swap_put_entries_direct(swp_entry_t entry, int nr); + +/* + * folio_free_swap tries to free the swap entries pinned by a swap cache + * folio, it has to be here to be called by other components. + */ +bool folio_free_swap(struct folio *folio); + +/* Allocate / free (hibernation) exclusive entries */ +swp_entry_t swap_alloc_hibernation_slot(int type); +void swap_free_hibernation_slot(swp_entry_t entry); + static inline void put_swap_device(struct swap_info_struct *si) { percpu_ref_put(&si->users); @@ -498,10 +515,6 @@ static inline void put_swap_device(struct swap_info_struct *si) #define free_pages_and_swap_cache(pages, nr) \ release_pages((pages), (nr)); -static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr) -{ -} - static inline void free_swap_cache(struct folio *folio) { } @@ -511,12 +524,12 @@ static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) return 0; } -static inline int swap_duplicate_nr(swp_entry_t swp, int nr_pages) +static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; } -static inline void swap_free_nr(swp_entry_t entry, int nr_pages) +static inline void swap_put_entries_direct(swp_entry_t ent, int nr) { } @@ -539,11 +552,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio) -{ - return -EINVAL; -} - static inline bool folio_free_swap(struct folio *folio) { return false; @@ -556,22 +564,6 @@ static inline int add_swap_extent(struct swap_info_struct *sis, return -EINVAL; } #endif /* CONFIG_SWAP */ - -static inline int swap_duplicate(swp_entry_t entry) -{ - return swap_duplicate_nr(entry, 1); -} - -static inline void free_swap_and_cache(swp_entry_t entry) -{ - free_swap_and_cache_nr(entry, 1); -} - -static inline void swap_free(swp_entry_t entry) -{ - swap_free_nr(entry, 1); -} - #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8050e5182835..19ed7bd2adcc 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -174,10 +174,10 @@ sector_t alloc_swapdev_block(int swap) * Allocate a swap page and register that it has been allocated, so that * it can be freed in case of an error. */ - offset = swp_offset(get_swap_page_of_type(swap)); + offset = swp_offset(swap_alloc_hibernation_slot(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free(swp_entry(swap, offset)); + swap_free_hibernation_slot(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -186,6 +186,7 @@ sector_t alloc_swapdev_block(int swap) void free_all_swap_pages(int swap) { + unsigned long offset; struct rb_node *node; /* @@ -197,8 +198,9 @@ void free_all_swap_pages(int swap) ext = rb_entry(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); - swap_free_nr(swp_entry(swap, ext->start), - ext->end - ext->start + 1); + + for (offset = ext->start; offset <= ext->end; offset++) + swap_free_hibernation_slot(swp_entry(swap, offset)); kfree(ext); } diff --git a/mm/madvise.c b/mm/madvise.c index 19cf480eed49..1f3040688f04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -692,7 +692,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, max_nr = (end - addr) / PAGE_SIZE; nr = swap_pte_batch(pte, max_nr, ptent); nr_swap -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm); } else if (softleaf_is_hwpoison(entry) || softleaf_is_poison_marker(entry)) { diff --git a/mm/memory.c b/mm/memory.c index 60258033103e..187f16b7e996 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -934,7 +934,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, struct page *page; if (likely(softleaf_is_swap(entry))) { - if (swap_duplicate(entry) < 0) + if (swap_dup_entry_direct(entry) < 0) return -EIO; /* make sure dst_mm is on swapoff's mmlist. */ @@ -1748,7 +1748,7 @@ static inline int zap_nonpresent_ptes(struct mmu_gather *tlb, nr = swap_pte_batch(pte, max_nr, ptent); rss[MM_SWAPENTS] -= nr; - free_swap_and_cache_nr(entry, nr); + swap_put_entries_direct(entry, nr); } else if (softleaf_is_migration(entry)) { struct folio *folio = softleaf_to_folio(entry); @@ -4936,7 +4936,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). */ arch_swap_restore(folio_swap(entry, folio), folio); @@ -4974,6 +4974,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(folio != swapcache)) { folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE); folio_add_lru_vma(folio, vma); + folio_put_swap(swapcache, NULL); } else if (!folio_test_anon(folio)) { /* * We currently only expect !anon folios that are fully @@ -4982,9 +4983,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) VM_WARN_ON_ONCE_FOLIO(folio_nr_pages(folio) != nr_pages, folio); VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio); folio_add_new_anon_rmap(folio, vma, address, rmap_flags); + folio_put_swap(folio, NULL); } else { + VM_WARN_ON_ONCE(nr_pages != 1 && nr_pages != folio_nr_pages(folio)); folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address, - rmap_flags); + rmap_flags); + folio_put_swap(folio, nr_pages == 1 ? page : NULL); } VM_BUG_ON(!folio_test_anon(folio) || @@ -4998,7 +5002,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Do it after mapping, so raced page faults will likely see the folio * in swap cache and wait on the folio lock. */ - swap_free_nr(entry, nr_pages); if (should_try_to_free_swap(si, folio, vma, nr_pages, vmf->flags)) folio_free_swap(folio); @@ -5008,7 +5011,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For - * further safety release the lock after the swap_free + * further safety release the lock after the folio_put_swap * so that the swap count won't change under a * parallel locked swapcache. */ diff --git a/mm/rmap.c b/mm/rmap.c index 6ddbf58111ff..c1ba88763102 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -82,6 +82,7 @@ #include #include "internal.h" +#include "swap.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; @@ -2232,7 +2233,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, goto discard; } - if (swap_duplicate(entry) < 0) { + if (folio_dup_swap(folio, subpage) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } @@ -2243,7 +2244,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } @@ -2251,7 +2252,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { - swap_free(entry); + folio_put_swap(folio, subpage); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } diff --git a/mm/shmem.c b/mm/shmem.c index dd4951d6f891..0adde3f4df27 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -982,7 +982,7 @@ static long shmem_free_swap(struct address_space *mapping, xas_unlock_irq(&xas); if (nr_pages) - free_swap_and_cache_nr(radix_to_swp_entry(radswap), nr_pages); + swap_put_entries_direct(radix_to_swp_entry(radswap), nr_pages); return nr_pages; } @@ -1690,7 +1690,7 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, spin_unlock(&shmem_swaplist_lock); } - swap_duplicate_nr(folio->swap, nr_pages); + folio_dup_swap(folio, NULL); shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap)); BUG_ON(folio_mapped(folio)); @@ -1711,7 +1711,7 @@ int shmem_writeout(struct folio *folio, struct swap_iocb **plug, /* Swap entry might be erased by racing shmem_free_swap() */ if (!error) { shmem_recalc_inode(inode, 0, -nr_pages); - swap_free_nr(folio->swap, nr_pages); + folio_put_swap(folio, NULL); } /* @@ -2197,6 +2197,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); + folio_put_swap(folio, NULL); swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks @@ -2204,7 +2205,6 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, * in shmem_evict_inode(). */ shmem_recalc_inode(inode, -nr_pages, -nr_pages); - swap_free_nr(swap, nr_pages); } static int shmem_split_large_entry(struct inode *inode, pgoff_t index, @@ -2427,9 +2427,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (sgp == SGP_WRITE) folio_mark_accessed(folio); + folio_put_swap(folio, NULL); swap_cache_del_folio(folio); folio_mark_dirty(folio); - swap_free_nr(swap, nr_pages); put_swap_device(si); *foliop = folio; diff --git a/mm/swap.h b/mm/swap.h index 0801857a0640..da243a1e3e45 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -183,6 +183,28 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +/* + * Below are the core routines for doing swap for a folio. + * All helpers requires the folio to be locked, and a locked folio + * in the swap cache pins the swap entries / slots allocated to the + * folio, swap relies heavily on the swap cache and folio lock for + * synchronization. + * + * folio_alloc_swap(): the entry point for a folio to be swapped + * out. It allocates swap slots and pins the slots with swap cache. + * The slots start with a swap count of zero. + * + * folio_dup_swap(): increases the swap count of a folio, usually + * during it gets unmapped and a swap entry is installed to replace + * it (e.g., swap entry in page table). A swap slot with swap + * count == 0 should only be increasd by this helper. + * + * folio_put_swap(): does the opposite thing of folio_dup_swap(). + */ +int folio_alloc_swap(struct folio *folio); +int folio_dup_swap(struct folio *folio, struct page *subpage); +void folio_put_swap(struct folio *folio, struct page *subpage); + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -363,9 +385,24 @@ static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) return NULL; } +static inline int folio_alloc_swap(struct folio *folio) +{ + return -EINVAL; +} + +static inline int folio_dup_swap(struct folio *folio, struct page *page) +{ + return -EINVAL; +} + +static inline void folio_put_swap(struct folio *folio, struct page *page) +{ +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } + static inline void swap_write_unplug(struct swap_iocb *sio) { } diff --git a/mm/swapfile.c b/mm/swapfile.c index 64970ee11fcf..d652486898de 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -58,6 +58,9 @@ static void swap_entries_free(struct swap_info_struct *si, swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); +static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -1482,6 +1485,12 @@ int folio_alloc_swap(struct folio *folio) */ WARN_ON_ONCE(swap_cache_add_folio(folio, entry, NULL, true)); + /* + * Allocator should always allocate aligned entries so folio based + * operations never crossed more than one cluster. + */ + VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio->swap.val, size), folio); + return 0; out_free: @@ -1489,6 +1498,66 @@ int folio_alloc_swap(struct folio *folio) return -ENOMEM; } +/** + * folio_dup_swap() - Increase swap count of swap entries of a folio. + * @folio: folio with swap entries bounded. + * @subpage: if not NULL, only increase the swap count of this subpage. + * + * Typically called when the folio is unmapped and have its swap entry to + * take its palce. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * NOTE: The caller also has to ensure there is no raced call to + * swap_put_entries_direct on its swap entry before this helper returns, or + * the swap map may underflow. Currently, we only accept @subpage == NULL + * for shmem due to the limitation of swap continuation: shmem always + * duplicates the swap entry only once, so there is no such issue for it. + */ +int folio_dup_swap(struct folio *folio, struct page *subpage) +{ + int err = 0; + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); + + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + + return err; +} + +/** + * folio_put_swap() - Decrease swap count of swap entries of a folio. + * @folio: folio with swap entries bounded, must be in swap cache and locked. + * @subpage: if not NULL, only decrease the swap count of this subpage. + * + * This won't free the swap slots even if swap count drops to zero, they are + * still pinned by the swap cache. User may call folio_free_swap to free them. + * Context: Caller must ensure the folio is locked and in the swap cache. + */ +void folio_put_swap(struct folio *folio, struct page *subpage) +{ + swp_entry_t entry = folio->swap; + unsigned long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_FOLIO(!folio_test_swapcache(folio), folio); + + if (subpage) { + entry.val += folio_page_idx(folio, subpage); + nr_pages = 1; + } + + swap_entries_put_map(__swap_entry_to_info(entry), entry, nr_pages); +} + static struct swap_info_struct *_swap_info_get(swp_entry_t entry) { struct swap_info_struct *si; @@ -1729,28 +1798,6 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -/* - * Caller has made sure that the swap device corresponding to entry - * is still around or has not been recycled. - */ -void swap_free_nr(swp_entry_t entry, int nr_pages) -{ - int nr; - struct swap_info_struct *sis; - unsigned long offset = swp_offset(entry); - - sis = _swap_info_get(entry); - if (!sis) - return; - - while (nr_pages) { - nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); - offset += nr; - nr_pages -= nr; - } -} - /* * Called after dropping swapcache to decrease refcnt to swap entries. */ @@ -1940,16 +1987,19 @@ bool folio_free_swap(struct folio *folio) } /** - * free_swap_and_cache_nr() - Release reference on range of swap entries and - * reclaim their cache if no more references remain. + * swap_put_entries_direct() - Release reference on range of swap entries and + * reclaim their cache if no more references remain. * @entry: First entry of range. * @nr: Number of entries in range. * * For each swap entry in the contiguous range, release a reference. If any swap * entries become free, try to reclaim their underlying folios, if present. The * offset range is defined by [entry.offset, entry.offset + nr). + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being released. */ -void free_swap_and_cache_nr(swp_entry_t entry, int nr) +void swap_put_entries_direct(swp_entry_t entry, int nr) { const unsigned long start_offset = swp_offset(entry); const unsigned long end_offset = start_offset + nr; @@ -1958,10 +2008,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) unsigned long offset; si = get_swap_device(entry); - if (!si) + if (WARN_ON_ONCE(!si)) return; - - if (WARN_ON(end_offset > si->max)) + if (WARN_ON_ONCE(end_offset > si->max)) goto out; /* @@ -2005,8 +2054,8 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) } #ifdef CONFIG_HIBERNATION - -swp_entry_t get_swap_page_of_type(int type) +/* Allocate a slot for hibernation */ +swp_entry_t swap_alloc_hibernation_slot(int type) { struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; @@ -2034,6 +2083,26 @@ swp_entry_t get_swap_page_of_type(int type) return entry; } +/* Free a slot allocated by swap_alloc_hibernation_slot */ +void swap_free_hibernation_slot(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + pgoff_t offset = swp_offset(entry); + + si = get_swap_device(entry); + if (WARN_ON(!si)) + return; + + ci = swap_cluster_lock(si, offset); + swap_entry_put_locked(si, ci, entry, 1); + swap_cluster_unlock(ci); + + /* In theory readahead might add it to the swap cache by accident */ + __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); + put_swap_device(si); +} + /* * Find the swap type that corresponds to given device (if any). * @@ -2195,7 +2264,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, /* * Some architectures may have to restore extra metadata to the page * when reading from swap. This metadata may be indexed by swap entry - * so this must be called before swap_free(). + * so this must be called before folio_put_swap(). */ arch_swap_restore(folio_swap(entry, folio), folio); @@ -2236,7 +2305,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, new_pte = pte_mkuffd_wp(new_pte); setpte: set_pte_at(vma->vm_mm, addr, pte, new_pte); - swap_free(entry); + folio_put_swap(swapcache, folio_file_page(swapcache, swp_offset(entry))); out: if (pte) pte_unmap_unlock(pte, ptl); @@ -3746,28 +3815,22 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) return err; } -/** - * swap_duplicate_nr() - Increase reference count of nr contiguous swap entries - * by 1. - * +/* + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. * @entry: first swap entry from which we want to increase the refcount. - * @nr: Number of entries in range. * * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required * but could not be atomically allocated. Returns 0, just as if it succeeded, * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which * might occur if a page table entry has got corrupted. * - * Note that we are currently not handling the case where nr > 1 and we need to - * add swap count continuation. This is OK, because no such user exists - shmem - * is the only user that can pass nr > 1, and it never re-duplicates any swap - * entry it owns. + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being increased. */ -int swap_duplicate_nr(swp_entry_t entry, int nr) +int swap_dup_entry_direct(swp_entry_t entry) { int err = 0; - - while (!err && __swap_duplicate(entry, 1, nr) == -ENOMEM) + while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) err = add_swap_count_continuation(entry, GFP_ATOMIC); return err; }