diff --git a/include/linux/swap.h b/include/linux/swap.h index 62fc7499b408..0effe3cc50f5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,7 +208,6 @@ enum { SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ - SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_ACTIVATED = (1 << 7), /* set after swap_activate success */ SWP_FS_OPS = (1 << 8), /* swapfile operations go through fs */ @@ -223,16 +222,6 @@ enum { #define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10) #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX -/* Bit flag in swap_map */ -#define COUNT_CONTINUED 0x80 /* Flag swap_map continuation for full count */ - -/* Special value in first swap_map */ -#define SWAP_MAP_MAX 0x3e /* Max count */ -#define SWAP_MAP_BAD 0x3f /* Note page is bad */ - -/* Special value in each swap_map continuation */ -#define SWAP_CONT_MAX 0x7f /* Max count */ - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. This also prevents the @@ -264,8 +253,7 @@ struct swap_info_struct { signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ signed char type; /* strange name for an index */ - unsigned int max; /* extent of the swap_map */ - unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + unsigned int max; /* size of this swap device */ unsigned long *zeromap; /* kvmalloc'ed bitmap to track zero pages */ struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ struct list_head free_clusters; /* free clusters list */ @@ -284,18 +272,14 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, inuse_pages and all cluster - * lists. other fields are only changed + * inuse_pages and all cluster lists. + * Other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If * both locks need hold, hold swap_lock * first. */ - spinlock_t cont_lock; /* - * protect swap count continuation page - * list. 
- */ struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ @@ -451,7 +435,6 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -extern int add_swap_count_continuation(swp_entry_t, gfp_t); int swap_type_of(dev_t device, sector_t offset); int find_first_swap(dev_t *device); extern unsigned int count_swap_pages(int, int); @@ -517,11 +500,6 @@ static inline void free_swap_cache(struct folio *folio) { } -static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) -{ - return 0; -} - static inline int swap_dup_entry_direct(swp_entry_t ent) { return 0; diff --git a/mm/memory.c b/mm/memory.c index 2f815a34d924..7084c426f933 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1346,7 +1346,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, if (ret == -EIO) { VM_WARN_ON_ONCE(!entry.val); - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + if (swap_retry_table_alloc(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } diff --git a/mm/swap.h b/mm/swap.h index bfafa637c458..0a91e21e92b1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -37,6 +37,7 @@ struct swap_cluster_info { u8 flags; u8 order; atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ + unsigned int *extend_table; /* For large swap count, protected by ci->lock */ struct list_head list; }; @@ -183,6 +184,8 @@ static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) spin_unlock_irq(&ci->lock); } +extern int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp); + /* * Below are the core routines for doing swap for a folio. * All helpers requires the folio to be locked, and a locked folio @@ -206,9 +209,9 @@ int folio_dup_swap(struct folio *folio, struct page *subpage); void folio_put_swap(struct folio *folio, struct page *subpage); /* For internal use */ -extern void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages); +extern void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_off, unsigned int nr_pages); /* linux/mm/page_io.c */ int sio_pool_init(void); @@ -446,6 +449,11 @@ static inline int swap_writeout(struct folio *folio, return 0; } +static inline int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + return -EINVAL; +} + static inline bool swap_cache_has_folio(swp_entry_t entry) { return false; diff --git a/mm/swap_state.c b/mm/swap_state.c index e213ee35c1d2..e7618ffe6d70 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -140,21 +140,20 @@ void *swap_cache_get_shadow(swp_entry_t entry) void __swap_cache_add_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry) { - unsigned long new_tb; - unsigned int ci_start, ci_off, ci_end; + unsigned int ci_off = swp_cluster_offset(entry), ci_end; unsigned long nr_pages = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + unsigned long old_tb; VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - new_tb = folio_to_swp_tb(folio, 0); - ci_start = swp_cluster_offset(entry); - ci_off = ci_start; - ci_end = ci_start + nr_pages; + ci_end = ci_off + nr_pages; do { - VM_WARN_ON_ONCE(swp_tb_is_folio(__swap_table_get(ci, ci_off))); - __swap_table_set(ci, ci_off, 
 new_tb); + old_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); folio_ref_add(folio, nr_pages); @@ -183,14 +182,13 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, unsigned long old_tb; struct swap_info_struct *si; struct swap_cluster_info *ci; - unsigned int ci_start, ci_off, ci_end, offset; + unsigned int ci_start, ci_off, ci_end; unsigned long nr_pages = folio_nr_pages(folio); si = __swap_entry_to_info(entry); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; - offset = swp_offset(entry); ci = swap_cluster_lock(si, swp_offset(entry)); if (unlikely(!ci->table)) { err = -ENOENT; @@ -202,13 +200,12 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, err = -EEXIST; goto failed; } - if (unlikely(!__swap_count(swp_entry(swp_type(entry), offset)))) { + if (unlikely(!__swp_tb_get_count(old_tb))) { err = -ENOENT; goto failed; } if (swp_tb_is_shadow(old_tb)) shadow = swp_tb_to_shadow(old_tb); - offset++; } while (++ci_off < ci_end); __swap_cache_add_folio(ci, folio, entry); swap_cluster_unlock(ci); @@ -237,8 +234,9 @@ static int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { + int count; + unsigned long old_tb; struct swap_info_struct *si; - unsigned long old_tb, new_tb; unsigned int ci_start, ci_off, ci_end; bool folio_swapped = false, need_free = false; unsigned long nr_pages = folio_nr_pages(folio); @@ -249,20 +247,20 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); si = __swap_entry_to_info(entry); - new_tb = shadow_to_swp_tb(shadow, 0); ci_start = swp_cluster_offset(entry); ci_end = ci_start + nr_pages; ci_off = ci_start; do { - /* If shadow is NULL, we sets an empty shadow */ - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != folio); - if (__swap_count(swp_entry(si->type, - swp_offset(entry) + ci_off - ci_start))) + count = __swp_tb_get_count(old_tb); + if (count) folio_swapped = true; else need_free = true; + /* If shadow is NULL, we set an empty shadow. 
*/ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(shadow, count)); } while (++ci_off < ci_end); folio->swap.val = 0; @@ -271,13 +269,13 @@ void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); if (!folio_swapped) { - swap_entries_free(si, ci, swp_offset(entry), nr_pages); + __swap_cluster_free_entries(si, ci, ci_start, nr_pages); } else if (need_free) { + ci_off = ci_start; do { - if (!__swap_count(entry)) - swap_entries_free(si, ci, swp_offset(entry), 1); - entry.val++; - } while (--nr_pages); + if (!__swp_tb_get_count(__swap_table_get(ci, ci_off))) + __swap_cluster_free_entries(si, ci, ci_off, 1); + } while (++ci_off < ci_end); } } @@ -324,17 +322,18 @@ void __swap_cache_replace_folio(struct swap_cluster_info *ci, unsigned long nr_pages = folio_nr_pages(new); unsigned int ci_off = swp_cluster_offset(entry); unsigned int ci_end = ci_off + nr_pages; - unsigned long old_tb, new_tb; + unsigned long pfn = folio_pfn(new); + unsigned long old_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ - new_tb = folio_to_swp_tb(new, 0); do { - old_tb = __swap_table_xchg(ci, ci_off, new_tb); + old_tb = __swap_table_get(ci, ci_off); WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + __swap_table_set(ci, ci_off, pfn_to_swp_tb(pfn, __swp_tb_get_count(old_tb))); } while (++ci_off < ci_end); /* @@ -368,7 +367,7 @@ void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) ci_end = ci_off + nr_ents; do { old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); - WARN_ON_ONCE(swp_tb_is_folio(old)); + WARN_ON_ONCE(swp_tb_is_folio(old) || swp_tb_get_count(old)); } while (++ci_off < ci_end); } diff --git a/mm/swap_table.h b/mm/swap_table.h index 10762ac5f4f5..8415ffbe2b9c 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -191,6 +191,11 @@ static inline int swp_tb_get_count(unsigned long swp_tb) return -EINVAL; } +static inline unsigned long __swp_tb_mk_count(unsigned long swp_tb, int count) +{ + return ((swp_tb & ~SWP_TB_COUNT_MASK) | __count_to_swp_tb(count)); +} + /* * Helpers for accessing or modifying the swap table of a cluster, * the swap cluster must be locked. 
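Note (illustrative, not part of the patch): the mm/swap_table.h hunk above adds __swp_tb_mk_count(), which rebuilds a swap table entry with a new count while keeping its folio/shadow payload. As a minimal sketch of how the in-table count of one slot can be bumped under ci->lock, assuming the helpers this series already provides (__swap_table_get(), __swap_table_set(), __swp_tb_get_count(), SWP_TB_COUNT_MAX) and a hypothetical function name:

static void example_bump_count(struct swap_cluster_info *ci, unsigned int ci_off)
{
	/* Illustration only; caller must hold ci->lock */
	unsigned long swp_tb = __swap_table_get(ci, ci_off);
	int count = __swp_tb_get_count(swp_tb);

	if (count < SWP_TB_COUNT_MAX - 1)
		/* Rewrite the same entry with count + 1, keeping the folio/shadow bits */
		__swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1));
	/* Counts at or beyond SWP_TB_COUNT_MAX - 1 spill into ci->extend_table, see mm/swapfile.c */
}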
diff --git a/mm/swapfile.c b/mm/swapfile.c index 54a19ebce540..cf976ecae8a8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,15 +51,8 @@ #include "swap_table.h" #include "swap.h" -static bool swap_count_continued(struct swap_info_struct *, pgoff_t, - unsigned char); -static void free_swap_count_continuations(struct swap_info_struct *); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr); -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset); static bool folio_swapcache_freeable(struct folio *folio); static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, @@ -182,22 +175,19 @@ static long swap_usage_in_pages(struct swap_info_struct *si) /* Reclaim the swap entry if swap is getting full */ #define TTRS_FULL 0x4 -static bool swap_only_has_cache(struct swap_info_struct *si, - struct swap_cluster_info *ci, +static bool swap_only_has_cache(struct swap_cluster_info *ci, unsigned long offset, int nr_pages) { unsigned int ci_off = offset % SWAPFILE_CLUSTER; - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; do { swp_tb = __swap_table_get(ci, ci_off); VM_WARN_ON_ONCE(!swp_tb_is_folio(swp_tb)); - if (*map) + if (swp_tb_get_count(swp_tb)) return false; - ++ci_off; - } while (++map < map_end); + } while (++ci_off < ci_end); return true; } @@ -256,7 +246,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * reference or pending writeback, and can't be allocated to others. */ ci = swap_cluster_lock(si, offset); - need_reclaim = swap_only_has_cache(si, ci, offset, nr_pages); + need_reclaim = swap_only_has_cache(ci, offset, nr_pages); swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -479,6 +469,7 @@ static void swap_cluster_assert_empty(struct swap_cluster_info *ci, } while (++ci_off < ci_end); WARN_ON_ONCE(bad_slots != (swapoff ? 
ci->count : 0)); + WARN_ON_ONCE(nr == SWAPFILE_CLUSTER && ci->extend_table); } static void swap_cluster_free_table(struct swap_cluster_info *ci) @@ -807,7 +798,6 @@ static int swap_cluster_setup_bad_slot(struct swap_info_struct *si, pr_warn("Duplicated bad slot offset %d\n", offset); ret = -EINVAL; } else { - si->swap_map[offset] = SWAP_MAP_BAD; ci->count++; } spin_unlock(&ci->lock); @@ -829,18 +819,16 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, { unsigned int nr_pages = 1 << order; unsigned long offset = start, end = start + nr_pages; - unsigned char *map = si->swap_map; unsigned long swp_tb; spin_unlock(&ci->lock); do { - if (READ_ONCE(map[offset])) - break; swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + if (swp_tb_get_count(swp_tb)) + break; + if (swp_tb_is_folio(swp_tb)) if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY) < 0) break; - } } while (++offset < end); spin_lock(&ci->lock); @@ -864,7 +852,7 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, */ for (offset = start; offset < end; offset++) { swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (map[offset] || !swp_tb_is_null(swp_tb)) + if (!swp_tb_is_null(swp_tb)) return false; } @@ -876,37 +864,35 @@ static bool cluster_scan_range(struct swap_info_struct *si, unsigned long offset, unsigned int nr_pages, bool *need_reclaim) { - unsigned long end = offset + nr_pages; - unsigned char *map = si->swap_map; + unsigned int ci_off = offset % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr_pages; unsigned long swp_tb; - if (cluster_is_empty(ci)) - return true; - do { - if (map[offset]) - return false; - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - if (swp_tb_is_folio(swp_tb)) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_is_null(swp_tb)) + continue; + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { if (!vm_swap_full()) return false; *need_reclaim = true; - } else { - /* A entry with no count and no cache must be null */ - VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + continue; } - } while (++offset < end); + /* Slot with zero count can only be NULL or folio */ + VM_WARN_ON(!swp_tb_get_count(swp_tb)); + return false; + } while (++ci_off < ci_end); return true; } -static bool cluster_alloc_range(struct swap_info_struct *si, - struct swap_cluster_info *ci, - struct folio *folio, - unsigned int offset) +static bool __swap_cluster_alloc_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + struct folio *folio, + unsigned int ci_off) { - unsigned long nr_pages; unsigned int order; + unsigned long nr_pages; lockdep_assert_held(&ci->lock); @@ -925,14 +911,15 @@ static bool cluster_alloc_range(struct swap_info_struct *si, if (likely(folio)) { order = folio_order(folio); nr_pages = 1 << order; - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); - __swap_cache_add_folio(ci, folio, swp_entry(si->type, offset)); + swap_cluster_assert_empty(ci, ci_off, nr_pages, false); + __swap_cache_add_folio(ci, folio, swp_entry(si->type, + ci_off + cluster_offset(si, ci))); } else if (IS_ENABLED(CONFIG_HIBERNATION)) { order = 0; nr_pages = 1; - WARN_ON_ONCE(si->swap_map[offset]); - si->swap_map[offset] = 1; - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, 1, false); + swap_cluster_assert_empty(ci, ci_off, 1, false); + /* Sets a fake shadow as placeholder */ + __swap_table_set(ci, ci_off, shadow_to_swp_tb(NULL, 1)); } else { /* Allocation without folio is only possible with 
hibernation */ WARN_ON_ONCE(1); @@ -983,7 +970,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, if (!ret) continue; } - if (!cluster_alloc_range(si, ci, folio, offset)) + if (!__swap_cluster_alloc_entries(si, ci, folio, offset % SWAPFILE_CLUSTER)) break; found = offset; offset += nr_pages; @@ -1030,7 +1017,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) long to_scan = 1; unsigned long offset, end; struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; + unsigned long swp_tb; int nr_reclaim; if (force) @@ -1042,8 +1029,8 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) to_scan--; while (offset < end) { - if (!READ_ONCE(map[offset]) && - swp_tb_is_folio(swap_table_get(ci, offset % SWAPFILE_CLUSTER))) { + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); + if (swp_tb_is_folio(swp_tb) && !__swp_tb_get_count(swp_tb)) { spin_unlock(&ci->lock); nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY); @@ -1452,40 +1439,127 @@ static bool swap_sync_discard(void) return false; } +static int swap_extend_table_alloc(struct swap_info_struct *si, + struct swap_cluster_info *ci, gfp_t gfp) +{ + void *table; + + table = kzalloc(sizeof(ci->extend_table[0]) * SWAPFILE_CLUSTER, gfp); + if (!table) + return -ENOMEM; + + spin_lock(&ci->lock); + if (!ci->extend_table) + ci->extend_table = table; + else + kfree(table); + spin_unlock(&ci->lock); + return 0; +} + +int swap_retry_table_alloc(swp_entry_t entry, gfp_t gfp) +{ + int ret; + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = get_swap_device(entry); + if (!si) + return 0; + + ci = __swap_offset_to_cluster(si, offset); + ret = swap_extend_table_alloc(si, ci, gfp); + + put_swap_device(si); + return ret; +} + +static void swap_extend_table_try_free(struct swap_cluster_info *ci) +{ + unsigned long i; + bool can_free = true; + + if (!ci->extend_table) + return; + + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (ci->extend_table[i]) + can_free = false; + } + + if (can_free) { + kfree(ci->extend_table); + ci->extend_table = NULL; + } +} + +/* Decrease the swap count of one slot, without freeing it */ +static void __swap_cluster_put_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + count = __swp_tb_get_count(swp_tb); + + VM_WARN_ON_ONCE(count <= 0); + VM_WARN_ON_ONCE(count > SWP_TB_COUNT_MAX); + + if (count == SWP_TB_COUNT_MAX) { + count = ci->extend_table[ci_off]; + /* Overflow starts with SWP_TB_COUNT_MAX */ + VM_WARN_ON_ONCE(count < SWP_TB_COUNT_MAX); + count--; + if (count == (SWP_TB_COUNT_MAX - 1)) { + ci->extend_table[ci_off] = 0; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count)); + swap_extend_table_try_free(ci); + } else { + ci->extend_table[ci_off] = count; + } + } else { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, --count)); + } +} + /** - * swap_put_entries_cluster - Decrease the swap count of a set of slots. + * swap_put_entries_cluster - Decrease the swap count of slots within one cluster * @si: The swap device. - * @start: start offset of slots. + * @offset: start offset of slots. * @nr: number of slots. - * @reclaim_cache: if true, also reclaim the swap cache. + * @reclaim_cache: if true, also reclaim the swap cache if slots are freed. 
* * This helper decreases the swap count of a set of slots and tries to * batch free them. Also reclaims the swap cache if @reclaim_cache is true. - * Context: The caller must ensure that all slots belong to the same - * cluster and their swap count doesn't go underflow. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. */ static void swap_put_entries_cluster(struct swap_info_struct *si, - unsigned long start, int nr, + pgoff_t offset, int nr, bool reclaim_cache) { - unsigned long offset = start, end = start + nr; - unsigned long batch_start = SWAP_ENTRY_INVALID; struct swap_cluster_info *ci; + unsigned int ci_off, ci_end; + pgoff_t end = offset + nr; bool need_reclaim = false; unsigned int nr_reclaimed; unsigned long swp_tb; - unsigned int count; + int ci_batch = -1; ci = swap_cluster_lock(si, offset); + ci_off = offset % SWAPFILE_CLUSTER; + ci_end = ci_off + nr; do { - swp_tb = __swap_table_get(ci, offset % SWAPFILE_CLUSTER); - count = si->swap_map[offset]; - VM_WARN_ON(count < 1 || count == SWAP_MAP_BAD); - if (count == 1) { + swp_tb = __swap_table_get(ci, ci_off); + if (swp_tb_get_count(swp_tb) == 1) { /* count == 1 and non-cached slots will be batch freed. */ if (!swp_tb_is_folio(swp_tb)) { - if (!batch_start) - batch_start = offset; + if (ci_batch == -1) + ci_batch = ci_off; continue; } /* count will be 0 after put, slot can be reclaimed */ @@ -1497,21 +1571,20 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, * slots will be freed when folio is removed from swap cache * (__swap_cache_del_folio). */ - swap_put_entry_locked(si, ci, offset); - if (batch_start) { - swap_entries_free(si, ci, batch_start, offset - batch_start); - batch_start = SWAP_ENTRY_INVALID; + __swap_cluster_put_entry(ci, ci_off); + if (ci_batch != -1) { + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); + ci_batch = -1; } - } while (++offset < end); + } while (++ci_off < ci_end); - if (batch_start) - swap_entries_free(si, ci, batch_start, offset - batch_start); + if (ci_batch != -1) + __swap_cluster_free_entries(si, ci, ci_batch, ci_off - ci_batch); swap_cluster_unlock(ci); if (!need_reclaim || !reclaim_cache) return; - offset = start; do { nr_reclaimed = __try_to_reclaim_swap(si, offset, TTRS_UNMAPPED | TTRS_FULL); @@ -1521,6 +1594,92 @@ static void swap_put_entries_cluster(struct swap_info_struct *si, } while (offset < end); } +/* Increase the swap count of one slot. 
*/ +static int __swap_cluster_dup_entry(struct swap_cluster_info *ci, + unsigned int ci_off) +{ + int count; + unsigned long swp_tb; + + lockdep_assert_held(&ci->lock); + swp_tb = __swap_table_get(ci, ci_off); + /* Bad or special slots can't be handled */ + if (WARN_ON_ONCE(swp_tb_is_bad(swp_tb))) + return -EINVAL; + count = __swp_tb_get_count(swp_tb); + /* Must be either cached or have a count already */ + if (WARN_ON_ONCE(!count && !swp_tb_is_folio(swp_tb))) + return -ENOENT; + + if (likely(count < (SWP_TB_COUNT_MAX - 1))) { + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, count + 1)); + VM_WARN_ON_ONCE(ci->extend_table && ci->extend_table[ci_off]); + } else if (count == (SWP_TB_COUNT_MAX - 1)) { + if (ci->extend_table) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off]); + ci->extend_table[ci_off] = SWP_TB_COUNT_MAX; + __swap_table_set(ci, ci_off, __swp_tb_mk_count(swp_tb, SWP_TB_COUNT_MAX)); + } else { + return -ENOMEM; + } + } else if (count == SWP_TB_COUNT_MAX) { + VM_WARN_ON_ONCE(ci->extend_table[ci_off] >= + type_max(typeof(ci->extend_table[0]))); + ++ci->extend_table[ci_off]; + } else { + /* Never happens unless counting went wrong */ + WARN_ON_ONCE(1); + } + + return 0; +} + +/** + * swap_dup_entries_cluster - Increase the swap count of slots within one cluster. + * @si: The swap device. + * @offset: start offset of slots. + * @nr: number of slots. + * + * Context: The specified slots must be pinned by existing swap count or swap + * cache reference, so they won't be released until this helper returns. + * Return: 0 on success. -ENOMEM if the swap count has maxed out (SWP_TB_COUNT_MAX) + * and an extended table could not be allocated, -EINVAL if any entry is a bad entry. + */ +static int swap_dup_entries_cluster(struct swap_info_struct *si, + pgoff_t offset, int nr) +{ + int err; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + + ci_start = offset % SWAPFILE_CLUSTER; + ci_end = ci_start + nr; + ci_off = ci_start; + ci = swap_cluster_lock(si, offset); +restart: + do { + err = __swap_cluster_dup_entry(ci, ci_off); + if (unlikely(err)) { + if (err == -ENOMEM) { + spin_unlock(&ci->lock); + err = swap_extend_table_alloc(si, ci, GFP_ATOMIC); + spin_lock(&ci->lock); + if (!err) + goto restart; + } + goto failed; + } + } while (++ci_off < ci_end); + swap_cluster_unlock(ci); + return 0; +failed: + while (ci_off-- > ci_start) + __swap_cluster_put_entry(ci, ci_off); + swap_extend_table_try_free(ci); + swap_cluster_unlock(ci); + return err; +} + /** * folio_alloc_swap - allocate swap space for a folio * @folio: folio we want to move to swap @@ -1589,13 +1748,10 @@ int folio_alloc_swap(struct folio *folio) * Context: Caller must ensure the folio is locked and in the swap cache. * NOTE: The caller also has to ensure there is no raced call to * swap_put_entries_direct on its swap entry before this helper returns, or - * the swap map may underflow. Currently, we only accept @subpage == NULL - * for shmem due to the limitation of swap continuation: shmem always - * duplicates the swap entry only once, so there is no such issue for it. + * the swap count may underflow. 
*/ int folio_dup_swap(struct folio *folio, struct page *subpage) { - int err = 0; swp_entry_t entry = folio->swap; unsigned long nr_pages = folio_nr_pages(folio); @@ -1607,10 +1763,8 @@ int folio_dup_swap(struct folio *folio, struct page *subpage) nr_pages = 1; } - while (!err && __swap_duplicate(entry, 1, nr_pages) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - - return err; + return swap_dup_entries_cluster(swap_entry_to_info(entry), + swp_offset(entry), nr_pages); } /** @@ -1639,28 +1793,6 @@ void folio_put_swap(struct folio *folio, struct page *subpage) swap_put_entries_cluster(si, swp_offset(entry), nr_pages, false); } -static void swap_put_entry_locked(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset) -{ - unsigned char count; - - count = si->swap_map[offset]; - if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { - if (count == COUNT_CONTINUED) { - if (swap_count_continued(si, offset, count)) - count = SWAP_MAP_MAX | COUNT_CONTINUED; - else - count = SWAP_MAP_MAX; - } else - count--; - } - - WRITE_ONCE(si->swap_map[offset], count); - if (!count && !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER))) - swap_entries_free(si, ci, offset, 1); -} - /* * When we get a swap entry, if there aren't some other ways to * prevent swapoff, such as the folio in swap cache is locked, RCU @@ -1727,31 +1859,30 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) } /* - * Drop the last ref of swap entries, caller have to ensure all entries - * belong to the same cgroup and cluster. + * Free a set of swap slots after their swap count dropped to zero, or will be + * zero after putting the last ref (saves one __swap_cluster_put_entry call). */ -void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, unsigned int nr_pages) +void __swap_cluster_free_entries(struct swap_info_struct *si, + struct swap_cluster_info *ci, + unsigned int ci_start, unsigned int nr_pages) { - swp_entry_t entry = swp_entry(si->type, offset); - unsigned char *map = si->swap_map + offset; - unsigned char *map_end = map + nr_pages; + unsigned long old_tb; + unsigned int ci_off = ci_start, ci_end = ci_start + nr_pages; + unsigned long offset = cluster_offset(si, ci) + ci_start; - /* It should never free entries across different clusters */ - VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); - VM_BUG_ON(cluster_is_empty(ci)); - VM_BUG_ON(ci->count < nr_pages); + VM_WARN_ON(ci->count < nr_pages); ci->count -= nr_pages; do { - VM_WARN_ON(*map > 1); - *map = 0; - } while (++map < map_end); + old_tb = __swap_table_get(ci, ci_off); + /* Release the last ref, or after swap cache is dropped */ + VM_WARN_ON(!swp_tb_is_shadow(old_tb) || __swp_tb_get_count(old_tb) > 1); + __swap_table_set(ci, ci_off, null_to_swp_tb()); + } while (++ci_off < ci_end); - mem_cgroup_uncharge_swap(entry, nr_pages); + mem_cgroup_uncharge_swap(swp_entry(si->type, offset), nr_pages); swap_range_free(si, offset, nr_pages); - swap_cluster_assert_empty(ci, offset % SWAPFILE_CLUSTER, nr_pages, false); + swap_cluster_assert_empty(ci, ci_start, nr_pages, false); if (!ci->count) free_cluster(si, ci); @@ -1761,10 +1892,10 @@ void swap_entries_free(struct swap_info_struct *si, int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = __swap_entry_to_info(entry); - pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry); - 
return si->swap_map[offset]; + return swp_tb_get_count(__swap_table_get(ci, ci_off)); } /** @@ -1776,81 +1907,62 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) { pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; - int count; + unsigned long swp_tb; ci = swap_cluster_lock(si, offset); - count = si->swap_map[offset]; + swp_tb = swap_table_get(ci, offset % SWAPFILE_CLUSTER); swap_cluster_unlock(ci); - return count && count != SWAP_MAP_BAD; + return swp_tb_get_count(swp_tb) > 0; } /* * How many references to @entry are currently swapped out? - * This considers COUNT_CONTINUED so it returns exact answer. + * This returns exact answer. */ int swp_swapcount(swp_entry_t entry) { - int count, tmp_count, n; struct swap_info_struct *si; struct swap_cluster_info *ci; - struct page *page; - pgoff_t offset; - unsigned char *map; + unsigned long swp_tb; + int count; si = get_swap_device(entry); if (!si) return 0; - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - if (!(count & COUNT_CONTINUED)) - goto out; - - count &= ~COUNT_CONTINUED; - n = SWAP_MAP_MAX + 1; - - page = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - VM_BUG_ON(page_private(page) != SWP_CONTINUED); - - do { - page = list_next_entry(page, lru); - map = kmap_local_page(page); - tmp_count = map[offset]; - kunmap_local(map); - - count += (tmp_count & ~COUNT_CONTINUED) * n; - n *= (SWAP_CONT_MAX + 1); - } while (tmp_count & COUNT_CONTINUED); -out: + ci = swap_cluster_lock(si, swp_offset(entry)); + swp_tb = __swap_table_get(ci, swp_cluster_offset(entry)); + count = swp_tb_get_count(swp_tb); + if (count == SWP_TB_COUNT_MAX) + count = ci->extend_table[swp_cluster_offset(entry)]; swap_cluster_unlock(ci); put_swap_device(si); - return count; + + return count < 0 ? 
0 : count; } static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry, int order) { struct swap_cluster_info *ci; - unsigned char *map = si->swap_map; unsigned int nr_pages = 1 << order; unsigned long roffset = swp_offset(entry); unsigned long offset = round_down(roffset, nr_pages); + unsigned int ci_off; int i; bool ret = false; ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { - if (map[roffset]) + ci_off = roffset % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) ret = true; goto unlock_out; } for (i = 0; i < nr_pages; i++) { - if (map[offset + i]) { + ci_off = (offset + i) % SWAPFILE_CLUSTER; + if (swp_tb_get_count(__swap_table_get(ci, ci_off))) { ret = true; break; } @@ -2016,7 +2128,8 @@ void swap_free_hibernation_slot(swp_entry_t entry) return; ci = swap_cluster_lock(si, offset); - swap_put_entry_locked(si, ci, offset); + __swap_cluster_put_entry(ci, offset % SWAPFILE_CLUSTER); + __swap_cluster_free_entries(si, ci, offset % SWAPFILE_CLUSTER, 1); swap_cluster_unlock(ci); /* In theory readahead might add it to the swap cache by accident */ @@ -2242,13 +2355,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned int type) { pte_t *pte = NULL; - struct swap_info_struct *si; - si = swap_info[type]; do { struct folio *folio; - unsigned long offset; - unsigned char swp_count; + unsigned long swp_tb; softleaf_t entry; int ret; pte_t ptent; @@ -2267,7 +2377,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (swp_type(entry) != type) continue; - offset = swp_offset(entry); pte_unmap(pte); pte = NULL; @@ -2284,8 +2393,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, &vmf); } if (!folio) { - swp_count = READ_ONCE(si->swap_map[offset]); - if (swp_count == 0 || swp_count == SWAP_MAP_BAD) + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_get_count(swp_tb) <= 0) continue; return -ENOMEM; } @@ -2413,7 +2523,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap table from current position to next entry still in use. * Return 0 if there are no inuse entries after prev till end of * the map. */ @@ -2422,7 +2532,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int i; unsigned long swp_tb; - unsigned char count; /* * No need for swap_lock here: we're just looking @@ -2431,12 +2540,9 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * allocations from this area (while holding swap_lock). 
*/ for (i = prev + 1; i < si->max; i++) { - count = READ_ONCE(si->swap_map[i]); swp_tb = swap_table_get(__swap_offset_to_cluster(si, i), i % SWAPFILE_CLUSTER); - if (count == SWAP_MAP_BAD) - continue; - if (count || swp_tb_is_folio(swp_tb)) + if (!swp_tb_is_null(swp_tb) && !swp_tb_is_bad(swp_tb)) break; if ((i % LATENCY_LIMIT) == 0) cond_resched(); @@ -2796,7 +2902,6 @@ static void flush_percpu_swap_cluster(struct swap_info_struct *si) SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; - unsigned char *swap_map; unsigned long *zeromap; struct swap_cluster_info *cluster_info; struct file *swap_file, *victim; @@ -2874,8 +2979,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) flush_percpu_swap_cluster(p); destroy_swap_extents(p, p->swap_file); - if (p->flags & SWP_CONTINUED) - free_swap_count_continuations(p); if (!(p->flags & SWP_SOLIDSTATE)) atomic_dec(&nr_rotate_swap); @@ -2887,8 +2990,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - swap_map = p->swap_map; - p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; maxpages = p->max; @@ -2902,7 +3003,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); kfree(p->global_cluster); p->global_cluster = NULL; - vfree(swap_map); kvfree(zeromap); free_swap_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ @@ -3122,7 +3222,6 @@ static struct swap_info_struct *alloc_swap_info(void) kvfree(defer); } spin_lock_init(&p->lock); - spin_lock_init(&p->cont_lock); atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT); init_completion(&p->comp); @@ -3249,19 +3348,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, return maxpages; } -static int setup_swap_map(struct swap_info_struct *si, - union swap_header *swap_header, - unsigned long maxpages) -{ - unsigned char *swap_map; - - swap_map = vzalloc(maxpages); - si->swap_map = swap_map; - if (!swap_map) - return -ENOMEM; - return 0; -} - static int setup_swap_clusters_info(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) @@ -3446,11 +3532,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) maxpages = si->max; - /* Setup the swap map and apply bad block */ - error = setup_swap_map(si, swap_header, maxpages); - if (error) - goto bad_swap_unlock_inode; - /* Set up the swap cluster info */ error = setup_swap_clusters_info(si, swap_header, maxpages); if (error) @@ -3571,8 +3652,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) inode = NULL; destroy_swap_extents(si, swap_file); swap_cgroup_swapoff(si->type); - vfree(si->swap_map); - si->swap_map = NULL; free_swap_cluster_info(si->cluster_info, si->max); si->cluster_info = NULL; kvfree(si->zeromap); @@ -3614,67 +3693,20 @@ void si_swapinfo(struct sysinfo *val) } /* - * Verify that nr swap entries are valid and increment their swap map counts. + * swap_dup_entry_direct() - Increase reference count of a swap entry by one. + * @entry: first swap entry from which we want to increase the refcount. * - * Returns error code in following case. - * - success -> 0 - * - swp_entry is invalid -> EINVAL - * - swap-mapped reference is requested but the entry is not used. -> ENOENT - * - swap-mapped reference requested but needs continued swap count. -> ENOMEM + * Returns 0 for success, or -ENOMEM if the extend table is required + * but could not be atomically allocated. 
Returns -EINVAL if the swap + * entry is invalid, which might occur if a page table entry has got + * corrupted. + * + * Context: Caller must ensure there is no race condition on the reference + * owner. e.g., locking the PTL of a PTE containing the entry being increased. */ -static int swap_dup_entries(struct swap_info_struct *si, - struct swap_cluster_info *ci, - unsigned long offset, - unsigned char usage, int nr) +int swap_dup_entry_direct(swp_entry_t entry) { - int i; - unsigned char count; - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - /* - * For swapin out, allocator never allocates bad slots. for - * swapin, readahead is guarded by swap_entry_swapped. - */ - if (WARN_ON(count == SWAP_MAP_BAD)) - return -ENOENT; - /* - * Swap count duplication must be guarded by either swap cache folio (from - * folio_dup_swap) or external lock of existing entry (from swap_dup_entry_direct). - */ - if (WARN_ON(!count && - !swp_tb_is_folio(__swap_table_get(ci, offset % SWAPFILE_CLUSTER)))) - return -ENOENT; - if (WARN_ON((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)) - return -EINVAL; - } - - for (i = 0; i < nr; i++) { - count = si->swap_map[offset + i]; - if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) - count += usage; - else if (swap_count_continued(si, offset + i, count)) - count = COUNT_CONTINUED; - else { - /* - * Don't need to rollback changes, because if - * usage == 1, there must be nr == 1. - */ - return -ENOMEM; - } - - WRITE_ONCE(si->swap_map[offset + i], count); - } - - return 0; -} - -static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) -{ - int err; struct swap_info_struct *si; - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { @@ -3682,253 +3714,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) return -EINVAL; } - VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - ci = swap_cluster_lock(si, offset); - err = swap_dup_entries(si, ci, offset, usage, nr); - swap_cluster_unlock(ci); - return err; -} - -/* - * swap_dup_entry_direct() - Increase reference count of a swap entry by one. - * @entry: first swap entry from which we want to increase the refcount. - * - * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required - * but could not be atomically allocated. Returns 0, just as if it succeeded, - * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which - * might occur if a page table entry has got corrupted. - * - * Context: Caller must ensure there is no race condition on the reference - * owner. e.g., locking the PTL of a PTE containing the entry being increased. - */ -int swap_dup_entry_direct(swp_entry_t entry) -{ - int err = 0; - while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM) - err = add_swap_count_continuation(entry, GFP_ATOMIC); - return err; -} - -/* - * add_swap_count_continuation - called when a swap count is duplicated - * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's - * page of the original vmalloc'ed swap_map, to hold the continuation count - * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called - * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. - * - * These continuation pages are seldom referenced: the common paths all work - * on the original swap_map, only referring to a continuation page when the - * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. 
- * - * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding - * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) - * can be called after dropping locks. - */ -int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) -{ - struct swap_info_struct *si; - struct swap_cluster_info *ci; - struct page *head; - struct page *page; - struct page *list_page; - pgoff_t offset; - unsigned char count; - int ret = 0; - - /* - * When debugging, it's easier to use __GFP_ZERO here; but it's better - * for latency not to zero a page while GFP_ATOMIC and holding locks. - */ - page = alloc_page(gfp_mask | __GFP_HIGHMEM); - - si = get_swap_device(entry); - if (!si) { - /* - * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap device may be swapoff - */ - goto outer; - } - - offset = swp_offset(entry); - - ci = swap_cluster_lock(si, offset); - - count = si->swap_map[offset]; - - if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { - /* - * The higher the swap count, the more likely it is that tasks - * will race to add swap count continuation: we need to avoid - * over-provisioning. - */ - goto out; - } - - if (!page) { - ret = -ENOMEM; - goto out; - } - - head = vmalloc_to_page(si->swap_map + offset); - offset &= ~PAGE_MASK; - - spin_lock(&si->cont_lock); - /* - * Page allocation does not initialize the page's lru field, - * but it does always reset its private field. - */ - if (!page_private(head)) { - BUG_ON(count & COUNT_CONTINUED); - INIT_LIST_HEAD(&head->lru); - set_page_private(head, SWP_CONTINUED); - si->flags |= SWP_CONTINUED; - } - - list_for_each_entry(list_page, &head->lru, lru) { - unsigned char *map; - - /* - * If the previous map said no continuation, but we've found - * a continuation page, free our allocation and use this one. - */ - if (!(count & COUNT_CONTINUED)) - goto out_unlock_cont; - - map = kmap_local_page(list_page) + offset; - count = *map; - kunmap_local(map); - - /* - * If this continuation count now has some space in it, - * free our allocation and use this one. - */ - if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) - goto out_unlock_cont; - } - - list_add_tail(&page->lru, &head->lru); - page = NULL; /* now it's attached, don't free it */ -out_unlock_cont: - spin_unlock(&si->cont_lock); -out: - swap_cluster_unlock(ci); - put_swap_device(si); -outer: - if (page) - __free_page(page); - return ret; -} - -/* - * swap_count_continued - when the original swap_map count is incremented - * from SWAP_MAP_MAX, check if there is already a continuation page to carry - * into, carry if so, or else fail until a new continuation page is allocated; - * when the original swap_map count is decremented from 0 with continuation, - * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of swap_put_entry_locked() - * holds cluster lock. 
- */ -static bool swap_count_continued(struct swap_info_struct *si, - pgoff_t offset, unsigned char count) -{ - struct page *head; - struct page *page; - unsigned char *map; - bool ret; - - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head) != SWP_CONTINUED) { - BUG_ON(count & COUNT_CONTINUED); - return false; /* need to add count continuation */ - } - - spin_lock(&si->cont_lock); - offset &= ~PAGE_MASK; - page = list_next_entry(head, lru); - map = kmap_local_page(page) + offset; - - if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ - goto init_map; /* jump over SWAP_CONT_MAX checks */ - - if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ - /* - * Think of how you add 1 to 999 - */ - while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - if (*map == SWAP_CONT_MAX) { - kunmap_local(map); - page = list_next_entry(page, lru); - if (page == head) { - ret = false; /* add count continuation */ - goto out; - } - map = kmap_local_page(page) + offset; -init_map: *map = 0; /* we didn't zero the page */ - } - *map += 1; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = COUNT_CONTINUED; - kunmap_local(map); - } - ret = true; /* incremented */ - - } else { /* decrementing */ - /* - * Think of how you subtract 1 from 1000 - */ - BUG_ON(count != COUNT_CONTINUED); - while (*map == COUNT_CONTINUED) { - kunmap_local(map); - page = list_next_entry(page, lru); - BUG_ON(page == head); - map = kmap_local_page(page) + offset; - } - BUG_ON(*map == 0); - *map -= 1; - if (*map == 0) - count = 0; - kunmap_local(map); - while ((page = list_prev_entry(page, lru)) != head) { - map = kmap_local_page(page) + offset; - *map = SWAP_CONT_MAX | count; - count = COUNT_CONTINUED; - kunmap_local(map); - } - ret = count == COUNT_CONTINUED; - } -out: - spin_unlock(&si->cont_lock); - return ret; -} - -/* - * free_swap_count_continuations - swapoff free all the continuation pages - * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. - */ -static void free_swap_count_continuations(struct swap_info_struct *si) -{ - pgoff_t offset; - - for (offset = 0; offset < si->max; offset += PAGE_SIZE) { - struct page *head; - head = vmalloc_to_page(si->swap_map + offset); - if (page_private(head)) { - struct page *page, *next; - - list_for_each_entry_safe(page, next, &head->lru, lru) { - list_del(&page->lru); - __free_page(page); - } - } - } + return swap_dup_entries_cluster(si, swp_offset(entry), 1); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
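Note (illustrative, not part of the patch): overall, this change moves the per-slot swap count out of the old swap_map byte and into the count bits of each swap table entry; counts that would overflow SWP_TB_COUNT_MAX are redirected into the per-cluster ci->extend_table array, replacing the old swap count continuation pages. A minimal sketch of how a reader resolves the full count for one slot, mirroring the new swp_swapcount() and assuming the cluster locking helpers and count accessors introduced by this series (the function name is hypothetical):

static int example_full_swap_count(struct swap_info_struct *si, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned int ci_off = swp_cluster_offset(entry);
	unsigned long swp_tb;
	int count;

	ci = swap_cluster_lock(si, swp_offset(entry));
	swp_tb = __swap_table_get(ci, ci_off);
	count = __swp_tb_get_count(swp_tb);
	if (count == SWP_TB_COUNT_MAX)
		/* Table entry is pinned at the max; the real count lives in extend_table */
		count = ci->extend_table[ci_off];
	swap_cluster_unlock(ci);
	return count;
}

Counts below SWP_TB_COUNT_MAX stay directly in the table entry, so the common put/dup paths never touch the extended table at all.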